From 9a35e049e891a4642e237f39eb70b546ba1083bb Mon Sep 17 00:00:00 2001 From: Mike He Date: Wed, 14 Sep 2022 22:53:17 -0400 Subject: [PATCH 01/32] [Lang] Fixes matrix-vector multiplication (#6014) Related issue = fix #5988 --- python/taichi/lang/matrix.py | 12 ++++++++++++ tests/python/test_matrix.py | 20 +++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 3fcddd7dfe7035..48a6013e37a9e3 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -447,6 +447,11 @@ def __init__(self, is_matrix = isinstance(arr[0], Iterable) and not is_vector(self) initializer = _make_entries_initializer(is_matrix) self.ndim = 2 if is_matrix else 1 + if not is_matrix and isinstance(arr[0], Iterable): + flattened = [] + for row in arr: + flattened += row + arr = flattened if in_python_scope() or is_ref: mat = initializer.pyscope_or_ref(arr) @@ -553,6 +558,11 @@ def __matmul__(self, other): """ assert isinstance(other, Matrix), "rhs of `@` is not a matrix / vector" + if is_vector(self) and not is_vector(other): + # left multiplication + assert self.n == other.m, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" + return other.transpose() @ self + # right multiplication assert self.m == other.n, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" entries = [] for i in range(self.n): @@ -562,6 +572,8 @@ def __matmul__(self, other): for k in range(1, other.n): acc = acc + self(i, k) * other(k, j) entries[i].append(acc) + if is_vector(other) and other.m == 1: + return Vector(entries) return Matrix(entries) # host access & python scope operation diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index 1ecd1b65911170..4e3ad8d6883198 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -115,7 +115,7 @@ def func(t: ti.i32): m += ti.Matrix([[3, 4], [5, t]]) print(m @ v) print(r.x, r.y, r.z, r.w) - s = w.transpose() @ m + s = w @ m print(s) print(m) @@ -701,6 +701,24 @@ def bar(): bar() +@test_utils.test(arch=get_host_arch_list(), debug=True) +def test_matrix_vector_multiplication(): + mat = ti.math.mat3(1) + vec = ti.math.vec3(3) + r = mat @ vec + for i in range(3): + assert r[i] == 9 + + @ti.kernel + def foo(): + mat = ti.math.mat3(1) + vec = ti.math.vec3(3) + r = mat @ vec + assert r[0] == r[1] == r[2] == 9 + + foo() + + @test_utils.test(arch=[ti.cuda, ti.cpu], real_matrix=True) def test_local_matrix_read(): From 72804ae992212ddee5357bae8b27d6e3f2148831 Mon Sep 17 00:00:00 2001 From: Chang Yu Date: Thu, 15 Sep 2022 14:22:28 +0800 Subject: [PATCH 02/32] [Mesh] [bug] Fix nested mesh for (#6062) Related issue = #3608 --- python/taichi/lang/ast/ast_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 6fac2c6418e0ce..07600d02bfe328 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -1110,7 +1110,7 @@ def build_nested_mesh_for(ctx, node): loop_var = expr.Expr(ctx.ast_builder.make_id_expr('')) ctx.create_variable(loop_name, loop_var) begin = expr.Expr(0) - end = node.iter.ptr.size + end = ti_ops.cast(node.iter.ptr.size, primitive_types.i32) ctx.ast_builder.begin_frontend_range_for(loop_var.ptr, begin.ptr, end.ptr) entry_expr = _ti_core.get_relation_access( From 574b05e70f31e0ae5bc314d1fa7c2fe8991643b8 Mon Sep 17 00:00:00 2001 From: 
Proton Date: Thu, 15 Sep 2022 18:17:52 +0800 Subject: [PATCH 03/32] [ci] Upgrade conda cudatoolkit version to 11.3 (#6070) Related issue = # Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/release.yml | 4 ++-- .github/workflows/testing.yml | 6 +++--- ci/Dockerfile.ubuntu.18.04 | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba6b5128dfa31a..0f2cbc90a76407 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,7 +87,7 @@ jobs: mkdir -m777 shared docker create --user dev --name taichi_build --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e DISPLAY -e PY -e GPU_BUILD -e TAICHI_CMAKE_ARGS -e PROJECT_NAME \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.2 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/ docker start -a taichi_build @@ -111,7 +111,7 @@ jobs: run: | docker create --user dev --name taichi_test --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e DISPLAY -e PY -e GPU_TEST \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.2 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/unix_test.sh docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh docker cp .github/workflows/scripts/common-utils.sh taichi_test:/home/dev/common-utils.sh diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 95262572addf94..4aa46e4944852f 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -405,7 +405,7 @@ jobs: --gpus 'all,"capabilities=graphics,utility,display,video,compute"' \ -v /tmp/.X11-unix:/tmp/.X11-unix \ -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh # A tarball is needed because sccache needs some permissions that only the file owner has. # 1000 is the uid and gid of user "dev" in the container. 
@@ -444,7 +444,7 @@ jobs: -e TI_LITE_TEST \ -e TI_TEST_OFFLINE_CACHE \ -e DISPLAY -e PY -e GPU_TEST -e TI_WANTED_ARCHS -e TI_RUN_RELEASE_TESTS \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/unix_test.sh docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh docker cp .github/workflows/scripts/common-utils.sh taichi_test:/home/dev/common-utils.sh @@ -671,7 +671,7 @@ jobs: docker run --user dev --name taichi_build_host \ $DOCKER_RUN_ARGS \ -v $TAICHI_WHEEL_DIR:/home/dev/taichi/dist \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix-build-v2.sh env: TAICHI_CMAKE_ARGS: >- diff --git a/ci/Dockerfile.ubuntu.18.04 b/ci/Dockerfile.ubuntu.18.04 index 07aef59d22d6d3..f991f0390461ad 100644 --- a/ci/Dockerfile.ubuntu.18.04 +++ b/ci/Dockerfile.ubuntu.18.04 @@ -93,17 +93,17 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh & bash Miniconda3-latest-Linux-x86_64.sh -p /home/dev/miniconda -b ENV PATH="/home/dev/miniconda/bin:$PATH" -# Set up multi-python environment -RUN conda init bash -RUN conda create -n py36 python=3.6 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py37 python=3.7 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py38 python=3.8 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py39 python=3.9 pytorch cudatoolkit=10.2 -c pytorch -y -# TODO add torch to 3.10 when supported -RUN conda create -n py310 python=3.10 -y - # Remove mesa EGL driver, which interferes with the propritary NVIDIA drivers RUN rm -f /usr/lib/x86_64-linux-gnu/libEGL_mesa* WORKDIR /home/dev ENV LANG="C.UTF-8" + +# Set up multi-python environment +RUN conda init bash +RUN conda create -n py36 python=3.6 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py37 python=3.7 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py38 python=3.8 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py39 python=3.9 pytorch cudatoolkit=11.3 -c pytorch -y +# TODO add torch to 3.10 when supported +RUN conda create -n py310 python=3.10 -y From 4d94b31aed9758a167cf7918a5b04f3b8884d08d Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Thu, 15 Sep 2022 03:43:01 -0700 Subject: [PATCH 04/32] [ci] [dx12] Enable dx12 build for windows cpu ci. (#6069) Fix build fail and enable dx12 build for windows cpu ci to make sure it compiles. 
Related issue = #5276 --- .github/workflows/scripts/win_build_test_cpu.ps1 | 1 + taichi/codegen/dx12/codegen_dx12.cpp | 4 ++-- taichi/codegen/dx12/codegen_dx12.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 index 0b1a4cd5e7faaf..4aa90582517edc 100644 --- a/.github/workflows/scripts/win_build_test_cpu.ps1 +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -66,6 +66,7 @@ if (!$llvmVer.CompareTo("10")) { } else { $env:TAICHI_CMAKE_ARGS += " -DLLVM_AS_EXECUTABLE=C:\\taichi_llvm_15\\bin\\llvm-as.exe -DTI_WITH_VULKAN:BOOL=OFF" $env:TAICHI_CMAKE_ARGS += " -DTI_LLVM_15:BOOL=ON" + $env:TAICHI_CMAKE_ARGS += " -DTI_WITH_DX12:BOOL=ON" } diff --git a/taichi/codegen/dx12/codegen_dx12.cpp b/taichi/codegen/dx12/codegen_dx12.cpp index 4be95a53f7b1b4..bee1d3c27a6e8b 100644 --- a/taichi/codegen/dx12/codegen_dx12.cpp +++ b/taichi/codegen/dx12/codegen_dx12.cpp @@ -227,7 +227,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { #ifdef TI_WITH_LLVM static std::vector generate_dxil_from_llvm( - LLVMCompiledData &compiled_data, + LLVMCompiledTask &compiled_data, taichi::lang::Kernel *kernel) { // generate dxil from llvm ir. auto offloaded_local = compiled_data.tasks; @@ -286,7 +286,7 @@ KernelCodeGenDX12::CompileResult KernelCodeGenDX12::compile() { return Result; } -LLVMCompiledData KernelCodeGenDX12::compile_task( +LLVMCompiledTask KernelCodeGenDX12::compile_task( std::unique_ptr &&module, OffloadedStmt *stmt) { TaskCodeGenLLVMDX12 gen(kernel, stmt); diff --git a/taichi/codegen/dx12/codegen_dx12.h b/taichi/codegen/dx12/codegen_dx12.h index 1b9e920e71873f..5d352231a1e6b9 100644 --- a/taichi/codegen/dx12/codegen_dx12.h +++ b/taichi/codegen/dx12/codegen_dx12.h @@ -22,7 +22,7 @@ class KernelCodeGenDX12 : public KernelCodeGen { }; CompileResult compile(); #ifdef TI_WITH_LLVM - LLVMCompiledData compile_task( + LLVMCompiledTask compile_task( std::unique_ptr &&module = nullptr, OffloadedStmt *stmt = nullptr) override; #endif From e5ed336724c757cf1e3a4beb5ae6540ca968c1cb Mon Sep 17 00:00:00 2001 From: PGZXB Date: Fri, 16 Sep 2022 09:15:51 +0800 Subject: [PATCH 05/32] [bug] Set cfg.offline_cache after reset() (#6073) Related issue = #4401 --- python/taichi/lang/misc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index 4b6c38ebdeb4ef..758892310e193d 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -347,8 +347,6 @@ def init(arch=None, # changed by the Vulkan backend initialization on OS X. current_dir = os.getcwd() - cfg = impl.default_cfg() - cfg.offline_cache = True # Enable offline cache in frontend instead of C++ side # Check if installed version meets the requirements. 
if require_version is not None: check_require_version(require_version) @@ -365,6 +363,9 @@ def init(arch=None, kwargs = _deepcopy(kwargs) reset() + cfg = impl.default_cfg() + cfg.offline_cache = True # Enable offline cache in frontend instead of C++ side + spec_cfg = _SpecialConfig() env_comp = _EnvironmentConfigurator(kwargs, cfg) env_spec = _EnvironmentConfigurator(kwargs, spec_cfg) From 45400e579315de29289cbce2984bb0f9d4d7a4c8 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 16 Sep 2022 11:21:52 +0800 Subject: [PATCH 06/32] [Bug] Disable vkCmdWriteTimestamp with MacOS to enable tests on Vulkan (#6020) --- .github/workflows/testing.yml | 2 +- taichi/rhi/vulkan/vulkan_device.cpp | 15 +++++++++- tests/python/test_ggui.py | 8 ++--- tests/python/test_graph.py | 2 +- tests/python/test_scan.py | 2 +- tests/python/test_simt.py | 4 +-- tests/test_utils.py | 46 +++++++++++++++++++++++++---- 7 files changed, 63 insertions(+), 16 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 4aa46e4944852f..6ac9cd1927c7a7 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -219,7 +219,7 @@ jobs: python: 3.7 with_cc: OFF with_cpp_tests: ON - wanted_archs: 'cpu' + wanted_archs: 'cpu,vulkan' runs-on: - self-hosted - ${{ matrix.os }} diff --git a/taichi/rhi/vulkan/vulkan_device.cpp b/taichi/rhi/vulkan/vulkan_device.cpp index 595b6b757e347e..d1bf10886ec84d 100644 --- a/taichi/rhi/vulkan/vulkan_device.cpp +++ b/taichi/rhi/vulkan/vulkan_device.cpp @@ -782,9 +782,13 @@ VulkanCommandList::VulkanCommandList(VulkanDevice *ti_device, info.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; vkBeginCommandBuffer(buffer->buffer, &info); + +// Workaround for MacOS: https://github.com/taichi-dev/taichi/issues/5888 +#if !defined(__APPLE__) vkCmdResetQueryPool(buffer->buffer, query_pool_->query_pool, 0, 2); vkCmdWriteTimestamp(buffer->buffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, query_pool_->query_pool, 0); +#endif } VulkanCommandList::~VulkanCommandList() { @@ -1304,8 +1308,11 @@ vkapi::IVkRenderPass VulkanCommandList::current_renderpass() { vkapi::IVkCommandBuffer VulkanCommandList::finalize() { if (!finalized_) { +// Workaround for MacOS: https://github.com/taichi-dev/taichi/issues/5888 +#if !defined(__APPLE__) vkCmdWriteTimestamp(buffer_->buffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, query_pool_->query_pool, 1); +#endif vkEndCommandBuffer(buffer_->buffer); finalized_ = true; } @@ -1719,11 +1726,17 @@ void VulkanStream::command_sync() { continue; } + double duration_us = 0.0; + +// Workaround for MacOS: https://github.com/taichi-dev/taichi/issues/5888 +#if !defined(__APPLE__) uint64_t t[2]; vkGetQueryPoolResults(device_.vk_device(), cmdbuf.query_pool->query_pool, 0, 2, sizeof(uint64_t) * 2, &t, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - double duration_us = (t[1] - t[0]) * props.limits.timestampPeriod / 1000.0; + duration_us = (t[1] - t[0]) * props.limits.timestampPeriod / 1000.0; +#endif + device_time_elapsed_us_ += duration_us; } diff --git a/tests/python/test_ggui.py b/tests/python/test_ggui.py index dca8494990a39f..88f83976a33250 100644 --- a/tests/python/test_ggui.py +++ b/tests/python/test_ggui.py @@ -390,7 +390,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_fetching_depth_attachment(): window = ti.ui.Window("test", (512, 
512), vsync=True, show_window=False) canvas = window.get_canvas() @@ -419,7 +419,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_draw_lines(): N = 10 particles_pos = ti.Vector.field(3, dtype=ti.f32, shape=N) @@ -464,7 +464,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_draw_part_of_particles(): N = 10 particles_pos = ti.Vector.field(3, dtype=ti.f32, shape=N) @@ -598,7 +598,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_draw_part_of_lines(): N = 10 particles_pos = ti.Vector.field(3, dtype=ti.f32, shape=N) diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py index 3ca1e56f4f1138..92787c6354e083 100644 --- a/tests/python/test_graph.py +++ b/tests/python/test_graph.py @@ -306,7 +306,7 @@ def foo(a: dt, b: ti.types.ndarray(dtype=dt, field_dim=1)): @pytest.mark.parametrize('dt', [ti.i32, ti.i64, ti.u32, ti.u64]) -@test_utils.test(arch=supported_archs_cgraph) +@test_utils.test(arch=supported_archs_cgraph, exclude=[(ti.vulkan, "Darwin")]) def test_arg_int(dt): @ti.kernel def foo(a: dt, b: ti.types.ndarray(dtype=dt, field_dim=1)): diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py index 0a19bc855460df..72dd5f415cb30d 100644 --- a/tests/python/test_scan.py +++ b/tests/python/test_scan.py @@ -2,7 +2,7 @@ from tests import test_utils -@test_utils.test(arch=[ti.cuda, ti.vulkan]) +@test_utils.test(arch=[ti.cuda, ti.vulkan], exclude=[(ti.vulkan, "Darwin")]) def test_scan(): def test_scan_for_dtype(dtype, N): arr = ti.field(dtype, N) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index accf5b1dbc9a94..3679d72f861837 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -434,7 +434,7 @@ def reduce_all() -> dtype: # i.e. any device other than a subgroup size of 1 should have one non active group -@test_utils.test(arch=ti.vulkan) +@test_utils.test(arch=ti.vulkan, exclude=[(ti.vulkan, "Darwin")]) def test_subgroup_reduction_add_i32(): _test_subgroup_reduce(ti.atomic_add, subgroup.reduce_add, np.sum, 2677, 0, ti.i32) @@ -451,7 +451,7 @@ def test_subgroup_reduction_add_f32(): # _test_subgroup_reduce(ti.atomic_add, subgroup.reduce_mul, np.prod, 8, 1, ti.f32) -@test_utils.test(arch=ti.vulkan) +@test_utils.test(arch=ti.vulkan, exclude=[(ti.vulkan, "Darwin")]) def test_subgroup_reduction_max_i32(): _test_subgroup_reduce(ti.atomic_max, subgroup.reduce_max, np.max, 2677, 0, ti.i32) diff --git a/tests/test_utils.py b/tests/test_utils.py index d0eecec78bc613..26e06fd2afcacf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,7 @@ import itertools import os import pathlib +import platform from errno import EEXIST from tempfile import NamedTemporaryFile, mkstemp @@ -402,22 +403,51 @@ def test(arch=None, exclude=None, require=None, **options): .. 
function:: ti.test(arch=[], exclude=[], require=[], **options) :parameter arch: backends to include - :parameter exclude: backends to exclude + :parameter exclude: backends and platforms to exclude :parameter require: extensions required :parameter options: other options to be passed into ``ti.init`` """ + def exclude_arch_platform(arch, system, exclude): + # Preprocess exclude + if exclude is None: + exclude = [] + if not isinstance(exclude, (list, tuple)): + exclude = [exclude] + + for pair in exclude: + exclude_arch = None + exclude_sys = None + if isinstance(pair, (list, tuple)): + if len(pair) == 1: + # exclude = [(vulkan), ...] + exclude_arch = pair[0] + else: + # exclude = [(vulkan, Darwin), ...] + assert len(pair) == 2 + exclude_arch = pair[0] + exclude_sys = pair[1] + else: + # exclude = [vulkan, cpu, ...] + exclude_arch = pair + + assert (exclude_arch is not None) or (exclude_sys is not None) + if exclude_arch and exclude_sys: + if exclude_arch == arch and exclude_sys == system: + return True + elif exclude_arch and exclude_arch == arch: + return True + elif exclude_sys and exclude_sys == system: + return True + + return False if arch is None: arch = [] - if exclude is None: - exclude = [] if require is None: require = [] if not isinstance(arch, (list, tuple)): arch = [arch] - if not isinstance(exclude, (list, tuple)): - exclude = [exclude] if not isinstance(require, (list, tuple)): require = [require] archs_expected = expected_archs() @@ -434,7 +464,11 @@ def test(arch=None, exclude=None, require=None, **options): # List of (arch, options) to parametrize the test function parameters = [] for req_arch, *req_params in itertools.product(*arch_params_sets): - if (req_arch not in arch) or (req_arch in exclude): + if req_arch not in arch: + continue + + curr_system = platform.system() + if exclude_arch_platform(req_arch, curr_system, exclude): continue if not all( From 2469127b674e1aaf51f1d69ada9fee6d453fb0ca Mon Sep 17 00:00:00 2001 From: Ailing Date: Fri, 16 Sep 2022 11:24:45 +0800 Subject: [PATCH 07/32] [misc] Simplify PR template (#6063) Now that we use PR title & description as commit message let's simplify this template a bit to avoid polluting commit history. 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/pull_request_template.md | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4dd4981dfb48a7..32bc9364832d5f 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,17 +1,3 @@ -Related issue = # +Issue: # - +### Brief Summary From 5b05116323ca5a84472eb445e1212deab4f65c80 Mon Sep 17 00:00:00 2001 From: Mike He Date: Thu, 15 Sep 2022 23:57:54 -0400 Subject: [PATCH 08/32] [lang] Preserve shape info for Vectors (#6076) Related issue = fix #6067 --- python/taichi/_funcs.py | 5 +---- python/taichi/lang/impl.py | 3 ++- python/taichi/lang/matrix.py | 22 +++++++++++++--------- tests/python/test_matrix.py | 11 +++++++++++ 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/python/taichi/_funcs.py b/python/taichi/_funcs.py index 40a9f3b097cc6f..c8b0ee6280f344 100644 --- a/python/taichi/_funcs.py +++ b/python/taichi/_funcs.py @@ -3,7 +3,7 @@ from taichi.lang import impl, matrix, ops from taichi.lang.impl import expr_init, get_runtime, grouped, static from taichi.lang.kernel_impl import func, pyfunc -from taichi.lang.matrix import Matrix, Vector, is_vector +from taichi.lang.matrix import Matrix, Vector from taichi.types import f32, f64 from taichi.types.annotations import template @@ -59,9 +59,6 @@ def _matrix_transpose(mat): Returns: Transpose of the input matrix. """ - if static(is_vector(mat)): - # Convert to row vector - return matrix.Matrix([[mat(i) for i in range(mat.n)]]) return matrix.Matrix([[mat(i, j) for i in range(mat.n)] for j in range(mat.m)], ndim=mat.ndim) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index dc92af1b77a928..0bca4fe175704b 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -64,7 +64,8 @@ def expr_init(rhs): entries = [[rhs(i, j) for j in range(rhs.m)] for i in range(rhs.n)] return make_matrix(entries) - if isinstance(rhs, Vector) or getattr(rhs, "ndim", None) == 1: + if (isinstance(rhs, Vector) + or getattr(rhs, "ndim", None) == 1) and rhs.m == 1: # _IntermediateMatrix may reach here return Vector(rhs.to_list(), ndim=rhs.ndim) return Matrix(rhs.to_list(), ndim=rhs.ndim) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 48a6013e37a9e3..c1e148ef005840 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -117,6 +117,10 @@ def is_vector(x): return isinstance(x, Vector) or getattr(x, "ndim", None) == 1 +def is_col_vector(x): + return is_vector(x) and getattr(x, "m", None) == 1 + + class _MatrixBaseImpl: def __init__(self, m, n, entries): self.m = m @@ -498,7 +502,7 @@ def __init__(self, def _element_wise_binary(self, foo, other): other = self._broadcast_copy(other) - if is_vector(self): + if is_col_vector(self): return Vector([foo(self(i), other(i)) for i in range(self.n)], ndim=self.ndim) return Matrix([[foo(self(i, j), other(i, j)) for j in range(self.m)] @@ -507,7 +511,7 @@ def _element_wise_binary(self, foo, other): def _broadcast_copy(self, other): if isinstance(other, (list, tuple)): - if is_vector(self): + if is_col_vector(self): other = Vector(other, ndim=self.ndim) else: other = Matrix(other, ndim=self.ndim) @@ -558,9 +562,9 @@ def __matmul__(self, other): """ assert isinstance(other, Matrix), "rhs of `@` is not a matrix / vector" - if is_vector(self) and not is_vector(other): + if (is_col_vector(self)) and not 
is_vector(other): # left multiplication - assert self.n == other.m, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" + assert self.n == other.m, f"Dimension mismatch between (left multiplication) shapes ({self.n}, {self.m}), ({other.n}, {other.m})" return other.transpose() @ self # right multiplication assert self.m == other.n, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" @@ -572,7 +576,7 @@ def __matmul__(self, other): for k in range(1, other.n): acc = acc + self(i, k) * other(k, j) entries[i].append(acc) - if is_vector(other) and other.m == 1: + if is_col_vector(other): return Vector(entries) return Matrix(entries) @@ -669,7 +673,7 @@ def to_list(self): This is similar to `numpy.ndarray`'s `flatten` and `ravel` methods, the difference is that this function always returns a new list. """ - if is_vector(self): + if is_col_vector(self): return [self(i) for i in range(self.n)] return [[self(i, j) for j in range(self.m)] for i in range(self.n)] @@ -691,7 +695,7 @@ def cast(self, dtype): >>> B [0.0, 1.0, 2.0] """ - if is_vector(self): + if is_col_vector(self): # when using _IntermediateMatrix, we can only check `self.ndim` return Vector( [ops_mod.cast(self(i), dtype) for i in range(self.n)]) @@ -1636,8 +1640,8 @@ def fill(self, val): (list, tuple)) and isinstance(val[0], numbers.Number): assert self.m == 1 val = tuple(val) - elif is_vector(val) or self.ndim == 1: - val = tuple([(val(i), ) for i in range(self.n)]) + elif is_vector(val): + val = tuple([(val(i), ) for i in range(self.n * self.m)]) elif isinstance(val, Matrix): val_tuple = [] for i in range(val.n): diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index 4e3ad8d6883198..a79746f8f40457 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -806,3 +806,14 @@ def bar(): with pytest.raises(TaichiCompilationError, match=r'Expected 1 indices, but got 2'): bar() + + +@test_utils.test() +def test_vector_vector_t(): + @ti.kernel + def foo() -> ti.types.matrix(2, 2, ti.f32): + a = ti.Vector([1.0, 2.0]) + b = ti.Vector([1.0, 2.0]) + return a @ b.transpose() + + assert foo() == [[1.0, 2.0], [2.0, 4.0]] From 50c925b89852b548a2c0f920382282c59fb9c8de Mon Sep 17 00:00:00 2001 From: PGZXB Date: Fri, 16 Sep 2022 14:29:19 +0800 Subject: [PATCH 09/32] [bug] Remove unnecessary lower() in AotModuleBuilder::add (#6068) --- taichi/aot/module_builder.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/taichi/aot/module_builder.cpp b/taichi/aot/module_builder.cpp index 1b1b71fb58c8ff..9c1a8a71fda0da 100644 --- a/taichi/aot/module_builder.cpp +++ b/taichi/aot/module_builder.cpp @@ -5,9 +5,6 @@ namespace taichi { namespace lang { void AotModuleBuilder::add(const std::string &identifier, Kernel *kernel) { - if (!kernel->lowered() && Kernel::supports_lowering(kernel->arch)) { - kernel->lower(/*to_executable=*/!arch_uses_llvm(kernel->arch)); - } add_per_backend(identifier, kernel); } @@ -25,9 +22,6 @@ void AotModuleBuilder::add_field(const std::string &identifier, void AotModuleBuilder::add_kernel_template(const std::string &identifier, const std::string &key, Kernel *kernel) { - if (!kernel->lowered() && Kernel::supports_lowering(kernel->arch)) { - kernel->lower(); - } add_per_backend_tmpl(identifier, key, kernel); } From 8fd752208ec20ec39f46193de1c53c9e9f388214 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Fri, 16 Sep 2022 14:35:13 +0800 Subject: [PATCH 10/32] [Lang] MatrixField refactor 4/n: Disallow invalid matrix field definition (#6074) 
Related issue = #5959, #4857 Support for different element types of matrix fields was introduced in #2135 for quant. As discussed in https://github.com/taichi-dev/taichi/issues/4857#issuecomment-1123964395, the only case we need to support is different element types with **same compute type**. This PR adds the validity check and removes test cases which are actually not allowed. Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/ir/frontend_ir.h | 13 +++ tests/python/test_matrix_different_type.py | 100 +++++---------------- 2 files changed, 36 insertions(+), 77 deletions(-) diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index cadf3663b78fb2..cd022734835fb1 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -531,6 +531,19 @@ class MatrixFieldExpression : public Expression { MatrixFieldExpression(const std::vector &fields, const std::vector &element_shape) : fields(fields), element_shape(element_shape) { + for (auto &field : fields) { + TI_ASSERT(field.is()); + } + TI_ASSERT(!fields.empty()); + auto compute_type = + fields[0].cast()->dt->get_compute_type(); + for (auto &field : fields) { + if (field.cast()->dt->get_compute_type() != + compute_type) { + throw TaichiRuntimeError( + "Member fields of a matrix field must have the same compute type"); + } + } } void type_check(CompileConfig *config) override { diff --git a/tests/python/test_matrix_different_type.py b/tests/python/test_matrix_different_type.py index 397e72c40f3c1b..9a33ca4e7d369b 100644 --- a/tests/python/test_matrix_different_type.py +++ b/tests/python/test_matrix_different_type.py @@ -1,80 +1,14 @@ -from pytest import approx +import pytest import taichi as ti from tests import test_utils -# TODO: test more matrix operations -@test_utils.test() -def test_vector(): - type_list = [ti.f32, ti.i32] - - a = ti.Vector.field(len(type_list), dtype=type_list, shape=()) - b = ti.Vector.field(len(type_list), dtype=type_list, shape=()) - c = ti.Vector.field(len(type_list), dtype=type_list, shape=()) - - @ti.kernel - def init(): - a[None] = [1.0, 3] - b[None] = [2.0, 4] - c[None] = a[None] + b[None] - - def verify(): - assert isinstance(a[None][0], float) - assert isinstance(a[None][1], int) - assert isinstance(b[None][0], float) - assert isinstance(b[None][1], int) - assert c[None][0] == 3.0 - assert c[None][1] == 7 - - init() - verify() - - -# TODO: Support different element types of Matrix on opengl -@test_utils.test(require=ti.extension.data64, exclude=ti.opengl) -def test_matrix(): - type_list = [[ti.f32, ti.i32], [ti.i64, ti.f32]] - a = ti.Matrix.field(len(type_list), - len(type_list[0]), - dtype=type_list, - shape=()) - b = ti.Matrix.field(len(type_list), - len(type_list[0]), - dtype=type_list, - shape=()) - c = ti.Matrix.field(len(type_list), - len(type_list[0]), - dtype=type_list, - shape=()) - - @ti.kernel - def init(): - a[None] = [[1.0, 3], [1, 3.0]] - b[None] = [[2.0, 4], [-2, -3.0]] - c[None] = a[None] + b[None] - - def verify(): - assert isinstance(a[None][0, 0], float) - assert isinstance(a[None][0, 1], int) - assert isinstance(b[None][0, 0], float) - assert isinstance(b[None][0, 1], int) - assert c[None][0, 0] == 3.0 - assert c[None][0, 1] == 7 - assert c[None][1, 0] == -1 - assert c[None][1, 1] == 0.0 - - init() - verify() - - @test_utils.test(require=ti.extension.quant_basic) -def test_quant_type(): - qit1 = ti.types.quant.int(bits=10, signed=True) - qfxt1 = ti.types.quant.fixed(bits=10, signed=True, scale=0.1) - qit2 = 
ti.types.quant.int(bits=22, signed=False) - qfxt2 = ti.types.quant.fixed(bits=22, signed=False, scale=0.1) - type_list = [[qit1, qfxt2], [qfxt1, qit2]] +def test_valid(): + qflt = ti.types.quant.float(exp=8, frac=5, signed=True) + qfxt = ti.types.quant.fixed(bits=10, signed=True, scale=0.1) + type_list = [[qflt, qfxt], [qflt, qfxt]] a = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) b = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) c = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) @@ -99,15 +33,27 @@ def test_quant_type(): @ti.kernel def init(): - a[0] = [[1, 3.], [2., 1]] - b[0] = [[2, 4.], [-2., 1]] + a[0] = [[1.0, 3.0], [2.0, 1.0]] + b[0] = [[2.0, 4.0], [-2.0, 1.0]] c[0] = a[0] + b[0] def verify(): - assert c[0][0, 0] == approx(3, 1e-3) - assert c[0][0, 1] == approx(7.0, 1e-3) - assert c[0][1, 0] == approx(0, 1e-3) - assert c[0][1, 1] == approx(2, 1e-3) + assert c[0][0, 0] == pytest.approx(3.0) + assert c[0][0, 1] == pytest.approx(7.0) + assert c[0][1, 0] == pytest.approx(0.0) + assert c[0][1, 1] == pytest.approx(2.0) init() verify() + + +@test_utils.test(require=ti.extension.quant_basic) +def test_invalid(): + qit = ti.types.quant.int(bits=10, signed=True) + qfxt = ti.types.quant.fixed(bits=10, signed=True, scale=0.1) + type_list = [qit, qfxt] + with pytest.raises( + RuntimeError, + match= + 'Member fields of a matrix field must have the same compute type'): + a = ti.Vector.field(len(type_list), dtype=type_list) From e23ad2d9abd6a612c7c889c7bb67161b8776cd87 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 16 Sep 2022 17:03:41 +0800 Subject: [PATCH 11/32] [Lang] MatrixNdarray refactor part6: Add scalarization for LocalLoadStmt & GlobalLoadStmt with TensorType (#6024) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related issue = https://github.com/taichi-dev/taichi/issues/5873, https://github.com/taichi-dev/taichi/issues/5819 This PR is working "Part ④" in https://github.com/taichi-dev/taichi/issues/5873. 
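For context, a minimal user-level sketch of the kind of kernel this pass targets, assuming the experimental real_matrix and real_matrix_scalarize options are switched on the same way the tests added below switch them on; the kernel body and ndarray shape are illustrative only:

    import taichi as ti

    ti.init(arch=ti.cpu, real_matrix=True, real_matrix_scalarize=True)

    @ti.kernel
    def copy_row(a: ti.types.ndarray()):
        # a[0] is loaded as one TensorType value and stored into a[1]; the
        # scalarize pass rewrites this into per-element load/store statements
        # plus a MatrixInitStmt, as the Before/After comments in the pass show.
        a[1] = a[0]

    x = ti.Matrix.ndarray(2, 2, ti.i32, shape=2)
    copy_row(x)

The same load-then-store pattern is exercised by the new test_load_store_scalarize test.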
--- python/taichi/lang/ast/ast_transformer.py | 9 ++- python/taichi/lang/impl.py | 2 +- taichi/analysis/offline_cache_util.cpp | 1 + taichi/python/export_lang.cpp | 2 + taichi/transforms/scalarize.cpp | 79 +++++++++++++++++++++-- tests/cpp/transforms/scalarize_test.cpp | 66 ++++++++++++++++++- tests/python/test_matrix.py | 38 +++++++++++ 7 files changed, 187 insertions(+), 10 deletions(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 07600d02bfe328..f72456a664f3d4 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -15,8 +15,9 @@ from taichi.lang.ast.symbol_resolver import ASTResolver from taichi.lang.exception import TaichiSyntaxError, TaichiTypeError from taichi.lang.field import Field +from taichi.lang.impl import current_cfg from taichi.lang.matrix import (Matrix, MatrixType, Vector, _PyScopeMatrixImpl, - _TiScopeMatrixImpl) + _TiScopeMatrixImpl, make_matrix) from taichi.lang.snode import append from taichi.lang.util import in_taichi_scope, is_taichi_class, to_taichi_type from taichi.types import (annotations, ndarray_type, primitive_types, @@ -114,6 +115,12 @@ def build_Assign(ctx, node): @staticmethod def build_assign_slice(ctx, node_target, values, is_static_assign): target = ASTTransformer.build_Subscript(ctx, node_target, get_ref=True) + if current_cfg().real_matrix: + if isinstance(node_target.value.ptr, + any_array.AnyArray) and isinstance( + values, (list, tuple)): + values = make_matrix(values) + if isinstance(node_target.value.ptr, Matrix): if isinstance(node_target.value.ptr._impl, _TiScopeMatrixImpl): target._assign(values) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 0bca4fe175704b..e91a17d1971379 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -16,7 +16,7 @@ from taichi.lang.kernel_arguments import SparseMatrixProxy from taichi.lang.matrix import (Matrix, MatrixField, MatrixNdarray, MatrixType, Vector, _IntermediateMatrix, - _MatrixFieldElement, make_matrix) + _MatrixFieldElement) from taichi.lang.mesh import (ConvType, MeshElementFieldProxy, MeshInstance, MeshRelationAccessProxy, MeshReorderedMatrixFieldProxy, diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index 3207e0d887834d..2237b3be60c63f 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -66,6 +66,7 @@ static std::vector get_offline_cache_key_of_compile_config( serializer(config->experimental_auto_mesh_local); serializer(config->auto_mesh_local_default_occupacy); serializer(config->real_matrix); + serializer(config->real_matrix_scalarize); serializer.finalize(); return serializer.data; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index c3d58b362e52ed..73a0ca345bc4c6 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -203,6 +203,8 @@ void export_lang(py::module &m) { &CompileConfig::ndarray_use_cached_allocator) .def_readwrite("use_mesh", &CompileConfig::use_mesh) .def_readwrite("real_matrix", &CompileConfig::real_matrix) + .def_readwrite("real_matrix_scalarize", + &CompileConfig::real_matrix_scalarize) .def_readwrite("cc_compile_cmd", &CompileConfig::cc_compile_cmd) .def_readwrite("cc_link_cmd", &CompileConfig::cc_link_cmd) .def_readwrite("quant_opt_store_fusion", diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp index e140a3bf0e6659..70ca1cd551e493 100644 --- 
a/taichi/transforms/scalarize.cpp +++ b/taichi/transforms/scalarize.cpp @@ -8,10 +8,14 @@ TLANG_NAMESPACE_BEGIN class Scalarize : public IRVisitor { public: + DelayedIRModifier modifier_; + Scalarize(IRNode *node) { allow_undefined_visitor = true; invoke_default_visitor = false; node->accept(this); + + modifier_.modify_ir(); } /* @@ -51,18 +55,75 @@ class Scalarize : public IRVisitor { int num_elements = val_tensor_type->get_num_elements(); for (int i = 0; i < num_elements; i++) { auto const_stmt = std::make_unique( - TypedConstant(stmt->val->ret_type.get_element_type(), i)); + TypedConstant(get_data_type(), i)); auto ptr_offset_stmt = std::make_unique(stmt->dest, const_stmt.get()); auto scalarized_stmt = std::make_unique(ptr_offset_stmt.get(), matrix_init_stmt->values[i]); - stmt->insert_before_me(std::move(const_stmt)); - stmt->insert_before_me(std::move(ptr_offset_stmt)); - stmt->insert_before_me(std::move(scalarized_stmt)); + modifier_.insert_before(stmt, std::move(const_stmt)); + modifier_.insert_before(stmt, std::move(ptr_offset_stmt)); + modifier_.insert_before(stmt, std::move(scalarized_stmt)); + } + modifier_.erase(stmt); + } + } + + /* + + Before: + TensorType<4 x i32> val = LoadStmt(TensorType<4 x i32>* src) + + After: + i32* addr0 = PtrOffsetStmt(TensorType<4 x i32>* src, 0) + i32* addr1 = PtrOffsetStmt(TensorType<4 x i32>* src, 1) + i32* addr2 = PtrOffsetStmt(TensorType<4 x i32>* src, 2) + i32* addr3 = PtrOffsetStmt(TensorType<4 x i32>* src, 3) + + i32 val0 = LoadStmt(addr0) + i32 val1 = LoadStmt(addr1) + i32 val2 = LoadStmt(addr2) + i32 val3 = LoadStmt(addr3) + + tmp = MatrixInitStmt(val0, val1, val2, val3) + + stmt->replace_all_usages_with(tmp) + */ + template + void scalarize_load_stmt(T *stmt) { + auto src_dtype = stmt->src->ret_type.ptr_removed(); + if (src_dtype->template is()) { + // Needs scalarize + auto src_tensor_type = src_dtype->template as(); + + std::vector matrix_init_values; + int num_elements = src_tensor_type->get_num_elements(); + + for (size_t i = 0; i < num_elements; i++) { + auto const_stmt = std::make_unique( + TypedConstant(get_data_type(), i)); + + auto ptr_offset_stmt = + std::make_unique(stmt->src, const_stmt.get()); + auto scalarized_stmt = std::make_unique(ptr_offset_stmt.get()); + + matrix_init_values.push_back(scalarized_stmt.get()); + + modifier_.insert_before(stmt, std::move(const_stmt)); + modifier_.insert_before(stmt, std::move(ptr_offset_stmt)); + modifier_.insert_before(stmt, std::move(scalarized_stmt)); } - stmt->parent->erase(stmt); + + auto matrix_init_stmt = + std::make_unique(matrix_init_values); + + matrix_init_stmt->ret_type = src_dtype; + + stmt->replace_usages_with(matrix_init_stmt.get()); + modifier_.insert_before(stmt, std::move(matrix_init_stmt)); + + modifier_.erase(stmt); } } @@ -107,6 +168,14 @@ class Scalarize : public IRVisitor { void visit(LocalStoreStmt *stmt) override { scalarize_store_stmt(stmt); } + + void visit(GlobalLoadStmt *stmt) override { + scalarize_load_stmt(stmt); + } + + void visit(LocalLoadStmt *stmt) override { + scalarize_load_stmt(stmt); + } }; namespace irpass { diff --git a/tests/cpp/transforms/scalarize_test.cpp b/tests/cpp/transforms/scalarize_test.cpp index 749fb055e3d988..ab373492f6c0d1 100644 --- a/tests/cpp/transforms/scalarize_test.cpp +++ b/tests/cpp/transforms/scalarize_test.cpp @@ -9,7 +9,7 @@ namespace lang { // Basic tests within a basic block template -void test_scalarize() { +void test_store_scalarize() { TestProgram test_prog; test_prog.setup(); @@ -76,9 +76,69 @@ void test_scalarize() { 
EXPECT_EQ(block->statements[16]->is(), true); } +template +void test_load_scalarize() { + TestProgram test_prog; + test_prog.setup(); + + auto block = std::make_unique(); + + auto func = []() {}; + auto kernel = + std::make_unique(*test_prog.prog(), func, "fake_kernel"); + block->kernel = kernel.get(); + + auto &type_factory = TypeFactory::get_instance(); + + /* + TensorType<4 x i32>* %1 = ExternalPtrStmt() + TensorType<4 x i32> %2 = LoadStmt(%1) + */ + Type *tensor_type = type_factory.get_tensor_type( + {2, 2}, type_factory.get_primitive_type(PrimitiveTypeID::i32)); + auto argload_stmt = block->push_back(0 /*arg_id*/, tensor_type); + + std::vector indices = {}; + Stmt *src_stmt = block->push_back( + argload_stmt, indices); // fake ExternalPtrStmt + src_stmt->ret_type = type_factory.get_pointer_type(tensor_type); + + block->push_back(src_stmt); + + irpass::scalarize(block.get()); + + EXPECT_EQ(block->size(), 1 /*argload*/ + 1 /*external_ptr*/ + 4 /*const*/ + + 4 /*ptroffset*/ + 4 /*load*/ + + 1 /*matrix_init*/); + + // Check for scalarized statements + EXPECT_EQ(block->statements[2]->is(), true); + EXPECT_EQ(block->statements[3]->is(), true); + EXPECT_EQ(block->statements[4]->is(), true); + + EXPECT_EQ(block->statements[5]->is(), true); + EXPECT_EQ(block->statements[6]->is(), true); + EXPECT_EQ(block->statements[7]->is(), true); + + EXPECT_EQ(block->statements[8]->is(), true); + EXPECT_EQ(block->statements[9]->is(), true); + EXPECT_EQ(block->statements[10]->is(), true); + + EXPECT_EQ(block->statements[11]->is(), true); + EXPECT_EQ(block->statements[12]->is(), true); + EXPECT_EQ(block->statements[13]->is(), true); + + EXPECT_EQ(block->statements[14]->is(), true); +} + TEST(Scalarize, ScalarizeStore) { - test_scalarize(); - test_scalarize(); + test_store_scalarize(); + test_store_scalarize(); +} + +TEST(Scalarize, ScalarizeLoad) { + test_load_scalarize(); + test_load_scalarize(); } } // namespace lang diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index a79746f8f40457..715e59c9210adc 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -817,3 +817,41 @@ def foo() -> ti.types.matrix(2, 2, ti.f32): return a @ b.transpose() assert foo() == [[1.0, 2.0], [2.0, 4.0]] + + +@test_utils.test(arch=[ti.cuda, ti.cpu], + real_matrix=True, + real_matrix_scalarize=True) +def test_store_scalarize(): + @ti.kernel + def func(a: ti.types.ndarray()): + for i in range(5): + a[i] = [[i, i + 1], [i + 2, i + 3]] + + x = ti.Matrix.ndarray(2, 2, ti.i32, shape=5) + func(x) + + assert (x[0] == [[0, 1], [2, 3]]) + assert (x[1] == [[1, 2], [3, 4]]) + assert (x[2] == [[2, 3], [4, 5]]) + assert (x[3] == [[3, 4], [5, 6]]) + assert (x[4] == [[4, 5], [6, 7]]) + + +@test_utils.test(arch=[ti.cuda, ti.cpu], + real_matrix=True, + real_matrix_scalarize=True) +def test_load_store_scalarize(): + @ti.kernel + def func(a: ti.types.ndarray()): + for i in range(3): + a[i] = [[i, i + 1], [i + 2, i + 3]] + + a[3] = a[1] + a[4] = a[2] + + x = ti.Matrix.ndarray(2, 2, ti.i32, shape=5) + func(x) + + assert (x[3] == [[1, 2], [3, 4]]) + assert (x[4] == [[2, 3], [4, 5]]) From 5c7d0eb148dc732bf00be9fa9365b42b2b82bcbd Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Sat, 17 Sep 2022 11:20:05 +0800 Subject: [PATCH 12/32] [build] Refactor test infrastructure for AOT tests (#6064) Related issue = # --- tests/test_config.json | 195 +++++++++++++++++++++++++++++++++++ tests/test_utils.py | 228 ++++++----------------------------------- 2 files changed, 227 insertions(+), 196 deletions(-) create mode 
100644 tests/test_config.json diff --git a/tests/test_config.json b/tests/test_config.json new file mode 100644 index 00000000000000..10cd191721e048 --- /dev/null +++ b/tests/test_config.json @@ -0,0 +1,195 @@ +{ +"aot_test_cases" : { + "LlvmAotTest.CpuKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cuda" + ], + "LlvmAotTest.CpuField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cuda" + ], + "LlvmAotTest.CpuDynamic": [ + ["cpp", "aot", "python_scripts", "dynamic_aot_test.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaDynamic": [ + ["cpp", "aot", "python_scripts", "dynamic_aot_test.py"], + "--arch=cuda" + ], + "LlvmAotTest.CpuBitmasked": [ + ["cpp", "aot", "python_scripts", "bitmasked_aot_test.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaBitmasked": [ + ["cpp", "aot", "python_scripts", "bitmasked_aot_test.py"], + "--arch=cuda" + ], + "LlvmCGraph.RunGraphCpu": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cpu" + ], + "LlvmCGraph.RunGraphCuda": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cuda" + ], + "LlvmCGraph.CpuField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cpu --cgraph" + ], + "LlvmCGraph.CudaField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cuda --cgraph" + ], + "LlvmCGraph.Mpm88Cpu": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=cpu --cgraph" + ], + "LlvmCGraph.Mpm88Cuda": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=cuda --cgraph" + ], + "CGraphAotTest.VulkanMpm88": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=vulkan --cgraph" + ], + "CGraphAotTest.OpenglMpm88": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=opengl --cgraph" + ], + "GfxAotTest.VulkanDenseField": [ + ["cpp", "aot", "python_scripts", + "dense_field_aot_test.py"], "--arch=vulkan" + ], + "GfxAotTest.OpenglDenseField": [ + ["cpp", "aot", "python_scripts", + "dense_field_aot_test.py"], "--arch=opengl" + ], + "GfxAotTest.VulkanKernelTest1": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=vulkan" + ], + "GfxAotTest.OpenglKernelTest1": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=opengl" + ], + "GfxAotTest.VulkanKernelTest2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=vulkan" + ], + "GfxAotTest.OpenglKernelTest2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=opengl" + ], + "CGraphAotTest.VulkanRunCGraph1": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=vulkan" + ], + "CGraphAotTest.VulkanRunCGraph2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=vulkan --cgraph" + ], + "CGraphAotTest.OpenglRunCGraph1": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=opengl" + ], + "CGraphAotTest.OpenglRunCGraph2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=opengl --cgraph" + ] +}, + +"capi_aot_test_cases" : { + "CapiMpm88Test.Vulkan": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=vulkan" + ], + "CapiMpm88Test.Opengl": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=opengl" + ], + "CapiMpm88Test.Cuda": [ + ["cpp", "aot", "python_scripts", 
"mpm88_graph_aot.py"], + "--arch=cuda" + ], + "CapiSphTest.Vulkan": [ + ["cpp", "aot", "python_scripts", "sph_aot.py"], + "--arch=vulkan" + ], + "CapiSphTest.Opengl": [ + ["cpp", "aot", "python_scripts", "sph_aot.py"], + "--arch=opengl" + ], + "CapiSphTest.Cuda": [ + ["cpp", "aot", "python_scripts", "sph_aot.py"], + "--arch=cuda" + ], + "CapiCometTest.Cuda": [ + ["cpp", "aot", "python_scripts", "comet_aot.py"], + "--arch=cuda" + ], + "CapiTaichiSparseTest.Cuda": [ + ["cpp", "aot", "python_scripts", "taichi_sparse_test.py"], + "" + ], + "CapiAotTest.CpuField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cpu" + ], + "CapiAotTest.CudaField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cuda" + ], + "CapiGraphTest.CpuGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cpu" + ], + "CapiGraphTest.CudaGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cuda" + ], + "CapiGraphTest.VulkanGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=vulkan" + ], + "CapiGraphTest.VulkanTextureGraph": [ + ["cpp", "aot", "python_scripts", "texture_aot_test.py"], + "--arch=vulkan" + ], + "CapiGraphTest.OpenglGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=opengl" + ], + "CapiAotTest.CpuKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cpu" + ], + "CapiAotTest.CudaKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cuda" + ], + "CapiAotTest.VulkanKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=vulkan" + ], + "CapiAotTest.OpenglKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=opengl" + ], + "CapiDryRun.VulkanAotModule": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=vulkan" + ], + "CapiDryRun.OpenglAotModule": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=opengl" + ] +} +} diff --git a/tests/test_utils.py b/tests/test_utils.py index 26e06fd2afcacf..17a9c0b880b31b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ import copy import functools import itertools +import json import os import pathlib import platform @@ -15,199 +16,34 @@ import taichi as ti -__aot_test_cases = { - "LlvmAotTest.CpuKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cuda" - ], - "LlvmAotTest.CpuField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cuda" - ], - "LlvmAotTest.CpuDynamic": [ - os.path.join('cpp', 'aot', 'python_scripts', 'dynamic_aot_test.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaDynamic": [ - os.path.join('cpp', 'aot', 'python_scripts', 'dynamic_aot_test.py'), - "--arch=cuda" - ], - "LlvmAotTest.CpuBitmasked": [ - os.path.join('cpp', 'aot', 'python_scripts', 'bitmasked_aot_test.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaBitmasked": [ - os.path.join('cpp', 'aot', 'python_scripts', 'bitmasked_aot_test.py'), - "--arch=cuda" - ], - "LlvmCGraph.RunGraphCpu": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cpu" - ], - "LlvmCGraph.RunGraphCuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cuda" - ], - 
"LlvmCGraph.CpuField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cpu --cgraph" - ], - "LlvmCGraph.CudaField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cuda --cgraph" - ], - "LlvmCGraph.Mpm88Cpu": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=cpu --cgraph" - ], - "LlvmCGraph.Mpm88Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=cuda --cgraph" - ], - "CGraphAotTest.VulkanMpm88": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=vulkan --cgraph" - ], - "CGraphAotTest.OpenglMpm88": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=opengl --cgraph" - ], - "GfxAotTest.VulkanDenseField": [ - os.path.join('cpp', 'aot', 'python_scripts', - 'dense_field_aot_test.py'), "--arch=vulkan" - ], - "GfxAotTest.OpenglDenseField": [ - os.path.join('cpp', 'aot', 'python_scripts', - 'dense_field_aot_test.py'), "--arch=opengl" - ], - "GfxAotTest.VulkanKernelTest1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=vulkan" - ], - "GfxAotTest.OpenglKernelTest1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=opengl" - ], - "GfxAotTest.VulkanKernelTest2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=vulkan" - ], - "GfxAotTest.OpenglKernelTest2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=opengl" - ], - "CGraphAotTest.VulkanRunCGraph1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=vulkan" - ], - "CGraphAotTest.VulkanRunCGraph2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=vulkan --cgraph" - ], - "CGraphAotTest.OpenglRunCGraph1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=opengl" - ], - "CGraphAotTest.OpenglRunCGraph2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=opengl --cgraph" - ], -} - -__capi_aot_test_cases = { - "CapiMpm88Test.Vulkan": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=vulkan" - ], - "CapiMpm88Test.Opengl": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=opengl" - ], - "CapiMpm88Test.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=cuda" - ], - "CapiSphTest.Vulkan": [ - os.path.join('cpp', 'aot', 'python_scripts', 'sph_aot.py'), - "--arch=vulkan" - ], - "CapiSphTest.Opengl": [ - os.path.join('cpp', 'aot', 'python_scripts', 'sph_aot.py'), - "--arch=opengl" - ], - "CapiSphTest.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'sph_aot.py'), - "--arch=cuda" - ], - "CapiCometTest.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'comet_aot.py'), - "--arch=cuda" - ], - "CapiTaichiSparseTest.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'taichi_sparse_test.py'), - "" - ], - "CapiAotTest.CpuField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cpu" - ], - "CapiAotTest.CudaField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cuda" - ], - "CapiGraphTest.CpuGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cpu" - ], - "CapiGraphTest.CudaGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cuda" - ], - 
"CapiGraphTest.VulkanGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=vulkan" - ], - "CapiGraphTest.VulkanTextureGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'texture_aot_test.py'), - "--arch=vulkan" - ], - "CapiGraphTest.OpenglGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=opengl" - ], - "CapiAotTest.CpuKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cpu" - ], - "CapiAotTest.CudaKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cuda" - ], - "CapiAotTest.VulkanKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=vulkan" - ], - "CapiAotTest.OpenglKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=opengl" - ], - "CapiDryRun.VulkanAotModule": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=vulkan" - ], - "CapiDryRun.OpenglAotModule": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=opengl" - ], -} + +def parse_test_configs(): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + test_config_path = os.path.join(curr_dir, "test_config.json") + with open(test_config_path, "r") as f: + test_config = json.loads(f.read()) + + assert ("aot_test_cases" in test_config.keys()) + assert ("capi_aot_test_cases" in test_config.keys()) + + for cpp_test_name, value in test_config["aot_test_cases"].items(): + test_paths = value[0] + test_args = value[1] + test_config["aot_test_cases"][cpp_test_name] = [ + os.path.join(*test_paths), test_args + ] + + for cpp_test_name, value in test_config["capi_aot_test_cases"].items(): + test_paths = value[0] + test_args = value[1] + test_config["capi_aot_test_cases"][cpp_test_name] = [ + os.path.join(*test_paths), test_args + ] + + return test_config["aot_test_cases"], test_config["capi_aot_test_cases"] + + +__aot_test_cases, __capi_aot_test_cases = parse_test_configs() def print_aot_test_guide(): @@ -218,14 +54,14 @@ def print_aot_test_guide(): 1. A python script that compiles the Kernels and serialize into file. 2. A C++ test that loads the file then perform execution. -AOT test writer will have to configure your test case for "__aot_test_cases", +AOT test writer will have to configure your test case for "__aot_test_cases" in "test_config.json", the format of which follows: - "cpp_test_name" : ["python_program_path", "--arguments"] + "cpp_test_name" : [["python_program_path"], "--arguments"] For example: - "LlvmProgramTest.FullPipeline": ["cpp/aot/llvm/kernel_aot_test1.py", "--arch=cpu"] + "LlvmProgramTest.FullPipeline": [["cpp", "aot", "llvm", "kernel_aot_test1.py"], "--arch=cpu"] The temporary directory where serialized cache file stays will be generated by run_tests.py. Both python program and C++ tests receives this directory path via environment variable "TAICHI_AOT_FOLDER_PATH". From 33a606a0bc469d738b75d640a412c9acbd901494 Mon Sep 17 00:00:00 2001 From: pengyu <6712304+FantasyVR@users.noreply.github.com> Date: Sat, 17 Sep 2022 20:54:48 +0800 Subject: [PATCH 13/32] [Lang] Sort coo to build correct csr format sparse matrix on GPU (#6050) Related issue = #2906 When building a coo format sparse matrix, the indices are not in order. To build a valid csr format sparse matrix. We need to first sort the coo indices arrays. 
--- taichi/program/sparse_matrix.cpp | 38 ++++++++++++++++++++++-- taichi/rhi/cuda/cuda_types.h | 2 ++ taichi/rhi/cuda/cusparse_functions.inc.h | 6 ++++ tests/python/test_sparse_matrix.py | 6 ++-- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index 3e8096a98d72f9..f142766111c64b 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -203,11 +203,40 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, void *coo_values_ptr, int nnz) { #if defined(TI_WITH_CUDA) + // Step 1: Sort coo first + cusparseHandle_t cusparse_handle = NULL; + CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); + cusparseSpVecDescr_t vec_permutation; + cusparseDnVecDescr_t vec_values; + void *d_permutation = NULL, *d_values_sorted = NULL; + CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int)); + CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float)); + CUSPARSEDriver::get_instance().cpCreateSpVec( + &vec_permutation, nnz, nnz, d_permutation, d_values_sorted, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_values, nnz, coo_values_ptr, + CUDA_R_32F); + size_t bufferSize = 0; + CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt( + cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr, + &bufferSize); + void *dbuffer = NULL; + if (bufferSize > 0) + CUDADriver::get_instance().malloc(&dbuffer, bufferSize); + // Setup permutation vector to identity + CUSPARSEDriver::get_instance().cpCreateIdentityPermutation( + cusparse_handle, nnz, d_permutation); + CUSPARSEDriver::get_instance().cpXcoosortByRow(cusparse_handle, rows_, cols_, + nnz, coo_row_ptr, coo_col_ptr, + d_permutation, dbuffer); + CUSPARSEDriver::get_instance().cpGather(cusparse_handle, vec_values, + vec_permutation); + CUDADriver::get_instance().memcpy_device_to_device( + coo_values_ptr, d_values_sorted, nnz * sizeof(float)); + // Step 2: coo to csr void *csr_row_offset_ptr = NULL; CUDADriver::get_instance().malloc(&csr_row_offset_ptr, sizeof(int) * (rows_ + 1)); - cusparseHandle_t cusparse_handle; - CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); CUSPARSEDriver::get_instance().cpCoo2Csr( cusparse_handle, (void *)coo_row_ptr, nnz, rows_, (void *)csr_row_offset_ptr, CUSPARSE_INDEX_BASE_ZERO); @@ -216,9 +245,14 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, &matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr, coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation); + CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values); CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle); // TODO: free csr_row_offset_ptr // CUDADriver::get_instance().mem_free(csr_row_offset_ptr); + CUDADriver::get_instance().mem_free(d_values_sorted); + CUDADriver::get_instance().mem_free(d_permutation); + CUDADriver::get_instance().mem_free(dbuffer); #endif } diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h index 64a369f5a76b59..88bb33951e3e74 100644 --- a/taichi/rhi/cuda/cuda_types.h +++ b/taichi/rhi/cuda/cuda_types.h @@ -441,8 +441,10 @@ typedef struct cusparseContext *cusparseHandle_t; struct cusparseMatDescr; typedef struct cusparseMatDescr *cusparseMatDescr_t; +struct cusparseSpVecDescr; struct cusparseDnVecDescr; struct cusparseSpMatDescr; +typedef struct cusparseSpVecDescr 
*cusparseSpVecDescr_t; typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t; typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t; typedef enum { diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h index 7476b83c094730..1a70e36de7883e 100644 --- a/taichi/rhi/cuda/cusparse_functions.inc.h +++ b/taichi/rhi/cuda/cusparse_functions.inc.h @@ -13,6 +13,12 @@ PER_CUSPARSE_FUNCTION(cpCreateMatDescr, cusparseCreateMatDescr, cusparseMatDescr PER_CUSPARSE_FUNCTION(cpSetMatType, cusparseSetMatType, cusparseMatDescr_t, cusparseMatrixType_t); PER_CUSPARSE_FUNCTION(cpSetMatIndexBase, cusparseSetMatIndexBase, cusparseMatDescr_t, cusparseIndexBase_t); PER_CUSPARSE_FUNCTION(cpDestroySpMat, cusparseDestroySpMat, cusparseSpMatDescr_t); +PER_CUSPARSE_FUNCTION(cpCreateSpVec, cusparseCreateSpVec, cusparseSpVecDescr_t* ,int ,int,void*,void*,cusparseIndexType_t,cusparseIndexBase_t,cudaDataType); +PER_CUSPARSE_FUNCTION(cpDestroySpVec, cusparseDestroySpVec, cusparseSpVecDescr_t); +PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermutation, cusparseHandle_t, int, void*); +PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*); +PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*); +PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t); // cusparse dense vector description PER_CUSPARSE_FUNCTION(cpCreateDnVec, cusparseCreateDnVec, cusparseDnVecDescr_t*, int, void*, cudaDataType); diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py index 1a1973ee6fa35f..10cf8451a25f03 100644 --- a/tests/python/test_sparse_matrix.py +++ b/tests/python/test_sparse_matrix.py @@ -379,9 +379,9 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), @test_utils.test(arch=ti.cuda) def test_gpu_sparse_matrix(): - h_coo_row = np.asarray([0, 0, 0, 1, 2, 2, 2, 3, 3], dtype=np.int32) - h_coo_col = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32) - h_coo_val = np.asarray([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], + h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32) + h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32) + h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0], dtype=np.float32) h_X = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32) h_Y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np.float32) From 448bf41ab7d7b60ec3fc8cd44c6788627cb974b7 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Sun, 18 Sep 2022 11:16:21 +0800 Subject: [PATCH 14/32] [aot] Pragma once in taichi.cpp (#6088) RT --- c_api/include/taichi/cpp/taichi.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/c_api/include/taichi/cpp/taichi.hpp b/c_api/include/taichi/cpp/taichi.hpp index 8e594753d177fd..82af92bcabca90 100644 --- a/c_api/include/taichi/cpp/taichi.hpp +++ b/c_api/include/taichi/cpp/taichi.hpp @@ -1,4 +1,5 @@ // C++ wrapper of Taichi C-API +#pragma once #include #include #include From 7d24b9e9eb9a3dace71d59b75818aa1e8be5d4a3 Mon Sep 17 00:00:00 2001 From: Bob Cao Date: Sat, 17 Sep 2022 20:17:08 -0700 Subject: [PATCH 15/32] [example] Add RHI examples (#5969) Split from https://github.com/taichi-dev/taichi/pull/5880 --- CMakeLists.txt | 13 +- cpp_examples/rhi_examples/CMakeLists.txt | 31 +++++ cpp_examples/rhi_examples/common.h | 129 ++++++++++++++++++ 
cpp_examples/rhi_examples/sample_1_window.cpp | 21 +++ .../rhi_examples/sample_2_triangle.cpp | 115 ++++++++++++++++ .../rhi_examples/shaders/2_triangle.frag | 9 ++ .../shaders/2_triangle.frag.spv.h | 26 ++++ .../rhi_examples/shaders/2_triangle.vert | 11 ++ .../shaders/2_triangle.vert.spv.h | 47 +++++++ taichi/rhi/vulkan/vulkan_device.cpp | 6 +- 10 files changed, 401 insertions(+), 7 deletions(-) create mode 100644 cpp_examples/rhi_examples/CMakeLists.txt create mode 100644 cpp_examples/rhi_examples/common.h create mode 100644 cpp_examples/rhi_examples/sample_1_window.cpp create mode 100644 cpp_examples/rhi_examples/sample_2_triangle.cpp create mode 100644 cpp_examples/rhi_examples/shaders/2_triangle.frag create mode 100644 cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h create mode 100644 cpp_examples/rhi_examples/shaders/2_triangle.vert create mode 100644 cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b111d69110219..53a4bcc3b584bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,15 +117,12 @@ if (TI_BUILD_TESTS) endif() option(TI_BUILD_EXAMPLES "Build the CPP examples" ON) +option(TI_BUILD_RHI_EXAMPLES "Build the Unified Device API examples" OFF) if(NOT TI_WITH_LLVM OR NOT TI_WITH_METAL) set(TI_BUILD_EXAMPLES OFF) endif() -if (TI_BUILD_EXAMPLES) - include(cmake/TaichiExamples.cmake) -endif() - message("C++ Flags: ${CMAKE_CXX_FLAGS}") message("Build type: ${CMAKE_BUILD_TYPE}") @@ -216,6 +213,14 @@ if (TI_WITH_C_API) endif() endif() +if (TI_BUILD_EXAMPLES) + include(cmake/TaichiExamples.cmake) +endif() + +if (TI_BUILD_RHI_EXAMPLES) + add_subdirectory(cpp_examples/rhi_examples) +endif() + option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) if (TI_WITH_GRAPHVIZ) diff --git a/cpp_examples/rhi_examples/CMakeLists.txt b/cpp_examples/rhi_examples/CMakeLists.txt new file mode 100644 index 00000000000000..5cedaca844d22f --- /dev/null +++ b/cpp_examples/rhi_examples/CMakeLists.txt @@ -0,0 +1,31 @@ +macro(make_sample executable_name src_file) +add_executable(${executable_name}) +set_property(TARGET ${executable_name} PROPERTY CXX_STANDARD 17) +set_property(TARGET ${executable_name} PROPERTY C_STANDARD 17) +target_sources(${executable_name} PRIVATE ${src_file} "common.h") +target_include_directories(${executable_name} +PRIVATE + ${PROJECT_SOURCE_DIR} + + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/FP16/include + ${PROJECT_SOURCE_DIR}/external/SPIRV-Reflect + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} + + ${PROJECT_SOURCE_DIR}/external/volk + ${PROJECT_SOURCE_DIR}/external/Vulkan-Headers/include + ${PROJECT_SOURCE_DIR}/external/glfw/include + ${PROJECT_SOURCE_DIR}/external/glm + ${PROJECT_SOURCE_DIR}/external/imgui +) +target_include_directories(${executable_name} SYSTEM + PUBLIC + ${PROJECT_SOURCE_DIR}/external/VulkanMemoryAllocator/include + ) +target_link_libraries(${executable_name} taichi_c_api glfw) +endmacro() + +make_sample(sample_1_window sample_1_window.cpp) +make_sample(sample_2_triangle sample_2_triangle.cpp) diff --git a/cpp_examples/rhi_examples/common.h b/cpp_examples/rhi_examples/common.h new file mode 100644 index 00000000000000..b379258131d8ce --- /dev/null +++ b/cpp_examples/rhi_examples/common.h @@ -0,0 +1,129 @@ +#pragma once + +#include "taichi/rhi/vulkan/vulkan_device.h" +#include "taichi/rhi/vulkan/vulkan_common.h" +#include "taichi/rhi/vulkan/vulkan_loader.h" +#include 
"taichi/rhi/vulkan/vulkan_device_creator.h" + +#define GLFW_INCLUDE_NONE +#include "GLFW/glfw3.h" +#include "glm/glm.hpp" + +using namespace taichi::lang; + +static void glfw_error_callback(int code, const char *description) { + TI_WARN("GLFW Error {}: {}", code, description); +} + +std::vector get_required_instance_extensions() { + std::vector extensions; + + uint32_t glfw_ext_count = 0; + const char **glfw_extensions; + glfw_extensions = glfwGetRequiredInstanceExtensions(&glfw_ext_count); + + for (int i = 0; i < glfw_ext_count; ++i) { + extensions.push_back(glfw_extensions[i]); + } + // VulkanDeviceCreator will check that these are supported + extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + + return extensions; +} + +std::vector get_required_device_extensions() { + static std::vector extensions{ + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + }; + + return extensions; +} + +class App { + public: + App(int width, int height, const std::string &title) { + TI_INFO("Creating App '{}' of {}x{}", title, width, height); + + TI_ASSERT(taichi::lang::vulkan::is_vulkan_api_available()); + + if (glfwInit()) { + TI_INFO("Initialized GLFW"); + + glfwSetErrorCallback(glfw_error_callback); + + glfwWindowHint(GLFW_VISIBLE, GLFW_TRUE); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfw_window = + glfwCreateWindow(width, height, "Sample Window", nullptr, nullptr); + + TI_INFO("Initialized GLFWWindow"); + } else { + TI_ERROR("failed to init GLFW"); + } + + { + vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = std::nullopt; + evd_params.additional_instance_extensions = + get_required_instance_extensions(); + evd_params.additional_device_extensions = + get_required_device_extensions(); + evd_params.is_for_ui = true; + evd_params.surface_creator = [&](VkInstance instance) -> VkSurfaceKHR { + VkSurfaceKHR surface = VK_NULL_HANDLE; + + if (glfwCreateWindowSurface(instance, glfw_window, nullptr, &surface) != + VK_SUCCESS) { + TI_ERROR("failed to create window surface!"); + } + return surface; + }; + evd_params.enable_validation_layer = true; + + device_creator = + std::make_unique(evd_params); + device = device_creator->device(); + + TI_INFO("Initialized VulkanDevice"); + } + + { + SurfaceConfig config; + config.window_handle = glfw_window; + config.native_surface_handle = device_creator->get_surface(); + + surface = device->create_surface(config); + } + } + + virtual ~App() { + surface.reset(); + device_creator.reset(); + glfwDestroyWindow(glfw_window); + glfwTerminate(); + } + + virtual std::vector render_loop( + StreamSemaphore image_available_semaphore) { + return {}; + } + + void run() { + while (!glfwWindowShouldClose(glfw_window)) { + auto image_available_semaphore = surface->acquire_next_image(); + + glfwPollEvents(); + + surface->present_image(render_loop(image_available_semaphore)); + } + } + + public: + // Owned + GLFWwindow *glfw_window; + std::unique_ptr device_creator; + std::unique_ptr surface; + + // Weak references + vulkan::VulkanDevice *device; +}; diff --git a/cpp_examples/rhi_examples/sample_1_window.cpp b/cpp_examples/rhi_examples/sample_1_window.cpp new file mode 100644 index 00000000000000..c1e5d0272a5dc8 --- /dev/null +++ b/cpp_examples/rhi_examples/sample_1_window.cpp @@ -0,0 +1,21 @@ +#include "common.h" + +class SampleApp : public App { + public: + SampleApp() : App(1920, 1080, "Sample 1: Window") { + } + + std::vector render_loop( + StreamSemaphore image_available_semaphore) override { + return {}; + } + + public: +}; + +int main() { + 
std::unique_ptr app = std::make_unique(); + app->run(); + + return 0; +} diff --git a/cpp_examples/rhi_examples/sample_2_triangle.cpp b/cpp_examples/rhi_examples/sample_2_triangle.cpp new file mode 100644 index 00000000000000..8b7b84d4de827d --- /dev/null +++ b/cpp_examples/rhi_examples/sample_2_triangle.cpp @@ -0,0 +1,115 @@ +#include "common.h" + +std::vector frag_spv = +#include "shaders/2_triangle.frag.spv.h" + ; + +std::vector vert_spv = +#include "shaders/2_triangle.vert.spv.h" + ; + +struct Vertex { + glm::vec2 pos; + glm::vec3 color; +}; + +class SampleApp : public App { + public: + SampleApp() : App(1920, 1080, "Sample 2: Triangle") { + // Create the triangle raster pipeline + { + // Load the SPIRV source + std::vector src_desc(2); + + src_desc[0].data = (void *)frag_spv.data(); + src_desc[0].size = frag_spv.size() * sizeof(uint32_t); + src_desc[0].type = PipelineSourceType::spirv_binary; + src_desc[0].stage = PipelineStageType::fragment; + + src_desc[1].data = (void *)vert_spv.data(); + src_desc[1].size = vert_spv.size() * sizeof(uint32_t); + src_desc[1].type = PipelineSourceType::spirv_binary; + src_desc[1].stage = PipelineStageType::vertex; + + // Setup rasterizer parameters + RasterParams raster_params; // use default + + // Setup vertex input parameters + std::vector vertex_inputs = { + {.binding = 0, .stride = sizeof(Vertex), .instance = false}}; + std::vector vertex_attrs = { + {.location = 0, + .binding = 0, + .format = BufferFormat::rg32f, + .offset = offsetof(Vertex, pos)}, + {.location = 1, + .binding = 0, + .format = BufferFormat::rgb32f, + .offset = offsetof(Vertex, color)}}; + + // Create pipeline + pipeline = device->create_raster_pipeline(src_desc, raster_params, + vertex_inputs, vertex_attrs); + } + + // Create the vertex buffer + { + vertex_buffer = device->allocate_memory_unique( + Device::AllocParams{.size = 3 * sizeof(Vertex), + .host_write = true, + .usage = AllocUsage::Vertex}); + Vertex *mapped = (Vertex *)device->map(*vertex_buffer); + mapped[0] = {{0.0, 0.5}, {1.0, 0.0, 0.0}}; + mapped[1] = {{0.5, -0.5}, {0.0, 1.0, 0.0}}; + mapped[2] = {{-0.5, -0.5}, {0.0, 0.0, 1.0}}; + device->unmap(*vertex_buffer); + } + + TI_INFO("App Init Done"); + } + + std::vector render_loop( + StreamSemaphore image_available_semaphore) override { + auto cmdlist = device->get_graphics_stream()->new_command_list(); + + // Set-up our frame buffer attachment + DeviceAllocation surface_image = surface->get_target_image(); + cmdlist->image_transition(surface_image, ImageLayout::undefined, + ImageLayout::color_attachment); + + // Renderpass: render to surface image, clear color values + bool clear = true; + std::vector clear_color = {0.1, 0.2, 0.3, 1.0}; + const auto &[width, height] = surface->get_size(); + cmdlist->begin_renderpass(0, 0, width, height, 1, &surface_image, &clear, + &clear_color, nullptr, false); + + // Bind our triangle pipeline + cmdlist->bind_pipeline(pipeline.get()); + // Get the binder and bind our vertex buffer + auto resource_binder = pipeline->resource_binder(); + resource_binder->vertex_buffer(vertex_buffer->get_ptr(0), 0); + cmdlist->bind_resources(resource_binder); + // Render the triangle + cmdlist->draw(3, 0); + // End rendering + cmdlist->end_renderpass(); + + // Submit command list, returns render complete semaphore + auto render_complete_semaphore = device->get_graphics_stream()->submit( + cmdlist.get(), {image_available_semaphore}); + return {render_complete_semaphore}; + } + + public: + std::unique_ptr pipeline; + + std::unique_ptr vertex_buffer; +}; + 
+int main() { + std::unique_ptr app = std::make_unique(); + app->run(); + + return 0; +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.frag b/cpp_examples/rhi_examples/shaders/2_triangle.frag new file mode 100644 index 00000000000000..cb913b404f108e --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.frag @@ -0,0 +1,9 @@ +#version 460 + +layout(location = 0) in vec3 color; + +layout(location = 0) out vec4 frag_output; + +void main() { + frag_output = vec4(color, 1.0); +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h b/cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h new file mode 100644 index 00000000000000..aeb0437777d39e --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h @@ -0,0 +1,26 @@ +{ + 0x07230203, 0x00010000, 0x000d000a, 0x00000013, 0x00000000, 0x00020011, + 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, + 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0007000f, 0x00000004, + 0x00000004, 0x6e69616d, 0x00000000, 0x00000009, 0x0000000c, 0x00030010, + 0x00000004, 0x00000007, 0x00030003, 0x00000002, 0x000001cc, 0x000a0004, + 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, + 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, + 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, + 0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 0x00000009, 0x67617266, + 0x74756f5f, 0x00747570, 0x00040005, 0x0000000c, 0x6f6c6f63, 0x00000072, + 0x00040047, 0x00000009, 0x0000001e, 0x00000000, 0x00040047, 0x0000000c, + 0x0000001e, 0x00000000, 0x00020013, 0x00000002, 0x00030021, 0x00000003, + 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, + 0x00000006, 0x00000004, 0x00040020, 0x00000008, 0x00000003, 0x00000007, + 0x0004003b, 0x00000008, 0x00000009, 0x00000003, 0x00040017, 0x0000000a, + 0x00000006, 0x00000003, 0x00040020, 0x0000000b, 0x00000001, 0x0000000a, + 0x0004003b, 0x0000000b, 0x0000000c, 0x00000001, 0x0004002b, 0x00000006, + 0x0000000e, 0x3f800000, 0x00050036, 0x00000002, 0x00000004, 0x00000000, + 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x0000000a, 0x0000000d, + 0x0000000c, 0x00050051, 0x00000006, 0x0000000f, 0x0000000d, 0x00000000, + 0x00050051, 0x00000006, 0x00000010, 0x0000000d, 0x00000001, 0x00050051, + 0x00000006, 0x00000011, 0x0000000d, 0x00000002, 0x00070050, 0x00000007, + 0x00000012, 0x0000000f, 0x00000010, 0x00000011, 0x0000000e, 0x0003003e, + 0x00000009, 0x00000012, 0x000100fd, 0x00010038 +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.vert b/cpp_examples/rhi_examples/shaders/2_triangle.vert new file mode 100644 index 00000000000000..120141945591eb --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.vert @@ -0,0 +1,11 @@ +#version 460 + +layout(location = 0) in vec2 v_position; +layout(location = 1) in vec3 v_color; + +layout(location = 0) out vec3 color; + +void main() { + gl_Position = vec4(v_position, 0.0, 1.0); + color = v_color; +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h b/cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h new file mode 100644 index 00000000000000..ec77d9d4d12933 --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h @@ -0,0 +1,47 @@ +{ + 0x07230203, 0x00010000, 0x000d000a, 0x00000021, 0x00000000, 0x00020011, + 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, + 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0009000f, 0x00000000, + 0x00000004, 0x6e69616d, 
0x00000000, 0x0000000d, 0x00000012, 0x0000001d, + 0x0000001f, 0x00030003, 0x00000002, 0x000001cc, 0x000a0004, 0x475f4c47, + 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, + 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, + 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, + 0x6e69616d, 0x00000000, 0x00060005, 0x0000000b, 0x505f6c67, 0x65567265, + 0x78657472, 0x00000000, 0x00060006, 0x0000000b, 0x00000000, 0x505f6c67, + 0x7469736f, 0x006e6f69, 0x00070006, 0x0000000b, 0x00000001, 0x505f6c67, + 0x746e696f, 0x657a6953, 0x00000000, 0x00070006, 0x0000000b, 0x00000002, + 0x435f6c67, 0x4470696c, 0x61747369, 0x0065636e, 0x00070006, 0x0000000b, + 0x00000003, 0x435f6c67, 0x446c6c75, 0x61747369, 0x0065636e, 0x00030005, + 0x0000000d, 0x00000000, 0x00050005, 0x00000012, 0x6f705f76, 0x69746973, + 0x00006e6f, 0x00040005, 0x0000001d, 0x6f6c6f63, 0x00000072, 0x00040005, + 0x0000001f, 0x6f635f76, 0x00726f6c, 0x00050048, 0x0000000b, 0x00000000, + 0x0000000b, 0x00000000, 0x00050048, 0x0000000b, 0x00000001, 0x0000000b, + 0x00000001, 0x00050048, 0x0000000b, 0x00000002, 0x0000000b, 0x00000003, + 0x00050048, 0x0000000b, 0x00000003, 0x0000000b, 0x00000004, 0x00030047, + 0x0000000b, 0x00000002, 0x00040047, 0x00000012, 0x0000001e, 0x00000000, + 0x00040047, 0x0000001d, 0x0000001e, 0x00000000, 0x00040047, 0x0000001f, + 0x0000001e, 0x00000001, 0x00020013, 0x00000002, 0x00030021, 0x00000003, + 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, + 0x00000006, 0x00000004, 0x00040015, 0x00000008, 0x00000020, 0x00000000, + 0x0004002b, 0x00000008, 0x00000009, 0x00000001, 0x0004001c, 0x0000000a, + 0x00000006, 0x00000009, 0x0006001e, 0x0000000b, 0x00000007, 0x00000006, + 0x0000000a, 0x0000000a, 0x00040020, 0x0000000c, 0x00000003, 0x0000000b, + 0x0004003b, 0x0000000c, 0x0000000d, 0x00000003, 0x00040015, 0x0000000e, + 0x00000020, 0x00000001, 0x0004002b, 0x0000000e, 0x0000000f, 0x00000000, + 0x00040017, 0x00000010, 0x00000006, 0x00000002, 0x00040020, 0x00000011, + 0x00000001, 0x00000010, 0x0004003b, 0x00000011, 0x00000012, 0x00000001, + 0x0004002b, 0x00000006, 0x00000014, 0x00000000, 0x0004002b, 0x00000006, + 0x00000015, 0x3f800000, 0x00040020, 0x00000019, 0x00000003, 0x00000007, + 0x00040017, 0x0000001b, 0x00000006, 0x00000003, 0x00040020, 0x0000001c, + 0x00000003, 0x0000001b, 0x0004003b, 0x0000001c, 0x0000001d, 0x00000003, + 0x00040020, 0x0000001e, 0x00000001, 0x0000001b, 0x0004003b, 0x0000001e, + 0x0000001f, 0x00000001, 0x00050036, 0x00000002, 0x00000004, 0x00000000, + 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x00000010, 0x00000013, + 0x00000012, 0x00050051, 0x00000006, 0x00000016, 0x00000013, 0x00000000, + 0x00050051, 0x00000006, 0x00000017, 0x00000013, 0x00000001, 0x00070050, + 0x00000007, 0x00000018, 0x00000016, 0x00000017, 0x00000014, 0x00000015, + 0x00050041, 0x00000019, 0x0000001a, 0x0000000d, 0x0000000f, 0x0003003e, + 0x0000001a, 0x00000018, 0x0004003d, 0x0000001b, 0x00000020, 0x0000001f, + 0x0003003e, 0x0000001d, 0x00000020, 0x000100fd, 0x00010038 +} diff --git a/taichi/rhi/vulkan/vulkan_device.cpp b/taichi/rhi/vulkan/vulkan_device.cpp index d1bf10886ec84d..4e1b30fa0d6af9 100644 --- a/taichi/rhi/vulkan/vulkan_device.cpp +++ b/taichi/rhi/vulkan/vulkan_device.cpp @@ -2403,12 +2403,12 @@ void VulkanSurface::create_swap_chain() { extent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, extent.height)); - TI_INFO("Creating suface of {}x{}", width, height); + TI_INFO("Creating 
suface of {}x{}", extent.width, extent.height); VkImageUsageFlags usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT; - this->width_ = width; - this->height_ = height; + this->width_ = extent.width; + this->height_ = extent.height; VkSwapchainCreateInfoKHR createInfo; createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; From 5e5321e5dc702820994ac3b79378565083ea2fda Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Sun, 18 Sep 2022 18:47:04 -0700 Subject: [PATCH 16/32] [ci] Update prebuild binary for llvm 15. (#6091) Build based on release/15.x branch and cherry-picked some patch for DX12. Issue: #5276 ### Brief Summary --- .github/workflows/scripts/win_build.ps1 | 2 +- ci/windows/win_build_test.ps1 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/win_build.ps1 b/.github/workflows/scripts/win_build.ps1 index 38ecd156a97530..dba6d15e0cc9cf 100644 --- a/.github/workflows/scripts/win_build.ps1 +++ b/.github/workflows/scripts/win_build.ps1 @@ -62,7 +62,7 @@ if (!$llvmVer.CompareTo("10")) { } else { if (-not (Test-Path "taichi_llvm_15")) { WriteInfo("Download and extract LLVM") - curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang_220731/taichi-llvm-15.0.0-msvc2019.zip -LO + curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang/taichi-llvm-15.0.0-msvc2019.zip -LO if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x taichi-llvm-15.0.0-msvc2019.zip -otaichi_llvm_15 } diff --git a/ci/windows/win_build_test.ps1 b/ci/windows/win_build_test.ps1 index 2e75e991e371a8..ed32d138bbe6d6 100644 --- a/ci/windows/win_build_test.ps1 +++ b/ci/windows/win_build_test.ps1 @@ -32,7 +32,7 @@ if (!$llvmVer.CompareTo("10")) { } else { if (-not (Test-Path "taichi_llvm_15")) { WriteInfo("Download and extract LLVM") - curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang_220731/taichi-llvm-15.0.0-msvc2019.zip -LO + curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang/taichi-llvm-15.0.0-msvc2019.zip -LO if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x taichi-llvm-15.0.0-msvc2019.zip -otaichi_llvm_15 } From 81995fe4791070316e0f3215609500256e0e69ba Mon Sep 17 00:00:00 2001 From: PGZXB Date: Mon, 19 Sep 2022 10:24:41 +0800 Subject: [PATCH 17/32] [bug] Fix crashing when loading old offline cache files (#6089) Issue: #4401, fixes #6081 In future, if necessary, we should maintain a version number for offline cache instead of using the Taichi version directly. 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- cmake/TaichiTests.cmake | 3 +- taichi/cache/gfx/cache_manager.cpp | 24 ++-- taichi/common/serialization.h | 24 +++- taichi/runtime/llvm/llvm_offline_cache.cpp | 48 ++------ taichi/runtime/llvm/llvm_offline_cache.h | 1 + taichi/util/io.h | 46 +++++++ taichi/util/offline_cache.h | 86 ++++++++++---- .../cpp/offline_cache/load_metadata_test.cpp | 112 ++++++++++++++++++ 8 files changed, 265 insertions(+), 79 deletions(-) create mode 100644 tests/cpp/offline_cache/load_metadata_test.cpp diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 8655ce7ecb5e97..569de8d71cfa9a 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -20,7 +20,8 @@ file(GLOB_RECURSE TAICHI_TESTS_SOURCE "tests/cpp/llvm/*.cpp" "tests/cpp/program/*.cpp" "tests/cpp/struct/*.cpp" - "tests/cpp/transforms/*.cpp") + "tests/cpp/transforms/*.cpp" + "tests/cpp/offline_cache/*.cpp") if (TI_WITH_OPENGL OR TI_WITH_VULKAN) file(GLOB TAICHI_TESTS_GFX_UTILS_SOURCE diff --git a/taichi/cache/gfx/cache_manager.cpp b/taichi/cache/gfx/cache_manager.cpp index 833d43ad9ee992..f3284cee0f0c91 100644 --- a/taichi/cache/gfx/cache_manager.cpp +++ b/taichi/cache/gfx/cache_manager.cpp @@ -47,13 +47,6 @@ struct CacheCleanerUtils { using MetadataType = gfx::CacheManager::Metadata; using KernelMetaData = MetadataType::KernelMetadata; - // To load metadata from file - static bool load_metadata(const CacheCleanerConfig &config, - MetadataType &result) { - return read_from_binary_file( - result, taichi::join_path(config.path, config.metadata_filename)); - } - // To save metadata as file static bool save_metadata(const CacheCleanerConfig &config, const MetadataType &data) { @@ -81,13 +74,6 @@ struct CacheCleanerUtils { return true; } - // To check version - static bool check_version(const CacheCleanerConfig &config, - const Version &version) { - return version[0] == TI_VERSION_MAJOR && version[1] == TI_VERSION_MINOR && - version[2] == TI_VERSION_PATCH; - } - // To get cache files name static std::vector get_cache_files( const CacheCleanerConfig &config, @@ -106,6 +92,12 @@ struct CacheCleanerUtils { taichi::join_path(config.path, kDebuggingAotMetadataFilename)); taichi::remove(taichi::join_path(config.path, kGraphMetadataFilename)); } + + // To check if a file is cache file + static bool is_valid_cache_file(const CacheCleanerConfig &config, + const std::string &name) { + return filename_extension(name) == "spv"; + } }; } // namespace offline_cache @@ -184,10 +176,12 @@ void CacheManager::dump_with_merging() const { cache_builder->dump(path_, ""); // Update offline_cache_metadata.tcb + using offline_cache::load_metadata_with_checking; + using Error = offline_cache::LoadMetadataError; Metadata old_data; const auto filename = taichi::join_path(path_, kOfflineCacheMetadataFilename); - if (read_from_binary_file(old_data, filename)) { + if (load_metadata_with_checking(old_data, filename) == Error::kNoError) { for (auto &[k, v] : offline_cache_metadata_.kernels) { auto iter = old_data.kernels.find(k); if (iter != old_data.kernels.end()) { // Update diff --git a/taichi/common/serialization.h b/taichi/common/serialization.h index 84d77a6c165dd7..8458c1b7514d11 100644 --- a/taichi/common/serialization.h +++ b/taichi/common/serialization.h @@ -345,15 +345,20 @@ class BinarySerializer : public Serializer { preserved = 0; } + template + typename std::enable_if::type retrieve_length() { + return *reinterpret_cast(c_data); + } + void finalize() { - if 
(writing) { + if constexpr (writing) { if (c_data) { *reinterpret_cast(&c_data[0]) = head; } else { *reinterpret_cast(&data[0]) = head; } } else { - assert(head == *reinterpret_cast(c_data)); + assert(head == retrieve_length()); } } @@ -880,6 +885,21 @@ operator<<(std::ostream &os, const T &t) { } // Returns true if deserialization succeeded. +template +bool read_from_binary(T &t, + const void *bin, + std::size_t len, + bool match_all = true) { + BinaryInputSerializer reader; + reader.initialize(const_cast(bin)); + if (len != reader.retrieve_length()) { + return false; + } + reader(t); + auto head = reader.head; + return match_all ? head == len : head <= len; +} + template bool read_from_binary_file(T &t, const std::string &file_name) { BinaryInputSerializer reader; diff --git a/taichi/runtime/llvm/llvm_offline_cache.cpp b/taichi/runtime/llvm/llvm_offline_cache.cpp index fed16a0911d710..cf742239f54b22 100644 --- a/taichi/runtime/llvm/llvm_offline_cache.cpp +++ b/taichi/runtime/llvm/llvm_offline_cache.cpp @@ -27,13 +27,6 @@ using Format = LlvmOfflineCache::Format; constexpr char kMetadataFilename[] = "metadata"; constexpr char kMetadataFileLockName[] = "metadata.lock"; -static bool is_current_llvm_cache_version( - const LlvmOfflineCache::Version &ver) { - // TODO(PGZXB): Do more detailed checking - return ver[0] == TI_VERSION_MAJOR && ver[1] == TI_VERSION_MINOR && - ver[2] == TI_VERSION_PATCH; -} - static std::string get_llvm_cache_metadata_file_path(const std::string &dir) { return taichi::join_path(dir, std::string(kMetadataFilename) + ".tcb"); } @@ -60,13 +53,6 @@ struct CacheCleanerUtils { using MetadataType = LlvmOfflineCache; using KernelMetaData = typename MetadataType::KernelMetadata; - // To load metadata from file - static bool load_metadata(const CacheCleanerConfig &config, - MetadataType &result) { - return read_from_binary_file( - result, taichi::join_path(config.path, config.metadata_filename)); - } - // To save metadata as file static bool save_metadata(const CacheCleanerConfig &config, const MetadataType &data) { @@ -84,12 +70,6 @@ struct CacheCleanerUtils { return true; } - // To check version - static bool check_version(const CacheCleanerConfig &config, - const Version &version) { - return is_current_llvm_cache_version(version); - } - // To get cache files name static std::vector get_cache_files( const CacheCleanerConfig &config, @@ -106,6 +86,13 @@ struct CacheCleanerUtils { static void remove_other_files(const CacheCleanerConfig &config) { // Do nothing } + + // To check if a file is cache file + static bool is_valid_cache_file(const CacheCleanerConfig &config, + const std::string &name) { + std::string ext = filename_extension(name); + return ext == "ll" || ext == "bc"; + } }; } // namespace offline_cache @@ -126,20 +113,12 @@ bool LlvmOfflineCacheFileReader::load_meta_data( LlvmOfflineCache &data, const std::string &cache_file_path, bool with_lock) { + using offline_cache::load_metadata_with_checking; + using Error = offline_cache::LoadMetadataError; const auto tcb_path = get_llvm_cache_metadata_file_path(cache_file_path); - { - // No the best way to check for filepath existence, but whatever... 
See - // https://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exists-using-standard-c-c11-14-17-c - std::ifstream fs(tcb_path, std::ios::in | std::ios::binary); - if (!fs.good()) { - TI_DEBUG("LLVM cache {} does not exist", cache_file_path); - return false; - } - } if (!with_lock) { - read_from_binary_file(data, tcb_path); - return true; + return Error::kNoError == load_metadata_with_checking(data, tcb_path); } std::string lock_path = @@ -150,8 +129,7 @@ bool LlvmOfflineCacheFileReader::load_meta_data( TI_WARN("Unlock {} failed", lock_path); } }); - read_from_binary_file(data, tcb_path); - return true; + return Error::kNoError == load_metadata_with_checking(data, tcb_path); } TI_WARN("Lock {} failed", lock_path); return false; @@ -389,10 +367,6 @@ void LlvmOfflineCacheFileWriter::mangle_offloaded_task_name( for (auto &offload : compiled_data.tasks) { std::string mangled_name = offline_cache::mangle_name(offload.name, kernel_key); - TI_DEBUG( - "Mangle offloaded-task from internal name '{}' to offline cache " - "key '{}'", - offload.name, mangled_name); auto func = compiled_data.module->getFunction(offload.name); TI_ASSERT(func != nullptr); func->setName(mangled_name); diff --git a/taichi/runtime/llvm/llvm_offline_cache.h b/taichi/runtime/llvm/llvm_offline_cache.h index c4eac6cc4fc078..5142da03bb4a10 100644 --- a/taichi/runtime/llvm/llvm_offline_cache.h +++ b/taichi/runtime/llvm/llvm_offline_cache.h @@ -100,6 +100,7 @@ struct LlvmOfflineCache { std::unordered_map kernels; // key = kernel_name + // NOTE: The "version" must be the first field to be serialized TI_IO_DEF(version, size, fields, kernels); }; diff --git a/taichi/util/io.h b/taichi/util/io.h index 30c41f91516929..616ce342301a58 100644 --- a/taichi/util/io.h +++ b/taichi/util/io.h @@ -14,6 +14,11 @@ #if defined(TI_PLATFORM_WINDOWS) #include +#else // POSIX +#include +#include +#include +#include #endif TI_NAMESPACE_BEGIN @@ -51,6 +56,47 @@ inline bool remove(const std::string &path) { return std::remove(path.c_str()) == 0; } +template // void(const std::string &name, bool is_dir) +inline bool traverse_directory(const std::string &dir, Visitor v) { +#if defined(TI_PLATFORM_WINDOWS) + namespace fs = std::filesystem; + std::error_code ec{}; + auto iter = fs::directory_iterator(dir, ec); + if (ec) { + return false; + } + for (auto &f : iter) { + v(f.path().filename().string(), f.is_directory()); + } + return true; +#else // POSIX + struct dirent *f = nullptr; + DIR *directory = ::opendir(dir.c_str()); + if (!directory) { + return false; + } + while ((f = ::readdir(directory))) { + struct stat *stat_buf = nullptr; + auto fullpath = join_path(dir, f->d_name); + auto ret = ::stat(fullpath.c_str(), stat_buf); + TI_ASSERT(ret == 0); + v(f->d_name, S_ISDIR(stat_buf->st_mode)); + } + auto ret = ::closedir(directory); + TI_ASSERT(ret == 0); + return true; +#endif +} + +inline std::string filename_extension(const std::string &filename) { + std::string postfix; + auto pos = filename.find_last_of('.'); + if (pos != std::string::npos) { + postfix = filename.substr(pos + 1); + } + return postfix; +} + template void write_to_disk(const T &dat, std::string fn) { FILE *f = fopen(fn.c_str(), "wb"); diff --git a/taichi/util/offline_cache.h b/taichi/util/offline_cache.h index 961c17b75392f3..ff760de1996a95 100644 --- a/taichi/util/offline_cache.h +++ b/taichi/util/offline_cache.h @@ -9,6 +9,7 @@ #include "taichi/common/core.h" #include "taichi/common/cleanup.h" +#include "taichi/common/version.h" #include "taichi/util/io.h" #include 
"taichi/util/lock.h" @@ -60,9 +61,46 @@ struct Metadata { std::size_t size{0}; // byte std::unordered_map kernels; + // NOTE: The "version" must be the first field to be serialized TI_IO_DEF(version, size, kernels); }; +enum class LoadMetadataError { + kNoError, + kCorrupted, + kFileNotFound, + kVersionNotMatched, +}; + +template +inline LoadMetadataError load_metadata_with_checking( + MetadataType &result, + const std::string &filepath) { + if (!taichi::path_exists(filepath)) { + TI_DEBUG("Offline cache metadata file {} not found", filepath); + return LoadMetadataError::kFileNotFound; + } + + using VerType = std::remove_reference_t; + static_assert(std::is_same_v); + const std::vector bytes = read_data_from_file(filepath); + + VerType ver{}; + if (!read_from_binary(ver, bytes.data(), bytes.size(), false)) { + return LoadMetadataError::kCorrupted; + } + if (ver[0] != TI_VERSION_MAJOR || ver[1] != TI_VERSION_MINOR || + ver[2] != TI_VERSION_PATCH) { + TI_DEBUG("The offline cache metadata file {} is old (version={}.{}.{})", + filepath, ver[0], ver[1], ver[2]); + return LoadMetadataError::kVersionNotMatched; + } + + return !read_from_binary(result, bytes.data(), bytes.size()) + ? LoadMetadataError::kCorrupted + : LoadMetadataError::kNoError; +} + struct CacheCleanerConfig { std::string path; CleanCachePolicy policy; @@ -77,12 +115,6 @@ template struct CacheCleanerUtils { using KernelMetaData = typename MetadataType::KernelMetadata; - // To load metadata from file - static bool load_metadata(const CacheCleanerConfig &config, - MetadataType &result) { - TI_NOT_IMPLEMENTED; - } - // To save metadata as file static bool save_metadata(const CacheCleanerConfig &config, const MetadataType &data) { @@ -94,12 +126,6 @@ struct CacheCleanerUtils { TI_NOT_IMPLEMENTED; } - // To check version - static bool check_version(const CacheCleanerConfig &config, - const Version &version) { - TI_NOT_IMPLEMENTED; - } - // To get cache files name static std::vector get_cache_files( const CacheCleanerConfig &config, @@ -111,6 +137,12 @@ struct CacheCleanerUtils { static void remove_other_files(const CacheCleanerConfig &config) { TI_NOT_IMPLEMENTED; } + + // To check if a file is cache file + static bool is_valid_cache_file(const CacheCleanerConfig &config, + const std::string &name) { + TI_NOT_IMPLEMENTED; + } }; template @@ -158,19 +190,25 @@ class CacheCleaner { }); TI_DEBUG("Start cleaning cache"); - if (!Utils::load_metadata(config, cache_data)) { + using Error = LoadMetadataError; + Error error = load_metadata_with_checking(cache_data, metadata_file); + if (error == Error::kFileNotFound) { return; - } - - if ((policy & CleanOldVersion) && - !Utils::check_version(config, cache_data.version)) { - if (taichi::remove(metadata_file)) { - taichi::remove(debugging_metadata_file); - Utils::remove_other_files(config); - for (const auto &[k, v] : cache_data.kernels) { - for (const auto &f : Utils::get_cache_files(config, v)) { - taichi::remove(taichi::join_path(path, f)); - } + } else if (error == Error::kCorrupted || + error == Error::kVersionNotMatched) { + if (policy & + CleanOldVersion) { // Remove cache files and metadata files + TI_DEBUG("Removing all cache files"); + if (taichi::remove(metadata_file)) { + taichi::remove(debugging_metadata_file); + Utils::remove_other_files(config); + bool success = taichi::traverse_directory( + config.path, [&config](const std::string &name, bool is_dir) { + if (!is_dir && Utils::is_valid_cache_file(config, name)) { + taichi::remove(taichi::join_path(config.path, name)); + } + }); + 
TI_ASSERT(success); } } return; diff --git a/tests/cpp/offline_cache/load_metadata_test.cpp b/tests/cpp/offline_cache/load_metadata_test.cpp new file mode 100644 index 00000000000000..9db7d7de92d9a7 --- /dev/null +++ b/tests/cpp/offline_cache/load_metadata_test.cpp @@ -0,0 +1,112 @@ +#include "gtest/gtest.h" +#include "taichi/common/version.h" +#include "taichi/util/offline_cache.h" + +#ifdef TI_WITH_LLVM +#include "taichi/runtime/llvm/llvm_offline_cache.h" +#endif // TI_WITH_LLVM + +namespace taichi { +namespace lang { + +namespace { + +namespace oc = offline_cache; + +inline void gen_old_version(oc::Version &ver) { + auto &[major, minor, patch] = ver; + major = std::max(TI_VERSION_MAJOR - 1, 0); + minor = std::max(TI_VERSION_MINOR - 1, 0); + patch = std::max(TI_VERSION_PATCH - 1, 0); +} + +template +MetadataType gen_metadata(const oc::Version &ver) { + MetadataType result; + result.size = 1024; + std::copy(std::begin(ver), std::end(ver), std::begin(result.version)); + result.kernels["1"] = {}; + result.kernels["2"] = {}; + return result; +} + +template +MetadataType gen_old_metadata() { + oc::Version old_ver{}; + gen_old_version(old_ver); + return gen_metadata(old_ver); +} + +template +MetadataType gen_correct_metadata() { + oc::Version ver{TI_VERSION_MAJOR, TI_VERSION_MINOR, TI_VERSION_PATCH}; + return gen_metadata(ver); +} + +template +void load_metadata_test() { + std::string fake_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + std::string old_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + std::string corrupted_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + std::string true_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + + // Generate metadata & Save as file + write_to_binary_file(gen_correct_metadata(), true_file); + // Generate old metadata & Save as file + write_to_binary_file(gen_old_metadata(), old_file); + // Generate corrupted metadata file + write_to_binary_file(gen_correct_metadata(), corrupted_file); + std::ofstream(corrupted_file, std::ios::app | std::ios::binary) + << "I-AM-BAD-BYTES" << std::flush; + + using Error = oc::LoadMetadataError; + Error error = Error::kNoError; + + // Load a non-existing metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, fake_file); + EXPECT_EQ(error, Error::kFileNotFound); + } + // Load a old metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, old_file); + EXPECT_EQ(error, Error::kVersionNotMatched); + } + // Load a corrupted metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, corrupted_file); + EXPECT_EQ(error, Error::kCorrupted); + } + // Load a correct metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, true_file); + auto [major, minor, patch] = data.version; + EXPECT_EQ(error, Error::kNoError); + EXPECT_EQ(major, TI_VERSION_MAJOR); + EXPECT_EQ(minor, TI_VERSION_MINOR); + EXPECT_EQ(patch, TI_VERSION_PATCH); + EXPECT_EQ(data.size, 1024); + EXPECT_TRUE(data.kernels.count("1")); + EXPECT_TRUE(data.kernels.count("2")); + } + + taichi::remove(old_file); + taichi::remove(corrupted_file); + taichi::remove(true_file); +} + +} // namespace + +TEST(OfflineCache, LoadMetadata) { +#ifdef TI_WITH_LLVM + load_metadata_test(); +#endif // TI_WITH_LLVM + load_metadata_test(); +} + +} // namespace lang +} // namespace taichi From 1d70030c3a9433ba4928a585be79ae5226e33c85 Mon Sep 17 00:00:00 2001 From: Ailing Date: Mon, 19 Sep 2022 10:28:25 +0800 Subject: [PATCH 18/32] [vulkan] Support printing in 
debug mode on vulkan backend (#6075) Fixes #2713 This is a long waited debugging utils that we finally get it done. Note since debug printing requires validation layer support, you'll need to have Vulkan SDK installed on your system to use it properly. For example https://vulkan.lunarg.com/doc/sdk/1.3.224.1/linux/getting_started_ubuntu.html. Also to enable debug printing on vulkan backend you'll need to make sure `ti.init(..., debug=True)` so that we enable the validation layer properly. Nit: our test_print.py doesn't really validate the printed message but I've run them with `-v -s` to verify the outputs visually. Note: validation layer is not available on molten-vk so vulkan printing on macos won't work. TODO: - support on opengl backend. Co-authored-by: PENGUINLIONG --- taichi/codegen/spirv/spirv_codegen.cpp | 25 ++++++++++++-- taichi/codegen/spirv/spirv_ir_builder.cpp | 17 ++++++++++ taichi/codegen/spirv/spirv_ir_builder.h | 20 +++++++++++ taichi/inc/rhi_constants.inc.h | 1 + taichi/rhi/vulkan/vulkan_device_creator.cpp | 34 +++++++++++++++++-- .../program_impls/vulkan/vulkan_program.cpp | 5 +++ tests/python/test_print.py | 28 +++++++-------- 7 files changed, 110 insertions(+), 20 deletions(-) diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 85995d20fb4907..8eba20eeca0c59 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -2,6 +2,7 @@ #include #include +#include #include "taichi/program/program.h" #include "taichi/program/kernel.h" @@ -146,8 +147,28 @@ class TaskCodegen : public IRVisitor { } } - void visit(PrintStmt *print_stmt) override { - TI_WARN("Printing is not yet supported in Vulkan"); + void visit(PrintStmt *stmt) override { + if (!device_->get_cap(DeviceCapability::spirv_has_non_semantic_info)) { + return; + } + + std::string formats; + std::vector vals; + + for (auto const &content : stmt->contents) { + if (std::holds_alternative(content)) { + auto arg_stmt = std::get(content); + TI_ASSERT(!arg_stmt->ret_type->is()); + + auto value = ir_->query_value(arg_stmt->raw_name()); + vals.push_back(value); + formats += data_type_format(arg_stmt->ret_type); + } else { + auto arg_str = std::get(content); + formats += arg_str; + } + } + ir_->call_debugprintf(formats, vals); } void visit(ConstStmt *const_stmt) override { diff --git a/taichi/codegen/spirv/spirv_ir_builder.cpp b/taichi/codegen/spirv/spirv_ir_builder.cpp index f9d217d4d0696f..2398c247c53d51 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.cpp +++ b/taichi/codegen/spirv/spirv_ir_builder.cpp @@ -80,6 +80,12 @@ void IRBuilder::init_header() { .add("SPV_KHR_storage_buffer_storage_class") .commit(&header_); + if (device_->get_cap(cap::spirv_has_non_semantic_info)) { + ib_.begin(spv::OpExtension) + .add("SPV_KHR_non_semantic_info") + .commit(&header_); + } + if (device_->get_cap(cap::spirv_has_variable_ptr)) { ib_.begin(spv::OpExtension) .add("SPV_KHR_variable_pointers") @@ -125,6 +131,7 @@ std::vector IRBuilder::finalize() { data.insert(data.end(), header_.begin(), header_.end()); data.insert(data.end(), entry_.begin(), entry_.end()); data.insert(data.end(), exec_mode_.begin(), exec_mode_.end()); + data.insert(data.end(), strings_.begin(), strings_.end()); data.insert(data.end(), debug_.begin(), debug_.end()); data.insert(data.end(), decorate_.begin(), decorate_.end()); data.insert(data.end(), global_.begin(), global_.end()); @@ -135,6 +142,10 @@ std::vector IRBuilder::finalize() { void IRBuilder::init_pre_defs() { ext_glsl450_ = 
ext_inst_import("GLSL.std.450"); + if (device_->get_cap(cap::spirv_has_non_semantic_info)) { + debug_printf_ = ext_inst_import("NonSemantic.DebugPrintf"); + } + t_bool_ = declare_primitive_type(get_data_type()); if (device_->get_cap(cap::spirv_has_int8)) { t_int8_ = declare_primitive_type(get_data_type()); @@ -207,6 +218,12 @@ void IRBuilder::init_pre_defs() { const_i32_one_ = int_immediate_number(t_int32_, 1); } +Value IRBuilder::make_string(std::string s) { + Value val = new_value(SType(), ValueKind::kNormal); + ib_.begin(spv::OpString).add_seq(val, s).commit(&strings_); + return val; +} + PhiValue IRBuilder::make_phi(const SType &out_type, uint32_t num_incoming) { Value val = new_value(out_type, ValueKind::kNormal); ib_.begin(spv::OpPhi).add_seq(out_type, val); diff --git a/taichi/codegen/spirv/spirv_ir_builder.h b/taichi/codegen/spirv/spirv_ir_builder.h index fcf76c8471c522..57eaa6b76da3fd 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.h +++ b/taichi/codegen/spirv/spirv_ir_builder.h @@ -294,6 +294,8 @@ class IRBuilder { double value, bool cache = true); + Value make_string(std::string str); + // Match zero type Value get_zero(const SType &stype) { TI_ASSERT(stype.flag == TypeKind::kPrimitive); @@ -460,6 +462,18 @@ class IRBuilder { return val; } + // Create a debugPrintf call + void call_debugprintf(std::string formats, const std::vector &args) { + Value format_str = make_string(formats); + Value val = new_value(t_void_, ValueKind::kNormal); + ib_.begin(spv::OpExtInst) + .add_seq(t_void_, val, debug_printf_, 1, format_str); + for (const auto &arg : args) { + ib_.add(arg); + } + ib_.commit(&function_); + } + // Local allocate, load, store methods Value alloca_variable(const SType &type); Value alloca_workgroup_array(const SType &type); @@ -551,6 +565,9 @@ class IRBuilder { // glsl 450 extension Value ext_glsl450_; + // debugprint extension + Value debug_printf_; + SType t_bool_; SType t_int8_; SType t_int16_; @@ -602,6 +619,9 @@ class IRBuilder { std::vector entry_; // Header segment std::vector exec_mode_; + // OpString segment + std::vector strings_; + // TODO: Rename this to names_ in a followup PR // Debug segment std::vector debug_; // Annotation segment diff --git a/taichi/inc/rhi_constants.inc.h b/taichi/inc/rhi_constants.inc.h index 023e94356b4336..d5d60dc725dadb 100644 --- a/taichi/inc/rhi_constants.inc.h +++ b/taichi/inc/rhi_constants.inc.h @@ -28,6 +28,7 @@ PER_DEVICE_CAPABILITY(spirv_has_subgroup_basic) PER_DEVICE_CAPABILITY(spirv_has_subgroup_vote) PER_DEVICE_CAPABILITY(spirv_has_subgroup_arithmetic) PER_DEVICE_CAPABILITY(spirv_has_subgroup_ballot) +PER_DEVICE_CAPABILITY(spirv_has_non_semantic_info) // Graphics Caps PER_DEVICE_CAPABILITY(wide_lines) #endif diff --git a/taichi/rhi/vulkan/vulkan_device_creator.cpp b/taichi/rhi/vulkan/vulkan_device_creator.cpp index 8910934a65f36a..1bd8f522198d10 100644 --- a/taichi/rhi/vulkan/vulkan_device_creator.cpp +++ b/taichi/rhi/vulkan/vulkan_device_creator.cpp @@ -46,7 +46,16 @@ vk_debug_callback(VkDebugUtilsMessageSeverityFlagBitsEXT message_severity, const VkDebugUtilsMessengerCallbackDataEXT *p_callback_data, void *p_user_data) { if (message_severity > VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { - TI_WARN("validation layer: {}", p_callback_data->pMessage); + TI_WARN("validation layer: {}, {}", message_type, + p_callback_data->pMessage); + } + if (message_type == VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT && + message_severity == VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT && + strstr(p_callback_data->pMessage, 
"DEBUG-PRINTF") != NULL) { + // Message format is "BLABLA | MessageID=xxxxx | " + std::string msg(p_callback_data->pMessage); + auto const pos = msg.find_last_of("|"); + std::cout << msg.substr(pos + 2); } return VK_FALSE; } @@ -56,6 +65,7 @@ void populate_debug_messenger_create_info( *create_info = {}; create_info->sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; create_info->messageSeverity = + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; @@ -256,8 +266,10 @@ void VulkanDeviceCreator::create_instance(bool manual_create) { create_info.pApplicationInfo = &app_info; if (params_.enable_validation_layer) { - TI_ASSERT_INFO(check_validation_layer_support(), - "validation layers requested but not available"); + if (!check_validation_layer_support()) { + TI_WARN("validation layers requested but not available, turning off..."); + params_.enable_validation_layer = false; + } } VkDebugUtilsMessengerCreateInfoEXT debug_create_info{}; @@ -273,6 +285,19 @@ void VulkanDeviceCreator::create_instance(bool manual_create) { create_info.pNext = nullptr; } + // Response to `DebugPrintf`. + VkValidationFeaturesEXT vf = {}; + if (params_.enable_validation_layer) { + std::array vfes = { + VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT}; + + vf.sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT; + vf.pNext = create_info.pNext; + vf.enabledValidationFeatureCount = vfes.size(); + vf.pEnabledValidationFeatures = vfes.data(); + create_info.pNext = &vf; + } + std::unordered_set extensions; for (auto ext : get_required_extensions(params_.enable_validation_layer)) { extensions.insert(std::string(ext)); @@ -515,6 +540,9 @@ void VulkanDeviceCreator::create_logical_device(bool manual_create) { enabled_extensions.push_back(ext.extensionName); } else if (name == VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME) { enabled_extensions.push_back(ext.extensionName); + } else if (name == VK_KHR_SHADER_NON_SEMANTIC_INFO_EXTENSION_NAME) { + ti_device_->set_cap(DeviceCapability::spirv_has_non_semantic_info, true); + enabled_extensions.push_back(ext.extensionName); } else if (std::find(params_.additional_device_extensions.begin(), params_.additional_device_extensions.end(), name) != params_.additional_device_extensions.end()) { diff --git a/taichi/runtime/program_impls/vulkan/vulkan_program.cpp b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp index f256c73128b815..13562b7743c8ec 100644 --- a/taichi/runtime/program_impls/vulkan/vulkan_program.cpp +++ b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp @@ -130,6 +130,11 @@ void VulkanProgramImpl::materialize_runtime(MemoryPool *memory_pool, int32_t patch = std::atoll(config->vk_api_version.c_str() + idot2 + 1); evd_params.api_version = VK_MAKE_API_VERSION(0, major, minor, patch); } + + if (config->debug) { + TI_WARN("Enabling vulkan validation layer in debug mode"); + evd_params.enable_validation_layer = true; + } #if !defined(ANDROID) if (glfw_window) { // then we should be able to create a device with graphics abilities diff --git a/tests/python/test_print.py b/tests/python/test_print.py index df3328149345ce..49244dd5f14ff1 100644 --- a/tests/python/test_print.py +++ b/tests/python/test_print.py @@ -3,6 +3,9 @@ import taichi as ti from tests import test_utils +#TODO: validation layer support on macos vulkan backend is not working. +vk_on_mac = (ti.vulkan, 'Darwin') + # Not really testable.. 
# Just making sure it does not crash @@ -23,8 +26,7 @@ def func(): # TODO: As described by @k-ye above, what we want to ensure # is that, the content shows on console is *correct*. -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_multi_print(): @ti.kernel def func(x: ti.i32, y: ti.f32): @@ -34,8 +36,8 @@ def func(x: ti.i32, y: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +# TODO: vulkan doesn't support %s but we should ignore it instead of crashing. +@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_print_string(): @ti.kernel def func(x: ti.i32, y: ti.f32): @@ -47,8 +49,7 @@ def func(x: ti.i32, y: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_matrix(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=()) y = ti.Vector.field(3, dtype=ti.f32, shape=3) @@ -64,8 +65,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_sep_end(): @ti.kernel def func(): @@ -85,8 +85,7 @@ def func(): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_multiple_threads(): x = ti.field(dtype=ti.f32, shape=(128, )) @@ -102,8 +101,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_list(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3)) y = ti.Vector.field(3, dtype=ti.f32, shape=()) @@ -124,7 +122,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(arch=ti.cpu) +@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True) def test_python_scope_print_field(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=()) y = ti.Vector.field(3, dtype=ti.f32, shape=3) @@ -135,7 +133,7 @@ def test_python_scope_print_field(): print(z) -@test_utils.test(arch=ti.cpu) +@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True) def test_print_string_format(): @ti.kernel def func(k: ti.f32): @@ -151,7 +149,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(arch=ti.cpu) +@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True) def test_print_fstring(): def foo1(x): return x + 1 From fe20fd77b718a57ea387d425afcda99ee4b5592a Mon Sep 17 00:00:00 2001 From: Bob Cao Date: Sun, 18 Sep 2022 20:44:51 -0700 Subject: [PATCH 19/32] [gui] Direct image presentation & faster direct copy routine (#6085) Split from https://github.com/taichi-dev/taichi/pull/5880 Depending on #5969 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/shaders/SetImage_vk.frag | 9 +- python/taichi/shaders/SetImage_vk_frag.spv | Bin 1052 -> 1412 bytes python/taichi/shaders/SetImage_vk_vert.spv | Bin 1156 -> 1244 bytes python/taichi/ui/canvas.py | 16 +- taichi/codegen/spirv/spirv_codegen.cpp | 31 ++-- taichi/program/program.cpp | 6 + taichi/program/program.h | 9 ++ taichi/program/program_impl.h | 15 ++ taichi/program/texture.h | 8 + taichi/python/export_ggui.cpp | 5 + taichi/runtime/gfx/runtime.cpp | 17 +++ taichi/runtime/gfx/runtime.h | 5 + 
.../program_impls/vulkan/vulkan_program.cpp | 6 + .../program_impls/vulkan/vulkan_program.h | 4 + taichi/system/memory_pool.h | 1 + taichi/ui/backends/vulkan/canvas.cpp | 4 + taichi/ui/backends/vulkan/canvas.h | 2 + taichi/ui/backends/vulkan/renderable.cpp | 54 ++++--- .../backends/vulkan/renderables/set_image.cpp | 139 ++++++++++++++---- .../backends/vulkan/renderables/set_image.h | 12 +- taichi/ui/backends/vulkan/renderer.cpp | 6 + taichi/ui/backends/vulkan/renderer.h | 2 + taichi/ui/common/canvas_base.h | 9 ++ tests/python/test_ggui.py | 2 +- 24 files changed, 285 insertions(+), 77 deletions(-) diff --git a/python/taichi/shaders/SetImage_vk.frag b/python/taichi/shaders/SetImage_vk.frag index 19715a7327145a..0ad5b40dbf4571 100644 --- a/python/taichi/shaders/SetImage_vk.frag +++ b/python/taichi/shaders/SetImage_vk.frag @@ -9,11 +9,10 @@ layout(location = 0) out vec4 out_color; layout(binding = 1) uniform UBO { float x_factor; float y_factor; -} -ubo; + int is_transposed; +} ubo; void main() { - vec2 coord = frag_texcoord.yx * vec2(ubo.y_factor,ubo.x_factor); - out_color = texture(texSampler, coord); - // out_color = vec4(frag_texcoord.xy,0,1); + vec2 coord = frag_texcoord * vec2(ubo.x_factor,ubo.y_factor); + out_color = texture(texSampler, ubo.is_transposed != 0 ? coord.yx : coord); } diff --git a/python/taichi/shaders/SetImage_vk_frag.spv b/python/taichi/shaders/SetImage_vk_frag.spv index 42ea34c3a07010e6be6f17c999aba45d3bbf6ed0..750ef2e42a3b2c76eb8f6e31dc4ffea3c63535dc 100644 GIT binary patch literal 1412 zcmY+C>uS_c6ot<;bE&P>*50hunrXdP#|kQfC@4t$F))Q9{tVMhDuHw|Ofu3wfKTIF z_)=a#aDB-+8V_vt-s`Nh_9gM=_BzHi%z{}mPfT@I%$yKo7EQbKLI0@V%ZpL(>9YeE zmyA~u;aoNy)0D?fhsl_WMcH-PvUuPspvaS|G2is4scJz#3P3% zaU714X&e>F=NR66jhBq0bT+CJT4L1el3!4bD9ffJ`Ih>pmy_x6Q&_}j|8P$;81v!n zK#X=7D>Y}~$uKIiDY3(u@%%r$Q}UT5$-`ng9OsiPkE?o~W;SMaoT+xJ+(9-g!YE5i zQ7~WkYUOBnI!R+y3e+<_?#MXr#i2LOUgiE`^a94rjz=G0^RhrY<}Dby^s+1CxV)rt4r=Huk*>_b5^B_ZnylBRgDeBZDGD_Q}6lCvL+`>jrTpC!sQxToED`{D7v zOSL9*Ima62IYwiD&t~raarTq*T}^ta^S#!v?*`^6vAxU@yQ{K?*t5XTOi+ki`CV^qj)(@Jch3n_-BEAr`#IY+0D;u4)cxkTJq|}zRCJ?;Awmv zGk>SRGX=KaEU{YamG|?+9_L-bd*|-duj2i~TgZ;w6xz$AFJ?#_SH-R*<*6PWut1&JpMPQHS94yoX~KV$C7eVFgu zbC`UoSs+&9PD;%pvHTfz?6HKIC;wJ(%f$Bf?tsaiQ_dQ`bp!LA<=pO=E6a5NXRQnR diff --git a/python/taichi/ui/canvas.py b/python/taichi/ui/canvas.py index 5f09f359d6d944..b1783dff3a2b32 100644 --- a/python/taichi/ui/canvas.py +++ b/python/taichi/ui/canvas.py @@ -1,3 +1,7 @@ +from taichi._lib import core as _ti_core +from taichi.lang import impl +from taichi.lang._texture import Texture + from .staging_buffer import (copy_colors_to_vbo, copy_vertices_to_vbo, get_vbo_field, to_rgba8) from .utils import get_field_info @@ -28,9 +32,15 @@ def set_image(self, img): img (numpy.ndarray, :class:`~taichi.MatrixField`, :class:`~taichi.Field`, :class:`~taichi.Texture`): \ the image to be shown. """ - staging_img = to_rgba8(img) - info = get_field_info(staging_img) - self.canvas.set_image(info) + is_texture = isinstance(img, Texture) + prog_is_vk = impl.pytaichi.prog.config.arch == _ti_core.Arch.vulkan + # FIXME: Remove this hack. 
Maybe add a query function for whether the texture can be presented + if is_texture and prog_is_vk: + self.canvas.set_image_texture(img.tex) + else: + staging_img = to_rgba8(img) + info = get_field_info(staging_img) + self.canvas.set_image(info) def triangles(self, vertices, diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 8eba20eeca0c59..6b2119c151ebb7 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -2321,6 +2321,7 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs, std::vector optimized_spv(task_res.spirv_code); size_t last_size; + bool success = true; do { last_size = optimized_spv.size(); bool result = false; @@ -2328,25 +2329,31 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs, (result = !spirv_opt_->Run(optimized_spv.data(), optimized_spv.size(), &optimized_spv, spirv_opt_options_)), "SPIRV optimization failed"); - if (result) + if (result) { + success = false; break; + } } while (last_size != optimized_spv.size()); TI_TRACE("SPIRV-Tools-opt: binary size, before={}, after={}", task_res.spirv_code.size(), optimized_spv.size()); // Enable to dump SPIR-V assembly of kernels -#if 0 - std::string spirv_asm; - spirv_tools_->Disassemble(optimized_spv, &spirv_asm); - auto kernel_name = tp.ti_kernel_name; - TI_WARN("SPIR-V Assembly dump for {} :\n{}\n\n", kernel_name, spirv_asm); - - std::ofstream fout(kernel_name + ".spv", std::ios::binary | std::ios::out); - fout.write(reinterpret_cast(optimized_spv.data()), - optimized_spv.size() * sizeof(uint32_t)); - fout.close(); -#endif + if constexpr (false) { + std::vector &spirv = + success ? optimized_spv : task_res.spirv_code; + + std::string spirv_asm; + spirv_tools_->Disassemble(optimized_spv, &spirv_asm); + auto kernel_name = tp.ti_kernel_name; + TI_WARN("SPIR-V Assembly dump for {} :\n{}\n\n", kernel_name, spirv_asm); + + std::ofstream fout(kernel_name + ".spv", + std::ios::binary | std::ios::out); + fout.write(reinterpret_cast(spirv.data()), + spirv.size() * sizeof(uint32_t)); + fout.close(); + } kernel_attribs.tasks_attribs.push_back(std::move(task_res.task_attribs)); generated_spirv.push_back(std::move(optimized_spv)); diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index acfa4264a9d363..e0af071f09cb32 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -541,5 +541,11 @@ void Program::prepare_runtime_context(RuntimeContext *ctx) { program_impl_->prepare_runtime_context(ctx); } +void Program::enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + program_impl_->enqueue_compute_op_lambda(op, image_refs); +} + } // namespace lang } // namespace taichi diff --git a/taichi/program/program.h b/taichi/program/program.h index 329237993e7041..e2dc1ef5f0d8ea 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -331,6 +331,15 @@ class TI_DLL_EXPORT Program { void prepare_runtime_context(RuntimeContext *ctx); + /** Enqueue a custom compute op to the current program execution flow. 
+ * + * @params op The lambda that is invoked to construct the custom compute Op + * @params image_refs The image resource references used in this compute Op + */ + void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs); + /** * TODO(zhanlue): Remove this interface * diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h index b140b50b760ca9..4f2a5d5a473810 100644 --- a/taichi/program/program_impl.h +++ b/taichi/program/program_impl.h @@ -13,6 +13,15 @@ namespace taichi { namespace lang { +// Represents an image resource reference for a compute/render Op +struct ComputeOpImageRef { + DeviceAllocation image; + // The requested initial layout of the image, when Op is invoked + ImageLayout initial_layout; + // The final layout the image will be in once Op finishes + ImageLayout final_layout; +}; + struct RuntimeContext; class ProgramImpl { @@ -128,6 +137,12 @@ class ProgramImpl { virtual void prepare_runtime_context(RuntimeContext *ctx) { } + virtual void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + TI_NOT_IMPLEMENTED; + } + virtual void print_memory_profiler_info( std::vector> &snode_trees_, uint64 *result_buffer) { diff --git a/taichi/program/texture.h b/taichi/program/texture.h index 0b54f6a3487506..7b439a79eac40b 100644 --- a/taichi/program/texture.h +++ b/taichi/program/texture.h @@ -46,6 +46,14 @@ class TI_DLL_EXPORT Texture { ~Texture(); + BufferFormat get_buffer_format() const { + return format_; + } + + std::array get_size() const { + return {width_, height_, depth_}; + } + private: DeviceAllocation texture_alloc_{kDeviceNullAllocation}; DataType dtype_; diff --git a/taichi/python/export_ggui.cpp b/taichi/python/export_ggui.cpp index bd6ad0fc885ac5..b68e60f67a3d6a 100644 --- a/taichi/python/export_ggui.cpp +++ b/taichi/python/export_ggui.cpp @@ -290,6 +290,10 @@ struct PyCanvas { canvas->set_image({img}); } + void set_image_texture(Texture *texture) { + canvas->set_image(texture); + } + void scene(PyScene &scene) { canvas->scene(scene.scene); } @@ -499,6 +503,7 @@ void export_ggui(py::module &m) { py::class_(m, "PyCanvas") .def("set_background_color", &PyCanvas::set_background_color) .def("set_image", &PyCanvas::set_image) + .def("set_image_texture", &PyCanvas::set_image_texture) .def("triangles", &PyCanvas::triangles) .def("lines", &PyCanvas::lines) .def("circles", &PyCanvas::circles) diff --git a/taichi/runtime/gfx/runtime.cpp b/taichi/runtime/gfx/runtime.cpp index 9d5333512a9e5a..273e9a35157ec8 100644 --- a/taichi/runtime/gfx/runtime.cpp +++ b/taichi/runtime/gfx/runtime.cpp @@ -702,6 +702,23 @@ size_t GfxRuntime::get_root_buffer_size(int id) const { return it->second; } +void GfxRuntime::enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + for (const auto &ref : image_refs) { + TI_ASSERT(last_image_layouts_.find(ref.image.alloc_id) != + last_image_layouts_.end()); + transition_image(ref.image, ref.initial_layout); + } + + ensure_current_cmdlist(); + op(device_, current_cmdlist_.get()); + + for (const auto &ref : image_refs) { + last_image_layouts_[ref.image.alloc_id] = ref.final_layout; + } +} + GfxRuntime::RegisterParams run_codegen( Kernel *kernel, Device *device, diff --git a/taichi/runtime/gfx/runtime.h b/taichi/runtime/gfx/runtime.h index 810299b83e0c51..9e9116fbc78b2f 100644 --- a/taichi/runtime/gfx/runtime.h +++ b/taichi/runtime/gfx/runtime.h @@ -11,6 +11,7 @@ #include "taichi/program/compile_config.h" #include "taichi/struct/snode_tree.h" #include 
"taichi/program/snode_expr_utils.h" +#include "taichi/program/program_impl.h" namespace taichi { namespace lang { @@ -124,6 +125,10 @@ class TI_DLL_EXPORT GfxRuntime { size_t get_root_buffer_size(int id) const; + void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs); + private: friend class taichi::lang::gfx::SNodeTreeManager; diff --git a/taichi/runtime/program_impls/vulkan/vulkan_program.cpp b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp index 13562b7743c8ec..9d1ac33268ce73 100644 --- a/taichi/runtime/program_impls/vulkan/vulkan_program.cpp +++ b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp @@ -212,6 +212,12 @@ std::unique_ptr VulkanProgramImpl::make_aot_kernel( std::move(params)); } +void VulkanProgramImpl::enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + vulkan_runtime_->enqueue_compute_op_lambda(op, image_refs); +} + void VulkanProgramImpl::dump_cache_data_to_disk() { const auto &mgr = get_cache_manager(); mgr->clean_offline_cache(offline_cache::string_to_clean_cache_policy( diff --git a/taichi/runtime/program_impls/vulkan/vulkan_program.h b/taichi/runtime/program_impls/vulkan/vulkan_program.h index 754983a3c2a9a8..9a8edd8b2232a8 100644 --- a/taichi/runtime/program_impls/vulkan/vulkan_program.h +++ b/taichi/runtime/program_impls/vulkan/vulkan_program.h @@ -93,6 +93,10 @@ class VulkanProgramImpl : public ProgramImpl { std::unique_ptr make_aot_kernel(Kernel &kernel) override; + void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) override; + void dump_cache_data_to_disk() override; const std::unique_ptr &get_cache_manager(); diff --git a/taichi/system/memory_pool.h b/taichi/system/memory_pool.h index c41185f1bbc4a1..25da2e840ddf03 100644 --- a/taichi/system/memory_pool.h +++ b/taichi/system/memory_pool.h @@ -3,6 +3,7 @@ #include "taichi/system/unified_allocator.h" #define TI_RUNTIME_HOST #include "taichi/runtime/llvm/runtime_module/mem_request.h" +#undef TI_RUNTIME_HOST #include "taichi/rhi/device.h" #include #include diff --git a/taichi/ui/backends/vulkan/canvas.cpp b/taichi/ui/backends/vulkan/canvas.cpp index b2c7a2af52c7a7..55aa7900549f7c 100644 --- a/taichi/ui/backends/vulkan/canvas.cpp +++ b/taichi/ui/backends/vulkan/canvas.cpp @@ -18,6 +18,10 @@ void Canvas::set_image(const SetImageInfo &info) { renderer_->set_image(info); } +void Canvas::set_image(Texture *tex) { + renderer_->set_image(tex); +} + void Canvas::triangles(const TrianglesInfo &info) { renderer_->triangles(info); } diff --git a/taichi/ui/backends/vulkan/canvas.h b/taichi/ui/backends/vulkan/canvas.h index 287c1953621434..e4e6c12da8e19a 100644 --- a/taichi/ui/backends/vulkan/canvas.h +++ b/taichi/ui/backends/vulkan/canvas.h @@ -43,6 +43,8 @@ class TI_DLL_EXPORT Canvas final : public CanvasBase { virtual void set_image(const SetImageInfo &info) override; + virtual void set_image(taichi::lang::Texture *tex) override; + virtual void triangles(const TrianglesInfo &info) override; virtual void circles(const CirclesInfo &info) override; diff --git a/taichi/ui/backends/vulkan/renderable.cpp b/taichi/ui/backends/vulkan/renderable.cpp index 81e92b2fc8d659..01050ec168c550 100644 --- a/taichi/ui/backends/vulkan/renderable.cpp +++ b/taichi/ui/backends/vulkan/renderable.cpp @@ -39,12 +39,39 @@ void Renderable::init_buffers() { create_bindings(); } +void copy_helper(Program *prog, + DevicePtr dst, + DevicePtr src, + DevicePtr staging, + size_t size) { + if (prog && dst.device == src.device && + dst.device == 
prog->get_graphics_device()) { + prog->enqueue_compute_op_lambda( + [=](Device *device, CommandList *cmdlist) { + cmdlist->buffer_barrier(src); + cmdlist->buffer_copy(dst, src, size); + cmdlist->buffer_barrier(dst); + }, + {}); + } else { + Device::MemcpyCapability memcpy_cap = + Device::check_memcpy_capability(dst, src, size); + if (memcpy_cap == Device::MemcpyCapability::Direct) { + Device::memcpy_direct(dst, src, size); + } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { + Device::memcpy_via_staging(dst, staging, src, size); + } else { + TI_NOT_IMPLEMENTED; + } + } +} + void Renderable::update_data(const RenderableInfo &info) { TI_ASSERT(info.vbo_attrs == config_.vbo_attrs); // We might not have a current program if GGUI is used in external apps to // load AOT modules Program *prog = app_context_->prog(); - if (prog) { + if (prog && prog->get_graphics_device() != &app_context_->device()) { prog->flush(); } @@ -114,18 +141,8 @@ void Renderable::update_data(const RenderableInfo &info) { } const uint64_t vbo_size = config_.vbo_size() * num_vertices; - - Device::MemcpyCapability memcpy_cap = Device::check_memcpy_capability( - vertex_buffer_.get_ptr(), vbo_dev_ptr, vbo_size); - if (memcpy_cap == Device::MemcpyCapability::Direct) { - Device::memcpy_direct(vertex_buffer_.get_ptr(), vbo_dev_ptr, vbo_size); - } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { - Device::memcpy_via_staging(vertex_buffer_.get_ptr(), - staging_vertex_buffer_.get_ptr(), vbo_dev_ptr, - vbo_size); - } else { - TI_NOT_IMPLEMENTED; - } + copy_helper(prog, vertex_buffer_.get_ptr(0), vbo_dev_ptr, + staging_vertex_buffer_.get_ptr(), vbo_size); if (info.indices.valid) { indexed_ = true; @@ -134,15 +151,8 @@ void Renderable::update_data(const RenderableInfo &info) { ibo_dev_ptr = get_device_ptr(prog, info.indices.snode); } uint64_t ibo_size = num_indices * sizeof(int); - if (memcpy_cap == Device::MemcpyCapability::Direct) { - Device::memcpy_direct(index_buffer_.get_ptr(), ibo_dev_ptr, ibo_size); - } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { - Device::memcpy_via_staging(index_buffer_.get_ptr(), - staging_index_buffer_.get_ptr(), ibo_dev_ptr, - ibo_size); - } else { - TI_NOT_IMPLEMENTED; - } + copy_helper(prog, index_buffer_.get_ptr(), ibo_dev_ptr, + staging_index_buffer_.get_ptr(), ibo_size); } } diff --git a/taichi/ui/backends/vulkan/renderables/set_image.cpp b/taichi/ui/backends/vulkan/renderables/set_image.cpp index e0d3e331ccd239..b56514a687e86b 100644 --- a/taichi/ui/backends/vulkan/renderables/set_image.cpp +++ b/taichi/ui/backends/vulkan/renderables/set_image.cpp @@ -1,6 +1,7 @@ #include "set_image.h" #include "taichi/program/program.h" +#include "taichi/program/texture.h" #include "taichi/ui/utils/utils.h" using taichi::lang::Program; @@ -20,8 +21,8 @@ int SetImage::get_correct_dimension(int dimension) { } } -void SetImage::update_ubo(float x_factor, float y_factor) { - UniformBufferObject ubo = {x_factor, y_factor}; +void SetImage::update_ubo(float x_factor, float y_factor, bool transpose) { + UniformBufferObject ubo = {x_factor, y_factor, int(transpose)}; void *mapped = app_context_->device().map(uniform_buffer_); memcpy(mapped, &ubo, sizeof(ubo)); app_context_->device().unmap(uniform_buffer_); @@ -31,10 +32,7 @@ void SetImage::update_data(const SetImageInfo &info) { // We might not have a current program if GGUI is used in external apps to // load AOT modules Program *prog = app_context_->prog(); - StreamSemaphore 
data_ready_sema{nullptr}; - if (prog) { - data_ready_sema = prog->flush(); - } + StreamSemaphore sema = nullptr; const FieldInfo &img = info.img; @@ -57,19 +55,22 @@ void SetImage::update_data(const SetImageInfo &info) { int new_width = get_correct_dimension(img.shape[0]); int new_height = get_correct_dimension(img.shape[1]); - if (new_width != width || new_height != height) { + BufferFormat fmt = BufferFormat::rgba8; + if (texture_dtype_ == taichi::lang::PrimitiveType::f32) { + fmt = BufferFormat::rgba32f; + } + + if (new_width != width || new_height != height || fmt != format_) { destroy_texture(); free_buffers(); - init_set_image(app_context_, new_width, new_height); + init_set_image(app_context_, new_width, new_height, fmt); } - update_ubo(img.shape[0] / (float)new_width, img.shape[1] / (float)new_height); + update_ubo(img.shape[0] / (float)new_width, img.shape[1] / (float)new_height, + true); int pixels = width * height; - app_context_->device().image_transition(texture_, ImageLayout::undefined, - ImageLayout::transfer_dst); - uint64_t img_size = pixels * data_type_size(texture_dtype_) * 4; // If there is no current program, VBO information should be provided directly @@ -77,12 +78,22 @@ void SetImage::update_data(const SetImageInfo &info) { DevicePtr img_dev_ptr = info.img.dev_alloc.get_ptr(); if (prog) { img_dev_ptr = get_device_ptr(prog, img.snode); + if (img_dev_ptr.device != &app_context_->device()) { + sema = prog->flush(); + } } + bool use_enqueued_op = + prog && (img_dev_ptr.device == &app_context_->device()); Device::MemcpyCapability memcpy_cap = Device::check_memcpy_capability( gpu_staging_buffer_.get_ptr(), img_dev_ptr, img_size); if (memcpy_cap == Device::MemcpyCapability::Direct) { - Device::memcpy_direct(gpu_staging_buffer_.get_ptr(), img_dev_ptr, img_size); + // If it's the same device, we do not use the staging buffer and directly + // copy from the src ptr to the image in the `copy_op` + if (!use_enqueued_op) { + Device::memcpy_direct(gpu_staging_buffer_.get_ptr(), img_dev_ptr, + img_size); + } } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { Device::memcpy_via_staging(gpu_staging_buffer_.get_ptr(), cpu_staging_buffer_.get_ptr(), img_dev_ptr, @@ -96,29 +107,95 @@ void SetImage::update_data(const SetImageInfo &info) { copy_params.image_extent.x = height; copy_params.image_extent.y = width; - auto stream = app_context_->device().get_graphics_stream(); - auto cmd_list = stream->new_command_list(); - cmd_list->image_transition(texture_, ImageLayout::undefined, - ImageLayout::transfer_dst); - cmd_list->buffer_to_image(texture_, gpu_staging_buffer_.get_ptr(0), - ImageLayout::transfer_dst, copy_params); - - cmd_list->image_transition(texture_, ImageLayout::transfer_dst, - ImageLayout::shader_read); - if (data_ready_sema) { - stream->submit(cmd_list.get(), {data_ready_sema}); + DevicePtr src_ptr = + use_enqueued_op ? 
img_dev_ptr : gpu_staging_buffer_.get_ptr(0); + + auto copy_op = [texture = this->texture_, src_ptr, copy_params]( + Device *device, CommandList *cmdlist) { + cmdlist->image_transition(texture, ImageLayout::undefined, + ImageLayout::transfer_dst); + cmdlist->buffer_barrier(src_ptr); + cmdlist->buffer_to_image(texture, src_ptr, ImageLayout::transfer_dst, + copy_params); + cmdlist->image_transition(texture, ImageLayout::transfer_dst, + ImageLayout::shader_read); + }; + + if (use_enqueued_op) { + prog->enqueue_compute_op_lambda(copy_op, {}); + } else { + auto stream = app_context_->device().get_graphics_stream(); + auto cmd_list = stream->new_command_list(); + copy_op(&app_context_->device(), cmd_list.get()); + if (sema) { + stream->submit(cmd_list.get(), {sema}); + } else { + stream->submit(cmd_list.get()); + } + } +} + +void SetImage::update_data(Texture *tex) { + Program *prog = app_context_->prog(); + + auto shape = tex->get_size(); + auto fmt = tex->get_buffer_format(); + + TI_ASSERT_INFO(shape[2] == 1, + "Must be a 2D image! Received image shape: {}x{}x{}", shape[0], + shape[1], shape[2]); + + // Reminder: y/x is flipped in Taichi. I would like to use the correct + // orientation, but we have existing code already using the previous + // convention + if (shape[1] != width || shape[0] != height || fmt != format_) { + destroy_texture(); + free_buffers(); + init_set_image(app_context_, shape[1], shape[0], fmt); + } + + update_ubo(1.0f, 1.0f, false); + + ImageCopyParams copy_params; + copy_params.width = shape[0]; + copy_params.height = shape[1]; + copy_params.depth = shape[2]; + + DeviceAllocation src_alloc = tex->get_device_allocation(); + auto copy_op = [texture = this->texture_, src_alloc, copy_params]( + Device *device, CommandList *cmdlist) { + cmdlist->image_transition(texture, ImageLayout::undefined, + ImageLayout::transfer_dst); + cmdlist->copy_image(texture, src_alloc, ImageLayout::transfer_dst, + ImageLayout::transfer_src, copy_params); + cmdlist->image_transition(texture, ImageLayout::transfer_dst, + ImageLayout::shader_read); + }; + + // In the current state if we called this direct image update data method, we + // gurantee to have a program. + // FIXME: However, if we don't have a Program, where does the layout come + // from? 
+ if (prog) { + prog->enqueue_compute_op_lambda( + copy_op, {ComputeOpImageRef{src_alloc, ImageLayout::transfer_src, + ImageLayout::transfer_src}}); } else { + auto stream = app_context_->device().get_graphics_stream(); + auto cmd_list = stream->new_command_list(); + copy_op(&app_context_->device(), cmd_list.get()); stream->submit(cmd_list.get()); } } SetImage::SetImage(AppContext *app_context, VertexAttributes vbo_attrs) { - init_set_image(app_context, 1, 1); + init_set_image(app_context, 1, 1, BufferFormat::rgba8); } void SetImage::init_set_image(AppContext *app_context, int img_width, - int img_height) { + int img_height, + taichi::lang::BufferFormat format) { RenderableConfig config = { 6, 6, @@ -138,8 +215,9 @@ void SetImage::init_set_image(AppContext *app_context, Renderable::init(config, app_context); - width = img_width; - height = img_height; + this->width = img_width; + this->height = img_height; + format_ = format; create_texture(); @@ -154,10 +232,7 @@ void SetImage::create_texture() { ImageParams params; params.dimension = ImageDimension::d2D; - params.format = BufferFormat::rgba8; - if (texture_dtype_ == taichi::lang::PrimitiveType::f32) { - params.format = BufferFormat::rgba32f; - } + params.format = format_; params.initial_layout = ImageLayout::shader_read; // these are flipped because taichi is y-major and vulkan is x-major params.x = height; diff --git a/taichi/ui/backends/vulkan/renderables/set_image.h b/taichi/ui/backends/vulkan/renderables/set_image.h index 6e2323fcf9f64e..8698726d0433f1 100644 --- a/taichi/ui/backends/vulkan/renderables/set_image.h +++ b/taichi/ui/backends/vulkan/renderables/set_image.h @@ -35,12 +35,15 @@ class SetImage final : public Renderable { // the actual image is only a corner of the whole image float x_factor{1.0}; float y_factor{1.0}; + int transpose{0}; }; SetImage(AppContext *app_context, VertexAttributes vbo_attrs); void update_data(const SetImageInfo &info); + void update_data(taichi::lang::Texture *tex); + virtual void cleanup() override; private: @@ -50,8 +53,13 @@ class SetImage final : public Renderable { taichi::lang::DataType texture_dtype_{taichi::lang::PrimitiveType::u8}; taichi::lang::DeviceAllocation texture_; + taichi::lang::BufferFormat format_; + private: - void init_set_image(AppContext *app_context, int img_width, int img_height); + void init_set_image(AppContext *app_context, + int img_width, + int img_height, + taichi::lang::BufferFormat format); virtual void create_bindings() override; @@ -64,7 +72,7 @@ class SetImage final : public Renderable { int get_correct_dimension(int dimension); - void update_ubo(float x_factor, float y_factor); + void update_ubo(float x_factor, float y_factor, bool transpose); }; } // namespace vulkan diff --git a/taichi/ui/backends/vulkan/renderer.cpp b/taichi/ui/backends/vulkan/renderer.cpp index e01803940c1091..74c52ceabb45d2 100644 --- a/taichi/ui/backends/vulkan/renderer.cpp +++ b/taichi/ui/backends/vulkan/renderer.cpp @@ -50,6 +50,12 @@ void Renderer::set_image(const SetImageInfo &info) { next_renderable_ += 1; } +void Renderer::set_image(Texture *tex) { + SetImage *s = get_renderable_of_type(VboHelpers::all()); + s->update_data(tex); + next_renderable_ += 1; +} + void Renderer::triangles(const TrianglesInfo &info) { Triangles *triangles = get_renderable_of_type(info.renderable_info.vbo_attrs); diff --git a/taichi/ui/backends/vulkan/renderer.h b/taichi/ui/backends/vulkan/renderer.h index e7e0084e673468..dd9bdc664a83b8 100644 --- a/taichi/ui/backends/vulkan/renderer.h +++ 
b/taichi/ui/backends/vulkan/renderer.h @@ -54,6 +54,8 @@ class TI_DLL_EXPORT Renderer { void set_image(const SetImageInfo &info); + void set_image(taichi::lang::Texture *tex); + void triangles(const TrianglesInfo &info); void circles(const CirclesInfo &info); diff --git a/taichi/ui/common/canvas_base.h b/taichi/ui/common/canvas_base.h index fbf29c4fe1f28b..98af1523b177cf 100644 --- a/taichi/ui/common/canvas_base.h +++ b/taichi/ui/common/canvas_base.h @@ -5,6 +5,14 @@ #include "taichi/ui/common/renderable_info.h" #include "taichi/ui/utils/utils.h" +namespace taichi { +namespace lang { + +class Texture; + +} +} // namespace taichi + TI_UI_NAMESPACE_BEGIN struct SetImageInfo { @@ -32,6 +40,7 @@ class CanvasBase { public: virtual void set_background_color(const glm::vec3 &color) = 0; virtual void set_image(const SetImageInfo &info) = 0; + virtual void set_image(taichi::lang::Texture *tex) = 0; virtual void triangles(const TrianglesInfo &info) = 0; virtual void circles(const CirclesInfo &info) = 0; virtual void lines(const LinesInfo &info) = 0; diff --git a/tests/python/test_ggui.py b/tests/python/test_ggui.py index 88f83976a33250..d4a1bb606f26c9 100644 --- a/tests/python/test_ggui.py +++ b/tests/python/test_ggui.py @@ -301,7 +301,7 @@ def render(): render() - verify_image(window.get_image_buffer_as_numpy(), 'test_set_image') + verify_image(window.get_image_buffer_as_numpy(), 'test_set_image', 0.3) window.destroy() From a8b5fc277577764a12933ee9a70b710559caa7ae Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Sun, 18 Sep 2022 21:50:22 -0700 Subject: [PATCH 20/32] [dx12] Update codegen for range_for and mesh_for (#6092) Copied from cuda codegen. Issue: #5276 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/dx12/codegen_dx12.cpp | 92 +++++++++++----------------- 1 file changed, 37 insertions(+), 55 deletions(-) diff --git a/taichi/codegen/dx12/codegen_dx12.cpp b/taichi/codegen/dx12/codegen_dx12.cpp index bee1d3c27a6e8b..bc2a65e2ae94e9 100644 --- a/taichi/codegen/dx12/codegen_dx12.cpp +++ b/taichi/codegen/dx12/codegen_dx12.cpp @@ -1,3 +1,5 @@ +#include "llvm/IR/IntrinsicsDirectX.h" + #include "taichi/codegen/dx12/codegen_dx12.h" #include "taichi/codegen/dx12/dx12_llvm_passes.h" #include "taichi/rhi/dx12/dx12_api.h" @@ -26,24 +28,13 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { } void create_offload_range_for(OffloadedStmt *stmt) override { - int step = 1; - - // In parallel for-loops reversing the order doesn't make sense. - // However, we may need to support serial offloaded range for's in the - // future, so it still makes sense to reverse the order here. 
- if (stmt->reversed) { - step = -1; - } + auto tls_prologue = create_xlogue(stmt->tls_prologue); - auto *tls_prologue = create_xlogue(stmt->tls_prologue); - - // The loop body llvm::Function *body; { auto guard = get_function_creation_guard( {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0), - llvm::Type::getInt8PtrTy(*llvm_context), - tlctx->get_data_type()}); + get_tls_buffer_type(), tlctx->get_data_type()}); auto loop_var = create_entry_block_alloca(PrimitiveType::i32); loop_vars_llvm[stmt].push_back(loop_var); @@ -53,36 +44,22 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { body = guard.body; } - llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue); + auto epilogue = create_xlogue(stmt->tls_epilogue); auto [begin, end] = get_range_for_bounds(stmt); - - // adaptive block_dim - if (prog->config.cpu_block_dim_adaptive) { - int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step); - int num_threads = stmt->num_cpu_threads; - int items_per_thread = std::max(1, num_items / (num_threads * 32)); - // keep each task has at least 512 items to amortize scheduler overhead - // also saturate the value to 1024 for better load balancing - stmt->block_dim = std::min(1024, std::max(512, items_per_thread)); - } - - create_call( - "gpu_parallel_range_for", - {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), begin, end, - tlctx->get_constant(step), tlctx->get_constant(stmt->block_dim), - tls_prologue, body, epilogue, tlctx->get_constant(stmt->tls_size)}); + create_call("gpu_parallel_range_for", + {get_arg(0), begin, end, tls_prologue, body, epilogue, + tlctx->get_constant(stmt->tls_size)}); } void create_offload_mesh_for(OffloadedStmt *stmt) override { - auto *tls_prologue = create_mesh_xlogue(stmt->tls_prologue); + auto tls_prologue = create_mesh_xlogue(stmt->tls_prologue); llvm::Function *body; { auto guard = get_function_creation_guard( {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0), - llvm::Type::getInt8PtrTy(*llvm_context), - tlctx->get_data_type()}); + get_tls_buffer_type(), tlctx->get_data_type()}); for (int i = 0; i < stmt->mesh_prologue->size(); i++) { auto &s = stmt->mesh_prologue->statements[i]; @@ -91,6 +68,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { if (stmt->bls_prologue) { stmt->bls_prologue->accept(this); + call("block_barrier"); // "__syncthreads()" } auto loop_test_bb = @@ -99,21 +77,27 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { llvm::BasicBlock::Create(*llvm_context, "loop_body", func); auto func_exit = llvm::BasicBlock::Create(*llvm_context, "func_exit", func); - auto loop_index = - create_entry_block_alloca(llvm::Type::getInt32Ty(*llvm_context)); - builder->CreateStore(tlctx->get_constant(0), loop_index); + auto i32_ty = llvm::Type::getInt32Ty(*llvm_context); + auto loop_index = create_entry_block_alloca(i32_ty); + llvm::Value *thread_idx = + builder->CreateIntrinsic(llvm::Intrinsic::dx_thread_id_in_group, + {i32_ty}, {builder->getInt32(0)}); + // FIXME: use correct block dim. 
+ llvm::Value *block_dim = + builder->getInt32(64); /*builder->CreateIntrinsic( + llvm::Intrinsic::dx, {}, {});*/ + builder->CreateStore(thread_idx, loop_index); builder->CreateBr(loop_test_bb); { builder->SetInsertPoint(loop_test_bb); + auto cond = builder->CreateICmp( + llvm::CmpInst::Predicate::ICMP_SLT, + builder->CreateLoad( #ifdef TI_LLVM_15 - auto *loop_index_load = - builder->CreateLoad(builder->getInt32Ty(), loop_index); -#else - auto *loop_index_load = builder->CreateLoad(loop_index); + i32_ty, #endif - auto cond = builder->CreateICmp( - llvm::CmpInst::Predicate::ICMP_SLT, loop_index_load, + loop_index), llvm_val[stmt->owned_num_local.find(stmt->major_from_type) ->second]); builder->CreateCondBr(cond, loop_body_bb, func_exit); @@ -126,33 +110,31 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { auto &s = stmt->body->statements[i]; s->accept(this); } + builder->CreateStore(builder->CreateAdd(builder->CreateLoad( #ifdef TI_LLVM_15 - auto *loop_index_load = - builder->CreateLoad(builder->getInt32Ty(), loop_index); -#else - auto *loop_index_load = builder->CreateLoad(loop_index); + i32_ty, #endif - builder->CreateStore( - builder->CreateAdd(loop_index_load, tlctx->get_constant(1)), - loop_index); + loop_index), + block_dim), + loop_index); builder->CreateBr(loop_test_bb); builder->SetInsertPoint(func_exit); } if (stmt->bls_epilogue) { + call("block_barrier"); // "__syncthreads()" stmt->bls_epilogue->accept(this); } body = guard.body; } - llvm::Value *epilogue = create_mesh_xlogue(stmt->tls_epilogue); + auto tls_epilogue = create_mesh_xlogue(stmt->tls_epilogue); - create_call("gpu_parallel_mesh_for", - {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), - tlctx->get_constant(stmt->mesh->num_patches), - tlctx->get_constant(stmt->block_dim), tls_prologue, body, - epilogue, tlctx->get_constant(stmt->tls_size)}); + create_call( + "gpu_parallel_mesh_for", + {get_arg(0), tlctx->get_constant(stmt->mesh->num_patches), tls_prologue, + body, tls_epilogue, tlctx->get_constant(stmt->tls_size)}); } void create_bls_buffer(OffloadedStmt *stmt) { From 18c946ea67d23265896d3fb967482dfdd4e63795 Mon Sep 17 00:00:00 2001 From: Ailing Date: Mon, 19 Sep 2022 12:53:30 +0800 Subject: [PATCH 21/32] [spirv] [refactor] Rename debug_ segment to names_ (#6094) Issue: #6075 (followup) ### Brief Summary Rename debug_ segment to a more proper name. 
--- taichi/codegen/spirv/spirv_codegen.cpp | 12 +++---- taichi/codegen/spirv/spirv_ir_builder.cpp | 40 +++++++++++------------ taichi/codegen/spirv/spirv_ir_builder.h | 23 ++++++++----- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 6b2119c151ebb7..8c51a8f38f3074 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -105,7 +105,7 @@ class TaskCodegen : public IRVisitor { Result run() { ir_->init_header(); kernel_function_ = ir_->new_function(); // void main(); - ir_->debug(spv::OpName, kernel_function_, "main"); + ir_->debug_name(spv::OpName, kernel_function_, "main"); compile_args_struct(); compile_ret_struct(); @@ -1670,15 +1670,15 @@ class TaskCodegen : public IRVisitor { task_attribs_.advisory_total_num_threads = kMaxNumThreadsGridStrideLoop; } task_attribs_.advisory_num_threads_per_group = stmt->block_dim; - ir_->debug(spv::OpName, begin_expr_value, "begin_expr_value"); - ir_->debug(spv::OpName, total_elems, total_elems_name); + ir_->debug_name(spv::OpName, begin_expr_value, "begin_expr_value"); + ir_->debug_name(spv::OpName, total_elems, total_elems_name); spirv::Value begin_ = ir_->add(ir_->cast(ir_->i32_type(), ir_->get_global_invocation_id(0)), begin_expr_value); - ir_->debug(spv::OpName, begin_, "begin_"); + ir_->debug_name(spv::OpName, begin_, "begin_"); spirv::Value end_ = ir_->add(total_elems, begin_expr_value); - ir_->debug(spv::OpName, end_, "end_"); + ir_->debug_name(spv::OpName, end_, "end_"); const std::string total_invocs_name = "total_invocs"; // For now, |total_invocs_name| is equal to |total_elems|. Once we support // dynamic range, they will be different. @@ -1700,7 +1700,7 @@ class TaskCodegen : public IRVisitor { false); */ - ir_->debug(spv::OpName, total_invocs, total_invocs_name); + ir_->debug_name(spv::OpName, total_invocs, total_invocs_name); // Must get init label after making value(to make sure they are correct) spirv::Label init_label = ir_->current_label(); diff --git a/taichi/codegen/spirv/spirv_ir_builder.cpp b/taichi/codegen/spirv/spirv_ir_builder.cpp index 2398c247c53d51..3505ceb11b0bca 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.cpp +++ b/taichi/codegen/spirv/spirv_ir_builder.cpp @@ -132,7 +132,7 @@ std::vector IRBuilder::finalize() { data.insert(data.end(), entry_.begin(), entry_.end()); data.insert(data.end(), exec_mode_.begin(), exec_mode_.end()); data.insert(data.end(), strings_.begin(), strings_.end()); - data.insert(data.end(), debug_.begin(), debug_.end()); + data.insert(data.end(), names_.begin(), names_.end()); data.insert(data.end(), decorate_.begin(), decorate_.end()); data.insert(data.end(), global_.begin(), global_.end()); data.insert(data.end(), func_header_.begin(), func_header_.end()); @@ -218,7 +218,7 @@ void IRBuilder::init_pre_defs() { const_i32_one_ = int_immediate_number(t_int32_, 1); } -Value IRBuilder::make_string(std::string s) { +Value IRBuilder::debug_string(std::string s) { Value val = new_value(SType(), ValueKind::kNormal); ib_.begin(spv::OpString).add_seq(val, s).commit(&strings_); return val; @@ -592,7 +592,7 @@ SType IRBuilder::create_struct_type( for (auto &[type, name, offset] : components) { this->decorate(spv::OpMemberDecorate, struct_type, i, spv::DecorationOffset, offset); - this->debug(spv::OpMemberName, struct_type, i, name); + this->debug_name(spv::OpMemberName, struct_type, i, name); i++; } @@ -612,7 +612,7 @@ Value IRBuilder::buffer_struct_argument(const 
SType &struct_type, storage_class = spv::StorageClassStorageBuffer; } - this->debug(spv::OpName, struct_type, name + "_t"); + this->debug_name(spv::OpName, struct_type, name + "_t"); if (device_->get_cap(cap::spirv_version) < 0x10300) { // NOTE: BufferBlock was deprecated in SPIRV 1.3 @@ -625,14 +625,14 @@ Value IRBuilder::buffer_struct_argument(const SType &struct_type, SType ptr_type = get_pointer_type(struct_type, storage_class); - this->debug(spv::OpName, ptr_type, name + "_ptr"); + this->debug_name(spv::OpName, ptr_type, name + "_ptr"); Value val = new_value(ptr_type, ValueKind::kStructArrayPtr); ib_.begin(spv::OpVariable) .add_seq(ptr_type, val, storage_class) .commit(&global_); - this->debug(spv::OpName, val, name); + this->debug_name(spv::OpName, val, name); this->decorate(spv::OpDecorate, val, spv::DecorationDescriptorSet, descriptor_set); @@ -648,20 +648,20 @@ Value IRBuilder::uniform_struct_argument(const SType &struct_type, // use StorageClassStorageBuffer instead. spv::StorageClass storage_class = spv::StorageClassUniform; - this->debug(spv::OpName, struct_type, name + "_t"); + this->debug_name(spv::OpName, struct_type, name + "_t"); this->decorate(spv::OpDecorate, struct_type, spv::DecorationBlock); SType ptr_type = get_pointer_type(struct_type, storage_class); - this->debug(spv::OpName, ptr_type, name + "_ptr"); + this->debug_name(spv::OpName, ptr_type, name + "_ptr"); Value val = new_value(ptr_type, ValueKind::kStructArrayPtr); ib_.begin(spv::OpVariable) .add_seq(ptr_type, val, storage_class) .commit(&global_); - this->debug(spv::OpName, val, name); + this->debug_name(spv::OpName, val, name); this->decorate(spv::OpDecorate, val, spv::DecorationDescriptorSet, descriptor_set); @@ -686,18 +686,18 @@ Value IRBuilder::buffer_argument(const SType &value_type, auto typed_name = name + "_" + value_type.dt.to_string(); - this->debug(spv::OpName, sarr_type, typed_name + "_struct_array"); + this->debug_name(spv::OpName, sarr_type, typed_name + "_struct_array"); SType ptr_type = get_pointer_type(sarr_type, storage_class); - this->debug(spv::OpName, sarr_type, typed_name + "_ptr"); + this->debug_name(spv::OpName, sarr_type, typed_name + "_ptr"); Value val = new_value(ptr_type, ValueKind::kStructArrayPtr); ib_.begin(spv::OpVariable) .add_seq(ptr_type, val, storage_class) .commit(&global_); - this->debug(spv::OpName, val, typed_name); + this->debug_name(spv::OpName, val, typed_name); this->decorate(spv::OpDecorate, val, spv::DecorationDescriptorSet, descriptor_set); @@ -744,7 +744,7 @@ Value IRBuilder::texture_argument(int num_channels, descriptor_set); this->decorate(spv::OpDecorate, val, spv::DecorationBinding, binding); - this->debug(spv::OpName, val, "tex"); + this->debug_name(spv::OpName, val, "tex"); this->global_values.push_back(val); @@ -769,7 +769,7 @@ Value IRBuilder::storage_image_argument(int num_channels, descriptor_set); this->decorate(spv::OpDecorate, val, spv::DecorationBinding, binding); - this->debug(spv::OpName, val, "tex"); + this->debug_name(spv::OpName, val, "tex"); this->global_values.push_back(val); @@ -1201,7 +1201,7 @@ void IRBuilder::register_value(std::string name, Value value) { if (it != value_name_tbl_.end() && it->second.flag != ValueKind::kConstant) { TI_ERROR("{} already exists.", name); } - this->debug( + this->debug_name( spv::OpName, value, fmt::format("{}_{}", name, value.stype.dt.to_string())); // Debug info value_name_tbl_[name] = value; @@ -1439,13 +1439,13 @@ void IRBuilder::init_random_function(Value global_tmp_) { ib_.begin(spv::OpVariable) 
.add_seq(local_type, rand_w_, spv::StorageClassPrivate) .commit(&global_); - debug(spv::OpName, rand_x_, "_rand_x"); - debug(spv::OpName, rand_y_, "_rand_y"); - debug(spv::OpName, rand_z_, "_rand_z"); - debug(spv::OpName, rand_w_, "_rand_w"); + debug_name(spv::OpName, rand_x_, "_rand_x"); + debug_name(spv::OpName, rand_y_, "_rand_y"); + debug_name(spv::OpName, rand_z_, "_rand_z"); + debug_name(spv::OpName, rand_w_, "_rand_w"); SType gtmp_type = get_pointer_type(t_uint32_, spv::StorageClassStorageBuffer); Value rand_gtmp_ = new_value(gtmp_type, ValueKind::kVariablePtr); - debug(spv::OpName, rand_gtmp_, "rand_gtmp"); + debug_name(spv::OpName, rand_gtmp_, "rand_gtmp"); auto load_var = [&](Value pointer, const SType &res_type) { TI_ASSERT(pointer.flag == ValueKind::kVariablePtr || diff --git a/taichi/codegen/spirv/spirv_ir_builder.h b/taichi/codegen/spirv/spirv_ir_builder.h index 57eaa6b76da3fd..337985bad360b2 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.h +++ b/taichi/codegen/spirv/spirv_ir_builder.h @@ -209,10 +209,12 @@ class IRBuilder { } template - void debug(spv::Op op, Args &&...args) { - ib_.begin(op).add_seq(std::forward(args)...).commit(&debug_); + void debug_name(spv::Op op, Args &&...args) { + ib_.begin(op).add_seq(std::forward(args)...).commit(&names_); } + Value debug_string(std::string str); + template void execution_mode(Value func, Args &&...args) { ib_.begin(spv::OpExecutionMode) @@ -294,8 +296,6 @@ class IRBuilder { double value, bool cache = true); - Value make_string(std::string str); - // Match zero type Value get_zero(const SType &stype) { TI_ASSERT(stype.flag == TypeKind::kPrimitive); @@ -464,7 +464,7 @@ class IRBuilder { // Create a debugPrintf call void call_debugprintf(std::string formats, const std::vector &args) { - Value format_str = make_string(formats); + Value format_str = debug_string(formats); Value val = new_value(t_void_, ValueKind::kNormal); ib_.begin(spv::OpExtInst) .add_seq(t_void_, val, debug_printf_, 1, format_str); @@ -619,11 +619,18 @@ class IRBuilder { std::vector entry_; // Header segment std::vector exec_mode_; + // Debug segment + // According to SPIR-V spec, the following debug instructions must be + // grouped in the order: + // - All OpString, OpSourceExtension, OpSource, and OpSourceContinued, + // without forward references. + // - All OpName and all OpMemberName. + // - All OpModuleProcessed instructions. + // OpString segment std::vector strings_; - // TODO: Rename this to names_ in a followup PR - // Debug segment - std::vector debug_; + // OpName segment + std::vector names_; // Annotation segment std::vector decorate_; // Global segment: types, variables, types From 88f030e9b3b786439e39c0492a6776549d3aac60 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Mon, 19 Sep 2022 17:04:44 +0800 Subject: [PATCH 22/32] [aot] Support multi-target builds for Apple M1 (#6083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR enables Taichi to be built into multi-target libraries. Recent releases of macOS supports dynamically linking against x86_64 or arm64 of the same `.dylib` on demand, depending on the arch of the parent process. Previously this was not possible because the CMake script forced a `-march=nehalem` that is not a kind of arm64; and `CMAKE_OSX_ARCHITECTURES` is forced `arm64`. The multi-target feature is only enabled with `-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"`. Python builds will select one of the archs, based on the arch of the skbuild Python process. 
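For illustration, the arch gating this enables looks roughly like the sketch below in a stand-alone CMake project. This is only a sketch under stated assumptions: the project name and messages are placeholders rather than Taichi's actual build files, and it assumes CMake 3.17+ on macOS.

```cmake
# Sketch: skip x86-only tuning flags when both slices of a universal
# (arm64 + x86_64) macOS build are compiled with a single flag set.
cmake_minimum_required(VERSION 3.17)
project(universal_sketch CXX)

if(APPLE AND "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES
         AND "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES)
  # Every compile flag is applied to both architectures, so arch-specific
  # flags such as -march=nehalem must not be added here.
  message(STATUS "Universal arm64;x86_64 build: skipping -march=nehalem")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
  # Single-arch x86_64 build: x86 tuning flags are safe.
  add_compile_options(-march=nehalem)
endif()
```

Configuring such a project with `cmake -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" ..` produces fat objects, and the macOS dynamic loader later picks the slice matching the parent process, which is the behavior this patch relies on.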
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 CMakeLists.txt | 14 --------------
 cmake/TaichiCXXFlags.cmake | 11 +++++++++--
 setup.py | 6 ++++++
 taichi/program/program.cpp | 11 ++++++-----
 taichi/system/timer.cpp | 2 +-
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53a4bcc3b584bd..0cc5703b7d5dd1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,20 +6,6 @@ cmake_minimum_required(VERSION 3.17)

 project(taichi)

-# Taichi does not set target architecture explicitly,
-# but rather rely on CMake to detect the host arch.
-#
-# However on Mac m1, there are two available architectures namely x86_64 and arm64.
-# On some combination of "OSX version" and "CMake version", CMake will use x86_64 as default architecture even if it's on m1 chip.
-# This causes conflicts with the precompiled LLVM/Clang binaries downloaded from Taichi's repo (pre-built for arm64)
-#
-# Therefore we force CMake to choose arm64 architecture on arm64 chips.
-if (APPLE)
- if( "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "arm64" )
- set(CMAKE_OSX_ARCHITECTURES ${CMAKE_HOST_SYSTEM_PROCESSOR})
- endif()
-endif()
-
 if (NOT DEFINED TI_VERSION_MAJOR)
 message(WARNING "It seems that you are running cmake manually, which may cause issues. Please use setup.py to build taichi from source, see https://docs.taichi-lang.org/docs/dev_install for more details.")
 set(TI_VERSION_MAJOR 0)
diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake
index 0b73ac4324c2c0..687b705f37084e 100644
--- a/cmake/TaichiCXXFlags.cmake
+++ b/cmake/TaichiCXXFlags.cmake
@@ -81,8 +81,15 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" OR "${CMAKE_SYSTEM_PROCESSOR}"
 if (MSVC)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D \"TI_ARCH_x64\"")
 else()
- message("Setting -march=nehalem for x86_64 processors")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=nehalem -DTI_ARCH_x64")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_ARCH_x64")
+ if ("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
+ # TODO: (penguinliong) Will probably need this in a future version
+ # of Clang. Clang11 doesn't recognize this.
+ #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1") + else() + message("Setting -march=nehalem for x86_64 processors") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=nehalem") + endif() endif() set(ARCH "x64") elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") diff --git a/setup.py b/setup.py index c7fe63fa51b177..7196db9694c8a8 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ import glob import multiprocessing import os +import platform import shutil import sys from distutils.command.clean import clean @@ -131,6 +132,11 @@ def get_cmake_args(): if sys.platform != 'win32': os.environ['SKBUILD_BUILD_OPTIONS'] = f'-j{num_threads}' + if sys.platform == "darwin": + if platform.machine() == "arm64": + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES=arm64"] + else: + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES=x86_64"] return cmake_args diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index e0af071f09cb32..57dcd27a0de560 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -40,10 +40,10 @@ #include "taichi/rhi/dx/dx_api.h" #endif -#if defined(TI_ARCH_x64) +#if defined(_M_X64) || defined(__x86_64) // For _MM_SET_FLUSH_ZERO_MODE #include -#endif +#endif // defined(_M_X64) || defined(__x86_64) namespace taichi { namespace lang { @@ -55,9 +55,10 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { // For performance considerations and correctness of QuantFloatType // operations, we force floating-point operations to flush to zero on all // backends (including CPUs). -#if defined(TI_ARCH_x64) +#if defined(_M_X64) || defined(__x86_64) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); -#else +#endif // defined(_M_X64) || defined(__x86_64) +#if defined(__arm64__) || defined(__aarch64__) // Enforce flush to zero on arm64 CPUs // https://developer.arm.com/documentation/100403/0201/register-descriptions/advanced-simd-and-floating-point-registers/aarch64-register-descriptions/fpcr--floating-point-control-register?lang=en std::uint64_t fpcr; @@ -68,7 +69,7 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { : : "ri"(fpcr | (1 << 24))); // Bit 24 is FZ __asm__ __volatile__(""); -#endif +#endif // defined(__arm64__) || defined(__aarch64__) config = default_compile_config; config.arch = desired_arch; // TODO: allow users to run in debug mode without out-of-bound checks diff --git a/taichi/system/timer.cpp b/taichi/system/timer.cpp index 45491f4eea29a9..fb0d61472e2310 100644 --- a/taichi/system/timer.cpp +++ b/taichi/system/timer.cpp @@ -220,7 +220,7 @@ uint64 Time::get_cycles() { #else uint64 Time::get_cycles() { -#if defined(TI_ARCH_x64) +#if defined(TI_ARCH_x64) && !(defined(__arm64__) || defined(__aarch64__)) unsigned int lo, hi; __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64)hi << 32) | lo; From bf365c7e44f151c3758d684b6ce560223e780986 Mon Sep 17 00:00:00 2001 From: PGZXB Date: Mon, 19 Sep 2022 17:13:52 +0800 Subject: [PATCH 23/32] [bug] Fix cleaning cache failed (#6100) Issue: fixes #6081 --- taichi/runtime/llvm/llvm_offline_cache.cpp | 5 +++++ taichi/util/io.h | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/taichi/runtime/llvm/llvm_offline_cache.cpp b/taichi/runtime/llvm/llvm_offline_cache.cpp index cf742239f54b22..70d6bf853889f7 100644 --- a/taichi/runtime/llvm/llvm_offline_cache.cpp +++ b/taichi/runtime/llvm/llvm_offline_cache.cpp @@ -117,6 +117,11 @@ bool LlvmOfflineCacheFileReader::load_meta_data( using Error = 
offline_cache::LoadMetadataError;
 const auto tcb_path = get_llvm_cache_metadata_file_path(cache_file_path);
+ if (!taichi::path_exists(tcb_path)) {
+ TI_DEBUG("File {} not found", tcb_path);
+ return false;
+ }
+
 if (!with_lock) {
 return Error::kNoError == load_metadata_with_checking(data, tcb_path);
 }
diff --git a/taichi/util/io.h b/taichi/util/io.h
index 616ce342301a58..6409dbf2cd2bf3 100644
--- a/taichi/util/io.h
+++ b/taichi/util/io.h
@@ -76,11 +76,11 @@ inline bool traverse_directory(const std::string &dir, Visitor v) {
 return false;
 }
 while ((f = ::readdir(directory))) {
- struct stat *stat_buf = nullptr;
 auto fullpath = join_path(dir, f->d_name);
- auto ret = ::stat(fullpath.c_str(), stat_buf);
+ struct stat stat_buf;
+ auto ret = ::stat(fullpath.c_str(), &stat_buf);
 TI_ASSERT(ret == 0);
- v(f->d_name, S_ISDIR(stat_buf->st_mode));
+ v(f->d_name, S_ISDIR(stat_buf.st_mode));
 }
 auto ret = ::closedir(directory);
 TI_ASSERT(ret == 0);

From 1262a70a9080537be170ceb405bc9ae38a64e7c0 Mon Sep 17 00:00:00 2001
From: Mocki <34432001+Morcki@users.noreply.github.com>
Date: Tue, 20 Sep 2022 09:22:45 +0800
Subject: [PATCH 24/32] [bug] [gui] Fix a bug in drawing mesh instancing where cpu/cuda objects have an offset when copying to a vulkan object (#6028)

This only happened on Windows.
---
 python/taichi/ui/scene.py | 12 +++++++----
 taichi/rhi/device.cpp | 9 ++++++++-
 taichi/rhi/interop/vulkan_cpu_interop.cpp | 20 +++++++++++++++++++
 taichi/rhi/interop/vulkan_cpu_interop.h | 2 ++
 .../ui/backends/vulkan/renderables/mesh.cpp | 19 +++++++++++-------
 tests/python/test_ggui.py | 8 --------
 6 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/python/taichi/ui/scene.py b/python/taichi/ui/scene.py
index 46f95deb413a49..a058ada09ddb81 100644
--- a/python/taichi/ui/scene.py
+++ b/python/taichi/ui/scene.py
@@ -307,10 +307,14 @@ def mesh_instance(self,
 index_count = vertex_count
 else:
 index_count = indices.shape[0]
- if instance_count is None:
- instance_count = transforms.shape[0]
- if transforms and (transforms.m != 4 or transforms.n != 4):
- raise Exception("Error! Transform matrix must be 4x4 shape")
+ if transforms:
+ if (transforms.m != 4 or transforms.n != 4):
+ raise Exception("Error! 
Transform matrix must be 4x4 shape") + if instance_count is None: + instance_count = transforms.shape[0] + else: + instance_count = 1 + copy_normals_to_vbo(vbo, normals) vbo_info = get_field_info(vbo) indices_info = get_field_info(indices) diff --git a/taichi/rhi/device.cpp b/taichi/rhi/device.cpp index 460cbba420c403..1d3af12cd80221 100644 --- a/taichi/rhi/device.cpp +++ b/taichi/rhi/device.cpp @@ -244,7 +244,14 @@ void Device::memcpy_direct(DevicePtr dst, DevicePtr src, uint64_t size) { dst.device->memcpy_internal(dst, src, size); return; } - // Inter-device copy +#if TI_WITH_VULKAN && TI_WITH_LLVM + // cross-device copy directly + else if (dynamic_cast(dst.device) && + dynamic_cast(src.device)) { + memcpy_cpu_to_vulkan(dst, src, size); + return; + } +#endif #if TI_WITH_VULKAN && TI_WITH_CUDA if (dynamic_cast(dst.device) && dynamic_cast(src.device)) { diff --git a/taichi/rhi/interop/vulkan_cpu_interop.cpp b/taichi/rhi/interop/vulkan_cpu_interop.cpp index e3a1158457ceed..598e833977eee5 100644 --- a/taichi/rhi/interop/vulkan_cpu_interop.cpp +++ b/taichi/rhi/interop/vulkan_cpu_interop.cpp @@ -15,6 +15,23 @@ namespace lang { using namespace taichi::lang::vulkan; using namespace taichi::lang::cpu; +void memcpy_cpu_to_vulkan(DevicePtr dst, DevicePtr src, uint64_t size) { + // Note that `dst` must point to host-visible memory, if `dst` point to + // device-local memory, please choose to use `memcpy_via_staging`. + VulkanDevice *vk_dev = dynamic_cast(dst.device); + CpuDevice *cpu_dev = dynamic_cast(src.device); + + DeviceAllocation src_alloc(src); + + CpuDevice::AllocInfo src_alloc_info = cpu_dev->get_alloc_info(src_alloc); + + unsigned char *dst_ptr = (unsigned char *)(vk_dev->map_range(dst, size)); + unsigned char *src_ptr = (unsigned char *)src_alloc_info.ptr + src.offset; + + memcpy(dst_ptr, src_ptr, size); + vk_dev->unmap(dst); +} + void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, DevicePtr staging, DevicePtr src, @@ -39,6 +56,9 @@ void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, } #else +void memcpy_cpu_to_vulkan(DevicePtr dst, DevicePtr src, uint64_t size) { + TI_NOT_IMPLEMENTED; +} void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, DevicePtr stagin, DevicePtr src, diff --git a/taichi/rhi/interop/vulkan_cpu_interop.h b/taichi/rhi/interop/vulkan_cpu_interop.h index ac95959f0251f5..4042ceca80be9a 100644 --- a/taichi/rhi/interop/vulkan_cpu_interop.h +++ b/taichi/rhi/interop/vulkan_cpu_interop.h @@ -5,6 +5,8 @@ namespace taichi { namespace lang { +void memcpy_cpu_to_vulkan(DevicePtr dst, DevicePtr src, uint64_t size); + void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, DevicePtr staging, DevicePtr src, diff --git a/taichi/ui/backends/vulkan/renderables/mesh.cpp b/taichi/ui/backends/vulkan/renderables/mesh.cpp index e82b62aefc199c..66423e09efa903 100644 --- a/taichi/ui/backends/vulkan/renderables/mesh.cpp +++ b/taichi/ui/backends/vulkan/renderables/mesh.cpp @@ -88,18 +88,22 @@ void Mesh::update_data(const MeshInfo &info, const Scene &scene) { attr_dev_ptr = get_device_ptr(prog, info.mesh_attribute_info.mesh_attribute.snode); } + // TODO : At present, we donnot support copying from cuda device memory to a + // host-visible vulkan device memory directly on Windows platform, which is + // not a ideal way for handling storage buffer. So here we set the + // `mesh_ssbo` vulkan buffer as device-local memory using staging buffer + // filling data. 
However, that is not what is used to do for a storage + // buffer (usually set as host-visible memory), we should f`ix this on + // Windows in future. Device::MemcpyCapability memcpy_cap = Device::check_memcpy_capability( mesh_storage_buffer_.get_ptr(), attr_dev_ptr, mesh_ssbo_size_); if (memcpy_cap == Device::MemcpyCapability::Direct) { Device::memcpy_direct(mesh_storage_buffer_.get_ptr(), attr_dev_ptr, mesh_ssbo_size_); } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { - void *ssbo_mapped = app_context_->device().map(mesh_storage_buffer_); - DeviceAllocation attr_buffer(attr_dev_ptr); - void *attr_mapped = attr_dev_ptr.device->map(attr_buffer); - memcpy(ssbo_mapped, attr_mapped, mesh_ssbo_size_); - app_context_->device().unmap(mesh_storage_buffer_); - attr_dev_ptr.device->unmap(attr_buffer); + Device::memcpy_via_staging(mesh_storage_buffer_.get_ptr(), + staging_vertex_buffer_.get_ptr(), attr_dev_ptr, + mesh_ssbo_size_); } else { TI_NOT_IMPLEMENTED; } @@ -163,7 +167,8 @@ void Mesh::create_mesh_storage_buffers() { if (mesh_ssbo_size_ == 0) { mesh_ssbo_size_ = 4 * 4 * sizeof(float); } - Device::AllocParams sb_params{mesh_ssbo_size_, true, false, true, + Device::AllocParams sb_params{mesh_ssbo_size_, false, false, + app_context_->requires_export_sharing(), AllocUsage::Storage}; mesh_storage_buffer_ = app_context_->device().allocate_memory(sb_params); } diff --git a/tests/python/test_ggui.py b/tests/python/test_ggui.py index d4a1bb606f26c9..4550dcccf1090f 100644 --- a/tests/python/test_ggui.py +++ b/tests/python/test_ggui.py @@ -753,10 +753,6 @@ def render(): transforms=instances_transforms) canvas.scene(scene) - if (platform.system() == 'Windows'): - # FIXME:Fix the bug that drawing mesh instance report bugs on Windows - return - for i in range(30): update_transform(30) render() @@ -868,10 +864,6 @@ def render(): instance_offset=2) canvas.scene(scene) - if (platform.system() == 'Windows'): - # FIXME:Fix the bug that drawing mesh instance report bugs on Windows - return - for _ in range(RENDER_REPEAT): render() window.get_image_buffer_as_numpy() From 567df97e6d6b4e7676eb8260bb191c71854d0aa2 Mon Sep 17 00:00:00 2001 From: PGZXB Date: Tue, 20 Sep 2022 11:27:30 +0800 Subject: [PATCH 25/32] [build] Fix building with TI_WITH_OPENGL:BOOL=OFF and TI_WITH_DX11:BOOL=ON failed (#6108) Building with `TI_WITH_OPENGL:BOOL=OFF` and `TI_WITH_DX11:BOOL=ON` failed: ``` # With Clang FAILED: taichi_python.cp37-win_amd64.pyd cmd.exe /C "cd . && D:\programming_tools_\LLVM14\bin\clang++.exe -fuse-ld=lld-link -nostartfiles -nostdlib -DTI_ISE_NONE -std=c++17 -fsized-deallocation -target x86_64-pc-windows-msvc -DTI_ARCH_x64 -march=nehalem -DTI_PASS_EXCEPTION_TO_PYTHON -DTI_INCLUDED -DTI_WITH_LLVM -DTI_WITH_CUDA -DTI_WITH_METAL -DTI_WITH_DX11 -DTI_WITH_VULKAN -DTI_ISE_NONE -flto=thin -D_DLL -D_MT -Xclang --dependent-lib=msvcrt -shared -o taichi_python.cp37-win_amd64.pyd -Xlinker /MANIFEST:EMBED -Xlinker /implib:taichi_python.lib -Xlinker /pdb:taichi_python.pdb -Xlinker /version:0.0 @CMakeFiles\taichi_python.rsp && cd ." lld-link: error: could not open 'spirv-cross-hlsl.lib': no such file or directory lld-link: error: could not open 'spirv-cross-core.lib': no such file or directory clang++: error: linker command failed with exit code 1 (use -v to see invocation) ``` ``` # With MSVC FAILED: taichi_python.cp37-win_amd64.pyd cmd.exe /C "cd . 
&& D:\programming_tools_\CMAKE\bin\cmake.exe -E vs_link_dll --intdir=CMakeFiles\taichi_python.dir --rc=C:\PROGRA~2\WI3CF2~1\10\bin\100190~1.0\x86\rc.exe --mt=C:\PROGRA~2\WI3CF2~1\10\bin\100190~1.0\x86\mt.exe --manifests -- D:\programming_tools_\VS2022\VS2022\VC\Tools\MSVC\14.31.31103\bin\Hostx86\x64\link.exe /nologo @CMakeFiles\taichi_python.rsp /out:taichi_python.cp37-win_amd64.pyd /implib:taichi_python.lib /pdb:taichi_python.pdb /dll /version:0.0 /machine:x64 /INCREMENTAL:NO /DEBUG && cd ." LINK: command "D:\programming_tools_\VS2022\VS2022\VC\Tools\MSVC\14.31.31103\bin\Hostx86\x64\link.exe /nologo @CMakeFiles\taichi_python.rsp /out:taichi_python.cp37-win_amd64.pyd /implib:taichi_python.lib /pdb:taichi_python.pdb /dll /version:0.0 /machine:x64 /INCREMENTAL:NO /DEBUG /MANIFEST /MANIFESTFILE:taichi_python.cp37-win_amd64.pyd.manifest" failed (exit code 1181) with the following output: LINK : fatal error LNK1181: cannot open input file 'spirv-cross-hlsl.lib' ``` --- cmake/TaichiCore.cmake | 5 +++++ taichi/rhi/opengl/CMakeLists.txt | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 4c57b56796c924..7478f031890382 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -319,6 +319,11 @@ if (TI_WITH_OPENGL OR TI_WITH_VULKAN OR TI_WITH_DX11) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE gfx_runtime) endif() +if (TI_WITH_OPENGL OR TI_WITH_DX11) + set(SPIRV_CROSS_CLI false) + add_subdirectory(${PROJECT_SOURCE_DIR}/external/SPIRV-Cross ${PROJECT_BINARY_DIR}/external/SPIRV-Cross) +endif() + # Vulkan Device API if (TI_WITH_VULKAN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_VULKAN") diff --git a/taichi/rhi/opengl/CMakeLists.txt b/taichi/rhi/opengl/CMakeLists.txt index 1a78c4fd49d997..4e88b328064a13 100644 --- a/taichi/rhi/opengl/CMakeLists.txt +++ b/taichi/rhi/opengl/CMakeLists.txt @@ -22,7 +22,4 @@ target_include_directories(${OPENGL_RHI} ) target_link_libraries(opengl_rhi PRIVATE glfw) - -set(SPIRV_CROSS_CLI false) -add_subdirectory(${PROJECT_SOURCE_DIR}/external/SPIRV-Cross ${PROJECT_BINARY_DIR}/external/SPIRV-Cross) target_link_libraries(opengl_rhi PRIVATE spirv-cross-glsl spirv-cross-core) From 7be96cf55fb4f7124f28a857360d266ea1a8173c Mon Sep 17 00:00:00 2001 From: PGZXB Date: Tue, 20 Sep 2022 11:28:46 +0800 Subject: [PATCH 26/32] [opengl] Support offline cache on opengl (#6104) Issue: #4401 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/analysis/offline_cache_util.cpp | 2 +- .../program_impls/opengl/opengl_program.cpp | 58 ++++++++++++++----- .../program_impls/opengl/opengl_program.h | 6 ++ tests/python/test_offline_cache.py | 21 +++++-- 4 files changed, 64 insertions(+), 23 deletions(-) diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index 2237b3be60c63f..ee41a3973784d6 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -185,7 +185,7 @@ std::string get_cache_path_by_arch(const std::string &base_path, Arch arch) { std::string subdir; if (arch_uses_llvm(arch)) { subdir = "llvm"; - } else if (arch == Arch::vulkan) { + } else if (arch == Arch::vulkan || arch == Arch::opengl) { subdir = "gfx"; } else { return base_path; diff --git a/taichi/runtime/program_impls/opengl/opengl_program.cpp b/taichi/runtime/program_impls/opengl/opengl_program.cpp index c0da4c403a1065..4df1cad1db3add 100644 --- a/taichi/runtime/program_impls/opengl/opengl_program.cpp +++ 
b/taichi/runtime/program_impls/opengl/opengl_program.cpp @@ -1,5 +1,6 @@ #include "opengl_program.h" +#include "taichi/analysis/offline_cache_util.h" #include "taichi/rhi/opengl/opengl_api.h" #include "taichi/runtime/gfx/aot_module_builder_impl.h" #include "taichi/runtime/gfx/aot_module_loader_impl.h" @@ -7,20 +8,18 @@ namespace taichi { namespace lang { -namespace opengl { +namespace { -FunctionType compile_to_executable(Kernel *kernel, - gfx::GfxRuntime *runtime, - gfx::SNodeTreeManager *snode_tree_mgr) { - auto handle = runtime->register_taichi_kernel( - gfx::run_codegen(kernel, runtime->get_ti_device(), - snode_tree_mgr->get_compiled_structs())); +FunctionType register_params_to_executable( + gfx::GfxRuntime::RegisterParams &¶ms, + gfx::GfxRuntime *runtime) { + auto handle = runtime->register_taichi_kernel(std::move(params)); return [runtime, handle](RuntimeContext &ctx) { runtime->launch_kernel(handle, &ctx); }; } -} // namespace opengl +} // namespace OpenglProgramImpl::OpenglProgramImpl(CompileConfig &config) : ProgramImpl(config) { @@ -28,9 +27,8 @@ OpenglProgramImpl::OpenglProgramImpl(CompileConfig &config) FunctionType OpenglProgramImpl::compile(Kernel *kernel, OffloadedStmt *offloaded) { - spirv::lower(kernel); - return opengl::compile_to_executable(kernel, runtime_.get(), - snode_tree_mgr_.get()); + return register_params_to_executable( + get_cache_manager()->load_or_compile(config, kernel), runtime_.get()); } void OpenglProgramImpl::materialize_runtime(MemoryPool *memory_pool, @@ -87,11 +85,39 @@ DeviceAllocation OpenglProgramImpl::allocate_texture( std::unique_ptr OpenglProgramImpl::make_aot_kernel( Kernel &kernel) { - spirv::lower(&kernel); - std::vector compiled_structs; - gfx::GfxRuntime::RegisterParams kparams = - gfx::run_codegen(&kernel, get_compute_device(), compiled_structs); - return std::make_unique(runtime_.get(), std::move(kparams)); + auto params = get_cache_manager()->load_or_compile(config, &kernel); + return std::make_unique(runtime_.get(), std::move(params)); +} + +void OpenglProgramImpl::dump_cache_data_to_disk() { + const auto &mgr = get_cache_manager(); + mgr->clean_offline_cache(offline_cache::string_to_clean_cache_policy( + config->offline_cache_cleaning_policy), + config->offline_cache_max_size_of_files, + config->offline_cache_cleaning_factor); + mgr->dump_with_merging(); +} + +const std::unique_ptr + &OpenglProgramImpl::get_cache_manager() { + if (!cache_manager_) { + TI_ASSERT(runtime_ && snode_tree_mgr_ && device_); + auto target_device = std::make_unique(config->arch); + device_->clone_caps(*target_device); + using Mgr = gfx::CacheManager; + Mgr::Params params; + params.arch = config->arch; + params.mode = + offline_cache::enabled_wip_offline_cache(config->offline_cache) + ? 
Mgr::MemAndDiskCache + : Mgr::MemCache; + params.cache_path = config->offline_cache_file_path; + params.runtime = runtime_.get(); + params.target_device = std::move(target_device); + params.compiled_structs = &snode_tree_mgr_->get_compiled_structs(); + cache_manager_ = std::make_unique(std::move(params)); + } + return cache_manager_; } } // namespace lang diff --git a/taichi/runtime/program_impls/opengl/opengl_program.h b/taichi/runtime/program_impls/opengl/opengl_program.h index 9d4ccbf9b7405b..7965c9221b22a5 100644 --- a/taichi/runtime/program_impls/opengl/opengl_program.h +++ b/taichi/runtime/program_impls/opengl/opengl_program.h @@ -1,5 +1,6 @@ #pragma once +#include "taichi/cache/gfx/cache_manager.h" #include "taichi/runtime/gfx/runtime.h" #include "taichi/runtime/gfx/snode_tree_manager.h" #include "taichi/program/program_impl.h" @@ -63,11 +64,16 @@ class OpenglProgramImpl : public ProgramImpl { std::unique_ptr make_aot_kernel(Kernel &kernel) override; + void dump_cache_data_to_disk() override; + + const std::unique_ptr &get_cache_manager(); + private: std::shared_ptr device_{nullptr}; std::unique_ptr runtime_{nullptr}; std::unique_ptr snode_tree_mgr_{nullptr}; std::vector aot_compiled_snode_structs_; + std::unique_ptr cache_manager_{nullptr}; }; } // namespace lang diff --git a/tests/python/test_offline_cache.py b/tests/python/test_offline_cache.py index 1a0f13dfae5226..cb09af713a192c 100644 --- a/tests/python/test_offline_cache.py +++ b/tests/python/test_offline_cache.py @@ -18,7 +18,9 @@ OFFLINE_CACHE_TEMP_DIR = mkdtemp() atexit.register(lambda: rmdir(OFFLINE_CACHE_TEMP_DIR)) -supported_archs_offline_cache = [ti.cpu, ti.cuda, ti.vulkan] +supported_llvm_archs = {ti.cpu, ti.cuda} +supported_gfx_archs = {ti.opengl, ti.vulkan} +supported_archs_offline_cache = supported_llvm_archs | supported_gfx_archs supported_archs_offline_cache = [ v for v in supported_archs_offline_cache if v in test_utils.expected_archs() @@ -40,12 +42,19 @@ def cache_files_size(path): def expected_num_cache_files(arch, num_offloads: List[int] = None) -> int: + assert arch in supported_archs_offline_cache if not num_offloads: return 0 - result = sum(num_offloads) if arch in [ti.vulkan] else len(num_offloads) - if arch in [ti.cpu, ti.cuda]: + result = 0 + # code files + if arch in supported_llvm_archs: + result += len(num_offloads) + elif arch in supported_gfx_archs: + result += sum(num_offloads) + # metadata files + if arch in supported_llvm_archs: result += 2 # metadata.{json, tcb} - elif arch in [ti.vulkan]: + elif arch in supported_gfx_archs: # metadata.{json, tcb}, graphs.tcb, offline_cache_metadata.tcb result += 4 return result @@ -56,9 +65,9 @@ def tmp_offline_cache_file_path(): def backend_specified_cache_path(arch): - if arch in [ti.cpu, ti.cuda]: + if arch in supported_llvm_archs: return join(tmp_offline_cache_file_path(), 'llvm') - elif arch in [ti.vulkan]: + elif arch in supported_gfx_archs: return join(tmp_offline_cache_file_path(), 'gfx') assert False From 91231587d8bca5f3c7219733dba4d5f3207d72e2 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Tue, 20 Sep 2022 11:50:02 +0800 Subject: [PATCH 27/32] [Doc] Refactor ODOP (#6013) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Olinaaaloompa <106292061+Olinaaaloompa@users.noreply.github.com> --- .../articles/advanced/{odop.md => odop1.md} | 92 ++----------------- docs/lang/articles/advanced/odop2.md | 83 +++++++++++++++++ docs/lang/articles/advanced/quant.md | 2 +- 3 files changed, 90 insertions(+), 
87 deletions(-) rename docs/lang/articles/advanced/{odop.md => odop1.md} (63%) create mode 100644 docs/lang/articles/advanced/odop2.md diff --git a/docs/lang/articles/advanced/odop.md b/docs/lang/articles/advanced/odop1.md similarity index 63% rename from docs/lang/articles/advanced/odop.md rename to docs/lang/articles/advanced/odop1.md index a9f9154308d13a..179d659a2cc59a 100644 --- a/docs/lang/articles/advanced/odop.md +++ b/docs/lang/articles/advanced/odop1.md @@ -2,18 +2,15 @@ sidebar_position: 2 --- -# Objective Data-oriented Programming +# Objective Data-oriented Programming I -Taichi is a -[data-oriented](https://en.wikipedia.org/wiki/Data-oriented_design) -programming (DOP) language. However, simple DOP makes modularization -hard. +Taichi is a [data-oriented](https://en.wikipedia.org/wiki/Data-oriented_design) programming (DOP) language. However, simple DOP makes modularization hard. To allow modularized code, Taichi borrows some concepts from object-oriented programming (OOP). For convenience, let's call the hybrid scheme **objective data-oriented programming** (ODOP). -To allow modularized code, Taichi borrow some concepts from -object-oriented programming (OOP). +The ODOP scheme allows you to organize data and methods into a class and call the methods to manipulate the data in the Taichi scope. Taichi offers two different types of classes that serve this purpose, and they are distinguished by the two decorators `@ti.data_oriented` and `@ti.dataclass`, respectively: -For convenience, let's call the hybrid scheme **objective data-oriented -programming** (ODOP). +1. `@ti.data_oriented`: It should be used when your data is actively updated in the Python scope (such as current time and user input events) and tracked in Taichi kernels. This type of class can have native Python objects as members and must be instantiated in the Python scope. This article will discuss this type of class in full detail. + +2. `@ti.dataclass`: It is a wrapper over `ti.types.struct` but offers more flexibility: You can define Taichi functions as its methods and invoke these methods in the Taichi scope. We will discuss this type of class in the next article. ## Data-oriented classes @@ -277,80 +274,3 @@ b = Counter((4, 10)) print(a.num()) # 6 print(b.num()) # 7 ``` - -## Taichi dataclasses - -Taichi provides custom [struct types](../type_system/type.md#compound-types) for developers to associate pieces of data together. However, it is often convenient to have: - 1. A Python representation of the struct type which is more object oriented. - 2. Functions associated with a struct type. (C++ style structs) - - -To achieve these two points, developers can use the `@ti.dataclass` decorator on a Python class. This is heavily inspired by the Python [dataclass](https://docs.python.org/3/library/dataclasses.html) feature, which uses class fields with annotations to create data types. - -### Creating a struct from a Python class -Here is an example of how we could create a Taichi struct type from a Python class: - -```python -@ti.dataclass -class Sphere: - center: vec3 - radius: ti.f32 -``` -This will create the *exact* same type as doing: - -```python -Sphere = ti.types.struct(center=vec3, radius=ti.f32) -``` -Using the `@ti.dataclass` decorator will convert the annotated fields in the Python class to members in the resulting struct type. In both of the above examples you would create a field of the struct the same way. 
- -```python -sphere_field = Sphere.field(shape=(n,)) -``` - -### Associating functions with the struct type -Python classes can have functions attached to them, as can Taichi struct types. Building from the above example, here is how one would add functions to the struct. - -```python -@ti.dataclass -class Sphere: - center: vec3 - radius: ti.f32 - - @ti.func - def area(self): - # a function to run in taichi scope - return 4 * math.pi * self.radius * self.radius - - def is_zero_sized(self): - # a python scope function - return self.radius == 0.0 -``` - -Functions associated with structs follow the same [scope rules](../kernels/syntax.md#taichi-scope-vs-python-scope) as normal functions, in that they can be in Taichi or Python scope. Each instance of the `Sphere` struct type now will have the above functions added to them. The functions can be called such as: - -```python -a_python_struct = Sphere(center=vec3(0.0), radius=1.0) -# calls a python scope function from python -a_python_struct.is_zero_sized() # False - -@ti.kernel -def get_area() -> ti.f32: - a_taichi_struct = Sphere(center=vec3(0.0), radius=4.0) - # return the area of the sphere, a taichi scope function - return a_taichi_struct.area() -get_area() # 201.062... -``` - -### Notes -- Inheritance of Taichi dataclasses is not implemented. -- While functions attached to a struct with the `@ti.dataclass` decorator is convenient and encouraged, it is actually possible to associate a function to structs with the older method of defining structs. As mentioned above, the two methods for defining a struct type are identical in their output. To do this, use the `__struct_methods` argument with the `ti.types.struct` call: - -```python -@ti.func -def area(self): - # a function to run in taichi scope - return 4 * math.pi * self.radius * self.radius - -Sphere = ti.types.struct(center=vec3, radius=ti.f32, - __struct_methods={'area': area}) -``` diff --git a/docs/lang/articles/advanced/odop2.md b/docs/lang/articles/advanced/odop2.md new file mode 100644 index 00000000000000..44db428f6f9039 --- /dev/null +++ b/docs/lang/articles/advanced/odop2.md @@ -0,0 +1,83 @@ +--- +sidebar_position: 3 +--- + +# Objective Data-oriented Programming II + + +## Taichi dataclasses + +Taichi provides custom [struct types](../type_system/type.md#compound-types) for developers to assemble pieces of data together. However, it would be more convenient to have: + 1. A Python representation of the struct type which is more object oriented. + 2. Functions associated with a struct type (C++-style structs). + + +To achieve the ends, Taichi enabled the `@ti.dataclass` decorator on a Python class. This is inspired by Python's [dataclass](https://docs.python.org/3/library/dataclasses.html) feature, which uses class fields with annotations to create data types. + +### Creating a struct from a Python class +Here is an example of how we could define a Taichi struct type under a Python class: + +```python +@ti.dataclass +class Sphere: + center: vec3 + radius: ti.f32 +``` +This will create the *exact* same type as using `ti.types.struct()`: + +```python +Sphere = ti.types.struct(center=vec3, radius=ti.f32) +``` +The `@ti.dataclass` decorator converts the annotated members in the Python class to members in the resulting struct type. In both of the above examples, you end up with the same struct field. 
+ +```python +sphere_field = Sphere.field(shape=(n,)) +``` + +### Associating functions with the struct type +Python classes can have functions attached to them, and so can Taichi struct types. Building from the above example, one can embed functions in the struct as follows: + +```python +@ti.dataclass +class Sphere: + center: vec3 + radius: ti.f32 + + @ti.func + def area(self): + # a function to run in taichi scope + return 4 * math.pi * self.radius * self.radius + + def is_zero_sized(self): + # a python scope function + return self.radius == 0.0 +``` + +Functions associated with structs follow the same scope rules as other functions. In other words, they can be placed in either the Taichi scope or the Python scope. Each instance of the `Sphere` struct type now have the above functions attached to them. The functions can be called in the following way: + +```python +a_python_struct = Sphere(center=vec3(0.0), radius=1.0) +# calls a python scope function from python +a_python_struct.is_zero_sized() # False + +@ti.kernel +def get_area() -> ti.f32: + a_taichi_struct = Sphere(center=vec3(0.0), radius=4.0) + # return the area of the sphere, a taichi scope function + return a_taichi_struct.area() +get_area() # 201.062... +``` + +### Notes +- Inheritance of Taichi dataclasses is not supported. +- While it is convenient and recommended to associate functions with a struct defined via `@ti.dataclass`, `ti.types.struct` can serve the same purpose with the help of the `__struct_methods` argument. As mentioned above, the two methods of defining a struct type produce identical output. + +```python +@ti.func +def area(self): + # a function to run in taichi scope + return 4 * math.pi * self.radius * self.radius + +Sphere = ti.types.struct(center=vec3, radius=ti.f32, + __struct_methods={'area': area}) +``` diff --git a/docs/lang/articles/advanced/quant.md b/docs/lang/articles/advanced/quant.md index ec2d60d3faa6c2..95fc0132d26949 100644 --- a/docs/lang/articles/advanced/quant.md +++ b/docs/lang/articles/advanced/quant.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # Use quantized data types From 59e3ab053dac9054001ae15ade00e18cc75bb0fd Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Tue, 20 Sep 2022 13:29:21 +0800 Subject: [PATCH 28/32] [Doc] Added Accelerate PyTorch (#6106) --- .../get-started/accelerate_pytorch.md | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 docs/lang/articles/get-started/accelerate_pytorch.md diff --git a/docs/lang/articles/get-started/accelerate_pytorch.md b/docs/lang/articles/get-started/accelerate_pytorch.md new file mode 100644 index 00000000000000..2c94768a590fb2 --- /dev/null +++ b/docs/lang/articles/get-started/accelerate_pytorch.md @@ -0,0 +1,223 @@ +--- +sidebar_position: 4 +--- + +# Accelerate PyTorch with Taichi + +Taichi and Torch serve different application scenarios but can complement each other. + +- Taichi provides finer control over parallelization and enables more 'granular' (element-level) operations, giving its users much more flexibilities. +- Torch abstracts such details into Tensor-level operations like LEGO bricks, enabling its users to focus on building ML (Machine Learning) models. + +This document uses two examples to explain how to use Taichi kernel to implement data preprocessing operators and custom high-performance ML operators. 
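Both examples rely on the same interop mechanism: a Taichi kernel can take a PyTorch tensor directly as a `ti.types.ndarray()` argument and read or write it in place, so no extra copy is made between the two frameworks. Below is a minimal sketch of that pattern; the kernel name, tensor shape, and backend are illustrative placeholders, not part of the two examples that follow.

```python
import taichi as ti
import torch

ti.init(arch=ti.cpu)  # assumption: CPU backend chosen only for illustration


@ti.kernel
def add_one(x: ti.types.ndarray()):
    # Parallel loop over the 2D tensor; the update happens in place.
    for i, j in ti.ndrange(x.shape[0], x.shape[1]):
        x[i, j] += 1.0


t = torch.zeros((4, 4))
add_one(t)      # the Taichi kernel writes directly into the torch tensor
print(t.sum())  # tensor(16.)
```

The `ti_pad()` and `taichi_forward_v0()` kernels shown later in this document follow the same pattern, only with more involved indexing.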
+ +## Data preprocessing + +This section uses padding as an example to show you how Taichi can complement PyTorch in data preprocessing. + +Padding is a commonly-used data preprocessing technique in machine learning. For example, padding can prevent convolution operations from changing the size of the input image. However, no PyTorch operators are designed specifically for padding in a specific customized pattern. Previously, you have two options to work around this: + +- Using Python or PyTorch to iterate over matrix elements. +- Writing a C++/CUDA operator and connecting it to PyTorch via Python's custom operator extension. + +The former has very poor efficiency and could become a drain of the neural network training performance; the latter requires large amount of domain-specific knowledge about the underlying hardware architectures and it could take a long while to get started. + +Now, you can use Taichi to pad a brick wall of a specific customized pattern in a much more efficient way. + +The following sections compare PyTorch's implementation of this workflow with Taichi's implementation: + +1. Create a 'brick' and fill it with changing colors. + + ![brick](https://user-images.githubusercontent.com/93570324/191012540-4035cf95-c9e0-4fcf-94f1-1be4cc8abfae.png) + +2. Repeat the bricks horizontally with a fixed offset to form a staggered layout. + + ![bricks](https://user-images.githubusercontent.com/93570324/191012612-2834db6b-8c31-4986-92a9-0c462b2ee9c5.png) + +### Padding with PyTorch + +The following code implements a PyTorch kernel `torch_pad()` for padding. To improve efficiency, the kernel turns the padding process into a series of native PyTorch matrix operations. But such matrix operations are usually unintuitive and require so many intermediate results to be stored in the GPU memory that old GPUs with less RAM cannot even afford them. + +```python +def torch_pad(arr, tile, y): + # image_pixel_to_coord + arr[:, :, 0] = image_height - 1 + ph - arr[:, :, 0] + arr[:, :, 1] -= pw + arr1 = torch.flip(arr, (2, )) + # map_coord + v = torch.floor(arr1[:, :, 1] / tile_height).to(torch.int) + u = torch.floor((arr1[:, :, 0] - v * shift_y[0]) / tile_width).to(torch.int) + uu = torch.stack((u, u), axis=2) + vv = torch.stack((v, v), axis=2) + arr2 = arr1 - uu * shift_x - vv * shift_y + # coord_to_tile_pixel + arr2[:, :, 1] = tile_height - 1 - arr2[:, :, 1] + table = torch.flip(arr2, (2, )) + table = table.view(-1, 2).to(torch.float) + inds = table.mv(y) + gathered = torch.index_select(tile.view(-1), 0, inds.to(torch.long)) + return gathered + +with Timer(): + gathered = torch_pad(coords, tile, y) + torch.cuda.synchronize(device=device) +``` + +### Padding with Taichi + +The following code implements a Taichi kernel `ti_pad()` for padding. The kernel iterates over the pixels in the output image, works out each pixel's corresponding position in the input 'brick', and fills the pixel with the RGB color in that position. + +Taichi automatically runs the top-level for-loops in parallel, and matrix operations written in Taichi are much more readable. Moreover, as you can tell from the following code, `ti_pad()` takes in the PyTorch tensors directly so that it can reuse the memory allocated by PyTorch and would not cause extra overhead from the data transfer between the two frameworks. 
+ +```python +@ti.kernel +def ti_pad(image_pixels: ti.types.ndarray(), tile: ti.types.ndarray()): + for row, col in ti.ndrange(image_height, image_width): + # image_pixel_to_coord + x1, y1 = ti.math.ivec2(col - pw, image_height - 1 - row + ph) + # map_coord + v: ti.i32 = ti.floor(y1 / tile_height) + u: ti.i32 = ti.floor((x1 - v * shift_y[0]) / tile_width) + x2, y2 = ti.math.ivec2(x1 - u * shift_x[0] - v * shift_y[0], + y1 - u * shift_x[1] - v * shift_y[1]) + # coord_to_tile_pixel + x, y = ti.math.ivec2(tile_height - 1 - y2, x2) + image_pixels[row, col] = tile[x, y] +with Timer(): + ti_pad(image_pixels, tile) + ti.sync() +``` + +### Performance comparison + +As the following table shows, the PyTorch kernel takes 30.392 ms[1] to complete padding; the Taichi kernel takes 0.267 ms only. Taichi outruns PyTorch by more than 100x (30.392/0.267). + +`torch_pad()` launches 58 CUDA kernels, whilst Taichi compiles all computation into one CUDA kernel. The fewer the CUDA kernels, the less GPU launch overhead is incurred. Moreover, the Taichi kernel manages to save a lot more redundant memory operations than the PyTorch kernel. The GPU launch overhead and the redundant memory operations are the potential source for optimization and acceleration. + +| Kernel function | Average time (ms) | CUDA kernels launched (number) | +| :--------------- | :----------------- | :------------------------------ | +| `torch_pad()` | 30.392 | 58 | +| `ti_pad()` | 0.267 | 1 | + +> - GPU: RTX3090 +> - PyTorch version: v1.12.1; Taichi version: v1.1.0 +> - The actual acceleration rate may vary depending on your implementation and GPU setup. + +## Customize ML operators + +Researchers in machine learning usually spend a lot of time designing model architectures. Because they cannot find decent support for their newly-designed or customized operators from PyTorch, they have to spend time studying CUDA for fine tuning and to improve efficiency. But writing in CUDA is hard, tuning CUDA code is even harder, and accelerating model iteration with CUDA is difficult. + +[This repo](https://github.com/BlinkDL/RWKV-CUDA) introduces an example of customizing an ML operator in CUDA. The author developed an RWKV language model using sort of a one-dimensional depthwise convolution custom operator. The model does not involve much computation but still runs slow because PyTorch does not have native support for it. So, the author customized the operator in CUDA using a set of optimization techniques, such as loop fusion and Shared Memory, and achieved a performance 20x better than he did with PyTorch. + +Referring to the CUDA code[3], we customized a Taichi depthwise convolution operator[4] in the RWKV model using the same optimization techniques. + +The function of the depth wise convolution operator: + +1. Iterates over two input Tensors `w` and `k`, +2. Adds up the product of the respective elements in `w` and `k` into `s`, +3. Saves `s` to an output Tensor `out`. + +The following subsections take the Baseline implementations as an example to show you how to implement a depthwise convolution operator with Python, PyTorch, CUDA, and Taichi, and how they compare to each other. With Taichi, you can accelerate your ML model development with ease and get rid of the tedious low-level parallel programming. 
+ +| Implementation | Readability | Performance | +| :-------------- | :----------- | :----------------------------------------- | +| Python | Excellent | The slowest | +| PyTorch | Poor | Slow | +| CUDA | Poor | Fast | +| Taichi | Excellent | Comparable to that of CUDA or even better | + +### Implement a depthwise convolution operator with Python + +The Python reference code is straightforward and easy to understand, but it runs so slow that the result can hardly make itself into the diagram above. + +```python +def run_formula_very_slow(w, k, B, C, T, eps): + out = torch.empty((B, C, T), device='cpu') + for b in range(B): + for c in range(C): + for t in range(T): + s = eps + for u in range(t-T+1, t+1): + s += w[c][0][(T-1)-(t-u)] * k[b][c][u+T-1] + out[b][c][t] = s + return out +``` + +### Implement a depthwise convolution operator with PyTorch + +It is very challenging to translate the Python reference code above to this code line. To come up with this, you have to know very well the underlying logic of these PyTorch operators. + +```python +out = eps + F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w.unsqueeze(1), groups=C) +``` + +### Implement a depthwise convolution operator with CUDA + +The CUDA reference code has much poorer readability: The outmost loop is implicitly defined by thread parallelism. The index calculation is complicated, and each element's position in the matrix is not clear at a glance. Besides, it could be rather error-prone to implement more sophisticated algorithms with CUDA. + +```cpp +__global__ void kernel_forward(const float* w, const float* k, float* x, + const float eps, const int B, const int C, const int T) +{ + const int i = blockIdx.y; + const int t = threadIdx.x; + float s = eps; + const float* www = w + (i % C) * T + (T - 1) - t; + const float* kk = k + i * T; + for (int u = 0; u <= t; u++){ + s += www[u] * kk[u]; + } + x[i * T + t] = s; +} +``` + +Further, you need a proper compile environment to run your CUDA code! If you have precompiled your CUDA code into a dynamic link library, then you also need to spend time working hard on trivial matters such as environment settings and Python API encapsulation. + +### Implement a depthwise convolution operator with Taichi + +The Taichi reference code is almost identical to its Python counterpart. And a good advantage that Taichi has over CUDA is that, without worrying about low-level details like parallelization and pointer offsets, one can easily use Taichi to achieve comparable performance. + +```python +@ti.kernel +def taichi_forward_v0( + out: ti.types.ndarray(field_dim=3), + w: ti.types.ndarray(field_dim=3), + k: ti.types.ndarray(field_dim=3), + eps: ti.f32): + + for b, c, t in out: + s = eps + for u in range(t-T+1, t+1): + s += w[c, 0, (T-1)-(t-u)] * k[b, c, u+T-1] + out[b, c, t] = s +``` + +### Performance comparison + +The following diagram shows that Taichi always shows a performance that is comparable to its CUDA counterpart or even better under certain circumstances. + +![comparison](https://user-images.githubusercontent.com/93570324/191012778-99408533-c3a2-4868-a750-e853a63d2697.png) + +> - The RWKV compute time in the diagram is in milliseconds. The less the compute time, the better the performance is. +> - 'Baseline': The reference code is a faithful implementation of the algorithm without any modification. +> - v1 to v3: The three different optimized implementations. + +## Recap + +PyTorch is efficient in handling a large proportion of computation tasks in machine learning. 
Still, there are niches and needs that it falls short of addressing, such as native support for many operators and unsatisfactory runtime performance.
+
+As a high-performance programming language embedded in Python, Taichi features:
+
+- Easy readability,
+- Optimized memory consumption,
+- Runtime performance comparable to that of CUDA,
+- Good portability that encourages reproducible code sharing among the community.
+
+All these features set Taichi apart as a convenient tool for ML operator customization. The two examples provided in this document give you a glimpse of how Taichi and PyTorch can complement each other to solve real-world high-performance programming issues.
+
+## Reference
+
+- [1] [Pure PyTorch padding](https://github.com/ailzhang/blog_code/blob/master/tile/demo_torch.py)
+- [2] [Padding PyTorch tensor in Taichi kernel](https://github.com/ailzhang/blog_code/blob/master/tile/demo_taichi.py)
+- [3] [RWKV-CUDA](https://github.com/BlinkDL/RWKV-CUDA/tree/main/depthwise_conv1d)
+- [4] [RWKV-Taichi ](https://github.com/ailzhang/blog_code/tree/master/rwkv)
From 78e3404b414d4f5bbc8bcf284dba59d67e6b6f0d Mon Sep 17 00:00:00 2001
From: Olinaaaloompa <106292061+Olinaaaloompa@users.noreply.github.com>
Date: Tue, 20 Sep 2022 13:29:45 +0800
Subject: [PATCH 29/32] [Doc] Move developer utilities to contribution (#6109)
---
.../articles/{debug => contribution}/developer_utilities.md | 2 +-
docs/lang/articles/contribution/development_tips.md | 2 +-
docs/lang/articles/contribution/doc_writing.md | 2 +-
docs/lang/articles/contribution/style_guide_en.md | 2 +-
docs/lang/articles/contribution/write_test.md | 2 +-
docs/lang/articles/contribution/writing_cpp_tests.md | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
rename docs/lang/articles/{debug => contribution}/developer_utilities.md (99%)
diff --git a/docs/lang/articles/debug/developer_utilities.md b/docs/lang/articles/contribution/developer_utilities.md
similarity index 99%
rename from docs/lang/articles/debug/developer_utilities.md
rename to docs/lang/articles/contribution/developer_utilities.md
index aebba0f23d5a0e..c3e1e8bc98d872 100644
--- a/docs/lang/articles/debug/developer_utilities.md
+++ b/docs/lang/articles/contribution/developer_utilities.md
@@ -1,5 +1,5 @@ --- -sidebar_position: 2 +sidebar_position: 3 --- # Developer Utilities
diff --git a/docs/lang/articles/contribution/development_tips.md b/docs/lang/articles/contribution/development_tips.md
index b648056b8128a3..43f47cccf812af 100644
--- a/docs/lang/articles/contribution/development_tips.md
+++ b/docs/lang/articles/contribution/development_tips.md
@@ -1,5 +1,5 @@ --- -sidebar_position: 5 +sidebar_position: 6 --- # Development Tips
diff --git a/docs/lang/articles/contribution/doc_writing.md b/docs/lang/articles/contribution/doc_writing.md
index fd01e8da852bb3..fc8b9943999d02 100644
--- a/docs/lang/articles/contribution/doc_writing.md
+++ b/docs/lang/articles/contribution/doc_writing.md
@@ -1,5 +1,5 @@ --- -sidebar_position: 6 +sidebar_position: 7 --- # Markdown Syntax
diff --git a/docs/lang/articles/contribution/style_guide_en.md b/docs/lang/articles/contribution/style_guide_en.md
index c5aadb327746fa..f613feb389b0be 100644
--- a/docs/lang/articles/contribution/style_guide_en.md
+++ b/docs/lang/articles/contribution/style_guide_en.md
@@ -1,5 +1,5 @@ --- -sidebar_position: 7 +sidebar_position: 8 --- # Document Style Guide
diff --git a/docs/lang/articles/contribution/write_test.md b/docs/lang/articles/contribution/write_test.md
index 
ae45477d794081..867ea02e1b7935 100644 --- a/docs/lang/articles/contribution/write_test.md +++ b/docs/lang/articles/contribution/write_test.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # Write a Python test diff --git a/docs/lang/articles/contribution/writing_cpp_tests.md b/docs/lang/articles/contribution/writing_cpp_tests.md index 80ab14e8c96e4e..08b9289c32141e 100644 --- a/docs/lang/articles/contribution/writing_cpp_tests.md +++ b/docs/lang/articles/contribution/writing_cpp_tests.md @@ -1,5 +1,5 @@ --- -sidebar_position: 4 +sidebar_position: 5 --- # Write a C++ test From 001f4d78b34fceb253e29416fc7565aa0a2e1a1a Mon Sep 17 00:00:00 2001 From: Qian Bao Date: Tue, 20 Sep 2022 13:39:06 +0800 Subject: [PATCH 30/32] [doc] Update explanation on data-layout (#6110) --- docs/lang/articles/basic/layout.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/lang/articles/basic/layout.md b/docs/lang/articles/basic/layout.md index e764cc8f0b8117..db46b10ddbdad1 100644 --- a/docs/lang/articles/basic/layout.md +++ b/docs/lang/articles/basic/layout.md @@ -58,13 +58,40 @@ ti.root.dense(ti.ij, (3, 4)).place(x) x = ti.field(ti.f32, shape=(3, 4)) ``` -You can also nest two 1D `dense` statements to describe the same 2D array. +You can also nest two 1D `dense` statements to describe a 2D array of the same shape. ```python {1-2} x = ti.field(ti.f32) ti.root.dense(ti.i, 3).dense(ti.j, 4).place(x) +# has the same shape with +x = ti.field(ti.f32, shape=(3,4)) ``` +:::note + +The above 2D array built with nested `dense` statements is *not* equivalent to the 2D array built with `ti.field`. +Although both statements result in a 2D array of the same shape, they have +different layers of `SNodeTree`. In other words, +```python +x = ti.field(ti.f32) +ti.root.dense(ti.i, 3).dense(ti.j, 4).place(x) +``` +has two `SNodeTree` layers below the root; +```python +x = ti.field(ti.f32) +ti.root.dense(ti.ij, (3, 4)).place(x) +# or equivalently +x = ti.field(ti.f32, shape=(3,4)) +``` +has only one `SNodeTree` layer below the root. See the sketch below: + +![2D data-layout sketch](https://user-images.githubusercontent.com/2747993/190545525-305563dc-d09e-4af2-b99b-166d5c4398d0.png) + +The difference here is subtle because both arrays are row-major, but it may have slight performance impact +because the overhead of calculating the `SNodeTree` index is different for the two. + +::: + In a nutshell, the `ti.root.X` statement progressively binds a shape to the corresponding axis. By nesting multiple statements, we can construct a field with higher dimensions. 
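For instance, a minimal sketch of a three-level nesting (the shape `(2, 3, 4)`, the field name, and the backend below are illustrative assumptions, not taken from the patch above):

```python
import taichi as ti

ti.init(arch=ti.cpu)  # assumption: CPU backend chosen only for illustration

# Same shape as ti.field(ti.f32, shape=(2, 3, 4)), but each `dense` statement
# binds one axis, so three SNodeTree layers sit below the root.
x = ti.field(ti.f32)
ti.root.dense(ti.i, 2).dense(ti.j, 3).dense(ti.k, 4).place(x)


@ti.kernel
def fill():
    # Struct-for over the field yields one loop index per bound axis.
    for i, j, k in x:
        x[i, j, k] = i * 100 + j * 10 + k


fill()
print(x[1, 2, 3])  # 123.0
```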
From 02822e70d1100b1adca9bc3796f7cee9408053fa Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Tue, 20 Sep 2022 14:07:04 +0800 Subject: [PATCH 31/32] [doc] Fixed a broken link (#6111) --- docs/lang/articles/type_system/type.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/lang/articles/type_system/type.md b/docs/lang/articles/type_system/type.md index 1ddd482703bdcc..27fdb8cc3cc8c4 100644 --- a/docs/lang/articles/type_system/type.md +++ b/docs/lang/articles/type_system/type.md @@ -255,7 +255,7 @@ The code above serves the same purpose as the line below does but provides bette Sphere = ti.types.struct(center=vec3, radius=float) ``` -Another advantage of using `@ti.dataclass` over `ti.types.struct` is that you can define member functions in a dataclass and call them in the Taichi scope, making object-oriented programming (OOP) possible. See the article [objective data-oriented programming](../advanced/odop.md) for more details. +Another advantage of using `@ti.dataclass` over `ti.types.struct` is that you can define member functions in a dataclass and call them in the Taichi scope, making object-oriented programming (OOP) possible. See the article [objective data-oriented programming](../advanced/odop2.md) for more details. ### Initialization From 966f7fa8b80286883bc8fca4bae275e478a7ddc5 Mon Sep 17 00:00:00 2001 From: Taichi Gardener <62079278+taichi-gardener@users.noreply.github.com> Date: Tue, 20 Sep 2022 14:58:53 +0800 Subject: [PATCH 32/32] [misc] Bump version to v1.1.4 (#6112) Bump version to v1.1.4 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 99a4aef0c4d4d5..c641220244f1c6 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v1.1.3 +v1.1.4