From 70822bd1fbb7df53f4e7d243f6588d9f2c5d140e Mon Sep 17 00:00:00 2001 From: Proton Date: Thu, 11 May 2023 18:46:57 +0800 Subject: [PATCH 01/18] [misc] Do not print CHANGELOG when specified --save (make_changelog.py) ghstack-source-id: 9b846a6261afc4c30f1ea8ba1034d6608a8425f5 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7983 --- misc/make_changelog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/misc/make_changelog.py b/misc/make_changelog.py index a7e23e593ece4..c373aa52bdf4a 100644 --- a/misc/make_changelog.py +++ b/misc/make_changelog.py @@ -112,4 +112,5 @@ def format(c): if args.save: with open("./python/taichi/CHANGELOG.md", "w", encoding="utf-8") as f: f.write(res) - print(res) + else: + print(res) From c71549b57489fe4116042bb9584781da7f393abb Mon Sep 17 00:00:00 2001 From: Proton Date: Thu, 11 May 2023 18:47:03 +0800 Subject: [PATCH 02/18] [ci] Do not try to terminate sccache server after compilation Can disturb concurrent builds ghstack-source-id: fe3d955213ba68569af16d033de0b24b5ec2d029 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7984 --- .github/workflows/scripts/ti_build/entry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py index 41dd136be38e9..5a7683290bdd0 100644 --- a/.github/workflows/scripts/ti_build/entry.py +++ b/.github/workflows/scripts/ti_build/entry.py @@ -89,7 +89,7 @@ def action_wheel(): handle_alternate_actions() build_wheel(python, pip) try: - sccache("--stop-server") + sccache("-s") except CommandFailed: pass @@ -100,7 +100,7 @@ def action_android(): handle_alternate_actions() build_android(python, pip) try: - sccache("--stop-server") + sccache("-s") except CommandFailed: pass From 2c4ed0dad011b4464a65ca67032d6bb9cc467e15 Mon Sep 17 00:00:00 2001 From: Proton Date: Thu, 11 May 2023 18:47:09 +0800 Subject: [PATCH 03/18] [ci] build.py: Add nice when compiling ghstack-source-id: bc7b2bb9014e36cf8e24b732906b0def7b3131a3 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7985 --- .github/workflows/scripts/ti_build/entry.py | 5 +++-- .github/workflows/scripts/ti_build/tinysh.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py index 5a7683290bdd0..f9ea83aef8f6e 100644 --- a/.github/workflows/scripts/ti_build/entry.py +++ b/.github/workflows/scripts/ti_build/entry.py @@ -20,7 +20,7 @@ from .ospkg import setup_os_pkgs from .python import get_desired_python_version, setup_python from .sccache import setup_sccache -from .tinysh import Command, CommandFailed, git +from .tinysh import Command, CommandFailed, git, nice from .vulkan import setup_vulkan @@ -50,7 +50,8 @@ def build_wheel(python: Command, pip: Command) -> None: python("setup.py", "clean") python("misc/make_changelog.py", "--ver", "origin/master", "--repo_dir", "./", "--save") - python("setup.py", *proj_tags, "bdist_wheel", *extra) + with nice(): + python("setup.py", *proj_tags, "bdist_wheel", *extra) @banner("Install Build Wheel Dependencies") diff --git a/.github/workflows/scripts/ti_build/tinysh.py b/.github/workflows/scripts/ti_build/tinysh.py index 4a88d88077b84..516136de6b52e 100644 --- a/.github/workflows/scripts/ti_build/tinysh.py +++ b/.github/workflows/scripts/ti_build/tinysh.py @@ -218,6 +218,19 @@ def sudo(): return prefix("sudo") +def nice(): + """ + Wrap a command 
with nice.
+    """
+    if IS_WINDOWS:
+        from .misc import warn
+
+        warn("nice is not yet implemented on Windows")
+        return with_options({})
+    else:
+        return prefix("nice")
+
+
 sh = Command()
 git = sh.git  # Use setup_python !

From 4d406ec101275d9a7b0c2b9742d1edacb72ca5aa Mon Sep 17 00:00:00 2001
From: Proton
Date: Thu, 11 May 2023 18:47:15 +0800
Subject: [PATCH 04/18] [ci] Tag wheel with TI_WITH_xxx tags

ghstack-source-id: 90bf91396419802d8bba58ea92bf381d9ba30b9e
Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7986
---
 .github/workflows/scripts/ti_build/cmake.py | 27 ++++++++++------
 .github/workflows/scripts/ti_build/entry.py | 35 +++++++++++++++------
 CMakeLists.txt                              |  8 ++---
 cmake/TaichiCore.cmake                      | 20 ++++++------
 4 files changed, 56 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/cmake.py b/.github/workflows/scripts/ti_build/cmake.py
index cf6d07ed5df14..2dc327710e40b 100644
--- a/.github/workflows/scripts/ti_build/cmake.py
+++ b/.github/workflows/scripts/ti_build/cmake.py
@@ -14,7 +14,7 @@
 from .misc import banner

 # -- code --
-OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)')
+OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)(?: *# wheel-tag: (.*))?')
 DEF_RE = re.compile(r"-D([A-Z0-9_]*)(?::BOOL)?=([^ ]+)(?: |$)")

@@ -28,10 +28,7 @@ def __init__(self, environ_name):
         self.environ_name = environ_name
         self.definitions = {}
         self.option_definitions = {
-            "CMAKE_EXPORT_COMPILE_COMMANDS": (
-                "Generate compile_commands.json",
-                False,
-            ),
+            "CMAKE_EXPORT_COMPILE_COMMANDS": ("Generate compile_commands.json", False, ""),
         }
         self.finalized = False

@@ -39,9 +36,9 @@ def collect_options(self, *files: str) -> None:
         for fn in files:
             with open(fn, "r") as f:
-                for name, desc, default in OPTION_RE.findall(f.read()):
+                for name, desc, default, wheel_tag in OPTION_RE.findall(f.read()):
                     default = self._VMAP.get(default, default)
-                    self.option_definitions[name] = (desc, default)
+                    self.option_definitions[name] = (desc, default, wheel_tag)

     def parse_initial_args(self) -> None:
         args = os.environ.get(self.environ_name, "")
@@ -49,7 +46,7 @@
             self.set(name, value)

     def get_effective(self, name: str) -> Union[str, bool]:
-        _, default = self.option_definitions.get(name, ("", None))
+        _, default, _ = self.option_definitions.get(name, ("", None, ""))
         return self.definitions.get(name, default)

     def set(self, name: str, value: Union[str, bool]) -> None:
@@ -57,7 +54,7 @@
         desc = ""
         value = self._VMAP.get(value, value)
         default = None
-        desc, default = self.option_definitions.get(name, ("", None))
+        desc, default, wheel_tag = self.option_definitions.get(name, ("", None, ""))
         desc = desc and f" ({desc}) "
         is_bool = isinstance(default, bool)
         assert not is_bool or isinstance(value, bool), f"Option {name} must be bool"
@@ -84,6 +81,7 @@
             else:
                 p(f"{B}:: CMAKE: Already disabled: {name}{desc}{N}")
         else:
+            assert not wheel_tag, "Cannot set a non-boolean value on an option with a wheel-tag"
             if orig != value:
                 if orig != default:
                     p(f"{R}:: CMAKE- {name}={orig}{desc}{N}")
@@ -99,7 +97,7 @@
             else:
                 v = f"-D{name}={value}"

-            desc, _ = self.option_definitions.get(name, ("", None))
+            desc, _, _ = self.option_definitions.get(name, ("", None, ""))
             if desc:
                 prefix = "DO NOT " if not value else ""
                 desc = f" ({prefix}{desc})"
@@ 
-108,6 +106,15 @@ def render(self) -> List[Tuple[str, str, str]]: return lst + def render_wheel_tag(self) -> str: + tags = [] + for name, (_, default, wheel_tag) in self.option_definitions.items(): + if not wheel_tag: + continue + if self.definitions.get(name, default): + tags.append(wheel_tag) + return ".".join(sorted(tags)) + @banner("{self.environ_name} Summary") def print_summary(self, rendered) -> None: p = lambda s: print(s, file=sys.stderr, flush=True) diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py index f9ea83aef8f6e..663de529f1463 100644 --- a/.github/workflows/scripts/ti_build/entry.py +++ b/.github/workflows/scripts/ti_build/entry.py @@ -2,6 +2,7 @@ # -- stdlib -- import argparse +import datetime import os import platform import subprocess @@ -30,15 +31,22 @@ def build_wheel(python: Command, pip: Command) -> None: """ Build the Taichi wheel """ + git.fetch("origin", "master", "--tags") - proj = os.environ.get("PROJECT_NAME", "taichi") proj_tags = [] extra = [] - if proj == "taichi-nightly": - proj_tags.extend(["egg_info", "--tag-date", "--tag-build=.post"]) - # Include C-API in nightly builds - cmake_args["TI_WITH_C_API"] = True + cmake_args.writeback() + wheel_tag = cmake_args.render_wheel_tag() + if misc.options.tag_local: + wheel_tag = misc.options.tag_local + + if misc.options.nightly: + os.environ["PROJECT_NAME"] = "taichi-nightly" + now = datetime.datetime.now().strftime("%Y%m%d") + proj_tags.extend(["egg_info", f"--tag-build=.post{now}+{wheel_tag}"]) + elif misc.options.tag_config or misc.options.tag_local: + proj_tags.extend(["egg_info", f"--tag-build=+{wheel_tag}"]) if platform.system() == "Linux": if is_manylinux2014(): @@ -46,7 +54,6 @@ def build_wheel(python: Command, pip: Command) -> None: else: extra.extend(["-p", "manylinux_2_27_x86_64"]) - cmake_args.writeback() python("setup.py", "clean") python("misc/make_changelog.py", "--ver", "origin/master", "--repo_dir", "./", "--save") @@ -60,7 +67,7 @@ def install_build_wheel_deps(python: Command, pip: Command) -> None: pip.install("-r", "requirements_dev.txt") -def setup_basic_build_env(force_vulkan=False): +def setup_basic_build_env(): u = platform.uname() if (u.system, u.machine) == ("Windows", "AMD64"): # Use MSVC on Windows @@ -71,8 +78,7 @@ def setup_basic_build_env(force_vulkan=False): setup_clang() setup_llvm() - if force_vulkan or cmake_args.get_effective("TI_WITH_VULKAN"): - setup_vulkan() + setup_vulkan() sccache = setup_sccache() @@ -96,7 +102,7 @@ def action_wheel(): def action_android(): - sccache, python, pip = setup_basic_build_env(force_vulkan=True) + sccache, python, pip = setup_basic_build_env() setup_android_ndk() handle_alternate_actions() build_android(python, pip) @@ -151,6 +157,15 @@ def parse_args(): help = "Continue when encounters error." parser.add_argument("--permissive", action="store_true", default=False, help=help) + help = "Tag built wheel with TI_WITH_xxx config." + parser.add_argument("--tag-config", action="store_true", default=False, help=help) + + help = "Set a local version. Overrides --tag-config." + parser.add_argument("--tag-local", type=str, default=None, help=help) + + help = "Build nightly wheel." 
+ parser.add_argument("--nightly", action="store_true", default=False, help=help) + options = parser.parse_args() return options diff --git a/CMakeLists.txt b/CMakeLists.txt index d024821b1f5e0..b5988e4285255 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(USE_LLD "Use lld (from llvm) linker" OFF) option(USE_MOLD "Use mold (A Modern Linker)" OFF) -option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) +option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) # wheel-tag: bt if(LINUX OR APPLE) if (NOT IOS) @@ -205,8 +205,8 @@ endif() configure_file(taichi/common/version.h.in ${CMAKE_SOURCE_DIR}/taichi/common/version.h) configure_file(taichi/common/commit_hash.h.in ${CMAKE_SOURCE_DIR}/taichi/common/commit_hash.h) -option(TI_WITH_C_API "build taichi runtime c-api library" ON) -option(TI_WITH_STATIC_C_API "build static taichi runtime c-api library" OFF) +option(TI_WITH_C_API "build taichi runtime c-api library" ON) # wheel-tag: aot +option(TI_WITH_STATIC_C_API "build static taichi runtime c-api library" OFF) # wheel-tag: static_aot if(TI_WITH_STATIC_C_API) set(TI_WITH_C_API ${TI_WITH_STATIC_C_API}) @@ -231,7 +231,7 @@ if (TI_BUILD_RHI_EXAMPLES) endif() -option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) +option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) # wheel-tag: viz if (TI_WITH_GRAPHVIZ) set(GRAPHVIZ_GRAPH_NAME "ti_targets") add_custom_target(graphviz ALL diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 8579d2d3bdacb..a3b44521f23d2 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -1,14 +1,14 @@ option(USE_STDCPP "Use -stdlib=libc++" OFF) -option(TI_WITH_LLVM "Build with LLVM backends" ON) -option(TI_WITH_METAL "Build with the Metal backend" ON) -option(TI_WITH_CUDA "Build with the CUDA backend" ON) -option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) -option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) -option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) -option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) -option(TI_WITH_DX11 "Build with the DX11 backend" OFF) -option(TI_WITH_DX12 "Build with the DX12 backend" OFF) -option(TI_WITH_GGUI "Build with GGUI" OFF) +option(TI_WITH_LLVM "Build with LLVM backends" ON) # wheel-tag: llvm +option(TI_WITH_METAL "Build with the Metal backend" ON) # wheel-tag: mtl +option(TI_WITH_CUDA "Build with the CUDA backend" ON) # wheel-tag: cu +option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) # wheel-tag: cutk +option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) # wheel-tag: amd +option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) # wheel-tag: gl +option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) # wheel-tag: vk +option(TI_WITH_DX11 "Build with the DX11 backend" OFF) # wheel-tag: dx11 +option(TI_WITH_DX12 "Build with the DX12 backend" OFF) # wheel-tag: dx12 +option(TI_WITH_GGUI "Build with GGUI" OFF) # wheel-tag: ggui # Force symbols to be 'hidden' by default so nothing is exported from the Taichi # library including the third-party dependencies. 
From d12826555d54c35f882213f9e2624861b64837b2 Mon Sep 17 00:00:00 2001
From: Proton
Date: Thu, 11 May 2023 18:47:21 +0800
Subject: [PATCH 05/18] [build] Not generating PDB files by default (for
 compilation caching)

ghstack-source-id: 44bdc44b4312a4fdd46e14e8d76c6db1e16643eb
Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7987
---
 CMakeLists.txt              |  1 +
 cmake/TaichiCAPITests.cmake |  2 +-
 cmake/TaichiCXXFlags.cmake  |  6 ++++--
 cmake/TaichiTests.cmake     | 10 +++++-----
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b5988e4285255..49311f433a7d4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(USE_LLD "Use lld (from llvm) linker" OFF)
 option(USE_MOLD "Use mold (A Modern Linker)" OFF)
 option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) # wheel-tag: bt
+option(TI_GENERATE_PDB "Generate Program Database (PDB) files (will make compilation uncacheable)" OFF)

 if(LINUX OR APPLE)
     if (NOT IOS)
diff --git a/cmake/TaichiCAPITests.cmake b/cmake/TaichiCAPITests.cmake
index de894cf131b54..ad743eec3cb52 100644
--- a/cmake/TaichiCAPITests.cmake
+++ b/cmake/TaichiCAPITests.cmake
@@ -22,7 +22,7 @@ if (WIN32)
   set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${C_API_TESTS_OUTPUT_DIR})
   set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${C_API_TESTS_OUTPUT_DIR})
   set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${C_API_TESTS_OUTPUT_DIR})
-  if (MSVC)
+  if (MSVC AND TI_GENERATE_PDB)
     target_compile_options(${C_API_TESTS_NAME} PRIVATE "$<$:/Zi>")
     target_link_options(${C_API_TESTS_NAME} PRIVATE "$<$:/DEBUG>")
     target_link_options(${C_API_TESTS_NAME} PRIVATE "$<$:/OPT:REF>")
diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake
index a03e41d9fc59e..9a704cf557bc5 100644
--- a/cmake/TaichiCXXFlags.cmake
+++ b/cmake/TaichiCXXFlags.cmake
@@ -39,8 +39,10 @@ if (WIN32)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus /Zc:inline /std:c++17")
   # Linker & object related flags
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /bigobj")
-  # Debugging (generate PBD files)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /Zf")
+  # Debugging (generate PDB files)
+  if (TI_GENERATE_PDB)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /Zf")
+  endif()
   # Performance and optimizations
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Oi")
   # C4244: conversion from 'type1' to 'type2', possible loss of data
diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake
index c3d59577d15a9..63d96c3a60c11 100644
--- a/cmake/TaichiTests.cmake
+++ b/cmake/TaichiTests.cmake
@@ -56,11 +56,11 @@ if (WIN32)
   set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${TESTS_OUTPUT_DIR})
   set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${TESTS_OUTPUT_DIR})
   set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${TESTS_OUTPUT_DIR})
-  if (MSVC)
-    target_compile_options(${TESTS_NAME} PRIVATE "$<$:/Zi>")
-    target_link_options(${TESTS_NAME} PRIVATE "$<$:/DEBUG>")
-    target_link_options(${TESTS_NAME} PRIVATE "$<$:/OPT:REF>")
-    target_link_options(${TESTS_NAME} PRIVATE "$<$:/OPT:ICF>")
+  if (MSVC AND TI_GENERATE_PDB)
+    target_compile_options(${TESTS_NAME} PRIVATE "/Zi")
+    target_link_options(${TESTS_NAME} PRIVATE "/DEBUG")
+    target_link_options(${TESTS_NAME} PRIVATE "/OPT:REF")
+ 
target_link_options(${TESTS_NAME} PRIVATE "/OPT:ICF") endif() endif() target_link_libraries(${TESTS_NAME} PRIVATE taichi_core) From 4c7ec67b6c24113a3683815d433e8cee717133ad Mon Sep 17 00:00:00 2001 From: Proton Date: Thu, 11 May 2023 18:47:26 +0800 Subject: [PATCH 06/18] [build] Use Ninja and MSVC to build on Windows ghstack-source-id: d2cc403815fe196fdf0770beb4a3bd2872727496 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7988 --- .../workflows/scripts/ti_build/compiler.py | 114 +++++++++++++----- .github/workflows/scripts/ti_build/tinysh.py | 2 + taichi/rhi/CMakeLists.txt | 5 + taichi/rhi/dummy.cpp | 0 4 files changed, 93 insertions(+), 28 deletions(-) create mode 100644 taichi/rhi/dummy.cpp diff --git a/.github/workflows/scripts/ti_build/compiler.py b/.github/workflows/scripts/ti_build/compiler.py index 5854705bc3f55..4c3b66dc15f60 100644 --- a/.github/workflows/scripts/ti_build/compiler.py +++ b/.github/workflows/scripts/ti_build/compiler.py @@ -3,14 +3,18 @@ # -- stdlib -- from pathlib import Path import os +import json import platform import shutil +import tempfile +import sys # -- third party -- # -- own -- from .cmake import cmake_args from .dep import download_dep from .misc import banner, error, get_cache_home, warn +from .tinysh import powershell # -- code -- @@ -56,36 +60,90 @@ def setup_clang(as_compiler=True) -> None: cmake_args["CMAKE_CXX_COMPILER"] = clangpp +ENV_EXTRACT_SCRIPT = """ +param ([string]$DevShell, [string]$VsPath, [string]$OutFile) +$WarningPreference = 'SilentlyContinue' +Import-Module $DevShell +Enter-VsDevShell -VsInstallPath $VsPath -SkipAutomaticLocation -DevCmdArguments "-arch=x64" +Get-ChildItem env:* | ConvertTo-Json -Depth 1 | Out-File $OutFile +""" + + +def _vs_devshell(vs): + dll = vs / "Common7" / "Tools" / "Microsoft.VisualStudio.DevShell.dll" + + if not dll.exists(): + error("Could not find Visual Studio DevShell") + return + + with tempfile.TemporaryDirectory() as tmp: + tmp = Path(tmp) + script = tmp / "extract.ps1" + with script.open("w") as f: + f.write(ENV_EXTRACT_SCRIPT) + outfile = tmp / "env.json" + powershell( + "-ExecutionPolicy", + "Bypass", + "-File", + str(script), + "-DevShell", + str(dll), + "-VsPath", + str(vs), + "-OutFile", + str(outfile), + ) + with outfile.open(encoding="utf-16") as f: + envs = json.load(f) + + for v in envs: + os.environ[v["Key"]] = v["Value"] + + @banner("Setup MSVC") def setup_msvc() -> None: assert platform.system() == "Windows" - os.environ["TAICHI_USE_MSBUILD"] = "1" - base = Path(r"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools") - for edition in ("Enterprise", "Professional", "Community", "BuildTools"): - if (base / edition).exists(): - return + base = Path("C:\\Program Files (x86)\\Microsoft Visual Studio") + for ver in ("2022",): + for edition in ("Enterprise", "Professional", "Community", "BuildTools"): + vs = base / ver / edition + if not vs.exists(): + continue + + if os.environ.get("TI_CI") and not os.environ.get("TAICHI_USE_MSBUILD"): + # Use Ninja + MSVC in CI, for better caching + _vs_devshell(vs) + cmake_args["CMAKE_C_COMPILER"] = "cl.exe" + cmake_args["CMAKE_CXX_COMPILER"] = "cl.exe" + else: + os.environ["TAICHI_USE_MSBUILD"] = "1" - url = "https://aka.ms/vs/17/release/vs_BuildTools.exe" - out = base - download_dep( - url, - out, - elevate=True, - args=[ - "--passive", - "--wait", - "--norestart", - "--includeRecommended", - "--add", - "Microsoft.VisualStudio.Workload.VCTools", - # NOTE: We are using the custom built Clang++, - # so components below 
are not necessary anymore. - # '--add', - # 'Microsoft.VisualStudio.Component.VC.Llvm.Clang', - # '--add', - # 'Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Llvm.Clang', - # '--add', - # 'Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset', - ], - ) + return + else: + url = "https://aka.ms/vs/17/release/vs_BuildTools.exe" + out = base / "2022" / "BuildTools" + download_dep( + url, + out, + elevate=True, + args=[ + "--passive", + "--wait", + "--norestart", + "--includeRecommended", + "--add", + "Microsoft.VisualStudio.Workload.VCTools", + # NOTE: We are using the custom built Clang++, + # so components below are not necessary anymore. + # '--add', + # 'Microsoft.VisualStudio.Component.VC.Llvm.Clang', + # '--add', + # 'Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Llvm.Clang', + # '--add', + # 'Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset', + ], + ) + warn("Please restart build.py after Visual Studio Build Tools is installed.") + sys.exit(1) diff --git a/.github/workflows/scripts/ti_build/tinysh.py b/.github/workflows/scripts/ti_build/tinysh.py index 516136de6b52e..9a7e126095bad 100644 --- a/.github/workflows/scripts/ti_build/tinysh.py +++ b/.github/workflows/scripts/ti_build/tinysh.py @@ -241,3 +241,5 @@ def nice(): bash = sh.bash start = sh.start.bake("/wait") apt = sh.sudo.apt +powershell = Command("powershell.exe") +pwsh = Command("pwsh.exe") diff --git a/taichi/rhi/CMakeLists.txt b/taichi/rhi/CMakeLists.txt index 0e26f5579f500..4b88b7d974c0d 100644 --- a/taichi/rhi/CMakeLists.txt +++ b/taichi/rhi/CMakeLists.txt @@ -113,3 +113,8 @@ target_link_libraries(${TAICHI_DEVICE_API} PUBLIC common_rhi) # Generate shared library add_library(ti_device_api_shared SHARED public_device.h) target_link_libraries(ti_device_api_shared PUBLIC ${TAICHI_DEVICE_API}) + +# When building targets on Windows using Ninja + MSVC, the linker requires at least 1 object file +# to work properly, else link.exe would complain about LNK4001 warning, and fail afterwards. +# Adding a dummy file to workaround this. 
+target_sources(ti_device_api_shared PRIVATE dummy.cpp) diff --git a/taichi/rhi/dummy.cpp b/taichi/rhi/dummy.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d From f86663b9686a772fb8965880861a6269304464fb Mon Sep 17 00:00:00 2001 From: Proton Date: Thu, 11 May 2023 18:47:32 +0800 Subject: [PATCH 07/18] [build] Guard Windows LTO with flags ghstack-source-id: 045bd93247b0448fe6c29b20b968aa83c59fb050 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7989 --- CMakeLists.txt | 1 + cmake/TaichiCXXFlags.cmake | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 49311f433a7d4..b29755fbcad3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ option(USE_LLD "Use lld (from llvm) linker" OFF) option(USE_MOLD "Use mold (A Modern Linker)" OFF) option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) # wheel-tag: bt option(TI_GENERATE_PDB "Generate Program Database (PDB) files (will make compilation uncacheable)" OFF) +option(TI_WITH_LTO "Enable Link Time Optimization (LTO) (affects Windows + MSVC for now)" OFF) # wheel-tag: lto if(LINUX OR APPLE) if (NOT IOS) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index 9a704cf557bc5..58b9a5a3ded3d 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -25,10 +25,14 @@ if (WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -flto=thin") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -flto=thin") elseif (MSVC) - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /GL /Gy") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /GL /Gy") - set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /Gy") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /Gy") + if (TI_WITH_LTO) + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /GL") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /GL") + set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + endif() endif() endif() From 6edff3175e9cd852b18f5cd7e3e932cb79bd49fc Mon Sep 17 00:00:00 2001 From: Proton Date: Fri, 12 May 2023 20:50:28 +0800 Subject: [PATCH 08/18] [ci] Add dedicated build pipeline ghstack-source-id: 43522332b4d24e95fb3333727d5d49ac1a432b61 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7990 --- .github/actionlint.yaml | 19 ++ .github/workflows/build.yaml | 395 +++++++++++++++++++++++++++++++ .github/workflows/initiator.yaml | 74 ++++++ 3 files changed, 488 insertions(+) create mode 100644 .github/actionlint.yaml create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/initiator.yaml diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000000000..36fa23577f27d --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,19 @@ +self-hosted-runner: + labels: + - cn + - cuda + - OpenGL + - vulkan + - m1 + - driver470 + - driver510 + - benchmark + - release + - build + - sm70 + - sm86 + - amdgpu + - online + +# config-variables: +# - ENVIRONMENT_STAGE diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000000000..5da34e3b05b9b --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,395 @@ +name: Build Taichi +on: + 
workflow_call: + inputs: + build_id: + required: true + type: string + nightly: + required: true + type: boolean + python: + required: true + type: string + secrets: + BOT_MINIO_ACCESS_KEY: + required: true + BOT_MINIO_SECRET_KEY: + required: true + workflow_dispatch: + inputs: + build_id: + description: 'The build id. e.g.: 20230427-102544-abcdefab' + required: true + type: string + nightly: + description: 'Are we building nightly wheels?' + required: true + default: false + type: boolean + python: + description: 'JSON encoded python versions need building wheel. e.g.: ["3.7"]' + required: true + type: string + +concurrency: + group: build-${{ github.event.number || github.run_id }} + cancel-in-progress: true + +env: + TI_CI: "1" + TI_SKIP_VERSION_CHECK: 'ON' + CI_IMAGE_VERSION: '202304251731' + TI_USE_GIT_CACHE: ${{ vars.TI_USE_GIT_CACHE }} + NIGHTLY: ${{ inputs.nightly && 'nightly' || '' }} + +jobs: + show_environ: + name: Show Environment Variables + # Disable this workflow on forks + if: github.repository_owner == 'taichi-dev' + runs-on: [self-hosted, Linux] + steps: + - name: Environment Variables + run: env + - name: Github Object + run: | + cat <<'EOF' + ${{ toJson(github) }} + EOF + + build_cpu_mac: + name: Build macOS-x86 + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + include: + - _designated: '' + designated: designated + runs-on: + - self-hosted + - macos-10.15 + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=ON + -DTI_WITH_C_API:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: ./build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_linux: + name: Build Linux + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + cuda: ['', cuda] + llvm: ['', llvm] + gl: ['', gl] + vk: ['', vk] + python: ${{ fromJSON(inputs.python) }} + include: + - {cuda: cuda, llvm: llvm, gl: gl, vk: vk, designated: designated} + exclude: + - {llvm: '', cuda: cuda} + runs-on: [self-hosted, online, Linux, build] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=${{ matrix.cuda && 'ON' || 'OFF' }} + -DTI_WITH_LLVM:BOOL=${{ matrix.llvm && 'ON' || 'OFF' }} + -DTI_WITH_OPENGL:BOOL=${{ matrix.gl && 'ON' || 'OFF' }} + -DTI_WITH_VULKAN:BOOL=${{ matrix.vk && 'ON' || 'OFF' }} + -DTI_WITH_METAL:BOOL=OFF + -DTI_WITH_BACKTRACE:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build + run: | + . 
.github/workflows/scripts/common-utils.sh + ci-docker-run-gpu \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-cuda:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + env: + SHOULD_TAG_CONFIG: ${{ !matrix.designated && 'yes' || '' }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_manylinux2014: + name: Build manylinux2014 + timeout-minutes: 30 + runs-on: [self-hosted, online, Linux, build] + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + include: + - _designated: '' + designated: designated + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=OFF + -DTI_BUILD_TESTS:BOOL=ON + + steps: + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build + run: | + . .github/workflows/scripts/common-utils.sh + + ci-docker-run-gpu \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-manylinux2014-cuda:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_amdgpu_linux: + name: Build AMDGPU + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + designated: [''] + runs-on: [self-hosted, online, Linux, build] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=OFF + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_AMDGPU:BOOL=ON + + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build & Install + run: | + . 
.github/workflows/scripts/common-utils.sh + + ci-docker-run \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-amdgpu:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} --tag-local=amd + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_windows: + name: Build Windows + strategy: + fail-fast: false + matrix: + cuda: ['', cuda] + llvm: ['', llvm] + gl: ['', gl] + vk: ['', vk] + python: ${{ fromJSON(inputs.python) }} + include: + - {cuda: cuda, llvm: llvm, gl: gl, vk: vk, lto: lto, pdb: pdb, designated: designated} + exclude: + - {llvm: '', cuda: cuda} + runs-on: [self-hosted, online, Windows, build] + timeout-minutes: 30 + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=${{ matrix.cuda && 'ON' || 'OFF' }} + -DTI_WITH_LLVM:BOOL=${{ matrix.llvm && 'ON' || 'OFF' }} + -DTI_WITH_OPENGL:BOOL=${{ matrix.gl && 'ON' || 'OFF' }} + -DTI_WITH_VULKAN:BOOL=${{ matrix.vk && 'ON' || 'OFF' }} + -DTI_WITH_METAL:BOOL=OFF + -DTI_WITH_BACKTRACE:BOOL=ON + -DTI_WITH_DX11:BOOL=ON + -DTI_WITH_DX12:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_C_API:BOOL=ON + -DTI_WITH_LTO:BOOL=${{ matrix.lto && 'ON' || 'OFF' }} + -DTI_GENERATE_PDB:BOOL=${{ matrix.pdb && 'ON' || 'OFF' }} + steps: + - name: Workaround checkout Needed single revision issue + shell: pwsh + run: | + $ErrorActionPreference = 'SilentlyContinue' + git config --system core.longpaths true + git submodule foreach --recursive 'git rev-parse HEAD || rm -rf $PWD' + $LASTEXITCODE = 0 + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - uses: actions/setup-python@v4 + with: + # force a 3.7 is ok, build.py will handle actual python env + python-version: 3.7 + + - name: Build + shell: pwsh + run: | + $nightlyFlag = $null + if ($env:NIGHTLY) { $nightlyFlag = "--nightly" } + $tagFlag = $null + if ($env:SHOULD_TAG_CONFIG) { $tagFlag = "--tag-config" } + python build.py $nightlyFlag --python=${{ matrix.python }} $tagFlag + env: + SHOULD_TAG_CONFIG: ${{ !matrix.designated && 'yes' || '' }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + - name: Cleanup Git Cache Configs + shell: pwsh + if: always() + run: | + . 
.github/workflows/scripts/common-utils.ps1 + UnsetGitCachingProxy + exit 0 + + build_m1: + name: Build M1 + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + exclude: + - python: "3.7" + include: + - _designated: '' + designated: designated + defaults: + run: + shell: '/usr/bin/arch -arch arm64e /bin/bash --noprofile --norc -eo pipefail {0}' + runs-on: [self-hosted, online, m1] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_CUDA:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_C_API=ON + -DTI_WITH_STATIC_C_API=ON + PLATFORM: 'm1' + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: | + brew install molten-vk + ./build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_ios_capi: + name: Build iOS C-API Static Library + timeout-minutes: 30 + runs-on: [self-hosted, online, m1] + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: .github/workflows/scripts/build.py ios + + - name: Save Compiled Static Library + uses: actions/upload-artifact@v3 + with: + name: libtaichi_c_api.iOS.a + path: 'dist/C-API-iOS/*.a' + retention-days: 7 diff --git a/.github/workflows/initiator.yaml b/.github/workflows/initiator.yaml new file mode 100644 index 0000000000000..8cd5cfd4ddf8a --- /dev/null +++ b/.github/workflows/initiator.yaml @@ -0,0 +1,74 @@ +name: Taichi Workflow Initiator +on: + push: + branches: + - master + - rc-* + +concurrency: + group: ${{ github.event.number || github.run_id }} + cancel-in-progress: true + +env: + TI_CI: "1" + TI_SKIP_VERSION_CHECK: 'ON' + TI_LITE_TEST: ${{ github.event_name == 'pull_request' && ! contains(github.event.pull_request.labels.*.name, 'full-ci') && ! 
startsWith(github.base_ref, 'rc-') && '1' || '' }} + TI_TEST_OFFLINE_CACHE: ${{ github.event.schedule == '0 18 * * *' && '1' || '' }} + CI_IMAGE_VERSION: '202304251731' + TI_USE_GIT_CACHE: ${{ vars.TI_USE_GIT_CACHE }} + REDIS_HOST: 172.16.5.1 + +jobs: + show_environ: + name: Show Environment Variables + # Disable this workflow on forks + if: github.repository_owner == 'taichi-dev' + runs-on: [self-hosted, Linux] + steps: + - name: Environment Variables + run: env + - name: Github Object + run: | + cat <<'EOF' + ${{ toJson(github) }} + EOF + + preparation: + name: Preparation + runs-on: [self-hosted, Linux] + outputs: + date: ${{ steps.gather.outputs.date }} + short_sha: ${{ steps.gather.outputs.short_sha }} + build_id: ${{ steps.gather.outputs.build_id }} + steps: + - name: Gather Information + id: gather + run: | + DATE=$(date +'%Y%m%d-%H%M%S') + SHORT_SHA=$(echo $GITHUB_SHA | cut -c '1-10') + echo date=$DATE >> $GITHUB_OUTPUT + echo short_sha=$SHORT_SHA >> $GITHUB_OUTPUT + echo build_id=$DATE-$SHORT_SHA >> $GITHUB_OUTPUT + + build: + name: Build Artifacts + needs: [preparation] + uses: ./.github/workflows/build.yaml + with: + build_id: ${{ needs.preparation.outputs.build_id }} + nightly: false + python: '["3.9", "3.10", "3.11"]' + secrets: + BOT_MINIO_ACCESS_KEY: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + BOT_MINIO_SECRET_KEY: ${{ secrets.BOT_MINIO_SECRET_KEY }} + + save_build_metadata: + name: Save Build Metadata + runs-on: [self-hosted, Linux] + needs: [preparation, build] + steps: + - name: Saving Metadata + id: gather + run: >- + redis-cli -h $REDIS_HOST --raw + set "latest-build-id:$GITHUB_REPOSITORY:$GITHUB_REF" ${{ needs.preparation.outputs.build_id }} From 7f749506ccaf246c4652b6f6b9360fe31b93f736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A7=8B=E4=BA=91=E6=9C=AA=E4=BA=91?= Date: Sat, 13 May 2023 05:43:23 +0800 Subject: [PATCH 09/18] [Opengl] Fix: runtime caught error cannot be displayed in opengl (#7998) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Brief Summary This PR fixes the issue that caught errors weren't logged out in opengl device. ### 🤖 Generated by Copilot at 9738324 Refactor and improve error handling of `create_pipeline` function in `opengl_device.cpp`. Catch and log different exceptions and return corresponding error codes. 
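
In isolation, the pattern being applied here looks like the self-contained sketch below. It is illustrative only: `RhiResult` is reduced to the four variants named in the diff, and `fprintf` stands in for `RHI_LOG_ERROR`.

```cpp
#include <cstdio>
#include <new>
#include <stdexcept>

enum class RhiResult { success, error, invalid_usage, out_of_memory };

// Translate exception types thrown by a creation callback into error codes,
// logging the message instead of letting the exception propagate out of a
// noexcept RHI entry point.
template <typename F>
RhiResult guarded(F &&f) noexcept {
  try {
    f();
    return RhiResult::success;
  } catch (const std::bad_alloc &e) {
    std::fprintf(stderr, "%s\n", e.what());  // stand-in for RHI_LOG_ERROR
    return RhiResult::out_of_memory;
  } catch (const std::invalid_argument &e) {
    std::fprintf(stderr, "%s\n", e.what());
    return RhiResult::invalid_usage;
  } catch (const std::runtime_error &e) {
    std::fprintf(stderr, "%s\n", e.what());
    return RhiResult::error;
  }
}

int main() {
  RhiResult r = guarded([] { throw std::invalid_argument("bad GLSL source"); });
  return r == RhiResult::invalid_usage ? 0 : 1;
}
```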
### Walkthrough ### 🤖 Generated by Copilot at 9738324 * Improve error handling and reporting in `create_pipeline` function ([link](https://github.com/taichi-dev/taichi/pull/7998/files?diff=unified&w=0#diff-28721a9ee9ac35b296afebd149e19d760c079aac1be524a048e77c5fd8f51069L647-R658)) by catching and logging different types of exceptions and returning different error codes --- taichi/rhi/opengl/opengl_device.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/taichi/rhi/opengl/opengl_device.cpp b/taichi/rhi/opengl/opengl_device.cpp index e2e65e25f2a28..cae2caeb4d0ea 100644 --- a/taichi/rhi/opengl/opengl_device.cpp +++ b/taichi/rhi/opengl/opengl_device.cpp @@ -644,9 +644,18 @@ RhiResult GLDevice::create_pipeline(Pipeline **out_pipeline, PipelineCache *cache) noexcept { try { *out_pipeline = new GLPipeline(src, name); - } catch (std::bad_alloc &) { + } catch (std::bad_alloc &e) { *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); return RhiResult::out_of_memory; + } catch (std::invalid_argument &e) { + *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); + return RhiResult::invalid_usage; + } catch (std::runtime_error &e) { + *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); + return RhiResult::error; } return RhiResult::success; } From 6e9fe2e6b34c6e69afb169cc75461b0b9dde67bc Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Sat, 13 May 2023 18:14:12 +0800 Subject: [PATCH 10/18] [Lang] Fix error with irpass::check_out_of_bound() for TensorTyped ExternalPtrStmt (#7997) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: # ### Brief Summary ### 🤖 Generated by Copilot at fde5c3c Simplify and clean up the code for out-of-bound check for external tensors in `taichi/transforms/check_out_of_bound.cpp`. Remove the deprecated SOA layout branch and use consistent naming for dimensions and shapes. 
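
Concretely, the fixed check splits the indices of an access like `arr[i, j][k]`: the first `ndim` indices are bounded by the external (ndarray) shape, the rest by the flattened element shape. Below is a hypothetical Python helper mirroring the new logic in check_out_of_bound.cpp; element indices are assumed already linearized into a single slot, as `MergeExternalAndMatrixPtr` arranges:

```python
from math import prod

def in_bounds(indices, external_shape, element_shape):
    # First len(external_shape) indices: per-axis ndarray extents;
    # remaining indices: the flattened element extent (AOS layout only,
    # since SOA layout for ndarrays is deprecated).
    ndim = len(external_shape)
    flattened_element = prod(element_shape)  # prod(()) == 1 for scalars
    return all(
        0 <= idx < (external_shape[i] if i < ndim else flattened_element)
        for i, idx in enumerate(indices)
    )

# A (4, 5) ndarray of 2x2 matrices, as in the test below:
assert in_bounds([2, 3, 1], (4, 5), (2, 2))      # arr[2, 3], element slot 1
assert not in_bounds([2, 3, 4], (4, 5), (2, 2))  # element slot 4 >= 2 * 2
assert not in_bounds([4, 0, 0], (4, 5), (2, 2))  # external axis 0: 4 >= 4
```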
### Walkthrough ### 🤖 Generated by Copilot at fde5c3c * Simplify code and remove deprecated SOA layout branch for ndarray in `check_out_of_bound` transform ([link](https://github.com/taichi-dev/taichi/pull/7997/files?diff=unified&w=0#diff-2d1e902d25643016ff6e05dc05dfdc05d1615face8181014196fe26796c35e0fL62-R77)) --- taichi/ir/frontend_ir.cpp | 2 +- taichi/ir/ir_builder.cpp | 2 +- taichi/ir/statements.cpp | 3 +++ taichi/ir/statements.h | 7 +++++++ taichi/transforms/auto_diff.cpp | 16 +++++++++------- taichi/transforms/check_out_of_bound.cpp | 19 ++++++++++++------- taichi/transforms/scalarize.cpp | 6 +++--- taichi/transforms/vectorize_half2.cpp | 3 ++- tests/python/test_ndarray.py | 14 ++++++++++++++ 9 files changed, 52 insertions(+), 20 deletions(-) diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index ae597e7199ebd..8b152dcb1fa4f 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -656,7 +656,7 @@ Stmt *make_ndarray_access(Expression::FlattenContext *ctx, auto var_stmt = flatten_lvalue(var, ctx); auto expr = var.cast(); auto external_ptr_stmt = std::make_unique( - var_stmt, index_stmts, expr->dt.get_shape(), expr->element_dim, + var_stmt, index_stmts, expr->dim, expr->dt.get_shape(), expr->element_dim, expr->is_grad); if (expr->dim == indices.size()) { // Indexing into an scalar element diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp index 29df5319cf31e..27a2f70d4b4aa 100644 --- a/taichi/ir/ir_builder.cpp +++ b/taichi/ir/ir_builder.cpp @@ -441,7 +441,7 @@ ExternalPtrStmt *IRBuilder::create_external_ptr( const std::vector &indices, bool is_grad) { return insert(Stmt::make_typed( - ptr, indices, std::vector(), 0, is_grad)); + ptr, indices, indices.size(), std::vector(), 0, is_grad)); } AdStackAllocaStmt *IRBuilder::create_ad_stack(const DataType &dt, diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp index 8234ddce9b868..1666a614d8e89 100644 --- a/taichi/ir/statements.cpp +++ b/taichi/ir/statements.cpp @@ -36,6 +36,7 @@ ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, bool is_grad) : base_ptr(base_ptr), indices(indices), is_grad(is_grad) { + ndim = indices.size(); TI_ASSERT(base_ptr != nullptr); TI_ASSERT(base_ptr->is()); TI_STMT_REG_FIELDS; @@ -43,12 +44,14 @@ ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, + int ndim, const std::vector &element_shape, int element_dim, bool is_grad) : ExternalPtrStmt(base_ptr, indices, is_grad) { this->element_shape = element_shape; this->element_dim = element_dim; + this->ndim = ndim; } GlobalPtrStmt::GlobalPtrStmt(SNode *snode, diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 04fe2175f1838..f3bfd718207e9 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -333,7 +333,13 @@ class AtomicOpStmt : public Stmt, class ExternalPtrStmt : public Stmt { public: Stmt *base_ptr; + std::vector indices; + + // Number of dimensions of external shape + int ndim; + + // Shape of element type std::vector element_shape; // AOS: element_dim < 0 // SOA: element_dim > 0 @@ -352,6 +358,7 @@ class ExternalPtrStmt : public Stmt { ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, + int ndim, const std::vector &element_shape, int element_dim, bool is_grad = false); diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp index 67a4b08eb29e3..d15c90fb1f7fa 100644 --- a/taichi/transforms/auto_diff.cpp +++ 
b/taichi/transforms/auto_diff.cpp @@ -1518,9 +1518,10 @@ class MakeAdjoint : public ADTransform { "Cannot automatically differentiate through a grad " "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); - auto adj_ptr = insert( - src->base_ptr, src->indices, src->element_shape, src->element_dim, - /*is_grad=*/true); + auto adj_ptr = + insert(src->base_ptr, src->indices, src->ndim, + src->element_shape, src->element_dim, + /*is_grad=*/true); adj_ptr->ret_type = src->ret_type; if (is_ptr_offset) { @@ -1592,9 +1593,10 @@ class MakeAdjoint : public ADTransform { "Cannot automatically differentiate through a grad " "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); - adjoint_ptr = insert( - dest->base_ptr, dest->indices, dest->element_shape, dest->element_dim, - /*is_grad=*/true); + adjoint_ptr = + insert(dest->base_ptr, dest->indices, dest->ndim, + dest->element_shape, dest->element_dim, + /*is_grad=*/true); adjoint_ptr->ret_type = dest->ret_type; if (is_ptr_offset) { @@ -1659,7 +1661,7 @@ class MakeAdjoint : public ADTransform { "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); auto adjoint_ptr = - insert(dest->base_ptr, dest->indices, + insert(dest->base_ptr, dest->indices, dest->ndim, dest->element_shape, dest->element_dim, /*is_grad=*/true); adjoint_ptr->ret_type = dest->ret_type; diff --git a/taichi/transforms/check_out_of_bound.cpp b/taichi/transforms/check_out_of_bound.cpp index 4bd5dd51e51af..79acfff34a754 100644 --- a/taichi/transforms/check_out_of_bound.cpp +++ b/taichi/transforms/check_out_of_bound.cpp @@ -59,17 +59,22 @@ class CheckOutOfBound : public BasicStmtVisitor { auto check_lower_bound = new_stmts.push_back( BinaryOpType::cmp_ge, stmt->indices[i], lower_bound); Stmt *upper_bound{nullptr}; - // TODO: Simplify logic here since SOA layout for ndarray is deprecated - if ((stmt->element_dim < 0 && i == (stmt->indices.size() - 1)) || - (stmt->element_dim > 0 && i == 0)) { - upper_bound = - new_stmts.push_back(TypedConstant(flattened_element)); - } else { - auto axis = stmt->element_dim <= 0 ? i : (i - stmt->element_dim); + + // SOA layout for ndarray is deprecated, assert it's AOS layout + TI_ASSERT(stmt->element_dim <= 0); + auto ndim = stmt->ndim; + if (i < ndim) { + // Check for External Shape + auto axis = i; upper_bound = new_stmts.push_back( /*axis=*/axis, /*arg_id=*/stmt->base_ptr->as()->arg_id); + } else { + // Check for Element Shape + upper_bound = + new_stmts.push_back(TypedConstant(flattened_element)); } + auto check_upper_bound = new_stmts.push_back( BinaryOpType::cmp_lt, stmt->indices[i], upper_bound); auto check_i = new_stmts.push_back( diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp index 69073000b5329..25253f29b00d8 100644 --- a/taichi/transforms/scalarize.cpp +++ b/taichi/transforms/scalarize.cpp @@ -1119,9 +1119,9 @@ class MergeExternalAndMatrixPtr : public BasicStmtVisitor { std::accumulate(begin(origin->element_shape), end(origin->element_shape), 1, std::multiplies<>())}; - auto fused = std::make_unique(origin->base_ptr, indices, - element_shape, element_dim, - origin->is_grad); + auto fused = std::make_unique( + origin->base_ptr, indices, origin->ndim, element_shape, element_dim, + origin->is_grad); fused->ret_type = stmt->ret_type; // Note: Update base_ptr's ret_type so that it matches the ExternalPtrStmt // with flattened indices. 
Main goal is to keep all the hacks in a single diff --git a/taichi/transforms/vectorize_half2.cpp b/taichi/transforms/vectorize_half2.cpp index 3a9358937e1a6..d159092357b66 100644 --- a/taichi/transforms/vectorize_half2.cpp +++ b/taichi/transforms/vectorize_half2.cpp @@ -355,7 +355,8 @@ class Half2Vectorize : public BasicStmtVisitor { std::vector element_shape = {2}; int element_dim = -1; auto new_extern_stmt = std::make_unique( - self_ptr, new_indices, element_shape, element_dim); + self_ptr, new_indices, self_extern_stmt->ndim, element_shape, + element_dim); new_extern_stmt->overrided_dtype = true; new_extern_stmt->ret_type = tensor_type; new_extern_stmt->ret_type.set_is_pointer(true); diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index 476b9df320222..8f04ed2ad6ba8 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -6,6 +6,7 @@ from taichi.lang.exception import TaichiIndexError, TaichiTypeError from taichi.lang.misc import get_host_arch_list from taichi.lang.util import has_pytorch +from taichi.math import vec3, ivec3 import taichi as ti from tests import test_utils @@ -779,10 +780,23 @@ def test_matrix_ndarray_oob(): def access_arr(input: ti.types.ndarray(), p: ti.i32, q: ti.i32, x: ti.i32, y: ti.i32) -> ti.f32: return input[p, q][x, y] + @ti.kernel + def valid_access(indices: ti.types.ndarray(dtype=ivec3, ndim=1), dummy: ti.types.ndarray(dtype=ivec3, ndim=1)): + for i in indices: + index_vec = ti.Vector([0, 0, 0]) + for j in ti.static(range(3)): + index = indices[i][j] + index_vec[j] = index + dummy[i] = index_vec + input = ti.ndarray(dtype=ti.math.mat2, shape=(4, 5)) + indices = ti.ndarray(dtype=ivec3, shape=(10)) + dummy = ti.ndarray(dtype=ivec3, shape=(10)) + # Works access_arr(input, 2, 3, 0, 1) + valid_access(indices, dummy) # element_shape with pytest.raises(AssertionError, match=r"Out of bound access"): From 407d0768b4984d065a86f6baf915b962e6dd96b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A7=8B=E4=BA=91=E6=9C=AA=E4=BA=91?= Date: Mon, 15 May 2023 13:19:43 +0800 Subject: [PATCH 11/18] [Misc] Make clang-tidy happy on 'explicit' (#7999) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Brief Summary This PR tries to resolve the problem during ` Build and Test / Check Static Analyzer (pull_request) `. This check takes too much time and prints out thousands of lines of 'explicit errors' like this: ``` error: single-argument constructors must be marked explicit to avoid unintentional implicit conversions [google-explicit-constructor,-warnings-as-errors] one_or_more(Container &&value) : var(std::move(value)) { ^ explicit ``` This PR also resolves `error: copy constructor should not be declared explicit` on `GLResourceSet`. ### Walkthrough ### 🤖 Generated by Copilot at cbf3891 * Suppress clang-tidy warning for single-argument constructors of `one_or_more` class by adding `// NOLINTNEXTLINE` comments ([link](https://github.com/taichi-dev/taichi/pull/7999/files?diff=unified&w=0#diff-6230f8e8c6a8a297f8900dd0f7e212097b07ee7b64fc7fa5a5ffee5af47211a8L14-R34), ). This class is defined in `taichi/common/one_or_more.h` and allows holding either a single value or a container of values. 
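
For context, the reason these constructors must stay implicit (and hence be NOLINT'd rather than marked `explicit`) can be seen in a stripped-down sketch; `fetch` is a hypothetical consumer, and the template parameters are simplified relative to the real `one_or_more`:

```cpp
#include <string>
#include <utility>
#include <variant>
#include <vector>

template <typename T, typename Container = std::vector<T>>
struct one_or_more {
  std::variant<T, Container> var;
  // NOLINTNEXTLINE  (implicit by design; see the call sites below)
  one_or_more(T const &value) : var(value) {}
  // NOLINTNEXTLINE
  one_or_more(Container &&value) : var(std::move(value)) {}
};

void fetch(one_or_more<std::string> urls) { /* ... */ }

int main() {
  fetch(std::string("https://example.com/a.zip"));    // a single value...
  fetch(std::vector<std::string>{"a.zip", "b.zip"});  // ...or a container
}
```

Marking either constructor `explicit`, as the check suggests, would reject both calls, which is exactly the conversion `one_or_more` exists to provide; hence the suppression.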
--------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/common/one_or_more.h | 6 ++++++ taichi/rhi/amdgpu/amdgpu_context.h | 2 +- taichi/rhi/llvm/device_memory_pool.h | 2 +- taichi/rhi/opengl/opengl_device.h | 2 +- taichi/rhi/vulkan/vulkan_device.h | 2 +- taichi/transforms/make_cpu_multithreaded_range_for.cpp | 3 ++- 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/taichi/common/one_or_more.h b/taichi/common/one_or_more.h index 472f0629b8ec8..bb7ad75a14d33 100644 --- a/taichi/common/one_or_more.h +++ b/taichi/common/one_or_more.h @@ -11,21 +11,27 @@ struct one_or_more { std::variant var; + // NOLINTNEXTLINE one_or_more(value_type const &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(value_type &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(value_type &&value) : var(std::move(value)) { } + // NOLINTNEXTLINE one_or_more(Container const &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(Container &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(Container &&value) : var(std::move(value)) { } diff --git a/taichi/rhi/amdgpu/amdgpu_context.h b/taichi/rhi/amdgpu/amdgpu_context.h index affef71aa2ddd..6688eb91a5667 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.h +++ b/taichi/rhi/amdgpu/amdgpu_context.h @@ -94,7 +94,7 @@ class AMDGPUContext { void *new_ctx_; public: - ContextGuard(AMDGPUContext *new_ctx) + explicit ContextGuard(AMDGPUContext *new_ctx) : old_ctx_(nullptr), new_ctx_(new_ctx) { AMDGPUDriver::get_instance().context_get_current(&old_ctx_); if (old_ctx_ != new_ctx) diff --git a/taichi/rhi/llvm/device_memory_pool.h b/taichi/rhi/llvm/device_memory_pool.h index 0ccb5ae77f338..f5081defb2c57 100644 --- a/taichi/rhi/llvm/device_memory_pool.h +++ b/taichi/rhi/llvm/device_memory_pool.h @@ -24,7 +24,7 @@ class TI_DLL_EXPORT DeviceMemoryPool { void *allocate(std::size_t size, std::size_t alignment, bool managed = false); void release(std::size_t size, void *ptr, bool release_raw = false); void reset(); - DeviceMemoryPool(bool merge_upon_release); + explicit DeviceMemoryPool(bool merge_upon_release); ~DeviceMemoryPool(); protected: diff --git a/taichi/rhi/opengl/opengl_device.h b/taichi/rhi/opengl/opengl_device.h index 514ec98b13426..3c5008cee2199 100644 --- a/taichi/rhi/opengl/opengl_device.h +++ b/taichi/rhi/opengl/opengl_device.h @@ -29,7 +29,7 @@ extern void *kGetOpenglProcAddr; class GLResourceSet : public ShaderResourceSet { public: GLResourceSet() = default; - explicit GLResourceSet(const GLResourceSet &other) = default; + GLResourceSet(const GLResourceSet &other) = default; ~GLResourceSet() override; diff --git a/taichi/rhi/vulkan/vulkan_device.h b/taichi/rhi/vulkan/vulkan_device.h index 1e7a4fd8027e1..71b33ecdf6b6e 100644 --- a/taichi/rhi/vulkan/vulkan_device.h +++ b/taichi/rhi/vulkan/vulkan_device.h @@ -239,7 +239,7 @@ class VulkanResourceSet : public ShaderResourceSet { class VulkanRasterResources : public RasterResources { public: - VulkanRasterResources(VulkanDevice *device) : device_(device) { + explicit VulkanRasterResources(VulkanDevice *device) : device_(device) { } struct BufferBinding { diff --git a/taichi/transforms/make_cpu_multithreaded_range_for.cpp b/taichi/transforms/make_cpu_multithreaded_range_for.cpp index 8ada2a9c70c88..8528b0ba1685a 100644 --- a/taichi/transforms/make_cpu_multithreaded_range_for.cpp +++ b/taichi/transforms/make_cpu_multithreaded_range_for.cpp @@ -48,7 +48,8 @@ using TaskType = OffloadedStmt::TaskType; class MakeCPUMultithreadedRangeFor : public 
BasicStmtVisitor { public: - MakeCPUMultithreadedRangeFor(const CompileConfig &config) : config(config) { + explicit MakeCPUMultithreadedRangeFor(const CompileConfig &config) + : config(config) { } void visit(Block *block) override { From 54e952e3f15c144caec985e8989a99850fdfafeb Mon Sep 17 00:00:00 2001 From: lin-hitonami Date: Thu, 11 May 2023 17:47:02 +0800 Subject: [PATCH 12/18] [refactor] Let the type of reference arguments be a pointer ghstack-source-id: 2306463f766056172c4175cf4b3675de592a1f2b Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7982 --- taichi/ir/frontend_ir.cpp | 33 +++++++++++++++++++++----------- taichi/ir/frontend_ir.h | 2 ++ taichi/transforms/scalarize.cpp | 4 ++++ taichi/transforms/type_check.cpp | 7 +------ 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 8b152dcb1fa4f..641c9077a2576 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -140,6 +140,9 @@ FrontendWhileStmt::FrontendWhileStmt(const FrontendWhileStmt &o) void ArgLoadExpression::type_check(const CompileConfig *) { ret_type = dt; + if (is_ptr) { + ret_type = TypeFactory::get_instance().get_pointer_type(ret_type, false); + } if (!create_load) { ret_type = TypeFactory::get_instance().get_pointer_type(ret_type, false); } @@ -960,7 +963,7 @@ void AtomicOpExpression::type_check(const CompileConfig *config) { }; // Broadcast val to dest if neccessary - auto val_dtype = val->ret_type; + auto val_dtype = get_rvalue_dtype(val); auto dest_dtype = dest->ret_type.ptr_removed(); if (dest_dtype->is() && val_dtype->is()) { error(); @@ -973,20 +976,18 @@ void AtomicOpExpression::type_check(const CompileConfig *config) { } // Validate dtype - auto dtype = val->ret_type; - if (dtype->is()) { - dtype = dtype.get_element_type(); + if (val_dtype->is()) { + val_dtype = val_dtype.get_element_type(); } - if (!dtype->is()) { + if (!val_dtype->is()) { error(); } - if (is_quant(dest->ret_type)) { - ret_type = dest->ret_type->get_compute_type(); - } else if (dest->ret_type->is() || - dest->ret_type->is()) { - ret_type = dest->ret_type; + if (is_quant(dest_dtype)) { + ret_type = dest_dtype->get_compute_type(); + } else if (dest_dtype->is() || dest_dtype->is()) { + ret_type = dest_dtype; } else { error(); } @@ -1271,7 +1272,7 @@ void MeshIndexConversionExpression::flatten(FlattenContext *ctx) { } void ReferenceExpression::type_check(const CompileConfig *) { - ret_type = var->ret_type; + ret_type = TypeFactory::get_instance().get_pointer_type(var->ret_type); } void ReferenceExpression::flatten(FlattenContext *ctx) { @@ -1796,4 +1797,14 @@ Stmt *flatten_rvalue(Expr ptr, Expression::FlattenContext *ctx) { return ptr_stmt; } +DataType get_rvalue_dtype(Expr expr) { + if (auto argload = expr.cast()) { + if (argload->is_ptr) { + return argload->ret_type.ptr_removed(); + } + return argload->ret_type; + } + return expr->ret_type; +} + } // namespace taichi::lang diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index d0b278a3de499..346e2140f9f02 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -1108,4 +1108,6 @@ Stmt *flatten_lvalue(Expr expr, Expression::FlattenContext *ctx); Stmt *flatten_rvalue(Expr expr, Expression::FlattenContext *ctx); +DataType get_rvalue_dtype(Expr expr); + } // namespace taichi::lang diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp index 25253f29b00d8..ba426be5baa2c 100644 --- a/taichi/transforms/scalarize.cpp +++ 
b/taichi/transforms/scalarize.cpp @@ -608,10 +608,14 @@ class Scalarize : public BasicStmtVisitor { } void visit(ArgLoadStmt *stmt) override { + if (!stmt->ret_type.is_pointer()) { + return; + } if (stmt->ret_type.ptr_removed()->is()) { return; } auto ret_type = stmt->ret_type.ptr_removed().get_element_type(); + ret_type = TypeFactory::get_instance().get_pointer_type(ret_type); auto arg_load = std::make_unique( stmt->arg_id, ret_type, stmt->is_ptr, stmt->create_load); diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index 8d59faf8cc30e..e979f11d3b5e5 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -431,12 +431,7 @@ class TypeCheck : public IRVisitor { } void visit(ArgLoadStmt *stmt) override { - // TODO: Maybe have a type_inference() pass, which takes in the args/rets - // defined by the kernel. After that, type_check() pass will purely do - // verification, without modifying any types. - if (stmt->is_ptr) { - stmt->ret_type.set_is_pointer(true); - } + // Do nothing } void visit(ReturnStmt *stmt) override { From d77c798b06e30691ef8ef67f0c6152248c448235 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 15 May 2023 13:41:25 +0800 Subject: [PATCH 13/18] [Lang] Migrate irpass::scalarize() after irpass::demote_no_access_mesh_fors() (#7956) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: # ### Brief Summary ### 🤖 Generated by Copilot at 8f62992 This pull request improves the performance and memory efficiency of matrix operations on global and external variables in Taichi kernels. It enhances several IR passes, such as `scalarize`, `die`, `DemoteAtomics`, and `cache_loop_invariant_global_vars`, to handle matrix operations better on different backends. It also fixes a bug in the `ExtractLocalPointers` pass that caused a crash on offloaded tasks. 
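For intuition, the access pattern this reordering targets looks roughly like the sketch below (a hypothetical kernel for illustration only; the field and kernel names are not from this PR). If `scalarize()` ran first, the loop-invariant matrix load would already be split into per-element global loads, and the caching pass could no longer hoist it as a single matrix access:

```python
import taichi as ti

ti.init(arch=ti.cuda)

m = ti.Matrix.field(3, 3, ti.f32, shape=())

@ti.kernel
def transform(v: ti.types.ndarray(dtype=ti.math.vec3, ndim=1)):
    for i in range(v.shape[0]):
        # m[None] is loop-invariant: with scalarize() running after
        # cache_loop_invariant_global_vars(), the whole matrix is cached
        # into a local variable once and then scalarized, instead of being
        # expanded into nine separate global loads on every iteration.
        v[i] = m[None] @ v[i]
```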
### Walkthrough ### 🤖 Generated by Copilot at 8f62992 * Extend `CacheLoopInvariantGlobalVars` pass to support matrix operations on global and external variables ([link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-b811d49ff1b631b511a9e64d42aa77d96c85d8a1d55e2088e7dd5b4a1c3c6a2fL57-R70), [link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-b811d49ff1b631b511a9e64d42aa77d96c85d8a1d55e2088e7dd5b4a1c3c6a2fL72-R99)) * Move scalarization of matrix operations to `offload_to_executable` function, after caching some global variables as local variables ([link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-8fde186587db97b3bbc8a856e59bc4467b30257335b0fad064b4eebd521a912bL138-L145), [link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-8fde186587db97b3bbc8a856e59bc4467b30257335b0fad064b4eebd521a912bR205-R212)) * Extend `DemoteAtomics` pass to support matrix operations on global and external variables ([link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-5fe31716eccdda9061aa4d74f5ce21a276137f7e545014ed8d1ff09a1bfdee14L42-R56), [link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-5fe31716eccdda9061aa4d74f5ce21a276137f7e545014ed8d1ff09a1bfdee14L69-R102)) * Fix a bug in `ExtractLocalPointers` pass that caused a crash when applied to an offloaded task ([link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-97b0d9ab204b703802b3b5d04d036d30f66b34b726128216faf0d8a2a8564528L1052-R1057)) * Add empty lines for readability in `compile_to_offloads.cpp` and `demote_atomics.cpp` ([link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-8fde186587db97b3bbc8a856e59bc4467b30257335b0fad064b4eebd521a912bR185), [link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-5fe31716eccdda9061aa4d74f5ce21a276137f7e545014ed8d1ff09a1bfdee14R117), [link](https://github.com/taichi-dev/taichi/pull/7956/files?diff=unified&w=0#diff-5fe31716eccdda9061aa4d74f5ce21a276137f7e545014ed8d1ff09a1bfdee14R125)) --- .../cache_loop_invariant_global_vars.cpp | 32 +++++++++++++++++-- taichi/transforms/compile_to_offloads.cpp | 16 +++++----- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/taichi/transforms/cache_loop_invariant_global_vars.cpp b/taichi/transforms/cache_loop_invariant_global_vars.cpp index 3966bd4d9d181..3ba7cc886d3af 100644 --- a/taichi/transforms/cache_loop_invariant_global_vars.cpp +++ b/taichi/transforms/cache_loop_invariant_global_vars.cpp @@ -54,7 +54,20 @@ class CacheLoopInvariantGlobalVars : public LoopInvariantDetector { if (current_offloaded->task_type == OffloadedTaskType::serial) { return true; } - if (auto global_ptr = stmt->cast()) { + + // Handle GlobalPtrStmt + bool is_global_ptr_stmt = false; + GlobalPtrStmt *global_ptr = nullptr; + if (stmt->is()) { + is_global_ptr_stmt = true; + global_ptr = stmt->as(); + } else if (stmt->is() && + stmt->as()->origin->is()) { + is_global_ptr_stmt = true; + global_ptr = stmt->as()->origin->as(); + } + + if (global_ptr) { auto snode = global_ptr->snode; if (loop_unique_ptr_[snode] == nullptr || loop_unique_ptr_[snode]->indices.empty()) { @@ -69,8 +82,21 @@ class CacheLoopInvariantGlobalVars : public LoopInvariantDetector { return false; } return true; - } else if (stmt->is()) { - ExternalPtrStmt *dest_ptr = stmt->as(); + } + + // Handle 
ExternalPtrStmt + bool is_external_ptr_stmt = false; + ExternalPtrStmt *dest_ptr = nullptr; + if (stmt->is()) { + is_external_ptr_stmt = true; + dest_ptr = stmt->as(); + } else if (stmt->is() && + stmt->as()->origin->is()) { + is_external_ptr_stmt = true; + dest_ptr = stmt->as()->origin->as(); + } + + if (is_external_ptr_stmt) { if (dest_ptr->indices.empty()) { return false; } diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 14fb925dd8414..f162f5975274f 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -183,14 +183,6 @@ void offload_to_executable(IRNode *ir, print("Atomics demoted I"); irpass::analysis::verify(ir); - if (config.real_matrix_scalarize) { - irpass::scalarize(ir); - - // Remove redundant MatrixInitStmt inserted during scalarization - irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); - print("Scalarized"); - } - if (config.cache_loop_invariant_global_vars) { irpass::cache_loop_invariant_global_vars(ir, config); print("Cache loop-invariant global vars"); @@ -218,6 +210,14 @@ void offload_to_executable(IRNode *ir, irpass::analysis::verify(ir); } + if (config.real_matrix_scalarize) { + irpass::scalarize(ir); + + // Remove redundant MatrixInitStmt inserted during scalarization + irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); + print("Scalarized"); + } + if (make_thread_local) { irpass::make_thread_local(ir, config); print("Make thread local"); From bf7998bc53eeb2a5e6fe1b942c8f40105ba2f321 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Tue, 16 May 2023 10:24:57 +0800 Subject: [PATCH 14/18] [aot] Export aot kernels with decorator properly (#8016) --- python/taichi/aot/_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/taichi/aot/_export.py b/python/taichi/aot/_export.py index 0af32c5fcb938..37889a70fed91 100644 --- a/python/taichi/aot/_export.py +++ b/python/taichi/aot/_export.py @@ -23,4 +23,4 @@ def inner(f): def export(f): - export_as(f.__name__)(f) + return export_as(f.__name__)(f) From 1b84a2e8cc6f1f07012c1044a001466eb087a009 Mon Sep 17 00:00:00 2001 From: listerily Date: Fri, 12 May 2023 18:05:50 +0800 Subject: [PATCH 15/18] [lang] Added ti.u1 definition ghstack-source-id: ac52abfd5e136811d4a7f3b86d5b36362ed94f4f Pull Request resolved: https://github.com/taichi-dev/taichi/pull/7995 --- python/taichi/lang/util.py | 15 +++++++- python/taichi/types/primitive_types.py | 16 ++++++++- taichi/codegen/llvm/codegen_llvm.cpp | 3 ++ taichi/codegen/spirv/spirv_codegen.cpp | 3 ++ taichi/codegen/spirv/spirv_ir_builder.cpp | 10 ++++-- taichi/codegen/spirv/spirv_types.cpp | 12 ++++++- taichi/common/core.h | 2 ++ taichi/common/types.h | 2 ++ taichi/inc/data_type_with_c_type.inc.h | 3 +- taichi/ir/expr.cpp | 4 +++ taichi/ir/expr.h | 2 ++ taichi/ir/frontend_ir.h | 9 ++--- taichi/ir/type.cpp | 11 ++++++ taichi/ir/type.h | 7 ++++ taichi/ir/type_utils.cpp | 5 ++- taichi/ir/type_utils.h | 7 ++++ taichi/program/kernel.cpp | 2 ++ taichi/program/launch_context_builder.cpp | 2 ++ taichi/python/export_lang.cpp | 3 ++ taichi/runtime/gfx/runtime.cpp | 1 + .../runtime/llvm/runtime_module/runtime.cpp | 2 ++ tests/python/test_api.py | 2 ++ tests/python/test_pow.py | 36 ++++++++++--------- 23 files changed, 130 insertions(+), 29 deletions(-) diff --git a/python/taichi/lang/util.py b/python/taichi/lang/util.py index b4cb76b4a06a9..e604de64962ca 100644 --- a/python/taichi/lang/util.py +++ 
b/python/taichi/lang/util.py @@ -16,6 +16,7 @@ i16, i32, i64, + u1, u8, u16, u32, @@ -119,6 +120,8 @@ def to_numpy_type(dt): return np.int8 if dt == i16: return np.int16 + if dt == u1: + return np.bool_ if dt == u8: return np.uint8 if dt == u16: @@ -157,6 +160,8 @@ def to_pytorch_type(dt): return torch.int8 if dt == i16: return torch.int16 + if dt == u1: + return torch.bool if dt == u8: return torch.uint8 if dt == f16: @@ -190,6 +195,8 @@ def to_paddle_type(dt): return paddle.int8 if dt == i16: return paddle.int16 + if dt == u1: + return paddle.bool if dt == u8: return paddle.uint8 if dt == f16: @@ -224,6 +231,8 @@ def to_taichi_type(dt): return i8 if dt == np.int16: return i16 + if dt == np.bool_: + return u1 if dt == np.uint8: return u8 if dt == np.uint16: @@ -251,6 +260,8 @@ def to_taichi_type(dt): return i8 if dt == torch.int16: return i16 + if dt == torch.bool: + return u1 if dt == torch.uint8: return u8 if dt == torch.float16: @@ -273,6 +284,8 @@ def to_taichi_type(dt): return i8 if dt == paddle.int16: return i16 + if dt == paddle.bool: + return u1 if dt == paddle.uint8: return u8 if dt == paddle.float16: @@ -293,7 +306,7 @@ def cook_dtype(dtype): if dtype is int: return impl.get_runtime().default_ip if dtype is bool: - return i32 # TODO[Xiaoyan]: Use i1 in the future + return i32 # TODO(zhantong): Replace it with u1 raise ValueError(f"Invalid data type {dtype}") diff --git a/python/taichi/types/primitive_types.py b/python/taichi/types/primitive_types.py index d631067396086..aad85d6df51da 100644 --- a/python/taichi/types/primitive_types.py +++ b/python/taichi/types/primitive_types.py @@ -99,6 +99,18 @@ # ---------------------------------------- +uint1 = ti_python_core.DataType_u1 +"""1-bit unsigned integer data type. Same as booleans. +""" + +# ---------------------------------------- + +u1 = uint1 +"""Alias for :const:`~taichi.types.primitive_types.uint1` +""" + +# ---------------------------------------- + u8 = uint8 """Alias for :const:`~taichi.types.primitive_types.uint8` """ @@ -154,7 +166,7 @@ def ref(tp): real_types = [f16, f32, f64, float] real_type_ids = [id(t) for t in real_types] -integer_types = [i8, i16, i32, i64, u8, u16, u32, u64, int, bool] +integer_types = [i8, i16, i32, i64, u1, u8, u16, u32, u64, int, bool] integer_type_ids = [id(t) for t in integer_types] all_types = real_types + integer_types @@ -175,6 +187,8 @@ def ref(tp): "i32", "int64", "i64", + "uint1", + "u1", "uint8", "u8", "uint16", diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp index 94a8dc459217b..2fefbeb247892 100644 --- a/taichi/codegen/llvm/codegen_llvm.cpp +++ b/taichi/codegen/llvm/codegen_llvm.cpp @@ -1015,6 +1015,9 @@ void TaskCodeGenLLVM::visit(ConstStmt *stmt) { } else if (val.dt->is_primitive(PrimitiveTypeID::f64)) { llvm_val[stmt] = llvm::ConstantFP::get(*llvm_context, llvm::APFloat(val.val_float64())); + } else if (val.dt->is_primitive(PrimitiveTypeID::u1)) { + llvm_val[stmt] = llvm::ConstantInt::get( + *llvm_context, llvm::APInt(1, (uint64)val.val_uint1(), false)); } else if (val.dt->is_primitive(PrimitiveTypeID::i8)) { llvm_val[stmt] = llvm::ConstantInt::get( *llvm_context, llvm::APInt(8, (uint64)val.val_int8(), true)); diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 26c3cffd41447..38348e4c1d727 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -246,6 +246,9 @@ class TaskCodegen : public IRVisitor { } else if (dt->is_primitive(PrimitiveTypeID::i16)) 
{ return ir_->int_immediate_number( stype, static_cast(const_val.val_i16), false); + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return ir_->uint_immediate_number( + stype, static_cast(const_val.val_u1), false); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return ir_->uint_immediate_number( stype, static_cast(const_val.val_u8), false); diff --git a/taichi/codegen/spirv/spirv_ir_builder.cpp b/taichi/codegen/spirv/spirv_ir_builder.cpp index 2b48e2a3edbd4..65f5cf159ffc1 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.cpp +++ b/taichi/codegen/spirv/spirv_ir_builder.cpp @@ -377,6 +377,8 @@ SType IRBuilder::get_primitive_uint_type(const DataType &dt) const { } else if (dt == PrimitiveType::i16 || dt == PrimitiveType::u16 || dt == PrimitiveType::f16) { return t_uint16_; + } else if (dt == PrimitiveType::u1) { + return t_bool_; } else { return t_uint8_; } @@ -392,6 +394,8 @@ DataType IRBuilder::get_taichi_uint_type(const DataType &dt) const { } else if (dt == PrimitiveType::i16 || dt == PrimitiveType::u16 || dt == PrimitiveType::f16) { return PrimitiveType::u16; + } else if (dt == PrimitiveType::u1) { + return PrimitiveType::u1; } else { return PrimitiveType::u8; } @@ -1090,10 +1094,10 @@ DEFINE_BUILDER_CMP_OP(ge, GreaterThanEqual); Value IRBuilder::_OpName(Value a, Value b) { \ TI_ASSERT(a.stype.id == b.stype.id); \ const auto &bool_type = t_bool_; /* TODO: Only scalar supported now */ \ - if (is_integral(a.stype.dt)) { \ - return make_value(spv::OpI##_Op, bool_type, a, b); \ - } else if (a.stype.id == bool_type.id) { \ + if (a.stype.id == bool_type.id) { \ return make_value(spv::OpLogical##_Op, bool_type, a, b); \ + } else if (is_integral(a.stype.dt)) { \ + return make_value(spv::OpI##_Op, bool_type, a, b); \ } else { \ TI_ASSERT(is_real(a.stype.dt)); \ return make_value(spv::OpFOrd##_Op, bool_type, a, b); \ diff --git a/taichi/codegen/spirv/spirv_types.cpp b/taichi/codegen/spirv/spirv_types.cpp index c47e26824a4f9..c6bff148c36e5 100644 --- a/taichi/codegen/spirv/spirv_types.cpp +++ b/taichi/codegen/spirv/spirv_types.cpp @@ -179,6 +179,14 @@ const tinyir::Type *translate_ti_primitive(tinyir::Block &ir_module, } else if (t == PrimitiveType::i64) { return ir_module.emplace_back(/*num_bits=*/64, /*is_signed=*/true); + } else if (t == PrimitiveType::u1) { + // SPIR-V has no full support for boolean types; using boolean types in + // the backend may cause issues. These issues arise when we use booleans + // as return types, argument types, or the inner dtype of compound types. + // Since boolean types have the same width as int32 in GLSL, we use int32 + // instead. 
+ return ir_module.emplace_back(/*num_bits=*/32, + /*is_signed=*/false); } else if (t == PrimitiveType::u8) { return ir_module.emplace_back(/*num_bits=*/8, /*is_signed=*/false); @@ -395,7 +403,9 @@ class Translate2Spirv : public TypeVisitor { vt = spir_builder_->i64_type(); } } else { - if (type->num_bits() == 8) { + if (type->num_bits() == 1) { + vt = spir_builder_->bool_type(); + } else if (type->num_bits() == 8) { vt = spir_builder_->u8_type(); } else if (type->num_bits() == 16) { vt = spir_builder_->u16_type(); diff --git a/taichi/common/core.h b/taichi/common/core.h index 15c601c56a60b..f112f8f35baad 100644 --- a/taichi/common/core.h +++ b/taichi/common/core.h @@ -133,6 +133,8 @@ class CoreState { // Types //****************************************************************************** +using uint1 = bool; + using uchar = unsigned char; using int8 = int8_t; diff --git a/taichi/common/types.h b/taichi/common/types.h index d71d75e6eff1a..b4728a6daa22d 100644 --- a/taichi/common/types.h +++ b/taichi/common/types.h @@ -4,6 +4,8 @@ namespace taichi { +using uint1 = bool; + using uchar = unsigned char; using int8 = int8_t; diff --git a/taichi/inc/data_type_with_c_type.inc.h b/taichi/inc/data_type_with_c_type.inc.h index 2b12f83cbbd98..5d4963ae90b5d 100644 --- a/taichi/inc/data_type_with_c_type.inc.h +++ b/taichi/inc/data_type_with_c_type.inc.h @@ -1,10 +1,11 @@ -// Doesn't contain f16 and u1. +// Doesn't contain f16. PER_C_TYPE(f32, float32) PER_C_TYPE(f64, float64) PER_C_TYPE(i8, int8) PER_C_TYPE(i16, int16) PER_C_TYPE(i32, int32) PER_C_TYPE(i64, int64) +PER_C_TYPE(u1, uint1) PER_C_TYPE(u8, uint8) PER_C_TYPE(u16, uint16) PER_C_TYPE(u32, uint32) diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp index f8ef27bbdfc18..0c395dddcda48 100644 --- a/taichi/ir/expr.cpp +++ b/taichi/ir/expr.cpp @@ -49,6 +49,10 @@ void Expr::set_adjoint_checkbit(const Expr &o) { this->cast()->adjoint_checkbit.set(o); } +Expr::Expr(uint1 x) : Expr() { + expr = std::make_shared(PrimitiveType::u1, x); +} + Expr::Expr(int16 x) : Expr() { expr = std::make_shared(PrimitiveType::i16, x); } diff --git a/taichi/ir/expr.h b/taichi/ir/expr.h index 9b59dc036ae47..571d7d136f4fc 100644 --- a/taichi/ir/expr.h +++ b/taichi/ir/expr.h @@ -23,6 +23,8 @@ class Expr { atomic = false; } + explicit Expr(uint1 x); + explicit Expr(int16 x); explicit Expr(int32 x); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 346e2140f9f02..1c4a6bb3448fa 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -102,10 +102,11 @@ class FrontendSNodeOpStmt : public Stmt { ExprGroup indices; Expr val; - FrontendSNodeOpStmt(SNodeOpType op_type, - SNode *snode, - const ExprGroup &indices, - const Expr &val = Expr(nullptr)); + FrontendSNodeOpStmt( + SNodeOpType op_type, + SNode *snode, + const ExprGroup &indices, + const Expr &val = Expr(std::shared_ptr(nullptr))); TI_DEFINE_ACCEPT TI_DEFINE_CLONE_FOR_FRONTEND_IR diff --git a/taichi/ir/type.cpp b/taichi/ir/type.cpp index eac8111ed3b7a..fb076e2448716 100644 --- a/taichi/ir/type.cpp +++ b/taichi/ir/type.cpp @@ -359,6 +359,8 @@ std::string TypedConstant::stringify() const { return fmt::format("{}", val_i8); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { return fmt::format("{}", val_i16); + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return fmt::format("{}", val_u1); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return fmt::format("{}", val_u8); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { @@ -391,6 +393,8 @@ bool 
TypedConstant::equal_type_and_value(const TypedConstant &o) const { return val_i8 == o.val_i8; } else if (dt->is_primitive(PrimitiveTypeID::i16)) { return val_i16 == o.val_i16; + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return val_u1 == o.val_u1; } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return val_u8 == o.val_u8; } else if (dt->is_primitive(PrimitiveTypeID::u16)) { @@ -440,6 +444,11 @@ int16 &TypedConstant::val_int16() { return val_i16; } +uint1 &TypedConstant::val_uint1() { + TI_ASSERT(get_data_type() == dt); + return val_u1; +} + uint8 &TypedConstant::val_uint8() { TI_ASSERT(get_data_type() == dt); return val_u8; @@ -483,6 +492,8 @@ uint64 TypedConstant::val_uint() const { return val_u64; } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return val_u8; + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return val_u1; } else if (dt->is_primitive(PrimitiveTypeID::u16)) { return val_u16; } else { diff --git a/taichi/ir/type.h b/taichi/ir/type.h index e7adb08008190..f398bc36e8d3f 100644 --- a/taichi/ir/type.h +++ b/taichi/ir/type.h @@ -529,6 +529,7 @@ class TypedConstant { float64 val_f64; int8 val_i8; int16 val_i16; + uint1 val_u1; uint8 val_u8; uint16 val_u16; uint32 val_u32; @@ -564,6 +565,9 @@ class TypedConstant { explicit TypedConstant(int16 x) : dt(PrimitiveType::i16), val_i16(x) { } + explicit TypedConstant(uint1 x) : dt(PrimitiveType::u1), val_u1(x) { + } + explicit TypedConstant(uint8 x) : dt(PrimitiveType::u8), val_u8(x) { } @@ -594,6 +598,8 @@ class TypedConstant { val_i8 = value; } else if (dt->is_primitive(PrimitiveTypeID::i16)) { val_i16 = value; + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + val_u1 = value; } else if (dt->is_primitive(PrimitiveTypeID::u8)) { val_u8 = value; } else if (dt->is_primitive(PrimitiveTypeID::u16)) { @@ -627,6 +633,7 @@ class TypedConstant { float64 &val_float64(); int8 &val_int8(); int16 &val_int16(); + uint1 &val_uint1(); uint8 &val_uint8(); uint16 &val_uint16(); uint32 &val_uint32(); diff --git a/taichi/ir/type_utils.cpp b/taichi/ir/type_utils.cpp index 2fdb816d518d4..2ad7ceacd2151 100644 --- a/taichi/ir/type_utils.cpp +++ b/taichi/ir/type_utils.cpp @@ -57,6 +57,7 @@ int data_type_size(DataType t) { REGISTER_DATA_TYPE(i16, int16); REGISTER_DATA_TYPE(i32, int32); REGISTER_DATA_TYPE(i64, int64); + REGISTER_DATA_TYPE(u1, uint1); REGISTER_DATA_TYPE(u8, uint8); REGISTER_DATA_TYPE(u16, uint16); REGISTER_DATA_TYPE(u32, uint32); @@ -99,7 +100,9 @@ std::string tensor_type_format(DataType t, Arch arch) { } std::string data_type_format(DataType dt, Arch arch) { - if (dt->is_primitive(PrimitiveTypeID::i8)) { + if (dt->is_primitive(PrimitiveTypeID::u1)) { + return "%d"; + } else if (dt->is_primitive(PrimitiveTypeID::i8)) { // i8/u8 is converted to i16/u16 before printing, because CUDA doesn't // support the "%hhd"/"%hhu" specifiers. 
return "%hd"; diff --git a/taichi/ir/type_utils.h b/taichi/ir/type_utils.h index b644447094410..bb8c687170d6c 100644 --- a/taichi/ir/type_utils.h +++ b/taichi/ir/type_utils.h @@ -38,6 +38,8 @@ inline DataType get_data_type() { return PrimitiveType::i32; } else if (std::is_same()) { return PrimitiveType::i64; + } else if (std::is_same()) { + return PrimitiveType::u1; } else if (std::is_same()) { return PrimitiveType::u8; } else if (std::is_same()) { @@ -101,6 +103,7 @@ inline bool is_integral(DataType dt) { dt->is_primitive(PrimitiveTypeID::i16) || dt->is_primitive(PrimitiveTypeID::i32) || dt->is_primitive(PrimitiveTypeID::i64) || + dt->is_primitive(PrimitiveTypeID::u1) || dt->is_primitive(PrimitiveTypeID::u8) || dt->is_primitive(PrimitiveTypeID::u16) || dt->is_primitive(PrimitiveTypeID::u32) || @@ -146,6 +149,8 @@ inline TypedConstant get_max_value(DataType dt) { return {dt, std::numeric_limits::max()}; } else if (dt->is_primitive(PrimitiveTypeID::i64)) { return {dt, std::numeric_limits::max()}; + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return {dt, std::numeric_limits::max()}; } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return {dt, std::numeric_limits::max()}; } else if (dt->is_primitive(PrimitiveTypeID::u16)) { @@ -172,6 +177,8 @@ inline TypedConstant get_min_value(DataType dt) { return {dt, std::numeric_limits::lowest()}; } else if (dt->is_primitive(PrimitiveTypeID::i64)) { return {dt, std::numeric_limits::lowest()}; + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return {dt, std::numeric_limits::lowest()}; } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return {dt, std::numeric_limits::lowest()}; } else if (dt->is_primitive(PrimitiveTypeID::u16)) { diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 7c1a39c246804..869f824a55ad4 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -73,6 +73,8 @@ T Kernel::fetch_ret(DataType dt, int i) { return (T)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { return (T)program->fetch_result(i); + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + return (T)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { return (T)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { diff --git a/taichi/program/launch_context_builder.cpp b/taichi/program/launch_context_builder.cpp index 1458ae81d6203..cc9c14df5a729 100644 --- a/taichi/program/launch_context_builder.cpp +++ b/taichi/program/launch_context_builder.cpp @@ -104,6 +104,8 @@ void LaunchContextBuilder::set_arg_int(int arg_id, int64 d) { set_arg(arg_id, (int8)d); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { set_arg(arg_id, (int16)d); + } else if (dt->is_primitive(PrimitiveTypeID::u1)) { + set_arg(arg_id, (uint1)d); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { set_arg(arg_id, (uint8)d); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 425e2ad69af73..9e4d11ac12c47 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -943,6 +943,9 @@ void export_lang(py::module &m) { m.def("make_rand_expr", Expr::make); + m.def("make_const_expr_bool", + Expr::make); + m.def("make_const_expr_int", Expr::make); diff --git a/taichi/runtime/gfx/runtime.cpp b/taichi/runtime/gfx/runtime.cpp index c49a912226a36..be9b89eeb9e15 100644 --- a/taichi/runtime/gfx/runtime.cpp +++ b/taichi/runtime/gfx/runtime.cpp @@ -173,6 +173,7 @@ class 
HostDeviceContextBlitter { for (int j = 0; j < num; ++j) { // (penguinliong) Again, it's the module loader's responsibility to // check the data type availability. + TO_HOST(u1, uint1, j) TO_HOST(i8, int8, j) TO_HOST(u8, uint8, j) TO_HOST(i16, int16, j) diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp index e13fe2de455f1..3a71575ae4243 100644 --- a/taichi/runtime/llvm/runtime_module/runtime.cpp +++ b/taichi/runtime/llvm/runtime_module/runtime.cpp @@ -85,6 +85,7 @@ using int8 = int8_t; using int16 = int16_t; using int32 = int32_t; using int64 = int64_t; +using uint1 = bool; using uint8 = uint8_t; using uint16 = uint16_t; using uint32 = uint32_t; @@ -96,6 +97,7 @@ using i8 = int8; using i16 = int16; using i32 = int32; using i64 = int64; +using u1 = uint1; using u8 = uint8; using u16 = uint16; using u32 = uint32; diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 2710a27ddebc9..7e8bddaf5614a 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -224,11 +224,13 @@ def _get_expected_matrix_apis(): "template", "tools", "types", + "u1", "u16", "u32", "u64", "u8", "ui", + "uint1", "uint16", "uint32", "uint64", diff --git a/tests/python/test_pow.py b/tests/python/test_pow.py index 2679520134310..86fd59575827c 100644 --- a/tests/python/test_pow.py +++ b/tests/python/test_pow.py @@ -62,23 +62,25 @@ def foo(x: dt, y: ti.template()): foo(10, -10) -@test_utils.test( - debug=True, - advanced_optimization=False, - exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], -) -def test_ipow_negative_exp_i32(): - _ipow_negative_exp(ti.i32) - - -@test_utils.test( - debug=True, - advanced_optimization=False, - require=ti.extension.data64, - exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], -) -def test_ipow_negative_exp_i64(): - _ipow_negative_exp(ti.i64) +# FIXME(zhantong): Uncomment this test after bool assertion is finished. +# @test_utils.test( +# debug=True, +# advanced_optimization=False, +# exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], +# ) +# def test_ipow_negative_exp_i32(): +# _ipow_negative_exp(ti.i32) + + +# FIXME(zhantong): Uncomment this test after bool assertion is finished. +# @test_utils.test( +# debug=True, +# advanced_optimization=False, +# require=ti.extension.data64, +# exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], +# ) +# def test_ipow_negative_exp_i64(): +# _ipow_negative_exp(ti.i64) def _test_pow_int_base_int_exp(dt_base, dt_exp): From c48c6e95759783c7699843c61a4fd3af18e411eb Mon Sep 17 00:00:00 2001 From: listerily Date: Mon, 15 May 2023 17:44:17 +0800 Subject: [PATCH 16/18] [ir] Update codegen for `if` `while` `assert` to support type u1. 
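At the Python level, the constructs this touches are ordinary kernel `if`/`while`/`assert` statements; a minimal sketch of the surface being exercised (illustrative only, not a test from this PR):

```python
import taichi as ti

ti.init(arch=ti.cpu, debug=True)

@ti.kernel
def count_even(n: ti.i32) -> ti.i32:
    total = 0
    i = 0
    while i < n:        # `while` conditions are now checked via CreateIsNull
        if i % 2 == 0:  # `if` conditions are now checked via CreateIsNotNull
            total += 1
        i += 1
    assert total <= n   # `assert` now passes a u1 test to taichi_assert
    return total
```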
ghstack-source-id: e83b6e73f1f9ea90e913bd4859d9d962d6782a6c Pull Request resolved: https://github.com/taichi-dev/taichi/pull/8003 --- taichi/codegen/cuda/codegen_cuda.cpp | 4 +++ taichi/codegen/llvm/codegen_llvm.cpp | 35 +++++++++++------- taichi/codegen/spirv/spirv_codegen.cpp | 10 +++--- taichi/runtime/llvm/llvm_context.cpp | 6 +++- .../runtime/llvm/runtime_module/runtime.cpp | 12 +++---- tests/python/test_pow.py | 36 +++++++++---------- 6 files changed, 60 insertions(+), 43 deletions(-) diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp index bacad80ee53d8..46bbf331e4c5d 100644 --- a/taichi/codegen/cuda/codegen_cuda.cpp +++ b/taichi/codegen/cuda/codegen_cuda.cpp @@ -94,6 +94,10 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM { value_type = tlctx->get_data_type(PrimitiveType::u16); value = builder->CreateZExt(value, value_type); } + if (dt->is_primitive(PrimitiveTypeID::u1)) { + value_type = tlctx->get_data_type(PrimitiveType::i32); + value = builder->CreateZExt(value, value_type); + } return std::make_tuple(value, value_type); } diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp index 2fefbeb247892..bb249983483c4 100644 --- a/taichi/codegen/llvm/codegen_llvm.cpp +++ b/taichi/codegen/llvm/codegen_llvm.cpp @@ -197,7 +197,6 @@ void TaskCodeGenLLVM::emit_extra_unary(UnaryOpStmt *stmt) { UNARY_STD(tan) UNARY_STD(tanh) UNARY_STD(sgn) - UNARY_STD(logic_not) UNARY_STD(acos) UNARY_STD(asin) UNARY_STD(cos) @@ -524,6 +523,11 @@ void TaskCodeGenLLVM::visit(UnaryOpStmt *stmt) { } else { llvm_val[stmt] = builder->CreateNeg(input, "neg"); } + } else if (op == UnaryOpType::logic_not) { + llvm_val[stmt] = builder->CreateIsNull(input); + // TODO: (zhantong) remove this zero ext + llvm_val[stmt] = builder->CreateZExt( + llvm_val[stmt], tlctx->get_data_type(PrimitiveType::i32)); } UNARY_INTRINSIC(round) UNARY_INTRINSIC(floor) @@ -618,6 +622,12 @@ void TaskCodeGenLLVM::visit(BinaryOpStmt *stmt) { } else if (op == BinaryOpType::mod) { llvm_val[stmt] = builder->CreateSRem(llvm_val[stmt->lhs], llvm_val[stmt->rhs]); + } else if (op == BinaryOpType::logical_and) { + llvm_val[stmt] = + builder->CreateAnd(llvm_val[stmt->lhs], llvm_val[stmt->rhs]); + } else if (op == BinaryOpType::logical_or) { + llvm_val[stmt] = + builder->CreateOr(llvm_val[stmt->lhs], llvm_val[stmt->rhs]); } else if (op == BinaryOpType::bit_and) { llvm_val[stmt] = builder->CreateAnd(llvm_val[stmt->lhs], llvm_val[stmt->rhs]); @@ -851,10 +861,9 @@ void TaskCodeGenLLVM::visit(BinaryOpStmt *stmt) { void TaskCodeGenLLVM::visit(TernaryOpStmt *stmt) { TI_ASSERT(stmt->op_type == TernaryOpType::select); - llvm_val[stmt] = builder->CreateSelect( - builder->CreateTrunc(llvm_val[stmt->op1], - tlctx->get_data_type(PrimitiveType::u1)), - llvm_val[stmt->op2], llvm_val[stmt->op3]); + llvm_val[stmt] = + builder->CreateSelect(builder->CreateIsNotNull(llvm_val[stmt->op1]), + llvm_val[stmt->op2], llvm_val[stmt->op3]); } void TaskCodeGenLLVM::visit(IfStmt *if_stmt) { @@ -865,9 +874,8 @@ void TaskCodeGenLLVM::visit(IfStmt *if_stmt) { llvm::BasicBlock::Create(*llvm_context, "false_block", func); llvm::BasicBlock *after_if = llvm::BasicBlock::Create(*llvm_context, "after_if", func); - builder->CreateCondBr( - builder->CreateICmpNE(llvm_val[if_stmt->cond], tlctx->get_constant(0)), - true_block, false_block); + llvm::Value *cond = builder->CreateIsNotNull(llvm_val[if_stmt->cond]); + builder->CreateCondBr(cond, true_block, false_block); builder->SetInsertPoint(true_block); if 
(if_stmt->true_statements) { if_stmt->true_statements->accept(this); @@ -959,6 +967,9 @@ void TaskCodeGenLLVM::visit(PrintStmt *stmt) { if (dtype->is_primitive(PrimitiveTypeID::u8)) return builder->CreateZExt(to_print, tlctx->get_data_type(PrimitiveType::u16)); + if (dtype->is_primitive(PrimitiveTypeID::u1)) + return builder->CreateZExt(to_print, + tlctx->get_data_type(PrimitiveType::i32)); return to_print; }; for (auto i = 0; i < stmt->contents.size(); ++i) { @@ -1054,8 +1065,7 @@ void TaskCodeGenLLVM::visit(WhileControlStmt *stmt) { BasicBlock *after_break = BasicBlock::Create(*llvm_context, "after_break", func); TI_ASSERT(current_while_after_loop); - auto cond = - builder->CreateICmpEQ(llvm_val[stmt->cond], tlctx->get_constant(0)); + auto *cond = builder->CreateIsNull(llvm_val[stmt->cond]); builder->CreateCondBr(cond, current_while_after_loop, after_break); builder->SetInsertPoint(after_break); } @@ -1309,7 +1319,7 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) { std::vector args; args.emplace_back(get_runtime()); - args.emplace_back(llvm_val[stmt->cond]); + args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond])); args.emplace_back(builder->CreateGlobalStringPtr(stmt->text)); for (int i = 0; i < stmt->args.size(); i++) { @@ -2220,8 +2230,7 @@ void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt) { // test whether the current voxel is active or not auto is_active = call(leaf_block, element.get("element"), "is_active", {builder->CreateLoad(loop_index_ty, loop_index)}); - is_active = - builder->CreateTrunc(is_active, llvm::Type::getInt1Ty(*llvm_context)); + is_active = builder->CreateIsNotNull(is_active); exec_cond = builder->CreateAnd(exec_cond, is_active); } diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 38348e4c1d727..f03e0fe0feddb 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -1652,9 +1652,10 @@ class TaskCodegen : public IRVisitor { } void visit(IfStmt *if_stmt) override { - spirv::Value cond_v = ir_->query_value(if_stmt->cond->raw_name()); + spirv::Value cond_v = ir_->cast( + ir_->bool_type(), ir_->query_value(if_stmt->cond->raw_name())); spirv::Value cond = - ir_->ne(cond_v, ir_->cast(cond_v.stype, ir_->const_i32_zero_)); + ir_->ne(cond_v, ir_->cast(ir_->bool_type(), ir_->const_i32_zero_)); spirv::Label then_label = ir_->new_label(); spirv::Label merge_label = ir_->new_label(); spirv::Label else_label = ir_->new_label(); @@ -1776,9 +1777,10 @@ class TaskCodegen : public IRVisitor { } void visit(WhileControlStmt *stmt) override { - spirv::Value cond_v = ir_->query_value(stmt->cond->raw_name()); + spirv::Value cond_v = + ir_->cast(ir_->bool_type(), ir_->query_value(stmt->cond->raw_name())); spirv::Value cond = - ir_->eq(cond_v, ir_->cast(cond_v.stype, ir_->const_i32_zero_)); + ir_->eq(cond_v, ir_->cast(ir_->bool_type(), ir_->const_i32_zero_)); spirv::Label then_label = ir_->new_label(); spirv::Label merge_label = ir_->new_label(); diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index cef39f793b7e4..3156e0c72ccb6 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -692,6 +692,9 @@ llvm::Value *TaichiLLVMContext::get_constant(DataType dt, T t) { return llvm::ConstantFP::get(llvm::Type::getHalfTy(*ctx), (float32)t); } else if (dt->is_primitive(PrimitiveTypeID::f64)) { return llvm::ConstantFP::get(*ctx, llvm::APFloat((float64)t)); + } else if 
(dt->is_primitive(PrimitiveTypeID::u1)) { + return t ? llvm::ConstantInt::getTrue(*ctx) + : llvm::ConstantInt::getFalse(*ctx); } else if (is_integral(dt)) { if (is_signed(dt)) { return llvm::ConstantInt::get( @@ -721,7 +724,8 @@ llvm::Value *TaichiLLVMContext::get_constant(T t) { std::is_same_v) { return llvm::ConstantFP::get(*ctx, llvm::APFloat(t)); } else if (std::is_same_v) { - return llvm::ConstantInt::get(*ctx, llvm::APInt(1, (uint64)t, true)); + return t ? llvm::ConstantInt::getTrue(*ctx) + : llvm::ConstantInt::getFalse(*ctx); } else if (std::is_same_v || std::is_same_v) { return llvm::ConstantInt::get(*ctx, llvm::APInt(32, (uint64)t, true)); diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp index 3a71575ae4243..460beb145e7e7 100644 --- a/taichi/runtime/llvm/runtime_module/runtime.cpp +++ b/taichi/runtime/llvm/runtime_module/runtime.cpp @@ -332,9 +332,9 @@ struct LLVMRuntime; constexpr bool enable_assert = true; -void taichi_assert(RuntimeContext *context, i32 test, const char *msg); -void taichi_assert_runtime(LLVMRuntime *runtime, i32 test, const char *msg); -#define TI_ASSERT_INFO(x, msg) taichi_assert(context, (int)(x), msg) +void taichi_assert(RuntimeContext *context, u1 test, const char *msg); +void taichi_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg); +#define TI_ASSERT_INFO(x, msg) taichi_assert(context, (u1)(x), msg) #define TI_ASSERT(x) TI_ASSERT_INFO(x, #x) void ___stubs___() { @@ -753,12 +753,12 @@ RUNTIME_STRUCT_FIELD(ListManager, num_elements); RUNTIME_STRUCT_FIELD(ListManager, max_num_elements_per_chunk); RUNTIME_STRUCT_FIELD(ListManager, element_size); -void taichi_assert(RuntimeContext *context, i32 test, const char *msg) { +void taichi_assert(RuntimeContext *context, u1 test, const char *msg) { taichi_assert_runtime(context->runtime, test, msg); } void taichi_assert_format(LLVMRuntime *runtime, - i32 test, + u1 test, const char *format, int num_arguments, uint64 *arguments) { @@ -808,7 +808,7 @@ void taichi_assert_format(LLVMRuntime *runtime, #endif } -void taichi_assert_runtime(LLVMRuntime *runtime, i32 test, const char *msg) { +void taichi_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) { taichi_assert_format(runtime, test, msg, 0, nullptr); } diff --git a/tests/python/test_pow.py b/tests/python/test_pow.py index 86fd59575827c..2679520134310 100644 --- a/tests/python/test_pow.py +++ b/tests/python/test_pow.py @@ -62,25 +62,23 @@ def foo(x: dt, y: ti.template()): foo(10, -10) -# FIXME(zhantong): Uncomment this test after bool assertion is finished. -# @test_utils.test( -# debug=True, -# advanced_optimization=False, -# exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], -# ) -# def test_ipow_negative_exp_i32(): -# _ipow_negative_exp(ti.i32) - - -# FIXME(zhantong): Uncomment this test after bool assertion is finished. 
-# @test_utils.test( -# debug=True, -# advanced_optimization=False, -# require=ti.extension.data64, -# exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], -# ) -# def test_ipow_negative_exp_i64(): -# _ipow_negative_exp(ti.i64) +@test_utils.test( + debug=True, + advanced_optimization=False, + exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], +) +def test_ipow_negative_exp_i32(): + _ipow_negative_exp(ti.i32) + + +@test_utils.test( + debug=True, + advanced_optimization=False, + require=ti.extension.data64, + exclude=[ti.vulkan, ti.metal, ti.opengl, ti.gles], +) +def test_ipow_negative_exp_i64(): + _ipow_negative_exp(ti.i64) def _test_pow_int_base_int_exp(dt_base, dt_exp): From 25cc2c8539917062cef852364fa329e515175b00 Mon Sep 17 00:00:00 2001 From: listerily Date: Mon, 15 May 2023 17:44:18 +0800 Subject: [PATCH 17/18] [llvm] Simplify and add support for type u1 in logical not operation ghstack-source-id: 61e101a0e14c73d6027982c47f521bf3a8363fdf Pull Request resolved: https://github.com/taichi-dev/taichi/pull/8005 --- taichi/codegen/amdgpu/codegen_amdgpu.cpp | 8 +------- taichi/codegen/cuda/codegen_cuda.cpp | 6 ------ taichi/runtime/llvm/runtime_module/runtime.cpp | 4 ---- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp index 9d4f4ac10b39f..5b2431ef54eb5 100644 --- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp +++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp @@ -65,13 +65,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { TI_NOT_IMPLEMENTED \ } \ } - if (op == UnaryOpType::logic_not) { - if (input_taichi_type->is_primitive(PrimitiveTypeID::i32)) { - llvm_val[stmt] = call("logic_not_i32", input); - } else { - TI_NOT_IMPLEMENTED - } - } else if (op == UnaryOpType::abs) { + if (op == UnaryOpType::abs) { if (input_taichi_type->is_primitive(PrimitiveTypeID::f16)) { llvm_val[stmt] = call("__ocml_fabs_f16", input); } else if (input_taichi_type->is_primitive(PrimitiveTypeID::f32)) { diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp index 46bbf331e4c5d..f0ebd49aafbfd 100644 --- a/taichi/codegen/cuda/codegen_cuda.cpp +++ b/taichi/codegen/cuda/codegen_cuda.cpp @@ -254,12 +254,6 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM { } else { TI_NOT_IMPLEMENTED } - } else if (op == UnaryOpType::logic_not) { - if (input_taichi_type->is_primitive(PrimitiveTypeID::i32)) { - llvm_val[stmt] = call("logic_not_i32", input); - } else { - TI_NOT_IMPLEMENTED - } } else if (op == UnaryOpType::frexp) { auto stype = tlctx->get_data_type(stmt->ret_type.ptr_removed()); auto res = builder->CreateAlloca(stype); diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp index 460beb145e7e7..53547922a2e85 100644 --- a/taichi/runtime/llvm/runtime_module/runtime.cpp +++ b/taichi/runtime/llvm/runtime_module/runtime.cpp @@ -229,10 +229,6 @@ i64 max_i64(i64 a, i64 b) { return a > b ? 
a : b; } -int32 logic_not_i32(int32 a) { - return !a; -} - float32 sgn_f32(float32 a) { float32 b; if (a > 0) From da39780a879d179de967e1fb9d42fb026d43f042 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 16 May 2023 11:40:18 +0800 Subject: [PATCH 18/18] [Lang] Support allocation with CUDA memory pool and reduce preallocation size accordingly (#7929) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: # ### Brief Summary ### 🤖 Generated by Copilot at c85f600 This pull request enhances the memory management of Taichi on CUDA devices by using `malloc_async` and `mem_free_async` functions and adjusting the preallocation size based on the memory pool feature. This affects the files `cuda_device.cpp` and `llvm_runtime_executor.cpp`. ### Walkthrough ### 🤖 Generated by Copilot at c85f600 * Add support for CUDA memory pool feature to improve performance and memory usage on CUDA devices - Check if CUDA context supports memory pool and use `malloc_async` and `mem_free_async` functions for allocation and deallocation without caching ([link](https://github.com/taichi-dev/taichi/pull/7929/files?diff=unified&w=0#diff-7919f5d7e33aafc72f27ed93febc58a0ac77c220ae718bf78ef134dad3790654L55-R61), [link](https://github.com/taichi-dev/taichi/pull/7929/files?diff=unified&w=0#diff-7919f5d7e33aafc72f27ed93febc58a0ac77c220ae718bf78ef134dad3790654L103-R113)) - Reduce preallocation size by half for runtime objects and sparse data structures on CUDA devices if memory pool is supported ([link](https://github.com/taichi-dev/taichi/pull/7929/files?diff=unified&w=0#diff-b9155792159f392bd8bacd44cb1819be5239b022d707499fc364c0f93dd8c5e5R574-R579)) --- c_api/src/taichi_llvm_impl.cpp | 8 +- taichi/rhi/amdgpu/amdgpu_device.cpp | 2 +- taichi/rhi/cuda/cuda_device.cpp | 15 +- taichi/rhi/cuda/cuda_device.h | 1 + taichi/rhi/llvm/llvm_device.h | 1 + taichi/runtime/llvm/llvm_runtime_executor.cpp | 134 ++++++++++++------ taichi/runtime/llvm/llvm_runtime_executor.h | 14 +- 7 files changed, 115 insertions(+), 60 deletions(-) diff --git a/c_api/src/taichi_llvm_impl.cpp b/c_api/src/taichi_llvm_impl.cpp index a8b2fb8d8c6d2..7e89211031a34 100644 --- a/c_api/src/taichi_llvm_impl.cpp +++ b/c_api/src/taichi_llvm_impl.cpp @@ -54,11 +54,11 @@ TiMemory LlvmRuntime::allocate_memory( const taichi::lang::Device::AllocParams &params) { taichi::lang::LLVMRuntime *llvm_runtime = executor_->get_llvm_runtime(); taichi::lang::LlvmDevice *llvm_device = executor_->llvm_device(); - taichi::lang::DeviceAllocation devalloc = - llvm_device->allocate_memory_runtime({params, - executor_->get_runtime_jit_module(), - llvm_runtime, result_buffer}); + llvm_device->allocate_memory_runtime( + {params, executor_->get_runtime_jit_module(), llvm_runtime, + result_buffer, executor_->use_device_memory_pool()}); + return devalloc2devmem(*this, devalloc); } diff --git a/taichi/rhi/amdgpu/amdgpu_device.cpp b/taichi/rhi/amdgpu/amdgpu_device.cpp index ee86b03665dc3..ac786a4cb0ffb 100644 --- a/taichi/rhi/amdgpu/amdgpu_device.cpp +++ b/taichi/rhi/amdgpu/amdgpu_device.cpp @@ -106,8 +106,8 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { false); } else if (!info.use_preallocated) { DeviceMemoryPool::get_instance().release(info.size, info.ptr); - info.ptr = nullptr; } + info.ptr = nullptr; } RhiResult AmdgpuDevice::map(DeviceAllocation alloc, void **mapped_ptr) { diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp index fe5d296071672..25d24ac8e8e5a 100644 --- 
a/taichi/rhi/cuda/cuda_device.cpp +++ b/taichi/rhi/cuda/cuda_device.cpp @@ -53,17 +53,17 @@ DeviceAllocation CudaDevice::allocate_memory_runtime( info.size = taichi::iroundup(params.size, taichi_page_size); if (info.size == 0) { info.ptr = nullptr; + } else if (params.use_memory_pool) { + CUDADriver::get_instance().malloc_async((void **)&info.ptr, info.size, + nullptr); } else { info.ptr = DeviceMemoryPool::get_instance().allocate_with_cache(this, params); - - TI_ASSERT(info.ptr != nullptr); - - CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size); } info.is_imported = false; info.use_cached = true; info.use_preallocated = true; + info.use_memory_pool = params.use_memory_pool; DeviceAllocation alloc; alloc.alloc_id = allocations_.size(); @@ -92,6 +92,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) { validate_device_alloc(handle); AllocInfo &info = allocations_[handle.alloc_id]; + if (info.size == 0) { return; } @@ -99,14 +100,16 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) { TI_ERROR("the DeviceAllocation is already deallocated"); } TI_ASSERT(!info.is_imported); - if (info.use_cached) { + if (info.use_memory_pool) { + CUDADriver::get_instance().mem_free_async(info.ptr, nullptr); + } else if (info.use_cached) { DeviceMemoryPool::get_instance().release(info.size, (uint64_t *)info.ptr, false); } else if (!info.use_preallocated) { auto &mem_pool = DeviceMemoryPool::get_instance(); mem_pool.release(info.size, info.ptr, true /*release_raw*/); - info.ptr = nullptr; } + info.ptr = nullptr; } RhiResult CudaDevice::upload_data(DevicePtr *device_ptr, diff --git a/taichi/rhi/cuda/cuda_device.h b/taichi/rhi/cuda/cuda_device.h index 0e06174552331..3a94209084a13 100644 --- a/taichi/rhi/cuda/cuda_device.h +++ b/taichi/rhi/cuda/cuda_device.h @@ -77,6 +77,7 @@ class CudaDevice : public LlvmDevice { * */ bool use_preallocated{true}; bool use_cached{false}; + bool use_memory_pool{false}; void *mapped{nullptr}; }; diff --git a/taichi/rhi/llvm/llvm_device.h b/taichi/rhi/llvm/llvm_device.h index cbefaaeb60327..4279ba1765a7a 100644 --- a/taichi/rhi/llvm/llvm_device.h +++ b/taichi/rhi/llvm/llvm_device.h @@ -13,6 +13,7 @@ class LlvmDevice : public Device { JITModule *runtime_jit{nullptr}; LLVMRuntime *runtime{nullptr}; uint64 *result_buffer{nullptr}; + bool use_memory_pool{false}; }; Arch arch() const override { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 6c636f833f299..3adada586f5df 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -46,6 +46,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, config.arch = host_arch(); } else { // CUDA runtime created successfully + use_device_memory_pool_ = CUDAContext::get_instance().supports_mem_pool(); } #else TI_WARN("Taichi is not compiled with CUDA."); @@ -398,6 +399,20 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( const int tree_id = field_cache_data.tree_id; const int root_id = field_cache_data.root_id; + bool all_dense = config_.demote_dense_struct_fors; + for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i].type != SNodeType::dense && + snode_metas[i].type != SNodeType::place && + snode_metas[i].type != SNodeType::root) { + all_dense = false; + break; + } + } + + if (config_.arch == Arch::cuda && use_device_memory_pool() && !all_dense) { + preallocate_runtime_memory(); + } + TI_TRACE("Allocating data structure of size {} bytes", root_size); 
std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); @@ -424,16 +439,6 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( snode_tree_allocs_[tree_id] = alloc; - bool all_dense = config_.demote_dense_struct_fors; - for (size_t i = 0; i < snode_metas.size(); i++) { - if (snode_metas[i].type != SNodeType::dense && - snode_metas[i].type != SNodeType::place && - snode_metas[i].type != SNodeType::root) { - all_dense = false; - break; - } - } - runtime_jit->call( "runtime_initialize_snodes", llvm_runtime_, root_size, root_id, (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense); @@ -471,16 +476,25 @@ LlvmDevice *LlvmRuntimeExecutor::llvm_device() { DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray( std::size_t alloc_size, uint64 *result_buffer) { - return llvm_device()->allocate_memory_runtime( + auto devalloc = llvm_device()->allocate_memory_runtime( {{alloc_size, /*host_write=*/false, /*host_read=*/false, /*export_sharing=*/false, AllocUsage::Storage}, get_runtime_jit_module(), get_llvm_runtime(), - result_buffer}); + result_buffer, + use_device_memory_pool()}); + + TI_ASSERT(allocated_runtime_memory_allocs_.find(devalloc.alloc_id) == + allocated_runtime_memory_allocs_.end()); + allocated_runtime_memory_allocs_[devalloc.alloc_id] = devalloc; + return devalloc; } void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) { + TI_ASSERT(allocated_runtime_memory_allocs_.find(handle.alloc_id) != + allocated_runtime_memory_allocs_.end()); llvm_device()->dealloc_memory(handle); + allocated_runtime_memory_allocs_.erase(handle.alloc_id); } void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc, @@ -534,13 +548,32 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr( void LlvmRuntimeExecutor::finalize() { profiler_ = nullptr; - for (auto &preallocated_device_buffer_alloc : - preallocated_device_buffer_allocs_) { - if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { - llvm_device()->dealloc_memory(preallocated_device_buffer_alloc); - llvm_device()->clear(); - DeviceMemoryPool::get_instance().reset(); + if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { + preallocated_runtime_objects_allocs_.reset(); + preallocated_runtime_memory_allocs_.reset(); + + // Reset runtime memory + auto allocated_runtime_memory_allocs_copy = + allocated_runtime_memory_allocs_; + for (auto &iter : allocated_runtime_memory_allocs_copy) { + // The runtime allocation may have already been freed upon explicit + // Ndarray/Field destruction. Check whether the allocation is still alive. + void *ptr = llvm_device()->get_memory_addr(iter.second); + if (ptr == nullptr) + continue; + + deallocate_memory_ndarray(iter.second); + } + allocated_runtime_memory_allocs_.clear(); + + // Reset device + llvm_device()->clear(); + + // Reset memory pool + DeviceMemoryPool::get_instance().reset(); + + // Release unused memory back to the CUDA memory pool + synchronize(); } finalized_ = true; } @@ -551,7 +584,9 @@ LlvmRuntimeExecutor::~LlvmRuntimeExecutor() { } } -void *LlvmRuntimeExecutor::preallocate_memory(std::size_t prealloc_size) { +void *LlvmRuntimeExecutor::preallocate_memory( + std::size_t prealloc_size, + DeviceAllocationUnique &devalloc) { DeviceAllocation preallocated_device_buffer_alloc; Device::AllocParams preallocated_device_buffer_alloc_params; @@ -563,11 +598,38 @@ void *LlvmRuntimeExecutor::preallocate_memory(std::size_t prealloc_size) { void *preallocated_device_buffer = 
llvm_device()->get_memory_addr(preallocated_device_buffer_alloc); - preallocated_device_buffer_allocs_.emplace_back( + devalloc = std::make_unique( std::move(preallocated_device_buffer_alloc)); return preallocated_device_buffer; } +void LlvmRuntimeExecutor::preallocate_runtime_memory() { + if (preallocated_runtime_memory_allocs_ != nullptr) + return; + + std::size_t total_prealloc_size = 0; + const auto total_mem = llvm_device()->get_total_memory(); + if (config_.device_memory_fraction == 0) { + TI_ASSERT(config_.device_memory_GB > 0); + total_prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); + } else { + total_prealloc_size = + std::size_t(config_.device_memory_fraction * total_mem); + } + TI_ASSERT(total_prealloc_size <= total_mem); + + void *runtime_memory_prealloc_buffer = preallocate_memory( + total_prealloc_size, preallocated_runtime_memory_allocs_); + + TI_TRACE("Allocating device memory {:.2f} MB", + 1.0 * total_prealloc_size / (1UL << 20)); + + auto *const runtime_jit = get_runtime_jit_module(); + runtime_jit->call( + "runtime_initialize_memory", llvm_runtime_, total_prealloc_size, + runtime_memory_prealloc_buffer); +} + void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, uint64 **result_buffer_ptr) { // The result buffer allocated here is only used for the launches of // CUDA and AMDGPU backends. // | ==================preallocated device buffer ========================== | // |<- reserved for return ->|<---- usable for allocators on the device ---->| - std::size_t runtime_objects_prealloc_size = 0; void *runtime_objects_prealloc_buffer = nullptr; if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { #if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU) + runtime_objects_prealloc_size = 60 * (1UL << 20); // 60 MB + runtime_objects_prealloc_buffer = preallocate_memory( + runtime_objects_prealloc_size, preallocated_runtime_objects_allocs_); TI_TRACE("Allocating device memory {:.2f} MB", 1.0 * runtime_objects_prealloc_size / (1UL << 20)); @@ -639,31 +701,9 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, // Preallocate for runtime memory and update to LLVMRuntime if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { - std::size_t total_prealloc_size = 0; - const auto total_mem = llvm_device()->get_total_memory(); - if (config_.device_memory_fraction == 0) { - TI_ASSERT(config_.device_memory_GB > 0); - total_prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); - } else { - total_prealloc_size = - std::size_t(config_.device_memory_fraction * total_mem); + if (!use_device_memory_pool()) { + preallocate_runtime_memory(); } - TI_ASSERT(total_prealloc_size <= total_mem); - - auto runtime_memory_prealloc_size = - total_prealloc_size > runtime_objects_prealloc_size ? 
total_prealloc_size - runtime_objects_prealloc_size - : 0; - - void *runtime_memory_prealloc_buffer = - preallocate_memory(runtime_memory_prealloc_size); - - TI_TRACE("Allocating device memory {:.2f} MB", - 1.0 * runtime_memory_prealloc_size / (1UL << 20)); - - runtime_jit->call( - "runtime_initialize_memory", llvm_runtime_, - runtime_memory_prealloc_size, runtime_memory_prealloc_buffer); } if (config_.arch == Arch::cuda) { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h index 6e487419a28a5..871e394ad8b50 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.h +++ b/taichi/runtime/llvm/llvm_runtime_executor.h @@ -75,6 +75,10 @@ class LlvmRuntimeExecutor { void synchronize(); + bool use_device_memory_pool() { + return use_device_memory_pool_; + } + private: /* ----------------------- */ /* ------ Allocation ----- */ @@ -96,7 +100,9 @@ class LlvmRuntimeExecutor { std::size_t size, uint32_t data); - void *preallocate_memory(std::size_t prealloc_size); + void *preallocate_memory(std::size_t prealloc_size, + DeviceAllocationUnique &devalloc); + void preallocate_runtime_memory(); /* ------------------------- */ /* ---- Runtime Helpers ---- */ @@ -144,12 +150,16 @@ class LlvmRuntimeExecutor { std::unique_ptr snode_tree_buffer_manager_{nullptr}; std::unordered_map snode_tree_allocs_; - std::vector preallocated_device_buffer_allocs_; + DeviceAllocationUnique preallocated_runtime_objects_allocs_ = nullptr; + DeviceAllocationUnique preallocated_runtime_memory_allocs_ = nullptr; + std::unordered_map + allocated_runtime_memory_allocs_; // good buddy friend LlvmProgramImpl; friend SNodeTreeBufferManager; + bool use_device_memory_pool_ = false; bool finalized_{false}; KernelProfilerBase *profiler_ = nullptr; };
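Taken together, the last patch means that on CUDA drivers reporting memory-pool support, runtime allocations for ndarrays bypass the cached device pool and go through `malloc_async`/`mem_free_async`, so the large up-front preallocation is only needed once sparse data structures come into play. A rough sketch of the user-visible shape of this change (sizes are illustrative; the allocation path is chosen internally, not by the user):

```python
import taichi as ti

# On CUDA builds where the driver supports memory pools, ndarray memory is
# obtained through malloc_async instead of the preallocated cached pool.
ti.init(arch=ti.cuda, device_memory_GB=1)  # preallocation budget, illustrative

x = ti.ndarray(ti.f32, shape=(4096, 4096))

@ti.kernel
def fill(a: ti.types.ndarray(ndim=2)):
    for i, j in ti.ndrange(a.shape[0], a.shape[1]):
        a[i, j] = i + j

fill(x)
x = None  # once collected, the buffer is released via mem_free_async
```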