diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000000000..36fa23577f27d --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,19 @@ +self-hosted-runner: + labels: + - cn + - cuda + - OpenGL + - vulkan + - m1 + - driver470 + - driver510 + - benchmark + - release + - build + - sm70 + - sm86 + - amdgpu + - online + +# config-variables: +# - ENVIRONMENT_STAGE diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000000000..5da34e3b05b9b --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,395 @@ +name: Build Taichi +on: + workflow_call: + inputs: + build_id: + required: true + type: string + nightly: + required: true + type: boolean + python: + required: true + type: string + secrets: + BOT_MINIO_ACCESS_KEY: + required: true + BOT_MINIO_SECRET_KEY: + required: true + workflow_dispatch: + inputs: + build_id: + description: 'The build id. e.g.: 20230427-102544-abcdefab' + required: true + type: string + nightly: + description: 'Are we building nightly wheels?' + required: true + default: false + type: boolean + python: + description: 'JSON encoded python versions need building wheel. e.g.: ["3.7"]' + required: true + type: string + +concurrency: + group: build-${{ github.event.number || github.run_id }} + cancel-in-progress: true + +env: + TI_CI: "1" + TI_SKIP_VERSION_CHECK: 'ON' + CI_IMAGE_VERSION: '202304251731' + TI_USE_GIT_CACHE: ${{ vars.TI_USE_GIT_CACHE }} + NIGHTLY: ${{ inputs.nightly && 'nightly' || '' }} + +jobs: + show_environ: + name: Show Environment Variables + # Disable this workflow on forks + if: github.repository_owner == 'taichi-dev' + runs-on: [self-hosted, Linux] + steps: + - name: Environment Variables + run: env + - name: Github Object + run: | + cat <<'EOF' + ${{ toJson(github) }} + EOF + + build_cpu_mac: + name: Build macOS-x86 + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + include: + - _designated: '' + designated: designated + runs-on: + - self-hosted + - macos-10.15 + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=ON + -DTI_WITH_C_API:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: ./build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_linux: + name: Build Linux + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + cuda: ['', cuda] + llvm: ['', llvm] + gl: ['', gl] + vk: ['', vk] + python: ${{ fromJSON(inputs.python) }} + include: + - {cuda: cuda, llvm: llvm, gl: gl, vk: vk, designated: designated} + exclude: + - {llvm: '', cuda: cuda} + runs-on: [self-hosted, online, Linux, build] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=${{ matrix.cuda && 'ON' || 'OFF' }} + -DTI_WITH_LLVM:BOOL=${{ matrix.llvm && 'ON' || 'OFF' }} + -DTI_WITH_OPENGL:BOOL=${{ matrix.gl && 
'ON' || 'OFF' }} + -DTI_WITH_VULKAN:BOOL=${{ matrix.vk && 'ON' || 'OFF' }} + -DTI_WITH_METAL:BOOL=OFF + -DTI_WITH_BACKTRACE:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build + run: | + . .github/workflows/scripts/common-utils.sh + ci-docker-run-gpu \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-cuda:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + env: + SHOULD_TAG_CONFIG: ${{ !matrix.designated && 'yes' || '' }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_manylinux2014: + name: Build manylinux2014 + timeout-minutes: 30 + runs-on: [self-hosted, online, Linux, build] + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + include: + - _designated: '' + designated: designated + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=OFF + -DTI_BUILD_TESTS:BOOL=ON + + steps: + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build + run: | + . .github/workflows/scripts/common-utils.sh + + ci-docker-run-gpu \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-manylinux2014-cuda:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_amdgpu_linux: + name: Build AMDGPU + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + designated: [''] + runs-on: [self-hosted, online, Linux, build] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=OFF + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_AMDGPU:BOOL=ON + + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build & Install + run: | + . 
.github/workflows/scripts/common-utils.sh + + ci-docker-run \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-amdgpu:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} --tag-local=amd + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_windows: + name: Build Windows + strategy: + fail-fast: false + matrix: + cuda: ['', cuda] + llvm: ['', llvm] + gl: ['', gl] + vk: ['', vk] + python: ${{ fromJSON(inputs.python) }} + include: + - {cuda: cuda, llvm: llvm, gl: gl, vk: vk, lto: lto, pdb: pdb, designated: designated} + exclude: + - {llvm: '', cuda: cuda} + runs-on: [self-hosted, online, Windows, build] + timeout-minutes: 30 + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=${{ matrix.cuda && 'ON' || 'OFF' }} + -DTI_WITH_LLVM:BOOL=${{ matrix.llvm && 'ON' || 'OFF' }} + -DTI_WITH_OPENGL:BOOL=${{ matrix.gl && 'ON' || 'OFF' }} + -DTI_WITH_VULKAN:BOOL=${{ matrix.vk && 'ON' || 'OFF' }} + -DTI_WITH_METAL:BOOL=OFF + -DTI_WITH_BACKTRACE:BOOL=ON + -DTI_WITH_DX11:BOOL=ON + -DTI_WITH_DX12:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_C_API:BOOL=ON + -DTI_WITH_LTO:BOOL=${{ matrix.lto && 'ON' || 'OFF' }} + -DTI_GENERATE_PDB:BOOL=${{ matrix.pdb && 'ON' || 'OFF' }} + steps: + - name: Workaround checkout Needed single revision issue + shell: pwsh + run: | + $ErrorActionPreference = 'SilentlyContinue' + git config --system core.longpaths true + git submodule foreach --recursive 'git rev-parse HEAD || rm -rf $PWD' + $LASTEXITCODE = 0 + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - uses: actions/setup-python@v4 + with: + # force a 3.7 is ok, build.py will handle actual python env + python-version: 3.7 + + - name: Build + shell: pwsh + run: | + $nightlyFlag = $null + if ($env:NIGHTLY) { $nightlyFlag = "--nightly" } + $tagFlag = $null + if ($env:SHOULD_TAG_CONFIG) { $tagFlag = "--tag-config" } + python build.py $nightlyFlag --python=${{ matrix.python }} $tagFlag + env: + SHOULD_TAG_CONFIG: ${{ !matrix.designated && 'yes' || '' }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + - name: Cleanup Git Cache Configs + shell: pwsh + if: always() + run: | + . 
.github/workflows/scripts/common-utils.ps1 + UnsetGitCachingProxy + exit 0 + + build_m1: + name: Build M1 + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + exclude: + - python: "3.7" + include: + - _designated: '' + designated: designated + defaults: + run: + shell: '/usr/bin/arch -arch arm64e /bin/bash --noprofile --norc -eo pipefail {0}' + runs-on: [self-hosted, online, m1] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_CUDA:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_C_API=ON + -DTI_WITH_STATIC_C_API=ON + PLATFORM: 'm1' + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: | + brew install molten-vk + ./build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_ios_capi: + name: Build iOS C-API Static Library + timeout-minutes: 30 + runs-on: [self-hosted, online, m1] + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: .github/workflows/scripts/build.py ios + + - name: Save Compiled Static Library + uses: actions/upload-artifact@v3 + with: + name: libtaichi_c_api.iOS.a + path: 'dist/C-API-iOS/*.a' + retention-days: 7 diff --git a/.github/workflows/initiator.yaml b/.github/workflows/initiator.yaml new file mode 100644 index 0000000000000..8cd5cfd4ddf8a --- /dev/null +++ b/.github/workflows/initiator.yaml @@ -0,0 +1,74 @@ +name: Taichi Workflow Initiator +on: + push: + branches: + - master + - rc-* + +concurrency: + group: ${{ github.event.number || github.run_id }} + cancel-in-progress: true + +env: + TI_CI: "1" + TI_SKIP_VERSION_CHECK: 'ON' + TI_LITE_TEST: ${{ github.event_name == 'pull_request' && ! contains(github.event.pull_request.labels.*.name, 'full-ci') && ! 
startsWith(github.base_ref, 'rc-') && '1' || '' }} + TI_TEST_OFFLINE_CACHE: ${{ github.event.schedule == '0 18 * * *' && '1' || '' }} + CI_IMAGE_VERSION: '202304251731' + TI_USE_GIT_CACHE: ${{ vars.TI_USE_GIT_CACHE }} + REDIS_HOST: 172.16.5.1 + +jobs: + show_environ: + name: Show Environment Variables + # Disable this workflow on forks + if: github.repository_owner == 'taichi-dev' + runs-on: [self-hosted, Linux] + steps: + - name: Environment Variables + run: env + - name: Github Object + run: | + cat <<'EOF' + ${{ toJson(github) }} + EOF + + preparation: + name: Preparation + runs-on: [self-hosted, Linux] + outputs: + date: ${{ steps.gather.outputs.date }} + short_sha: ${{ steps.gather.outputs.short_sha }} + build_id: ${{ steps.gather.outputs.build_id }} + steps: + - name: Gather Information + id: gather + run: | + DATE=$(date +'%Y%m%d-%H%M%S') + SHORT_SHA=$(echo $GITHUB_SHA | cut -c '1-10') + echo date=$DATE >> $GITHUB_OUTPUT + echo short_sha=$SHORT_SHA >> $GITHUB_OUTPUT + echo build_id=$DATE-$SHORT_SHA >> $GITHUB_OUTPUT + + build: + name: Build Artifacts + needs: [preparation] + uses: ./.github/workflows/build.yaml + with: + build_id: ${{ needs.preparation.outputs.build_id }} + nightly: false + python: '["3.9", "3.10", "3.11"]' + secrets: + BOT_MINIO_ACCESS_KEY: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + BOT_MINIO_SECRET_KEY: ${{ secrets.BOT_MINIO_SECRET_KEY }} + + save_build_metadata: + name: Save Build Metadata + runs-on: [self-hosted, Linux] + needs: [preparation, build] + steps: + - name: Saving Metadata + id: gather + run: >- + redis-cli -h $REDIS_HOST --raw + set "latest-build-id:$GITHUB_REPOSITORY:$GITHUB_REF" ${{ needs.preparation.outputs.build_id }} diff --git a/.github/workflows/scripts/ti_build/cmake.py b/.github/workflows/scripts/ti_build/cmake.py index cf6d07ed5df14..2dc327710e40b 100644 --- a/.github/workflows/scripts/ti_build/cmake.py +++ b/.github/workflows/scripts/ti_build/cmake.py @@ -14,7 +14,7 @@ from .misc import banner # -- code -- -OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)') +OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)(?: *# wheel-tag: (.*))?') DEF_RE = re.compile(r"-D([A-Z0-9_]*)(?::BOOL)?=([^ ]+)(?: |$)") @@ -28,10 +28,7 @@ def __init__(self, environ_name): self.environ_name = environ_name self.definitions = {} self.option_definitions = { - "CMAKE_EXPORT_COMPILE_COMMANDS": ( - "Generate compile_commands.json", - False, - ), + "CMAKE_EXPORT_COMPILE_COMMANDS": ("Generate compile_commands.json", False, ""), } self.finalized = False @@ -39,9 +36,9 @@ def __init__(self, environ_name): def collect_options(self, *files: str) -> None: for fn in files: with open(fn, "r") as f: - for name, desc, default in OPTION_RE.findall(f.read()): + for name, desc, default, wheel_tag in OPTION_RE.findall(f.read()): default = self._VMAP.get(default, default) - self.option_definitions[name] = (desc, default) + self.option_definitions[name] = (desc, default, wheel_tag) def parse_initial_args(self) -> None: args = os.environ.get(self.environ_name, "") @@ -49,7 +46,7 @@ def parse_initial_args(self) -> None: self.set(name, value) def get_effective(self, name: str) -> Union[str, bool]: - _, default = self.option_definitions.get(name, ("", None)) + _, default, _ = self.option_definitions.get(name, ("", None, "")) return self.definitions.get(name, default) def set(self, name: str, value: Union[str, bool]) -> None: @@ -57,7 +54,7 @@ def set(self, name: str, value: Union[str, bool]) -> None: desc = "" value = self._VMAP.get(value, value) default = None 
- desc, default = self.option_definitions.get(name, ("", None)) + desc, default, wheel_tag = self.option_definitions.get(name, ("", None, "")) desc = desc and f" ({desc}) " is_bool = isinstance(default, bool) assert not is_bool or isinstance(value, bool), f"Option {name} must be bool" @@ -84,6 +81,7 @@ def set(self, name: str, value: Union[str, bool]) -> None: else: p(f"{B}:: CMAKE: Already disabled: {name}{desc}{N}") else: + assert not wheel_tag, "Set a non boolean value to an option with wheel-tag" if orig != value: if orig != default: p(f"{R}:: CMAKE- {name}={orig}{desc}{N}") @@ -99,7 +97,7 @@ def render(self) -> List[Tuple[str, str, str]]: else: v = f"-D{name}={value}" - desc, _ = self.option_definitions.get(name, ("", None)) + desc, _, _ = self.option_definitions.get(name, ("", None, "")) if desc: prefix = "DO NOT " if not value else "" desc = f" ({prefix}{desc})" @@ -108,6 +106,15 @@ def render(self) -> List[Tuple[str, str, str]]: return lst + def render_wheel_tag(self) -> str: + tags = [] + for name, (_, default, wheel_tag) in self.option_definitions.items(): + if not wheel_tag: + continue + if self.definitions.get(name, default): + tags.append(wheel_tag) + return ".".join(sorted(tags)) + @banner("{self.environ_name} Summary") def print_summary(self, rendered) -> None: p = lambda s: print(s, file=sys.stderr, flush=True) diff --git a/.github/workflows/scripts/ti_build/compiler.py b/.github/workflows/scripts/ti_build/compiler.py index 5854705bc3f55..4c3b66dc15f60 100644 --- a/.github/workflows/scripts/ti_build/compiler.py +++ b/.github/workflows/scripts/ti_build/compiler.py @@ -3,14 +3,18 @@ # -- stdlib -- from pathlib import Path import os +import json import platform import shutil +import tempfile +import sys # -- third party -- # -- own -- from .cmake import cmake_args from .dep import download_dep from .misc import banner, error, get_cache_home, warn +from .tinysh import powershell # -- code -- @@ -56,36 +60,90 @@ def setup_clang(as_compiler=True) -> None: cmake_args["CMAKE_CXX_COMPILER"] = clangpp +ENV_EXTRACT_SCRIPT = """ +param ([string]$DevShell, [string]$VsPath, [string]$OutFile) +$WarningPreference = 'SilentlyContinue' +Import-Module $DevShell +Enter-VsDevShell -VsInstallPath $VsPath -SkipAutomaticLocation -DevCmdArguments "-arch=x64" +Get-ChildItem env:* | ConvertTo-Json -Depth 1 | Out-File $OutFile +""" + + +def _vs_devshell(vs): + dll = vs / "Common7" / "Tools" / "Microsoft.VisualStudio.DevShell.dll" + + if not dll.exists(): + error("Could not find Visual Studio DevShell") + return + + with tempfile.TemporaryDirectory() as tmp: + tmp = Path(tmp) + script = tmp / "extract.ps1" + with script.open("w") as f: + f.write(ENV_EXTRACT_SCRIPT) + outfile = tmp / "env.json" + powershell( + "-ExecutionPolicy", + "Bypass", + "-File", + str(script), + "-DevShell", + str(dll), + "-VsPath", + str(vs), + "-OutFile", + str(outfile), + ) + with outfile.open(encoding="utf-16") as f: + envs = json.load(f) + + for v in envs: + os.environ[v["Key"]] = v["Value"] + + @banner("Setup MSVC") def setup_msvc() -> None: assert platform.system() == "Windows" - os.environ["TAICHI_USE_MSBUILD"] = "1" - base = Path(r"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools") - for edition in ("Enterprise", "Professional", "Community", "BuildTools"): - if (base / edition).exists(): - return + base = Path("C:\\Program Files (x86)\\Microsoft Visual Studio") + for ver in ("2022",): + for edition in ("Enterprise", "Professional", "Community", "BuildTools"): + vs = base / ver / edition + if not 
vs.exists(): + continue + + if os.environ.get("TI_CI") and not os.environ.get("TAICHI_USE_MSBUILD"): + # Use Ninja + MSVC in CI, for better caching + _vs_devshell(vs) + cmake_args["CMAKE_C_COMPILER"] = "cl.exe" + cmake_args["CMAKE_CXX_COMPILER"] = "cl.exe" + else: + os.environ["TAICHI_USE_MSBUILD"] = "1" - url = "https://aka.ms/vs/17/release/vs_BuildTools.exe" - out = base - download_dep( - url, - out, - elevate=True, - args=[ - "--passive", - "--wait", - "--norestart", - "--includeRecommended", - "--add", - "Microsoft.VisualStudio.Workload.VCTools", - # NOTE: We are using the custom built Clang++, - # so components below are not necessary anymore. - # '--add', - # 'Microsoft.VisualStudio.Component.VC.Llvm.Clang', - # '--add', - # 'Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Llvm.Clang', - # '--add', - # 'Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset', - ], - ) + return + else: + url = "https://aka.ms/vs/17/release/vs_BuildTools.exe" + out = base / "2022" / "BuildTools" + download_dep( + url, + out, + elevate=True, + args=[ + "--passive", + "--wait", + "--norestart", + "--includeRecommended", + "--add", + "Microsoft.VisualStudio.Workload.VCTools", + # NOTE: We are using the custom built Clang++, + # so components below are not necessary anymore. + # '--add', + # 'Microsoft.VisualStudio.Component.VC.Llvm.Clang', + # '--add', + # 'Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Llvm.Clang', + # '--add', + # 'Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset', + ], + ) + warn("Please restart build.py after Visual Studio Build Tools is installed.") + sys.exit(1) diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py index 41dd136be38e9..663de529f1463 100644 --- a/.github/workflows/scripts/ti_build/entry.py +++ b/.github/workflows/scripts/ti_build/entry.py @@ -2,6 +2,7 @@ # -- stdlib -- import argparse +import datetime import os import platform import subprocess @@ -20,7 +21,7 @@ from .ospkg import setup_os_pkgs from .python import get_desired_python_version, setup_python from .sccache import setup_sccache -from .tinysh import Command, CommandFailed, git +from .tinysh import Command, CommandFailed, git, nice from .vulkan import setup_vulkan @@ -30,15 +31,22 @@ def build_wheel(python: Command, pip: Command) -> None: """ Build the Taichi wheel """ + git.fetch("origin", "master", "--tags") - proj = os.environ.get("PROJECT_NAME", "taichi") proj_tags = [] extra = [] - if proj == "taichi-nightly": - proj_tags.extend(["egg_info", "--tag-date", "--tag-build=.post"]) - # Include C-API in nightly builds - cmake_args["TI_WITH_C_API"] = True + cmake_args.writeback() + wheel_tag = cmake_args.render_wheel_tag() + if misc.options.tag_local: + wheel_tag = misc.options.tag_local + + if misc.options.nightly: + os.environ["PROJECT_NAME"] = "taichi-nightly" + now = datetime.datetime.now().strftime("%Y%m%d") + proj_tags.extend(["egg_info", f"--tag-build=.post{now}+{wheel_tag}"]) + elif misc.options.tag_config or misc.options.tag_local: + proj_tags.extend(["egg_info", f"--tag-build=+{wheel_tag}"]) if platform.system() == "Linux": if is_manylinux2014(): @@ -46,11 +54,11 @@ def build_wheel(python: Command, pip: Command) -> None: else: extra.extend(["-p", "manylinux_2_27_x86_64"]) - cmake_args.writeback() python("setup.py", "clean") python("misc/make_changelog.py", "--ver", "origin/master", "--repo_dir", "./", "--save") - python("setup.py", *proj_tags, "bdist_wheel", *extra) + with nice(): + python("setup.py", *proj_tags, "bdist_wheel", *extra) 
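For reference, a minimal standalone sketch (not the actual `ti_build.entry` module) of how the new `--nightly` / `--tag-config` / `--tag-local` flags shown in `build_wheel()` above translate into `setup.py egg_info` arguments; `wheel_tag` stands in for whatever `cmake_args.render_wheel_tag()` would return, and the example value is only illustrative:

```python
# Hypothetical sketch of the tagging logic in build_wheel() above; the real code
# reads these values from misc.options and cmake_args instead of parameters.
import datetime
from typing import List, Optional


def egg_info_args(wheel_tag: str, nightly: bool, tag_config: bool,
                  tag_local: Optional[str]) -> List[str]:
    if tag_local:  # --tag-local overrides --tag-config
        wheel_tag = tag_local
    if nightly:  # taichi-nightly: date + config in the build tag
        now = datetime.datetime.now().strftime("%Y%m%d")
        return ["egg_info", f"--tag-build=.post{now}+{wheel_tag}"]
    if tag_config or tag_local:  # plain local-version tag
        return ["egg_info", f"--tag-build=+{wheel_tag}"]
    return []  # designated builds keep an untagged version


# e.g. ['egg_info', '--tag-build=.post20230427+cu.gl.llvm.vk']
print(egg_info_args("cu.gl.llvm.vk", nightly=True, tag_config=False, tag_local=None))
```

Note that the CI matrix jobs above pass `--tag-config` only for non-designated configurations (via `SHOULD_TAG_CONFIG`), so the designated build keeps a clean wheel version.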
@banner("Install Build Wheel Dependencies") @@ -59,7 +67,7 @@ def install_build_wheel_deps(python: Command, pip: Command) -> None: pip.install("-r", "requirements_dev.txt") -def setup_basic_build_env(force_vulkan=False): +def setup_basic_build_env(): u = platform.uname() if (u.system, u.machine) == ("Windows", "AMD64"): # Use MSVC on Windows @@ -70,8 +78,7 @@ def setup_basic_build_env(force_vulkan=False): setup_clang() setup_llvm() - if force_vulkan or cmake_args.get_effective("TI_WITH_VULKAN"): - setup_vulkan() + setup_vulkan() sccache = setup_sccache() @@ -89,18 +96,18 @@ def action_wheel(): handle_alternate_actions() build_wheel(python, pip) try: - sccache("--stop-server") + sccache("-s") except CommandFailed: pass def action_android(): - sccache, python, pip = setup_basic_build_env(force_vulkan=True) + sccache, python, pip = setup_basic_build_env() setup_android_ndk() handle_alternate_actions() build_android(python, pip) try: - sccache("--stop-server") + sccache("-s") except CommandFailed: pass @@ -150,6 +157,15 @@ def parse_args(): help = "Continue when encounters error." parser.add_argument("--permissive", action="store_true", default=False, help=help) + help = "Tag built wheel with TI_WITH_xxx config." + parser.add_argument("--tag-config", action="store_true", default=False, help=help) + + help = "Set a local version. Overrides --tag-config." + parser.add_argument("--tag-local", type=str, default=None, help=help) + + help = "Build nightly wheel." + parser.add_argument("--nightly", action="store_true", default=False, help=help) + options = parser.parse_args() return options diff --git a/.github/workflows/scripts/ti_build/tinysh.py b/.github/workflows/scripts/ti_build/tinysh.py index 4a88d88077b84..9a7e126095bad 100644 --- a/.github/workflows/scripts/ti_build/tinysh.py +++ b/.github/workflows/scripts/ti_build/tinysh.py @@ -218,6 +218,19 @@ def sudo(): return prefix("sudo") +def nice(): + """ + Wrap a command with sudo. + """ + if IS_WINDOWS: + from .misc import warn + + warn("nice is not yet implemented on Windows") + return with_options({}) + else: + return prefix("nice") + + sh = Command() git = sh.git # Use setup_python ! 
@@ -228,3 +241,5 @@ def sudo(): bash = sh.bash start = sh.start.bake("/wait") apt = sh.sudo.apt +powershell = Command("powershell.exe") +pwsh = Command("pwsh.exe") diff --git a/CMakeLists.txt b/CMakeLists.txt index d024821b1f5e0..b29755fbcad3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(USE_LLD "Use lld (from llvm) linker" OFF) option(USE_MOLD "Use mold (A Modern Linker)" OFF) -option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) +option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) # wheel-tag: bt +option(TI_GENERATE_PDB "Generate Program Database (PDB) files (will make compilation uncacheable)" OFF) +option(TI_WITH_LTO "Enable Link Time Optimization (LTO) (affects Windows + MSVC for now)" OFF) # wheel-tag: lto if(LINUX OR APPLE) if (NOT IOS) @@ -205,8 +207,8 @@ endif() configure_file(taichi/common/version.h.in ${CMAKE_SOURCE_DIR}/taichi/common/version.h) configure_file(taichi/common/commit_hash.h.in ${CMAKE_SOURCE_DIR}/taichi/common/commit_hash.h) -option(TI_WITH_C_API "build taichi runtime c-api library" ON) -option(TI_WITH_STATIC_C_API "build static taichi runtime c-api library" OFF) +option(TI_WITH_C_API "build taichi runtime c-api library" ON) # wheel-tag: aot +option(TI_WITH_STATIC_C_API "build static taichi runtime c-api library" OFF) # wheel-tag: static_aot if(TI_WITH_STATIC_C_API) set(TI_WITH_C_API ${TI_WITH_STATIC_C_API}) @@ -231,7 +233,7 @@ if (TI_BUILD_RHI_EXAMPLES) endif() -option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) +option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) # wheel-tag: viz if (TI_WITH_GRAPHVIZ) set(GRAPHVIZ_GRAPH_NAME "ti_targets") add_custom_target(graphviz ALL diff --git a/c_api/src/taichi_llvm_impl.cpp b/c_api/src/taichi_llvm_impl.cpp index a8b2fb8d8c6d2..7e89211031a34 100644 --- a/c_api/src/taichi_llvm_impl.cpp +++ b/c_api/src/taichi_llvm_impl.cpp @@ -54,11 +54,11 @@ TiMemory LlvmRuntime::allocate_memory( const taichi::lang::Device::AllocParams ¶ms) { taichi::lang::LLVMRuntime *llvm_runtime = executor_->get_llvm_runtime(); taichi::lang::LlvmDevice *llvm_device = executor_->llvm_device(); - taichi::lang::DeviceAllocation devalloc = - llvm_device->allocate_memory_runtime({params, - executor_->get_runtime_jit_module(), - llvm_runtime, result_buffer}); + llvm_device->allocate_memory_runtime( + {params, executor_->get_runtime_jit_module(), llvm_runtime, + result_buffer, executor_->use_device_memory_pool()}); + return devalloc2devmem(*this, devalloc); } diff --git a/cmake/TaichiCAPITests.cmake b/cmake/TaichiCAPITests.cmake index de894cf131b54..ad743eec3cb52 100644 --- a/cmake/TaichiCAPITests.cmake +++ b/cmake/TaichiCAPITests.cmake @@ -22,7 +22,7 @@ if (WIN32) set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${C_API_TESTS_OUTPUT_DIR}) set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${C_API_TESTS_OUTPUT_DIR}) set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${C_API_TESTS_OUTPUT_DIR}) - if (MSVC) + if (MSVC AND TI_GENERATE_PDB) target_compile_options(${C_API_TESTS_NAME} PRIVATE "$<$:/Zi>") target_link_options(${C_API_TESTS_NAME} PRIVATE "$<$:/DEBUG>") target_link_options(${C_API_TESTS_NAME} PRIVATE "$<$:/OPT:REF>") diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index a03e41d9fc59e..58b9a5a3ded3d 100644 --- 
a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -25,10 +25,14 @@ if (WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -flto=thin") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -flto=thin") elseif (MSVC) - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /GL /Gy") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /GL /Gy") - set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /Gy") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /Gy") + if (TI_WITH_LTO) + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /GL") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /GL") + set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + endif() endif() endif() @@ -39,8 +43,10 @@ if (WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus /Zc:inline /std:c++17") # Linker & object related flags set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /bigobj") - # Debugging (generate PBD files) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /Zf") + # Debugging (generate PDB files) + if (TI_GENERATE_PDB) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /Zf") + endif() # Performance and optimizations set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Oi") # C4244: conversion from 'type1' to 'type2', possible loss of data diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 8579d2d3bdacb..a3b44521f23d2 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -1,14 +1,14 @@ option(USE_STDCPP "Use -stdlib=libc++" OFF) -option(TI_WITH_LLVM "Build with LLVM backends" ON) -option(TI_WITH_METAL "Build with the Metal backend" ON) -option(TI_WITH_CUDA "Build with the CUDA backend" ON) -option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) -option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) -option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) -option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) -option(TI_WITH_DX11 "Build with the DX11 backend" OFF) -option(TI_WITH_DX12 "Build with the DX12 backend" OFF) -option(TI_WITH_GGUI "Build with GGUI" OFF) +option(TI_WITH_LLVM "Build with LLVM backends" ON) # wheel-tag: llvm +option(TI_WITH_METAL "Build with the Metal backend" ON) # wheel-tag: mtl +option(TI_WITH_CUDA "Build with the CUDA backend" ON) # wheel-tag: cu +option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) # wheel-tag: cutk +option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) # wheel-tag: amd +option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) # wheel-tag: gl +option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) # wheel-tag: vk +option(TI_WITH_DX11 "Build with the DX11 backend" OFF) # wheel-tag: dx11 +option(TI_WITH_DX12 "Build with the DX12 backend" OFF) # wheel-tag: dx12 +option(TI_WITH_GGUI "Build with GGUI" OFF) # wheel-tag: ggui # Force symbols to be 'hidden' by default so nothing is exported from the Taichi # library including the third-party dependencies. 
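The `# wheel-tag:` comments added to the `option(...)` lines above are what the extended `OPTION_RE` in `.github/workflows/scripts/ti_build/cmake.py` captures. A small illustrative sketch (standalone Python; the sample `option()` lines and the filtering by declared defaults are simplifications, since the real code also honors `-D` overrides from `TAICHI_CMAKE_ARGS`):

```python
# Illustrative only: shows how wheel-tag comments are captured and joined.
import re

# Same regex as the updated OPTION_RE in ti_build/cmake.py.
OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)(?: *# wheel-tag: (.*))?')

sample = '''
option(TI_WITH_CUDA "Build with the CUDA backend" ON) # wheel-tag: cu
option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) # wheel-tag: vk
option(TI_WITH_LLVM "Build with LLVM backends" ON) # wheel-tag: llvm
option(TI_WITH_GGUI "Build with GGUI" OFF) # wheel-tag: ggui
'''

# Keep tags whose option is effectively enabled (here: just the declared default).
tags = [tag for _name, _desc, default, tag in OPTION_RE.findall(sample)
        if tag and default == "ON"]
print(".".join(sorted(tags)))  # -> cu.llvm
```

`render_wheel_tag()` joins the enabled tags, sorted and dot-separated, and `build_wheel()` embeds the result in the wheel's local version via `--tag-build`.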
diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index c3d59577d15a9..63d96c3a60c11 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -56,11 +56,11 @@ if (WIN32) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${TESTS_OUTPUT_DIR}) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${TESTS_OUTPUT_DIR}) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${TESTS_OUTPUT_DIR}) - if (MSVC) - target_compile_options(${TESTS_NAME} PRIVATE "$<$:/Zi>") - target_link_options(${TESTS_NAME} PRIVATE "$<$:/DEBUG>") - target_link_options(${TESTS_NAME} PRIVATE "$<$:/OPT:REF>") - target_link_options(${TESTS_NAME} PRIVATE "$<$:/OPT:ICF>") + if (MSVC AND TI_GENERATE_PDB) + target_compile_options(${TESTS_NAME} PRIVATE "/Zi") + target_link_options(${TESTS_NAME} PRIVATE "/DEBUG") + target_link_options(${TESTS_NAME} PRIVATE "/OPT:REF") + target_link_options(${TESTS_NAME} PRIVATE "/OPT:ICF") endif() endif() target_link_libraries(${TESTS_NAME} PRIVATE taichi_core) diff --git a/misc/make_changelog.py b/misc/make_changelog.py index a7e23e593ece4..c373aa52bdf4a 100644 --- a/misc/make_changelog.py +++ b/misc/make_changelog.py @@ -112,4 +112,5 @@ def format(c): if args.save: with open("./python/taichi/CHANGELOG.md", "w", encoding="utf-8") as f: f.write(res) - print(res) + else: + print(res) diff --git a/python/taichi/aot/_export.py b/python/taichi/aot/_export.py index 0af32c5fcb938..37889a70fed91 100644 --- a/python/taichi/aot/_export.py +++ b/python/taichi/aot/_export.py @@ -23,4 +23,4 @@ def inner(f): def export(f): - export_as(f.__name__)(f) + return export_as(f.__name__)(f) diff --git a/taichi/common/one_or_more.h b/taichi/common/one_or_more.h index 472f0629b8ec8..bb7ad75a14d33 100644 --- a/taichi/common/one_or_more.h +++ b/taichi/common/one_or_more.h @@ -11,21 +11,27 @@ struct one_or_more { std::variant var; + // NOLINTNEXTLINE one_or_more(value_type const &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(value_type &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(value_type &&value) : var(std::move(value)) { } + // NOLINTNEXTLINE one_or_more(Container const &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(Container &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(Container &&value) : var(std::move(value)) { } diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index ae597e7199ebd..641c9077a2576 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -140,6 +140,9 @@ FrontendWhileStmt::FrontendWhileStmt(const FrontendWhileStmt &o) void ArgLoadExpression::type_check(const CompileConfig *) { ret_type = dt; + if (is_ptr) { + ret_type = TypeFactory::get_instance().get_pointer_type(ret_type, false); + } if (!create_load) { ret_type = TypeFactory::get_instance().get_pointer_type(ret_type, false); } @@ -656,7 +659,7 @@ Stmt *make_ndarray_access(Expression::FlattenContext *ctx, auto var_stmt = flatten_lvalue(var, ctx); auto expr = var.cast(); auto external_ptr_stmt = std::make_unique( - var_stmt, index_stmts, expr->dt.get_shape(), expr->element_dim, + var_stmt, index_stmts, expr->dim, expr->dt.get_shape(), expr->element_dim, expr->is_grad); if (expr->dim == indices.size()) { // Indexing into an scalar element @@ -960,7 +963,7 @@ void AtomicOpExpression::type_check(const CompileConfig *config) { }; // Broadcast val to dest if neccessary - auto val_dtype = val->ret_type; + auto val_dtype = get_rvalue_dtype(val); auto dest_dtype = 
dest->ret_type.ptr_removed(); if (dest_dtype->is() && val_dtype->is()) { error(); @@ -973,20 +976,18 @@ void AtomicOpExpression::type_check(const CompileConfig *config) { } // Validate dtype - auto dtype = val->ret_type; - if (dtype->is()) { - dtype = dtype.get_element_type(); + if (val_dtype->is()) { + val_dtype = val_dtype.get_element_type(); } - if (!dtype->is()) { + if (!val_dtype->is()) { error(); } - if (is_quant(dest->ret_type)) { - ret_type = dest->ret_type->get_compute_type(); - } else if (dest->ret_type->is() || - dest->ret_type->is()) { - ret_type = dest->ret_type; + if (is_quant(dest_dtype)) { + ret_type = dest_dtype->get_compute_type(); + } else if (dest_dtype->is() || dest_dtype->is()) { + ret_type = dest_dtype; } else { error(); } @@ -1271,7 +1272,7 @@ void MeshIndexConversionExpression::flatten(FlattenContext *ctx) { } void ReferenceExpression::type_check(const CompileConfig *) { - ret_type = var->ret_type; + ret_type = TypeFactory::get_instance().get_pointer_type(var->ret_type); } void ReferenceExpression::flatten(FlattenContext *ctx) { @@ -1796,4 +1797,14 @@ Stmt *flatten_rvalue(Expr ptr, Expression::FlattenContext *ctx) { return ptr_stmt; } +DataType get_rvalue_dtype(Expr expr) { + if (auto argload = expr.cast()) { + if (argload->is_ptr) { + return argload->ret_type.ptr_removed(); + } + return argload->ret_type; + } + return expr->ret_type; +} + } // namespace taichi::lang diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index a873d42b8e5bd..1c4a6bb3448fa 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -1109,4 +1109,6 @@ Stmt *flatten_lvalue(Expr expr, Expression::FlattenContext *ctx); Stmt *flatten_rvalue(Expr expr, Expression::FlattenContext *ctx); +DataType get_rvalue_dtype(Expr expr); + } // namespace taichi::lang diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp index 29df5319cf31e..27a2f70d4b4aa 100644 --- a/taichi/ir/ir_builder.cpp +++ b/taichi/ir/ir_builder.cpp @@ -441,7 +441,7 @@ ExternalPtrStmt *IRBuilder::create_external_ptr( const std::vector &indices, bool is_grad) { return insert(Stmt::make_typed( - ptr, indices, std::vector(), 0, is_grad)); + ptr, indices, indices.size(), std::vector(), 0, is_grad)); } AdStackAllocaStmt *IRBuilder::create_ad_stack(const DataType &dt, diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp index 8234ddce9b868..1666a614d8e89 100644 --- a/taichi/ir/statements.cpp +++ b/taichi/ir/statements.cpp @@ -36,6 +36,7 @@ ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, bool is_grad) : base_ptr(base_ptr), indices(indices), is_grad(is_grad) { + ndim = indices.size(); TI_ASSERT(base_ptr != nullptr); TI_ASSERT(base_ptr->is()); TI_STMT_REG_FIELDS; @@ -43,12 +44,14 @@ ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, + int ndim, const std::vector &element_shape, int element_dim, bool is_grad) : ExternalPtrStmt(base_ptr, indices, is_grad) { this->element_shape = element_shape; this->element_dim = element_dim; + this->ndim = ndim; } GlobalPtrStmt::GlobalPtrStmt(SNode *snode, diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 04fe2175f1838..f3bfd718207e9 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -333,7 +333,13 @@ class AtomicOpStmt : public Stmt, class ExternalPtrStmt : public Stmt { public: Stmt *base_ptr; + std::vector indices; + + // Number of dimensions of external shape + int ndim; + + // Shape of element type std::vector element_shape; 
// AOS: element_dim < 0 // SOA: element_dim > 0 @@ -352,6 +358,7 @@ class ExternalPtrStmt : public Stmt { ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, + int ndim, const std::vector &element_shape, int element_dim, bool is_grad = false); diff --git a/taichi/rhi/CMakeLists.txt b/taichi/rhi/CMakeLists.txt index 0e26f5579f500..4b88b7d974c0d 100644 --- a/taichi/rhi/CMakeLists.txt +++ b/taichi/rhi/CMakeLists.txt @@ -113,3 +113,8 @@ target_link_libraries(${TAICHI_DEVICE_API} PUBLIC common_rhi) # Generate shared library add_library(ti_device_api_shared SHARED public_device.h) target_link_libraries(ti_device_api_shared PUBLIC ${TAICHI_DEVICE_API}) + +# When building targets on Windows using Ninja + MSVC, the linker requires at least 1 object file +# to work properly, else link.exe would complain about LNK4001 warning, and fail afterwards. +# Adding a dummy file to workaround this. +target_sources(ti_device_api_shared PRIVATE dummy.cpp) diff --git a/taichi/rhi/amdgpu/amdgpu_context.h b/taichi/rhi/amdgpu/amdgpu_context.h index affef71aa2ddd..6688eb91a5667 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.h +++ b/taichi/rhi/amdgpu/amdgpu_context.h @@ -94,7 +94,7 @@ class AMDGPUContext { void *new_ctx_; public: - ContextGuard(AMDGPUContext *new_ctx) + explicit ContextGuard(AMDGPUContext *new_ctx) : old_ctx_(nullptr), new_ctx_(new_ctx) { AMDGPUDriver::get_instance().context_get_current(&old_ctx_); if (old_ctx_ != new_ctx) diff --git a/taichi/rhi/amdgpu/amdgpu_device.cpp b/taichi/rhi/amdgpu/amdgpu_device.cpp index ee86b03665dc3..ac786a4cb0ffb 100644 --- a/taichi/rhi/amdgpu/amdgpu_device.cpp +++ b/taichi/rhi/amdgpu/amdgpu_device.cpp @@ -106,8 +106,8 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { false); } else if (!info.use_preallocated) { DeviceMemoryPool::get_instance().release(info.size, info.ptr); - info.ptr = nullptr; } + info.ptr = nullptr; } RhiResult AmdgpuDevice::map(DeviceAllocation alloc, void **mapped_ptr) { diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp index fe5d296071672..25d24ac8e8e5a 100644 --- a/taichi/rhi/cuda/cuda_device.cpp +++ b/taichi/rhi/cuda/cuda_device.cpp @@ -53,17 +53,17 @@ DeviceAllocation CudaDevice::allocate_memory_runtime( info.size = taichi::iroundup(params.size, taichi_page_size); if (info.size == 0) { info.ptr = nullptr; + } else if (params.use_memory_pool) { + CUDADriver::get_instance().malloc_async((void **)&info.ptr, info.size, + nullptr); } else { info.ptr = DeviceMemoryPool::get_instance().allocate_with_cache(this, params); - - TI_ASSERT(info.ptr != nullptr); - - CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size); } info.is_imported = false; info.use_cached = true; info.use_preallocated = true; + info.use_memory_pool = params.use_memory_pool; DeviceAllocation alloc; alloc.alloc_id = allocations_.size(); @@ -92,6 +92,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) { validate_device_alloc(handle); AllocInfo &info = allocations_[handle.alloc_id]; + if (info.size == 0) { return; } @@ -99,14 +100,16 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) { TI_ERROR("the DeviceAllocation is already deallocated"); } TI_ASSERT(!info.is_imported); - if (info.use_cached) { + if (info.use_memory_pool) { + CUDADriver::get_instance().mem_free_async(info.ptr, nullptr); + } else if (info.use_cached) { DeviceMemoryPool::get_instance().release(info.size, (uint64_t *)info.ptr, false); } else if (!info.use_preallocated) { auto &mem_pool = DeviceMemoryPool::get_instance(); 
mem_pool.release(info.size, info.ptr, true /*release_raw*/); - info.ptr = nullptr; } + info.ptr = nullptr; } RhiResult CudaDevice::upload_data(DevicePtr *device_ptr, diff --git a/taichi/rhi/cuda/cuda_device.h b/taichi/rhi/cuda/cuda_device.h index 0e06174552331..3a94209084a13 100644 --- a/taichi/rhi/cuda/cuda_device.h +++ b/taichi/rhi/cuda/cuda_device.h @@ -77,6 +77,7 @@ class CudaDevice : public LlvmDevice { * */ bool use_preallocated{true}; bool use_cached{false}; + bool use_memory_pool{false}; void *mapped{nullptr}; }; diff --git a/taichi/rhi/dummy.cpp b/taichi/rhi/dummy.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/taichi/rhi/llvm/device_memory_pool.h b/taichi/rhi/llvm/device_memory_pool.h index 0ccb5ae77f338..f5081defb2c57 100644 --- a/taichi/rhi/llvm/device_memory_pool.h +++ b/taichi/rhi/llvm/device_memory_pool.h @@ -24,7 +24,7 @@ class TI_DLL_EXPORT DeviceMemoryPool { void *allocate(std::size_t size, std::size_t alignment, bool managed = false); void release(std::size_t size, void *ptr, bool release_raw = false); void reset(); - DeviceMemoryPool(bool merge_upon_release); + explicit DeviceMemoryPool(bool merge_upon_release); ~DeviceMemoryPool(); protected: diff --git a/taichi/rhi/llvm/llvm_device.h b/taichi/rhi/llvm/llvm_device.h index cbefaaeb60327..4279ba1765a7a 100644 --- a/taichi/rhi/llvm/llvm_device.h +++ b/taichi/rhi/llvm/llvm_device.h @@ -13,6 +13,7 @@ class LlvmDevice : public Device { JITModule *runtime_jit{nullptr}; LLVMRuntime *runtime{nullptr}; uint64 *result_buffer{nullptr}; + bool use_memory_pool{false}; }; Arch arch() const override { diff --git a/taichi/rhi/opengl/opengl_device.cpp b/taichi/rhi/opengl/opengl_device.cpp index e2e65e25f2a28..cae2caeb4d0ea 100644 --- a/taichi/rhi/opengl/opengl_device.cpp +++ b/taichi/rhi/opengl/opengl_device.cpp @@ -644,9 +644,18 @@ RhiResult GLDevice::create_pipeline(Pipeline **out_pipeline, PipelineCache *cache) noexcept { try { *out_pipeline = new GLPipeline(src, name); - } catch (std::bad_alloc &) { + } catch (std::bad_alloc &e) { *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); return RhiResult::out_of_memory; + } catch (std::invalid_argument &e) { + *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); + return RhiResult::invalid_usage; + } catch (std::runtime_error &e) { + *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); + return RhiResult::error; } return RhiResult::success; } diff --git a/taichi/rhi/opengl/opengl_device.h b/taichi/rhi/opengl/opengl_device.h index 514ec98b13426..3c5008cee2199 100644 --- a/taichi/rhi/opengl/opengl_device.h +++ b/taichi/rhi/opengl/opengl_device.h @@ -29,7 +29,7 @@ extern void *kGetOpenglProcAddr; class GLResourceSet : public ShaderResourceSet { public: GLResourceSet() = default; - explicit GLResourceSet(const GLResourceSet &other) = default; + GLResourceSet(const GLResourceSet &other) = default; ~GLResourceSet() override; diff --git a/taichi/rhi/vulkan/vulkan_device.h b/taichi/rhi/vulkan/vulkan_device.h index 1e7a4fd8027e1..71b33ecdf6b6e 100644 --- a/taichi/rhi/vulkan/vulkan_device.h +++ b/taichi/rhi/vulkan/vulkan_device.h @@ -239,7 +239,7 @@ class VulkanResourceSet : public ShaderResourceSet { class VulkanRasterResources : public RasterResources { public: - VulkanRasterResources(VulkanDevice *device) : device_(device) { + explicit VulkanRasterResources(VulkanDevice *device) : device_(device) { } struct BufferBinding { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 6c636f833f299..3adada586f5df 100644 --- 
a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -46,6 +46,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, config.arch = host_arch(); } else { // CUDA runtime created successfully + use_device_memory_pool_ = CUDAContext::get_instance().supports_mem_pool(); } #else TI_WARN("Taichi is not compiled with CUDA."); @@ -398,6 +399,20 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( const int tree_id = field_cache_data.tree_id; const int root_id = field_cache_data.root_id; + bool all_dense = config_.demote_dense_struct_fors; + for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i].type != SNodeType::dense && + snode_metas[i].type != SNodeType::place && + snode_metas[i].type != SNodeType::root) { + all_dense = false; + break; + } + } + + if (config_.arch == Arch::cuda && use_device_memory_pool() && !all_dense) { + preallocate_runtime_memory(); + } + TI_TRACE("Allocating data structure of size {} bytes", root_size); std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); @@ -424,16 +439,6 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( snode_tree_allocs_[tree_id] = alloc; - bool all_dense = config_.demote_dense_struct_fors; - for (size_t i = 0; i < snode_metas.size(); i++) { - if (snode_metas[i].type != SNodeType::dense && - snode_metas[i].type != SNodeType::place && - snode_metas[i].type != SNodeType::root) { - all_dense = false; - break; - } - } - runtime_jit->call( "runtime_initialize_snodes", llvm_runtime_, root_size, root_id, (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense); @@ -471,16 +476,25 @@ LlvmDevice *LlvmRuntimeExecutor::llvm_device() { DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray( std::size_t alloc_size, uint64 *result_buffer) { - return llvm_device()->allocate_memory_runtime( + auto devalloc = llvm_device()->allocate_memory_runtime( {{alloc_size, /*host_write=*/false, /*host_read=*/false, /*export_sharing=*/false, AllocUsage::Storage}, get_runtime_jit_module(), get_llvm_runtime(), - result_buffer}); + result_buffer, + use_device_memory_pool()}); + + TI_ASSERT(allocated_runtime_memory_allocs_.find(devalloc.alloc_id) == + allocated_runtime_memory_allocs_.end()); + allocated_runtime_memory_allocs_[devalloc.alloc_id] = devalloc; + return devalloc; } void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) { + TI_ASSERT(allocated_runtime_memory_allocs_.find(handle.alloc_id) != + allocated_runtime_memory_allocs_.end()); llvm_device()->dealloc_memory(handle); + allocated_runtime_memory_allocs_.erase(handle.alloc_id); } void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc, @@ -534,13 +548,32 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr( void LlvmRuntimeExecutor::finalize() { profiler_ = nullptr; - for (auto &preallocated_device_buffer_alloc : - preallocated_device_buffer_allocs_) { - if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { - llvm_device()->dealloc_memory(preallocated_device_buffer_alloc); - llvm_device()->clear(); - DeviceMemoryPool::get_instance().reset(); + if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { + preallocated_runtime_objects_allocs_.reset(); + preallocated_runtime_memory_allocs_.reset(); + + // Reset runtime memory + auto allocated_runtime_memory_allocs_copy = + allocated_runtime_memory_allocs_; + for (auto &iter : allocated_runtime_memory_allocs_copy) { + // The runtime allocation may have already been freed upon 
explicit + // Ndarray/Field destruction Check if the allocation still alive + void *ptr = llvm_device()->get_memory_addr(iter.second); + if (ptr == nullptr) + continue; + + deallocate_memory_ndarray(iter.second); } + allocated_runtime_memory_allocs_.clear(); + + // Reset device + llvm_device()->clear(); + + // Reset memory pool + DeviceMemoryPool::get_instance().reset(); + + // Release unused memory from cuda memory pool + synchronize(); } finalized_ = true; } @@ -551,7 +584,9 @@ LlvmRuntimeExecutor::~LlvmRuntimeExecutor() { } } -void *LlvmRuntimeExecutor::preallocate_memory(std::size_t prealloc_size) { +void *LlvmRuntimeExecutor::preallocate_memory( + std::size_t prealloc_size, + DeviceAllocationUnique &devalloc) { DeviceAllocation preallocated_device_buffer_alloc; Device::AllocParams preallocated_device_buffer_alloc_params; @@ -563,11 +598,38 @@ void *LlvmRuntimeExecutor::preallocate_memory(std::size_t prealloc_size) { void *preallocated_device_buffer = llvm_device()->get_memory_addr(preallocated_device_buffer_alloc); - preallocated_device_buffer_allocs_.emplace_back( + devalloc = std::make_unique( std::move(preallocated_device_buffer_alloc)); return preallocated_device_buffer; } +void LlvmRuntimeExecutor::preallocate_runtime_memory() { + if (preallocated_runtime_memory_allocs_ != nullptr) + return; + + std::size_t total_prealloc_size = 0; + const auto total_mem = llvm_device()->get_total_memory(); + if (config_.device_memory_fraction == 0) { + TI_ASSERT(config_.device_memory_GB > 0); + total_prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); + } else { + total_prealloc_size = + std::size_t(config_.device_memory_fraction * total_mem); + } + TI_ASSERT(total_prealloc_size <= total_mem); + + void *runtime_memory_prealloc_buffer = preallocate_memory( + total_prealloc_size, preallocated_runtime_memory_allocs_); + + TI_TRACE("Allocating device memory {:.2f} MB", + 1.0 * total_prealloc_size / (1UL << 20)); + + auto *const runtime_jit = get_runtime_jit_module(); + runtime_jit->call( + "runtime_initialize_memory", llvm_runtime_, total_prealloc_size, + runtime_memory_prealloc_buffer); +} + void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, uint64 **result_buffer_ptr) { // The result buffer allocated here is only used for the launches of @@ -576,14 +638,14 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, // CUDA and AMDGPU backends. 
// | ==================preallocated device buffer ========================== | // |<- reserved for return ->|<---- usable for allocators on the device ---->| - std::size_t runtime_objects_prealloc_size = 0; void *runtime_objects_prealloc_buffer = nullptr; if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { #if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU) + runtime_objects_prealloc_size = 60 * (1UL << 20); // 50 MB - runtime_objects_prealloc_buffer = - preallocate_memory(runtime_objects_prealloc_size); + runtime_objects_prealloc_buffer = preallocate_memory( + runtime_objects_prealloc_size, preallocated_runtime_objects_allocs_); TI_TRACE("Allocating device memory {:.2f} MB", 1.0 * runtime_objects_prealloc_size / (1UL << 20)); @@ -639,31 +701,9 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, // Preallocate for runtime memory and update to LLVMRuntime if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { - std::size_t total_prealloc_size = 0; - const auto total_mem = llvm_device()->get_total_memory(); - if (config_.device_memory_fraction == 0) { - TI_ASSERT(config_.device_memory_GB > 0); - total_prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); - } else { - total_prealloc_size = - std::size_t(config_.device_memory_fraction * total_mem); + if (!use_device_memory_pool()) { + preallocate_runtime_memory(); } - TI_ASSERT(total_prealloc_size <= total_mem); - - auto runtime_memory_prealloc_size = - total_prealloc_size > runtime_objects_prealloc_size - ? total_prealloc_size - runtime_objects_prealloc_size - : 0; - - void *runtime_memory_prealloc_buffer = - preallocate_memory(runtime_memory_prealloc_size); - - TI_TRACE("Allocating device memory {:.2f} MB", - 1.0 * runtime_memory_prealloc_size / (1UL << 20)); - - runtime_jit->call( - "runtime_initialize_memory", llvm_runtime_, - runtime_memory_prealloc_size, runtime_memory_prealloc_buffer); } if (config_.arch == Arch::cuda) { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h index 6e487419a28a5..871e394ad8b50 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.h +++ b/taichi/runtime/llvm/llvm_runtime_executor.h @@ -75,6 +75,10 @@ class LlvmRuntimeExecutor { void synchronize(); + bool use_device_memory_pool() { + return use_device_memory_pool_; + } + private: /* ----------------------- */ /* ------ Allocation ----- */ @@ -96,7 +100,9 @@ class LlvmRuntimeExecutor { std::size_t size, uint32_t data); - void *preallocate_memory(std::size_t prealloc_size); + void *preallocate_memory(std::size_t prealloc_size, + DeviceAllocationUnique &devalloc); + void preallocate_runtime_memory(); /* ------------------------- */ /* ---- Runtime Helpers ---- */ @@ -144,12 +150,16 @@ class LlvmRuntimeExecutor { std::unique_ptr snode_tree_buffer_manager_{nullptr}; std::unordered_map snode_tree_allocs_; - std::vector preallocated_device_buffer_allocs_; + DeviceAllocationUnique preallocated_runtime_objects_allocs_ = nullptr; + DeviceAllocationUnique preallocated_runtime_memory_allocs_ = nullptr; + std::unordered_map + allocated_runtime_memory_allocs_; // good buddy friend LlvmProgramImpl; friend SNodeTreeBufferManager; + bool use_device_memory_pool_ = false; bool finalized_{false}; KernelProfilerBase *profiler_ = nullptr; }; diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp index 67a4b08eb29e3..d15c90fb1f7fa 100644 --- a/taichi/transforms/auto_diff.cpp +++ b/taichi/transforms/auto_diff.cpp @@ -1518,9 +1518,10 @@ class 
MakeAdjoint : public ADTransform { "Cannot automatically differentiate through a grad " "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); - auto adj_ptr = insert( - src->base_ptr, src->indices, src->element_shape, src->element_dim, - /*is_grad=*/true); + auto adj_ptr = + insert(src->base_ptr, src->indices, src->ndim, + src->element_shape, src->element_dim, + /*is_grad=*/true); adj_ptr->ret_type = src->ret_type; if (is_ptr_offset) { @@ -1592,9 +1593,10 @@ class MakeAdjoint : public ADTransform { "Cannot automatically differentiate through a grad " "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); - adjoint_ptr = insert( - dest->base_ptr, dest->indices, dest->element_shape, dest->element_dim, - /*is_grad=*/true); + adjoint_ptr = + insert(dest->base_ptr, dest->indices, dest->ndim, + dest->element_shape, dest->element_dim, + /*is_grad=*/true); adjoint_ptr->ret_type = dest->ret_type; if (is_ptr_offset) { @@ -1659,7 +1661,7 @@ class MakeAdjoint : public ADTransform { "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); auto adjoint_ptr = - insert(dest->base_ptr, dest->indices, + insert(dest->base_ptr, dest->indices, dest->ndim, dest->element_shape, dest->element_dim, /*is_grad=*/true); adjoint_ptr->ret_type = dest->ret_type; diff --git a/taichi/transforms/cache_loop_invariant_global_vars.cpp b/taichi/transforms/cache_loop_invariant_global_vars.cpp index 3966bd4d9d181..3ba7cc886d3af 100644 --- a/taichi/transforms/cache_loop_invariant_global_vars.cpp +++ b/taichi/transforms/cache_loop_invariant_global_vars.cpp @@ -54,7 +54,20 @@ class CacheLoopInvariantGlobalVars : public LoopInvariantDetector { if (current_offloaded->task_type == OffloadedTaskType::serial) { return true; } - if (auto global_ptr = stmt->cast()) { + + // Handle GlobalPtrStmt + bool is_global_ptr_stmt = false; + GlobalPtrStmt *global_ptr = nullptr; + if (stmt->is()) { + is_global_ptr_stmt = true; + global_ptr = stmt->as(); + } else if (stmt->is() && + stmt->as()->origin->is()) { + is_global_ptr_stmt = true; + global_ptr = stmt->as()->origin->as(); + } + + if (global_ptr) { auto snode = global_ptr->snode; if (loop_unique_ptr_[snode] == nullptr || loop_unique_ptr_[snode]->indices.empty()) { @@ -69,8 +82,21 @@ class CacheLoopInvariantGlobalVars : public LoopInvariantDetector { return false; } return true; - } else if (stmt->is()) { - ExternalPtrStmt *dest_ptr = stmt->as(); + } + + // Handle ExternalPtrStmt + bool is_external_ptr_stmt = false; + ExternalPtrStmt *dest_ptr = nullptr; + if (stmt->is()) { + is_external_ptr_stmt = true; + dest_ptr = stmt->as(); + } else if (stmt->is() && + stmt->as()->origin->is()) { + is_external_ptr_stmt = true; + dest_ptr = stmt->as()->origin->as(); + } + + if (is_external_ptr_stmt) { if (dest_ptr->indices.empty()) { return false; } diff --git a/taichi/transforms/check_out_of_bound.cpp b/taichi/transforms/check_out_of_bound.cpp index 4bd5dd51e51af..79acfff34a754 100644 --- a/taichi/transforms/check_out_of_bound.cpp +++ b/taichi/transforms/check_out_of_bound.cpp @@ -59,17 +59,22 @@ class CheckOutOfBound : public BasicStmtVisitor { auto check_lower_bound = new_stmts.push_back( BinaryOpType::cmp_ge, stmt->indices[i], lower_bound); Stmt *upper_bound{nullptr}; - // TODO: Simplify logic here since SOA layout for ndarray is deprecated - if ((stmt->element_dim < 0 && i == (stmt->indices.size() - 1)) || - (stmt->element_dim > 0 && i == 0)) { - upper_bound = - 
-            new_stmts.push_back<ConstStmt>(TypedConstant(flattened_element));
-      } else {
-        auto axis = stmt->element_dim <= 0 ? i : (i - stmt->element_dim);
+
+      // SOA layout for ndarray is deprecated, assert it's AOS layout
+      TI_ASSERT(stmt->element_dim <= 0);
+      auto ndim = stmt->ndim;
+      if (i < ndim) {
+        // Check for External Shape
+        auto axis = i;
         upper_bound = new_stmts.push_back<ExternalTensorShapeAlongAxisStmt>(
             /*axis=*/axis,
             /*arg_id=*/stmt->base_ptr->as<ArgLoadStmt>()->arg_id);
+      } else {
+        // Check for Element Shape
+        upper_bound =
+            new_stmts.push_back<ConstStmt>(TypedConstant(flattened_element));
       }
+
       auto check_upper_bound = new_stmts.push_back<BinaryOpStmt>(
           BinaryOpType::cmp_lt, stmt->indices[i], upper_bound);
       auto check_i = new_stmts.push_back<BinaryOpStmt>(
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 14fb925dd8414..f162f5975274f 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -183,14 +183,6 @@ void offload_to_executable(IRNode *ir,
   print("Atomics demoted I");
   irpass::analysis::verify(ir);
 
-  if (config.real_matrix_scalarize) {
-    irpass::scalarize(ir);
-
-    // Remove redundant MatrixInitStmt inserted during scalarization
-    irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false});
-    print("Scalarized");
-  }
-
   if (config.cache_loop_invariant_global_vars) {
     irpass::cache_loop_invariant_global_vars(ir, config);
     print("Cache loop-invariant global vars");
@@ -218,6 +210,14 @@ void offload_to_executable(IRNode *ir,
     irpass::analysis::verify(ir);
   }
 
+  if (config.real_matrix_scalarize) {
+    irpass::scalarize(ir);
+
+    // Remove redundant MatrixInitStmt inserted during scalarization
+    irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false});
+    print("Scalarized");
+  }
+
   if (make_thread_local) {
     irpass::make_thread_local(ir, config);
     print("Make thread local");
diff --git a/taichi/transforms/make_cpu_multithreaded_range_for.cpp b/taichi/transforms/make_cpu_multithreaded_range_for.cpp
index 8ada2a9c70c88..8528b0ba1685a 100644
--- a/taichi/transforms/make_cpu_multithreaded_range_for.cpp
+++ b/taichi/transforms/make_cpu_multithreaded_range_for.cpp
@@ -48,7 +48,8 @@ using TaskType = OffloadedStmt::TaskType;
 
 class MakeCPUMultithreadedRangeFor : public BasicStmtVisitor {
  public:
-  MakeCPUMultithreadedRangeFor(const CompileConfig &config) : config(config) {
+  explicit MakeCPUMultithreadedRangeFor(const CompileConfig &config)
+      : config(config) {
   }
 
   void visit(Block *block) override {
diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp
index 69073000b5329..ba426be5baa2c 100644
--- a/taichi/transforms/scalarize.cpp
+++ b/taichi/transforms/scalarize.cpp
@@ -608,10 +608,14 @@ class Scalarize : public BasicStmtVisitor {
   }
 
   void visit(ArgLoadStmt *stmt) override {
+    if (!stmt->ret_type.is_pointer()) {
+      return;
+    }
     if (stmt->ret_type.ptr_removed()->is()) {
       return;
     }
 
     auto ret_type = stmt->ret_type.ptr_removed().get_element_type();
+    ret_type = TypeFactory::get_instance().get_pointer_type(ret_type);
     auto arg_load = std::make_unique<ArgLoadStmt>(
         stmt->arg_id, ret_type, stmt->is_ptr, stmt->create_load);
@@ -1119,9 +1123,9 @@ class MergeExternalAndMatrixPtr : public BasicStmtVisitor {
           std::accumulate(begin(origin->element_shape),
                           end(origin->element_shape), 1,
                           std::multiplies<>())};
-      auto fused = std::make_unique<ExternalPtrStmt>(origin->base_ptr, indices,
-                                                     element_shape, element_dim,
-                                                     origin->is_grad);
+      auto fused = std::make_unique<ExternalPtrStmt>(
+          origin->base_ptr, indices, origin->ndim, element_shape, element_dim,
+          origin->is_grad);
       fused->ret_type = stmt->ret_type;
       // Note: Update base_ptr's ret_type so that it matches the ExternalPtrStmt
       // with flattened indices. Main goal is to keep all the hacks in a single
diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp
index 8d59faf8cc30e..e979f11d3b5e5 100644
--- a/taichi/transforms/type_check.cpp
+++ b/taichi/transforms/type_check.cpp
@@ -431,12 +431,7 @@ class TypeCheck : public IRVisitor {
   }
 
   void visit(ArgLoadStmt *stmt) override {
-    // TODO: Maybe have a type_inference() pass, which takes in the args/rets
-    // defined by the kernel. After that, type_check() pass will purely do
-    // verification, without modifying any types.
-    if (stmt->is_ptr) {
-      stmt->ret_type.set_is_pointer(true);
-    }
+    // Do nothing
   }
 
   void visit(ReturnStmt *stmt) override {
diff --git a/taichi/transforms/vectorize_half2.cpp b/taichi/transforms/vectorize_half2.cpp
index 3a9358937e1a6..d159092357b66 100644
--- a/taichi/transforms/vectorize_half2.cpp
+++ b/taichi/transforms/vectorize_half2.cpp
@@ -355,7 +355,8 @@ class Half2Vectorize : public BasicStmtVisitor {
       std::vector<int> element_shape = {2};
       int element_dim = -1;
       auto new_extern_stmt = std::make_unique<ExternalPtrStmt>(
-          self_ptr, new_indices, element_shape, element_dim);
+          self_ptr, new_indices, self_extern_stmt->ndim, element_shape,
+          element_dim);
       new_extern_stmt->overrided_dtype = true;
       new_extern_stmt->ret_type = tensor_type;
       new_extern_stmt->ret_type.set_is_pointer(true);
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index 476b9df320222..8f04ed2ad6ba8 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -6,6 +6,7 @@
 from taichi.lang.exception import TaichiIndexError, TaichiTypeError
 from taichi.lang.misc import get_host_arch_list
 from taichi.lang.util import has_pytorch
+from taichi.math import vec3, ivec3
 
 import taichi as ti
 from tests import test_utils
@@ -779,10 +780,23 @@ def test_matrix_ndarray_oob():
     def access_arr(input: ti.types.ndarray(), p: ti.i32, q: ti.i32, x: ti.i32,
                    y: ti.i32) -> ti.f32:
         return input[p, q][x, y]
 
+    @ti.kernel
+    def valid_access(indices: ti.types.ndarray(dtype=ivec3, ndim=1),
+                     dummy: ti.types.ndarray(dtype=ivec3, ndim=1)):
+        for i in indices:
+            index_vec = ti.Vector([0, 0, 0])
+            for j in ti.static(range(3)):
+                index = indices[i][j]
+                index_vec[j] = index
+            dummy[i] = index_vec
+
     input = ti.ndarray(dtype=ti.math.mat2, shape=(4, 5))
+    indices = ti.ndarray(dtype=ivec3, shape=(10))
+    dummy = ti.ndarray(dtype=ivec3, shape=(10))
+
     # Works
     access_arr(input, 2, 3, 0, 1)
+    valid_access(indices, dummy)
 
     # element_shape
     with pytest.raises(AssertionError, match=r"Out of bound access"):