diff --git a/.github/workflows/scripts/check_clang_tidy.sh b/.github/workflows/scripts/check_clang_tidy.sh
index d9db1c9a3..06cbb0c74 100755
--- a/.github/workflows/scripts/check_clang_tidy.sh
+++ b/.github/workflows/scripts/check_clang_tidy.sh
@@ -2,11 +2,11 @@
CI_SETUP_CMAKE_ARGS=$1
-cd taichi
+cd test_actions
python3 -m pip install -r requirements_dev.txt
rm -rf build && mkdir build && cd build
cmake $CI_SETUP_CMAKE_ARGS ..
cd ..
-python3 ./scripts/run_clang_tidy.py $PWD/taichi -clang-tidy-binary clang-tidy-10 -checks=-*,performance-inefficient-string-concatenation,readability-identifier-naming -header-filter=$PWD/taichi -p $PWD/build -j2
+python3 ./scripts/run_clang_tidy.py $PWD/test_actions -clang-tidy-binary clang-tidy-10 -checks=-*,performance-inefficient-string-concatenation,readability-identifier-naming -header-filter=$PWD/test_actions -p $PWD/build -j2
diff --git a/.github/workflows/scripts/unix_build.sh b/.github/workflows/scripts/unix_build.sh
index e29cfe704..6e975d9bc 100755
--- a/.github/workflows/scripts/unix_build.sh
+++ b/.github/workflows/scripts/unix_build.sh
@@ -11,7 +11,7 @@ check_in_docker() {
}
IN_DOCKER=$(check_in_docker)
-[[ "$IN_DOCKER" == "true" ]] && cd taichi
+[[ "$IN_DOCKER" == "true" ]] && cd test_actions
setup_sccache() {
export SCCACHE_DIR=$(pwd)/sccache_cache
diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh
index 54d99071f..f2248c864 100755
--- a/.github/workflows/scripts/unix_test.sh
+++ b/.github/workflows/scripts/unix_test.sh
@@ -20,7 +20,7 @@ fi
python3 -m pip install dist/*.whl
if [ -z "$GPU_TEST" ]; then
python3 -m pip install -r requirements_test.txt
- python3 -m pip install torch
+ #python3 -m pip install torch
else
## Only GPU machine uses system python.
export PATH=$PATH:$HOME/.local/bin
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index 368e00588..10c310c48 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -136,330 +136,329 @@ jobs:
docker run -id --user dev --name check_clang_tidy ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0 /bin/bash
tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - check_clang_tidy:/home/dev/
docker exec --user root check_clang_tidy apt install -y clang-tidy-10
- docker exec --user dev check_clang_tidy /home/dev/taichi/.github/workflows/scripts/check_clang_tidy.sh "$CI_SETUP_CMAKE_ARGS"
+ docker exec --user dev check_clang_tidy /home/dev/test_actions/.github/workflows/scripts/check_clang_tidy.sh "$CI_SETUP_CMAKE_ARGS"
env:
CR_PAT: ${{ secrets.GITHUB_TOKEN }}
CI_SETUP_CMAKE_ARGS: -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=ON -DTI_WITH_VULKAN:BOOL=OFF -DTI_BUILD_TESTS:BOOL=OFF
- build_and_test_cpu_linux:
- name: Build and Test linux (CPU)
- needs: [check_code_format, check_files]
- timeout-minutes: 60
- strategy:
- matrix:
- include:
- - os: ubuntu-latest
- python: py36
- with_cc: OFF
- wanted_archs: "cpu"
- - os: ubuntu-latest
- python: py39
- with_cc: ON
- wanted_archs: "cpu,cc"
- runs-on: ${{ matrix.os }}
- permissions:
- packages: read
- contents: read
- steps:
- - uses: actions/checkout@v2
- with:
- submodules: "recursive"
-
- - name: Get sccache cache
- uses: actions/cache@v2
- with:
- path: sccache_cache
- key: sccache-linux-${{matrix.with_cc}}-${{ github.sha }}
- restore-keys: |
- sccache-linux-${{matrix.with_cc}}-
-
- - name: Get docker images
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- # https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#upgrading-a-workflow-that-accesses-ghcrio
- echo $CR_PAT | docker login ghcr.io -u ${{ github.actor }} --password-stdin
- docker pull ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0
- env:
- CR_PAT: ${{ secrets.GITHUB_TOKEN }}
-
- - name: Build
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- mkdir -m777 shared
- docker create --user dev --name taichi_build \
- -e PY -e PROJECT_NAME -e TAICHI_CMAKE_ARGS \
- ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0 \
- /home/dev/taichi/.github/workflows/scripts/unix_build.sh
- # A tarball is needed because sccache needs some permissions that only the file owner has.
- # 1000 is the uid and gid of user "dev" in the container.
- # If the uid or gid of the user inside the docker changes, please change the uid and gid in the following line.
- tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/
- docker start -a taichi_build
- rm -rf sccache_cache
- docker cp taichi_build:/home/dev/taichi/sccache_cache sccache_cache
- docker cp taichi_build:/home/dev/taichi/dist shared/dist
- docker cp taichi_build:/home/dev/taichi/build shared/build
- env:
- PY: ${{ matrix.python }}
- PROJECT_NAME: taichi
- TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=OFF -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
-
- - name: Test
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- docker create --user dev --name taichi_test -e PY -e TI_WANTED_ARCHS ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0 /home/dev/unix_test.sh
- docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh
- docker cp shared/dist/ taichi_test:/home/dev/
- docker cp shared/build/ taichi_test:/home/dev/
- docker cp ./requirements_test.txt taichi_test:/home/dev/requirements_test.txt
- docker cp tests/ taichi_test:/home/dev/
- docker start -a taichi_test
- env:
- PY: ${{ matrix.python }}
- TI_WANTED_ARCHS: ${{ matrix.wanted_archs }}
-
- - name: clean docker container
- if: always()
- run: |
- docker rm taichi_build taichi_test -f
-
- build_and_test_cpu_mac:
- name: Build and Test macos (CPU)
- needs: [check_code_format, check_files]
- timeout-minutes: 60
- strategy:
- matrix:
- include:
- - os: macos-10.15
- python: 3.7
- with_cc: OFF
- with_cpp_tests: ON
- wanted_archs: "cpu"
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v2
- with:
- submodules: "recursive"
-
- - uses: actions/setup-python@v2
- with:
- python-version: ${{ matrix.python }}
-
- - name: Get sccache cache
- uses: actions/cache@v2
- with:
- path: sccache_cache
- key: sccache-mac-${{ github.sha }}
- restore-keys: |
- sccache-mac-
-
- - name: Download Pre-Built LLVM 10.0.0
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- python misc/ci_download.py
- env:
- CI_PLATFORM: ${{ matrix.os }}
-
- - name: Build & Install
- run: |
- brew install molten-vk
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- mkdir -p sccache_cache
- export PATH=`pwd`/taichi-llvm/bin/:$PATH
- .github/workflows/scripts/unix_build.sh
- brew uninstall molten-vk
- env:
- TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=${{ matrix.with_cpp_tests }} -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
- CXX: clang++
- # [DEBUG] Copy this step around to enable debugging inside Github Action instances.
- #- name: Setup tmate session
- # uses: mxschmitt/action-tmate@v3
- # with:
- # limit-access-to-actor: true
-
- - name: Test
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- .github/workflows/scripts/unix_test.sh
- env:
- TI_WANTED_ARCHS: ${{ matrix.wanted_archs }}
-
- build_and_test_gpu_linux:
- name: Build and Test (GPU)
- needs: [check_code_format, check_files]
- runs-on: [self-hosted, cuda, vulkan, cn]
- timeout-minutes: 60
- steps:
- - uses: actions/checkout@v2
- with:
- submodules: "recursive"
-
- - name: Get sccache cache
- uses: actions/cache@v2
- with:
- path: sccache_cache
- key: sccache-linux-gpu-${{ github.sha }}
- restore-keys: |
- sccache-linux-gpu-
-
- - name: Build & Install
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- mkdir -m777 shared
- docker create --user dev --name taichi_build --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \
- -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY \
- registry.taichigraphics.com/taichidev-ubuntu18.04:v0.1.1 \
- /home/dev/taichi/.github/workflows/scripts/unix_build.sh
- # A tarball is needed because sccache needs some permissions that only the file owner has.
- # 1000 is the uid and gid of user "dev" in the container.
- # If the uid or gid of the user inside the docker changes, please change the uid and gid in the following line.
- tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/
- docker start -a taichi_build
- rm -rf sccache_cache
- docker cp taichi_build:/home/dev/taichi/sccache_cache sccache_cache
- docker cp taichi_build:/home/dev/taichi/dist shared/dist
- docker cp taichi_build:/home/dev/taichi/build shared/build
- env:
- PY: py38
- GPU_BUILD: ON
- PROJECT_NAME: taichi
- TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
- DISPLAY: :1
-
- - name: Test
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- docker create --user dev --name taichi_test --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \
- -e DISPLAY -e PY -e GPU_TEST -e TI_WANTED_ARCHS \
- registry.taichigraphics.com/taichidev-ubuntu18.04:v0.1.1 \
- /home/dev/unix_test.sh
- docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh
- docker cp shared/dist/ taichi_test:/home/dev/
- docker cp shared/build/ taichi_test:/home/dev/
- docker cp tests/ taichi_test:/home/dev/
- docker cp requirements_test.txt taichi_test:/home/dev/requirements_test.txt
- docker start -a taichi_test
- env:
- PY: py38
- GPU_TEST: ON
- DISPLAY: :1
- TI_WANTED_ARCHS: "cpu,cuda,vulkan,opengl"
-
- - name: clean docker container
- if: always()
- run: |
- docker rm taichi_build taichi_test -f
-
- build_and_test_windows:
- name: Build and Test Windows
- needs: [check_code_format, check_files]
- runs-on: [self-hosted, windows, gpu]
- timeout-minutes: 90
- steps:
- - uses: actions/checkout@v2
- with:
- submodules: "recursive"
-
- - uses: actions/setup-python@v2
- with:
- python-version: 3.7
-
- - name: Add Visual Studio Shell to ENV
- uses: egor-tensin/vs-shell@v2
- with:
- arch: x64
-
- - name: Get sccache cache
- uses: actions/cache@v2
- with:
- path: ccache_cache
- key: ccache-win64-${{ github.sha }}
- restore-keys: |
- ccache-win64-
-
- - name: Build
- shell: powershell
- run: |
- if ( "${{needs.check_files.outputs.run_job}}" -eq "false" ) {
- exit 0
- }
- .\.github\workflows\scripts\win_build.ps1 -installVulkan -install -libsDir C:\
-
- - name: Test
- shell: powershell
- run: |
- if ( "${{needs.check_files.outputs.run_job}}" -eq "false" ) {
- exit 0
- }
- .\.github\workflows\scripts\win_test.ps1
- env:
- TI_WANTED_ARCHS: cpu,cuda,opengl
- TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF
- TI_SKIP_VERSION_CHECK: ON
-
- build_and_test_m1:
- name: Build and Test (Apple M1)
- needs: [check_code_format, check_files]
- timeout-minutes: 60
- strategy:
- matrix:
- include:
- - os: macos-latest
- python: 3.8
- defaults:
- run:
- # https://github.com/actions/runner/issues/805#issuecomment-844426478
- shell: "/usr/bin/arch -arch arm64e /bin/bash --noprofile --norc -eo pipefail {0}"
- runs-on: [self-hosted, m1]
- steps:
- - uses: actions/checkout@v2
- with:
- submodules: "recursive"
-
- - name: Get sccache cache
- uses: actions/cache@v2
- with:
- path: sccache_cache
- key: sccache-m1-${{ github.sha }}
- restore-keys: |
- sccache-m1-
-
- - name: Build
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- export PATH=/Users/github/miniforge3/envs/$PYTHON/bin:$PATH
- brew install molten-vk
- .github/workflows/scripts/unix_build.sh
- env:
- TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CUDA:BOOL=OFF -DTI_WITH_CC:BOOL=OFF -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
- PYTHON: ${{ matrix.python }}
- CXX: clang++
-
- - name: Test
- run: |
- if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
- exit 0
- fi
- export PATH=/Users/github/miniforge3/envs/$PYTHON/bin:$PATH
- .github/workflows/scripts/unix_test.sh
- env:
- TI_WANTED_ARCHS: "metal,vulkan,cpu"
- PYTHON: ${{ matrix.python }}
+ build_and_test_cpu_linux:
+ name: Build and Test linux (CPU)
+ needs: [check_code_format, check_files]
+ timeout-minutes: 60
+ strategy:
+ matrix:
+ include:
+ - os: ubuntu-latest
+ python: py36
+ with_cc: OFF
+ wanted_archs: "cpu"
+ - os: ubuntu-latest
+ python: py39
+ with_cc: ON
+ wanted_archs: "cpu,cc"
+ runs-on: ${{ matrix.os }}
+ permissions:
+ packages: read
+ contents: read
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: "recursive"
+
+ - name: Get sccache cache
+ uses: actions/cache@v2
+ with:
+ path: sccache_cache
+ key: sccache-linux-${{matrix.with_cc}}-${{ github.sha }}
+ restore-keys: |
+ sccache-linux-${{matrix.with_cc}}-
+
+ - name: Get docker images
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ # https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#upgrading-a-workflow-that-accesses-ghcrio
+          echo "$CR_PAT" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
+ docker pull ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0
+ env:
+ CR_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Build
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ mkdir -m777 shared
+ docker create --user dev --name taichi_build \
+ -e PY -e PROJECT_NAME -e TAICHI_CMAKE_ARGS \
+ ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0 \
+ /home/dev/test_actions/.github/workflows/scripts/unix_build.sh
+ # A tarball is needed because sccache needs some permissions that only the file owner has.
+ # 1000 is the uid and gid of user "dev" in the container.
+ # If the uid or gid of the user inside the docker changes, please change the uid and gid in the following line.
+ tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/
+ docker start -a taichi_build
+ rm -rf sccache_cache
+ docker cp taichi_build:/home/dev/test_actions/sccache_cache sccache_cache
+ docker cp taichi_build:/home/dev/test_actions/dist shared/dist
+ docker cp taichi_build:/home/dev/test_actions/build shared/build
+ env:
+ PY: ${{ matrix.python }}
+ PROJECT_NAME: taichi
+ TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=OFF -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+
+ - name: Test
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ docker create --user dev --name taichi_test -e PY -e TI_WANTED_ARCHS ghcr.io/taichi-dev/taichidev-cpu-ubuntu18.04:v0.1.0 /home/dev/unix_test.sh
+ docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh
+ docker cp shared/dist/ taichi_test:/home/dev/
+ docker cp shared/build/ taichi_test:/home/dev/
+ docker cp ./requirements_test.txt taichi_test:/home/dev/requirements_test.txt
+ docker cp tests/ taichi_test:/home/dev/
+ docker start -a taichi_test
+ env:
+ PY: ${{ matrix.python }}
+ TI_WANTED_ARCHS: ${{ matrix.wanted_archs }}
+
+ - name: clean docker container
+ if: always()
+ run: |
+ docker rm taichi_build taichi_test -f
+
+ build_and_test_cpu_mac:
+ name: Build and Test macos (CPU)
+ needs: [check_code_format, check_files]
+ timeout-minutes: 60
+ strategy:
+ matrix:
+ include:
+ - os: macos-10.15
+ python: "3.10.0"
+ with_cc: OFF
+ with_cpp_tests: ON
+ wanted_archs: "cpu"
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: "recursive"
+
+ - uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python }}
+
+ - name: Get sccache cache
+ uses: actions/cache@v2
+ with:
+ path: sccache_cache
+ key: sccache-mac-${{ github.sha }}
+ restore-keys: |
+ sccache-mac-
+
+ - name: Download Pre-Built LLVM 10.0.0
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ python misc/ci_download.py
+ env:
+ CI_PLATFORM: ${{ matrix.os }}
+
+ - name: Build & Install
+ run: |
+ brew install molten-vk
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ mkdir -p sccache_cache
+ export PATH=`pwd`/taichi-llvm/bin/:$PATH
+ .github/workflows/scripts/unix_build.sh
+ brew uninstall molten-vk
+ env:
+ TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=${{ matrix.with_cpp_tests }} -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+ CXX: clang++
+ # [DEBUG] Copy this step around to enable debugging inside Github Action instances.
+ #- name: Setup tmate session
+ # uses: mxschmitt/action-tmate@v3
+ # with:
+ # limit-access-to-actor: true
+
+ - name: Test
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ .github/workflows/scripts/unix_test.sh
+ env:
+ TI_WANTED_ARCHS: ${{ matrix.wanted_archs }}
+
+ build_and_test_gpu_linux:
+ name: Build and Test (GPU)
+ needs: [check_code_format, check_files]
+ runs-on: [self-hosted, cuda, vulkan, cn]
+ timeout-minutes: 60
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: "recursive"
+
+ - name: Get sccache cache
+ uses: actions/cache@v2
+ with:
+ path: sccache_cache
+ key: sccache-linux-gpu-${{ github.sha }}
+ restore-keys: |
+ sccache-linux-gpu-
+
+ - name: Build & Install
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ mkdir -m777 shared
+ docker create --user dev --name taichi_build --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \
+ -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY \
+ registry.taichigraphics.com/taichidev-ubuntu18.04:v0.1.1 \
+ /home/dev/test_actions/.github/workflows/scripts/unix_build.sh
+ # A tarball is needed because sccache needs some permissions that only the file owner has.
+ # 1000 is the uid and gid of user "dev" in the container.
+ # If the uid or gid of the user inside the docker changes, please change the uid and gid in the following line.
+ tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/
+ docker start -a taichi_build
+ rm -rf sccache_cache
+ docker cp taichi_build:/home/dev/test_actions/sccache_cache sccache_cache
+ docker cp taichi_build:/home/dev/test_actions/dist shared/dist
+ docker cp taichi_build:/home/dev/test_actions/build shared/build
+ env:
+ PY: py38
+ GPU_BUILD: ON
+ PROJECT_NAME: taichi
+ TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+ DISPLAY: :1
+
+ - name: Test
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ docker create --user dev --name taichi_test --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \
+ -e DISPLAY -e PY -e GPU_TEST -e TI_WANTED_ARCHS \
+ registry.taichigraphics.com/taichidev-ubuntu18.04:v0.1.1 \
+ /home/dev/unix_test.sh
+ docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh
+ docker cp shared/dist/ taichi_test:/home/dev/
+ docker cp shared/build/ taichi_test:/home/dev/
+ docker cp tests/ taichi_test:/home/dev/
+ docker cp requirements_test.txt taichi_test:/home/dev/requirements_test.txt
+ docker start -a taichi_test
+ env:
+ PY: py38
+ GPU_TEST: ON
+ DISPLAY: :1
+ TI_WANTED_ARCHS: "cpu,cuda,vulkan,opengl"
+
+ - name: clean docker container
+ if: always()
+ run: |
+ docker rm taichi_build taichi_test -f
+
+ build_and_test_windows:
+ name: Build and Test Windows
+ needs: [check_code_format, check_files]
+ runs-on: [self-hosted, windows, gpu]
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: "recursive"
+
+ - uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+
+ - name: Add Visual Studio Shell to ENV
+ uses: egor-tensin/vs-shell@v2
+ with:
+ arch: x64
+
+ - name: Get sccache cache
+ uses: actions/cache@v2
+ with:
+ path: ccache_cache
+ key: ccache-win64-${{ github.sha }}
+ restore-keys: |
+ ccache-win64-
+
+ - name: Build
+ shell: powershell
+ run: |
+ if ( "${{needs.check_files.outputs.run_job}}" -eq "false" ) {
+ exit 0
+ }
+ .\.github\workflows\scripts\win_build.ps1 -installVulkan -install -libsDir C:\
+
+ - name: Test
+ shell: powershell
+ run: |
+ if ( "${{needs.check_files.outputs.run_job}}" -eq "false" ) {
+ exit 0
+ }
+ .\.github\workflows\scripts\win_test.ps1
+ env:
+ TI_WANTED_ARCHS: cpu,cuda,opengl
+ TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF
+ TI_SKIP_VERSION_CHECK: ON
+
+ build_and_test_m1:
+ name: Build and Test (Apple M1)
+ needs: [check_code_format, check_files]
+ timeout-minutes: 60
+ strategy:
+ matrix:
+ include:
+ - os: macos-latest
+            python: "3.10"
+ defaults:
+ run:
+ # https://github.com/actions/runner/issues/805#issuecomment-844426478
+ shell: "/usr/bin/arch -arch arm64e /bin/bash --noprofile --norc -eo pipefail {0}"
+ runs-on: [self-hosted, m1]
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: "recursive"
+
+ - name: Get sccache cache
+ uses: actions/cache@v2
+ with:
+ path: sccache_cache
+ key: sccache-m1-${{ github.sha }}
+ restore-keys: |
+ sccache-m1-
+
+ - name: Build
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+          rm -rf -- "$HOME/Library/Python/3.8/lib/python/site-packages/taichi"
+ brew install molten-vk
+ export PATH=/Users/github/miniforge3/envs/3.10/bin:$PATH
+ .github/workflows/scripts/unix_build.sh
+ env:
+ TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CUDA:BOOL=OFF -DTI_WITH_CC:BOOL=OFF -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+ CXX: clang++
+
+ - name: Test
+ run: |
+ if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then
+ exit 0
+ fi
+ export PATH=/Users/github/miniforge3/envs/3.10/bin:$PATH
+ .github/workflows/scripts/unix_test.sh
+ env:
+ TI_WANTED_ARCHS: "metal,vulkan,cpu"
diff --git a/mpm88_scoped_profiler.txt b/mpm88_scoped_profiler.txt
new file mode 100644
index 000000000..a4f93d600
--- /dev/null
+++ b/mpm88_scoped_profiler.txt
@@ -0,0 +1,228 @@
+[38;2;000;255;255m>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+[0m[38;2;255;000;000m[Profiler thread 139981479728064]
+[0m [38;2;255;000;000m260.178 ms compile [0m[38;2;000;255;255m [1 x 260.178 ms]
+[0m [38;2;144;238;144m212.325 ms 81.61% compile_to_executable[0m[38;2;000;255;255m [1 x 212.325 ms]
+[0m [38;2;255;255;000m163.618 ms 77.06% compile_to_offloads [0m[38;2;000;255;255m [1 x 163.618 ms]
+[0m [38;2;173;216;230m 0.012 ms 0.01% frontend_type_check [0m[38;2;000;255;255m [1 x 11.921 us]
+[0m [38;2;173;216;230m 17.289 ms 10.57% lower_ast [0m[38;2;000;255;255m [1 x 17.289 ms]
+[0m [38;2;255;000;255m 13.589 ms 78.60% replace_all_usages_with[0m[38;2;000;255;255m [1999 x 6.798 us]
+[0m [38;2;255;000;255m 3.700 ms 21.40% [unaccounted]
+[0m [38;2;173;216;230m 0.587 ms 0.36% type_check [0m[38;2;000;255;255m [2 x 293.374 us]
+[0m [38;2;173;216;230m 1.331 ms 0.81% verify [0m[38;2;000;255;255m [8 x 166.386 us]
+[0m [38;2;173;216;230m 0.309 ms 0.19% bit_loop_vectorize [0m[38;2;000;255;255m [1 x 308.990 us]
+[0m [38;2;255;000;255m306.845 us 99.31% die [0m[38;2;000;255;255m [1 x 306.845 us]
+[0m [38;2;255;000;255m 2.146 us 0.69% [unaccounted]
+[0m [38;2;173;216;230m139.275 ms 85.12% full_simplify [0m[38;2;000;255;255m [3 x 46.425 ms]
+[0m [38;2;255;000;255m 0.778 ms 0.56% extract_constant [0m[38;2;000;255;255m [7 x 111.171 us]
+[0m [38;2;255;000;255m 0.192 ms 0.14% unreachable_code_elimination[0m[38;2;000;255;255m [7 x 27.418 us]
+[0m [38;2;255;000;255m 0.308 ms 0.22% binary_op_simplify [0m[38;2;000;255;255m [7 x 44.005 us]
+[0m [38;2;255;000;255m 12.159 us 3.95% replace_all_usages_with[0m[38;2;000;255;255m [1 x 12.159 us]
+[0m [38;2;255;000;255m295.877 us 96.05% [unaccounted]
+[0m [38;2;255;000;255m 19.530 ms 14.02% constant_fold [0m[38;2;000;255;255m [7 x 2.790 ms]
+[0m [38;2;255;000;255m 0.816 ms 4.18% replace_all_usages_with[0m[38;2;000;255;255m [112 x 7.287 us]
+[0m [38;2;255;000;255m 18.307 ms 93.74% compile [0m[38;2;000;255;255m [2 x 9.153 ms]
+[0m [38;2;255;000;255m 0.073 ms 0.40% compile_to_executable[0m[38;2;000;255;255m [2 x 36.597 us]
+[0m [38;2;255;000;255m 34.094 us 46.58% compile_to_offloads [0m[38;2;000;255;255m [2 x 17.047 us]
+[0m [38;2;255;000;255m 0.954 us 2.80% frontend_type_check [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 3.338 us 9.79% lower_ast [0m[38;2;000;255;255m [2 x 1.669 us]
+[0m [38;2;255;000;255m 1.907 us 5.59% type_check [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 2.146 us 6.29% verify [0m[38;2;000;255;255m [4 x 536.442 ns]
+[0m [38;2;255;000;255m 0.954 us 2.80% demote_operations [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 21.935 us 64.34% offload [0m[38;2;000;255;255m [2 x 10.967 us]
+[0m [38;2;255;000;255m 1.192 us 5.43% type_check [0m[38;2;000;255;255m [4 x 298.023 ns]
+[0m [38;2;255;000;255m 20.742 us 94.57% [unaccounted]
+[0m [38;2;255;000;255m 2.861 us 8.39% [unaccounted]
+[0m [38;2;255;000;255m 38.147 us 52.12% offload_to_executable[0m[38;2;000;255;255m [2 x 19.073 us]
+[0m [38;2;255;000;255m 9.298 us 24.38% verify [0m[38;2;000;255;255m [20 x 464.916 ns]
+[0m [38;2;255;000;255m 1.907 us 5.00% demote_atomics [0m[38;2;000;255;255m [4 x 476.837 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% type_check [0m[38;2;000;255;255m [4 x 0.000 ns]
+[0m [38;2;255;000;255m 1.907 us 100.00% [unaccounted]
+[0m [38;2;255;000;255m 1.907 us 5.00% type_check [0m[38;2;000;255;255m [8 x 238.419 ns]
+[0m [38;2;255;000;255m 0.954 us 2.50% make_thread_local [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m953.674 ns type_check [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% make_mesh_thread_local[0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 ns type_check [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 2.146 us 5.62% demote_mesh_statements[0m[38;2;000;255;255m [2 x 1.073 us]
+[0m [38;2;255;000;255m 1.192 us 55.56% type_check [0m[38;2;000;255;255m [2 x 596.046 ns]
+[0m [38;2;255;000;255m 0.954 us 44.44% [unaccounted]
+[0m [38;2;255;000;255m 0.000 us 0.00% remove_range_assumption[0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 1.907 us 5.00% remove_loop_unique [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 1.907 us 5.00% die [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% flag_access [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% demote_operations [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 2.861 us 7.50% full_simplify [0m[38;2;000;255;255m [2 x 1.431 us]
+[0m [38;2;255;000;255m 1.907 us 66.67% simplify [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 0.954 us 33.33% die [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 3.099 us 8.12% optimize_bit_struct_stores[0m[38;2;000;255;255m [2 x 1.550 us]
+[0m [38;2;255;000;255m 1.907 us 61.54% die [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 1.192 us 38.46% [unaccounted]
+[0m [38;2;255;000;255m 12.159 us 31.88% [unaccounted]
+[0m [38;2;255;000;255m 0.954 us 1.30% [unaccounted]
+[0m [38;2;255;000;255m 18.228 ms 99.57% codegen [0m[38;2;000;255;255m [2 x 9.114 ms]
+[0m [38;2;255;000;255m 9.115 ms 50.01% clone_struct_module [0m[38;2;000;255;255m [2 x 4.557 ms]
+[0m [38;2;255;000;255m 0.001 ms 0.01% CodeGenLLVMCPU [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 0.045 ms 0.25% emit_to_module [0m[38;2;000;255;255m [2 x 22.411 us]
+[0m [38;2;255;000;255m 9.059 ms 49.70% compile_module_to_executable[0m[38;2;000;255;255m [2 x 4.530 ms]
+[0m [38;2;255;000;255m 1.679 ms 18.53% eliminate_unused_functions[0m[38;2;000;255;255m [2 x 839.472 us]
+[0m [38;2;255;000;255m 5.826 ms 64.31% global_optimize_module_cpu[0m[38;2;000;255;255m [2 x 2.913 ms]
+[0m [38;2;255;000;255m 0.780 ms 13.39% llvm_function_pass [0m[38;2;000;255;255m [2 x 389.934 us]
+[0m [38;2;255;000;255m 3.091 ms 53.06% llvm_module_pass [0m[38;2;000;255;255m [2 x 1.546 ms]
+[0m [38;2;255;000;255m 1.955 ms 33.55% [unaccounted]
+[0m [38;2;255;000;255m 1.554 ms 17.16% [unaccounted]
+[0m [38;2;255;000;255m 0.407 ms 2.08% [unaccounted]
+[0m [38;2;255;000;255m 2.801 ms 2.01% die [0m[38;2;000;255;255m [21 x 133.401 us]
+[0m [38;2;255;000;255m 0.804 ms 0.58% alg_simp [0m[38;2;000;255;255m [7 x 114.884 us]
+[0m [38;2;255;000;255m151.634 us 18.86% replace_all_usages_with[0m[38;2;000;255;255m [72 x 2.106 us]
+[0m [38;2;255;000;255m652.552 us 81.14% [unaccounted]
+[0m [38;2;255;000;255m 0.047 ms 0.03% loop_invariant_code_motion[0m[38;2;000;255;255m [7 x 6.744 us]
+[0m [38;2;255;000;255m 0.639 ms 0.46% simplify [0m[38;2;000;255;255m [7 x 91.348 us]
+[0m [38;2;255;000;255m 23.106 ms 16.59% whole_kernel_cse [0m[38;2;000;255;255m [7 x 3.301 ms]
+[0m [38;2;255;000;255m 6.803 ms 29.44% replace_all_usages_with[0m[38;2;000;255;255m [620 x 10.973 us]
+[0m [38;2;255;000;255m 16.303 ms 70.56% [unaccounted]
+[0m [38;2;255;000;255m 91.065 ms 65.38% cfg_optimization [0m[38;2;000;255;255m [5 x 18.213 ms]
+[0m [38;2;255;000;255m 63.132 ms 69.33% store_to_load_forwarding[0m[38;2;000;255;255m [6 x 10.522 ms]
+[0m [38;2;255;000;255m 30.162 ms 47.78% reaching_definition_analysis[0m[38;2;000;255;255m [6 x 5.027 ms]
+[0m [38;2;255;000;255m 11.839 ms 18.75% replace_all_usages_with[0m[38;2;000;255;255m [1283 x 9.227 us]
+[0m [38;2;255;000;255m 21.132 ms 33.47% [unaccounted]
+[0m [38;2;255;000;255m 26.704 ms 29.32% dead_store_elimination[0m[38;2;000;255;255m [6 x 4.451 ms]
+[0m [38;2;255;000;255m 18.921 ms 70.85% live_variable_analysis[0m[38;2;000;255;255m [6 x 3.153 ms]
+[0m [38;2;255;000;255m 0.103 ms 0.38% replace_all_usages_with[0m[38;2;000;255;255m [19 x 5.396 us]
+[0m [38;2;255;000;255m 7.681 ms 28.76% [unaccounted]
+[0m [38;2;255;000;255m 0.721 ms 0.79% die [0m[38;2;000;255;255m [5 x 144.196 us]
+[0m [38;2;255;000;255m 0.508 ms 0.56% [unaccounted]
+[0m [38;2;173;216;230m 0.011 ms 0.01% inlining [0m[38;2;000;255;255m [1 x 10.967 us]
+[0m [38;2;173;216;230m 0.020 ms 0.01% flag_access [0m[38;2;000;255;255m [2 x 9.894 us]
+[0m [38;2;173;216;230m 1.025 ms 0.63% offload [0m[38;2;000;255;255m [1 x 1.025 ms]
+[0m [38;2;255;000;255m 0.007 ms 0.70% replace_all_usages_with[0m[38;2;000;255;255m [4 x 1.788 us]
+[0m [38;2;255;000;255m 0.106 ms 10.33% type_check [0m[38;2;000;255;255m [2 x 52.929 us]
+[0m [38;2;255;000;255m 0.912 ms 88.97% [unaccounted]
+[0m [38;2;173;216;230m 3.751 ms 2.29% cfg_optimization [0m[38;2;000;255;255m [1 x 3.751 ms]
+[0m [38;2;255;000;255m 2.075 ms 55.32% store_to_load_forwarding[0m[38;2;000;255;255m [1 x 2.075 ms]
+[0m [38;2;255;000;255m 1.174 ms 56.57% reaching_definition_analysis[0m[38;2;000;255;255m [1 x 1.174 ms]
+[0m [38;2;255;000;255m 0.901 ms 43.43% [unaccounted]
+[0m [38;2;255;000;255m 1.476 ms 39.34% dead_store_elimination[0m[38;2;000;255;255m [1 x 1.476 ms]
+[0m [38;2;255;000;255m 0.945 ms 64.02% live_variable_analysis[0m[38;2;000;255;255m [1 x 944.853 us]
+[0m [38;2;255;000;255m 0.531 ms 35.98% [unaccounted]
+[0m [38;2;255;000;255m 0.147 ms 3.92% die [0m[38;2;000;255;255m [1 x 147.104 us]
+[0m [38;2;255;000;255m 0.053 ms 1.41% [unaccounted]
+[0m [38;2;255;255;000m 48.706 ms 22.94% offload_to_executable[0m[38;2;000;255;255m [1 x 48.706 ms]
+[0m [38;2;173;216;230m 1.132 ms 2.33% verify [0m[38;2;000;255;255m [10 x 113.249 us]
+[0m [38;2;173;216;230m 0.334 ms 0.69% demote_atomics [0m[38;2;000;255;255m [2 x 166.893 us]
+[0m [38;2;255;000;255m103.951 us 31.14% replace_all_usages_with[0m[38;2;000;255;255m [57 x 1.824 us]
+[0m [38;2;255;000;255m 66.042 us 19.79% type_check [0m[38;2;000;255;255m [2 x 33.021 us]
+[0m [38;2;255;000;255m163.794 us 49.07% [unaccounted]
+[0m [38;2;173;216;230m 0.007 ms 0.01% replace_all_usages_with[0m[38;2;000;255;255m [6 x 1.113 us]
+[0m [38;2;173;216;230m 0.141 ms 0.29% type_check [0m[38;2;000;255;255m [4 x 35.167 us]
+[0m [38;2;173;216;230m 1.043 ms 2.14% make_thread_local [0m[38;2;000;255;255m [1 x 1.043 ms]
+[0m [38;2;255;000;255m 0.033 ms 3.16% type_check [0m[38;2;000;255;255m [1 x 32.902 us]
+[0m [38;2;255;000;255m 1.010 ms 96.84% [unaccounted]
+[0m [38;2;173;216;230m 0.033 ms 0.07% make_mesh_thread_local[0m[38;2;000;255;255m [1 x 33.140 us]
+[0m [38;2;255;000;255m 33.140 us 100.00% type_check [0m[38;2;000;255;255m [1 x 33.140 us]
+[0m [38;2;173;216;230m 0.037 ms 0.08% demote_mesh_statements[0m[38;2;000;255;255m [1 x 36.955 us]
+[0m [38;2;255;000;255m 31.948 us 86.45% type_check [0m[38;2;000;255;255m [1 x 31.948 us]
+[0m [38;2;255;000;255m 5.007 us 13.55% [unaccounted]
+[0m [38;2;173;216;230m 0.006 ms 0.01% remove_range_assumption[0m[38;2;000;255;255m [1 x 5.960 us]
+[0m [38;2;173;216;230m 0.004 ms 0.01% remove_loop_unique [0m[38;2;000;255;255m [1 x 4.053 us]
+[0m [38;2;173;216;230m 0.351 ms 0.72% die [0m[38;2;000;255;255m [1 x 350.952 us]
+[0m [38;2;173;216;230m 0.016 ms 0.03% flag_access [0m[38;2;000;255;255m [1 x 15.974 us]
+[0m [38;2;173;216;230m 0.877 ms 1.80% demote_operations [0m[38;2;000;255;255m [1 x 877.142 us]
+[0m [38;2;255;000;255m432.014 us 49.25% replace_all_usages_with[0m[38;2;000;255;255m [157 x 2.752 us]
+[0m [38;2;255;000;255m111.103 us 12.67% type_check [0m[38;2;000;255;255m [2 x 55.552 us]
+[0m [38;2;255;000;255m334.024 us 38.08% [unaccounted]
+[0m [38;2;173;216;230m 44.051 ms 90.44% full_simplify [0m[38;2;000;255;255m [1 x 44.051 ms]
+[0m [38;2;255;000;255m 0.057 ms 0.13% extract_constant [0m[38;2;000;255;255m [3 x 18.994 us]
+[0m [38;2;255;000;255m 0.086 ms 0.19% unreachable_code_elimination[0m[38;2;000;255;255m [3 x 28.610 us]
+[0m [38;2;255;000;255m 0.189 ms 0.43% binary_op_simplify [0m[38;2;000;255;255m [3 x 63.022 us]
+[0m [38;2;255;000;255m 6.914 us 3.66% replace_all_usages_with[0m[38;2;000;255;255m [6 x 1.152 us]
+[0m [38;2;255;000;255m182.152 us 96.34% [unaccounted]
+[0m [38;2;255;000;255m 8.466 ms 19.22% constant_fold [0m[38;2;000;255;255m [3 x 2.822 ms]
+[0m [38;2;255;000;255m 8.332 ms 98.42% compile [0m[38;2;000;255;255m [1 x 8.332 ms]
+[0m [38;2;255;000;255m 0.032 ms 0.38% compile_to_executable[0m[38;2;000;255;255m [1 x 31.948 us]
+[0m [38;2;255;000;255m 14.067 us 44.03% compile_to_offloads [0m[38;2;000;255;255m [1 x 14.067 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% frontend_type_check [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 1.192 us 8.47% lower_ast [0m[38;2;000;255;255m [1 x 1.192 us]
+[0m [38;2;255;000;255m 0.954 us 6.78% type_check [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 1.907 us 13.56% verify [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% demote_operations [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 8.106 us 57.63% offload [0m[38;2;000;255;255m [1 x 8.106 us]
+[0m [38;2;255;000;255m 0.954 us 11.76% type_check [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 7.153 us 88.24% [unaccounted]
+[0m [38;2;255;000;255m 1.907 us 13.56% [unaccounted]
+[0m [38;2;255;000;255m 17.881 us 55.97% offload_to_executable[0m[38;2;000;255;255m [1 x 17.881 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% verify [0m[38;2;000;255;255m [10 x 0.000 ns]
+[0m [38;2;255;000;255m 2.146 us 12.00% demote_atomics [0m[38;2;000;255;255m [2 x 1.073 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% type_check [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 2.146 us 100.00% [unaccounted]
+[0m [38;2;255;000;255m 2.861 us 16.00% type_check [0m[38;2;000;255;255m [4 x 715.256 ns]
+[0m [38;2;255;000;255m 0.954 us 5.33% make_thread_local [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m953.674 ns type_check [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% make_mesh_thread_local[0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 ns type_check [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.954 us 5.33% demote_mesh_statements[0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 ns type_check [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% remove_range_assumption[0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.954 us 5.33% remove_loop_unique [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.954 us 5.33% die [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.954 us 5.33% flag_access [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 1.192 us 6.67% demote_operations [0m[38;2;000;255;255m [1 x 1.192 us]
+[0m [38;2;255;000;255m 0.954 us 5.33% full_simplify [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 ns simplify [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m953.674 ns die [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 1.907 us 10.67% optimize_bit_struct_stores[0m[38;2;000;255;255m [1 x 1.907 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% die [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 1.907 us 100.00% [unaccounted]
+[0m [38;2;255;000;255m 4.053 us 22.67% [unaccounted]
+[0m [38;2;255;000;255m 8.297 ms 99.58% codegen [0m[38;2;000;255;255m [1 x 8.297 ms]
+[0m [38;2;255;000;255m 3.498 ms 42.16% clone_struct_module [0m[38;2;000;255;255m [1 x 3.498 ms]
+[0m [38;2;255;000;255m 0.000 ms 0.00% CodeGenLLVMCPU [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.026 ms 0.31% emit_to_module [0m[38;2;000;255;255m [1 x 25.988 us]
+[0m [38;2;255;000;255m 4.769 ms 57.48% compile_module_to_executable[0m[38;2;000;255;255m [1 x 4.769 ms]
+[0m [38;2;255;000;255m 0.907 ms 19.02% eliminate_unused_functions[0m[38;2;000;255;255m [1 x 906.944 us]
+[0m [38;2;255;000;255m 3.066 ms 64.29% global_optimize_module_cpu[0m[38;2;000;255;255m [1 x 3.066 ms]
+[0m [38;2;255;000;255m 0.403 ms 13.15% llvm_function_pass [0m[38;2;000;255;255m [1 x 403.166 us]
+[0m [38;2;255;000;255m 1.645 ms 53.65% llvm_module_pass [0m[38;2;000;255;255m [1 x 1.645 ms]
+[0m [38;2;255;000;255m 1.018 ms 33.20% [unaccounted]
+[0m [38;2;255;000;255m 0.796 ms 16.70% [unaccounted]
+[0m [38;2;255;000;255m 0.019 ms 0.23% replace_all_usages_with[0m[38;2;000;255;255m [6 x 3.179 us]
+[0m [38;2;255;000;255m 0.115 ms 1.36% [unaccounted]
+[0m [38;2;255;000;255m 1.502 ms 3.41% die [0m[38;2;000;255;255m [9 x 166.919 us]
+[0m [38;2;255;000;255m 1.093 ms 2.48% alg_simp [0m[38;2;000;255;255m [3 x 364.383 us]
+[0m [38;2;255;000;255m 0.603 ms 55.16% replace_all_usages_with[0m[38;2;000;255;255m [201 x 3.000 us]
+[0m [38;2;255;000;255m 0.490 ms 44.84% [unaccounted]
+[0m [38;2;255;000;255m 0.021 ms 0.05% loop_invariant_code_motion[0m[38;2;000;255;255m [3 x 6.994 us]
+[0m [38;2;255;000;255m 4.764 ms 10.81% simplify [0m[38;2;000;255;255m [3 x 1.588 ms]
+[0m [38;2;255;000;255m 1.228 ms 25.77% replace_all_usages_with[0m[38;2;000;255;255m [414 x 2.965 us]
+[0m [38;2;255;000;255m 2.598 ms 54.53% type_check [0m[38;2;000;255;255m [174 x 14.929 us]
+[0m [38;2;255;000;255m 0.939 ms 19.70% [unaccounted]
+[0m [38;2;255;000;255m 25.229 ms 57.27% whole_kernel_cse [0m[38;2;000;255;255m [3 x 8.410 ms]
+[0m [38;2;255;000;255m 4.337 ms 17.19% replace_all_usages_with[0m[38;2;000;255;255m [1165 x 3.722 us]
+[0m [38;2;255;000;255m 20.892 ms 82.81% [unaccounted]
+[0m [38;2;255;000;255m 2.643 ms 6.00% cfg_optimization [Taichi] version 0.9.0, llvm 10.0.0, commit dd7654ae, linux, python 3.8.12
+[Taichi] Starting on arch=x64
+[0m[38;2;000;255;255m [2 x 1.322 ms]
+[0m [38;2;255;000;255m 0.964 ms 36.46% store_to_load_forwarding[0m[38;2;000;255;255m [3 x 321.229 us]
+[0m [38;2;255;000;255m262.737 us 27.26% reaching_definition_analysis[0m[38;2;000;255;255m [3 x 87.579 us]
+[0m [38;2;255;000;255m163.317 us 16.95% replace_all_usages_with[0m[38;2;000;255;255m [60 x 2.722 us]
+[0m [38;2;255;000;255m537.634 us 55.79% [unaccounted]
+[0m [38;2;255;000;255m 1.354 ms 51.24% dead_store_elimination[0m[38;2;000;255;255m [3 x 451.406 us]
+[0m [38;2;255;000;255m 0.658 ms 48.59% live_variable_analysis[0m[38;2;000;255;255m [3 x 219.345 us]
+[0m [38;2;255;000;255m 0.696 ms 51.41% [unaccounted]
+[0m [38;2;255;000;255m 0.294 ms 11.12% die [0m[38;2;000;255;255m [2 x 146.985 us]
+[0m [38;2;255;000;255m 0.031 ms 1.18% [unaccounted]
+[0m [38;2;173;216;230m 0.131 ms 0.27% optimize_bit_struct_stores[0m[38;2;000;255;255m [1 x 130.892 us]
+[0m [38;2;255;000;255m 77.009 us 58.83% die [0m[38;2;000;255;255m [1 x 77.009 us]
+[0m [38;2;255;000;255m 53.883 us 41.17% [unaccounted]
+[0m [38;2;173;216;230m 0.544 ms 1.12% [unaccounted]
+[0m [38;2;144;238;144m 47.835 ms 18.39% codegen [0m[38;2;000;255;255m [1 x 47.835 ms]
+[0m [38;2;255;255;000m 4.241 ms 8.87% clone_struct_module [0m[38;2;000;255;255m [1 x 4.241 ms]
+[0m [38;2;255;255;000m 0.000 ms 0.00% CodeGenLLVMCPU [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;255;000m 0.408 ms 0.85% emit_to_module [0m[38;2;000;255;255m [1 x 408.173 us]
+[0m [38;2;255;255;000m 43.178 ms 90.26% compile_module_to_executable[0m[38;2;000;255;255m [1 x 43.178 ms]
+[0m [38;2;173;216;230m 0.809 ms 1.87% eliminate_unused_functions[0m[38;2;000;255;255m [1 x 808.954 us]
+[0m [38;2;173;216;230m 35.383 ms 81.95% global_optimize_module_cpu[0m[38;2;000;255;255m [1 x 35.383 ms]
+[0m [38;2;255;000;255m 1.499 ms 4.24% llvm_function_pass [0m[38;2;000;255;255m [1 x 1.499 ms]
+[0m [38;2;255;000;255m 32.613 ms 92.17% llvm_module_pass [0m[38;2;000;255;255m [1 x 32.613 ms]
+[0m [38;2;255;000;255m 1.271 ms 3.59% [unaccounted]
+[0m [38;2;173;216;230m 6.986 ms 16.18% [unaccounted]
+[0m[38;2;000;255;255m>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+[0m
\ No newline at end of file
diff --git a/perf.data b/perf.data
new file mode 100644
index 000000000..616f13b37
Binary files /dev/null and b/perf.data differ
diff --git a/scoped_profile_full.txt b/scoped_profile_full.txt
new file mode 100644
index 000000000..10fbea513
--- /dev/null
+++ b/scoped_profile_full.txt
@@ -0,0 +1,244 @@
+[38;2;000;255;255m>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+[0m[38;2;255;000;000m[Profiler thread 140647799088064]
+[0m [38;2;255;000;000m 17.142 ms clone_runtime_module [0m[38;2;000;255;255m [2 x 8.571 ms]
+[0m [38;2;144;238;144m 9.855 ms 57.49% module_from_bitcode_file[0m[38;2;000;255;255m [1 x 9.855 ms]
+[0m [38;2;144;238;144m 7.262 ms 42.36% clone module [0m[38;2;000;255;255m [2 x 3.631 ms]
+[0m [38;2;255;000;000m651.836 us eliminate_unused_functions [0m[38;2;000;255;255m [1 x 651.836 us]
+[0m [38;2;255;000;000m 81.514 ms global_optimize_module_cpu [0m[38;2;000;255;255m [1 x 81.514 ms]
+[0m [38;2;144;238;144m 1.975 ms 2.42% llvm_function_pass [0m[38;2;000;255;255m [1 x 1.975 ms]
+[0m [38;2;144;238;144m 78.158 ms 95.88% llvm_module_pass [0m[38;2;000;255;255m [1 x 78.158 ms]
+[0m [38;2;144;238;144m 1.381 ms 1.69% [unaccounted]
+[0m [38;2;255;000;000m 5.082 ms run [0m[38;2;000;255;255m [1 x 5.082 ms]
+[0m [38;2;144;238;144m 0.036 ms 0.71% generate_types [0m[38;2;000;255;255m [19 x 1.895 us]
+[0m [38;2;144;238;144m 0.067 ms 1.32% generate_child_accessors[0m[38;2;000;255;255m [1 x 66.996 us]
+[0m [38;2;255;255;000m 10.014 us 14.95% generate_refine_coordinates[0m[38;2;000;255;255m [1 x 10.014 us]
+[0m [38;2;255;255;000m 56.982 us 85.05% generate_child_accessors[0m[38;2;000;255;255m [6 x 9.497 us]
+[0m [38;2;173;216;230m 31.948 us 56.07% generate_refine_coordinates[0m[38;2;000;255;255m [6 x 5.325 us]
+[0m [38;2;173;216;230m 15.736 us 27.62% generate_child_accessors[0m[38;2;000;255;255m [12 x 1.311 us]
+[0m [38;2;173;216;230m 9.298 us 16.32% [unaccounted]
+[0m [38;2;144;238;144m 4.979 ms 97.97% [unaccounted]
+[0m [38;2;255;000;000m290.610 ms compile [0m[38;2;000;255;255m [2 x 145.305 ms]
+[0m [38;2;144;238;144m218.299 ms 75.12% compile_to_executable[0m[38;2;000;255;255m [2 x 109.149 ms]
+[0m [38;2;255;255;000m169.878 ms 77.82% compile_to_offloads [0m[38;2;000;255;255m [2 x 84.939 ms]
+[0m [38;2;173;216;230m 0.014 ms 0.01% frontend_type_check [0m[38;2;000;255;255m [2 x 6.914 us]
+[0m [38;2;173;216;230m 18.433 ms 10.85% lower_ast [0m[38;2;000;255;255m [2 x 9.217 ms]
+[0m [38;2;255;000;255m 14.735 ms 79.94% replace_all_usages_with[0m[38;2;000;255;255m [2017 x 7.306 us]
+[0m [38;2;255;000;255m 3.698 ms 20.06% [unaccounted]
+[0m [38;2;173;216;230m 0.614 ms 0.36% type_check [0m[38;2;000;255;255m [4 x 153.482 us]
+[0m [38;2;173;216;230m 1.307 ms 0.77% verify [0m[38;2;000;255;255m [16 x 81.718 us]
+[0m [38;2;173;216;230m 0.327 ms 0.19% bit_loop_vectorize [0m[38;2;000;255;255m [2 x 163.555 us]
+[0m [38;2;255;000;255m324.965 us 99.34% die [0m[38;2;000;255;255m [2 x 162.482 us]
+[0m [38;2;255;000;255m 2.146 us 0.66% [unaccounted]
+[0m [38;2;173;216;230m144.166 ms 84.86% full_simplify [0m[38;2;000;255;255m [6 x 24.028 ms]
+[0m [38;2;255;000;255m 0.782 ms 0.54% extract_constant [0m[38;2;000;255;255m [14 x 55.875 us]
+[0m [38;2;255;000;255m 0.200 ms 0.14% unreachable_code_elimination[0m[38;2;000;255;255m [14 x 14.305 us]
+[0m [38;2;255;000;255m 0.314 ms 0.22% binary_op_simplify [0m[38;2;000;255;255m [14 x 22.462 us]
+[0m [38;2;255;000;255m 14.067 us 4.47% replace_all_usages_with[0m[38;2;000;255;255m [1 x 14.067 us]
+[0m [38;2;255;000;255m300.407 us 95.53% [unaccounted]
+[0m [38;2;255;000;255m 21.322 ms 14.79% constant_fold [0m[38;2;000;255;255m [14 x 1.523 ms]
+[0m [38;2;255;000;255m 0.889 ms 4.17% replace_all_usages_with[0m[38;2;000;255;255m [117 x 7.595 us]
+[0m [38;2;255;000;255m 20.008 ms 93.84% compile [0m[38;2;000;255;255m [2 x 10.004 ms]
+[0m [38;2;255;000;255m 0.075 ms 0.37% compile_to_executable[0m[38;2;000;255;255m [2 x 37.432 us]
+[0m [38;2;255;000;255m 36.955 us 49.36% compile_to_offloads [0m[38;2;000;255;255m [2 x 18.477 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% frontend_type_check [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 2.861 us 7.74% lower_ast [0m[38;2;000;255;255m [2 x 1.431 us]
+[0m [38;2;255;000;255m 0.954 us 2.58% type_check [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 4.053 us 10.97% verify [0m[38;2;000;255;255m [4 x 1.013 us]
+[0m [38;2;255;000;255m 1.907 us 5.16% demote_operations [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 20.981 us 56.77% offload [0m[38;2;000;255;255m [2 x 10.490 us]
+[0m [38;2;255;000;255m 1.907 us 9.09% type_check [0m[38;2;000;255;255m [4 x 476.837 ns]
+[0m [38;2;255;000;255m 19.073 us 90.91% [unaccounted]
+[0m [38;2;255;000;255m 6.199 us 16.77% [unaccounted]
+[0m [38;2;255;000;255m 36.955 us 49.36% offload_to_executable[0m[38;2;000;255;255m [2 x 18.477 us]
+[0m [38;2;255;000;255m 3.815 us 10.32% verify [0m[38;2;000;255;255m [20 x 190.735 ns]
+[0m [38;2;255;000;255m 3.338 us 9.03% demote_atomics [0m[38;2;000;255;255m [4 x 834.465 ns]
+[0m [38;2;255;000;255m 2.146 us 64.29% type_check [0m[38;2;000;255;255m [4 x 536.442 ns]
+[0m [38;2;255;000;255m 1.192 us 35.71% [unaccounted]
+[0m [38;2;255;000;255m 1.907 us 5.16% type_check [0m[38;2;000;255;255m [8 x 238.419 ns]
+[0m [38;2;255;000;255m 0.954 us 2.58% make_thread_local [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m953.674 ns type_check [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% make_mesh_thread_local[0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 ns type_check [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 0.954 us 2.58% demote_mesh_statements[0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m953.674 ns type_check [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 0.954 us 2.58% remove_range_assumption[0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 1.192 us 3.23% remove_loop_unique [0m[38;2;000;255;255m [2 x 596.046 ns]
+[0m [38;2;255;000;255m 1.907 us 5.16% die [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 2.146 us 5.81% flag_access [0m[38;2;000;255;255m [2 x 1.073 us]
+[0m [38;2;255;000;255m 0.954 us 2.58% demote_operations [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 3.099 us 8.39% full_simplify [0m[38;2;000;255;255m [2 x 1.550 us]
+[0m [38;2;255;000;255m 1.192 us 38.46% simplify [0m[38;2;000;255;255m [2 x 596.046 ns]
+[0m [38;2;255;000;255m 0.954 us 30.77% die [0m[38;2;000;255;255m [2 x 476.837 ns]
+[0m [38;2;255;000;255m 0.954 us 30.77% [unaccounted]
+[0m [38;2;255;000;255m 2.861 us 7.74% optimize_bit_struct_stores[0m[38;2;000;255;255m [2 x 1.431 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% die [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 2.861 us 100.00% [unaccounted]
+[0m [38;2;255;000;255m 12.875 us 34.84% [unaccounted]
+[0m [38;2;255;000;255m 0.954 us 1.27% [unaccounted]
+[0m [38;2;255;000;255m 19.926 ms 99.59% codegen [0m[38;2;000;255;255m [2 x 9.963 ms]
+[0m [38;2;255;000;255m 10.840 ms 54.40% clone_struct_module [0m[38;2;000;255;255m [2 x 5.420 ms]
+[0m [38;2;255;000;255m 0.000 ms 0.00% CodeGenLLVMCPU [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 0.046 ms 0.23% emit_to_module [0m[38;2;000;255;255m [2 x 23.007 us]
+[0m [38;2;255;000;255m 9.030 ms 45.32% compile_module_to_executable[0m[38;2;000;255;255m [2 x 4.515 ms]
+[0m [38;2;255;000;255m 1.717 ms 19.02% eliminate_unused_functions[0m[38;2;000;255;255m [2 x 858.545 us]
+[0m [38;2;255;000;255m 5.776 ms 63.96% global_optimize_module_cpu[0m[38;2;000;255;255m [2 x 2.888 ms]
+[0m [38;2;255;000;255m 0.777 ms 13.46% llvm_function_pass [0m[38;2;000;255;255m [2 x 388.622 us]
+[0m [38;2;255;000;255m 3.068 ms 53.11% llvm_module_pass [0m[38;2;000;255;255m [2 x 1.534 ms]
+[0m [38;2;255;000;255m 1.931 ms 33.43% [unaccounted]
+[0m [38;2;255;000;255m 1.537 ms 17.02% [unaccounted]
+[0m [38;2;255;000;255m 0.426 ms 2.00% [unaccounted]
+[0m [38;2;255;000;255m 2.889 ms 2.00% die [0m[38;2;000;255;255m [42 x 68.784 us]
+[0m [38;2;255;000;255m 0.797 ms 0.55% alg_simp [0m[38;2;000;255;255m [14 x 56.948 us]
+[0m [38;2;255;000;255m134.230 us 16.84% replace_all_usages_with[0m[38;2;000;255;255m [72 x 1.864 us]
+[0m [38;2;255;000;255m663.042 us 83.16% [unaccounted]
+[0m [38;2;255;000;255m 0.047 ms 0.03% loop_invariant_code_motion[0m[38;2;000;255;255m [14 x 3.338 us]
+[0m [38;2;255;000;255m 0.621 ms 0.43% simplify [0m[38;2;000;255;255m [14 x 44.363 us]
+[0m [38;2;255;000;255m 24.378 ms 16.91% whole_kernel_cse [0m[38;2;000;255;255m [14 x 1.741 ms]
+[0m [38;2;255;000;255m 7.641 ms 31.35% replace_all_usages_with[0m[38;2;000;255;255m [625 x 12.226 us]
+[0m [38;2;255;000;255m 16.737 ms 68.65% [unaccounted]
+[0m [38;2;255;000;255m 92.805 ms 64.37% cfg_optimization [0m[38;2;000;255;255m [10 x 9.280 ms]
+[0m [38;2;255;000;255m 64.329 ms 69.32% store_to_load_forwarding[0m[38;2;000;255;255m [12 x 5.361 ms]
+[0m [38;2;255;000;255m 30.363 ms 47.20% reaching_definition_analysis[0m[38;2;000;255;255m [12 x 2.530 ms]
+[0m [38;2;255;000;255m 12.398 ms 19.27% replace_all_usages_with[0m[38;2;000;255;255m [1289 x 9.619 us]
+[0m [38;2;255;000;255m 21.568 ms 33.53% [unaccounted]
+[0m [38;2;255;000;255m 27.198 ms 29.31% dead_store_elimination[0m[38;2;000;255;255m [12 x 2.267 ms]
+[0m [38;2;255;000;255m 19.209 ms 70.63% live_variable_analysis[0m[38;2;000;255;255m [12 x 1.601 ms]
+[0m [38;2;255;000;255m 0.115 ms 0.42% replace_all_usages_with[0m[38;2;000;255;255m [19 x 6.036 us]
+[0m [38;2;255;000;255m 7.874 ms 28.95% [unaccounted]
+[0m [38;2;255;000;255m 0.754 ms 0.81% die [0m[38;2;000;255;255m [10 x 75.364 us]
+[0m [38;2;255;000;255m 0.524 ms 0.56% [unaccounted]
+[0m [38;2;173;216;230m 0.010 ms 0.01% inlining [0m[38;2;000;255;255m [2 x 5.007 us]
+[0m [38;2;173;216;230m 0.020 ms 0.01% flag_access [0m[38;2;000;255;255m [4 x 5.007 us]
+[0m [38;2;173;216;230m 1.093 ms 0.64% offload [0m[38;2;000;255;255m [2 x 546.575 us]
+[0m [38;2;255;000;255m 0.006 ms 0.52% replace_all_usages_with[0m[38;2;000;255;255m [5 x 1.144 us]
+[0m [38;2;255;000;255m 0.109 ms 9.95% type_check [0m[38;2;000;255;255m [4 x 27.180 us]
+[0m [38;2;255;000;255m 0.979 ms 89.53% [unaccounted]
+[0m [38;2;173;216;230m 3.873 ms 2.28% cfg_optimization [0m[38;2;000;255;255m [2 x 1.937 ms]
+[0m [38;2;255;000;255m 2.122 ms 54.79% store_to_load_forwarding[0m[38;2;000;255;255m [2 x 1.061 ms]
+[0m [38;2;255;000;255m 1.180 ms 55.61% reaching_definition_analysis[0m[38;2;000;255;255m [2 x 589.967 us]
+[0m [38;2;255;000;255m 0.942 ms 44.39% [unaccounted]
+[0m [38;2;255;000;255m 1.531 ms 39.53% dead_store_elimination[0m[38;2;000;255;255m [2 x 765.562 us]
+[0m [38;2;255;000;255m 0.974 ms 63.63% live_variable_analysis[0m[38;2;000;255;255m [2 x 487.089 us]
+[0m [38;2;255;000;255m 0.557 ms 36.37% [unaccounted]
+[0m [38;2;255;000;255m 0.150 ms 3.87% die [0m[38;2;000;255;255m [2 x 74.983 us]
+[0m [38;2;255;000;255m 0.070 ms 1.81% [unaccounted]
+[0m [38;2;255;255;000m 48.419 ms 22.18% offload_to_executable[0m[38;2;000;255;255m [2 x 24.209 ms]
+[0m [38;2;173;216;230m 1.081 ms 2.23% verify [0m[38;2;000;255;255m [20 x 54.073 us]
+[0m [38;2;173;216;230m 0.330 ms 0.68% demote_atomics [0m[38;2;000;255;255m [4 x 82.552 us]
+[0m [38;2;255;000;255m 68.188 us 20.65% type_check [0m[38;2;000;255;255m [4 x 17.047 us]
+[0m [38;2;255;000;255m100.136 us 30.32% replace_all_usages_with[0m[38;2;000;255;255m [57 x 1.757 us]
+[0m [38;2;255;000;255m161.886 us 49.03% [unaccounted]
+[0m [38;2;173;216;230m 0.148 ms 0.31% type_check [0m[38;2;000;255;255m [8 x 18.477 us]
+[0m [38;2;173;216;230m 1.046 ms 2.16% make_thread_local [0m[38;2;000;255;255m [2 x 522.971 us]
+[0m [38;2;255;000;255m 0.033 ms 3.15% type_check [0m[38;2;000;255;255m [2 x 16.451 us]
+[0m [38;2;255;000;255m 1.013 ms 96.85% [unaccounted]
+[0m [38;2;173;216;230m 0.032 ms 0.07% make_mesh_thread_local[0m[38;2;000;255;255m [2 x 16.093 us]
+[0m [38;2;255;000;255m 32.187 us 100.00% type_check [0m[38;2;000;255;255m [2 x 16.093 us]
+[0m [38;2;173;216;230m 0.037 ms 0.08% demote_mesh_statements[0m[38;2;000;255;255m [2 x 18.477 us]
+[0m [38;2;255;000;255m 30.994 us 83.87% type_check [0m[38;2;000;255;255m [2 x 15.497 us]
+[0m [38;2;255;000;255m 5.960 us 16.13% [unaccounted]
+[0m [38;2;173;216;230m 0.004 ms 0.01% remove_range_assumption[0m[38;2;000;255;255m [2 x 2.027 us]
+[0m [38;2;173;216;230m 0.003 ms 0.01% remove_loop_unique [0m[38;2;000;255;255m [2 x 1.550 us]
+[0m [38;2;173;216;230m 0.351 ms 0.72% die [0m[38;2;000;255;255m [2 x 175.476 us]
+[0m [38;2;173;216;230m 0.016 ms 0.03% flag_access [0m[38;2;000;255;255m [2 x 7.987 us]
+[0m [38;2;173;216;230m 0.879 ms 1.82% demote_operations [0m[38;2;000;255;255m [2 x 439.525 us]
+[0m [38;2;255;000;255m436.306 us 49.63% replace_all_usages_with[0m[38;2;000;255;255m [162 x 2.693 us]
+[0m [38;2;255;000;255m109.196 us 12.42% type_check [0m[38;2;000;255;255m [4 x 27.299 us]
+[0m [38;2;255;000;255m333.548 us 37.94% [unaccounted]
+[0m [38;2;173;216;230m 43.794 ms 90.45% full_simplify [0m[38;2;000;255;255m [2 x 21.897 ms]
+[0m [38;2;255;000;255m 0.055 ms 0.13% extract_constant [0m[38;2;000;255;255m [6 x 9.179 us]
+[0m [38;2;255;000;255m 0.088 ms 0.20% unreachable_code_elimination[0m[38;2;000;255;255m [6 x 14.623 us]
+[0m [38;2;255;000;255m 0.189 ms 0.43% binary_op_simplify [0m[38;2;000;255;255m [6 x 31.551 us]
+[0m [38;2;255;000;255m 6.437 us 3.40% replace_all_usages_with[0m[38;2;000;255;255m [6 x 1.073 us]
+[0m [38;2;255;000;255m182.867 us 96.60% [unaccounted]
+[0m [38;2;255;000;255m 8.382 ms 19.14% constant_fold [0m[38;2;000;255;255m [6 x 1.397 ms]
+[0m [38;2;255;000;255m 8.245 ms 98.37% compile [0m[38;2;000;255;255m [1 x 8.245 ms]
+[0m [38;2;255;000;255m 0.031 ms 0.38% compile_to_executable[0m[38;2;000;255;255m [1 x 30.994 us]
+[0m [38;2;255;000;255m 14.067 us 45.38% compile_to_offloads [0m[38;2;000;255;255m [1 x 14.067 us]
+[0m [38;2;255;000;255m 0.954 us 6.78% frontend_type_check [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.954 us 6.78% lower_ast [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% type_check [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 3.338 us 23.73% verify [0m[38;2;000;255;255m [2 x 1.669 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% demote_operations [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 7.868 us 55.93% offload [0m[38;2;000;255;255m [1 x 7.868 us]
+[0m [38;2;255;000;255m 1.192 us 15.15% type_check [0m[38;2;000;255;255m [2 x 596.046 ns]
+[0m [38;2;255;000;255m 6.676 us 84.85% [unaccounted]
+[0m [38;2;255;000;255m 0.954 us 6.78% [unaccounted]
+[0m [38;2;255;000;255m 16.928 us 54.62% offload_to_executable[0m[38;2;000;255;255m [1 x 16.928 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% verify [0m[38;2;000;255;255m [10 x 0.000 ns]
+[0m [38;2;255;000;255m 1.907 us 11.27% demote_atomics [0m[38;2;000;255;255m [2 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% type_check [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;000;255m 1.907 us 100.00% [unaccounted]
+[0m [38;2;255;000;255m 2.146 us 12.68% type_check [0m[38;2;000;255;255m [4 x 536.442 ns]
+[0m [38;2;255;000;255m 0.954 us 5.63% make_thread_local [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m953.674 ns type_check [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% make_mesh_thread_local[0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 ns type_check [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% demote_mesh_statements[0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 ns type_check [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% remove_range_assumption[0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 1.192 us 7.04% remove_loop_unique [0m[38;2;000;255;255m [1 x 1.192 us]
+[0m [38;2;255;000;255m 0.954 us 5.63% die [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.954 us 5.63% flag_access [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 0.000 us 0.00% demote_operations [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 1.192 us 7.04% full_simplify [0m[38;2;000;255;255m [1 x 1.192 us]
+[0m [38;2;255;000;255m 0.000 us 0.00% simplify [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 1.192 us 100.00% die [0m[38;2;000;255;255m [1 x 1.192 us]
+[0m [38;2;255;000;255m 0.954 us 5.63% optimize_bit_struct_stores[0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m953.674 ns die [0m[38;2;000;255;255m [1 x 953.674 ns]
+[0m [38;2;255;000;255m 6.676 us 39.44% [unaccounted]
+[0m [38;2;255;000;255m 8.210 ms 99.58% codegen [0m[38;2;000;255;255m [1 x 8.210 ms]
+[0m [38;2;255;000;255m 3.573 ms 43.52% clone_struct_module [0m[38;2;000;255;255m [1 x 3.573 ms]
+[0m [38;2;255;000;255m 0.000 ms 0.00% CodeGenLLVMCPU [0m[38;2;000;255;255m [1 x 0.000 ns]
+[0m [38;2;255;000;255m 0.023 ms 0.28% emit_to_module [0m[38;2;000;255;255m [1 x 23.127 us]
+[0m [38;2;255;000;255m 4.611 ms 56.16% compile_module_to_executable[0m[38;2;000;255;255m [1 x 4.611 ms]
+[0m [38;2;255;000;255m 0.863 ms 18.71% eliminate_unused_functions[0m[38;2;000;255;255m [1 x 862.837 us]
+[0m [38;2;255;000;255m 2.964 ms 64.28% global_optimize_module_cpu[0m[38;2;000;255;255m [1 x 2.964 ms]
+[0m [38;2;255;000;255m 0.396 ms 13.36% llvm_function_pass [0m[38;2;000;255;255m [1 x 396.013 us]
+[0m [38;2;255;000;255m 1.584 ms 53.44% llvm_module_pass [0m[38;2;000;255;255m [1 x 1.584 ms]
+[0m [38;2;255;000;255m 0.984 ms 33.20% [unaccounted]
+[0m [38;2;255;000;255m 0.784 ms 17.01% [unaccounted]
+[0m [38;2;255;000;255m 0.020 ms 0.24% replace_all_usages_with[0m[38;2;000;255;255m [6 x 3.378 us]
+[0m [38;2;255;000;255m 0.117 ms 1.39% [unaccounted]
+[0m [38;2;255;000;255m 1.547 ms 3.53% die [0m[38;2;000;255;255m [18 x 85.950 us]
+[0m [38;2;255;000;255m 1.079 ms 2.46% alg_simp [0m[38;2;000;255;255m [6 x 179.887 us]
+[0m [38;2;255;000;255m 0.586 ms 54.27% replace_all_usages_with[0m[38;2;000;255;255m [208 x 2.816 us]
+[0m [38;2;255;000;255m 0.494 ms 45.73% [unaccounted]
+[0m [38;2;255;000;255m 0.018 ms 0.04% loop_invariant_code_motion[0m[38;2;000;255;255m [6 x 2.940 us]
+[0m [38;2;255;000;255m 4.671 ms 10.67% simplify [0m[38;2;000;255;255m [6 x 778.516 us]
+[0m [38;2;255;000;255m 1.221 ms 26.13% replace_all_usages_with[0m[38;2;000;255;255m [434 x 2.813 us]
+[0m [38;2;255;000;255m 2.472 ms 52.92% type_check [0m[38;2;000;255;255m [184 x 13.434 us]
+[0m [38;2;255;000;255m 0.978 ms 20.95% [unaccounted]
+[0m [38;2;255;000;255m 24.900 ms 56.86% whole_kernel_cse [0m[38;2;000;255;255m [6 x 4.150 ms]
+[0m [38;2;255;000;255m 4.452 ms 17.88% replace_all_usages_with[0m[38;2;000;255;255m [1206 x 3.692 us]
+[0m [38;2;255;000;255m 20.448 ms 82.12% [unaccounted]
+[0m [38;2;255;000;255m 2.859 ms 6.53% cfg_optimization [0m[38;2;000;255;255m [4 x 714.779 us]
+[0m [38;2;255;000;255m 1.027 ms 35.92% store_to_load_forwarding[0m[38;2;000;255;255m [5 x 205.374 us]
+[0m [38;2;255;000;255m 0.290 ms 28.26% reaching_definition_analysis[0m[38;2;000;255;255m [5 x 58.031 us]
+[0m [38;2;255;000;255m 0.165 ms 16.02% replace_all_usages_with[0m[38;2;000;255;255m [60 x 2.742 us]
+[0m [38;2;255;000;255m 0.572 ms 55.72% [unaccounted]
+[0m [38;2;255;000;255m 1.500 ms 52.46% dead_store_elimination[0m[38;2;000;255;255m [5 x 299.978 us]
+[0m [38;2;255;000;255m 0.734 ms 48.96% live_variable_analysis[0m[38;2;000;255;255m [5 x 146.866 us]
+[0m [38;2;255;000;255m 0.766 ms 51.04% [unaccounted]
+[0m [38;2;255;000;255m 0.300 ms 10.50% die [0m[38;2;000;255;255m [4 x 75.042 us]
+[0m [38;2;255;000;255m 0.032 ms 1.13% [unaccounted]
+[0m [38;2;173;216;230m 0.136 ms 0.28% optimize_bit_struct_stores[0m[38;2;000;255;255m [2 x 67.949 us]
+[0m [38;2;255;000;255m 81.301 us 59.82% die [0m[38;2;000;255;255m [2 x 40.650 us]
+[0m [38;2;255;000;255m 54.598 us 40.18% [unaccounted]
+[0m [38;2;173;216;230m 0.007 ms 0.01% replace_all_usages_with[0m[38;2;000;255;255m [6 x 1.192 us]
+[0m [38;2;173;216;230m 0.554 ms 1.14% [unaccounted]
+[0m [38;2;144;238;144m 72.283 ms 24.87% codegen [0m[38;2;000;255;255m [2 x 36.142 ms]
+[0m [38;2;255;255;000m 9.329 ms 12.91% clone_struct_module [0m[38;2;000;255;255m [2 x 4.665 ms]
+[0m [38;2;255;255;000m 0.000 ms 0.00% CodeGenLLVMCPU [0m[38;2;000;255;255m [2 x 0.000 ns]
+[0m [38;2;255;255;000m 0.478 ms 0.66% emit_to_module [0m[38;2;000;255;255m [2 x 239.015 us]
+[0m [38;2;255;255;000m 62.460 ms 86.41% compile_module_to_executable[0m[38;2;000;255;255m [2 x 31.230 ms]
+[0m [38;2;173;216;230m 1.526 ms 2.44% eliminate_unused_functions[0m[38;2;000;255;255m [2 x 762.939 us]
+[0m [38;2;173;216;230m 51.956 ms 83.18% global_optimize_module_cpu[0m[38;2;000;255;255m [2 x 25.978 ms]
+[0m [38;2;255;000;255m 2.474 ms 4.76% [Taichi] version 0.9.0, llvm 10.0.0, commit dd7654ae, linux, python 3.8.12
+[Taichi] Starting on arch=x64
+llvm_function_pass [0m[38;2;000;255;255m [2 x 1.237 ms]
+[0m [38;2;255;000;255m 46.998 ms 90.46% llvm_module_pass [0m[38;2;000;255;255m [2 x 23.499 ms]
+[0m [38;2;255;000;255m 2.484 ms 4.78% [unaccounted]
+[0m [38;2;173;216;230m 8.978 ms 14.37% [unaccounted]
+[0m[38;2;000;255;255m>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+[0m
\ No newline at end of file
diff --git a/setup.py b/setup.py
index df883ba3b..484bb2bc0 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
]
project_name = os.getenv('PROJECT_NAME', 'taichi')
@@ -261,7 +262,7 @@ def run(self):
author='Taichi developers',
author_email='yuanmhu@gmail.com',
url='https://github.com/taichi-dev/taichi',
- python_requires=">=3.6,<3.10",
+ python_requires=">=3.6,<3.11",
install_requires=[
'numpy', 'sourceinspect>=0.0.4', 'colorama', 'astor',
'astunparse;python_version<"3.9"'
diff --git a/test_temp/jit/jit_pyinstrument.py b/test_temp/jit/jit_pyinstrument.py
new file mode 100644
index 000000000..bde6706c7
--- /dev/null
+++ b/test_temp/jit/jit_pyinstrument.py
@@ -0,0 +1,22 @@
+import time
+import taichi as ti
+from pyinstrument import Profiler
+from pyinstrument.renderers import ConsoleRenderer
+
+ti.init(arch=ti.cpu)
+a = ti.ndarray(float, 2048*2048)
+
+@ti.kernel
+def fill_half(a: ti.any_arr()):
+ for I in a:
+ a[I] = 0.5
+
+fill_half(a)
+profiler = Profiler()
+profiler.start()
+fill_half(a)
+session = profiler.stop()
+profile_renderer = ConsoleRenderer(unicode=True, color=True, show_all=True)
+print(profile_renderer.render(session))
+
+
diff --git a/test_temp/jit/jit_section.py b/test_temp/jit/jit_section.py
new file mode 100644
index 000000000..26613dd89
--- /dev/null
+++ b/test_temp/jit/jit_section.py
@@ -0,0 +1,21 @@
+import time
+import taichi as ti
+#from pyinstrument import Profiler
+#from pyinstrument.renderers import ConsoleRenderer
+
+from line_profiler import LineProfiler
+
+ti.init(arch=ti.cpu)
+a = ti.ndarray(float, 2048*2048)
+
+@ti.kernel
+def fill_half(a: ti.any_arr()):
+ for I in a:
+ a[I] = 0.5
+
+lprofiler = LineProfiler()
+lprofiler.add_function(ti.lang.kernel_impl.Kernel.func__)
+lp_wrapper = lprofiler(fill_half(a))
+lp_wrapper()
+
+lprofiler.print_stats()
diff --git a/test_temp/jit/jit_test.py b/test_temp/jit/jit_test.py
new file mode 100644
index 000000000..8dc0663f6
--- /dev/null
+++ b/test_temp/jit/jit_test.py
@@ -0,0 +1,28 @@
+import time
+import taichi as ti
+
+print('end2end time')
+
+t_start = time.perf_counter()
+ti.init(arch=ti.cpu)
+t_used = time.perf_counter() - t_start
+print('ti.init time:', "{:.8f}".format(t_used), "s")
+
+a = ti.ndarray(float, 2048*2048)
+
+@ti.kernel
+def fill_half(a: ti.any_arr()):
+ for I in a:
+ a[I] = 0.5
+
+t_start = time.perf_counter()
+fill_half(a)
+t_used = time.perf_counter() - t_start
+print('first execution time:', "{:.8f}".format(t_used), "s")
+
+
+t_start = time.perf_counter()
+fill_half(a)
+t_used = time.perf_counter() - t_start
+print('second execution time:', "{:.8f}".format(t_used), "s")
+
diff --git a/test_temp/jit/mpm88/mpm88_compute.py b/test_temp/jit/mpm88/mpm88_compute.py
new file mode 100644
index 000000000..93bb6680a
--- /dev/null
+++ b/test_temp/jit/mpm88/mpm88_compute.py
@@ -0,0 +1,92 @@
+# MPM-MLS in 88 lines of Taichi code, originally created by @yuanming-hu
+import taichi as ti
+
+ti.init(arch=ti.cpu)
+
+n_particles = 8192
+n_grid = 128
+dx = 1 / n_grid
+dt = 2e-4
+
+p_rho = 1
+p_vol = (dx * 0.5)**2
+p_mass = p_vol * p_rho
+gravity = 9.8
+bound = 3
+E = 400
+
+x = ti.Vector.field(2, float, n_particles)
+v = ti.Vector.field(2, float, n_particles)
+C = ti.Matrix.field(2, 2, float, n_particles)
+J = ti.field(float, n_particles)
+
+grid_v = ti.Vector.field(2, float, (n_grid, n_grid))
+grid_m = ti.field(float, (n_grid, n_grid))
+
+
+@ti.kernel
+def substep():
+ for i, j in grid_m:
+ grid_v[i, j] = [0, 0]
+ grid_m[i, j] = 0
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2
+ affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset - fx) * dx
+ weight = w[i].x * w[j].y
+ grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+ grid_m[base + offset] += weight * p_mass
+ for i, j in grid_m:
+ if grid_m[i, j] > 0:
+ grid_v[i, j] /= grid_m[i, j]
+ grid_v[i, j].y -= dt * gravity
+ if i < bound and grid_v[i, j].x < 0:
+ grid_v[i, j].x = 0
+ if i > n_grid - bound and grid_v[i, j].x > 0:
+ grid_v[i, j].x = 0
+ if j < bound and grid_v[i, j].y < 0:
+ grid_v[i, j].y = 0
+ if j > n_grid - bound and grid_v[i, j].y > 0:
+ grid_v[i, j].y = 0
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ new_v = ti.Vector.zero(float, 2)
+ new_C = ti.Matrix.zero(float, 2, 2)
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset - fx) * dx
+ weight = w[i].x * w[j].y
+ g_v = grid_v[base + offset]
+ new_v += weight * g_v
+ new_C += 4 * weight * g_v.outer_product(dpos) / dx**2
+ v[p] = new_v
+ x[p] += dt * v[p]
+ J[p] *= 1 + dt * new_C.trace()
+ C[p] = new_C
+
+
+@ti.kernel
+def init():
+ for i in range(n_particles):
+ x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2]
+ v[i] = [0, -1]
+ J[i] = 1
+
+
+ti.clear_profile_info()
+init()
+ti.print_profile_info()
+
+substep()
+
+
+substep()
diff --git a/test_temp/jit/mpm88/mpm88_compute_time.py b/test_temp/jit/mpm88/mpm88_compute_time.py
new file mode 100644
index 000000000..833c40194
--- /dev/null
+++ b/test_temp/jit/mpm88/mpm88_compute_time.py
@@ -0,0 +1,98 @@
+# MPM-MLS in 88 lines of Taichi code, originally created by @yuanming-hu
+import time
+import taichi as ti
+
+print ("start ti.init")
+ti.init(arch=ti.cpu, log_level=ti.TRACE)
+print ("finish ti.init")
+
+n_particles = 8192
+n_grid = 128
+dx = 1 / n_grid
+dt = 2e-4
+
+p_rho = 1
+p_vol = (dx * 0.5)**2
+p_mass = p_vol * p_rho
+gravity = 9.8
+bound = 3
+E = 400
+
+x = ti.Vector.field(2, float, n_particles)
+v = ti.Vector.field(2, float, n_particles)
+C = ti.Matrix.field(2, 2, float, n_particles)
+J = ti.field(float, n_particles)
+
+grid_v = ti.Vector.field(2, float, (n_grid, n_grid))
+grid_m = ti.field(float, (n_grid, n_grid))
+
+
+@ti.kernel
+def substep():
+ for i, j in grid_m:
+ grid_v[i, j] = [0, 0]
+ grid_m[i, j] = 0
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2
+ affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset - fx) * dx
+ weight = w[i].x * w[j].y
+ grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+ grid_m[base + offset] += weight * p_mass
+ for i, j in grid_m:
+ if grid_m[i, j] > 0:
+ grid_v[i, j] /= grid_m[i, j]
+ grid_v[i, j].y -= dt * gravity
+ if i < bound and grid_v[i, j].x < 0:
+ grid_v[i, j].x = 0
+ if i > n_grid - bound and grid_v[i, j].x > 0:
+ grid_v[i, j].x = 0
+ if j < bound and grid_v[i, j].y < 0:
+ grid_v[i, j].y = 0
+ if j > n_grid - bound and grid_v[i, j].y > 0:
+ grid_v[i, j].y = 0
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ new_v = ti.Vector.zero(float, 2)
+ new_C = ti.Matrix.zero(float, 2, 2)
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset - fx) * dx
+ weight = w[i].x * w[j].y
+ g_v = grid_v[base + offset]
+ new_v += weight * g_v
+ new_C += 4 * weight * g_v.outer_product(dpos) / dx**2
+ v[p] = new_v
+ x[p] += dt * v[p]
+ J[p] *= 1 + dt * new_C.trace()
+ C[p] = new_C
+
+
+@ti.kernel
+def init():
+ for i in range(n_particles):
+ x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2]
+ v[i] = [0, -1]
+ J[i] = 1
+
+
+print ("start init")
+init()
+print ("finish init")
+
+print ("start substep")
+substep()
+print ("finish substep")
+
+print ("start substep")
+substep()
+print ("finish substep")
diff --git a/test_temp/jit/mpm88/trace_with_details.txt b/test_temp/jit/mpm88/trace_with_details.txt
new file mode 100644
index 000000000..0f4785e89
--- /dev/null
+++ b/test_temp/jit/mpm88/trace_with_details.txt
@@ -0,0 +1,111 @@
+[T 02/22/22 13:49:12.586 14398] [program.cpp:Program@51] Program initializing...
+[T 02/22/22 13:49:12.586 14398] [snode_tree_buffer_manager.cpp:SNodeTreeBufferManager@11] SNode tree buffer manager created.
+[T 02/22/22 13:49:12.586 14398] [llvm_context.cpp:TaichiLLVMContext@63] Creating Taichi llvm context for arch: x64
+[T 02/22/22 13:49:12.586 14398] [llvm_context.cpp:get_this_thread_data@649] Creating thread local data for thread 140442101912512
+[T 02/22/22 13:49:12.586 14398] [llvm_context.cpp:TaichiLLVMContext@97] Taichi llvm context created.
+[T 02/22/22 13:49:12.586 14398] [memory_pool.cpp:MemoryPool@13] Memory pool created. Default buffer size per allocator = 1024 MB
+[T 02/22/22 13:49:12.681 14398] [program.cpp:Program@159] Program (0x55f9c8bb44d0) arch=x64 initialized.
+[T 02/22/22 13:49:12.681 14398] [misc.py:init@347] Materializing runtime...
+[T 02/22/22 13:49:12.681 14398] [unified_allocator.cpp:UnifiedAllocator@45] Memory allocated. Allocation time = 4.05e-06 s
+[T 02/22/22 13:49:12.681 14398] [unified_allocator.h:allocate@39] UM [data=140436605714432] allocate() request=256 remain=1073741824
+[T 02/22/22 13:49:12.681 14398] [llvm_program.cpp:materialize_runtime@394] Allocating 16 random states (used by CUDA only)
+[T 02/22/22 13:49:12.691 14398] [unified_allocator.h:allocate@39] UM [data=140436605714432] allocate() request=35256 remain=1073741568
+[T 02/22/22 13:49:12.691 14398] [unified_allocator.h:allocate@39] UM [data=140436605714432] allocate() request=2097160 remain=1073706312
+[T 02/22/22 13:49:12.691 14398] [unified_allocator.h:allocate@39] UM [data=140436605714432] allocate() request=1048576 remain=1071607800
+[T 02/22/22 13:49:12.691 14398] [unified_allocator.h:allocate@39] UM [data=140436605714432] allocate() request=320 remain=1070555136
+[T 02/22/22 13:49:12.691 14398] [llvm_program.cpp:materialize_runtime@403] LLVMRuntime initialized (excluding `root`)
+[T 02/22/22 13:49:12.691 14398] [llvm_program.cpp:materialize_runtime@406] LLVMRuntime pointer fetched
+[T 02/22/22 13:49:12.691 14398] [hacked_signal_handler.cpp:HackedSignalRegister@62] Taichi signal handlers registered. Thread ID = 14398
+[T 02/22/22 13:49:12.702 14398] [llvm_program.cpp:initialize_llvm_runtime_snodes@183] Allocating data structure of size 491520 bytes
+[T 02/22/22 13:49:12.702 14398] [snode_tree_buffer_manager.cpp:allocate@44] allocating memory for SNode Tree 0
+[T 02/22/22 13:49:12.702 14398] [unified_allocator.h:allocate@39] UM [data=140436605714432] allocate() request=491520 remain=1070554816
+[T 02/22/22 13:49:12.702 14398] [kernel_impl.py:materialize@441] Compiling kernel init_c58_0...
+[T 02/22/22 13:49:12.703 14398] [kernel_impl.py:materialize@476] [ JIT] creating kernel
+[T 02/22/22 13:49:12.703 14398] [program.h:kernel@186] [ JIT] make unique Kernel
+[T 02/22/22 13:49:12.703 14398] [kernel.cpp:Kernel@35] [ JIT] Kernel ctor 2
+[T 02/22/22 13:49:12.703 14398] [kernel.cpp:init@394] [ JIT] Kernel init
+[T 02/22/22 13:49:12.703 14398] [kernel.cpp:init@418] [ JIT] pos2
+[T 02/22/22 13:49:12.704 14398] [kernel.cpp:init@428] [ JIT] call compile()
+[T 02/22/22 13:49:12.704 14398] [kernel_impl.py:materialize@480] [ JIT] materialize rest] 0.0016105850008898415
+[T 02/22/22 13:49:12.704 14398] [kernel_impl.py:ensure_compiled@661] [ JIT] materialize()] 0.01229692600099952
+[T 02/22/22 13:49:12.704 14398] [kernel_impl.py:__call__@677] [ JIT]------------]--------
+[T 02/22/22 13:49:12.704 14398] [kernel_impl.py:func__@622] [ JIT] t_kernel(launch_ctx)
+[T 02/22/22 13:49:12.704 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:12.704 14398] [llvm_program.cpp:compile@134] [ JIT] LlvmProgramImpl::compile
+[T 02/22/22 13:49:12.706 14398] [llvm_program.cpp:compile@137] [ JIT] kernel lower
+[T 02/22/22 13:49:12.706 14398] [llvm_program.cpp:compile@140] [ JIT] kernel return codegen
+[T 02/22/22 13:49:12.730 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel init_c58_0_kernel
+[T 02/22/22 13:49:12.730 14398] [kernel_impl.py:__call__@680] [ JIT]------------] 0.02549659599935694
+[T 02/22/22 13:49:12.730 14398] [kernel_impl.py:materialize@441] Compiling kernel substep_c56_0...
+[T 02/22/22 13:49:12.732 14398] [kernel_impl.py:materialize@476] [ JIT] creating kernel
+[T 02/22/22 13:49:12.732 14398] [program.h:kernel@186] [ JIT] make unique Kernel
+[T 02/22/22 13:49:12.732 14398] [kernel.cpp:Kernel@35] [ JIT] Kernel ctor 2
+[T 02/22/22 13:49:12.732 14398] [kernel.cpp:init@394] [ JIT] Kernel init
+[T 02/22/22 13:49:12.732 14398] [kernel.cpp:init@418] [ JIT] pos2
+[T 02/22/22 13:49:12.808 14398] [kernel.cpp:init@428] [ JIT] call compile()
+[T 02/22/22 13:49:12.808 14398] [kernel_impl.py:materialize@480] [ JIT] materialize rest] 0.07633498200084432
+[T 02/22/22 13:49:12.809 14398] [kernel_impl.py:ensure_compiled@661] [ JIT] materialize()] 0.07872364200011361
+[T 02/22/22 13:49:12.809 14398] [kernel_impl.py:__call__@677] [ JIT]------------]--------
+[T 02/22/22 13:49:12.809 14398] [kernel_impl.py:func__@622] [ JIT] t_kernel(launch_ctx)
+[T 02/22/22 13:49:12.809 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:12.809 14398] [llvm_program.cpp:compile@134] [ JIT] LlvmProgramImpl::compile
+[T 02/22/22 13:49:12.828 14398] [kernel.cpp:Kernel@27] [ JIT] Kernel ctor 1
+[T 02/22/22 13:49:12.828 14398] [kernel.cpp:init@394] [ JIT] Kernel init
+[T 02/22/22 13:49:12.828 14398] [kernel.cpp:init@418] [ JIT] pos2
+[T 02/22/22 13:49:12.828 14398] [kernel.cpp:init@428] [ JIT] call compile()
+[T 02/22/22 13:49:12.828 14398] [constant_fold.cpp:get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=12397727920192291077
+[T 02/22/22 13:49:12.828 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:12.828 14398] [llvm_program.cpp:compile@134] [ JIT] LlvmProgramImpl::compile
+[T 02/22/22 13:49:12.828 14398] [llvm_program.cpp:compile@137] [ JIT] kernel lower
+[T 02/22/22 13:49:12.828 14398] [llvm_program.cpp:compile@140] [ JIT] kernel return codegen
+[T 02/22/22 13:49:12.838 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_0_kernel
+[T 02/22/22 13:49:12.838 14398] [kernel.cpp:Kernel@27] [ JIT] Kernel ctor 1
+[T 02/22/22 13:49:12.838 14398] [kernel.cpp:init@394] [ JIT] Kernel init
+[T 02/22/22 13:49:12.838 14398] [kernel.cpp:init@418] [ JIT] pos2
+[T 02/22/22 13:49:12.838 14398] [kernel.cpp:init@428] [ JIT] call compile()
+[T 02/22/22 13:49:12.838 14398] [constant_fold.cpp:get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=12397727918246134016
+[T 02/22/22 13:49:12.838 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:12.838 14398] [llvm_program.cpp:compile@134] [ JIT] LlvmProgramImpl::compile
+[T 02/22/22 13:49:12.838 14398] [llvm_program.cpp:compile@137] [ JIT] kernel lower
+[T 02/22/22 13:49:12.838 14398] [llvm_program.cpp:compile@140] [ JIT] kernel return codegen
+[T 02/22/22 13:49:12.847 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_1_kernel
+[T 02/22/22 13:49:13.014 14398] [kernel.cpp:Kernel@27] [ JIT] Kernel ctor 1
+[T 02/22/22 13:49:13.014 14398] [kernel.cpp:init@394] [ JIT] Kernel init
+[T 02/22/22 13:49:13.014 14398] [kernel.cpp:init@418] [ JIT] pos2
+[T 02/22/22 13:49:13.014 14398] [kernel.cpp:init@428] [ JIT] call compile()
+[T 02/22/22 13:49:13.014 14398] [constant_fold.cpp:get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=12397727920259663113
+[T 02/22/22 13:49:13.014 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.014 14398] [llvm_program.cpp:compile@134] [ JIT] LlvmProgramImpl::compile
+[T 02/22/22 13:49:13.015 14398] [llvm_program.cpp:compile@137] [ JIT] kernel lower
+[T 02/22/22 13:49:13.015 14398] [llvm_program.cpp:compile@140] [ JIT] kernel return codegen
+[T 02/22/22 13:49:13.023 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 13:49:13.023 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.023 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 13:49:13.023 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.023 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 13:49:13.023 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.023 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 13:49:13.023 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.023 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 13:49:13.023 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.023 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 13:49:13.026 14398] [llvm_program.cpp:compile@137] [ JIT] kernel lower
+[T 02/22/22 13:49:13.026 14398] [llvm_program.cpp:compile@140] [ JIT] kernel return codegen
+[T 02/22/22 13:49:13.074 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel substep_c56_0_kernel
+[T 02/22/22 13:49:13.074 14398] [kernel_impl.py:__call__@680] [ JIT]------------] 0.2657265729994833
+[T 02/22/22 13:49:13.074 14398] [kernel_impl.py:materialize@434] [ JIT] compiled, returned
+[T 02/22/22 13:49:13.075 14398] [kernel_impl.py:ensure_compiled@661] [ JIT] materialize()] 4.139199882047251e-05
+[T 02/22/22 13:49:13.075 14398] [kernel_impl.py:__call__@677] [ JIT]------------]--------
+[T 02/22/22 13:49:13.075 14398] [kernel_impl.py:func__@622] [ JIT] t_kernel(launch_ctx)
+[T 02/22/22 13:49:13.075 14398] [kernel.cpp:operator()@109] [ JIT] Kernel::operator()
+[T 02/22/22 13:49:13.075 14398] [codegen_llvm.cpp:operator()@2246] Launching kernel substep_c56_0_kernel
+[T 02/22/22 13:49:13.075 14398] [kernel_impl.py:__call__@680] [ JIT]------------] 0.0006230629987840075
+[Taichi] version 0.9.0, llvm 10.0.0, commit dd7654ae, linux, python 3.8.12
+[Taichi] Starting on arch=x64
+ti init time: 0.10585920 s
+init() time: 0.03791171 s
+>>>>>>> 1 substep() time: 0.34459417 s
+2 substep() time: 0.00072633 s
+[T 02/22/22 13:49:13.105 14398] [program.cpp:finalize@467] Program finalizing...
+[T 02/22/22 13:49:13.106 14398] [program.cpp:finalize@515] Program (0x55f9c8bb44d0) finalized_.
+[T 02/22/22 13:49:13.110 14398] [hacked_signal_handler.cpp:~HackedSignalRegister@81] Taichi signal handlers unregistered. Thread ID = 14398
diff --git a/test_temp/jit/mpm88/trace_with_step_time.txt b/test_temp/jit/mpm88/trace_with_step_time.txt
new file mode 100644
index 000000000..1abe6abca
--- /dev/null
+++ b/test_temp/jit/mpm88/trace_with_step_time.txt
@@ -0,0 +1,46 @@
+[T 02/22/22 10:43:31.687 4690] [program.cpp:Program@51] Program initializing...
+[T 02/22/22 10:43:31.687 4690] [snode_tree_buffer_manager.cpp:SNodeTreeBufferManager@11] SNode tree buffer manager created.
+[T 02/22/22 10:43:31.687 4690] [llvm_context.cpp:TaichiLLVMContext@63] Creating Taichi llvm context for arch: x64
+[T 02/22/22 10:43:31.687 4690] [llvm_context.cpp:get_this_thread_data@649] Creating thread local data for thread 140328470250432
+[T 02/22/22 10:43:31.687 4690] [llvm_context.cpp:TaichiLLVMContext@97] Taichi llvm context created.
+[T 02/22/22 10:43:31.687 4690] [memory_pool.cpp:MemoryPool@13] Memory pool created. Default buffer size per allocator = 1024 MB
+[T 02/22/22 10:43:31.782 4690] [program.cpp:Program@159] Program (0x55bbab715710) arch=x64 initialized.
+[T 02/22/22 10:43:31.782 4690] [misc.py:init@347] Materializing runtime...
+[T 02/22/22 10:43:31.782 4690] [unified_allocator.cpp:UnifiedAllocator@45] Memory allocated. Allocation time = 3.81e-06 s
+[T 02/22/22 10:43:31.782 4690] [unified_allocator.h:allocate@39] UM [data=140322973622272] allocate() request=256 remain=1073741824
+[T 02/22/22 10:43:31.782 4690] [llvm_program.cpp:materialize_runtime@391] Allocating 16 random states (used by CUDA only)
+[T 02/22/22 10:43:31.792 4690] [unified_allocator.h:allocate@39] UM [data=140322973622272] allocate() request=35256 remain=1073741568
+[T 02/22/22 10:43:31.792 4690] [unified_allocator.h:allocate@39] UM [data=140322973622272] allocate() request=2097160 remain=1073706312
+[T 02/22/22 10:43:31.792 4690] [unified_allocator.h:allocate@39] UM [data=140322973622272] allocate() request=1048576 remain=1071607800
+[T 02/22/22 10:43:31.792 4690] [unified_allocator.h:allocate@39] UM [data=140322973622272] allocate() request=320 remain=1070555136
+[T 02/22/22 10:43:31.792 4690] [llvm_program.cpp:materialize_runtime@400] LLVMRuntime initialized (excluding `root`)
+[T 02/22/22 10:43:31.792 4690] [llvm_program.cpp:materialize_runtime@403] LLVMRuntime pointer fetched
+[T 02/22/22 10:43:31.792 4690] [hacked_signal_handler.cpp:HackedSignalRegister@62] Taichi signal handlers registered. Thread ID = 4690
+[T 02/22/22 10:43:31.802 4690] [llvm_program.cpp:initialize_llvm_runtime_snodes@180] Allocating data structure of size 491520 bytes
+[T 02/22/22 10:43:31.802 4690] [snode_tree_buffer_manager.cpp:allocate@44] allocating memory for SNode Tree 0
+[T 02/22/22 10:43:31.802 4690] [unified_allocator.h:allocate@39] UM [data=140322973622272] allocate() request=491520 remain=1070554816
+[T 02/22/22 10:43:31.802 4690] [kernel_impl.py:materialize@437] Compiling kernel init_c58_0...
+[T 02/22/22 10:43:31.829 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel init_c58_0_kernel
+[T 02/22/22 10:43:31.829 4690] [kernel_impl.py:materialize@437] Compiling kernel substep_c56_0...
+[T 02/22/22 10:43:31.928 4690] [constant_fold.cpp:get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=16201203073215955205
+[T 02/22/22 10:43:31.938 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_0_kernel
+[T 02/22/22 10:43:31.938 4690] [constant_fold.cpp:get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=16201203071269798144
+[T 02/22/22 10:43:31.948 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_1_kernel
+[T 02/22/22 10:43:32.111 4690] [constant_fold.cpp:get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=16201203073283327241
+[T 02/22/22 10:43:32.120 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 10:43:32.120 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 10:43:32.120 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 10:43:32.120 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 10:43:32.120 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 10:43:32.120 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel jit_evaluator_2_kernel
+[T 02/22/22 10:43:32.171 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel substep_c56_0_kernel
+[T 02/22/22 10:43:32.172 4690] [codegen_llvm.cpp:operator()@2246] Launching kernel substep_c56_0_kernel
+[Taichi] version 0.9.0, llvm 10.0.0, commit dd7654ae, linux, python 3.8.12
+[Taichi] Starting on arch=x64
+ti init time: 0.10571943 s
+init() time: 0.03646100 s
+>>>>>>> 1 substep() time: 0.34291046 s
+2 substep() time: 0.00053424 s
+[T 02/22/22 10:43:32.202 4690] [program.cpp:finalize@467] Program finalizing...
+[T 02/22/22 10:43:32.203 4690] [program.cpp:finalize@515] Program (0x55bbab715710) finalized_.
+[T 02/22/22 10:43:32.207 4690] [hacked_signal_handler.cpp:~HackedSignalRegister@81] Taichi signal handlers unregistered. Thread ID = 4690
diff --git a/test_temp/jit/r000hs/.norun b/test_temp/jit/r000hs/.norun
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_temp/jit/r000hs/archive/binaries/[vdso]/0e58e4148bc0ce4e23cb901a87f22850/[vdso] b/test_temp/jit/r000hs/archive/binaries/[vdso]/0e58e4148bc0ce4e23cb901a87f22850/[vdso]
new file mode 100644
index 000000000..4b5b25278
Binary files /dev/null and b/test_temp/jit/r000hs/archive/binaries/[vdso]/0e58e4148bc0ce4e23cb901a87f22850/[vdso] differ
diff --git a/test_temp/jit/r000hs/archive/binaries/[vdso]/8de1bac5042ec43d09faecb1f06f864b/[vdso] b/test_temp/jit/r000hs/archive/binaries/[vdso]/8de1bac5042ec43d09faecb1f06f864b/[vdso]
new file mode 100644
index 000000000..11d8fdb44
Binary files /dev/null and b/test_temp/jit/r000hs/archive/binaries/[vdso]/8de1bac5042ec43d09faecb1f06f864b/[vdso] differ
diff --git a/test_temp/jit/r000hs/config/analysis_type.cfg b/test_temp/jit/r000hs/config/analysis_type.cfg
new file mode 100644
index 000000000..b46ea5487
--- /dev/null
+++ b/test_temp/jit/r000hs/config/analysis_type.cfg
@@ -0,0 +1,249 @@
+
+
+
+ %HotspotsAtypeName
+ %HotspotsAtypeShortName
+ %HotspotsAtypeDescription
+ %HotspotsAtypeShortDescription
+ hs
+ hotspots
+ 1
+ 1
+ fire solid
+ configs.analysis_type-hotspots_f1101
+
+
+
+
+
+
+ %HotspotsByCPUUsageViewpointName
+
+
+ %HotspotsByCPUUsageViewpointName
+
+
+
+
+
+ config://analysis_type/include/knobs.xsl?
+
+
+ &
+
+
+
+
+
+
+ %SamplingModeDescription
+
+ hw
+ hw
+
+
+
+
+
+ false
+
+ %EnableStackCollectionDescription
+ false
+
+
+
+
+ hotspotsGroup
+
+
+ %SamplingModeDescription
+
+ sw
+ hw
+ sw
+
+
+
+
+
+ false
+
+ %EnableStackCollectionDescription
+ false
+
+
+
+
+
+
+
+ %SlowGoodFrameThresholdDescription
+ 0.01
+ 1024000
+ 40
+
+
+ %GoodFastFrameThresholdDescription
+ 0.01
+ 1024000
+ 100
+
+
+
+ true
+ %EnableCharacterizationInsightsDescription
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ stack
+ 10
+ true
+
+
+
+ java,dotnet,python
+
+
+ java,python
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,
+
+
+
+
+
+ ,
+
+
+
+
+
+ ,
+
+
+
+
+
+
+
+
+
+
+ true
+ true
+
+
+ true
+
+
+
+
+
+ true
+
+
+ runsa
+
+
+
+
+
+
+ true
+ all
+
+
+
+
+ false
+ true
+
+
+
+
+
+
+ ,
+
+
+
+
+
+ ,
+
+
+
+
+
+
+ ,
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+
+
+ true
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/config/collection.cfg b/test_temp/jit/r000hs/config/collection.cfg
new file mode 100644
index 000000000..6292189c4
--- /dev/null
+++ b/test_temp/jit/r000hs/config/collection.cfg
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/config/context_values.cfg b/test_temp/jit/r000hs/config/context_values.cfg
new file mode 100644
index 000000000..167a3c97e
--- /dev/null
+++ b/test_temp/jit/r000hs/config/context_values.cfg
@@ -0,0 +1,358 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/config/log.cfg b/test_temp/jit/r000hs/config/log.cfg
new file mode 100644
index 000000000..f4622dd03
--- /dev/null
+++ b/test_temp/jit/r000hs/config/log.cfg
@@ -0,0 +1,36 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/config/runss.options b/test_temp/jit/r000hs/config/runss.options
new file mode 100644
index 000000000..79d5bbbb2
--- /dev/null
+++ b/test_temp/jit/r000hs/config/runss.options
@@ -0,0 +1,18 @@
+-r
+/home/qiao/Taichi/taichi-qiao/test_temp/jit/r000hs
+--stack-stitching
+--data-limit-mb=1000
+--disk-space-limit=0
+--mrte-type=java,python
+--stack-unwind-limit=8388608
+--itt-config=frame
+--itt-config=task,event,counter
+--stackwalk=offline
+--mrte-mode=auto
+--type=cpu:counters:nostack
+--type=cpu:stack
+--interval=10
+--
+python
+mpm88_compute.py
+1000
diff --git a/test_temp/jit/r000hs/config/search_dir.cfg b/test_temp/jit/r000hs/config/search_dir.cfg
new file mode 100644
index 000000000..eaa5edc64
--- /dev/null
+++ b/test_temp/jit/r000hs/config/search_dir.cfg
@@ -0,0 +1,19 @@
+
+
+
+
+
+ 1
+
+
+ 2
+
+
+ 3
+
+
+ 4
+
+
+
+
diff --git a/test_temp/jit/r000hs/config/state.cfg b/test_temp/jit/r000hs/config/state.cfg
new file mode 100644
index 000000000..18a046704
--- /dev/null
+++ b/test_temp/jit/r000hs/config/state.cfg
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601702.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601702.0.th
new file mode 100644
index 000000000..365334e57
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601702.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601716.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601716.0.th
new file mode 100644
index 000000000..fd23c1975
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601716.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601746.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601746.0.th
new file mode 100644
index 000000000..56f8f8f2b
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601746.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601747.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601747.0.th
new file mode 100644
index 000000000..897c471ce
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601747.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601748.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601748.0.th
new file mode 100644
index 000000000..54f49fa56
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601748.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601749.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601749.0.th
new file mode 100644
index 000000000..7eb622d99
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601749.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601750.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601750.0.th
new file mode 100644
index 000000000..27b25cfea
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601750.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601751.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601751.0.th
new file mode 100644
index 000000000..0d1177804
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601751.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601752.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601752.0.th
new file mode 100644
index 000000000..d5e3b31cb
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601752.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601753.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601753.0.th
new file mode 100644
index 000000000..63bda20d4
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601753.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601754.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601754.0.th
new file mode 100644
index 000000000..836f7a9ed
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601754.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601755.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601755.0.th
new file mode 100644
index 000000000..7ea535d9b
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601755.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601756.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601756.0.th
new file mode 100644
index 000000000..8204f860e
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601756.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601757.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601757.0.th
new file mode 100644
index 000000000..bd38992d2
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601757.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601758.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601758.0.th
new file mode 100644
index 000000000..093ec0ef0
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601758.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601759.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601759.0.th
new file mode 100644
index 000000000..565cddbf3
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601759.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601760.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601760.0.th
new file mode 100644
index 000000000..98ec1c65c
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601760.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601761.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601761.0.th
new file mode 100644
index 000000000..229ee182a
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601761.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601762.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601762.0.th
new file mode 100644
index 000000000..1be108368
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601762.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601763.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601763.0.th
new file mode 100644
index 000000000..cec933212
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601763.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601764.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601764.0.th
new file mode 100644
index 000000000..31a59bc3d
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601764.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601765.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601765.0.th
new file mode 100644
index 000000000..7df872afc
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601765.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601766.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601766.0.th
new file mode 100644
index 000000000..cb2af8235
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601766.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601767.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601767.0.th
new file mode 100644
index 000000000..9895c53b3
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601767.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601768.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601768.0.th
new file mode 100644
index 000000000..0f63ffc3b
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601768.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601769.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601769.0.th
new file mode 100644
index 000000000..2a6b53812
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601769.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601770.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601770.0.th
new file mode 100644
index 000000000..9e98be026
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601770.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601771.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601771.0.th
new file mode 100644
index 000000000..404d9d5b7
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601771.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601772.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601772.0.th
new file mode 100644
index 000000000..b2334f40f
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601772.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601773.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601773.0.th
new file mode 100644
index 000000000..e425db0ee
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601773.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601774.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601774.0.th
new file mode 100644
index 000000000..98932205e
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601774.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601775.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601775.0.th
new file mode 100644
index 000000000..16c06d63d
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601775.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601776.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601776.0.th
new file mode 100644
index 000000000..e605012d7
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601776.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601777.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601777.0.th
new file mode 100644
index 000000000..dec578e11
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601777.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601778.0.th b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601778.0.th
new file mode 100644
index 000000000..074bc0bde
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0-2601778.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0.pytrace b/test_temp/jit/r000hs/data.0/2601697-2601702.0.pytrace
new file mode 100644
index 000000000..dee4b7ed7
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0.pytrace differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0.pytrace-sym b/test_temp/jit/r000hs/data.0/2601697-2601702.0.pytrace-sym
new file mode 100644
index 000000000..9a65f0fdb
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0.pytrace-sym differ
diff --git a/test_temp/jit/r000hs/data.0/2601697-2601702.0.trace b/test_temp/jit/r000hs/data.0/2601697-2601702.0.trace
new file mode 100644
index 000000000..69691220d
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601697-2601702.0.trace differ
diff --git a/test_temp/jit/r000hs/data.0/2601702-2601717.0-2601717.0.th b/test_temp/jit/r000hs/data.0/2601702-2601717.0-2601717.0.th
new file mode 100644
index 000000000..9944a43aa
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601702-2601717.0-2601717.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601702-2601717.0.trace b/test_temp/jit/r000hs/data.0/2601702-2601717.0.trace
new file mode 100644
index 000000000..52dc0a553
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601702-2601717.0.trace differ
diff --git a/test_temp/jit/r000hs/data.0/2601702-2601717.1-2601717.0.th b/test_temp/jit/r000hs/data.0/2601702-2601717.1-2601717.0.th
new file mode 100644
index 000000000..43f11e3a1
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601702-2601717.1-2601717.0.th differ
diff --git a/test_temp/jit/r000hs/data.0/2601702-2601717.1.trace b/test_temp/jit/r000hs/data.0/2601702-2601717.1.trace
new file mode 100644
index 000000000..8e9ea30f8
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/2601702-2601717.1.trace differ
diff --git a/test_temp/jit/r000hs/data.0/pc.2601702.jit b/test_temp/jit/r000hs/data.0/pc.2601702.jit
new file mode 100644
index 000000000..a649219d6
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/pc.2601702.jit differ
diff --git a/test_temp/jit/r000hs/data.0/systemcollector-2601697-pc.sc b/test_temp/jit/r000hs/data.0/systemcollector-2601697-pc.sc
new file mode 100644
index 000000000..0df615f0c
Binary files /dev/null and b/test_temp/jit/r000hs/data.0/systemcollector-2601697-pc.sc differ
diff --git a/test_temp/jit/r000hs/r000hs.vtune b/test_temp/jit/r000hs/r000hs.vtune
new file mode 100644
index 000000000..fa6669349
--- /dev/null
+++ b/test_temp/jit/r000hs/r000hs.vtune
@@ -0,0 +1,24 @@
+
+
+
+ 1645496383
+ pc
+ linux
+ Intel® VTune™ Profiler 2022.0.0
+ 621730
+ 16
+ 8
+ 1
+ 3500000000
+ 6
+ 167
+ 1
+ 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz
+ avx512
+ 1645496383
+ 1645496391
+
+ hs
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/container.metadata b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/container.metadata
new file mode 100644
index 000000000..ee892ce7d
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/container.metadata
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/0 b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/0
new file mode 100644
index 000000000..6e448db5e
Binary files /dev/null and b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/0 differ
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/mapping.xml b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/mapping.xml
new file mode 100644
index 000000000..6d50be211
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/container.metadata b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/container.metadata
new file mode 100644
index 000000000..8eab066e4
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/0 b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/0
new file mode 100644
index 000000000..0760b6337
Binary files /dev/null and b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/0 differ
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/mapping.xml b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/mapping.xml
new file mode 100644
index 000000000..6d50be211
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/container.metadata b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/container.metadata
new file mode 100644
index 000000000..db418f330
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/0 b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/0
new file mode 100644
index 000000000..8c32cf2a5
Binary files /dev/null and b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/0 differ
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/mapping.xml b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/mapping.xml
new file mode 100644
index 000000000..6d50be211
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_sched_and_counter_metrics/container.metadata b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_sched_and_counter_metrics/container.metadata
new file mode 100644
index 000000000..cd56e726d
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/_cache_grouper_data4_sched_and_counter_metrics/container.metadata
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/dicer.db b/test_temp/jit/r000hs/sqlite-db/dicer.db
new file mode 100644
index 000000000..c0380b481
Binary files /dev/null and b/test_temp/jit/r000hs/sqlite-db/dicer.db differ
diff --git a/test_temp/jit/r000hs/sqlite-db/grouper.metadata b/test_temp/jit/r000hs/sqlite-db/grouper.metadata
new file mode 100644
index 000000000..c63d79a20
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/grouper.metadata
@@ -0,0 +1,82 @@
+
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+ time
+
+
+
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/offload_region_data/container.metadata b/test_temp/jit/r000hs/sqlite-db/offload_region_data/container.metadata
new file mode 100644
index 000000000..94869b4f7
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/offload_region_data/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/offload_region_operation_data/container.metadata b/test_temp/jit/r000hs/sqlite-db/offload_region_operation_data/container.metadata
new file mode 100644
index 000000000..f6c7b7410
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/offload_region_operation_data/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-645333858/instance/0/0 b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-645333858/instance/0/0
new file mode 100644
index 000000000..715454072
Binary files /dev/null and b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-645333858/instance/0/0 differ
diff --git a/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-645333858/instance/0/mapping.xml b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-645333858/instance/0/mapping.xml
new file mode 100644
index 000000000..fe912f504
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-645333858/instance/0/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-97516135/aggregated/0/0 b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-97516135/aggregated/0/0
new file mode 100644
index 000000000..76fa95772
Binary files /dev/null and b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-97516135/aggregated/0/0 differ
diff --git a/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-97516135/aggregated/0/mapping.xml b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-97516135/aggregated/0/mapping.xml
new file mode 100644
index 000000000..34890bf86
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/timelinedb/dbint-97516135/aggregated/0/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/jit/r000hs/sqlite-db/timelinedb/directory.xml b/test_temp/jit/r000hs/sqlite-db/timelinedb/directory.xml
new file mode 100644
index 000000000..d5bca712e
--- /dev/null
+++ b/test_temp/jit/r000hs/sqlite-db/timelinedb/directory.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/test_temp/jit/test_lineprofiler.py b/test_temp/jit/test_lineprofiler.py
new file mode 100644
index 000000000..f8c34a5d2
--- /dev/null
+++ b/test_temp/jit/test_lineprofiler.py
@@ -0,0 +1,37 @@
+import time
+import random
+
+def very_slow_random_generator():
+ time.sleep(5)
+ arr = [random.randint(1,100) for i in range(100000)]
+ return sum(arr) / len(arr)
+
+def slow_random_generator():
+ time.sleep(2)
+ arr = [random.randint(1,100) for i in range(100000)]
+ return sum(arr) / len(arr)
+
+def fast_random_generator():
+ time.sleep(1)
+ arr = [random.randint(1,100) for i in range(100000)]
+ return sum(arr) / len(arr)
+
+def main_func():
+ result = fast_random_generator()
+ print(result)
+
+ result = slow_random_generator()
+ print(result)
+
+ result = very_slow_random_generator()
+ print(result)
+
+from line_profiler import LineProfiler
+
+lprofiler = LineProfiler()
+
+lp_wrapper = lprofiler(main_func)
+
+lp_wrapper()
+
+lprofiler.print_stats()
diff --git a/test_temp/others/keep_running.py b/test_temp/others/keep_running.py
new file mode 100644
index 000000000..06d62ffde
--- /dev/null
+++ b/test_temp/others/keep_running.py
@@ -0,0 +1,2 @@
+for i in range(10):
+ exec(open("jit_test.py").read())
diff --git a/test_temp/others/ndarray_fill_test.py b/test_temp/others/ndarray_fill_test.py
new file mode 100644
index 000000000..4d43bdd7e
--- /dev/null
+++ b/test_temp/others/ndarray_fill_test.py
@@ -0,0 +1,13 @@
+import taichi as ti
+
+ti.init(arch=ti.cuda, log_level=ti.TRACE)
+#ti.init(arch=ti.cuda)
+
+a = ti.ndarray(float, 16)
+
+@ti.kernel
+def p():
+ print('p')
+
+
+p()
diff --git a/test_temp/others/numpy_test.py b/test_temp/others/numpy_test.py
new file mode 100644
index 000000000..06868ed50
--- /dev/null
+++ b/test_temp/others/numpy_test.py
@@ -0,0 +1,19 @@
+import taichi as ti
+import time
+
+ti.init(arch=ti.cuda)
+
+print("test to numpy")
+N = 2048*2048
+
+a = ti.ndarray(ti.f32, N)
+a.to_numpy
+
+iterations = 100000
+t_start = time.perf_counter()
+for i in range(iterations):
+ a.to_numpy
+t_used = time.perf_counter() - t_start
+print('total time:', "{:.3f}".format(t_used*1000), "ms")
+
+
diff --git a/test_temp/others/r000hs/.norun b/test_temp/others/r000hs/.norun
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_temp/others/r000hs/archive/binaries/[vdso]/0e58e4148bc0ce4e23cb901a87f22850/[vdso] b/test_temp/others/r000hs/archive/binaries/[vdso]/0e58e4148bc0ce4e23cb901a87f22850/[vdso]
new file mode 100644
index 000000000..4b5b25278
Binary files /dev/null and b/test_temp/others/r000hs/archive/binaries/[vdso]/0e58e4148bc0ce4e23cb901a87f22850/[vdso] differ
diff --git a/test_temp/others/r000hs/archive/binaries/[vdso]/8de1bac5042ec43d09faecb1f06f864b/[vdso] b/test_temp/others/r000hs/archive/binaries/[vdso]/8de1bac5042ec43d09faecb1f06f864b/[vdso]
new file mode 100644
index 000000000..11d8fdb44
Binary files /dev/null and b/test_temp/others/r000hs/archive/binaries/[vdso]/8de1bac5042ec43d09faecb1f06f864b/[vdso] differ
diff --git a/test_temp/others/r000hs/config/analysis_type.cfg b/test_temp/others/r000hs/config/analysis_type.cfg
new file mode 100644
index 000000000..b46ea5487
--- /dev/null
+++ b/test_temp/others/r000hs/config/analysis_type.cfg
@@ -0,0 +1,249 @@
+
+
+
+ %HotspotsAtypeName
+ %HotspotsAtypeShortName
+ %HotspotsAtypeDescription
+ %HotspotsAtypeShortDescription
+ hs
+ hotspots
+ 1
+ 1
+ fire solid
+ configs.analysis_type-hotspots_f1101
+
+
+
+
+
+
+ %HotspotsByCPUUsageViewpointName
+
+
+ %HotspotsByCPUUsageViewpointName
+
+
+
+
+
+ config://analysis_type/include/knobs.xsl?
+
+
+ &
+
+
+
+
+
+
+ %SamplingModeDescription
+
+ hw
+ hw
+
+
+
+
+
+ false
+
+ %EnableStackCollectionDescription
+ false
+
+
+
+
+ hotspotsGroup
+
+
+ %SamplingModeDescription
+
+ sw
+ hw
+ sw
+
+
+
+
+
+ false
+
+ %EnableStackCollectionDescription
+ false
+
+
+
+
+
+
+
+ %SlowGoodFrameThresholdDescription
+ 0.01
+ 1024000
+ 40
+
+
+ %GoodFastFrameThresholdDescription
+ 0.01
+ 1024000
+ 100
+
+
+
+ true
+ %EnableCharacterizationInsightsDescription
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ stack
+ 10
+ true
+
+
+
+ java,dotnet,python
+
+
+ java,python
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,
+
+
+
+
+
+ ,
+
+
+
+
+
+ ,
+
+
+
+
+
+
+
+
+
+
+ true
+ true
+
+
+ true
+
+
+
+
+
+ true
+
+
+ runsa
+
+
+
+
+
+
+ true
+ all
+
+
+
+
+ false
+ true
+
+
+
+
+
+
+ ,
+
+
+
+
+
+ ,
+
+
+
+
+
+
+ ,
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+
+
+ true
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/config/applicationOutput.txt b/test_temp/others/r000hs/config/applicationOutput.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_temp/others/r000hs/config/collection.cfg b/test_temp/others/r000hs/config/collection.cfg
new file mode 100644
index 000000000..3890dc8ba
--- /dev/null
+++ b/test_temp/others/r000hs/config/collection.cfg
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/config/context_values.cfg b/test_temp/others/r000hs/config/context_values.cfg
new file mode 100644
index 000000000..a9cd9ded1
--- /dev/null
+++ b/test_temp/others/r000hs/config/context_values.cfg
@@ -0,0 +1,358 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/config/log.cfg b/test_temp/others/r000hs/config/log.cfg
new file mode 100644
index 000000000..7ef63acdf
--- /dev/null
+++ b/test_temp/others/r000hs/config/log.cfg
@@ -0,0 +1,35 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/config/runss.options b/test_temp/others/r000hs/config/runss.options
new file mode 100644
index 000000000..e19c3eaaa
--- /dev/null
+++ b/test_temp/others/r000hs/config/runss.options
@@ -0,0 +1,17 @@
+-r
+/home/qiao/Taichi/taichi-qiao-mem/test_temp/r000hs
+--stack-stitching
+--data-limit-mb=1000
+--disk-space-limit=0
+--mrte-type=java,python
+--stack-unwind-limit=8388608
+--itt-config=frame
+--itt-config=task,event,counter
+--stackwalk=offline
+--mrte-mode=auto
+--type=cpu:counters:nostack
+--type=cpu:stack
+--interval=10
+--
+/home/qiao/miniconda3/envs/qiao-mem/bin/python
+jit_test.py
diff --git a/test_temp/others/r000hs/config/search_dir.cfg b/test_temp/others/r000hs/config/search_dir.cfg
new file mode 100644
index 000000000..eaa5edc64
--- /dev/null
+++ b/test_temp/others/r000hs/config/search_dir.cfg
@@ -0,0 +1,19 @@
+
+
+
+
+
+ 1
+
+
+ 2
+
+
+ 3
+
+
+ 4
+
+
+
+
diff --git a/test_temp/others/r000hs/config/state.cfg b/test_temp/others/r000hs/config/state.cfg
new file mode 100644
index 000000000..1e0d48399
--- /dev/null
+++ b/test_temp/others/r000hs/config/state.cfg
@@ -0,0 +1,41 @@
+
+
+
+
+ Hotspots by CPU Utilization
+
+
+
+
+ true
+
+ CounterFunctionModule
+ 2
+
+
+
+
+
+
+
+
+ /SourceCallStack
+ true
+
+ CounterSourceFunctionSourceFile
+ 2
+
+
+
+
+
+ /Function/ParentCallStack
+ false
+ 32.5615%
+
+ CounterFunctionModule
+ 2
+
+
+
+
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322572.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322572.0.th
new file mode 100644
index 000000000..cf9afb80f
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322572.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322586.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322586.0.th
new file mode 100644
index 000000000..da78e243d
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322586.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322639.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322639.0.th
new file mode 100644
index 000000000..b6d771ec9
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322639.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322640.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322640.0.th
new file mode 100644
index 000000000..6310803fb
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322640.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322641.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322641.0.th
new file mode 100644
index 000000000..eb2967ad3
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322641.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322642.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322642.0.th
new file mode 100644
index 000000000..80d3636d5
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322642.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322643.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322643.0.th
new file mode 100644
index 000000000..4a9c44510
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322643.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322644.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322644.0.th
new file mode 100644
index 000000000..8aa0248cd
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322644.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322645.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322645.0.th
new file mode 100644
index 000000000..c4d93c2c8
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322645.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322646.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322646.0.th
new file mode 100644
index 000000000..861230521
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322646.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322647.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322647.0.th
new file mode 100644
index 000000000..4b05a03da
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322647.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322648.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322648.0.th
new file mode 100644
index 000000000..572719dbc
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322648.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322649.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322649.0.th
new file mode 100644
index 000000000..1797b4c20
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322649.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322650.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322650.0.th
new file mode 100644
index 000000000..7732b1770
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322650.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322651.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322651.0.th
new file mode 100644
index 000000000..c528277ee
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322651.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322652.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322652.0.th
new file mode 100644
index 000000000..2f6d60651
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322652.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322653.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322653.0.th
new file mode 100644
index 000000000..2d7d74648
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322653.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322654.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322654.0.th
new file mode 100644
index 000000000..3284b4142
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322654.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322655.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322655.0.th
new file mode 100644
index 000000000..948803aa0
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322655.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322656.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322656.0.th
new file mode 100644
index 000000000..f9f663495
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322656.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322657.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322657.0.th
new file mode 100644
index 000000000..df983e0f9
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322657.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322658.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322658.0.th
new file mode 100644
index 000000000..b3d551a07
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322658.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322659.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322659.0.th
new file mode 100644
index 000000000..949adc80e
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322659.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322660.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322660.0.th
new file mode 100644
index 000000000..789e6d6ac
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322660.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322661.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322661.0.th
new file mode 100644
index 000000000..95c2781fd
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322661.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322662.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322662.0.th
new file mode 100644
index 000000000..1ed74c44f
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322662.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322663.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322663.0.th
new file mode 100644
index 000000000..d4e00de28
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322663.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322664.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322664.0.th
new file mode 100644
index 000000000..d7ded495c
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322664.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322665.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322665.0.th
new file mode 100644
index 000000000..b92b4c492
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322665.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322666.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322666.0.th
new file mode 100644
index 000000000..5f4eb47d6
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322666.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322667.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322667.0.th
new file mode 100644
index 000000000..2441cdf96
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322667.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322668.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322668.0.th
new file mode 100644
index 000000000..bd8420a00
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322668.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322669.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322669.0.th
new file mode 100644
index 000000000..cdb28d54b
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322669.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322670.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322670.0.th
new file mode 100644
index 000000000..673e15c75
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322670.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0-322671.0.th b/test_temp/others/r000hs/data.0/322567-322572.0-322671.0.th
new file mode 100644
index 000000000..94dd11d94
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0-322671.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0.pytrace b/test_temp/others/r000hs/data.0/322567-322572.0.pytrace
new file mode 100644
index 000000000..dee4b7ed7
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0.pytrace differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0.pytrace-sym b/test_temp/others/r000hs/data.0/322567-322572.0.pytrace-sym
new file mode 100644
index 000000000..37de8a1af
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0.pytrace-sym differ
diff --git a/test_temp/others/r000hs/data.0/322567-322572.0.trace b/test_temp/others/r000hs/data.0/322567-322572.0.trace
new file mode 100644
index 000000000..3a203eab7
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322567-322572.0.trace differ
diff --git a/test_temp/others/r000hs/data.0/322572-322587.0-322587.0.th b/test_temp/others/r000hs/data.0/322572-322587.0-322587.0.th
new file mode 100644
index 000000000..c93c51786
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322572-322587.0-322587.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322572-322587.0.trace b/test_temp/others/r000hs/data.0/322572-322587.0.trace
new file mode 100644
index 000000000..afca0b5ed
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322572-322587.0.trace differ
diff --git a/test_temp/others/r000hs/data.0/322572-322587.1-322587.0.th b/test_temp/others/r000hs/data.0/322572-322587.1-322587.0.th
new file mode 100644
index 000000000..16ebe8b88
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322572-322587.1-322587.0.th differ
diff --git a/test_temp/others/r000hs/data.0/322572-322587.1.trace b/test_temp/others/r000hs/data.0/322572-322587.1.trace
new file mode 100644
index 000000000..3ce481faa
Binary files /dev/null and b/test_temp/others/r000hs/data.0/322572-322587.1.trace differ
diff --git a/test_temp/others/r000hs/data.0/pc.322572.jit b/test_temp/others/r000hs/data.0/pc.322572.jit
new file mode 100644
index 000000000..28435e4e5
Binary files /dev/null and b/test_temp/others/r000hs/data.0/pc.322572.jit differ
diff --git a/test_temp/others/r000hs/data.0/systemcollector-322567-pc.sc b/test_temp/others/r000hs/data.0/systemcollector-322567-pc.sc
new file mode 100644
index 000000000..a4385b56a
Binary files /dev/null and b/test_temp/others/r000hs/data.0/systemcollector-322567-pc.sc differ
diff --git a/test_temp/others/r000hs/r000hs.vtune b/test_temp/others/r000hs/r000hs.vtune
new file mode 100644
index 000000000..798d649de
--- /dev/null
+++ b/test_temp/others/r000hs/r000hs.vtune
@@ -0,0 +1,24 @@
+
+
+
+ 1644573024
+ pc
+ linux
+ Intel® VTune™ Profiler 2022.0.0
+ 621730
+ 16
+ 8
+ 1
+ 3500000000
+ 6
+ 167
+ 1
+ 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz
+ avx512
+ 1644573024
+ 1644573031
+
+ hs
+
+
+
diff --git a/test_temp/others/r000hs/r000hs.vtuneproj b/test_temp/others/r000hs/r000hs.vtuneproj
new file mode 100644
index 000000000..4dc9bc36c
--- /dev/null
+++ b/test_temp/others/r000hs/r000hs.vtuneproj
@@ -0,0 +1,218 @@
+
+
+ localhost
+
+ 1644573070
+ pc
+ linux
+ Intel® VTune™ Profiler 2022.0.0
+ 621730
+ 16
+ 8
+ 1
+ 3500000000
+ 6
+ 167
+ 1
+ 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz
+ avx512
+
+
+
+
+ launch
+
+
+
+
+
+ launch
+ /home/qiao/Taichi/taichi-qiao-mem/test_temp/r000hs/r@@@{at}
+ true
+
+
+ allowMultipleRuns
+ false
+
+
+ analyzeKvmGuest
+ false
+
+
+ analyzeSystemWide
+ false
+
+
+ cpuMask
+
+
+
+ customCollector
+
+
+
+ dataLimit
+ 1000
+
+
+ enableRing
+ false
+
+
+ finalizationMode
+ fast
+
+
+ followChild
+ true
+
+
+ followChildGroup
+
+
+
+ followChildStrategy
+
+
+
+ groupForFinalizationControl
+
+
+
+ kvmGuestKallsyms
+
+
+
+ kvmGuestModules
+
+
+
+ kvmProfileGuest
+
+
+
+ mrteMode
+ auto
+
+
+ targetDurationType
+ short
+
+
+ targetRingBuffer
+ 0
+
+
+ targetType
+ launch
+
+
+ traceMpi
+ false
+
+
+ tracingMode
+
+
+
+ wrapperScriptContent
+
+
+
+ wrapperScriptPath
+
+
+
+
+
+
+
+
+
+ /home/qiao/intel/oneapi/vtune/2022.0.0/config/analysis_type/hotspots.cfg
+
+
+
+ /home/qiao/intel/oneapi/vtune/2022.0.0/config/analysis_type/survey.cfg
+
+
+ allowedViewpoints
+ %PerfSnapshotViewpointName
+
+
+ collectMemoryBW
+ false
+
+
+ collectPCIeBW
+ false
+
+
+ dramBandwidthLimitsAT
+ true
+
+
+ initialViewpoint
+ %PerfSnapshotViewpointName
+
+
+ isUArchUsageAvailable
+ true
+
+
+ useGpuCountingMode
+ true
+
+
+
+
+ /home/qiao/intel/oneapi/vtune/2022.0.0/config/analysis_type/hotspots.cfg
+
+
+ allowedViewpoints
+ %HotspotsByCPUUsageViewpointName
+
+
+ enableCharacterizationInsights
+ true
+
+
+ enableStackCollect
+ false
+
+
+ goodFastFrameThreshold
+ 100
+
+
+ groupForCustomControl
+
+
+
+ initialViewpoint
+ %HotspotsByCPUUsageViewpointName
+
+
+ samplingInterval
+ 1
+
+
+ samplingMode
+ sw
+
+
+ slowGoodFrameThreshold
+ 40
+
+
+ stackSizeCollect
+ 1024
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/container.metadata b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/container.metadata
new file mode 100644
index 000000000..878abe9eb
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/container.metadata
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/0 b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/0
new file mode 100644
index 000000000..09af7e0f6
Binary files /dev/null and b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/0 differ
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/mapping.xml b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/mapping.xml
new file mode 100644
index 000000000..6d50be211
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_dd_thread/grouper/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/container.metadata b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/container.metadata
new file mode 100644
index 000000000..272afd737
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/0 b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/0
new file mode 100644
index 000000000..9cd172777
Binary files /dev/null and b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/0 differ
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/mapping.xml b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/mapping.xml
new file mode 100644
index 000000000..6d50be211
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_data_grouper/grouper/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/container.metadata b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/container.metadata
new file mode 100644
index 000000000..db418f330
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/0 b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/0
new file mode 100644
index 000000000..295fa8934
Binary files /dev/null and b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/0 differ
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/mapping.xml b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/mapping.xml
new file mode 100644
index 000000000..6d50be211
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_global_time_interval_metrics/grouper/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_sched_and_counter_metrics/container.metadata b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_sched_and_counter_metrics/container.metadata
new file mode 100644
index 000000000..cd56e726d
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/_cache_grouper_data4_sched_and_counter_metrics/container.metadata
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/dicer.db b/test_temp/others/r000hs/sqlite-db/dicer.db
new file mode 100644
index 000000000..0c032b11e
Binary files /dev/null and b/test_temp/others/r000hs/sqlite-db/dicer.db differ
diff --git a/test_temp/others/r000hs/sqlite-db/grouper.metadata b/test_temp/others/r000hs/sqlite-db/grouper.metadata
new file mode 100644
index 000000000..c63d79a20
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/grouper.metadata
@@ -0,0 +1,82 @@
+
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+ time
+
+
+
+
+
+
+
+
+
+
+
+
+ time
+ count
+ instanceCount
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/offload_region_data/container.metadata b/test_temp/others/r000hs/sqlite-db/offload_region_data/container.metadata
new file mode 100644
index 000000000..94869b4f7
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/offload_region_data/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/offload_region_operation_data/container.metadata b/test_temp/others/r000hs/sqlite-db/offload_region_operation_data/container.metadata
new file mode 100644
index 000000000..f6c7b7410
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/offload_region_operation_data/container.metadata
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1716639517/aggregated/0/0 b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1716639517/aggregated/0/0
new file mode 100644
index 000000000..b17d6896d
Binary files /dev/null and b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1716639517/aggregated/0/0 differ
diff --git a/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1716639517/aggregated/0/mapping.xml b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1716639517/aggregated/0/mapping.xml
new file mode 100644
index 000000000..8876237f2
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1716639517/aggregated/0/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1988985760/instance/0/0 b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1988985760/instance/0/0
new file mode 100644
index 000000000..a9a6634a6
Binary files /dev/null and b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1988985760/instance/0/0 differ
diff --git a/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1988985760/instance/0/mapping.xml b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1988985760/instance/0/mapping.xml
new file mode 100644
index 000000000..84f15414a
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/timelinedb/dbint-1988985760/instance/0/mapping.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test_temp/others/r000hs/sqlite-db/timelinedb/directory.xml b/test_temp/others/r000hs/sqlite-db/timelinedb/directory.xml
new file mode 100644
index 000000000..0786dd145
--- /dev/null
+++ b/test_temp/others/r000hs/sqlite-db/timelinedb/directory.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/test_temp/others/stream_demo.py b/test_temp/others/stream_demo.py
new file mode 100644
index 000000000..e58dd77ee
--- /dev/null
+++ b/test_temp/others/stream_demo.py
@@ -0,0 +1,58 @@
+import taichi as ti
+import time
+
+ti.init(arch=ti.cuda)
+
+dim, n_grid, steps, dt = 3, 128, 25, 8e-5
+
+n_particles = n_grid**dim // 2**(dim - 1)
+dx = 1 / n_grid
+
+p_rho = 1
+p_vol = (dx * 0.5)**2
+p_mass = p_vol * p_rho
+gravity = 9.8
+bound = 3
+E = 400
+
+x = ti.Vector.field(dim, float, n_particles)
+v = ti.Vector.field(dim, float, n_particles)
+C = ti.Matrix.field(dim, dim, float, n_particles)
+J = ti.field(float, n_particles)
+
+grid_v = ti.Vector.field(dim, float, (n_grid, ) * dim)
+grid_m = ti.field(float, (n_grid, ) * dim)
+
+neighbour = (3, ) * dim
+
+
+@ti.kernel
+def k1():
+ for I in ti.grouped(grid_m):
+ grid_v[I] = ti.zero(grid_v[I])
+ grid_m[I] = 0
+
+@ti.kernel
+def k2():
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2
+ affine = ti.Matrix.identity(float, dim) * stress + p_mass * C[p]
+ for offset in ti.static(ti.grouped(ti.ndrange(*neighbour))):
+ dpos = (offset - fx) * dx
+ weight = 1.0
+ for i in ti.static(range(dim)):
+ weight *= w[offset[i]][i]
+ grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+ grid_m[base + offset] += weight * p_mass
+
+
+t_start = time.perf_counter()
+k1()
+k2()
+t_used = time.perf_counter() - t_start
+print('execution time {:.3f}'.format(t_used*1000), "ms")
+
diff --git a/test_temp/others/test_mesh.py b/test_temp/others/test_mesh.py
new file mode 100644
index 000000000..b29ffbdb6
--- /dev/null
+++ b/test_temp/others/test_mesh.py
@@ -0,0 +1,28 @@
+import os
+import numpy as np
+import taichi as ti
+
+this_dir = os.path.dirname(os.path.abspath(__file__))
+model_file_path = os.path.join(this_dir, 'ell.json')
+
+ti.init(arch=ti.x64)
+
+def test_mesh_for(cell_reorder=False, vert_reorder=False, extra_tests=True):
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'t': ti.i32}, reorder=vert_reorder)
+ mesh_builder.cells.place({'t': ti.i32}, reorder=cell_reorder)
+ mesh_builder.cells.link(mesh_builder.verts)
+ mesh_builder.verts.link(mesh_builder.cells)
+ mesh_builder.cells.link(mesh_builder.cells)
+ mesh_builder.verts.link(mesh_builder.verts)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+
+ @ti.kernel
+ def cell_vert():
+ for c in model.cells:
+ for j in range(c.verts.size):
+ c.t += c.verts[j].id
+
+ cell_vert()
+
+test_mesh_for(False, False)
diff --git a/test_temp/others/test_ndarray_vulkan.py b/test_temp/others/test_ndarray_vulkan.py
new file mode 100644
index 000000000..45f31bb3c
--- /dev/null
+++ b/test_temp/others/test_ndarray_vulkan.py
@@ -0,0 +1,9 @@
+import taichi as ti
+
+ti.init(arch=ti.vulkan)
+
+a = ti.ndarray(float, 8)
+
+a[1] = 0.1
+
+print(a[1])
diff --git a/tests/_python_orig/__init__.py b/tests/_python_orig/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/tests/_python_orig/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/_python_orig/bls_test_template.py b/tests/_python_orig/bls_test_template.py
new file mode 100644
index 000000000..879b27981
--- /dev/null
+++ b/tests/_python_orig/bls_test_template.py
@@ -0,0 +1,288 @@
+import random
+
+import numpy as np
+
+import taichi as ti
+
+
+def bls_test_template(dim,
+ N,
+ bs,
+ stencil,
+ block_dim=None,
+ scatter=False,
+ benchmark=0,
+ dense=False):
+ x, y, y2 = ti.field(ti.i32), ti.field(ti.i32), ti.field(ti.i32)
+
+ index = ti.axes(*range(dim))
+ mismatch = ti.field(ti.i32, shape=())
+
+ if not isinstance(bs, (tuple, list)):
+ bs = [bs for _ in range(dim)]
+
+ grid_size = [N // bs[i] for i in range(dim)]
+
+ if dense:
+ create_block = lambda: ti.root.dense(index, grid_size)
+ else:
+ create_block = lambda: ti.root.pointer(index, grid_size)
+
+ if scatter:
+ block = create_block()
+
+ block.dense(index, bs).place(x)
+ block.dense(index, bs).place(y)
+ block.dense(index, bs).place(y2)
+ else:
+ create_block().dense(index, bs).place(x)
+ create_block().dense(index, bs).place(y)
+ create_block().dense(index, bs).place(y2)
+
+ ndrange = ((bs[i], N - bs[i]) for i in range(dim))
+
+ if block_dim is None:
+ block_dim = 1
+ for i in range(dim):
+ block_dim *= bs[i]
+
+ @ti.kernel
+ def populate():
+ for I in ti.grouped(ti.ndrange(*ndrange)):
+ s = 0
+ for i in ti.static(range(dim)):
+ s += I[i]**(i + 1)
+ x[I] = s
+
+ @ti.kernel
+ def apply(use_bls: ti.template(), y: ti.template()):
+ if ti.static(use_bls and not scatter):
+ ti.block_local(x)
+ if ti.static(use_bls and scatter):
+ ti.block_local(y)
+
+ ti.block_dim(block_dim)
+ for I in ti.grouped(x):
+ if ti.static(scatter):
+ for offset in ti.static(stencil):
+ y[I + ti.Vector(offset)] += x[I]
+ else:
+ # gather
+ s = 0
+ for offset in ti.static(stencil):
+ s = s + x[I + ti.Vector(offset)]
+ y[I] = s
+
+ populate()
+
+ if benchmark:
+ for i in range(benchmark):
+ x.snode.parent().deactivate_all()
+ if not scatter:
+ populate()
+ y.snode.parent().deactivate_all()
+ y2.snode.parent().deactivate_all()
+ apply(False, y2)
+ apply(True, y)
+ else:
+ # Simply test
+ apply(False, y2)
+ apply(True, y)
+
+ @ti.kernel
+ def check():
+ for I in ti.grouped(y2):
+ if y[I] != y2[I]:
+ print('check failed', I, y[I], y2[I])
+ mismatch[None] = 1
+
+ check()
+
+ ti.print_kernel_profile_info()
+
+ assert mismatch[None] == 0
+
+
+def bls_particle_grid(N,
+ ppc=8,
+ block_size=16,
+ scatter=True,
+ benchmark=0,
+ pointer_level=1,
+ sort_points=True,
+ use_offset=True):
+ M = N * N * ppc
+
+ m1 = ti.field(ti.f32)
+ m2 = ti.field(ti.f32)
+ m3 = ti.field(ti.f32)
+ pid = ti.field(ti.i32)
+ err = ti.field(ti.i32, shape=())
+
+ max_num_particles_per_block = block_size**2 * 4096
+
+ x = ti.Vector.field(2, dtype=ti.f32)
+
+ s1 = ti.field(dtype=ti.f32)
+ s2 = ti.field(dtype=ti.f32)
+ s3 = ti.field(dtype=ti.f32)
+
+ ti.root.dense(ti.i, M).place(x)
+ ti.root.dense(ti.i, M).place(s1, s2, s3)
+
+ if pointer_level == 1:
+ block = ti.root.pointer(ti.ij, N // block_size)
+ elif pointer_level == 2:
+ block = ti.root.pointer(ti.ij, N // block_size // 4).pointer(ti.ij, 4)
+ else:
+ raise ValueError('pointer_level must be 1 or 2')
+
+ if use_offset:
+ grid_offset = (-N // 2, -N // 2)
+ grid_offset_block = (-N // 2 // block_size, -N // 2 // block_size)
+ world_offset = -0.5
+ else:
+ grid_offset = (0, 0)
+ grid_offset_block = (0, 0)
+ world_offset = 0
+
+ block.dense(ti.ij, block_size).place(m1, offset=grid_offset)
+ block.dense(ti.ij, block_size).place(m2, offset=grid_offset)
+ block.dense(ti.ij, block_size).place(m3, offset=grid_offset)
+
+ block.dynamic(ti.l,
+ max_num_particles_per_block,
+ chunk_size=block_size**2 * ppc * 4).place(
+ pid, offset=grid_offset_block + (0, ))
+
+ bound = 0.1
+
+ extend = 4
+
+ x_ = [(random.random() * (1 - 2 * bound) + bound + world_offset,
+ random.random() * (1 - 2 * bound) + bound + world_offset)
+ for _ in range(M)]
+ if sort_points:
+ x_.sort(key=lambda q: int(q[0] * N) // block_size * N + int(q[1] * N)
+ // block_size)
+
+ x.from_numpy(np.array(x_, dtype=np.float32))
+
+ @ti.kernel
+ def insert():
+ ti.block_dim(256)
+ for i in x:
+ # It is important to ensure insert and p2g uses the exact same way to compute the base
+ # coordinates. Otherwise there might be coordinate mismatch due to float-point errors.
+ base = ti.Vector([
+ int(ti.floor(x[i][0] * N) - grid_offset[0]),
+ int(ti.floor(x[i][1] * N) - grid_offset[1])
+ ])
+ base_p = ti.rescale_index(m1, pid, base)
+ ti.append(pid.parent(), base_p, i)
+
+ scatter_weight = (N * N / M) * 0.01
+
+ @ti.kernel
+ def p2g(use_shared: ti.template(), m: ti.template()):
+ ti.block_dim(256)
+ if ti.static(use_shared):
+ ti.block_local(m)
+ for I in ti.grouped(pid):
+ p = pid[I]
+
+ u_ = ti.floor(x[p] * N).cast(ti.i32)
+ Im = ti.rescale_index(pid, m, I)
+ u0 = ti.assume_in_range(u_[0], Im[0], 0, 1)
+ u1 = ti.assume_in_range(u_[1], Im[1], 0, 1)
+
+ u = ti.Vector([u0, u1])
+
+ for offset in ti.static(ti.grouped(ti.ndrange(extend, extend))):
+ m[u + offset] += scatter_weight
+
+ @ti.kernel
+ def p2g_naive():
+ ti.block_dim(256)
+ for p in x:
+ u = ti.floor(x[p] * N).cast(ti.i32)
+
+ for offset in ti.static(ti.grouped(ti.ndrange(extend, extend))):
+ m3[u + offset] += scatter_weight
+
+ @ti.kernel
+ def fill_m1():
+ for i, j in ti.ndrange(N, N):
+ m1[i, j] = ti.random()
+
+ @ti.kernel
+ def g2p(use_shared: ti.template(), s: ti.template()):
+ ti.block_dim(256)
+ if ti.static(use_shared):
+ ti.block_local(m1)
+ for I in ti.grouped(pid):
+ p = pid[I]
+
+ u_ = ti.floor(x[p] * N).cast(ti.i32)
+
+ Im = ti.rescale_index(pid, m1, I)
+ u0 = ti.assume_in_range(u_[0], Im[0], 0, 1)
+ u1 = ti.assume_in_range(u_[1], Im[1], 0, 1)
+
+ u = ti.Vector([u0, u1])
+
+ tot = 0.0
+
+ for offset in ti.static(ti.grouped(ti.ndrange(extend, extend))):
+ tot += m1[u + offset]
+
+ s[p] = tot
+
+ @ti.kernel
+ def g2p_naive(s: ti.template()):
+ ti.block_dim(256)
+ for p in x:
+ u = ti.floor(x[p] * N).cast(ti.i32)
+
+ tot = 0.0
+ for offset in ti.static(ti.grouped(ti.ndrange(extend, extend))):
+ tot += m1[u + offset]
+ s[p] = tot
+
+ insert()
+
+ for i in range(benchmark):
+ pid.parent(2).snode.deactivate_all()
+ insert()
+
+ @ti.kernel
+ def check_m():
+ for i in range(grid_offset[0], grid_offset[0] + N):
+ for j in range(grid_offset[1], grid_offset[1] + N):
+ if abs(m1[i, j] - m3[i, j]) > 1e-4:
+ err[None] = 1
+ if abs(m2[i, j] - m3[i, j]) > 1e-4:
+ err[None] = 1
+
+ @ti.kernel
+ def check_s():
+ for i in range(M):
+ if abs(s1[i] - s2[i]) > 1e-4:
+ err[None] = 1
+ if abs(s1[i] - s3[i]) > 1e-4:
+ err[None] = 1
+
+ if scatter:
+ for i in range(max(benchmark, 1)):
+ p2g(True, m1)
+ p2g(False, m2)
+ p2g_naive()
+ check_m()
+ else:
+ for i in range(max(benchmark, 1)):
+ g2p(True, s1)
+ g2p(False, s2)
+ g2p_naive(s3)
+ check_s()
+
+ assert not err[None]
diff --git a/tests/_python_orig/ell.json b/tests/_python_orig/ell.json
new file mode 100644
index 000000000..e1e560189
--- /dev/null
+++ b/tests/_python_orig/ell.json
@@ -0,0 +1,120 @@
+{
+"num_patches" : 8,
+ "elements" : [
+{"order" : 0,
+"num" : 20,
+"max_num_per_patch" : 32,
+"owned_offsets" : [0,0,3,5,7,10,12,14,20],
+"total_offsets" : [0,20,40,60,80,100,120,140,160],
+"l2g_mapping" : [12,4,0,3,13,1,5,11,15,9,2,8,7,10,19,14,6,17,16,18,2,5,10,13,1,4,9,12,0,3,15,7,8,11,17,19,6,14,16,18,1,13,12,4,0,9,5,2,10,3,8,15,11,17,19,7,6,14,16,18,4,7,12,0,3,13,1,5,15,6,9,8,11,17,19,14,2,10,16,18,16,17,18,19,11,12,14,15,3,0,8,4,9,13,1,6,7,5,10,2,8,9,12,0,11,1,13,10,4,3,15,17,19,16,18,14,5,2,7,6,0,12,4,3,9,8,1,11,13,15,17,19,5,7,6,14,10,2,16,18,3,6,11,14,15,19,12,4,0,7,16,18,8,17,9,13,1,5,10,2],
+"g2r_mapping" : [12,3,0,14,5,1,15,6,10,11,2,16,13,4,17,18,7,8,9,19],
+"l2r_mapping" : [13,5,12,14,4,3,1,16,18,11,0,10,6,2,19,17,15,8,7,9,0,1,2,4,3,5,11,13,12,14,18,6,10,16,8,19,15,17,7,9,3,4,13,5,12,11,1,0,2,14,10,18,16,8,19,6,15,17,7,9,5,6,13,12,14,4,3,1,18,15,11,10,16,8,19,17,0,2,7,9,7,8,9,19,16,13,17,18,14,12,10,5,11,4,3,15,6,1,2,0,10,11,13,12,16,3,4,2,5,14,18,8,19,7,9,17,1,0,6,15,12,13,5,14,11,10,3,16,4,18,8,19,1,6,15,17,2,0,7,9,14,15,16,17,18,19,13,5,12,6,7,9,10,8,11,4,3,1,2,0]
+}
+,{"order" : 2,
+"num" : 66,
+"max_num_per_patch" : 96,
+"owned_offsets" : [0,8,17,23,31,42,48,56,66],
+"total_offsets" : [0,66,132,198,264,330,396,462,528],
+"l2g_mapping" : [0,3,8,10,11,15,17,18,1,2,9,16,14,43,44,45,27,25,26,20,12,19,7,13,34,32,33,4,48,49,53,54,47,24,46,37,35,36,5,6,39,38,64,57,63,42,55,56,21,22,23,60,58,59,65,50,40,41,62,30,61,52,29,51,31,28,16,32,33,34,35,36,37,38,39,15,17,8,9,11,19,20,10,12,13,14,18,0,2,3,25,27,46,47,5,6,7,1,4,26,44,45,49,53,57,58,60,64,43,48,21,23,24,41,42,54,55,22,40,56,63,65,28,29,30,50,51,59,61,62,31,52,9,12,13,14,19,20,8,10,15,16,18,32,33,35,36,38,11,17,34,37,39,1,2,3,4,6,7,26,27,44,45,49,53,57,58,60,64,0,25,46,47,5,43,48,21,23,24,41,42,54,55,22,40,56,63,65,28,29,30,50,51,59,61,62,31,52,2,22,23,24,25,27,46,47,0,3,8,10,11,15,17,18,1,4,6,7,9,12,14,20,26,44,45,49,53,57,58,60,64,5,13,43,48,21,41,42,54,55,16,19,32,34,35,37,39,33,36,38,40,56,63,65,28,29,30,50,51,59,61,62,31,52,28,29,30,31,51,52,58,59,60,61,62,50,57,63,64,65,43,44,45,48,49,53,54,55,56,1,2,3,4,6,7,9,10,11,12,14,20,26,27,40,41,42,21,22,24,25,47,0,23,46,5,13,18,8,15,17,19,36,37,16,32,34,35,39,33,38,4,5,6,7,48,49,12,13,19,20,36,37,1,2,3,9,10,11,14,26,27,44,45,53,57,58,60,64,0,18,43,28,29,30,50,51,54,55,56,59,61,63,8,15,16,32,33,35,38,17,34,39,25,46,47,21,23,24,41,42,22,40,65,62,31,52,1,26,43,44,45,53,57,64,0,3,4,5,7,13,14,18,48,2,6,9,10,11,12,20,27,49,58,60,8,15,17,25,46,47,21,23,24,41,42,54,55,19,36,37,16,32,33,35,38,28,29,30,50,51,56,59,61,63,34,39,22,40,65,62,31,52,21,40,41,42,50,54,55,56,63,65,0,1,2,23,24,25,26,43,45,46,22,28,29,30,44,48,49,51,53,57,58,59,61,52,27,47,64,31,60,62,3,4,6,7,9,10,11,12,14,20,8,15,17,18,5,13,19,36,37,16,32,34,35,39,33,38],
+"g2r_mapping" : [0,48,23,1,42,43,44,45,2,17,3,4,18,19,20,5,8,6,7,21,22,56,24,25,26,27,49,28,31,32,33,34,9,10,11,12,13,14,15,16,57,58,59,50,51,52,29,30,46,47,60,35,36,53,61,62,63,54,37,38,39,40,41,64,55,65],
+"l2r_mapping" : [0,1,2,3,4,5,6,7,48,23,17,8,20,50,51,52,28,27,49,22,18,21,45,19,11,9,10,42,46,47,53,61,30,26,29,14,12,13,43,44,16,15,55,54,64,59,62,63,56,24,25,39,37,38,65,60,57,58,41,33,40,36,32,35,34,31,8,9,10,11,12,13,14,15,16,5,6,2,17,4,21,22,3,18,19,20,7,0,23,1,27,28,29,30,43,44,45,48,42,49,51,52,47,53,54,37,39,55,50,46,56,25,26,58,59,61,62,24,57,63,64,65,31,32,33,60,35,38,40,41,34,36,17,18,19,20,21,22,2,3,5,8,7,9,10,12,13,15,4,6,11,14,16,48,23,1,42,44,45,49,28,51,52,47,53,54,37,39,55,0,27,29,30,43,50,46,56,25,26,58,59,61,62,24,57,63,64,65,31,32,33,60,35,38,40,41,34,36,23,24,25,26,27,28,29,30,0,1,2,3,4,5,6,7,48,42,44,45,17,18,20,22,49,51,52,47,53,54,37,39,55,43,19,50,46,56,58,59,61,62,8,21,9,11,12,14,16,10,13,15,57,63,64,65,31,32,33,60,35,38,40,41,34,36,31,32,33,34,35,36,37,38,39,40,41,60,54,64,55,65,50,51,52,46,47,53,61,62,63,48,23,1,42,44,45,17,3,4,18,20,22,49,28,57,58,59,56,24,26,27,30,0,25,29,43,19,7,2,5,6,21,13,14,8,9,11,12,16,10,15,42,43,44,45,46,47,18,19,21,22,13,14,48,23,1,17,3,4,20,49,28,51,52,53,54,37,39,55,0,7,50,31,32,33,60,35,61,62,63,38,40,64,2,5,8,9,10,12,15,6,11,16,27,29,30,56,25,26,58,59,24,57,65,41,34,36,48,49,50,51,52,53,54,55,0,1,42,43,45,19,20,7,46,23,44,17,3,4,18,22,28,47,37,39,2,5,6,27,29,30,56,25,26,58,59,61,62,21,13,14,8,9,10,12,15,31,32,33,60,35,63,38,40,64,11,16,24,57,65,41,34,36,56,57,58,59,60,61,62,63,64,65,0,48,23,25,26,27,49,50,52,29,24,31,32,33,51,46,47,35,53,54,37,38,40,36,28,30,55,34,39,41,1,42,44,45,17,3,4,18,20,22,2,5,6,7,43,19,21,13,14,8,9,11,12,16,10,15]
+}
+,{"order" : 1,
+"num" : 61,
+"max_num_per_patch" : 64,
+"owned_offsets" : [0,4,13,19,26,36,42,49,61],
+"total_offsets" : [0,61,122,183,244,305,366,427,488],
+"l2g_mapping" : [3,13,14,19,0,1,2,4,5,11,12,15,18,20,16,46,47,48,28,29,25,6,21,17,7,36,37,38,9,10,50,54,23,49,22,39,40,41,8,42,58,60,32,44,43,53,24,26,27,55,56,57,52,45,59,30,34,51,31,35,33,18,20,36,37,38,39,40,41,42,19,11,13,15,21,12,14,16,17,0,3,4,29,49,6,7,8,1,2,9,28,46,56,58,5,10,48,22,25,26,43,47,23,27,44,54,60,24,50,32,34,35,53,57,55,59,30,31,52,45,51,33,11,12,15,16,17,21,14,20,38,41,13,18,37,40,0,1,2,6,9,28,46,56,58,3,4,19,29,49,5,7,10,48,8,39,36,42,22,25,26,43,47,50,23,27,44,54,60,32,34,35,53,57,55,59,30,31,52,24,45,51,33,0,4,22,23,24,29,49,3,13,14,19,1,2,6,9,11,12,28,46,56,58,5,7,10,16,48,25,26,43,47,15,18,21,37,40,17,20,38,41,36,27,44,54,60,45,8,39,50,32,34,35,53,57,55,59,30,31,52,51,42,33,30,31,33,34,35,51,55,56,57,59,32,52,58,60,46,47,48,50,53,54,0,1,2,6,9,11,12,28,43,44,45,23,25,27,29,4,5,22,26,3,7,10,16,8,13,14,19,49,17,21,39,15,18,37,40,20,38,41,24,36,42,6,7,8,9,10,50,17,21,39,0,1,2,11,12,28,46,56,58,3,5,16,48,32,34,35,47,53,54,57,14,15,20,38,41,13,18,37,40,42,4,19,29,49,22,25,26,43,23,27,44,60,55,59,30,31,52,33,51,45,36,24,1,2,5,28,46,48,58,3,7,10,16,0,6,9,11,12,56,4,13,14,19,29,49,22,25,26,43,47,8,17,21,39,50,15,20,38,41,32,34,35,53,54,57,18,37,40,23,27,44,60,55,59,30,31,52,36,24,45,51,42,33,25,26,27,32,43,44,45,47,52,53,54,60,2,4,5,22,24,34,35,46,48,50,57,51,23,28,29,30,31,55,58,0,1,6,9,11,12,56,3,13,14,19,49,7,10,16,33,59,8,17,21,39,15,18,37,40,20,38,41,36,42],
+"g2r_mapping" : [19,42,43,0,20,44,36,37,38,39,40,13,14,1,2,15,16,17,4,3,5,18,21,22,23,49,50,51,45,24,26,27,52,28,29,30,6,7,8,9,10,11,12,53,54,55,46,56,47,25,41,31,57,58,59,32,33,34,48,35,60],
+"l2r_mapping" : [0,1,2,3,19,42,43,20,44,13,14,15,4,5,16,46,56,47,45,24,49,36,18,17,37,6,7,8,39,40,41,59,22,25,21,9,10,11,38,12,48,60,52,54,53,58,23,50,51,32,33,34,57,55,35,26,29,31,27,30,28,4,5,6,7,8,9,10,11,12,3,13,1,15,18,14,2,16,17,19,0,20,24,25,36,37,38,42,43,39,45,46,33,48,44,40,47,21,49,50,53,56,22,51,54,59,60,23,41,52,29,30,58,34,32,35,26,27,57,55,31,28,13,14,15,16,17,18,2,5,8,11,1,4,7,10,19,42,43,36,39,45,46,33,48,0,20,3,24,25,44,37,40,47,38,9,6,12,21,49,50,53,56,41,22,51,54,59,60,52,29,30,58,34,32,35,26,27,57,23,55,31,28,19,20,21,22,23,24,25,0,1,2,3,42,43,36,39,13,14,45,46,33,48,44,37,40,16,47,49,50,53,56,15,4,18,7,10,17,5,8,11,6,51,54,59,60,55,38,9,41,52,29,30,58,34,32,35,26,27,57,31,12,28,26,27,28,29,30,31,32,33,34,35,52,57,48,60,46,56,47,41,58,59,19,42,43,36,39,13,14,45,53,54,55,22,49,51,24,20,44,21,50,0,37,40,16,38,1,2,3,25,17,18,9,15,4,7,10,5,8,11,23,6,12,36,37,38,39,40,41,17,18,9,19,42,43,13,14,45,46,33,48,0,44,16,47,52,29,30,56,58,59,34,2,15,5,8,11,1,4,7,10,12,20,3,24,25,21,49,50,53,22,51,54,60,32,35,26,27,57,28,31,55,6,23,42,43,44,45,46,47,48,0,37,40,16,19,36,39,13,14,33,20,1,2,3,24,25,21,49,50,53,56,38,17,18,9,41,15,5,8,11,52,29,30,58,59,34,4,7,10,22,51,54,60,32,35,26,27,57,6,23,55,31,12,28,49,50,51,52,53,54,55,56,57,58,59,60,43,20,44,21,23,29,30,46,47,41,34,31,22,45,24,26,27,32,48,19,42,36,39,13,14,33,0,1,2,3,25,37,40,16,28,35,38,17,18,9,15,4,7,10,5,8,11,6,12]
+}
+,{"order" : 3,
+"num" : 24,
+"max_num_per_patch" : 32,
+"owned_offsets" : [0,4,7,9,12,16,18,21,24],
+"total_offsets" : [0,24,48,72,96,120,144,168,192],
+"l2g_mapping" : [0,2,4,5,14,8,6,3,10,16,18,15,11,1,12,22,19,7,20,23,13,21,17,9,10,11,12,4,6,2,3,5,1,0,16,14,8,18,15,22,19,7,20,23,13,21,17,9,3,6,1,2,5,11,16,4,0,12,14,10,8,18,15,22,19,7,20,23,13,21,17,9,7,8,15,0,13,18,5,14,19,22,2,3,16,23,20,4,6,1,17,21,10,11,9,12,9,17,20,21,23,22,19,18,13,8,14,7,0,15,16,5,1,2,3,4,6,10,11,12,1,16,3,14,6,5,0,18,2,11,8,22,19,4,12,15,20,23,13,10,7,21,17,9,14,18,22,0,8,16,19,20,23,5,15,1,13,21,17,2,3,7,9,4,6,10,11,12,13,19,23,7,17,18,22,15,9,8,14,20,21,0,16,5,1,2,3,4,6,10,11,12],
+"g2r_mapping" : [0,16,1,7,2,3,8,9,10,12,4,5,6,21,18,11,17,13,19,22,14,15,20,23],
+"l2r_mapping" : [0,1,2,3,18,10,8,7,4,17,19,11,5,16,6,20,22,9,14,23,21,15,13,12,4,5,6,2,8,1,7,3,16,0,17,18,10,19,11,20,22,9,14,23,21,15,13,12,7,8,16,1,3,5,17,2,0,6,18,4,10,19,11,20,22,9,14,23,21,15,13,12,9,10,11,0,21,19,3,18,22,20,1,7,17,23,14,2,8,16,13,15,4,5,12,6,12,13,14,15,23,20,22,19,21,10,18,9,0,11,17,3,16,1,7,2,8,4,5,6,16,17,7,18,8,3,0,19,1,5,10,20,22,2,6,11,14,23,21,4,9,15,13,12,18,19,20,0,10,17,22,14,23,3,11,16,21,15,13,1,7,9,12,2,8,4,5,6,21,22,23,9,13,19,20,11,12,10,18,14,15,0,17,3,16,1,7,2,8,4,5,6]
+}
+ ],
+ "relations" : [
+{"from_order" : 0,
+"to_order" : 0,
+"offset" : [0,0,4,8,12,12,20,27,27,35,39,39,43,47,51,51,55,61,61,68,79,79,87,91,101,107,115,122],
+"value" : [1,3,4,2,3,5,4,0,6,3,4,0,2,3,1,4,5,6,7,8,2,3,0,6,5,7,8,2,3,4,5,6,7,8,1,4,8,9,0,3,2,4,1,3,5,4,0,3,0,4,6,1,2,3,4,2,3,0,5,6,7,1,2,3,4,5,6,7,2,0,3,4,5,8,6,9,7,10,11,6,7,8,9,4,1,3,2,9,0,4,3,5,10,11,6,0,8,12,3,4,13,0,4,1,11,5,2,9,0,1,6,7,3,2,5,10,11,2,3,13,6,4]
+}
+,{"from_order" : 0,
+"to_order" : 1,
+"offset" : [0,0,4,8,12,12,20,27,27,35,39,39,43,47,51,51,55,61,61,68,79,79,87,91,101,107,115,122],
+"value" : [2,3,4,8,0,9,1,2,5,6,7,8,1,6,2,3,4,7,8,9,0,10,2,11,5,12,13,0,7,1,8,9,10,5,6,2,3,4,6,0,2,3,9,6,7,8,9,1,2,4,5,2,3,4,5,0,1,2,6,7,8,0,7,2,8,9,10,5,11,0,1,12,13,14,15,3,4,16,6,12,13,14,15,0,1,4,7,16,1,2,6,3,17,18,19,7,20,21,9,10,22,4,5,6,23,8,9,24,0,2,25,26,5,10,11,27,28,3,8,29,30,11]
+}
+,{"from_order" : 0,
+"to_order" : 2,
+"offset" : [0,0,5,10,15,15,30,42,42,55,60,60,65,70,75,75,80,89,89,101,123,123,138,143,162,171,186,198],
+"value" : [1,2,3,7,8,9,0,10,2,3,4,5,6,7,8,6,0,7,1,2,3,8,9,10,4,11,12,13,14,15,6,0,16,9,17,4,5,11,18,13,19,20,8,0,9,10,11,12,13,14,15,4,5,6,7,1,2,3,6,7,0,2,3,9,10,6,7,8,9,10,0,1,3,4,5,0,1,2,4,5,1,2,3,6,7,8,9,10,11,8,0,9,10,11,12,13,14,15,2,3,16,0,17,9,10,18,12,19,20,21,22,14,23,1,24,3,4,25,5,6,26,27,7,10,11,12,0,13,14,15,16,2,3,17,18,19,5,6,0,20,13,1,2,21,22,23,17,24,18,25,26,4,27,28,5,6,7,29,30,31,32,8,1,2,3,4,27,33,6,7,9,0,20,14,15,16,34,1,3,35,28,5,7,8,36,9,22,23,37,4,33,29,31,38,39,8,36,9]
+}
+,{"from_order" : 3,
+"to_order" : 3,
+"offset" : [0,3,6,8,11,11,13,15,17,17,20,23,23,25,28,30,30,32,34,36,38,38,40,42,42,45,49,52,52,54,57,60],
+"value" : [3,4,5,2,6,3,1,8,0,1,7,3,2,4,2,0,1,2,1,4,3,0,5,2,4,3,2,5,0,1,1,3,0,4,5,3,0,2,2,1,3,0,3,5,1,4,0,2,6,1,7,8,1,3,0,5,2,4,1,6]
+}
+,{"from_order" : 3,
+"to_order" : 2,
+"value" : [1,0,8,9,4,2,10,3,6,2,5,11,1,3,12,7,8,13,14,15,16,9,17,18,19,10,20,21,22,20,23,12,24,11,25,26,14,27,28,29,18,15,30,31,32,33,17,34,35,21,36,37,22,27,38,39,40,25,36,41,42,30,43,44,45,31,46,47,33,48,49,50,51,43,52,53,54,55,47,44,45,48,56,57,58,59,53,60,61,62,55,63,64,65,62,59,3,0,1,2,6,14,4,5,8,1,4,7,10,11,9,0,15,12,17,14,13,11,12,16,30,17,18,19,23,16,19,20,30,32,28,29,23,21,31,22,34,32,43,36,31,42,34,35,25,22,24,33,33,35,37,49,27,46,24,26,41,37,38,54,48,49,50,53,46,44,51,45,40,38,39,61,55,59,53,54,48,44,52,47,63,58,61,62,65,57,59,60,64,56,57,58,26,1,2,3,5,0,1,4,26,24,41,25,16,6,0,7,23,7,3,10,19,4,13,14,29,24,43,31,17,6,8,9,23,37,21,22,20,11,13,15,21,42,29,30,18,9,11,12,28,22,38,27,27,30,32,49,40,46,38,39,36,32,33,54,48,49,50,53,46,44,51,45,35,33,34,61,55,59,53,54,48,44,52,47,63,58,61,62,65,57,59,60,64,56,57,58,3,37,1,2,5,0,4,24,7,3,4,6,9,8,16,0,39,37,52,38,24,26,28,40,9,11,22,15,16,35,25,26,39,40,41,53,32,28,29,54,12,10,20,11,19,21,34,22,25,17,36,27,55,59,53,54,31,29,30,61,14,10,13,42,23,20,21,43,19,17,33,18,65,57,59,60,63,58,61,62,45,42,44,49,47,43,46,50,64,56,57,58,48,44,46,51,3,0,1,2,5,1,11,4,8,12,6,7,10,2,7,9,15,11,24,13,14,21,12,13,41,22,23,24,37,18,21,22,41,42,39,40,38,26,45,37,25,16,17,18,44,42,43,48,27,47,25,26,46,44,45,49,17,28,19,20,27,32,35,52,30,28,50,29,33,53,31,32,30,34,51,35,55,53,54,59,36,31,34,56,61,59,60,64,58,56,62,57,63,60,62,65,3,0,1,2,21,0,4,5,3,6,7,18,12,30,21,22,9,15,6,8,14,16,18,29,14,28,12,13,19,22,23,36,17,42,15,16,11,8,47,10,20,13,52,19,27,23,24,41,59,36,37,38,49,42,43,44,51,45,47,48,54,57,52,53,26,24,25,39,62,34,38,41,59,55,61,58,50,44,45,46,57,55,60,56,63,33,39,40,65,32,34,35,64,31,32,33,0,2,3,4,1,4,5,39,7,5,6,57,9,8,0,17,24,17,31,1,3,10,16,25,38,39,40,54,27,6,26,55,62,52,54,57,9,20,14,15,33,36,31,32,12,10,11,18,38,34,61,37,63,51,55,56,65,50,52,53,21,28,19,20,12,22,13,14,36,34,60,35,64,49,50,51,30,28,29,44,23,19,22,41,58,44,45,46,43,41,47,42,59,45,47,48,3,0,1,2,3,5,6,7,9,4,7,8,14,0,20,13,33,22,4,27,16,18,28,5,36,28,29,8,35,14,1
5,19,37,21,22,23,34,12,15,16,11,17,24,18,38,29,30,31,39,23,31,32,40,10,11,12,24,41,25,26,40,45,48,53,43,41,54,42,46,50,44,45,43,47,55,48,52,50,51,59,49,44,47,56,61,59,60,64,58,56,62,57,63,60,62,65]
+}
+,{"from_order" : 3,
+"to_order" : 1,
+"value" : [4,5,0,6,7,8,4,9,1,10,2,11,1,12,3,2,11,13,0,4,5,10,2,14,6,5,8,15,16,17,18,4,19,6,7,20,9,21,22,10,11,23,5,21,24,10,14,23,25,12,26,11,13,27,15,5,17,28,29,30,20,6,18,15,16,31,19,32,33,7,34,20,22,35,36,11,23,37,21,5,24,38,28,29,26,36,39,11,27,37,40,18,41,42,15,31,43,20,44,16,45,31,34,32,20,46,47,48,49,50,40,42,15,51,52,43,41,42,45,31,44,20,43,47,48,53,54,55,49,42,56,51,57,58,52,42,59,45,55,58,60,42,56,59,2,0,3,12,1,4,13,5,6,12,17,7,3,6,8,12,4,7,11,0,9,15,12,1,10,23,13,14,12,17,18,10,11,14,15,12,26,23,24,14,16,17,19,18,26,14,15,16,23,26,24,25,28,34,18,26,19,27,20,33,30,26,35,28,34,47,27,26,33,30,40,35,29,18,21,27,20,37,37,27,29,30,40,44,21,41,22,20,36,37,32,29,45,48,30,44,43,37,39,40,51,44,36,41,37,46,38,42,53,31,32,48,30,52,57,43,45,48,51,44,39,37,43,38,42,58,54,55,53,48,49,52,59,56,57,48,50,51,55,56,60,48,49,50,15,17,29,1,3,4,0,17,5,1,2,4,17,15,29,32,18,30,14,0,10,1,6,2,23,14,15,1,6,3,5,33,13,2,4,9,20,15,31,18,30,41,10,11,25,6,2,7,14,15,23,16,24,28,12,13,35,2,8,9,16,15,28,20,40,31,34,11,12,2,7,8,19,14,26,16,24,37,37,16,19,20,40,45,26,42,27,24,36,37,22,19,46,47,20,45,44,37,39,40,50,45,36,42,37,57,38,43,52,21,22,47,20,51,56,44,46,47,50,45,39,37,44,38,43,58,53,54,52,47,48,51,59,55,56,47,49,50,54,55,60,47,48,49,2,3,26,4,27,40,17,0,5,12,1,26,5,3,6,1,2,26,0,11,7,12,1,21,28,26,41,27,40,44,26,12,17,18,29,42,7,0,11,16,9,24,12,11,21,18,29,25,41,26,28,29,51,42,20,17,43,48,18,42,0,15,8,16,9,30,11,13,22,16,24,35,18,11,25,14,23,47,57,41,43,48,51,42,53,19,20,48,18,52,8,31,10,9,30,36,15,13,32,16,30,35,13,11,22,45,14,23,58,56,57,48,50,51,54,55,53,48,49,52,39,31,33,30,36,37,32,46,34,30,35,38,55,56,60,48,49,50,33,34,59,30,37,38,0,1,2,10,3,4,5,1,11,10,4,18,6,7,12,10,14,8,9,0,6,10,3,8,11,29,13,10,18,19,12,27,13,10,14,19,29,32,28,15,18,19,32,22,27,14,15,19,28,32,29,38,33,30,27,20,34,22,35,32,22,21,36,14,15,16,37,31,32,58,38,33,20,21,39,22,35,36,34,31,47,35,37,32,14,21,16,24,41,17,39,20,21,26,45,42,23,21,40,43,24,41,20,25,44,26,45,51,21,23,40,26,42,48,44,52,46,45,51,55,25,23,49,26
,51,48,59,52,53,51,55,56,49,50,54,51,48,57,53,54,60,51,56,57,0,10,1,2,3,4,15,10,21,3,4,5,10,0,1,13,20,6,11,10,19,15,25,21,12,0,7,13,30,6,18,9,10,13,29,20,9,10,18,11,39,19,44,11,14,15,25,27,9,12,34,13,29,30,7,8,37,30,6,33,14,9,41,11,39,44,17,14,50,22,15,27,49,44,46,25,26,27,34,35,40,29,30,31,36,37,38,30,32,33,41,47,42,39,43,44,51,16,17,22,15,28,55,49,50,22,26,27,46,44,49,45,48,58,59,35,36,30,31,32,43,47,44,60,45,48,52,53,51,22,23,28,57,54,55,22,24,26,53,54,56,22,23,24,1,0,2,4,27,5,24,1,3,4,27,41,6,3,49,37,4,41,11,0,7,1,17,2,3,11,21,1,17,24,4,0,5,13,9,32,48,24,26,27,40,41,50,16,6,37,4,42,54,48,49,37,40,41,7,11,0,15,19,10,21,46,22,17,23,24,12,0,8,28,13,9,26,24,48,25,47,57,51,52,50,37,38,42,58,53,54,37,39,40,11,14,18,15,19,33,0,12,8,15,10,29,23,46,24,56,25,47,52,53,60,37,38,39,18,43,20,19,33,34,14,12,30,15,33,29,55,43,44,33,34,35,30,31,45,33,29,36,44,45,59,33,35,36,4,0,5,1,2,6,5,0,4,7,9,10,8,5,11,3,9,10,15,24,0,16,1,2,23,28,8,3,18,9,0,12,25,19,7,10,30,25,11,3,19,10,26,24,42,13,15,0,27,28,46,3,17,18,25,31,26,12,13,0,12,32,14,19,7,20,29,37,30,3,19,22,47,27,29,3,17,22,31,32,38,12,13,14,19,32,20,34,44,21,38,31,32,36,40,45,33,32,43,48,34,44,31,35,39,36,40,52,32,33,43,36,45,49,39,53,41,40,52,56,35,33,50,36,52,49,59,53,54,52,56,57,50,51,55,52,49,58,54,55,60,52,57,58]
+}
+,{"from_order" : 3,
+"to_order" : 0,
+"value" : [0,1,2,3,0,1,4,5,4,1,6,5,2,1,0,5,0,3,2,7,0,8,1,3,0,4,9,5,2,0,9,5,6,10,4,5,0,7,2,11,8,3,0,7,1,8,12,3,9,4,13,5,9,0,2,11,4,10,13,5,0,14,8,7,8,15,3,7,12,3,8,16,17,14,0,7,14,15,8,7,3,15,8,16,17,18,14,7,19,15,14,7,14,18,19,7,1,0,3,4,6,3,2,4,3,0,2,4,3,5,1,4,7,3,6,4,7,5,3,4,8,7,6,4,8,5,7,4,6,7,8,12,7,5,8,9,7,13,8,12,7,9,8,13,7,10,5,9,10,9,7,13,5,10,11,9,7,15,10,13,10,17,9,13,11,9,10,16,14,15,7,13,15,17,10,13,9,17,10,16,14,18,15,13,19,17,15,13,15,18,19,13,4,2,5,0,2,1,5,0,5,2,4,10,2,3,1,0,4,3,2,0,5,1,8,0,2,12,4,10,1,3,6,0,2,3,4,9,1,7,8,0,2,9,4,12,6,7,1,0,2,11,3,9,11,9,2,12,3,11,15,9,2,14,11,12,11,17,9,12,15,9,11,16,13,14,2,12,14,17,11,12,9,17,11,16,13,18,14,12,19,17,14,12,14,18,19,12,1,4,8,9,2,8,0,4,0,8,1,4,2,0,3,4,4,15,8,9,8,4,2,12,3,0,2,6,2,4,3,12,8,15,4,12,2,14,8,12,2,0,5,6,3,2,10,6,2,12,3,11,14,15,8,12,13,14,2,12,5,0,7,6,2,5,10,6,10,2,3,11,19,15,14,12,13,18,14,12,7,16,5,6,10,5,17,6,14,18,19,12,5,16,17,6,3,0,2,4,2,6,3,4,1,3,5,4,1,0,3,4,3,6,7,4,5,3,7,4,7,6,8,4,7,8,5,4,8,6,7,15,5,7,11,8,5,8,9,4,16,8,7,15,5,11,9,8,11,7,16,8,5,4,9,10,9,11,5,14,12,5,9,10,5,11,13,14,9,5,12,14,13,11,17,14,5,13,12,14,17,19,13,14,12,13,18,14,13,19,18,14,1,2,3,0,2,4,3,0,3,2,1,5,2,9,3,4,2,6,1,5,3,8,2,5,2,8,3,9,10,9,2,4,2,8,6,5,1,6,7,5,2,10,8,9,2,12,10,4,10,15,9,4,6,8,16,5,6,17,7,5,8,10,18,9,11,12,2,4,12,15,10,4,9,15,10,19,16,17,6,5,18,9,10,19,11,13,12,4,14,15,12,4,12,13,14,4,1,3,0,7,9,3,1,7,1,11,9,7,1,2,0,3,1,9,2,3,1,7,0,5,9,15,3,7,10,11,1,7,11,15,9,7,0,2,1,6,2,9,13,3,4,1,0,5,3,15,9,14,10,18,11,7,19,15,11,7,1,2,8,6,0,1,4,6,13,3,9,14,11,18,19,7,8,2,12,6,1,8,4,6,12,17,8,6,4,8,16,6,8,17,16,6,0,3,4,1,4,3,0,2,5,3,4,2,9,0,4,1,11,3,5,2,4,0,6,2,6,5,4,2,7,4,9,0,5,10,11,2,6,4,7,0,6,0,8,2,13,5,6,2,13,10,5,2,6,7,8,0,6,2,8,12,8,7,6,16,14,6,8,12,6,7,15,16,8,6,14,16,15,7,17,16,6,15,14,16,17,19,15,16,14,15,18,16,15,19,18,16]
+}
+,{"from_order" : 2,
+"to_order" : 3,
+"offset" : [0,1,3,5,7,8,9,10,11,11,13,15,16,17,19,20,21,22,23,23,25,27,28,30,32,33,33,35,36,37,39,41,42,43,44,44,45,47,49,50,51,52,53,55,56,57,58,58,60,61,62,64,65,66,66,68,70,71,73,75,77,79,80,80,82,83,84,86,88,90,91,93,95,96],
+"value" : [0,0,3,1,2,1,3,1,2,2,3,3,0,0,2,0,0,1,2,1,1,2,2,3,1,0,1,0,0,4,1,5,1,3,1,0,0,0,2,1,2,1,2,2,0,0,1,0,3,0,1,1,2,2,3,2,3,3,0,1,0,0,0,2,1,1,3,0,4,1,0,0,5,0,1,1,2,7,2,2,3,0,0,0,0,1,4,2,5,1,1,1,2,6,2,2]
+}
+,{"from_order" : 2,
+"to_order" : 2,
+"offset" : [0,7,18,28,38,46,52,57,64,64,73,82,87,92,101,107,112,117,122,122,132,141,148,158,168,175,175,185,190,195,204,214,223,229,234,234,239,248,257,262,268,273,281,290,295,300,305,305,314,319,325,335,340,348,348,358,369,376,388,399,410,422,429,429,438,444,449,459,469,480,488,497,507,514],
+"value" : [1,7,9,17,34,8,13,9,3,4,16,8,27,22,12,14,0,7,4,6,3,5,7,10,11,21,25,36,9,1,4,16,10,20,12,2,5,7,9,1,3,16,10,19,2,6,6,2,3,7,11,26,2,4,11,24,5,0,1,2,3,5,23,12,10,3,11,12,14,1,4,9,2,3,8,11,12,0,14,4,2,7,3,9,0,1,7,2,0,10,1,8,6,8,11,12,0,14,1,5,7,6,17,18,14,4,7,14,15,5,4,8,8,1,2,4,5,1,3,4,6,7,16,5,7,1,3,6,9,4,11,13,25,26,5,0,7,3,2,4,14,41,26,3,10,1,4,14,21,23,24,26,29,0,7,1,2,10,5,19,6,0,9,11,13,1,2,14,0,16,25,26,1,4,19,9,11,12,5,16,24,26,8,4,6,3,7,2,37,52,3,6,1,37,38,2,6,1,7,37,4,24,39,40,5,7,8,0,6,37,3,24,39,40,24,28,32,0,9,11,12,4,7,7,8,0,4,2,3,4,5,1,3,6,3,2,9,1,4,3,5,2,11,12,7,13,0,4,3,10,1,11,12,7,13,0,9,2,10,1,5,0,5,0,1,11,23,24,4,1,3,11,15,8,17,18,20,21,12,7,9,8,10,1,2,11,12,13,6,9,7,10,6,12,14,10,0,2,6,7,9,2,3,7,8,12,14,3,18,21,2,5,1,4,3,7,2,0,4,3,6,9,1,0,5,2,6,9,12,14,0,18,21,1,7,30,21,0,1,5,21,22,23,24,25,0,2,4,9,10,12,14,3,17,1,4,8,2,24,5,7,0,17,4,34,36,31,38,39,8,0,4,39,40,3,16,0,9,10,12,14,4,25,5,6,26,2,16,0,17,1,3,25,5,6,26,2,39,40,1,24,7,3,4,25,6,26,39,54,57,27,7,50,51,52,55,57,3,4,25,5,26,6,27,1,24,5,57,62,14,15,16,3,5,13,2,20,1,3,7,9,0,20,2,3,6,0,13,1,2,6,0,14,15,16,5,1,7,9,33,9,22,23,29,31,8,27,6,7,0,14,15,16,3,17,18,6,28,7,8,2,3,17,18,5,4,27,7,1,3,9,4,27,6,28,5,8,36,9,22,23,4,29,31,28,5,7,4,33,1,3,7,8,36]
+}
+,{"from_order" : 2,
+"to_order" : 1,
+"value" : [0,7,8,4,5,0,1,2,11,4,10,2,4,9,1,3,2,13,1,12,3,0,2,14,5,6,8,4,6,7,9,10,11,12,11,13,5,10,14,8,16,17,5,15,17,6,15,16,18,4,19,19,7,20,18,6,20,9,21,22,21,10,23,22,11,23,21,5,24,24,14,23,25,12,26,26,11,27,25,13,27,5,28,29,17,29,30,15,28,30,18,15,31,20,16,31,19,32,33,34,32,20,33,7,34,22,35,36,36,11,37,35,23,37,24,38,29,21,38,28,26,36,39,39,27,37,40,18,41,40,42,15,41,42,31,44,20,43,44,16,45,43,45,31,20,47,48,32,46,48,34,46,47,49,50,40,50,15,51,49,42,51,52,43,41,52,42,45,43,48,53,44,47,53,54,55,49,55,42,56,54,56,51,57,58,52,58,42,59,57,59,45,55,58,60,60,56,59,0,12,1,3,12,4,2,1,4,2,0,3,6,12,7,5,17,7,13,5,6,8,4,7,3,6,8,9,15,1,11,0,9,11,15,12,10,14,12,18,10,11,13,12,17,10,23,13,18,14,15,23,14,17,24,16,17,26,14,16,19,15,16,19,20,33,18,27,20,18,26,19,21,20,37,29,18,21,22,20,36,21,41,22,24,25,34,23,25,28,23,26,24,26,27,33,26,28,34,29,27,37,26,30,35,27,30,40,30,28,47,29,30,44,32,48,30,31,30,52,53,31,32,32,29,45,33,40,35,35,34,47,37,38,42,36,46,38,36,41,37,39,38,58,39,37,43,37,40,44,39,40,51,41,46,42,43,42,58,43,51,44,45,48,44,57,43,45,60,49,50,56,48,50,55,48,49,57,48,51,59,50,51,53,48,52,54,49,52,54,55,53,55,56,60,59,56,57,0,1,2,17,1,4,29,3,4,15,1,3,5,2,4,0,17,5,10,6,2,14,1,6,25,6,7,11,2,7,23,6,3,12,2,8,34,7,8,13,2,9,33,4,9,35,8,9,14,0,10,10,11,25,34,11,12,5,33,13,12,13,35,15,16,28,14,16,24,14,15,23,15,18,30,17,32,18,17,15,29,19,16,37,19,14,26,15,20,31,16,20,40,20,18,41,19,20,45,22,47,20,21,20,51,52,21,22,22,19,46,23,24,28,26,24,37,27,24,36,26,42,27,29,32,30,28,40,31,31,30,41,37,38,43,36,57,38,36,42,37,39,38,58,39,37,44,37,40,45,39,40,50,42,57,43,44,43,58,44,50,45,46,47,45,56,44,46,60,48,49,55,47,49,54,47,48,56,47,50,59,49,50,52,47,51,53,48,51,53,54,52,54,55,60,59,55,56,0,12,1,3,4,40,2,4,27,2,3,26,5,1,26,17,0,5,6,1,2,5,3,6,7,1,21,0,11,7,8,9,30,0,16,9,0,15,8,10,9,36,8,31,10,7,9,24,11,12,21,11,14,23,13,45,14,13,11,22,15,16,30,13,16,35,11,16,24,15,13,32,17,12,26,11,18,25,12,18,29,18,14,47,17,18,42,20,48,18,19,18,52,53,19,20,20,17,43,22,45,23,22,24,35,21,29,25,25,23,47,26,2
7,40,28,27,44,28,26,41,26,29,42,28,29,51,31,30,36,32,30,35,33,30,37,39,31,33,34,30,38,32,46,34,33,34,59,39,36,37,46,35,38,59,37,38,41,40,44,41,51,42,43,48,42,57,41,43,60,49,50,56,48,50,55,48,49,57,48,51,58,50,51,53,48,52,54,49,52,54,55,53,55,56,60,58,56,57,2,3,4,1,10,4,0,10,3,0,1,2,5,4,18,5,1,11,7,14,8,6,10,8,6,7,12,9,3,8,9,0,6,11,10,18,12,10,14,13,10,19,12,27,13,11,29,13,36,15,16,21,14,16,22,14,15,16,41,17,14,24,17,27,14,19,32,15,19,28,15,18,29,18,19,21,22,36,20,22,35,20,21,39,21,24,41,23,43,24,23,21,40,25,26,51,20,26,45,20,25,44,23,26,48,21,26,42,25,23,49,27,22,32,27,20,34,29,33,30,28,38,30,28,32,29,32,38,33,31,58,33,37,31,32,34,35,32,34,31,47,39,35,36,37,58,38,47,35,37,40,43,41,40,42,48,39,45,42,44,45,51,46,45,55,44,52,46,49,51,48,50,48,57,49,50,54,52,51,55,53,51,56,59,52,53,54,51,57,53,54,60,59,55,56,60,56,57,10,3,4,1,2,4,0,2,3,0,10,1,21,4,5,15,3,5,0,13,6,1,20,6,7,30,6,12,0,7,8,6,33,7,8,37,10,11,19,9,11,39,9,10,18,12,13,30,9,13,29,9,12,34,10,13,20,14,11,44,14,9,41,10,15,21,11,15,25,14,15,27,17,22,15,16,15,28,51,16,17,17,14,50,18,39,19,18,29,20,19,25,21,56,23,24,54,22,24,53,22,23,55,22,26,57,24,26,44,25,27,46,25,26,49,26,27,51,22,28,52,23,28,50,22,27,34,29,30,40,29,31,35,30,31,36,30,32,59,31,32,37,30,33,38,32,33,34,35,40,59,35,36,36,37,38,41,39,44,42,39,43,41,47,42,44,45,48,43,60,45,43,47,44,46,45,58,46,44,49,47,60,48,49,48,58,55,49,50,52,53,51,53,54,56,57,54,55,0,1,2,3,1,24,2,27,5,0,4,5,1,4,27,3,4,41,6,37,4,6,3,49,7,17,2,11,0,7,0,13,9,8,28,9,12,0,8,8,10,29,0,15,10,7,19,10,5,9,32,11,1,17,12,28,13,14,15,33,11,15,19,11,14,18,12,15,29,14,12,30,3,11,21,4,13,32,16,4,42,50,16,6,18,19,33,20,19,34,18,43,20,21,17,24,22,17,23,21,46,22,24,25,47,23,56,25,23,46,24,26,25,57,26,24,48,24,27,41,26,27,40,30,33,29,31,29,36,30,31,45,43,33,34,44,33,35,55,34,35,45,33,36,59,35,36,60,38,39,53,37,39,52,37,38,54,37,40,58,39,40,48,40,41,50,37,42,51,38,42,49,37,41,55,43,44,44,45,59,46,56,47,48,47,57,54,48,49,51,52,50,52,53,60,58,53,54,0,1,2,5,2,6,4,1,6,4,0,5,8,3,9,0,7,10,4,7,9,5,9,10,11,3,1
0,8,5,11,38,13,14,32,12,14,31,12,13,15,16,1,15,24,0,26,13,0,25,12,0,14,7,20,12,19,7,42,13,15,24,16,2,46,17,18,28,3,18,27,3,17,32,19,20,20,44,21,19,34,21,23,18,9,25,19,10,30,3,19,37,19,22,29,3,22,47,17,22,23,28,8,25,31,26,26,24,42,30,25,11,27,28,46,29,37,30,47,27,29,31,32,38,32,34,44,33,48,34,33,32,43,35,36,52,31,36,40,31,35,39,33,36,49,32,36,45,35,33,50,39,40,52,41,40,56,39,53,41,38,40,45,43,48,44,43,45,49,50,52,49,51,49,58,50,51,55,53,52,56,54,52,57,59,53,54,55,52,58,54,55,60,59,56,57,60,57,58]
+}
+,{"from_order" : 2,
+"to_order" : 0,
+"value" : [1,2,3,0,1,2,1,4,5,0,1,5,0,1,4,1,6,5,4,1,6,2,1,5,0,2,3,0,1,3,0,4,5,4,6,5,2,0,5,3,2,7,0,2,7,0,3,7,0,8,1,8,1,3,0,8,3,0,4,9,0,9,5,4,9,5,9,0,2,2,9,5,6,10,4,10,4,5,6,10,5,0,2,11,7,2,11,0,7,11,8,0,7,8,3,7,1,8,12,12,3,8,1,12,3,9,4,13,4,13,5,9,13,5,9,2,11,9,0,11,4,10,13,10,13,5,0,14,8,14,0,7,14,8,7,3,15,8,15,3,7,8,15,7,3,8,16,12,8,16,12,3,16,17,14,0,17,0,7,17,14,7,14,15,8,15,14,7,15,8,16,3,15,16,17,18,14,14,18,7,17,18,7,19,15,14,14,19,7,19,15,7,14,18,19,18,19,7,3,1,4,0,3,4,1,0,4,1,0,3,3,2,4,6,2,4,6,3,2,0,2,4,3,0,2,5,1,4,3,5,1,5,3,4,7,3,4,7,5,3,3,6,4,7,3,6,7,5,4,7,6,4,8,6,4,8,7,4,8,5,4,5,8,9,7,5,9,7,5,8,10,5,9,7,10,5,5,11,9,5,10,11,6,8,12,6,7,12,6,7,8,7,8,9,7,8,12,7,10,9,7,8,13,7,9,13,7,13,12,10,7,13,15,7,13,14,7,13,14,15,7,7,15,10,9,8,13,13,8,12,9,10,16,11,9,16,11,9,10,9,17,16,9,17,10,10,9,13,17,9,13,11,10,16,17,10,16,10,17,13,15,10,13,15,17,10,18,19,13,15,19,13,15,18,13,17,15,13,19,17,13,14,15,13,14,18,13,14,18,15,15,18,19,19,17,15,2,1,0,2,5,0,4,5,0,4,2,0,1,5,0,2,1,5,3,1,0,2,3,0,3,6,0,1,6,0,4,3,0,7,1,0,6,7,0,1,8,0,5,8,0,7,8,0,2,3,1,1,3,6,6,7,1,5,1,8,1,7,8,2,4,9,2,3,9,2,3,4,2,4,10,5,2,10,5,2,4,2,11,9,2,11,3,2,4,12,2,9,12,2,12,10,11,2,12,14,2,12,13,2,12,13,14,2,2,14,11,3,4,9,11,3,9,3,15,9,3,11,15,5,4,10,9,4,12,12,4,10,9,11,16,15,9,16,15,9,11,9,17,16,9,17,11,11,9,12,17,9,12,15,11,16,17,11,16,11,17,12,14,11,12,14,17,11,18,19,12,14,19,12,14,18,12,17,14,12,19,17,12,13,14,12,13,18,12,13,18,14,14,18,19,19,17,14,2,0,4,1,8,9,1,4,9,1,4,8,8,0,4,2,8,0,0,1,4,0,8,1,0,3,4,2,0,3,0,5,6,2,0,6,2,0,5,0,7,6,5,0,7,3,0,6,2,3,4,2,3,11,10,2,11,10,2,3,2,5,6,2,10,6,3,2,6,2,5,10,2,8,4,2,3,12,2,4,12,2,12,11,8,2,12,14,2,12,13,2,12,13,14,2,2,14,8,10,3,11,3,10,6,4,3,12,12,3,11,4,8,9,4,15,9,4,15,8,8,4,12,15,4,12,5,7,6,5,10,6,16,5,6,7,16,5,5,17,6,10,5,17,5,16,17,7,16,6,10,17,6,16,17,6,15,8,9,8,15,12,14,8,12,14,15,8,18,19,12,14,19,12,14,18,12,15,14,12,19,15,12,13,14,12,13,18,12,13,18,14,14,18,19,19,15,14,0,2,4,3,2,4,3,0,4,3,0,2,2,6,4,2,6,3,1,5,4,1,3,4,1,3,5,1,0,4,1,0,3,6,3,4,3,5,4,3,7,4,5,3,7,3,6,
7,8,9,4,5,9,4,5,8,4,4,9,10,5,4,10,7,5,4,7,8,4,6,8,4,7,6,4,5,9,8,5,11,8,5,11,9,5,9,10,12,5,10,12,5,9,5,13,14,5,11,14,5,11,13,5,12,14,9,5,14,5,13,12,5,7,8,5,7,11,6,7,15,8,6,15,8,6,7,8,7,15,16,7,15,16,8,7,7,11,8,11,7,16,11,9,8,16,8,15,11,16,8,12,9,10,9,12,14,9,11,14,11,13,14,11,17,14,13,11,17,13,12,14,12,18,14,12,13,18,13,17,14,19,13,14,17,19,13,13,18,14,13,19,18,17,19,14,19,18,14,2,3,0,1,3,0,1,2,0,1,2,3,4,3,0,2,4,0,2,1,5,3,1,5,6,1,5,2,6,1,1,7,5,1,6,7,2,3,9,2,8,9,2,8,3,2,6,5,2,8,5,2,8,6,3,2,5,2,10,9,2,10,8,2,3,4,2,9,4,10,2,4,12,2,4,11,2,4,11,12,2,2,12,10,8,3,9,3,8,5,9,3,4,13,14,4,12,14,4,12,13,4,15,12,4,14,15,4,10,9,4,15,9,4,10,15,4,11,12,4,11,13,4,12,10,4,8,6,5,8,16,5,6,16,5,17,6,5,16,17,5,6,7,5,17,7,5,6,8,16,16,17,6,6,17,7,10,8,9,8,18,9,8,10,18,9,10,19,18,9,19,18,9,10,9,15,19,9,15,10,18,10,19,15,10,19,12,15,10,11,13,12,12,13,14,14,15,12,1,0,3,1,9,3,3,0,7,1,0,7,1,3,7,9,1,7,11,1,7,1,11,9,2,0,3,1,2,0,1,0,5,4,0,5,4,1,0,0,4,6,0,1,6,0,2,6,7,0,5,1,2,3,4,1,5,1,8,6,1,2,6,1,2,8,1,4,6,1,8,4,1,9,2,1,7,5,10,1,7,10,11,1,2,8,6,2,12,6,8,2,12,9,2,3,2,13,3,2,9,13,3,9,14,13,3,14,13,3,9,3,15,14,3,15,9,9,3,7,15,3,7,8,4,6,4,16,6,4,8,16,8,12,6,17,8,6,12,17,6,8,16,6,17,16,6,18,19,7,11,19,7,11,18,7,15,11,7,19,15,7,9,15,7,10,11,7,10,18,7,11,9,7,12,17,8,8,17,16,13,9,14,15,9,14,11,15,9,10,18,11,11,18,19,19,15,11,0,4,1,3,4,1,0,3,1,0,3,4,3,5,2,4,0,2,3,0,2,4,3,2,5,4,2,5,3,4,7,8,0,6,8,0,6,7,0,9,0,1,9,0,4,4,7,0,6,4,0,0,8,2,6,0,2,7,9,0,9,4,1,10,11,2,5,11,2,5,10,2,6,8,2,2,8,12,6,2,12,11,3,2,4,6,2,5,6,2,13,6,2,13,5,2,13,10,2,11,3,5,6,4,7,7,4,9,6,5,4,5,10,11,13,5,6,13,10,5,6,7,8,6,8,12,14,6,12,14,6,8,6,15,16,6,7,16,6,7,15,6,14,16,8,6,16,6,15,14,7,15,16,7,17,16,15,7,17,8,7,16,14,8,12,8,14,16,15,14,16,14,18,16,14,15,18,15,17,16,19,15,16,17,19,15,15,18,16,15,19,18,17,19,16,19,18,16]
+}
+,{"from_order" : 1,
+"to_order" : 3,
+"offset" : [0,2,4,7,8,8,10,12,13,15,17,18,20,22,23,23,25,29,35,37,40,42,42,46,49,51,53,54,56,57,57,59,61,62,64,66,67,69,70,72,73,73,76,78,79,81,83,84,84,90,94,96,99,104,106,108,108,114,116,118,124,126,129,130,133,135,138,142,144],
+"value" : [0,3,1,2,1,2,3,2,3,0,3,0,0,0,2,0,2,1,1,2,1,2,2,3,1,3,0,4,1,3,7,1,11,5,9,0,4,0,1,5,1,5,3,10,6,1,3,1,2,0,2,0,2,0,1,2,2,0,3,0,1,0,0,3,0,1,1,2,3,2,2,3,3,0,2,4,0,2,0,0,1,0,1,1,3,11,16,9,0,5,3,4,0,1,3,0,4,1,2,0,5,1,7,2,0,5,7,2,3,9,0,7,5,1,3,0,3,0,8,4,11,12,6,2,0,1,0,1,2,0,10,5,1,4,2,4,1,2,5,1,6,2,6,2]
+}
+,{"from_order" : 1,
+"to_order" : 2,
+"offset" : [0,3,6,10,12,12,15,18,20,23,26,28,31,34,36,36,39,43,49,52,56,59,59,64,68,71,74,76,79,81,81,84,87,89,92,95,97,100,102,105,107,107,111,114,116,119,122,124,124,130,134,137,141,147,150,153,153,159,162,165,171,174,178,180,184,187,191,195,198],
+"value" : [0,1,7,2,4,6,2,3,5,7,5,6,0,10,3,9,0,2,2,3,1,3,8,1,2,7,5,6,4,6,8,4,5,7,7,8,0,16,5,0,7,1,3,6,0,9,4,11,13,2,3,10,1,2,4,14,4,5,19,0,9,11,12,5,8,0,4,6,2,3,6,1,3,7,1,2,4,5,7,6,7,2,3,10,1,3,5,0,3,0,2,9,0,1,4,4,5,7,8,10,6,8,6,7,9,9,10,2,3,6,9,1,3,7,1,2,0,2,5,0,1,4,4,5,0,9,10,12,14,3,0,17,1,4,8,0,2,1,24,5,7,3,4,25,5,6,26,2,3,16,6,27,7,0,14,15,16,3,5,0,13,2,0,20,1,22,23,4,29,31,8,2,3,6,1,3,7,9,1,2,17,18,5,6,4,33,9,4,27,6,7,28,5,7,8,8,36,9]
+}
+,{"from_order" : 1,
+"to_order" : 1,
+"offset" : [0,13,26,40,50,50,59,69,75,84,94,102,111,121,127,127,143,160,173,186,198,209,209,226,240,250,260,266,280,290,290,299,308,314,326,338,346,355,368,380,386,386,401,412,420,433,442,454,454,470,487,500,517,536,551,567,567,581,591,601,616,628,640,648,664,675,689,705,718],
+"value" : [4,7,1,2,3,19,33,5,8,24,29,14,17,4,0,7,2,3,19,33,9,11,12,22,26,36,4,0,7,1,3,19,33,10,11,14,23,13,27,37,4,0,7,1,2,19,33,12,13,25,10,11,12,13,3,6,9,1,2,0,9,2,14,15,12,16,17,4,7,0,9,1,3,4,8,2,4,8,10,11,12,0,13,6,2,3,8,14,15,12,16,17,1,7,23,24,25,17,13,6,7,8,10,11,12,0,13,3,5,7,8,5,6,8,14,15,12,16,17,1,4,2,3,4,5,6,7,14,15,16,17,18,1,19,20,21,22,10,2,11,5,12,13,14,15,16,17,18,0,19,20,21,22,6,2,3,4,7,8,9,0,10,11,5,12,13,1,6,3,4,7,8,9,15,23,28,29,30,31,1,6,2,4,7,8,9,17,29,32,5,33,1,6,2,3,7,8,9,0,10,2,11,12,13,17,29,32,4,33,11,12,13,14,15,16,17,18,19,20,7,1,8,9,10,5,6,0,7,8,9,10,5,6,12,21,2,26,27,28,29,3,4,6,12,1,21,26,27,28,29,2,4,6,26,40,17,5,41,42,43,2,3,6,27,40,44,3,26,40,17,41,42,43,0,7,1,8,9,10,6,0,7,1,8,9,10,5,2,3,4,1,10,11,6,12,13,2,3,9,0,10,11,6,12,13,2,4,5,0,3,9,1,4,5,0,2,9,10,4,14,15,16,17,18,19,8,1,2,5,10,3,14,15,16,17,18,19,8,1,2,4,28,29,30,11,18,7,8,9,0,1,10,11,12,13,6,8,9,20,21,22,23,24,25,26,27,14,12,6,7,9,10,3,4,14,15,16,17,18,19,6,7,8,0,2,3,1,2,6,7,8,9,10,11,3,12,13,14,15,16,17,0,2,6,7,8,10,18,19,4,20,21,0,1,6,7,8,3,4,5,9,10,11,0,12,13,14,15,16,17,2,4,5,10,18,19,1,20,21,2,3,5,22,23,24,15,25,21,26,27,28,2,3,4,11,1,12,13,14,15,3,4,16,6,7,2,8,9,10,5,11,0,12,13,14,15,3,4,16,6,17,2,23,24,25,26,27,0,7,8,9,10,5,1,17,23,24,25,26,27,11,0,1,12,13,14,15,4,16,6,46,24,47,21,48,41,49,11,0,1,12,13,14,15,3,16,6,37,38,39,27,5,32,40,41,42,0,7,2,8,9,10,37,38,39,4,27,32,40,41,42,52,53,37,54,50,49,11,0,1,12,13,14,15,3,4,16,12,13,14,15,1,4,7,24,2,25,26,5,10,11,12,13,14,15,0,4,7,16,2,6,24,0,25,26,5,10,11,16,1,6,27,28,8,29,30,11,17,18,19,7,20,21,9,10,22,12,13,14,15,0,1,7,5,6,23,8,9,4,6,23,8,9,24,0,2,25,26,10,11,4,5,23,8,9,16,1,2,12,13,14,15,0,1,4,3,17,18,19,20,21,9,10,22,4,5,6,23,9,27,28,3,29,30,11,4,5,6,23,8,3,17,18,19,7,20,21,10,22,24,0,2,25,26,5,11,3,17,18,19,7,20,21,9,22,27,28,3,8,29,30,24,0,2,25,26,5,10]
+}
+,{"from_order" : 1,
+"to_order" : 0,
+"value" : [1,2,1,4,1,5,1,6,0,1,0,2,0,3,1,3,2,3,0,4,0,5,4,5,4,6,6,5,2,5,0,7,3,7,2,7,0,8,8,1,3,8,9,0,4,9,9,5,9,2,6,10,10,4,10,5,0,11,2,11,7,11,8,7,12,8,1,12,12,3,9,13,4,13,13,5,9,11,10,13,14,0,14,8,14,7,15,8,3,15,15,7,12,16,3,16,8,16,17,14,17,0,17,7,15,14,15,16,17,18,14,18,18,7,19,15,14,19,19,7,18,19,3,1,1,4,1,0,0,3,0,4,6,2,3,2,2,4,0,2,5,1,7,3,5,3,3,4,3,6,7,4,5,4,8,4,6,4,7,5,5,8,5,9,10,5,5,11,6,7,6,8,6,12,7,8,7,9,7,12,7,10,7,13,14,7,15,7,8,9,8,12,8,13,11,9,9,10,9,16,9,17,9,13,11,10,10,16,17,10,10,13,15,10,11,16,13,12,15,13,18,13,19,13,17,13,14,13,14,15,14,18,15,18,15,19,17,15,17,16,19,17,18,19,2,1,2,0,1,0,4,0,5,0,1,5,3,0,6,0,7,0,8,0,3,1,1,6,7,1,1,8,2,3,2,4,2,9,5,2,2,10,2,11,2,12,13,2,14,2,3,4,3,9,3,6,11,3,3,15,4,9,5,4,4,10,4,12,5,10,5,8,6,7,7,8,15,9,9,11,9,16,9,17,9,12,12,10,15,11,11,16,17,11,11,12,14,11,14,12,18,12,19,12,17,12,13,12,13,14,13,18,14,18,14,19,17,14,15,16,17,16,19,17,18,19,2,0,0,4,1,4,1,8,1,9,8,0,0,1,0,3,0,5,0,6,0,7,2,3,2,4,10,2,2,11,2,5,2,6,2,8,2,12,13,2,14,2,3,4,10,3,3,11,3,6,3,12,4,8,4,9,4,15,4,12,5,6,5,7,5,10,16,5,5,17,10,6,7,6,16,6,17,6,7,16,8,9,15,8,8,12,14,8,15,9,10,11,10,17,12,11,14,12,18,12,19,12,15,12,13,12,13,14,13,18,14,18,14,19,15,14,19,15,16,17,18,19,3,0,3,2,0,2,0,4,2,4,2,6,1,3,1,5,1,4,1,0,3,4,6,3,3,5,3,7,5,4,8,4,9,4,4,10,6,4,7,4,5,11,5,9,5,8,12,5,5,10,5,13,5,14,5,7,8,6,6,7,6,15,16,7,8,7,7,15,7,11,11,8,9,8,16,8,8,15,11,9,12,9,9,10,9,14,12,10,11,13,11,14,11,17,11,16,12,14,13,12,12,18,13,14,13,17,19,13,13,18,17,14,19,14,18,14,16,15,17,19,19,18,1,2,1,3,1,0,2,0,3,0,4,0,1,5,6,1,1,7,2,8,2,3,2,9,2,6,2,5,2,10,2,4,11,2,12,2,8,3,3,9,3,5,3,4,12,4,13,4,14,4,9,4,15,4,10,4,11,4,8,5,6,5,16,5,17,5,7,5,8,6,6,16,17,6,6,7,17,7,8,9,8,16,10,8,8,18,18,9,9,10,9,19,9,15,18,10,10,19,15,10,12,10,11,12,11,13,12,13,12,14,15,12,13,14,14,15,15,19,16,17,18,19,1,0,1,3,0,3,1,9,1,7,0,7,11,1,2,0,4,0,0,5,0,6,1,2,4,1,1,5,1,8,1,6,10,1,2,3,2,8,2,6,2,12,9,2,2,13,13,3,3,9,3,14,3,15,3,7,4,5,4,6,8,4,4,16,7,5,8,6,12,6,17,6,16,6,11,7,18,7,19,7,15,7,9,7,10,7,8,12,17,8,8,16,13,9,9,14,1
5,9,11,9,10,11,10,18,11,18,11,19,15,11,12,17,13,14,15,14,19,15,17,16,18,19,0,4,0,1,4,1,5,2,0,3,3,4,3,1,0,2,3,5,3,2,4,2,5,4,6,0,7,0,8,0,9,0,9,1,10,2,11,2,6,2,8,2,2,12,13,2,11,3,9,4,6,4,4,7,5,10,5,11,13,5,5,6,6,7,6,8,14,6,6,12,6,15,6,16,13,6,7,8,7,15,7,16,7,17,7,9,14,8,8,12,8,16,10,11,13,10,14,12,14,16,15,14,14,18,15,16,15,17,19,15,15,18,17,16,19,16,18,16,17,19,19,18]
+}
+,{"from_order" : 0,
+"to_order" : 3,
+"offset" : [0,0,2,4,6,6,14,20,20,26,28,28,30,32,34,34,36,40,40,46,58,58,66,68,78,82,90,96],
+"value" : [0,2,3,0,1,2,3,0,7,4,1,11,5,9,3,7,1,11,5,9,3,10,15,6,1,2,0,2,0,3,2,3,0,1,0,1,0,2,4,9,3,11,16,9,0,5,3,11,15,16,9,20,4,0,5,1,7,2,13,3,9,0,10,7,5,1,3,0,8,10,14,4,5,1,11,12,6,2,0,4,1,2,3,9,0,7,5,1,6,2,8,4,11,12,6,2]
+}
+ ],
+"attrs" : {
+ "x" : [
+0,0,0,1,0,0,2,0,0,0,1,0,1,1,0,2,1,0,0,2,0,1,2,0,0,0,1,1,0,1,2,0,1,0,1,1,1,1,1,2,1,1,0,2,1,1,2,1,0,1,3,1,1,3,0,2,3,1,2,3 ]}
+}
\ No newline at end of file
diff --git a/tests/_python_orig/examples/__init__.py b/tests/_python_orig/examples/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/tests/_python_orig/examples/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/_python_orig/examples/algorithm/test_laplace.py b/tests/_python_orig/examples/algorithm/test_laplace.py
new file mode 100644
index 000000000..e957b9ea8
--- /dev/null
+++ b/tests/_python_orig/examples/algorithm/test_laplace.py
@@ -0,0 +1,10 @@
+def test_laplace():
+ from taichi.examples.algorithm.laplace import laplace, x, y
+
+ for i in range(10):
+ x[i, i + 1] = 1.0
+
+ laplace()
+
+ for i in range(10):
+ assert y[i, i + 1] == (4.0 if i % 3 == 1 else 0.0)
diff --git a/tests/_python_orig/examples/autodiff/__init__.py b/tests/_python_orig/examples/autodiff/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/tests/_python_orig/examples/autodiff/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/_python_orig/examples/autodiff/test_minimization.py b/tests/_python_orig/examples/autodiff/test_minimization.py
new file mode 100644
index 000000000..90660639e
--- /dev/null
+++ b/tests/_python_orig/examples/autodiff/test_minimization.py
@@ -0,0 +1,21 @@
+import random
+
+import taichi as ti
+from tests import test_utils
+
+
+def test_minimization():
+ from taichi.examples.autodiff.minimization import (L, gradient_descent, n,
+ reduce, x, y)
+
+ for i in range(n):
+ x[i] = random.random()
+ y[i] = random.random()
+
+ for k in range(100):
+ with ti.Tape(loss=L):
+ reduce()
+ gradient_descent()
+
+ for i in range(n):
+ assert x[i] == test_utils.approx(y[i], rel=1e-2)
diff --git a/tests/_python_orig/examples/rendering/test_cornell_box.py b/tests/_python_orig/examples/rendering/test_cornell_box.py
new file mode 100644
index 000000000..152d7f2a7
--- /dev/null
+++ b/tests/_python_orig/examples/rendering/test_cornell_box.py
@@ -0,0 +1,43 @@
+import argparse
+
+import taichi as ti
+
+FRAMES = 200
+
+
+def test_cornell_box():
+ from taichi.examples.rendering.cornell_box import render, tonemap
+ for i in range(FRAMES):
+ render()
+ interval = 10
+ if i % interval == 0:
+ tonemap(i)
+
+
+def video_cornell_box(result_dir):
+ from taichi.examples.rendering.cornell_box import (render, tonemap,
+ tonemapped_buffer)
+ video_manager = ti.VideoManager(output_dir=result_dir,
+ framerate=24,
+ automatic_build=False)
+ gui = ti.GUI("Taichi Cornell Box",
+ res=800,
+ background_color=0x112F41,
+ show_gui=False)
+ for i in range(FRAMES):
+ render()
+ interval = 10
+ if i % interval == 0:
+ tonemap(i)
+
+ gui.set_image(tonemapped_buffer)
+ video_manager.write_frame(gui.get_image())
+ gui.clear()
+ video_manager.make_video(mp4=True, gif=False)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Generate cornell_box video')
+ parser.add_argument('output_directory',
+ help='output directory of generated video')
+ video_cornell_box(parser.parse_args().output_directory)
diff --git a/tests/_python_orig/examples/rendering/test_taichi_logo.py b/tests/_python_orig/examples/rendering/test_taichi_logo.py
new file mode 100644
index 000000000..2a8e85395
--- /dev/null
+++ b/tests/_python_orig/examples/rendering/test_taichi_logo.py
@@ -0,0 +1,32 @@
+import argparse
+
+import taichi as ti
+
+FRAMES = 100
+
+
+def test_taichi_logo():
+ from taichi.examples.rendering.taichi_logo import paint
+ paint()
+
+
+def video_taichi_logo(result_dir):
+ from taichi.examples.rendering.taichi_logo import n, paint, x
+ video_manager = ti.VideoManager(output_dir=result_dir,
+ framerate=24,
+ automatic_build=False)
+ paint()
+ gui = ti.GUI('Logo', (n, n), show_gui=False)
+ for i in range(FRAMES):
+ gui.set_image(x)
+ video_manager.write_frame(gui.get_image())
+ gui.clear()
+
+ video_manager.make_video(mp4=True, gif=False)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Generate taichi_logo video')
+ parser.add_argument('output_directory',
+ help='output directory of generated video')
+ video_taichi_logo(parser.parse_args().output_directory)
diff --git a/tests/_python_orig/examples/simulation/test_mpm99.py b/tests/_python_orig/examples/simulation/test_mpm99.py
new file mode 100644
index 000000000..297fe0236
--- /dev/null
+++ b/tests/_python_orig/examples/simulation/test_mpm99.py
@@ -0,0 +1,45 @@
+import argparse
+
+import taichi as ti
+
+FRAMES = 100
+
+
+def test_mpm99():
+ from taichi.examples.simulation.mpm99 import dt, initialize, substep
+
+ initialize()
+ for i in range(FRAMES):
+ for s in range(int(2e-3 // dt)):
+ substep()
+
+
+def video_mpm99(result_dir):
+ from taichi.examples.simulation.mpm99 import (dt, initialize, material,
+ substep, x)
+
+ video_manager = ti.VideoManager(output_dir=result_dir,
+ framerate=24,
+ automatic_build=False)
+ initialize()
+ gui = ti.GUI("Taichi MLS-MPM-99",
+ res=512,
+ background_color=0x112F41,
+ show_gui=False)
+ for i in range(FRAMES):
+ for s in range(int(2e-3 // dt)):
+ substep()
+ gui.circles(x.to_numpy(),
+ radius=1.5,
+ palette=[0x068587, 0xED553B, 0xEEEEF0],
+ palette_indices=material)
+ video_manager.write_frame(gui.get_image())
+ gui.clear()
+ video_manager.make_video(mp4=True, gif=False)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Generate mpm99 video')
+ parser.add_argument('output_directory',
+ help='output directory of generated video')
+ video_mpm99(parser.parse_args().output_directory)
diff --git a/tests/_python_orig/fuse_test_template.py b/tests/_python_orig/fuse_test_template.py
new file mode 100644
index 000000000..db808509b
--- /dev/null
+++ b/tests/_python_orig/fuse_test_template.py
@@ -0,0 +1,91 @@
+import time
+
+import taichi as ti
+
+
+def template_fuse_dense_x2y2z(
+ size=1024**3,
+ repeat=10,
+ first_n=100,
+):
+ x = ti.field(ti.i32, shape=(size, ))
+ y = ti.field(ti.i32, shape=(size, ))
+ z = ti.field(ti.i32, shape=(size, ))
+ first_n = min(first_n, size)
+
+ @ti.kernel
+ def x_to_y():
+ for i in x:
+ y[i] = x[i] + 1
+
+ @ti.kernel
+ def y_to_z():
+ for i in x:
+ z[i] = y[i] + 4
+
+ def x_to_y_to_z():
+ x_to_y()
+ y_to_z()
+
+ for i in range(first_n):
+ x[i] = i * 10
+
+ # Simply test
+ for _ in range(repeat):
+ t = time.time()
+ x_to_y()
+ ti.sync()
+ print('x_to_y', time.time() - t)
+
+ for _ in range(repeat):
+ t = time.time()
+ y_to_z()
+ ti.sync()
+ print('y_to_z', time.time() - t)
+
+ for _ in range(repeat):
+ t = time.time()
+ x_to_y_to_z()
+ ti.sync()
+ print('fused x->y->z', time.time() - t)
+
+ for i in range(first_n):
+ assert x[i] == i * 10
+ assert y[i] == x[i] + 1
+ assert z[i] == x[i] + 5
+
+
+def template_fuse_reduction(size=1024**3, repeat=10, first_n=100):
+ x = ti.field(ti.i32, shape=(size, ))
+ first_n = min(first_n, size)
+
+ @ti.kernel
+ def reset():
+ for i in range(first_n):
+ x[i] = i * 10
+
+ @ti.kernel
+ def inc():
+ for i in x:
+ x[i] = x[i] + 1
+
+ # Simply test
+ reset()
+ ti.sync()
+ for _ in range(repeat):
+ t = time.time()
+ inc()
+ ti.sync()
+ print('single inc', time.time() - t)
+
+ reset()
+ ti.sync()
+ t = time.time()
+ for _ in range(repeat):
+ inc()
+ ti.sync()
+ duration = time.time() - t
+ print(f'fused {repeat} inc: total={duration} average={duration / repeat}')
+
+ for i in range(first_n):
+ assert x[i] == i * 10 + repeat
diff --git a/tests/_python_orig/test_abs.py b/tests/_python_orig/test_abs.py
new file mode 100644
index 000000000..a2ae2a6ba
--- /dev/null
+++ b/tests/_python_orig/test_abs.py
@@ -0,0 +1,37 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_abs():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ N = 16
+
+ ti.root.dense(ti.i, N).place(x)
+ ti.root.dense(ti.i, N).place(y)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ x[i] = abs(y[i])
+
+ for i in range(N):
+ y[i] = i - 10
+ x.grad[i] = 1
+
+ func()
+ func.grad()
+
+ def sgn(x):
+ if x > 0:
+ return 1
+ if x < 0:
+ return -1
+ return 0
+
+ for i in range(N):
+ assert x[i] == abs(y[i])
+ assert y.grad[i] == sgn(y[i])
diff --git a/tests/_python_orig/test_ad_atomic.py b/tests/_python_orig/test_ad_atomic.py
new file mode 100644
index 000000000..fa317a79e
--- /dev/null
+++ b/tests/_python_orig/test_ad_atomic.py
@@ -0,0 +1,28 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_ad_reduce():
+ N = 16
+
+ x = ti.field(dtype=ti.f32, shape=N, needs_grad=True)
+ loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ loss[None] += x[i]**2
+
+ total_loss = 0
+ for i in range(N):
+ x[i] = i
+ total_loss += i * i
+
+ loss.grad[None] = 1
+ func()
+ func.grad()
+
+ assert total_loss == test_utils.approx(loss[None])
+ for i in range(N):
+ assert x.grad[i] == test_utils.approx(i * 2)
diff --git a/tests/_python_orig/test_ad_basics.py b/tests/_python_orig/test_ad_basics.py
new file mode 100644
index 000000000..a870e643a
--- /dev/null
+++ b/tests/_python_orig/test_ad_basics.py
@@ -0,0 +1,353 @@
+import functools
+
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+has_autograd = False
+
+try:
+ import autograd.numpy as np
+ from autograd import grad
+ has_autograd = True
+except:
+ pass
+
+
+def if_has_autograd(func):
+ # functools.wraps is nececssary for pytest parametrization to work
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ if has_autograd:
+ func(*args, **kwargs)
+
+ return wrapper
+
+
+# Note: test happens at v = 0.2
+def grad_test(tifunc, npfunc=None):
+ npfunc = npfunc or tifunc
+
+ print(
+ f'arch={ti.lang.impl.current_cfg().arch} default_fp={ti.lang.impl.current_cfg().default_fp}'
+ )
+ x = ti.field(ti.lang.impl.current_cfg().default_fp)
+ y = ti.field(ti.lang.impl.current_cfg().default_fp)
+
+ ti.root.dense(ti.i, 1).place(x, x.grad, y, y.grad)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ y[i] = tifunc(x[i])
+
+ v = 0.234
+
+ y.grad[0] = 1
+ x[0] = v
+ func()
+ func.grad()
+
+ assert y[0] == test_utils.approx(npfunc(v), rel=1e-4)
+ assert x.grad[0] == test_utils.approx(grad(npfunc)(v), rel=1e-4)
+
+
+@if_has_autograd
+@test_utils.test()
+def test_size1():
+ x = ti.field(ti.i32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ x[0] = 1
+ assert x[0] == 1
+
+
+@pytest.mark.parametrize('tifunc', [
+ lambda x: x,
+ lambda x: -x,
+ lambda x: x * x,
+ lambda x: x**2,
+ lambda x: x * x * x,
+ lambda x: x * x * x * x,
+ lambda x: 0.4 * x * x - 3,
+ lambda x: (x - 3) * (x - 1),
+ lambda x: (x - 3) * (x - 1) + x * x,
+])
+@if_has_autograd
+@test_utils.test()
+def test_poly(tifunc):
+ grad_test(tifunc)
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: ti.tanh(x), lambda x: np.tanh(x)),
+ (lambda x: ti.sin(x), lambda x: np.sin(x)),
+ (lambda x: ti.cos(x), lambda x: np.cos(x)),
+ (lambda x: ti.acos(x), lambda x: np.arccos(x)),
+ (lambda x: ti.asin(x), lambda x: np.arcsin(x)),
+])
+@if_has_autograd
+@test_utils.test(exclude=[ti.vulkan])
+def test_trigonometric(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@pytest.mark.parametrize('tifunc', [
+ lambda x: 1 / x,
+ lambda x: (x + 1) / (x - 1),
+ lambda x: (x + 1) * (x + 2) / ((x - 1) * (x + 3)),
+])
+@if_has_autograd
+@test_utils.test()
+def test_frac(tifunc):
+ grad_test(tifunc)
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: ti.sqrt(x), lambda x: np.sqrt(x)),
+ (lambda x: ti.exp(x), lambda x: np.exp(x)),
+ (lambda x: ti.log(x), lambda x: np.log(x)),
+])
+@if_has_autograd
+@test_utils.test()
+def test_unary(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: ti.min(x, 0), lambda x: np.minimum(x, 0)),
+ (lambda x: ti.min(x, 1), lambda x: np.minimum(x, 1)),
+ (lambda x: ti.min(0, x), lambda x: np.minimum(0, x)),
+ (lambda x: ti.min(1, x), lambda x: np.minimum(1, x)),
+ (lambda x: ti.max(x, 0), lambda x: np.maximum(x, 0)),
+ (lambda x: ti.max(x, 1), lambda x: np.maximum(x, 1)),
+ (lambda x: ti.max(0, x), lambda x: np.maximum(0, x)),
+ (lambda x: ti.max(1, x), lambda x: np.maximum(1, x)),
+])
+@if_has_autograd
+@test_utils.test()
+def test_minmax(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@if_has_autograd
+@test_utils.test()
+def test_mod():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ ti.root.dense(ti.i, 1).place(x, y)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ y[0] = x[0] % 3
+
+ @ti.kernel
+ def func2():
+ ti.atomic_add(y[0], x[0] % 3)
+
+ func()
+ func.grad()
+
+ func2()
+ func2.grad()
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: ti.atan2(0.4, x), lambda x: np.arctan2(0.4, x)),
+ (lambda y: ti.atan2(y, 0.4), lambda y: np.arctan2(y, 0.4)),
+])
+@if_has_autograd
+@test_utils.test()
+def test_atan2(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: ti.atan2(0.4, x), lambda x: np.arctan2(0.4, x)),
+ (lambda y: ti.atan2(y, 0.4), lambda y: np.arctan2(y, 0.4)),
+])
+@if_has_autograd
+@test_utils.test(require=ti.extension.data64, default_fp=ti.f64)
+def test_atan2_f64(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: 0.4**x, lambda x: np.power(0.4, x)),
+ (lambda y: y**0.4, lambda y: np.power(y, 0.4)),
+])
+@if_has_autograd
+@test_utils.test()
+def test_pow(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@pytest.mark.parametrize('tifunc,npfunc', [
+ (lambda x: 0.4**x, lambda x: np.power(0.4, x)),
+ (lambda y: y**0.4, lambda y: np.power(y, 0.4)),
+])
+@if_has_autograd
+@test_utils.test(require=ti.extension.data64, default_fp=ti.f64)
+def test_pow_f64(tifunc, npfunc):
+ grad_test(tifunc, npfunc)
+
+
+@test_utils.test()
+def test_obey_kernel_simplicity():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x, y)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in x:
+ # OK: nested for loop
+ for j in ti.static(range(3)):
+ # OK: a series of non-for-loop statements
+ y[i] += x[i] * 42
+ y[i] -= x[i] * 5
+
+ y.grad[0] = 1.0
+ x[0] = 0.1
+
+ func()
+ func.grad()
+ assert x.grad[0] == test_utils.approx((42 - 5) * 3)
+
+
+@test_utils.test()
+def test_violate_kernel_simplicity1():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x, y)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in x:
+ y[i] = x[i] * 42
+ for j in ti.static(range(3)):
+ y[i] += x[i]
+
+ func()
+ func.grad()
+
+
+@test_utils.test()
+def test_violate_kernel_simplicity2():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x, y)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in x:
+ for j in ti.static(range(3)):
+ y[i] += x[i]
+ y[i] += x[i] * 42
+
+ func()
+ func.grad()
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_cast():
+ @ti.kernel
+ def func():
+ print(ti.cast(ti.cast(ti.cast(1.0, ti.f64), ti.f32), ti.f64))
+
+ func()
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_ad_precision_1():
+ loss = ti.field(ti.f32, shape=())
+ x = ti.field(ti.f64, shape=())
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ loss[None] = x[None]
+
+ loss.grad[None] = 1
+ func.grad()
+
+ assert x.grad[None] == 1
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_ad_precision_2():
+ loss = ti.field(ti.f64, shape=())
+ x = ti.field(ti.f32, shape=())
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ loss[None] = x[None]
+
+ with ti.Tape(loss):
+ func()
+
+ assert x.grad[None] == 1
+
+
+@test_utils.test()
+def test_ad_rand():
+ loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+ x = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def work():
+ loss[None] = x[None] * ti.random()
+
+ x[None] = 10
+ with pytest.raises(RuntimeError) as e:
+ with ti.Tape(loss):
+ work()
+ assert 'RandStmt not supported' in e.value.args[0]
+
+
+@test_utils.test(exclude=[ti.cc, ti.vulkan, ti.opengl])
+def test_ad_frac():
+ @ti.func
+ def frac(x):
+ fractional = x - ti.floor(x) if x > 0. else x - ti.ceil(x)
+ return fractional
+
+ @ti.kernel
+ def ti_frac(input_field: ti.template(), output_field: ti.template()):
+ for i in input_field:
+ output_field[i] = frac(input_field[i])**2
+
+ @ti.kernel
+ def calc_loss(input_field: ti.template(), loss: ti.template()):
+ for i in input_field:
+ loss[None] += input_field[i]
+
+ n = 10
+ field0 = ti.field(dtype=ti.f32, shape=(n, ), needs_grad=True)
+ randoms = np.random.randn(10).astype(np.float32)
+ field0.from_numpy(randoms)
+ field1 = ti.field(dtype=ti.f32, shape=(n, ), needs_grad=True)
+ loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+
+ with ti.Tape(loss):
+ ti_frac(field0, field1)
+ calc_loss(field1, loss)
+
+ grads = field0.grad.to_numpy()
+ expected = np.modf(randoms)[0] * 2
+ for i in range(n):
+ assert grads[i] == test_utils.approx(expected[i], rel=1e-4)
diff --git a/tests/_python_orig/test_ad_demote_dense.py b/tests/_python_orig/test_ad_demote_dense.py
new file mode 100644
index 000000000..425345385
--- /dev/null
+++ b/tests/_python_orig/test_ad_demote_dense.py
@@ -0,0 +1,14 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.metal, ti.opengl])
+def test_ad_demote_dense():
+ a = ti.field(ti.f32, shape=(7, 3, 19))
+
+ @ti.kernel
+ def inc():
+ for i, j, k in a:
+ a[i, j, k] += 1
+
+ inc.grad()
diff --git a/tests/_python_orig/test_ad_for.py b/tests/_python_orig/test_ad_for.py
new file mode 100644
index 000000000..b0b3cb6d3
--- /dev/null
+++ b/tests/_python_orig/test_ad_for.py
@@ -0,0 +1,777 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_sum():
+ N = 10
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.i32, shape=N)
+ p = ti.field(ti.f32, shape=N, needs_grad=True)
+
+ @ti.kernel
+ def compute_sum():
+ for i in range(N):
+ ret = 1.0
+ for j in range(b[i]):
+ ret = ret + a[i]
+ p[i] = ret
+
+ for i in range(N):
+ a[i] = 3
+ b[i] = i
+
+ compute_sum()
+
+ for i in range(N):
+ assert p[i] == 3 * b[i] + 1
+ p.grad[i] = 1
+
+ compute_sum.grad()
+
+ for i in range(N):
+ assert a.grad[i] == b[i]
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_sum_local_atomic():
+ N = 10
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.i32, shape=N)
+ p = ti.field(ti.f32, shape=N, needs_grad=True)
+
+ @ti.kernel
+ def compute_sum():
+ for i in range(N):
+ ret = 1.0
+ for j in range(b[i]):
+ ret += a[i]
+ p[i] = ret
+
+ for i in range(N):
+ a[i] = 3
+ b[i] = i
+
+ compute_sum()
+
+ for i in range(N):
+ assert p[i] == 3 * b[i] + 1
+ p.grad[i] = 1
+
+ compute_sum.grad()
+
+ for i in range(N):
+ assert a.grad[i] == b[i]
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_power():
+ N = 10
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.i32, shape=N)
+ p = ti.field(ti.f32, shape=N, needs_grad=True)
+
+ @ti.kernel
+ def power():
+ for i in range(N):
+ ret = 1.0
+ for j in range(b[i]):
+ ret = ret * a[i]
+ p[i] = ret
+
+ for i in range(N):
+ a[i] = 3
+ b[i] = i
+
+ power()
+
+ for i in range(N):
+ assert p[i] == 3**b[i]
+ p.grad[i] = 1
+
+ power.grad()
+
+ for i in range(N):
+ assert a.grad[i] == b[i] * 3**(b[i] - 1)
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_fibonacci():
+ N = 15
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=N, needs_grad=True)
+ c = ti.field(ti.i32, shape=N)
+ f = ti.field(ti.f32, shape=N, needs_grad=True)
+
+ @ti.kernel
+ def fib():
+ for i in range(N):
+ p = a[i]
+ q = b[i]
+ for j in range(c[i]):
+ p, q = q, p + q
+ f[i] = q
+
+ b.fill(1)
+
+ for i in range(N):
+ c[i] = i
+
+ fib()
+
+ for i in range(N):
+ f.grad[i] = 1
+
+ fib.grad()
+
+ for i in range(N):
+ print(a.grad[i], b.grad[i])
+ if i == 0:
+ assert a.grad[i] == 0
+ else:
+ assert a.grad[i] == f[i - 1]
+ assert b.grad[i] == f[i]
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_fibonacci_index():
+ N = 5
+ M = 10
+ a = ti.field(ti.f32, shape=M, needs_grad=True)
+ b = ti.field(ti.f32, shape=M, needs_grad=True)
+ f = ti.field(ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def fib():
+ for i in range(N):
+ p = 0
+ q = 1
+ for j in range(5):
+ p, q = q, p + q
+ b[q] += a[q]
+
+ for i in range(M):
+ f[None] += b[i]
+
+ f.grad[None] = 1
+ a.fill(1)
+
+ fib()
+ fib.grad()
+
+ for i in range(M):
+ is_fib = int(i in [1, 2, 3, 5, 8])
+ assert a.grad[i] == is_fib * N
+ assert b[i] == is_fib * N
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_global_ptr():
+ N = 5
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=N, needs_grad=True)
+ f = ti.field(ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def task():
+ for i in range(N):
+ p = 0
+ for j in range(N):
+ b[i] += a[p]**2
+ p += 1
+
+ for i in range(N):
+ f[None] += b[i]
+
+ f.grad[None] = 1
+ for i in range(N):
+ a[i] = i
+
+ task()
+ task.grad()
+
+ for i in range(N):
+ print(a.grad[i])
+ assert a.grad[i] == 2 * i * N
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_integer_stack():
+ N = 5
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=N, needs_grad=True)
+ c = ti.field(ti.i32, shape=N)
+ f = ti.field(ti.f32, shape=N, needs_grad=True)
+
+ @ti.kernel
+ def int_stack():
+ for i in range(N):
+ weight = 1
+ s = 0.0
+ for j in range(c[i]):
+ s += weight * a[i] + b[i]
+ weight *= 10
+ f[i] = s
+
+ a.fill(1)
+ b.fill(1)
+
+ for i in range(N):
+ c[i] = i
+
+ int_stack()
+
+ for i in range(N):
+ print(f[i])
+ f.grad[i] = 1
+
+ int_stack.grad()
+
+ t = 0
+ for i in range(N):
+ assert a.grad[i] == t
+ assert b.grad[i] == i
+ t = t * 10 + 1
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_double_for_loops():
+ N = 5
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=N, needs_grad=True)
+ c = ti.field(ti.i32, shape=N)
+ f = ti.field(ti.f32, shape=N, needs_grad=True)
+
+ @ti.kernel
+ def double_for():
+ for i in range(N):
+ weight = 1.0
+ for j in range(c[i]):
+ weight *= a[i]
+ s = 0.0
+ for j in range(c[i] * 2):
+ s += weight + b[i]
+ f[i] = s
+
+ a.fill(2)
+ b.fill(1)
+
+ for i in range(N):
+ c[i] = i
+
+ double_for()
+
+ for i in range(N):
+ assert f[i] == 2 * i * (1 + 2**i)
+ f.grad[i] = 1
+
+ double_for.grad()
+
+ for i in range(N):
+ assert a.grad[i] == 2 * i * i * 2**(i - 1)
+ assert b.grad[i] == 2 * i
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_double_for_loops_more_nests():
+ N = 6
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=N, needs_grad=True)
+ c = ti.field(ti.i32, shape=(N, N // 2))
+ f = ti.field(ti.f32, shape=(N, N // 2), needs_grad=True)
+
+ @ti.kernel
+ def double_for():
+ for i in range(N):
+ for k in range(N // 2):
+ weight = 1.0
+ for j in range(c[i, k]):
+ weight *= a[i]
+ s = 0.0
+ for j in range(c[i, k] * 2):
+ s += weight + b[i]
+ f[i, k] = s
+
+ a.fill(2)
+ b.fill(1)
+
+ for i in range(N):
+ for k in range(N // 2):
+ c[i, k] = i + k
+
+ double_for()
+
+ for i in range(N):
+ for k in range(N // 2):
+ assert f[i, k] == 2 * (i + k) * (1 + 2**(i + k))
+ f.grad[i, k] = 1
+
+ double_for.grad()
+
+ for i in range(N):
+ total_grad_a = 0
+ total_grad_b = 0
+ for k in range(N // 2):
+ total_grad_a += 2 * (i + k)**2 * 2**(i + k - 1)
+ total_grad_b += 2 * (i + k)
+ assert a.grad[i] == total_grad_a
+ assert b.grad[i] == total_grad_b
+
+
+@test_utils.test(require=[ti.extension.adstack, ti.extension.data64])
+def test_complex_body():
+ N = 5
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=N, needs_grad=True)
+ c = ti.field(ti.i32, shape=N)
+ f = ti.field(ti.f32, shape=N, needs_grad=True)
+ g = ti.field(ti.f32, shape=N, needs_grad=False)
+
+ @ti.kernel
+ def complex():
+ for i in range(N):
+ weight = 2.0
+ tot = 0.0
+ tot_weight = 0.0
+ for j in range(c[i]):
+ tot_weight += weight + 1
+ tot += (weight + 1) * a[i]
+ weight = weight + 1
+ weight = weight * 4
+ weight = ti.cast(weight, ti.f64)
+ weight = ti.cast(weight, ti.f32)
+
+ g[i] = tot_weight
+ f[i] = tot
+
+ a.fill(2)
+ b.fill(1)
+
+ for i in range(N):
+ c[i] = i
+ f.grad[i] = 1
+
+ complex()
+ complex.grad()
+
+ for i in range(N):
+ assert a.grad[i] == g[i]
+
+
+@test_utils.test(require=[ti.extension.adstack, ti.extension.bls])
+def test_triple_for_loops_bls():
+ N = 8
+ M = 3
+ a = ti.field(ti.f32, shape=N, needs_grad=True)
+ b = ti.field(ti.f32, shape=2 * N, needs_grad=True)
+ f = ti.field(ti.f32, shape=(N - M, N), needs_grad=True)
+
+ @ti.kernel
+ def triple_for():
+ ti.block_local(a)
+ ti.block_local(b)
+ for i in range(N - M):
+ for k in range(N):
+ weight = 1.0
+ for j in range(M):
+ weight *= a[i + j]
+ s = 0.0
+ for j in range(2 * M):
+ s += weight + b[2 * i + j]
+ f[i, k] = s
+
+ a.fill(2)
+
+ for i in range(2 * N):
+ b[i] = i
+
+ triple_for()
+
+ for i in range(N - M):
+ for k in range(N):
+ assert f[i, k] == 2 * M * 2**M + (4 * i + 2 * M - 1) * M
+ f.grad[i, k] = 1
+
+ triple_for.grad()
+
+ for i in range(N):
+ assert a.grad[i] == 2 * M * min(min(N - i - 1, i + 1), M) * \
+ 2**(M - 1) * N
+ for i in range(N):
+ assert b.grad[i * 2] == min(min(N - i - 1, i + 1), M) * N
+ assert b.grad[i * 2 + 1] == min(min(N - i - 1, i + 1), M) * N
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_mixed_inner_loops():
+ x = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+ arr = ti.field(dtype=ti.f32, shape=(5))
+ loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def mixed_inner_loops():
+ for i in arr:
+ loss[None] += ti.sin(x[None])
+ for j in range(2):
+ loss[None] += ti.sin(x[None]) + 1.0
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ mixed_inner_loops()
+ mixed_inner_loops.grad()
+
+ assert loss[None] == 10.0
+ assert x.grad[None] == 15.0
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_mixed_inner_loops_tape():
+ x = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+ arr = ti.field(dtype=ti.f32, shape=(5))
+ loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def mixed_inner_loops_tape():
+ for i in arr:
+ loss[None] += ti.sin(x[None])
+ for j in range(2):
+ loss[None] += ti.sin(x[None]) + 1.0
+
+ x[None] = 0.0
+ with ti.Tape(loss=loss):
+ mixed_inner_loops_tape()
+
+ assert loss[None] == 10.0
+ assert x.grad[None] == 15.0
+
+
+@test_utils.test(require=ti.extension.adstack, ad_stack_size=32)
+def test_inner_loops_local_variable_fixed_stack_size_tape():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_inner_loops_local_variable():
+ for i in arr:
+ for j in range(3):
+ s = 0.0
+ t = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ t += ti.sin(x[None])
+ loss[None] += s + t
+
+ x[None] = 0.0
+ with ti.Tape(loss=loss):
+ test_inner_loops_local_variable()
+
+ assert loss[None] == 18.0
+ assert x.grad[None] == 36.0
+
+
+@test_utils.test(require=ti.extension.adstack, ad_stack_size=32)
+def test_inner_loops_local_variable_fixed_stack_size_kernel_grad():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_inner_loops_local_variable():
+ for i in arr:
+ for j in range(3):
+ s = 0.0
+ t = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ t += ti.sin(x[None])
+ loss[None] += s + t
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ test_inner_loops_local_variable()
+ test_inner_loops_local_variable.grad()
+
+ assert loss[None] == 18.0
+ assert x.grad[None] == 36.0
+
+
+@test_utils.test(require=ti.extension.adstack, ad_stack_size=0)
+def test_inner_loops_local_variable_adaptive_stack_size_tape():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_inner_loops_local_variable():
+ for i in arr:
+ for j in range(3):
+ s = 0.0
+ t = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ t += ti.sin(x[None])
+ loss[None] += s + t
+
+ x[None] = 0.0
+ with ti.Tape(loss=loss):
+ test_inner_loops_local_variable()
+
+ assert loss[None] == 18.0
+ assert x.grad[None] == 36.0
+
+
+@test_utils.test(require=ti.extension.adstack, ad_stack_size=0)
+def test_inner_loops_local_variable_adaptive_stack_size_kernel_grad():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_inner_loops_local_variable():
+ for i in arr:
+ for j in range(3):
+ s = 0.0
+ t = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ t += ti.sin(x[None])
+ loss[None] += s + t
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ test_inner_loops_local_variable()
+ test_inner_loops_local_variable.grad()
+
+ assert loss[None] == 18.0
+ assert x.grad[None] == 36.0
+
+
+@test_utils.test(require=ti.extension.adstack, ad_stack_size=0)
+def test_more_inner_loops_local_variable_adaptive_stack_size_tape():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_more_inner_loops_local_variable():
+ for i in arr:
+ for j in range(2):
+ s = 0.0
+ for k in range(3):
+ u = 0.0
+ s += ti.sin(x[None]) + 1.0
+ for l in range(2):
+ u += ti.sin(x[None])
+ loss[None] += u
+ loss[None] += s
+
+ x[None] = 0.0
+ with ti.Tape(loss=loss):
+ test_more_inner_loops_local_variable()
+
+ assert loss[None] == 12.0
+ assert x.grad[None] == 36.0
+
+
+@test_utils.test(require=ti.extension.adstack, ad_stack_size=32)
+def test_more_inner_loops_local_variable_fixed_stack_size_tape():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_more_inner_loops_local_variable():
+ for i in arr:
+ for j in range(2):
+ s = 0.0
+ for k in range(3):
+ u = 0.0
+ s += ti.sin(x[None]) + 1.0
+ for l in range(2):
+ u += ti.sin(x[None])
+ loss[None] += u
+ loss[None] += s
+
+ x[None] = 0.0
+ with ti.Tape(loss=loss):
+ test_more_inner_loops_local_variable()
+
+ assert loss[None] == 12.0
+ assert x.grad[None] == 36.0
+
+
+@test_utils.test(require=ti.extension.adstack,
+ ad_stack_size=32,
+ arch=[ti.cpu, ti.gpu])
+def test_stacked_inner_loops_local_variable_fixed_stack_size_kernel_grad():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_stacked_inner_loops_local_variable():
+ for i in arr:
+ loss[None] += ti.sin(x[None])
+ for j in range(3):
+ s = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ loss[None] += s
+ for j in range(3):
+ s = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ loss[None] += s
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ test_stacked_inner_loops_local_variable()
+ test_stacked_inner_loops_local_variable.grad()
+
+ assert loss[None] == 36.0
+ assert x.grad[None] == 38.0
+
+
+@test_utils.test(require=ti.extension.adstack,
+ ad_stack_size=32,
+ arch=[ti.cpu, ti.gpu])
+def test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable_fixed_stack_size_kernel_grad(
+):
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable():
+ for i in arr:
+ loss[None] += ti.sin(x[None])
+ for j in range(3):
+ for k in range(3):
+ loss[None] += ti.sin(x[None]) + 1.0
+ for j in range(3):
+ s = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ loss[None] += s
+ for j in range(3):
+ for k in range(3):
+ loss[None] += ti.sin(x[None]) + 1.0
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable()
+ test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable.grad()
+
+ assert loss[None] == 54.0
+ assert x.grad[None] == 56.0
+
+
+@test_utils.test(require=ti.extension.adstack,
+ ad_stack_size=0,
+ arch=[ti.cpu, ti.gpu])
+def test_stacked_inner_loops_local_variable_adaptive_stack_size_kernel_grad():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_stacked_inner_loops_local_variable():
+ for i in arr:
+ loss[None] += ti.sin(x[None])
+ for j in range(3):
+ s = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ loss[None] += s
+ for j in range(3):
+ s = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ loss[None] += s
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ test_stacked_inner_loops_local_variable()
+ test_stacked_inner_loops_local_variable.grad()
+
+ assert loss[None] == 36.0
+ assert x.grad[None] == 38.0
+
+
+@test_utils.test(require=ti.extension.adstack,
+ ad_stack_size=0,
+ arch=[ti.cpu, ti.gpu])
+def test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable_adaptive_stack_size_kernel_grad(
+):
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable():
+ for i in arr:
+ loss[None] += ti.sin(x[None])
+ for j in range(3):
+ for k in range(3):
+ loss[None] += ti.sin(x[None]) + 1.0
+ for j in range(3):
+ s = 0.0
+ for k in range(3):
+ s += ti.sin(x[None]) + 1.0
+ loss[None] += s
+ for j in range(3):
+ for k in range(3):
+ loss[None] += ti.sin(x[None]) + 1.0
+
+ loss.grad[None] = 1.0
+ x[None] = 0.0
+ test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable()
+ test_stacked_mixed_ib_and_non_ib_inner_loops_local_variable.grad()
+
+ assert loss[None] == 54.0
+ assert x.grad[None] == 56.0
+
+
+@test_utils.test(require=ti.extension.adstack,
+ ad_stack_size=0,
+ arch=[ti.cpu, ti.gpu])
+def test_large_for_loops_adaptive_stack_size():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_large_loop():
+ for i in range(5):
+ for j in range(2000):
+ for k in range(1000):
+ loss[None] += ti.sin(x[None]) + 1.0
+
+ with ti.Tape(loss=loss):
+ test_large_loop()
+
+ assert loss[None] == 1e7
+ assert x.grad[None] == 1e7
+
+
+@test_utils.test(require=ti.extension.adstack,
+ ad_stack_size=1,
+ arch=[ti.cpu, ti.gpu])
+def test_large_for_loops_fixed_stack_size():
+ x = ti.field(dtype=float, shape=(), needs_grad=True)
+ arr = ti.field(dtype=float, shape=(2), needs_grad=True)
+ loss = ti.field(dtype=float, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def test_large_loop():
+ for i in range(5):
+ for j in range(2000):
+ for k in range(1000):
+ loss[None] += ti.sin(x[None]) + 1.0
+
+ with ti.Tape(loss=loss):
+ test_large_loop()
+
+ assert loss[None] == 1e7
+ assert x.grad[None] == 1e7
diff --git a/tests/_python_orig/test_ad_if.py b/tests/_python_orig/test_ad_if.py
new file mode 100644
index 000000000..b76ef6028
--- /dev/null
+++ b/tests/_python_orig/test_ad_if.py
@@ -0,0 +1,244 @@
+from taichi.lang import impl
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_if_simple():
+ x = ti.field(ti.f32, shape=())
+ y = ti.field(ti.f32, shape=())
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ if x[None] > 0.:
+ y[None] = x[None]
+
+ x[None] = 1
+ y.grad[None] = 1
+
+ func()
+ func.grad()
+
+ assert x.grad[None] == 1
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_if():
+ x = ti.field(ti.f32, shape=2)
+ y = ti.field(ti.f32, shape=2)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(i: ti.i32):
+ if x[i] > 0:
+ y[i] = x[i]
+ else:
+ y[i] = 2 * x[i]
+
+ x[0] = 0
+ x[1] = 1
+ y.grad[0] = 1
+ y.grad[1] = 1
+
+ func(0)
+ func.grad(0)
+ func(1)
+ func.grad(1)
+
+ assert x.grad[0] == 2
+ assert x.grad[1] == 1
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_if_nested():
+ n = 20
+ x = ti.field(ti.f32, shape=n)
+ y = ti.field(ti.f32, shape=n)
+ z = ti.field(ti.f32, shape=n)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in x:
+ if x[i] < 2:
+ if x[i] == 0:
+ y[i] = 0
+ else:
+ y[i] = z[i] * 1
+ else:
+ if x[i] == 2:
+ y[i] = z[i] * 2
+ else:
+ y[i] = z[i] * 3
+
+ z.fill(1)
+
+ for i in range(n):
+ x[i] = i % 4
+
+ func()
+ for i in range(n):
+ assert y[i] == i % 4
+ y.grad[i] = 1
+ func.grad()
+
+ for i in range(n):
+ assert z.grad[i] == i % 4
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_if_mutable():
+ x = ti.field(ti.f32, shape=2)
+ y = ti.field(ti.f32, shape=2)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(i: ti.i32):
+ t = x[i]
+ if t > 0:
+ y[i] = t
+ else:
+ y[i] = 2 * t
+
+ x[0] = 0
+ x[1] = 1
+ y.grad[0] = 1
+ y.grad[1] = 1
+
+ func(0)
+ func.grad(0)
+ func(1)
+ func.grad(1)
+
+ assert x.grad[0] == 2
+ assert x.grad[1] == 1
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_if_parallel():
+ x = ti.field(ti.f32, shape=2)
+ y = ti.field(ti.f32, shape=2)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in range(2):
+ t = x[i]
+ if t > 0:
+ y[i] = t
+ else:
+ y[i] = 2 * t
+
+ x[0] = 0
+ x[1] = 1
+ y.grad[0] = 1
+ y.grad[1] = 1
+
+ func()
+ func.grad()
+
+ assert x.grad[0] == 2
+ assert x.grad[1] == 1
+
+
+@test_utils.test(require=[ti.extension.adstack, ti.extension.data64],
+ default_fp=ti.f64)
+def test_ad_if_parallel_f64():
+ x = ti.field(ti.f64, shape=2)
+ y = ti.field(ti.f64, shape=2)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in range(2):
+ t = x[i]
+ if t > 0:
+ y[i] = t
+ else:
+ y[i] = 2 * t
+
+ x[0] = 0
+ x[1] = 1
+ y.grad[0] = 1
+ y.grad[1] = 1
+
+ func()
+ func.grad()
+
+ assert x.grad[0] == 2
+ assert x.grad[1] == 1
+
+
+@test_utils.test(require=ti.extension.adstack)
+def test_ad_if_parallel_complex():
+ x = ti.field(ti.f32, shape=2)
+ y = ti.field(ti.f32, shape=2)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ ti.parallelize(1)
+ for i in range(2):
+ t = 0.0
+ if x[i] > 0:
+ t = 1 / x[i]
+ y[i] = t
+
+ x[0] = 0
+ x[1] = 2
+ y.grad[0] = 1
+ y.grad[1] = 1
+
+ func()
+ func.grad()
+
+ assert x.grad[0] == 0
+ assert x.grad[1] == -0.25
+
+
+@test_utils.test(require=[ti.extension.adstack, ti.extension.data64],
+ default_fp=ti.f64)
+def test_ad_if_parallel_complex_f64():
+ x = ti.field(ti.f64, shape=2)
+ y = ti.field(ti.f64, shape=2)
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ ti.parallelize(1)
+ for i in range(2):
+ t = 0.0
+ if x[i] > 0:
+ t = 1 / x[i]
+ y[i] = t
+
+ x[0] = 0
+ x[1] = 2
+ y.grad[0] = 1
+ y.grad[1] = 1
+
+ func()
+ func.grad()
+
+ assert x.grad[0] == 0
+ assert x.grad[1] == -0.25
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_stack():
+ @ti.kernel
+ def func():
+ impl.call_internal("test_stack")
+
+ func()
diff --git a/tests/_python_orig/test_ad_offload.py b/tests/_python_orig/test_ad_offload.py
new file mode 100644
index 000000000..945dba9c8
--- /dev/null
+++ b/tests/_python_orig/test_ad_offload.py
@@ -0,0 +1,24 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_offload_order():
+ n = 128
+ x = ti.field(ti.f32, shape=n, needs_grad=True)
+ y = ti.field(ti.f32, shape=n, needs_grad=True)
+ z = ti.field(ti.f32, shape=(), needs_grad=True)
+
+ @ti.kernel
+ def forward():
+ for i in x:
+ y[i] = x[i]
+
+ # for i in x:
+ # z[None] += y[i]
+
+ with ti.Tape(z):
+ forward()
+
+ # for i in range(n):
+ # assert x.grad[i] == 1
diff --git a/tests/_python_orig/test_aot.py b/tests/_python_orig/test_aot.py
new file mode 100644
index 000000000..4352c5f66
--- /dev/null
+++ b/tests/_python_orig/test_aot.py
@@ -0,0 +1,562 @@
+import json
+import os
+import sys
+import tempfile
+
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cc)
+def test_record():
+ with tempfile.TemporaryDirectory() as tmpdir:
+ recorded_file = os.path.join(tmpdir, 'record.yml')
+ ti.aot.start_recording(recorded_file)
+
+ loss = ti.field(float, (), needs_grad=True)
+ x = ti.field(float, 233, needs_grad=True)
+
+ @ti.kernel
+ def compute_loss():
+ for i in x:
+ loss[None] += x[i]**2
+
+ compute_loss()
+ ti.aot.stop_recording()
+
+ assert os.path.exists(recorded_file)
+
+ # Make sure kernel info is in the file
+ with open(recorded_file, 'r') as f:
+ assert 'compute_loss' in ''.join(f.readlines())
+
+
+@test_utils.test(arch=ti.opengl, max_block_dim=32)
+def test_opengl_max_block_dim():
+ density = ti.field(float, shape=(8, 8))
+
+ @ti.kernel
+ def init():
+ for i, j in density:
+ density[i, j] = 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ m = ti.aot.Module(ti.opengl)
+ m.add_field('density', density)
+ m.add_kernel(init)
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ res = json.load(json_file)
+ gl_file_path = res['aot_data']['kernels']['init']['tasks'][0][
+ 'source_path']
+ with open(gl_file_path) as gl_file:
+ s = 'layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;\n'
+ assert s in gl_file.readlines()
+
+
+@test_utils.test(arch=[ti.opengl, ti.vulkan])
+def test_aot_field_range_hint():
+ density = ti.field(float, shape=(8, 8))
+
+ @ti.kernel
+ def init():
+ for i, j in density:
+ density[i, j] = 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ m = ti.aot.Module(ti.opengl)
+ m.add_field('density', density)
+ m.add_kernel(init)
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ res = json.load(json_file)
+ range_hint = res['aot_data']['kernels']['init']['tasks'][0][
+ 'range_hint']
+ assert range_hint == '64'
+
+
+@test_utils.test(arch=ti.opengl)
+def test_aot_ndarray_range_hint():
+ density = ti.ndarray(dtype=ti.f32, shape=(8, 8))
+
+ @ti.kernel
+ def init(density: ti.any_arr()):
+ for i, j in density:
+ density[i, j] = 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ m = ti.aot.Module(ti.opengl)
+ m.add_kernel(init, (density, ))
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ res = json.load(json_file)
+ range_hint = res['aot_data']['kernels']['init']['tasks'][0][
+ 'range_hint']
+ assert range_hint == 'arg 0'
+
+
+@test_utils.test(arch=ti.opengl)
+def test_element_size_alignment():
+ a = ti.field(ti.f32, shape=())
+ b = ti.Matrix.field(2, 3, ti.f32, shape=(2, 4))
+ c = ti.field(ti.i32, shape=())
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ s = ti.aot.Module(ti.lang.impl.current_cfg().arch)
+ s.add_field('a', a)
+ s.add_field('b', b)
+ s.add_field('c', c)
+ s.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ res = json.load(json_file)
+ offsets = (res['aot_data']['fields'][0]['mem_offset_in_parent'],
+ res['aot_data']['fields'][1]['mem_offset_in_parent'],
+ res['aot_data']['fields'][2]['mem_offset_in_parent'])
+ assert 0 in offsets and 4 in offsets and 24 in offsets
+ assert res['aot_data']['root_buffer_size'] == 216
+
+
+@test_utils.test(arch=[ti.opengl, ti.vulkan])
+def test_save():
+ density = ti.field(float, shape=(4, 4))
+
+ @ti.kernel
+ def init():
+ for i, j in density:
+ density[i, j] = 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+        # note: ti.aot.Module(ti.opengl) is a no-op according to its docstring.
+ m = ti.aot.Module(ti.lang.impl.current_cfg().arch)
+ m.add_field('density', density)
+ m.add_kernel(init)
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ json.load(json_file)
+
+
+@test_utils.test(arch=ti.opengl)
+def test_save_template_kernel():
+ density = ti.field(float, shape=(4, 4))
+
+ @ti.kernel
+ def foo(n: ti.template()):
+ for i in range(n):
+ density[0, 0] += 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+        # note: ti.aot.Module(ti.opengl) is a no-op according to its docstring.
+ m = ti.aot.Module(ti.lang.impl.current_cfg().arch)
+ m.add_field('density', density)
+ with m.add_kernel_template(foo) as kt:
+ kt.instantiate(n=6)
+ kt.instantiate(n=8)
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ json.load(json_file)
+
+
+@test_utils.test(arch=[ti.opengl, ti.vulkan])
+def test_non_dense_snode():
+ n = 8
+ x = ti.field(dtype=ti.f32)
+ y = ti.field(dtype=ti.f32)
+ blk = ti.root.dense(ti.i, n)
+ blk.place(x)
+ blk.dense(ti.i, n).place(y)
+
+ with pytest.raises(RuntimeError, match='AOT: only supports dense field'):
+ m = ti.aot.Module(ti.lang.impl.current_cfg().arch)
+ m.add_field('x', x)
+ m.add_field('y', y)
+
+
+@test_utils.test(arch=[ti.opengl, ti.vulkan])
+def test_mpm88_aot():
+ n_particles = 8192
+ n_grid = 128
+ dx = 1 / n_grid
+ dt = 2e-4
+
+ p_rho = 1
+ p_vol = (dx * 0.5)**2
+ p_mass = p_vol * p_rho
+ gravity = 9.8
+ bound = 3
+ E = 400
+
+ x = ti.Vector.field(2, float, n_particles)
+ v = ti.Vector.field(2, float, n_particles)
+ C = ti.Matrix.field(2, 2, float, n_particles)
+ J = ti.field(float, n_particles)
+
+ grid_v = ti.Vector.field(2, float, (n_grid, n_grid))
+ grid_m = ti.field(float, (n_grid, n_grid))
+
+ @ti.kernel
+ def substep():
+ for i, j in grid_m:
+ grid_v[i, j] = [0, 0]
+ grid_m[i, j] = 0
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2
+ affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset - fx) * dx
+ weight = w[i].x * w[j].y
+ grid_v[base +
+ offset] += weight * (p_mass * v[p] + affine @ dpos)
+ grid_m[base + offset] += weight * p_mass
+ for i, j in grid_m:
+ if grid_m[i, j] > 0:
+ grid_v[i, j] /= grid_m[i, j]
+ grid_v[i, j].y -= dt * gravity
+ if i < bound and grid_v[i, j].x < 0:
+ grid_v[i, j].x = 0
+ if i > n_grid - bound and grid_v[i, j].x > 0:
+ grid_v[i, j].x = 0
+ if j < bound and grid_v[i, j].y < 0:
+ grid_v[i, j].y = 0
+ if j > n_grid - bound and grid_v[i, j].y > 0:
+ grid_v[i, j].y = 0
+ for p in x:
+ Xp = x[p] / dx
+ base = int(Xp - 0.5)
+ fx = Xp - base
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ new_v = ti.Vector.zero(float, 2)
+ new_C = ti.Matrix.zero(float, 2, 2)
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset - fx) * dx
+ weight = w[i].x * w[j].y
+ g_v = grid_v[base + offset]
+ new_v += weight * g_v
+ new_C += 4 * weight * g_v.outer_product(dpos) / dx**2
+ v[p] = new_v
+ x[p] += dt * v[p]
+ J[p] *= 1 + dt * new_C.trace()
+ C[p] = new_C
+
+ @ti.kernel
+ def init():
+ for i in range(n_particles):
+ x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2]
+ v[i] = [0, -1]
+ J[i] = 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ m = ti.aot.Module(ti.lang.impl.current_cfg().arch)
+ m.add_field("x", x)
+ m.add_field("v", v)
+ m.add_field("C", C)
+ m.add_field("J", J)
+ m.add_field("grid_v", grid_v)
+ m.add_field("grid_m", grid_m)
+ m.add_kernel(substep)
+ m.add_kernel(init)
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ json.load(json_file)
+
+
+@test_utils.test(arch=ti.opengl)
+def test_opengl_8_ssbo():
+ # 6 ndarrays + gtmp + args
+ n = 4
+ density1 = ti.ndarray(dtype=ti.f32, shape=(4, 4))
+ density2 = ti.ndarray(dtype=ti.f32, shape=(4, 4))
+ density3 = ti.ndarray(dtype=ti.f32, shape=(4, 4))
+ density4 = ti.ndarray(dtype=ti.f32, shape=(4, 4))
+ density5 = ti.ndarray(dtype=ti.f32, shape=(4, 4))
+ density6 = ti.ndarray(dtype=ti.f32, shape=(4, 4))
+
+ @ti.kernel
+ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
+ density3: ti.any_arr(), density4: ti.any_arr(),
+ density5: ti.any_arr(), density6: ti.any_arr()):
+ for i, j in density1:
+ density1[i, j] = d + 1
+ density2[i, j] = d + 2
+ density3[i, j] = d + 3
+ density4[i, j] = d + 4
+ density5[i, j] = d + 5
+ density6[i, j] = d + 6
+
+ init(0, density1, density2, density3, density4, density5, density6)
+ assert (density1.to_numpy() == (np.zeros(shape=(n, n)) + 1)).all()
+ assert (density2.to_numpy() == (np.zeros(shape=(n, n)) + 2)).all()
+ assert (density3.to_numpy() == (np.zeros(shape=(n, n)) + 3)).all()
+ assert (density4.to_numpy() == (np.zeros(shape=(n, n)) + 4)).all()
+ assert (density5.to_numpy() == (np.zeros(shape=(n, n)) + 5)).all()
+ assert (density6.to_numpy() == (np.zeros(shape=(n, n)) + 6)).all()
+
+
+@test_utils.test(arch=ti.opengl)
+def test_opengl_exceed_max_ssbo():
+ # 8 ndarrays + args > 8 (maximum allowed)
+ n = 4
+ density1 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density2 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density3 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density4 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density5 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density6 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density7 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+ density8 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+
+ @ti.kernel
+ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
+ density3: ti.any_arr(), density4: ti.any_arr(),
+ density5: ti.any_arr(), density6: ti.any_arr(),
+ density7: ti.any_arr(), density8: ti.any_arr()):
+ for i, j in density1:
+ density1[i, j] = d + 1
+ density2[i, j] = d + 2
+ density3[i, j] = d + 3
+ density4[i, j] = d + 4
+ density5[i, j] = d + 5
+ density6[i, j] = d + 6
+ density7[i, j] = d + 7
+ density8[i, j] = d + 8
+
+ with pytest.raises(RuntimeError):
+ init(0, density1, density2, density3, density4, density5, density6,
+ density7, density8)
+
+
+@test_utils.test(arch=[ti.opengl, ti.vulkan])
+def test_mpm99_aot():
+ quality = 1 # Use a larger value for higher-res simulations
+ n_particles, n_grid = 9000 * quality**2, 128 * quality
+ dx, inv_dx = 1 / n_grid, float(n_grid)
+ dt = 1e-4 / quality
+ p_vol, p_rho = (dx * 0.5)**2, 1
+ p_mass = p_vol * p_rho
+ E, nu = 0.1e4, 0.2 # Young's modulus and Poisson's ratio
+ mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / (
+ (1 + nu) * (1 - 2 * nu)) # Lame parameters
+ x = ti.Vector.field(2, dtype=float, shape=n_particles) # position
+ v = ti.Vector.field(2, dtype=float, shape=n_particles) # velocity
+ C = ti.Matrix.field(2, 2, dtype=float,
+ shape=n_particles) # affine velocity field
+ F = ti.Matrix.field(2, 2, dtype=float,
+ shape=n_particles) # deformation gradient
+ material = ti.field(dtype=int, shape=n_particles) # material id
+ Jp = ti.field(dtype=float, shape=n_particles) # plastic deformation
+ grid_v = ti.Vector.field(2, dtype=float,
+ shape=(n_grid,
+ n_grid)) # grid node momentum/velocity
+ grid_m = ti.field(dtype=float, shape=(n_grid, n_grid)) # grid node mass
+ grid_v_int = ti.Vector.field(2, dtype=int,
+ shape=(n_grid,
+ n_grid)) # grid node momentum/velocity
+ grid_m_int = ti.field(dtype=int, shape=(n_grid, n_grid)) # grid node mass
+
+ v_exp = 24
+ m_exp = 40
+
+ @ti.kernel
+ def substep():
+ for i, j in grid_m:
+ grid_v[i, j] = [0, 0]
+ grid_m[i, j] = 0
+ grid_v_int[i, j] = [0, 0]
+ grid_m_int[i, j] = 0
+ for p in x: # Particle state update and scatter to grid (P2G)
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ # Quadratic kernels [http://mpm.graphics Eqn. 123, with x=fx, fx-1,fx-2]
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ F[p] = (ti.Matrix.identity(float, 2) +
+ dt * C[p]) @ F[p] # deformation gradient update
+ h = ti.exp(
+ 10 * (1.0 - Jp[p])
+ ) # Hardening coefficient: snow gets harder when compressed
+ if material[p] == 1: # jelly, make it softer
+ h = 0.3
+ mu, la = mu_0 * h, lambda_0 * h
+ if material[p] == 0: # liquid
+ mu = 0.0
+ U, sig, V = ti.svd(F[p])
+ J = 1.0
+ for d in ti.static(range(2)):
+ new_sig = sig[d, d]
+ if material[p] == 2: # Snow
+ new_sig = min(max(sig[d, d], 1 - 2.5e-2),
+ 1 + 4.5e-3) # Plasticity
+ Jp[p] *= sig[d, d] / new_sig
+ sig[d, d] = new_sig
+ J *= new_sig
+ if material[
+ p] == 0: # Reset deformation gradient to avoid numerical instability
+ F[p] = ti.Matrix.identity(float, 2) * ti.sqrt(J)
+ elif material[p] == 2:
+ F[p] = U @ sig @ V.transpose(
+ ) # Reconstruct elastic deformation gradient after plasticity
+ stress = 2 * mu * (F[p] - U @ V.transpose()) @ F[p].transpose(
+ ) + ti.Matrix.identity(float, 2) * la * J * (J - 1)
+ stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
+ affine = stress + p_mass * C[p]
+ for i, j in ti.static(ti.ndrange(
+ 3, 3)): # Loop over 3x3 grid node neighborhood
+ offset = ti.Vector([i, j])
+ dpos = (offset.cast(float) - fx) * dx
+ weight = w[i][0] * w[j][1]
+ grid_v_int[base + offset] += int(
+ ti.floor(0.5 + weight * (p_mass * v[p] + affine @ dpos) *
+ (2.0**v_exp)))
+ grid_m_int[base + offset] += int(
+ ti.floor(0.5 + weight * p_mass * (2.0**m_exp)))
+ for i, j in grid_m:
+ if grid_m_int[i, j] > 0: # No need for epsilon here
+ # grid_v[i, j] = (1.0 / grid_m[i, j]) * grid_v[i, j] # Momentum to velocity
+ grid_v[i, j] = (2**(m_exp - v_exp) / grid_m_int[i, j]
+ ) * grid_v_int[i, j] # Momentum to velocity
+ grid_v[i, j][1] -= dt * 50 # gravity
+ if i < 3 and grid_v[i, j][0] < 0:
+ grid_v[i, j][0] = 0 # Boundary conditions
+ if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
+ if j < 3 and grid_v[i, j][1] < 0: grid_v[i, j][1] = 0
+ if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
+ for p in x: # grid to particle (G2P)
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [
+ 0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2
+ ]
+ new_v = ti.Vector.zero(float, 2)
+ new_C = ti.Matrix.zero(float, 2, 2)
+ for i, j in ti.static(ti.ndrange(
+ 3, 3)): # loop over 3x3 grid node neighborhood
+ dpos = ti.Vector([i, j]).cast(float) - fx
+ g_v = grid_v[base + ti.Vector([i, j])]
+ weight = w[i][0] * w[j][1]
+ new_v += weight * g_v
+ new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
+ v[p], C[p] = new_v, new_C
+ x[p] += dt * v[p] # advection
+
+ group_size = n_particles // 3
+
+ @ti.kernel
+ def initialize():
+ for i in range(n_particles):
+ x[i] = [
+ ti.random() * 0.2 + 0.3 + 0.10 * (i // group_size),
+ ti.random() * 0.2 + 0.05 + 0.32 * (i // group_size)
+ ]
+ material[i] = i // group_size # 0: fluid 1: jelly 2: snow
+ v[i] = ti.Matrix([0, 0])
+ F[i] = ti.Matrix([[1, 0], [0, 1]])
+ Jp[i] = 1
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ m = ti.aot.Module(ti.lang.impl.current_cfg().arch)
+ m.add_field('x', x)
+ m.add_field('v', v)
+ m.add_field('C', C)
+ m.add_field('J', Jp)
+ m.add_field('grid_v', grid_v)
+ m.add_field('grid_m', grid_m)
+ m.add_field('grid_v_int', grid_v_int)
+ m.add_field('grid_m_int', grid_m_int)
+ m.add_field('material', material)
+ m.add_kernel(initialize)
+ m.add_kernel(substep)
+
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ json.load(json_file)
+
+
+@test_utils.test(arch=ti.opengl)
+def test_mpm88_ndarray():
+ dim = 2
+ N = 64
+ n_particles = N * N
+ n_grid = 128
+ dx = 1 / n_grid
+ inv_dx = 1 / dx
+ dt = 2.0e-4
+ p_vol = (dx * 0.5)**2
+ p_rho = 1
+ p_mass = p_vol * p_rho
+ E = 400
+
+ @ti.kernel
+ def substep(x: ti.any_arr(element_dim=1), v: ti.any_arr(element_dim=1),
+ C: ti.any_arr(element_dim=2), J: ti.any_arr(),
+ grid_v: ti.any_arr(element_dim=1), grid_m: ti.any_arr()):
+ for p in x:
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * p_vol * (J[p] - 1) * 4 * inv_dx * inv_dx * E
+ affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
+ for i in ti.static(range(3)):
+ for j in ti.static(range(3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset.cast(float) - fx) * dx
+ weight = w[i][0] * w[j][1]
+ ti.atomic_add(grid_v[base + offset],
+ weight * (p_mass * v[p] + affine @ dpos))
+ ti.atomic_add(grid_m[base + offset], weight * p_mass)
+
+ for i, j in grid_m:
+ if grid_m[i, j] > 0:
+ bound = 3
+ inv_m = 1 / grid_m[i, j]
+ grid_v[i, j] = inv_m * grid_v[i, j]
+ grid_v[i, j][1] -= dt * 9.8
+ if i < bound and grid_v[i, j][0] < 0:
+ grid_v[i, j][0] = 0
+ if i > n_grid - bound and grid_v[i, j][0] > 0:
+ grid_v[i, j][0] = 0
+ if j < bound and grid_v[i, j][1] < 0:
+ grid_v[i, j][1] = 0
+ if j > n_grid - bound and grid_v[i, j][1] > 0:
+ grid_v[i, j][1] = 0
+
+ for p in x:
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [
+ 0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2
+ ]
+ new_v = ti.Vector.zero(ti.f32, 2)
+ new_C = ti.Matrix.zero(ti.f32, 2, 2)
+ for i in ti.static(range(3)):
+ for j in ti.static(range(3)):
+ dpos = ti.Vector([i, j]).cast(float) - fx
+ g_v = grid_v[base + ti.Vector([i, j])]
+ weight = w[i][0] * w[j][1]
+ new_v += weight * g_v
+ new_C += 4 * weight * g_v.outer_product(dpos) * inv_dx
+ v[p] = new_v
+ x[p] += dt * v[p]
+ J[p] *= 1 + dt * new_C.trace()
+ C[p] = new_C
+
+ x = ti.Vector.ndarray(dim, ti.f32, n_particles)
+ v = ti.Vector.ndarray(dim, ti.f32, n_particles)
+ C = ti.Matrix.ndarray(dim, dim, ti.f32, n_particles)
+ J = ti.ndarray(ti.f32, n_particles)
+ grid_v = ti.Vector.ndarray(dim, ti.f32, (n_grid, n_grid))
+ grid_m = ti.ndarray(ti.f32, (n_grid, n_grid))
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ m = ti.aot.Module(ti.opengl)
+ m.add_kernel(substep, (x, v, C, J, grid_v, grid_m))
+
+ m.save(tmpdir, '')
+ with open(os.path.join(tmpdir, 'metadata.json')) as json_file:
+ json.load(json_file)
diff --git a/tests/_python_orig/test_arg_alignment.py b/tests/_python_orig/test_arg_alignment.py
new file mode 100644
index 000000000..4320f219e
--- /dev/null
+++ b/tests/_python_orig/test_arg_alignment.py
@@ -0,0 +1,23 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.opengl])
+def test_ret_write():
+ @ti.kernel
+ def func(a: ti.i16) -> ti.f32:
+ return 3.0
+
+ assert func(255) == 3.0
+
+
+@test_utils.test(exclude=[ti.opengl])
+def test_arg_read():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func(a: ti.i8, b: ti.i32):
+ x[None] = b
+
+ func(255, 2)
+ assert x[None] == 2
diff --git a/tests/_python_orig/test_arg_check.py b/tests/_python_orig/test_arg_check.py
new file mode 100644
index 000000000..35bd2d5eb
--- /dev/null
+++ b/tests/_python_orig/test_arg_check.py
@@ -0,0 +1,40 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_argument_error():
+ x = ti.field(ti.i32)
+
+ ti.root.place(x)
+
+ try:
+
+ @ti.kernel
+ def set_i32_notype(v):
+ pass
+ except ti.TaichiSyntaxError:
+ pass
+
+ try:
+
+ @ti.kernel
+ def set_i32_args(*args):
+ pass
+ except ti.TaichiSyntaxError:
+ pass
+
+ try:
+
+ @ti.kernel
+ def set_i32_kwargs(**kwargs):
+ pass
+ except ti.TaichiSyntaxError:
+ pass
+
+ @ti.kernel
+ def set_i32(v: ti.i32):
+ x[None] = v
+
+ set_i32(123)
+ assert x[None] == 123
diff --git a/tests/_python_orig/test_arg_load.py b/tests/_python_orig/test_arg_load.py
new file mode 100644
index 000000000..427b1a64c
--- /dev/null
+++ b/tests/_python_orig/test_arg_load.py
@@ -0,0 +1,71 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_arg_load():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.f32)
+
+ ti.root.place(x, y)
+
+ @ti.kernel
+ def set_i32(v: ti.i32):
+ x[None] = v
+
+ @ti.kernel
+ def set_f32(v: ti.f32):
+ y[None] = v
+
+ set_i32(123)
+ assert x[None] == 123
+
+ set_i32(456)
+ assert x[None] == 456
+
+ set_f32(0.125)
+ assert y[None] == 0.125
+
+ set_f32(1.5)
+ assert y[None] == 1.5
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_arg_load_f64():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.f32)
+
+ ti.root.place(x, y)
+
+ @ti.kernel
+ def set_f64(v: ti.f64):
+ y[None] = ti.cast(v, ti.f32)
+
+ @ti.kernel
+ def set_i64(v: ti.i64):
+ y[None] = v
+
+ set_i64(789)
+ assert y[None] == 789
+
+ set_f64(2.5)
+ assert y[None] == 2.5
+
+
+@test_utils.test()
+def test_ext_arr():
+ N = 128
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, N).place(x)
+
+ @ti.kernel
+ def set_f32(v: ti.ext_arr()):
+ for i in range(N):
+ x[i] = v[i] + i
+
+ import numpy as np
+ v = np.ones((N, ), dtype=np.float32) * 10
+ set_f32(v)
+ for i in range(N):
+ assert x[i] == 10 + i
diff --git a/tests/_python_orig/test_assert.py b/tests/_python_orig/test_assert.py
new file mode 100644
index 000000000..dd18b6d12
--- /dev/null
+++ b/tests/_python_orig/test_assert.py
@@ -0,0 +1,124 @@
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_minimal():
+ @ti.kernel
+ def func():
+ assert 0
+
+ @ti.kernel
+ def func2():
+ assert False
+
+ with pytest.raises(RuntimeError):
+ func()
+ with pytest.raises(RuntimeError):
+ func2()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_basic():
+ @ti.kernel
+ def func():
+ x = 20
+ assert 10 <= x < 20
+
+ with pytest.raises(RuntimeError):
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_message():
+ @ti.kernel
+ def func():
+ x = 20
+ assert 10 <= x < 20, 'Foo bar'
+
+ with pytest.raises(RuntimeError, match='Foo bar'):
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_message_formatted():
+ x = ti.field(dtype=int, shape=16)
+ x[10] = 42
+
+ @ti.kernel
+ def assert_formatted():
+ for i in x:
+ assert x[i] == 0, 'x[%d] expect=%d got=%d' % (i, 0, x[i])
+
+ @ti.kernel
+ def assert_float():
+ y = 0.5
+ assert y < 0, 'y = %f' % y
+
+ with pytest.raises(RuntimeError, match=r'x\[10\] expect=0 got=42'):
+ assert_formatted()
+    # TODO: the runtime is not yet able to fully recover from
+    # assertion failures...
+ with pytest.raises(RuntimeError, match=r'y = 0.5'):
+ assert_float()
+
+ # success case
+ x[10] = 0
+ assert_formatted()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_ok():
+ @ti.kernel
+ def func():
+ x = 20
+ assert 10 <= x <= 20
+
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_static_assert_is_static():
+ @ti.kernel
+ def func():
+ x = 0
+ ti.static_assert(x) # Expr is not None
+
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_static_assert_message():
+ x = 3
+
+ @ti.kernel
+ def func():
+ ti.static_assert(x == 4, "Oh, no!")
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_static_assert_vector_n_ok():
+ x = ti.Vector.field(4, ti.f32, ())
+
+ @ti.kernel
+ def func():
+ ti.static_assert(x.n == 4)
+
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_static_assert_data_type_ok():
+ x = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ ti.static_assert(x.dtype == ti.f32)
+
+ func()
diff --git a/tests/_python_orig/test_assign.py b/tests/_python_orig/test_assign.py
new file mode 100644
index 000000000..182f4bdfb
--- /dev/null
+++ b/tests/_python_orig/test_assign.py
@@ -0,0 +1,84 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(debug=True)
+def test_assign_basic():
+ @ti.kernel
+ def func_basic():
+ a = 1
+ assert a == 1
+
+ func_basic()
+
+
+@test_utils.test(debug=True)
+def test_assign_unpack():
+ @ti.kernel
+ def func_unpack():
+ (a, b) = (1, 2)
+ assert a == 1
+ assert b == 2
+
+ func_unpack()
+
+
+@test_utils.test(debug=True)
+def test_assign_chained():
+ @ti.kernel
+ def func_chained():
+ a = b = 1
+ assert a == 1
+ assert b == 1
+
+ func_chained()
+
+
+@test_utils.test(debug=True)
+def test_assign_chained_unpack():
+ @ti.kernel
+ def func_chained_unpack():
+ (a, b) = (c, d) = (1, 2)
+ assert a == 1
+ assert b == 2
+ assert c == 1
+ assert d == 2
+
+ func_chained_unpack()
+
+
+@test_utils.test(debug=True)
+def test_assign_assign():
+ @ti.kernel
+ def func_assign():
+ a = 0
+ a = 1
+ assert a == 1
+
+ func_assign()
+
+
+@test_utils.test(debug=True)
+def test_assign_ann():
+ @ti.kernel
+ def func_ann():
+ a: ti.i32 = 1
+ b: ti.f32 = a
+ assert a == 1
+ assert b == 1.0
+
+ func_ann()
+
+
+@test_utils.test()
+def test_assign_ann_over():
+ @ti.kernel
+ def func_ann_over():
+ my_int = ti.i32
+ d: my_int = 2
+ d: ti.f32 = 2.0
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func_ann_over()
diff --git a/tests/_python_orig/test_ast_refactor.py b/tests/_python_orig/test_ast_refactor.py
new file mode 100644
index 000000000..448568dd3
--- /dev/null
+++ b/tests/_python_orig/test_ast_refactor.py
@@ -0,0 +1,1015 @@
+import numpy as np
+import pytest
+from taichi.lang import impl
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_binop():
+ @ti.kernel
+ def foo(x: ti.i32, y: ti.i32, a: ti.template()):
+ a[0] = x + y
+ a[1] = x - y
+ a[2] = x * y
+ a[3] = impl.ti_float(x) / y
+ a[4] = x // y
+ a[5] = x % y
+ a[6] = x**y
+ a[7] = x << y
+ a[8] = x >> y
+ a[9] = x | y
+ a[10] = x ^ y
+ a[11] = x & y
+
+ x = 37
+ y = 3
+ a = ti.field(ti.f32, shape=(12, ))
+ b = ti.field(ti.f32, shape=(12, ))
+
+ a[0] = x + y
+ a[1] = x - y
+ a[2] = x * y
+ a[3] = x / y
+ a[4] = x // y
+ a[5] = x % y
+ a[6] = x**y
+ a[7] = x << y
+ a[8] = x >> y
+ a[9] = x | y
+ a[10] = x ^ y
+ a[11] = x & y
+
+ foo(x, y, b)
+
+ for i in range(12):
+ assert a[i] == test_utils.approx(b[i])
+
+
+@test_utils.test()
+def test_augassign():
+ @ti.kernel
+ def foo(x: ti.i32, y: ti.i32, a: ti.template(), b: ti.template()):
+ for i in a:
+ a[i] = x
+ a[0] += y
+ a[1] -= y
+ a[2] *= y
+ a[3] //= y
+ a[4] %= y
+ a[5] **= y
+ a[6] <<= y
+ a[7] >>= y
+ a[8] |= y
+ a[9] ^= y
+ a[10] &= y
+ b[0] = x
+ b[0] /= y
+
+ x = 37
+ y = 3
+ a = ti.field(ti.i32, shape=(11, ))
+ b = ti.field(ti.i32, shape=(11, ))
+ c = ti.field(ti.f32, shape=(1, ))
+ d = ti.field(ti.f32, shape=(1, ))
+
+ a[0] = x + y
+ a[1] = x - y
+ a[2] = x * y
+ a[3] = x // y
+ a[4] = x % y
+ a[5] = x**y
+ a[6] = x << y
+ a[7] = x >> y
+ a[8] = x | y
+ a[9] = x ^ y
+ a[10] = x & y
+ c[0] = x / y
+
+ foo(x, y, b, d)
+
+ for i in range(11):
+ assert a[i] == b[i]
+ assert c[0] == test_utils.approx(d[0])
+
+
+@test_utils.test()
+def test_unaryop():
+ @ti.kernel
+ def foo(x: ti.i32, a: ti.template()):
+ a[0] = +x
+ a[1] = -x
+ a[2] = not x
+ a[3] = ~x
+
+ x = 1234
+ a = ti.field(ti.i32, shape=(4, ))
+ b = ti.field(ti.i32, shape=(4, ))
+
+ a[0] = +x
+ a[1] = -x
+ a[2] = not x
+ a[3] = ~x
+
+ foo(x, b)
+
+ for i in range(4):
+ assert a[i] == b[i]
+
+
+@test_utils.test()
+def test_boolop():
+ @ti.kernel
+ def foo(a: ti.template()):
+ a[0] = 0 and 0
+ a[1] = 0 and 1
+ a[2] = 1 and 0
+ a[3] = 1 and 1
+ a[4] = 0 or 0
+ a[5] = 0 or 1
+ a[6] = 1 or 0
+ a[7] = 1 or 1
+ a[8] = 1 and 1 and 1 and 1
+ a[9] = 1 and 1 and 1 and 0
+ a[10] = 0 or 0 or 0 or 0
+ a[11] = 0 or 0 or 1 or 0
+
+ a = ti.field(ti.i32, shape=(12, ))
+ b = ti.field(ti.i32, shape=(12, ))
+
+ a[0] = 0 and 0
+ a[1] = 0 and 1
+ a[2] = 1 and 0
+ a[3] = 1 and 1
+ a[4] = 0 or 0
+ a[5] = 0 or 1
+ a[6] = 1 or 0
+ a[7] = 1 or 1
+ a[8] = 1 and 1 and 1 and 1
+ a[9] = 1 and 1 and 1 and 0
+ a[10] = 0 or 0 or 0 or 0
+ a[11] = 0 or 0 or 1 or 0
+
+ foo(b)
+
+ for i in range(12):
+ assert a[i] == b[i]
+
+
+@test_utils.test()
+def test_compare_fail():
+ with pytest.raises(ti.TaichiCompilationError,
+ match='"Is" is not supported in Taichi kernels.'):
+
+ @ti.kernel
+ def foo():
+ 1 is [1]
+
+ foo()
+
+
+@test_utils.test()
+def test_single_compare():
+ @ti.kernel
+ def foo(a: ti.template(), b: ti.template(), c: ti.template()):
+ for i in ti.static(range(3)):
+ c[i * 6] = a[i] == b[i]
+ c[i * 6 + 1] = a[i] != b[i]
+ c[i * 6 + 2] = a[i] < b[i]
+ c[i * 6 + 3] = a[i] <= b[i]
+ c[i * 6 + 4] = a[i] > b[i]
+ c[i * 6 + 5] = a[i] >= b[i]
+
+ a = ti.Vector([1, 1, 2])
+ b = ti.Vector([2, 1, 1])
+ c = ti.field(ti.i32, shape=(18, ))
+ d = ti.field(ti.i32, shape=(18, ))
+
+ for i in range(3):
+ c[i * 6] = a[i] == b[i]
+ c[i * 6 + 1] = a[i] != b[i]
+ c[i * 6 + 2] = a[i] < b[i]
+ c[i * 6 + 3] = a[i] <= b[i]
+ c[i * 6 + 4] = a[i] > b[i]
+ c[i * 6 + 5] = a[i] >= b[i]
+
+ foo(a, b, d)
+ for i in range(18):
+ assert c[i] == d[i]
+
+
+@test_utils.test()
+def test_chain_compare():
+ @ti.kernel
+ def foo(a: ti.i32, b: ti.i32, c: ti.template()):
+ c[0] = a == b == a
+ c[1] = a == b != a
+ c[2] = a != b == a
+ c[3] = a < b > a
+ c[4] = a > b < a
+ c[5] = a < b < a
+ c[6] = a > b > a
+ c[7] = a == a == a == a
+ c[8] = a == a == a != a
+ c[9] = a < b > a < b
+ c[10] = a > b > a < b
+
+ a = 1
+ b = 2
+ c = ti.field(ti.i32, shape=(11, ))
+ d = ti.field(ti.i32, shape=(11, ))
+
+ c[0] = a == b == a
+ c[1] = a == b != a
+ c[2] = a != b == a
+ c[3] = a < b > a
+ c[4] = a > b < a
+ c[5] = a < b < a
+ c[6] = a > b > a
+ c[7] = a == a == a == a
+ c[8] = a == a == a != a
+ c[9] = a < b > a < b
+ c[10] = a > b > a < b
+
+ foo(a, b, d)
+ for i in range(11):
+ assert c[i] == d[i]
+
+
+@test_utils.test()
+def test_return():
+ @ti.kernel
+ def foo(x: ti.i32) -> ti.i32:
+ return x + 1
+
+ assert foo(1) == 2
+
+
+@test_utils.test()
+def test_format_print():
+ a = ti.field(ti.i32, shape=(10, ))
+
+ @ti.kernel
+ def foo():
+ a[0] = 1.0
+ a[5] = 2.0
+ print('Test if the string.format and fstring print works')
+ print('string.format: a[0]={}, a[5]={}'.format(a[0], a[5]))
+ print(f'fstring: a[0]={a[0]}, a[5]={a[5]}')
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_if():
+ @ti.kernel
+ def foo(x: ti.i32) -> ti.i32:
+ ret = 0
+ if x:
+ ret = 1
+ else:
+ ret = 0
+ return ret
+
+ assert foo(1)
+ assert not foo(0)
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_if():
+ @ti.kernel
+ def foo(x: ti.template()) -> ti.i32:
+ ret = 0
+ if ti.static(x):
+ ret = 1
+ else:
+ ret = 0
+ return ret
+
+ assert foo(1)
+ assert not foo(0)
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_struct_for():
+ a = ti.field(ti.i32, shape=(10, ))
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for i in a:
+ a[i] = x
+
+ x = 5
+ foo(x)
+ for i in range(10):
+ assert a[i] == 5
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_grouped_struct_for():
+ a = ti.field(ti.i32, shape=(4, 4))
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for I in ti.grouped(a):
+ a[I] = x
+
+ x = 5
+ foo(x)
+ for i in range(4):
+ for j in range(4):
+ assert a[i, j] == 5
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_for():
+ a = ti.field(ti.i32, shape=(10, ))
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for i in ti.static(range(10)):
+ a[i] = x
+
+ x = 5
+ foo(x)
+ for i in range(10):
+ assert a[i] == 5
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_grouped_for():
+ a = ti.field(ti.i32, shape=(4, 4))
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for i in ti.static(ti.grouped(ti.ndrange((1, 3), (1, 3)))):
+ a[i] = x
+
+ x = 5
+ foo(x)
+ for i in range(4):
+ for j in range(4):
+ if 1 <= i < 3 and 1 <= j < 3:
+ assert a[i, j] == 5
+ else:
+ assert a[i, j] == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_range_for_single_argument():
+ a = ti.field(ti.i32, shape=(10, ))
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for i in range(5):
+ a[i] = x
+
+ x = 5
+ foo(x)
+ for i in range(10):
+ if i < 5:
+ assert a[i] == 5
+ else:
+ assert a[i] == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_range_for_two_arguments():
+ a = ti.field(ti.i32, shape=(10, ))
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for i in range(3, 7):
+ a[i] = x
+
+ x = 5
+ foo(x)
+ for i in range(10):
+ if 3 <= i < 7:
+ assert a[i] == 5
+ else:
+ assert a[i] == 0
+
+
+@test_utils.test()
+def test_range_for_three_arguments():
+ a = ti.field(ti.i32, shape=(10, ))
+
+ with pytest.raises(ti.TaichiCompilationError,
+ match='Range should have 1 or 2 arguments, found 3'):
+
+ @ti.kernel
+ def foo(x: ti.i32):
+ for i in range(3, 7, 2):
+ a[i] = x
+
+ x = 5
+ foo(x)
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_ndrange_for():
+ x = ti.field(ti.f32, shape=(16, 32, 64))
+
+ @ti.kernel
+ def func():
+ for i, j, k in ti.ndrange((4, 10), (3, 8), 17):
+ x[i, j, k] = i + j * 10 + k * 100
+
+ func()
+ for i in range(16):
+ for j in range(32):
+ for k in range(64):
+ if 4 <= i < 10 and 3 <= j < 8 and k < 17:
+ assert x[i, j, k] == i + j * 10 + k * 100
+ else:
+ assert x[i, j, k] == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_grouped_ndrange_for():
+ x = ti.field(ti.i32, shape=(6, 6, 6))
+ y = ti.field(ti.i32, shape=(6, 6, 6))
+
+ @ti.kernel
+ def func():
+ lower = ti.Vector([0, 1, 2])
+ upper = ti.Vector([3, 4, 5])
+ for I in ti.grouped(
+ ti.ndrange((lower[0], upper[0]), (lower[1], upper[1]),
+ (lower[2], upper[2]))):
+ x[I] = I[0] + I[1] + I[2]
+ for i in range(0, 3):
+ for j in range(1, 4):
+ for k in range(2, 5):
+ y[i, j, k] = i + j + k
+
+ func()
+
+ for i in range(6):
+ for j in range(6):
+ for k in range(6):
+ assert x[i, j, k] == y[i, j, k]
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_for_break():
+ n = 10
+
+ @ti.kernel
+ def foo(a: ti.template()):
+ for i in ti.static(range(n)):
+ a[i] = 3
+ if ti.static(i >= 5):
+ break
+ a[i] = 10
+ a[i] = 5
+
+ a = ti.field(ti.i32, shape=(n, ))
+ foo(a)
+ for i in range(n):
+ if i < 5:
+ assert a[i] == 5
+ elif i == 5:
+ assert a[i] == 3
+ else:
+ assert a[i] == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_grouped_for_break():
+ n = 4
+
+ @ti.kernel
+ def foo(a: ti.template()):
+ for I in ti.static(ti.grouped(ti.ndrange(n, n))):
+ a[I] = 3
+ if ti.static(I[0] >= 3):
+ break
+ a[I] = 10
+ a[I] = 5
+
+ a = ti.field(ti.i32, shape=(n, n))
+ foo(a)
+ for i in range(n):
+ for j in range(n):
+ if i < 3:
+ assert a[i, j] == 5
+ elif i == 3 and j == 0:
+ assert a[i, j] == 3
+ else:
+ assert a[i, j] == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_for_continue():
+ n = 10
+
+ @ti.kernel
+ def foo(a: ti.template()):
+ for i in ti.static(range(n)):
+ a[i] = 3
+ if ti.static(i >= 5):
+ continue
+ a[i] = 10
+ a[i] = 5
+
+ a = ti.field(ti.i32, shape=(n, ))
+ foo(a)
+ for i in range(n):
+ if i < 5:
+ assert a[i] == 5
+ else:
+ assert a[i] == 3
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_grouped_for_continue():
+ n = 4
+
+ @ti.kernel
+ def foo(a: ti.template()):
+ for I in ti.static(ti.grouped(ti.ndrange(n, n))):
+ a[I] = 3
+ if ti.static(I[0] >= 3):
+ continue
+ a[I] = 10
+ a[I] = 5
+
+ a = ti.field(ti.i32, shape=(n, n))
+ foo(a)
+ for i in range(n):
+ for j in range(n):
+ if i < 3:
+ assert a[i, j] == 5
+ else:
+ assert a[i, j] == 3
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_for_break():
+ n = 4
+
+ @ti.kernel
+ def foo(a: ti.template()):
+ for i in range(n):
+ for j in range(n):
+ a[i, j] = 3
+ if i >= 3:
+ break
+ a[i, j] = 10
+ a[i, j] = 5
+
+ a = ti.field(ti.i32, shape=(n, n))
+ foo(a)
+ for i in range(n):
+ for j in range(n):
+ if i < 3:
+ assert a[i, j] == 5
+ elif i == 3 and j == 0:
+ assert a[i, j] == 3
+ else:
+ assert a[i, j] == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_for_continue():
+ n = 4
+
+ @ti.kernel
+ def foo(a: ti.template()):
+ for i in range(n):
+ for j in range(n):
+ a[i, j] = 3
+ if i >= 3:
+ continue
+ a[i, j] = 10
+ a[i, j] = 5
+
+ a = ti.field(ti.i32, shape=(n, n))
+ foo(a)
+ for i in range(n):
+ for j in range(n):
+ if i < 3:
+ assert a[i, j] == 5
+ else:
+ assert a[i, j] == 3
+
+
+@test_utils.test()
+def test_while():
+ x = ti.field(ti.f32)
+
+ N = 1
+
+ ti.root.dense(ti.i, N).place(x)
+
+ @ti.kernel
+ def func():
+ i = 0
+ s = 0
+ while i < 10:
+ s += i
+ i += 1
+ x[0] = s
+
+ func()
+ assert x[0] == 45
+
+
+@test_utils.test()
+def test_while_break():
+ ret = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ i = 0
+ s = 0
+ while True:
+ s += i
+ i += 1
+ if i > 10:
+ break
+ ret[None] = s
+
+ func()
+ assert ret[None] == 55
+
+
+@test_utils.test()
+def test_while_continue():
+ ret = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ i = 0
+ s = 0
+ while i < 10:
+ i += 1
+ if i % 2 == 0:
+ continue
+ s += i
+ ret[None] = s
+
+ func()
+ assert ret[None] == 25
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_func():
+ @ti.func
+ def bar(x):
+ return x * x, -x
+
+ a = ti.field(ti.i32, shape=(10, ))
+ b = ti.field(ti.i32, shape=(10, ))
+
+ @ti.kernel
+ def foo():
+ for i in a:
+ a[i], b[i] = bar(i)
+
+ foo()
+ for i in range(10):
+ assert a[i] == i * i
+ assert b[i] == -i
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_func_in_python_func():
+ @ti.func
+ def bar(x: ti.template()):
+ if ti.static(x):
+ mat = bar(x // 2)
+ mat = mat @ mat
+ if ti.static(x % 2):
+ mat = mat @ ti.Matrix([[1, 1], [1, 0]])
+ return mat
+ else:
+ return ti.Matrix([[1, 0], [0, 1]])
+
+ def fibonacci(x):
+ return impl.subscript(bar(x), 1, 0)
+
+ @ti.kernel
+ def foo(x: ti.template()) -> ti.i32:
+ return fibonacci(x)
+
+ fib = [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
+
+ for i in range(10):
+ assert foo(i) == fib[i]
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_ifexp():
+ @ti.kernel
+ def foo(x: ti.i32) -> ti.i32:
+ return 1 if x else 0
+
+ assert foo(1) == 1
+ assert foo(0) == 0
+
+
+@test_utils.test(print_preprocessed_ir=True)
+def test_static_ifexp():
+ @ti.kernel
+ def foo(x: ti.template()) -> ti.i32:
+ return 1 if ti.static(x) else 0
+
+ assert foo(1) == 1
+ assert foo(0) == 0
+
+
+@test_utils.test()
+def test_static_assign():
+ a = ti.field(ti.i32, shape=(1, ))
+ b = ti.field(ti.i32, shape=(1, ))
+
+ @ti.kernel
+ def foo(xx: ti.template(), yy: ti.template()) -> ti.i32:
+ x, y = ti.static(xx, yy)
+ x[0] -= 1
+ y[0] -= 1
+ return x[0] + y[0]
+
+ a[0] = 2
+ b[0] = 3
+ assert foo(a, b) == 3
+
+
+@test_utils.test()
+def test_static_assign_element():
+ with pytest.raises(
+ ti.TaichiCompilationError,
+ match='Static assign cannot be used on elements in arrays'):
+
+ @ti.kernel
+ def foo():
+ a = ti.static([1, 2, 3])
+ a[0] = ti.static(2)
+
+ foo()
+
+
+@test_utils.test()
+def test_recreate_variable():
+ with pytest.raises(ti.TaichiCompilationError,
+ match='Recreating variables is not allowed'):
+
+ @ti.kernel
+ def foo():
+ a = 1
+ a = ti.static(2)
+
+ foo()
+
+
+@test_utils.test()
+def test_taichi_other_than_ti():
+ import taichi as tc
+
+ @tc.func
+ def bar(x: tc.template()):
+ if tc.static(x):
+ mat = bar(x // 2)
+ mat = mat @ mat
+ if tc.static(x % 2):
+ mat = mat @ tc.Matrix([[1, 1], [1, 0]])
+ return mat
+ else:
+ return tc.Matrix([[1, 0], [0, 1]])
+
+ def fibonacci(x):
+ return impl.subscript(bar(x), 1, 0)
+
+ @tc.kernel
+ def foo(x: tc.template()) -> tc.i32:
+ return fibonacci(x)
+
+ fib = [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
+
+ for i in range(10):
+ assert foo(i) == fib[i]
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_message():
+ @ti.kernel
+ def func():
+ x = 20
+ assert 10 <= x < 20, 'Foo bar'
+
+ with pytest.raises(RuntimeError, match='Foo bar'):
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_assert_message_formatted():
+ x = ti.field(dtype=int, shape=16)
+ x[10] = 42
+
+ @ti.kernel
+ def assert_formatted():
+ for i in x:
+ assert x[i] == 0, 'x[%d] expect=%d got=%d' % (i, 0, x[i])
+
+ @ti.kernel
+ def assert_float():
+ y = 0.5
+ assert y < 0, 'y = %f' % y
+
+ with pytest.raises(RuntimeError, match=r'x\[10\] expect=0 got=42'):
+ assert_formatted()
+ # TODO: note that we are not fully polished to be able to recover from
+ # assertion failures...
+ with pytest.raises(RuntimeError, match=r'y = 0.5'):
+ assert_float()
+
+ # success case
+ x[10] = 0
+ assert_formatted()
+
+
+@test_utils.test()
+def test_dict():
+ @ti.kernel
+ def foo(x: ti.template()) -> ti.i32:
+ a = {1: 2, 3: 4}
+ b = {5: 6, **a}
+ return b[x]
+
+ assert foo(1) == 2
+ with pytest.raises(ti.TaichiCompilationError):
+ foo(2)
+
+
+@test_utils.test()
+def test_listcomp():
+ @ti.func
+ def identity(dt, n: ti.template()):
+ return ti.Matrix([[ti.cast(int(i == j), dt) for j in range(n)]
+ for i in range(n)])
+
+ @ti.kernel
+ def foo(n: ti.template()) -> ti.i32:
+ a = identity(ti.i32, n)
+ b = [j for i in a for j in i]
+ ret = 0
+ for i in ti.static(range(n)):
+ for j in ti.static(range(n)):
+ ret += i * j * b[i * n + j]
+ return ret
+
+ assert foo(5) == 1 + 4 + 9 + 16
+
+
+@test_utils.test()
+def test_dictcomp():
+ @ti.kernel
+ def foo(n: ti.template()) -> ti.i32:
+ a = {i: i * i for i in range(n) if i % 3 if i % 2}
+ ret = 0
+ for i in ti.static(range(n)):
+ if ti.static(i % 3):
+ if ti.static(i % 2):
+ ret += a[i]
+ return ret
+
+ assert foo(10) == 1 * 1 + 5 * 5 + 7 * 7
+
+
+@test_utils.test()
+def test_dictcomp_fail():
+ @ti.kernel
+ def foo(n: ti.template(), m: ti.template()) -> ti.i32:
+ a = {i: i * i for i in range(n) if i % 3 if i % 2}
+ return a[m]
+
+ with pytest.raises(ti.TaichiCompilationError):
+ foo(5, 2)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ foo(5, 3)
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.opengl])
+def test_ndarray():
+ n = 4
+ m = 7
+
+ @ti.kernel
+ def run(x: ti.any_arr(element_dim=2, layout=ti.Layout.AOS),
+ y: ti.any_arr()):
+ for i in ti.static(range(n)):
+ for j in ti.static(range(m)):
+ x[i, j][0, 0] += i + j + y[i, j]
+
+ a = ti.Matrix.ndarray(1, 1, ti.i32, shape=(n, m))
+ for i in range(n):
+ for j in range(m):
+ a[i, j][0, 0] = i * j
+ b = np.ones((n, m), dtype=np.int32)
+ run(a, b)
+ for i in range(n):
+ for j in range(m):
+ assert a[i, j][0, 0] == i * j + i + j + 1
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_builder():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.static(ti.ndrange(n, n)):
+ Abuilder[i, j] += i + j
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ for i in range(n):
+ for j in range(n):
+ assert A[i, j] == i + j
+
+
+@test_utils.test()
+def test_func_default_value():
+ @ti.func
+ def bar(s, t=1):
+ return s + t
+
+ @ti.kernel
+ def foo() -> ti.i32:
+ return bar(1)
+
+ assert foo() == 2
+
+
+@test_utils.test()
+def test_func_default_value_fail():
+ with pytest.raises(ti.TaichiCompilationError):
+
+ @ti.func
+ def bar(s, t=1):
+ return s + t
+
+ @ti.kernel
+ def foo() -> ti.i32:
+ return bar(1, 2, 3)
+
+ foo()
+
+
+@test_utils.test()
+def test_raise():
+ dim = 1
+ m = ti.Matrix.field(dim, dim, ti.f32)
+ ti.root.place(m)
+
+ with pytest.raises(
+ ti.TaichiCompilationError,
+ match="Polar decomposition only supports 2D and 3D matrices."):
+
+ @ti.kernel
+ def foo():
+ ti.polar_decompose(m, ti.f32)
+
+ foo()
+
+
+@test_utils.test()
+def test_scalar_argument():
+ @ti.kernel
+ def add(a: ti.f32, b: ti.f32) -> ti.f32:
+ a = a + b
+ return a
+
+ assert add(1.0, 2.0) == test_utils.approx(3.0)
+
+
+@test_utils.test()
+def test_default_template_args_on_func():
+ @ti.func
+ def bar(a: ti.template() = 123):
+ return a
+
+ @ti.kernel
+ def foo() -> ti.i32:
+ return bar()
+
+ assert foo() == 123
+
+
+@test_utils.test()
+def test_grouped_static_for_cast():
+ @ti.kernel
+ def foo() -> ti.f32:
+ ret = 0.
+ for I in ti.static(ti.grouped(ti.ndrange((4, 5), (3, 5), 5))):
+ tmp = I.cast(float)
+ ret += tmp[2] / 2
+ return ret
+
+ assert foo() == test_utils.approx(10)
diff --git a/tests/_python_orig/test_ast_resolver.py b/tests/_python_orig/test_ast_resolver.py
new file mode 100644
index 000000000..47ccd8bc3
--- /dev/null
+++ b/tests/_python_orig/test_ast_resolver.py
@@ -0,0 +1,47 @@
+import ast
+from collections import namedtuple
+
+from taichi.lang.ast.symbol_resolver import ASTResolver
+
+
+def test_ast_resolver_basic():
+ # import within the function to avoid polluting the global scope
+ import taichi as ti
+ ti.init()
+ node = ast.parse('ti.kernel', mode='eval').body
+ assert ASTResolver.resolve_to(node, ti.kernel, locals())
+
+
+def test_ast_resolver_direct_import():
+ import taichi as ti
+ ti.init()
+ from taichi import kernel
+ node = ast.parse('kernel', mode='eval').body
+ assert ASTResolver.resolve_to(node, kernel, locals())
+
+
+def test_ast_resolver_alias():
+ import taichi
+ taichi.init()
+ node = ast.parse('taichi.kernel', mode='eval').body
+ assert ASTResolver.resolve_to(node, taichi.kernel, locals())
+
+ import taichi as tc
+ node = ast.parse('tc.kernel', mode='eval').body
+ assert ASTResolver.resolve_to(node, tc.kernel, locals())
+
+
+def test_ast_resolver_chain():
+ import taichi as ti
+ ti.init()
+ node = ast.parse('ti.lang.ops.atomic_add', mode='eval').body
+ assert ASTResolver.resolve_to(node, ti.atomic_add, locals())
+
+
+def test_ast_resolver_wrong_ti():
+ import taichi
+ taichi.init()
+ fake_ti = namedtuple('FakeTi', ['kernel'])
+ ti = fake_ti(kernel='fake')
+ node = ast.parse('ti.kernel', mode='eval').body
+ assert not ASTResolver.resolve_to(node, taichi.kernel, locals())
diff --git a/tests/_python_orig/test_async.py b/tests/_python_orig/test_async.py
new file mode 100644
index 000000000..18f5c5d56
--- /dev/null
+++ b/tests/_python_orig/test_async.py
@@ -0,0 +1,57 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_simple():
+ n = 32
+
+ x = ti.field(dtype=ti.i32, shape=n)
+
+ @ti.kernel
+ def double():
+ for i in x:
+ x[i] = i * 2
+
+ double()
+
+ for i in range(n):
+ assert x[i] == i * 2
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_numpy():
+ n = 10000
+
+ @ti.kernel
+ def inc(a: ti.ext_arr()):
+ for i in range(n):
+ a[i] += i
+
+ x = np.zeros(dtype=np.int32, shape=n)
+ for i in range(10):
+ inc(x)
+
+ for i in range(n):
+ assert x[i] == i * 10
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_listgen_opt_with_offsets():
+ x = ti.field(dtype=ti.i32)
+
+ ti.root.pointer(ti.i, 4).dense(ti.i, 4).place(x, offset=-8)
+
+ @ti.kernel
+ def inc():
+ for i in x:
+ x[i] += 1
+
+ for i in range(10):
+ inc()
+
+ ti.sync()
+ assert ti.tools.async_utils.get_kernel_stats().get_counters(
+ )['launched_tasks_list_gen'] <= 2
diff --git a/tests/_python_orig/test_atomic.py b/tests/_python_orig/test_atomic.py
new file mode 100644
index 000000000..73896fcfa
--- /dev/null
+++ b/tests/_python_orig/test_atomic.py
@@ -0,0 +1,337 @@
+import taichi as ti
+from tests import test_utils
+
+n = 128
+
+
+def run_atomic_add_global_case(vartype, step, valproc=lambda x: x):
+ x = ti.field(vartype)
+ y = ti.field(vartype)
+ c = ti.field(vartype)
+
+ ti.root.dense(ti.i, n).place(x, y)
+ ti.root.place(c)
+
+ # Make Taichi correctly infer the type
+ # TODO: Taichi seems to treat numpy.int32 as a float type, fix that.
+ init_ck = 0 if vartype == ti.i32 else 0.0
+
+ @ti.kernel
+ def func():
+ ck = init_ck
+ for i in range(n):
+ x[i] = ti.atomic_add(c[None], step)
+ y[i] = ti.atomic_add(ck, step)
+
+ func()
+
+ assert valproc(c[None]) == n * step
+ x_actual = sorted(x.to_numpy())
+ y_actual = sorted(y.to_numpy())
+ expect = [i * step for i in range(n)]
+ for (xa, ya, e) in zip(x_actual, y_actual, expect):
+ print(xa, ya, e)
+ assert valproc(xa) == e
+ assert valproc(ya) == e
+
+
+@test_utils.test()
+def test_atomic_add_global_i32():
+ run_atomic_add_global_case(ti.i32, 42)
+
+
+@test_utils.test()
+def test_atomic_add_global_f32():
+ run_atomic_add_global_case(
+ ti.f32, 4.2, valproc=lambda x: test_utils.approx(x, rel=1e-5))
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_atomic_min_max_uint():
+ x = ti.field(ti.u64, shape=100)
+
+ @ti.kernel
+ def test0():
+ for I in x:
+ x[I] = 0
+ x[1] = ti.cast(1, ti.u64) << 63
+ for I in x:
+ ti.atomic_max(x[0], x[I])
+
+ test0()
+ assert x[0] == 9223372036854775808
+
+ @ti.kernel
+ def test1():
+ for I in x:
+ x[I] = ti.cast(1, ti.u64) << 63
+ x[1] = 100
+ for I in x:
+ ti.atomic_min(x[0], x[I])
+
+ test1()
+ assert x[0] == 100
+
+
+@test_utils.test()
+def test_atomic_add_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_add(c[None], step)
+
+ func()
+
+ assert c[None] == n * step
+
+
+@test_utils.test()
+def test_atomic_add_demoted():
+ # Ensure demoted atomics do not crash the program.
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ step = 42
+
+ ti.root.dense(ti.i, n).place(x, y)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ s = i
+ # Both adds should get demoted.
+ x[i] = ti.atomic_add(s, step)
+ y[i] = ti.atomic_add(s, step)
+
+ func()
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i + step
+
+
+@test_utils.test()
+def test_atomic_add_with_local_store_simplify1():
+ # Test for the following LocalStoreStmt simplification case:
+ #
+ # local store [$a <- ...]
+ # atomic add ($a, ...)
+ # local store [$a <- ...]
+ #
+ # Specifically, the second store should not suppress the first one, because
+ # atomic_add can return value.
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ step = 42
+
+ ti.root.dense(ti.i, n).place(x, y)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ # do a local store
+ j = i
+ x[i] = ti.atomic_add(j, step)
+ # do another local store, make sure the previous one is not optimized out
+ j = x[i]
+ y[i] = j
+
+ func()
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i
+
+
+@test_utils.test()
+def test_atomic_add_with_local_store_simplify2():
+ # Test for the following LocalStoreStmt simplification case:
+ #
+ # local store [$a <- ...]
+ # atomic add ($a, ...)
+ #
+ # Specifically, the local store should not be removed, because
+ # atomic_add can return its value.
+ x = ti.field(ti.i32)
+ step = 42
+
+ ti.root.dense(ti.i, n).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ j = i
+ x[i] = ti.atomic_add(j, step)
+
+ func()
+
+ for i in range(n):
+ assert x[i] == i
+
+
+@test_utils.test()
+def test_atomic_add_with_if_simplify():
+ # Make sure IfStmt simplification doesn't move stmts depending on the result
+ # of atomic_add()
+ x = ti.field(ti.i32)
+ step = 42
+
+ ti.root.dense(ti.i, n).place(x)
+
+ boundary = n / 2
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ if i > boundary:
+ # A sequence of commands designed such that atomic_add() is the only
+ # thing to decide whether the if branch can be simplified.
+ s = i
+ j = ti.atomic_add(s, s)
+ k = j + s
+ x[i] = k
+ else:
+ # If we look at the IR, this branch should be simplified, since nobody
+ # is using atomic_add's result.
+ ti.atomic_add(x[i], i)
+ x[i] += step
+
+ func()
+
+ for i in range(n):
+ expect = i * 3 if i > boundary else (i + step)
+ assert x[i] == expect
+
+
+@test_utils.test()
+def test_local_atomic_with_if():
+ ret = ti.field(dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def test():
+ if True:
+ x = 0
+ x += 1
+ ret[None] = x
+
+ test()
+ assert ret[None] == 1
+
+
+@test_utils.test()
+def test_atomic_sub_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_sub(c[None], step)
+
+ func()
+
+ assert c[None] == -n * step
+
+
+@test_utils.test()
+def test_atomic_max_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_max(c[None], i * step)
+
+ func()
+
+ assert c[None] == (n - 1) * step
+
+
+@test_utils.test()
+def test_atomic_min_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ c[None] = 1000
+ for i in range(n):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_min(c[None], i * step)
+
+ func()
+
+ assert c[None] == 0
+
+
+@test_utils.test()
+def test_atomic_and_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ max_int = 2147483647
+
+ @ti.kernel
+ def func():
+ c[None] = 1023
+ for i in range(10):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_and(c[None], max_int - 2**i)
+
+ func()
+
+ assert c[None] == 0
+
+
+@test_utils.test()
+def test_atomic_or_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ c[None] = 0
+ for i in range(10):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_or(c[None], 2**i)
+
+ func()
+
+ assert c[None] == 1023
+
+
+@test_utils.test()
+def test_atomic_xor_expr_evaled():
+ c = ti.field(ti.i32)
+ step = 42
+
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ c[None] = 1023
+ for i in range(10):
+ # this is an expr with side effect, make sure it's not optimized out.
+ ti.atomic_xor(c[None], 2**i)
+
+ func()
+
+ assert c[None] == 0
diff --git a/tests/_python_orig/test_basics.py b/tests/_python_orig/test_basics.py
new file mode 100644
index 000000000..cd452526c
--- /dev/null
+++ b/tests/_python_orig/test_basics.py
@@ -0,0 +1,128 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_simple():
+ n = 128
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def func():
+ x[7] = 120
+
+ func()
+
+ for i in range(n):
+ if i == 7:
+ assert x[i] == 120
+ else:
+ assert x[i] == 0
+
+
+@test_utils.test()
+def test_range_loops():
+ n = 128
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ x[i] = i + 123
+
+ func()
+
+ for i in range(n):
+ assert x[i] == i + 123
+
+
+@test_utils.test()
+def test_python_access():
+ n = 128
+ x = ti.field(ti.i32, shape=n)
+
+ x[3] = 123
+ x[4] = 456
+ assert x[3] == 123
+ assert x[4] == 456
+
+
+@test_utils.test()
+def test_if():
+ x = ti.field(ti.f32, shape=16)
+
+ @ti.kernel
+ def if_test():
+ for i in x:
+ if i < 100:
+ x[i] = 100
+ else:
+ x[i] = i
+
+ if_test()
+
+ for i in range(16):
+ assert x[i] == 100
+
+ @ti.kernel
+ def if_test2():
+ for i in x:
+ if i < 100:
+ x[i] = i
+ else:
+ x[i] = 100
+
+ if_test2()
+
+ for i in range(16):
+ assert x[i] == i
+
+
+@test_utils.test()
+def test_if_global_load():
+ x = ti.field(ti.i32, shape=16)
+
+ @ti.kernel
+ def fill():
+ for i in x:
+ if x[i]:
+ x[i] = i
+
+ for i in range(16):
+ x[i] = i % 2
+
+ fill()
+
+ for i in range(16):
+ if i % 2 == 0:
+ assert x[i] == 0
+ else:
+ assert x[i] == i
+
+
+@test_utils.test()
+def test_while_global_load():
+ x = ti.field(ti.i32, shape=16)
+ y = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def run():
+ while x[3]:
+ x[3] -= 1
+ y[None] += 1
+
+ for i in range(16):
+ x[i] = i
+
+ run()
+
+ assert y[None] == 3
+
+
+@test_utils.test()
+def test_datatype_string():
+ for ty in [
+ ti.u8, ti.u16, ti.u32, ti.u64, ti.i8, ti.i16, ti.i32, ti.f32,
+ ti.f64
+ ]:
+ assert ty.to_string() == str(ty)
diff --git a/tests/_python_orig/test_binding.py b/tests/_python_orig/test_binding.py
new file mode 100644
index 000000000..6a7b197d2
--- /dev/null
+++ b/tests/_python_orig/test_binding.py
@@ -0,0 +1,13 @@
+import taichi as ti
+
+
+def test_binding():
+ ti.init()
+ taichi_lang = ti._lib.core
+ print(taichi_lang.BinaryOpType.mul)
+ one = taichi_lang.make_const_expr_int(ti.i32, 1)
+ two = taichi_lang.make_const_expr_int(ti.i32, 2)
+ expr = taichi_lang.make_binary_op_expr(taichi_lang.BinaryOpType.add, one,
+ two)
+ print(expr.serialize())
+ print(taichi_lang.make_global_store_stmt(None, None))
diff --git a/tests/_python_orig/test_bit_array.py b/tests/_python_orig/test_bit_array.py
new file mode 100644
index 000000000..d5426ab3c
--- /dev/null
+++ b/tests/_python_orig/test_bit_array.py
@@ -0,0 +1,54 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.quant, debug=True)
+def test_1D_bit_array():
+ cu1 = ti.types.quantized_types.quant.int(1, False)
+
+ x = ti.field(dtype=cu1)
+
+ N = 32
+
+ ti.root.bit_array(ti.i, N, num_bits=32).place(x)
+
+ @ti.kernel
+ def set_val():
+ for i in range(N):
+ x[i] = i % 2
+
+ @ti.kernel
+ def verify_val():
+ for i in range(N):
+ assert x[i] == i % 2
+
+ set_val()
+ verify_val()
+
+
+@test_utils.test(require=ti.extension.quant, debug=True)
+def test_2D_bit_array():
+ ci1 = ti.types.quantized_types.quant.int(1, False)
+
+ x = ti.field(dtype=ci1)
+
+ M, N = 4, 8
+
+ ti.root.bit_array(ti.ij, (M, N), num_bits=32).place(x)
+
+ @ti.kernel
+ def set_val():
+ for i in range(M):
+ for j in range(N):
+ x[i, j] = (i * N + j) % 2
+
+ @ti.kernel
+ def verify_val():
+ for i in range(M):
+ for j in range(N):
+ assert x[i, j] == (i * N + j) % 2
+
+ set_val()
+ verify_val()
diff --git a/tests/_python_orig/test_bit_array_vectorization.py b/tests/_python_orig/test_bit_array_vectorization.py
new file mode 100644
index 000000000..02afb38ec
--- /dev/null
+++ b/tests/_python_orig/test_bit_array_vectorization.py
@@ -0,0 +1,179 @@
+from taichi.lang.impl import get_runtime
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.quant,
+ debug=True,
+ cfg_optimization=False)
+def test_vectorized_struct_for():
+ cu1 = ti.types.quantized_types.quant.int(1, False)
+
+ x = ti.field(dtype=cu1)
+ y = ti.field(dtype=cu1)
+
+ N = 4096
+ n_blocks = 4
+ bits = 32
+ boundary_offset = 1024
+
+ block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(x)
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(y)
+
+ @ti.kernel
+ def init():
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ x[i, j] = ti.random(dtype=ti.i32) % 2
+
+ @ti.kernel
+ def assign_vectorized():
+ get_runtime().prog.current_ast_builder().bit_vectorize(32)
+ for i, j in x:
+ y[i, j] = x[i, j]
+
+ @ti.kernel
+ def verify():
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ assert y[i, j] == x[i, j]
+
+ init()
+ assign_vectorized()
+ verify()
+
+
+@test_utils.test(require=ti.extension.quant)
+def test_offset_load():
+ ci1 = ti.types.quantized_types.quant.int(1, False)
+
+ x = ti.field(dtype=ci1)
+ y = ti.field(dtype=ci1)
+ z = ti.field(dtype=ci1)
+
+ N = 4096
+ n_blocks = 4
+ bits = 32
+ boundary_offset = 1024
+ assert boundary_offset >= N // n_blocks
+
+ block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(x)
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(y)
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(z)
+
+ @ti.kernel
+ def init():
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ x[i, j] = ti.random(dtype=ti.i32) % 2
+
+ @ti.kernel
+ def assign_vectorized(dx: ti.template(), dy: ti.template()):
+ get_runtime().prog.current_ast_builder().bit_vectorize(32)
+ for i, j in x:
+ y[i, j] = x[i + dx, j + dy]
+ z[i, j] = x[i + dx, j + dy]
+
+ @ti.kernel
+ def verify(dx: ti.template(), dy: ti.template()):
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ assert y[i, j] == x[i + dx, j + dy]
+
+ init()
+ assign_vectorized(0, 1)
+ verify(0, 1)
+ assign_vectorized(1, 0)
+ verify(1, 0)
+ assign_vectorized(0, -1)
+ verify(0, -1)
+ assign_vectorized(-1, 0)
+ verify(-1, 0)
+ assign_vectorized(1, 1)
+ verify(1, 1)
+ assign_vectorized(1, -1)
+ verify(1, -1)
+ assign_vectorized(-1, -1)
+ verify(-1, -1)
+ assign_vectorized(-1, 1)
+ verify(-1, 1)
+
+
+@test_utils.test(require=ti.extension.quant, debug=True)
+def test_evolve():
+ ci1 = ti.types.quantized_types.quant.int(1, False)
+
+ x = ti.field(dtype=ci1)
+ y = ti.field(dtype=ci1)
+ z = ti.field(dtype=ci1)
+
+ N = 4096
+ n_blocks = 4
+ bits = 32
+ boundary_offset = 1024
+ assert boundary_offset >= N // n_blocks
+
+ block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(x)
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(y)
+ block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks))).bit_array(
+ ti.j, bits, num_bits=bits).place(z)
+
+ @ti.kernel
+ def init():
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ x[i, j] = ti.random(dtype=ti.i32) % 2
+
+ @ti.kernel
+ def evolve_vectorized(x: ti.template(), y: ti.template()):
+ get_runtime().prog.current_ast_builder().bit_vectorize(32)
+ for i, j in x:
+ num_active_neighbors = 0
+ num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+ num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+ num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+ y[i, j] = (num_active_neighbors == 3) | \
+ ((num_active_neighbors == 2) & (x[i, j] == 1))
+
+ @ti.kernel
+ def evolve_naive(x: ti.template(), y: ti.template()):
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ num_active_neighbors = 0
+ num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+ num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+ num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+ num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+ y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
+ and x[i, j] == 1)
+
+ @ti.kernel
+ def verify():
+ for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+ (boundary_offset, N - boundary_offset)):
+ assert y[i, j] == z[i, j]
+
+ init()
+ evolve_naive(x, z)
+ evolve_vectorized(x, y)
+ verify()
diff --git a/tests/_python_orig/test_bit_operations.py b/tests/_python_orig/test_bit_operations.py
new file mode 100644
index 000000000..5c11194dd
--- /dev/null
+++ b/tests/_python_orig/test_bit_operations.py
@@ -0,0 +1,49 @@
+import operator as ops
+
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_bit_shl():
+ @ti.kernel
+ def shl(a: ti.i32, b: ti.i32) -> ti.i32:
+ return a << b
+
+ for i in range(8):
+ assert shl(3, i) == 3 * 2**i
+
+
+@test_utils.test()
+def test_bit_sar():
+ @ti.kernel
+ def sar(a: ti.i32, b: ti.i32) -> ti.i32:
+ return a >> b
+
+ n = 8
+ test_num = 2**n
+ neg_test_num = -test_num
+ for i in range(n):
+ assert sar(test_num, i) == 2**(n - i)
+ # for negative number
+ for i in range(n):
+ assert sar(neg_test_num, i) == -2**(n - i)
+
+
+@test_utils.test()
+def test_bit_shr():
+ @ti.kernel
+ def shr(a: ti.i32, b: ti.i32) -> ti.i32:
+ return ti.bit_shr(a, b)
+
+ n = 8
+ test_num = 2**n
+ neg_test_num = -test_num
+ for i in range(n):
+ assert shr(test_num, i) == 2**(n - i)
+ for i in range(n):
+ offset = 0x100000000 if i > 0 else 0
+ assert shr(neg_test_num, i) == (neg_test_num + offset) >> i
diff --git a/tests/_python_orig/test_bit_struct.py b/tests/_python_orig/test_bit_struct.py
new file mode 100644
index 000000000..dde979230
--- /dev/null
+++ b/tests/_python_orig/test_bit_struct.py
@@ -0,0 +1,174 @@
+import numpy as np
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.quant_basic, debug=True)
+def test_simple_array():
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+ cu19 = ti.types.quantized_types.quant.int(19, False)
+
+ x = ti.field(dtype=ci13)
+ y = ti.field(dtype=cu19)
+
+ N = 12
+
+ ti.root.dense(ti.i, N).bit_struct(num_bits=32).place(x, y)
+
+ @ti.kernel
+ def set_val():
+ for i in range(N):
+ x[i] = -2**i
+ y[i] = 2**i - 1
+
+ @ti.kernel
+ def verify_val():
+ for i in range(N):
+ assert x[i] == -2**i
+ assert y[i] == 2**i - 1
+
+ set_val()
+ verify_val()
+
+ # Test bit_struct SNode read and write in Python-scope by calling the wrapped, untranslated function body
+ set_val.__wrapped__()
+ verify_val.__wrapped__()
+
+
+# TODO: remove excluding of ti.metal
+@test_utils.test(require=ti.extension.quant_basic,
+ exclude=[ti.metal],
+ debug=True)
+def test_custom_int_load_and_store():
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+ cu14 = ti.types.quantized_types.quant.int(14, False)
+ ci5 = ti.types.quantized_types.quant.int(5, True)
+
+ x = ti.field(dtype=ci13)
+ y = ti.field(dtype=cu14)
+ z = ti.field(dtype=ci5)
+
+ test_case_np = np.array(
+ [[2**12 - 1, 2**14 - 1, -(2**3)], [2**11 - 1, 2**13 - 1, -(2**2)],
+ [0, 0, 0], [123, 4567, 8], [10, 31, 11]],
+ dtype=np.int32)
+
+ ti.root.bit_struct(num_bits=32).place(x, y, z)
+ test_case = ti.Vector.field(3, dtype=ti.i32, shape=len(test_case_np))
+ test_case.from_numpy(test_case_np)
+
+ @ti.kernel
+ def set_val(idx: ti.i32):
+ x[None] = test_case[idx][0]
+ y[None] = test_case[idx][1]
+ z[None] = test_case[idx][2]
+
+ @ti.kernel
+ def verify_val(idx: ti.i32):
+ assert x[None] == test_case[idx][0]
+ assert y[None] == test_case[idx][1]
+ assert z[None] == test_case[idx][2]
+
+ for idx in range(len(test_case_np)):
+ set_val(idx)
+ verify_val(idx)
+
+ # Test bit_struct SNode read and write in Python-scope by calling the wrapped, untranslated function body
+ for idx in range(len(test_case_np)):
+ set_val.__wrapped__(idx)
+ verify_val.__wrapped__(idx)
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_custom_int_full_struct():
+ cit = ti.types.quantized_types.quant.int(32, True)
+ x = ti.field(dtype=cit)
+ ti.root.dense(ti.i, 1).bit_struct(num_bits=32).place(x)
+
+ x[0] = 15
+ assert x[0] == 15
+
+ x[0] = 12
+ assert x[0] == 12
+
+
+def test_bit_struct():
+ def test_single_bit_struct(physical_type, compute_type, custom_bits,
+ test_case):
+ ti.init(arch=ti.cpu, debug=True)
+
+ cit1 = ti.types.quantized_types.quant.int(custom_bits[0], True,
+ compute_type)
+ cit2 = ti.types.quantized_types.quant.int(custom_bits[1], False,
+ compute_type)
+ cit3 = ti.types.quantized_types.quant.int(custom_bits[2], True,
+ compute_type)
+
+ a = ti.field(dtype=cit1)
+ b = ti.field(dtype=cit2)
+ c = ti.field(dtype=cit3)
+ ti.root.bit_struct(num_bits=physical_type).place(a, b, c)
+
+ @ti.kernel
+ def set_val(test_val: ti.ext_arr()):
+ a[None] = test_val[0]
+ b[None] = test_val[1]
+ c[None] = test_val[2]
+
+ @ti.kernel
+ def verify_val(test_val: ti.ext_arr()):
+ assert a[None] == test_val[0]
+ assert b[None] == test_val[1]
+ assert c[None] == test_val[2]
+
+ set_val(test_case)
+ verify_val(test_case)
+
+ ti.reset()
+
+ test_single_bit_struct(8, ti.i8, [3, 3, 2],
+ np.array([2**2 - 1, 2**3 - 1, -2**1]))
+ test_single_bit_struct(16, ti.i16, [4, 7, 5],
+ np.array([2**3 - 1, 2**7 - 1, -2**4]))
+ test_single_bit_struct(32, ti.i32, [17, 11, 4],
+ np.array([2**16 - 1, 2**10 - 1, -2**3]))
+ test_single_bit_struct(64, ti.i64, [32, 23, 9],
+ np.array([2**31 - 1, 2**23 - 1, -2**8]))
+ test_single_bit_struct(32, ti.i16, [7, 12, 13],
+ np.array([2**6 - 1, 2**12 - 1, -2**12]))
+ test_single_bit_struct(64, ti.i32, [18, 22, 24],
+ np.array([2**17 - 1, 2**22 - 1, -2**23]))
+
+ test_single_bit_struct(16, ti.i16, [5, 5, 6], np.array([15, 5, 20]))
+ test_single_bit_struct(32, ti.i32, [10, 10, 12], np.array([11, 19, 2020]))
+
+
+@test_utils.test(require=[ti.extension.quant_basic, ti.extension.sparse],
+ debug=True)
+def test_bit_struct_struct_for():
+ block_size = 16
+ N = 64
+ cell = ti.root.pointer(ti.i, N // block_size)
+ fixed32 = ti.types.quantized_types.quant.fixed(frac=32, num_range=1024)
+
+ x = ti.field(dtype=fixed32)
+ cell.dense(ti.i, block_size).bit_struct(32).place(x)
+
+ for i in range(N):
+ if i // block_size % 2 == 0:
+ x[i] = 0
+
+ @ti.kernel
+ def assign():
+ for i in x:
+ x[i] = ti.cast(i, float)
+
+ assign()
+
+ for i in range(N):
+ if i // block_size % 2 == 0:
+ assert x[i] == approx(i, abs=1e-3)
+ else:
+ assert x[i] == 0
diff --git a/tests/_python_orig/test_bitmasked.py b/tests/_python_orig/test_bitmasked.py
new file mode 100644
index 000000000..e31b4eb71
--- /dev/null
+++ b/tests/_python_orig/test_bitmasked.py
@@ -0,0 +1,315 @@
+import taichi as ti
+from tests import test_utils
+
+
+def _test_basic():
+ x = ti.field(ti.i32)
+ c = ti.field(ti.i32)
+ s = ti.field(ti.i32)
+
+ bm = ti.root.bitmasked(ti.ij, (3, 6)).bitmasked(ti.i, 5)
+ bm.place(x)
+ ti.root.place(c, s)
+
+ @ti.kernel
+ def run():
+ x[5, 1] = 2
+ x[9, 4] = 20
+ x[0, 3] = 20
+
+ @ti.kernel
+ def sum():
+ for i, j in x:
+ c[None] += ti.is_active(bm, [i, j])
+ s[None] += x[i, j]
+
+ run()
+ sum()
+
+ assert c[None] == 3
+ assert s[None] == 42
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_basic():
+ _test_basic()
+
+
+@test_utils.test(require=[ti.extension.sparse, ti.extension.packed],
+ packed=True)
+def test_basic_packed():
+ _test_basic()
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_bitmasked_then_dense():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.bitmasked(ti.i, n).dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ x[0] = 1
+ x[127] = 1
+ x[256] = 1
+ x[257] = 1
+
+ func()
+ assert s[None] == 256
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_bitmasked_bitmasked():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.bitmasked(ti.i, n).bitmasked(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ x[0] = 1
+ x[127] = 1
+ x[256] = 1
+ x[257] = 1
+
+ func()
+ assert s[None] == 4
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_huge_bitmasked():
+ # Mainly for testing Metal listgen's grid-stride loop implementation.
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 1024
+
+ ti.root.bitmasked(ti.i, n).bitmasked(ti.i, 2 * n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def func():
+ for i in range(n * n * 2):
+ if i % 32 == 0:
+ x[i] = 1.0
+
+ @ti.kernel
+ def count():
+ for i in x:
+ s[None] += 1
+
+ func()
+ count()
+ assert s[None] == (n * n * 2) // 32
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_bitmasked_listgen_bounded():
+    # Mainly for testing that Metal's listgen is bounded by the actual number of
+ # elements possible for that SNode. Note that 1) SNode's size is padded
+ # to POT, and 2) Metal ListManager's data size is not padded, we need to
+ # make sure listgen doesn't go beyond ListManager's capacity.
+ x = ti.field(ti.i32)
+ c = ti.field(ti.i32)
+
+    # A prime that is a bit higher than 65536, which is Metal's maximum number of
+ # threads for listgen.
+ n = 80173
+
+ ti.root.dense(ti.i, n).bitmasked(ti.i, 1).place(x)
+ ti.root.place(c)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ x[i] = 1
+
+ @ti.kernel
+ def count():
+ for i in x:
+ c[None] += 1
+
+ func()
+ count()
+ assert c[None] == n
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_deactivate():
+ # https://github.com/taichi-dev/taichi/issues/778
+ a = ti.field(ti.i32)
+ a_a = ti.root.bitmasked(ti.i, 4)
+ a_b = a_a.dense(ti.i, 4)
+ a_b.place(a)
+ c = ti.field(ti.i32)
+ ti.root.place(c)
+
+ @ti.kernel
+ def run():
+ a[0] = 123
+
+ @ti.kernel
+ def is_active():
+ c[None] = ti.is_active(a_a, [0])
+
+ @ti.kernel
+ def deactivate():
+ ti.deactivate(a_a, [0])
+
+ run()
+ is_active()
+ assert c[None] == 1
+
+ deactivate()
+ is_active()
+ assert c[None] == 0
+
+
+def _test_sparsity_changes():
+ x = ti.field(ti.i32)
+ c = ti.field(ti.i32)
+ s = ti.field(ti.i32)
+
+ bm = ti.root.bitmasked(ti.i, 5).bitmasked(ti.i, 3)
+ bm.place(x)
+ ti.root.place(c, s)
+
+ @ti.kernel
+ def run():
+ for i in x:
+ s[None] += x[i]
+ c[None] += 1
+
+ # Only two elements of |x| are activated
+ x[1] = 2
+ x[8] = 20
+ run()
+ assert c[None] == 2
+ assert s[None] == 22
+
+ c[None] = 0
+ s[None] = 0
+ # Four elements are activated now
+ x[7] = 15
+ x[14] = 5
+
+ run()
+ assert c[None] == 4
+ assert s[None] == 42
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_sparsity_changes():
+ _test_sparsity_changes()
+
+
+@test_utils.test(require=[ti.extension.sparse, ti.extension.packed],
+ packed=True)
+def test_sparsity_changes_packed():
+ _test_sparsity_changes()
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_bitmasked_offset_child():
+ x = ti.field(ti.i32)
+ x2 = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ y2 = ti.field(ti.i32)
+ y3 = ti.field(ti.i32)
+ z = ti.field(ti.i32)
+ s = ti.field(ti.i32, shape=())
+
+ n = 16
+ # Offset children:
+ # * In |bm|'s cell: |bm2| has a non-zero offset
+ # * In |bm2|'s cell: |z| has a non-zero offset
+ # * We iterate over |z| to test the listgen handles offsets correctly
+ bm = ti.root.bitmasked(ti.i, n)
+ bm.dense(ti.i, 16).place(x, x2)
+ bm2 = bm.bitmasked(ti.i, 4)
+
+ bm2.dense(ti.i, 4).place(y, y2, y3)
+ bm2.bitmasked(ti.i, 4).place(z)
+
+ @ti.kernel
+ def func():
+ for _ in z:
+ s[None] += 1
+
+ z[0] = 1
+ z[7] = 1
+ z[42] = 1
+ z[53] = 1
+ z[88] = 1
+ z[101] = 1
+ z[233] = 1
+
+ func()
+ assert s[None] == 7
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_bitmasked_2d_power_of_two():
+ some_val = ti.field(dtype=float)
+ width, height = 10, 10
+ total = width * height
+ ptr = ti.root.bitmasked(ti.ij, (width, height))
+ ptr.place(some_val)
+ num_active = ti.field(dtype=int, shape=())
+
+ @ti.kernel
+ def init():
+ num_active[None] = 0
+ for x, y in ti.ndrange(width, height):
+ some_val[x, y] = 5
+ num_active[None] += 1
+
+ @ti.kernel
+ def run():
+ num_active[None] = 0
+ for x, y in some_val:
+ num_active[None] += 1
+
+ init()
+ assert num_active[None] == total
+ run()
+ assert num_active[None] == total
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_root_deactivate():
+ a = ti.field(ti.i32)
+ a_a = ti.root.bitmasked(ti.i, 4)
+ a_b = a_a.dense(ti.i, 4)
+ a_b.place(a)
+ c = ti.field(ti.i32)
+ ti.root.place(c)
+
+ @ti.kernel
+ def run():
+ a[0] = 123
+
+ @ti.kernel
+ def is_active():
+ c[None] = ti.is_active(a_a, [0])
+
+ run()
+ is_active()
+ assert c[None] == 1
+
+ ti.root.deactivate_all()
+ is_active()
+ assert c[None] == 0
diff --git a/tests/_python_orig/test_bls.py b/tests/_python_orig/test_bls.py
new file mode 100644
index 000000000..b4116bbf0
--- /dev/null
+++ b/tests/_python_orig/test_bls.py
@@ -0,0 +1,176 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_simple_1d():
+ x, y = ti.field(ti.f32), ti.field(ti.f32)
+
+ N = 64
+ bs = 16
+
+ ti.root.pointer(ti.i, N // bs).dense(ti.i, bs).place(x, y)
+
+ @ti.kernel
+ def populate():
+ for i in range(N):
+ x[i] = i
+
+ @ti.kernel
+ def copy():
+ ti.block_local(x)
+ for i in x:
+ y[i] = x[i]
+
+ populate()
+ copy()
+
+ for i in range(N):
+ assert y[i] == i
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_simple_2d():
+ x, y = ti.field(ti.f32), ti.field(ti.f32)
+
+ N = 16
+ bs = 16
+
+ ti.root.pointer(ti.ij, N // bs).dense(ti.ij, bs).place(x, y)
+
+ @ti.kernel
+ def populate():
+ for i, j in ti.ndrange(N, N):
+ x[i, j] = i - j
+
+ @ti.kernel
+ def copy():
+ ti.block_local(x)
+ for i, j in x:
+ y[i, j] = x[i, j]
+
+ populate()
+ copy()
+
+ for i in range(N):
+ for j in range(N):
+ assert y[i, j] == i - j
+
+
+def _test_bls_stencil(*args, **kwargs):
+ from .bls_test_template import bls_test_template
+ bls_test_template(*args, **kwargs)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gather_1d_trivial():
+ # y[i] = x[i]
+ _test_bls_stencil(1, 128, bs=32, stencil=((0, ), ))
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gather_1d():
+ # y[i] = x[i - 1] + x[i]
+ _test_bls_stencil(1, 128, bs=32, stencil=((-1, ), (0, )))
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gather_2d():
+ stencil = [(0, 0), (0, -1), (0, 1), (1, 0)]
+ _test_bls_stencil(2, 128, bs=16, stencil=stencil)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gather_2d_nonsquare():
+ stencil = [(0, 0), (0, -1), (0, 1), (1, 0)]
+ _test_bls_stencil(2, 128, bs=(4, 16), stencil=stencil)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gather_3d():
+ stencil = [(-1, -1, -1), (2, 0, 1)]
+ _test_bls_stencil(3, 64, bs=(4, 8, 16), stencil=stencil)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_scatter_1d_trivial():
+ # y[i] = x[i]
+ _test_bls_stencil(1, 128, bs=32, stencil=((0, ), ), scatter=True)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_scatter_1d():
+ _test_bls_stencil(1, 128, bs=32, stencil=(
+ (1, ),
+ (0, ),
+ ), scatter=True)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_scatter_2d():
+ stencil = [(0, 0), (0, -1), (0, 1), (1, 0)]
+ _test_bls_stencil(2, 128, bs=16, stencil=stencil, scatter=True)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_multiple_inputs():
+ x, y, z, w, w2 = ti.field(ti.i32), ti.field(ti.i32), ti.field(
+ ti.i32), ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 128
+ bs = 8
+
+ ti.root.pointer(ti.ij, N // bs).dense(ti.ij, bs).place(x, y, z, w, w2)
+
+ @ti.kernel
+ def populate():
+ for i, j in ti.ndrange((bs, N - bs), (bs, N - bs)):
+ x[i, j] = i - j
+ y[i, j] = i + j * j
+ z[i, j] = i * i - j
+
+ @ti.kernel
+ def copy(bls: ti.template(), w: ti.template()):
+ if ti.static(bls):
+ ti.block_local(x, y, z)
+ for i, j in x:
+ w[i,
+ j] = x[i, j - 2] + y[i + 2, j -
+ 1] + y[i - 1, j] + z[i - 1, j] + z[i + 1, j]
+
+ populate()
+ copy(False, w2)
+ copy(True, w)
+
+ for i in range(N):
+ for j in range(N):
+ assert w[i, j] == w2[i, j]
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_bls_large_block():
+ n = 2**10
+ block_size = 32
+ stencil_length = 28 # uses 60 * 60 * 4B = 14.0625KiB shared memory
+
+ a = ti.field(dtype=ti.f32)
+ b = ti.field(dtype=ti.f32)
+ block = ti.root.pointer(ti.ij, n // block_size)
+ block.dense(ti.ij, block_size).place(a)
+ block.dense(ti.ij, block_size).place(b)
+
+ @ti.kernel
+ def foo():
+ ti.block_dim(512)
+ ti.block_local(a)
+ for i, j in a:
+ for k in range(stencil_length):
+ b[i, j] += a[i + k, j]
+ b[i, j] += a[i, j + k]
+
+ foo()
+
+
+# TODO: BLS on CPU
+# TODO: BLS boundary out of bound
+# TODO: BLS with TLS
diff --git a/tests/_python_orig/test_bls_assume_in_range.py b/tests/_python_orig/test_bls_assume_in_range.py
new file mode 100644
index 000000000..c7dc0cbe3
--- /dev/null
+++ b/tests/_python_orig/test_bls_assume_in_range.py
@@ -0,0 +1,53 @@
+import taichi as ti
+from tests import test_utils
+
+from .bls_test_template import bls_particle_grid
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_scattering():
+ bls_particle_grid(N=128,
+ ppc=10,
+ block_size=8,
+ scatter=True,
+ use_offset=False)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_scattering_offset():
+ bls_particle_grid(N=128,
+ ppc=10,
+ block_size=8,
+ scatter=True,
+ use_offset=True)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_scattering_two_pointer_levels():
+ bls_particle_grid(N=128,
+ ppc=10,
+ block_size=8,
+ scatter=True,
+ pointer_level=2,
+ use_offset=False)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gathering():
+ bls_particle_grid(N=128,
+ ppc=10,
+ block_size=8,
+ scatter=False,
+ use_offset=False)
+
+
+@test_utils.test(require=ti.extension.bls)
+def test_gathering_offset():
+ bls_particle_grid(N=128,
+ ppc=10,
+ block_size=8,
+ scatter=False,
+ use_offset=True)
+
+
+# TODO: debug mode behavior of assume_in_range
diff --git a/tests/_python_orig/test_bool_op.py b/tests/_python_orig/test_bool_op.py
new file mode 100644
index 000000000..d15160e68
--- /dev/null
+++ b/tests/_python_orig/test_bool_op.py
@@ -0,0 +1,68 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(debug=True, short_circuit_operators=True)
+def test_and_shorted():
+ a = ti.field(ti.i32, shape=10)
+
+ @ti.func
+ def explode() -> ti.i32:
+ return a[-1]
+
+ @ti.kernel
+ def func() -> ti.i32:
+ return False and explode()
+
+ assert func() == 0
+
+
+@test_utils.test(debug=True, short_circuit_operators=True)
+def test_and_not_shorted():
+ @ti.kernel
+ def func() -> ti.i32:
+ return True and False
+
+ assert func() == 0
+
+
+@test_utils.test(debug=True, short_circuit_operators=True)
+def test_or_shorted():
+ a = ti.field(ti.i32, shape=10)
+
+ @ti.func
+ def explode() -> ti.i32:
+ return a[-1]
+
+ @ti.kernel
+ def func() -> ti.i32:
+ return True or explode()
+
+ assert func() == 1
+
+
+@test_utils.test(debug=True, short_circuit_operators=True)
+def test_or_not_shorted():
+ @ti.kernel
+ def func() -> ti.i32:
+ return False or True
+
+ assert func() == 1
+
+
+@test_utils.test(debug=True)
+def test_static_or():
+ @ti.kernel
+ def func() -> ti.i32:
+ return ti.static(0 or 3 or 5)
+
+ assert func() == 3
+
+
+@test_utils.test(debug=True)
+def test_static_and():
+ @ti.kernel
+ def func() -> ti.i32:
+ return ti.static(5 and 2 and 0)
+
+ assert func() == 0
diff --git a/tests/_python_orig/test_callable_template_mapper.py b/tests/_python_orig/test_callable_template_mapper.py
new file mode 100644
index 000000000..d52be405d
--- /dev/null
+++ b/tests/_python_orig/test_callable_template_mapper.py
@@ -0,0 +1,54 @@
+from taichi.lang.kernel_impl import TaichiCallableTemplateMapper
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_callable_template_mapper():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.f32)
+
+ ti.root.place(x, y)
+
+ mapper = TaichiCallableTemplateMapper(
+ (ti.template(), ti.template(), ti.template()),
+ template_slot_locations=(0, 1, 2))
+ assert mapper.lookup((0, 0, 0))[0] == 0
+ assert mapper.lookup((0, 1, 0))[0] == 1
+ assert mapper.lookup((0, 0, 0))[0] == 0
+ assert mapper.lookup((0, 0, 1))[0] == 2
+ assert mapper.lookup((0, 1, 0))[0] == 1
+
+ mapper = TaichiCallableTemplateMapper((ti.i32, ti.i32, ti.i32), ())
+ assert mapper.lookup((0, 0, 0))[0] == 0
+ assert mapper.lookup((0, 1, 0))[0] == 0
+ assert mapper.lookup((0, 0, 0))[0] == 0
+ assert mapper.lookup((0, 0, 1))[0] == 0
+ assert mapper.lookup((0, 1, 0))[0] == 0
+
+ mapper = TaichiCallableTemplateMapper((ti.i32, ti.template(), ti.i32),
+ (1, ))
+ assert mapper.lookup((0, x, 0))[0] == 0
+ assert mapper.lookup((0, y, 0))[0] == 1
+ assert mapper.lookup((0, x, 1))[0] == 0
+
+
+@test_utils.test()
+def test_callable_template_mapper_numpy():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.f32)
+
+ ti.root.place(x, y)
+
+ annotations = (ti.template(), ti.template(), ti.ext_arr())
+
+ import numpy as np
+
+ mapper = TaichiCallableTemplateMapper(annotations, (0, 1, 2))
+ assert mapper.lookup((0, 0, np.ones(shape=(1, 2, 3),
+ dtype=np.float32)))[0] == 0
+ assert mapper.lookup((0, 0, np.ones(shape=(1, 2, 4),
+ dtype=np.float32)))[0] == 0
+ assert mapper.lookup((0, 0, np.ones(shape=(1, 2, 1),
+ dtype=np.int32)))[0] == 1
diff --git a/tests/_python_orig/test_cast.py b/tests/_python_orig/test_cast.py
new file mode 100644
index 000000000..0bcca774a
--- /dev/null
+++ b/tests/_python_orig/test_cast.py
@@ -0,0 +1,175 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@pytest.mark.parametrize('dtype', [ti.u8, ti.u16, ti.u32])
+@test_utils.test(exclude=ti.opengl)
+def test_cast_uint_to_float(dtype):
+ @ti.kernel
+ def func(a: dtype) -> ti.f32:
+ return ti.cast(a, ti.f32)
+
+ assert func(255) == 255
+
+
+@pytest.mark.parametrize('dtype', [ti.u8, ti.u16, ti.u32])
+@test_utils.test(exclude=ti.opengl)
+def test_cast_float_to_uint(dtype):
+ @ti.kernel
+ def func(a: ti.f32) -> dtype:
+ return ti.cast(a, dtype)
+
+ assert func(255) == 255
+
+
+@test_utils.test()
+def test_cast_f32():
+ z = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ z[None] = ti.cast(1e9, ti.f32) / ti.cast(1e6, ti.f32) + 1e-3
+
+ func()
+ assert z[None] == 1000
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_cast_f64():
+ z = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ z[None] = ti.cast(1e13, ti.f64) / ti.cast(1e10, ti.f64) + 1e-3
+
+ func()
+ assert z[None] == 1000
+
+
+@pytest.mark.parametrize('dtype', [ti.f32, ti.f64])
+def test_cast_default_fp(dtype):
+ ti.init(default_fp=dtype)
+
+ @ti.kernel
+ def func(x: int, y: int) -> float:
+ return ti.cast(x, float) * float(y)
+
+ assert func(23, 4) == pytest.approx(23.0 * 4.0)
+
+
+@pytest.mark.parametrize('dtype', [ti.i32, ti.i64])
+def test_cast_default_ip(dtype):
+ ti.init(default_ip=dtype)
+
+ @ti.kernel
+ def func(x: float, y: float) -> int:
+ return ti.cast(x, int) * int(y)
+
+ # make sure that int(4.6) == 4:
+ assert func(23.3, 4.6) == 23 * 4
+ if dtype == ti.i64:
+ large = 1000000000
+ assert func(large, 233) == large * 233
+ assert func(233, large) == 233 * large
+
+
+@test_utils.test()
+def test_cast_within_while():
+ ret = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ t = 10
+ while t > 5:
+ t = 1.0
+ break
+ ret[None] = t
+
+ func()
+
+
+@test_utils.test()
+def test_bit_cast():
+ x = ti.field(ti.i32, shape=())
+ y = ti.field(ti.f32, shape=())
+ z = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func1():
+ y[None] = ti.bit_cast(x[None], ti.f32)
+
+ @ti.kernel
+ def func2():
+ z[None] = ti.bit_cast(y[None], ti.i32)
+
+ x[None] = 2333
+ func1()
+ func2()
+ assert z[None] == 2333
+
+
+@test_utils.test(arch=ti.cpu)
+def test_int_extension():
+ x = ti.field(dtype=ti.i32, shape=2)
+ y = ti.field(dtype=ti.u32, shape=2)
+
+ a = ti.field(dtype=ti.i8, shape=1)
+ b = ti.field(dtype=ti.u8, shape=1)
+
+ @ti.kernel
+ def run_cast_i32():
+ x[0] = ti.cast(a[0], ti.i32)
+ x[1] = ti.cast(b[0], ti.i32)
+
+ @ti.kernel
+ def run_cast_u32():
+ y[0] = ti.cast(a[0], ti.u32)
+ y[1] = ti.cast(b[0], ti.u32)
+
+ a[0] = -128
+ b[0] = -128
+
+ run_cast_i32()
+ assert x[0] == -128
+ assert x[1] == 128
+
+ run_cast_u32()
+ assert y[0] == 0xFFFFFF80
+ assert y[1] == 128
+
+
+@test_utils.test(arch=ti.cpu)
+def test_custom_int_extension():
+ x = ti.field(dtype=ti.i32, shape=2)
+ y = ti.field(dtype=ti.u32, shape=2)
+
+ ci5 = ti.types.quantized_types.quant.int(5, True, ti.i16)
+ cu7 = ti.types.quantized_types.quant.int(7, False, ti.u16)
+
+ a = ti.field(dtype=ci5)
+ b = ti.field(dtype=cu7)
+
+ ti.root.bit_struct(num_bits=32).place(a, b)
+
+ @ti.kernel
+ def run_cast_int():
+ x[0] = ti.cast(a[None], ti.i32)
+ x[1] = ti.cast(b[None], ti.i32)
+
+ @ti.kernel
+ def run_cast_uint():
+ y[0] = ti.cast(a[None], ti.u32)
+ y[1] = ti.cast(b[None], ti.u32)
+
+ a[None] = -16
+ b[None] = -64
+
+ run_cast_int()
+ assert x[0] == -16
+ assert x[1] == 64
+
+ run_cast_uint()
+ assert y[0] == 0xFFFFFFF0
+ assert y[1] == 64
diff --git a/tests/_python_orig/test_classfunc.py b/tests/_python_orig/test_classfunc.py
new file mode 100644
index 000000000..95c4b2c6f
--- /dev/null
+++ b/tests/_python_orig/test_classfunc.py
@@ -0,0 +1,23 @@
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_classfunc():
+ @ti.data_oriented
+ class Foo:
+ def __init__(self):
+ self.val = ti.Matrix.field(n=3, m=3, dtype=ti.f32, shape=3)
+
+ @ti.func
+ def add_mat(self, a, b):
+ return a + b
+
+ @ti.kernel
+ def fill(self):
+ self.val[0] = self.add_mat(self.val[1], self.val[2])
+
+ foo = Foo()
+ foo.fill()
diff --git a/tests/_python_orig/test_clear_all_gradients.py b/tests/_python_orig/test_clear_all_gradients.py
new file mode 100644
index 000000000..f93e78049
--- /dev/null
+++ b/tests/_python_orig/test_clear_all_gradients.py
@@ -0,0 +1,40 @@
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.vulkan])
+def test_clear_all_gradients():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+ z = ti.field(ti.f32)
+ w = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.place(x)
+ ti.root.dense(ti.i, n).place(y)
+ ti.root.dense(ti.i, n).dense(ti.j, n).place(z, w)
+ ti.root.lazy_grad()
+
+ x.grad[None] = 3
+ for i in range(n):
+ y.grad[i] = 3
+ for j in range(n):
+ z.grad[i, j] = 5
+ w.grad[i, j] = 6
+
+ ti.clear_all_gradients()
+ assert impl.get_runtime().get_num_compiled_functions() == 3
+
+ assert x.grad[None] == 0
+ for i in range(n):
+ assert y.grad[i] == 0
+ for j in range(n):
+ assert z.grad[i, j] == 0
+ assert w.grad[i, j] == 0
+
+ ti.clear_all_gradients()
+ # No more kernel compilation
+ assert impl.get_runtime().get_num_compiled_functions() == 3
diff --git a/tests/_python_orig/test_cli.py b/tests/_python_orig/test_cli.py
new file mode 100644
index 000000000..e7ed85fba
--- /dev/null
+++ b/tests/_python_orig/test_cli.py
@@ -0,0 +1,221 @@
+import argparse
+import copy
+import sys
+from contextlib import contextmanager
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+from taichi.main import TaichiMain
+
+import taichi as ti
+
+
+@contextmanager
+def patch_sys_argv_helper(custom_argv: list):
+ """Temporarily patch sys.argv for testing."""
+ try:
+ cached_argv = copy.deepcopy(sys.argv)
+ sys.argv = custom_argv
+ yield sys.argv
+ finally:
+ sys.argv = cached_argv
+
+
+def test_cli_exit_one_with_no_command_provided():
+ with patch_sys_argv_helper(["ti"]):
+ cli = TaichiMain(test_mode=True)
+ assert cli() == 1
+
+
+def test_cli_exit_one_with_bogus_command_provided():
+ with patch_sys_argv_helper(["ti", "bogus-command-not-registered-yet"]):
+ cli = TaichiMain(test_mode=True)
+ assert cli() == 1
+
+
+def test_cli_can_dispatch_commands_to_methods_correctly():
+ with patch_sys_argv_helper(
+ ["ti", "example", "bogus_example_name_for_test"]):
+ with patch.object(TaichiMain, 'example',
+ return_value=None) as mock_method:
+ cli = TaichiMain(test_mode=False)
+ cli()
+ mock_method.assert_called_once_with(
+ ["bogus_example_name_for_test"])
+
+
+def test_cli_example():
+ with patch_sys_argv_helper(["ti", "example", "minimal"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.name == "minimal"
+
+ with patch_sys_argv_helper(["ti", "example", "minimal.py"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.name == "minimal"
+
+ with patch_sys_argv_helper(["ti", "example", "-s",
+ "minimal.py"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.name == "minimal" and args.save == True
+
+ with patch_sys_argv_helper(["ti", "example", "-p",
+ "minimal.py"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.name == "minimal" and args.print == True
+
+ with patch_sys_argv_helper(["ti", "example", "-P",
+ "minimal.py"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.name == "minimal" and args.pretty_print == True
+
+
+def test_cli_gif():
+ with patch_sys_argv_helper(["ti", "gif", "-i", "video.mp4", "-f",
+ "30"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.input_file == "video.mp4"
+ assert args.framerate == 30
+ assert args.output_file == "video.gif"
+
+ with patch_sys_argv_helper(["ti", "gif", "-i", "video.mp3", "-f",
+ "30"]) as custom_argv:
+ with pytest.raises(SystemExit) as pytest_wrapped_err:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert pytest_wrapped_err.__context__.type == argparse.ArgumentTypeError
+
+
+def test_cli_video_speed():
+ with patch_sys_argv_helper(
+ ["ti", "video_speed", "-i", "video.mp4", "-s", "2.0"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.input_file == "video.mp4"
+ assert args.speed == 2.0
+ assert args.output_file == "video-sped.mp4"
+
+ with patch_sys_argv_helper(
+ ["ti", "video_speed", "-i", "video.mp3", "-s", "2.0"]) as custom_argv:
+ with pytest.raises(SystemExit) as pytest_wrapped_err:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert pytest_wrapped_err.__context__.type == argparse.ArgumentTypeError
+
+
+def test_cli_video_crop():
+ with patch_sys_argv_helper([
+ "ti", "video_crop", "-i", "video.mp4", "--x1", "10.0", "--x2",
+ "20.0", "--y1", "10.0", "--y2", "20.0"
+ ]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.input_file == "video.mp4"
+ assert args.x_begin == 10.0
+ assert args.x_end == 20.0
+ assert args.y_begin == 10.0
+ assert args.y_end == 20.0
+ assert args.output_file == "video-cropped.mp4"
+
+ with patch_sys_argv_helper([
+ "ti", "video_crop", "-i", "video.mp3", "--x1", "10.0", "--x2",
+ "20.0", "--y1", "10.0", "--y2", "20.0"
+ ]) as custom_argv:
+ with pytest.raises(SystemExit) as pytest_wrapped_err:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert pytest_wrapped_err.__context__.type == argparse.ArgumentTypeError
+
+
+def test_cli_video_scale():
+ with patch_sys_argv_helper(
+ ["ti", "video_scale", "-i", "video.mp4", "-w", "1.2"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.input_file == "video.mp4"
+ assert args.ratio_width == 1.2
+ assert args.ratio_height == 1.2
+ assert args.output_file == "video-scaled.mp4"
+
+ with patch_sys_argv_helper([
+ "ti", "video_scale", "-i", "video.mp4", "-w", "1.2",
+ "--ratio-height", "1.5"
+ ]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.input_file == "video.mp4"
+ assert args.ratio_width == 1.2
+ assert args.ratio_height == 1.5
+ assert args.output_file == "video-scaled.mp4"
+
+ with patch_sys_argv_helper([
+ "ti", "video_scale", "-i", "video.mp3", "-w", "1.2",
+ "--ratio-height", "1.5"
+ ]) as custom_argv:
+ with pytest.raises(SystemExit) as pytest_wrapped_err:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert pytest_wrapped_err.__context__.type == argparse.ArgumentTypeError
+
+
+def test_cli_video():
+ with patch_sys_argv_helper(
+ ["ti", "video", "image.gif", "-o", "video.mp4", "-f",
+ "30"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.inputs == ["image.gif"]
+ assert args.framerate == 30
+ assert isinstance(args.output_file, Path)
+ assert args.output_file.name == "video.mp4"
+
+ with patch_sys_argv_helper(["ti", "video", "-o", "video.mp4", "-f",
+ "30"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert isinstance(args.inputs, list)
+ assert args.framerate == 30
+ assert isinstance(args.output_file, Path)
+ assert args.output_file.name == "video.mp4"
+
+
+def test_cli_regression():
+ with patch_sys_argv_helper(["ti", "regression", "a.py", "b.py",
+ "-g"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.files == ["a.py", "b.py"]
+ assert args.gui == True
+
+
+def test_cli_benchmark():
+ with patch_sys_argv_helper(
+ ["ti", "benchmark", "a.py", "b.py", "-T", "-v", "-r2",
+ "-t4"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.files == ["a.py", "b.py"]
+ assert args.tprt == True
+ assert args.verbose == True
+ assert args.rerun == "2"
+ assert args.threads == "4"
+
+
+def test_cli_debug():
+ with patch_sys_argv_helper(["ti", "debug", "a.py"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.filename == "a.py"
+
+
+def test_cli_run():
+ with patch_sys_argv_helper(["ti", "run", "a.py"]) as custom_argv:
+ cli = TaichiMain(test_mode=True)
+ args = cli()
+ assert args.filename == "a.py"
diff --git a/tests/_python_orig/test_compare.py b/tests/_python_orig/test_compare.py
new file mode 100644
index 000000000..30327f78c
--- /dev/null
+++ b/tests/_python_orig/test_compare.py
@@ -0,0 +1,172 @@
+import pytest
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_compare_basics():
+ a = ti.field(ti.i32)
+ ti.root.dynamic(ti.i, 256).place(a)
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ b[None] = 3
+ c[None] = 5
+ a[0] = b[None] < c[None]
+ a[1] = b[None] <= c[None]
+ a[2] = b[None] > c[None]
+ a[3] = b[None] >= c[None]
+ a[4] = b[None] == c[None]
+ a[5] = b[None] != c[None]
+ a[6] = c[None] < b[None]
+ a[7] = c[None] <= b[None]
+ a[8] = c[None] > b[None]
+ a[9] = c[None] >= b[None]
+ a[10] = c[None] == b[None]
+ a[11] = c[None] != b[None]
+
+ func()
+ assert a[0]
+ assert a[1]
+ assert not a[2]
+ assert not a[3]
+ assert not a[4]
+ assert a[5]
+ assert not a[6]
+ assert not a[7]
+ assert a[8]
+ assert a[9]
+ assert not a[10]
+ assert a[11]
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_compare_equality():
+ a = ti.field(ti.i32)
+ ti.root.dynamic(ti.i, 256).place(a)
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ b[None] = 3
+ c[None] = 3
+ a[0] = b[None] < c[None]
+ a[1] = b[None] <= c[None]
+ a[2] = b[None] > c[None]
+ a[3] = b[None] >= c[None]
+ a[4] = b[None] == c[None]
+ a[5] = b[None] != c[None]
+ a[6] = c[None] < b[None]
+ a[7] = c[None] <= b[None]
+ a[8] = c[None] > b[None]
+ a[9] = c[None] >= b[None]
+ a[10] = c[None] == b[None]
+ a[11] = c[None] != b[None]
+
+ func()
+ assert not a[0]
+ assert a[1]
+ assert not a[2]
+ assert a[3]
+ assert a[4]
+ assert not a[5]
+ assert not a[6]
+ assert a[7]
+ assert not a[8]
+ assert a[9]
+ assert a[10]
+ assert not a[11]
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_no_duplicate_eval():
+ a = ti.field(ti.i32)
+ ti.root.dynamic(ti.i, 256).place(a)
+
+ @ti.kernel
+ def func():
+ a[2] = 0 <= ti.append(a.parent(), [], 10) < 1
+
+ func()
+ assert a[0] == 10
+ assert a[1] == 0 # not appended twice
+ assert a[2] # ti.append returns 0
+
+
+@test_utils.test()
+def test_no_duplicate_eval_func():
+ a = ti.field(ti.i32, ())
+ b = ti.field(ti.i32, ())
+
+ @ti.func
+ def why_this_foo_fail(n):
+ return ti.atomic_add(b[None], n)
+
+ def foo(n):
+ return ti.atomic_add(impl.subscript(b, None), n)
+
+ @ti.kernel
+ def func():
+ a[None] = 0 <= foo(2) < 1
+
+ func()
+ assert a[None] == 1
+ assert b[None] == 2
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_chain_compare():
+ a = ti.field(ti.i32)
+ ti.root.dynamic(ti.i, 256).place(a)
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+ d = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ b[None] = 2
+ c[None] = 3
+ d[None] = 3
+ a[0] = c[None] == d[None] != b[None] < d[None] > b[None] >= b[
+ None] <= c[None]
+ a[1] = b[None] <= c[None] != d[None] > b[None] == b[None]
+
+ func()
+ assert a[0]
+ assert not a[1]
+
+
+@test_utils.test()
+def test_static_in():
+ @ti.kernel
+ def foo(a: ti.template()) -> ti.i32:
+ b = 0
+ if ti.static(a in [ti.i32, ti.u32]):
+ b = 1
+ elif ti.static(a not in [ti.f32, ti.f64]):
+ b = 2
+ return b
+
+ assert foo(ti.u32) == 1
+ assert foo(ti.i64) == 2
+ assert foo(ti.f32) == 0
+
+
+@test_utils.test()
+def test_non_static_in():
+ with pytest.raises(ti.TaichiCompilationError,
+ match='"In" is only supported inside `ti.static`.'):
+
+ @ti.kernel
+ def foo(a: ti.template()) -> ti.i32:
+ b = 0
+ if a in [ti.i32, ti.u32]:
+ b = 1
+ return b
+
+ foo(ti.i32)
diff --git a/tests/_python_orig/test_complex_struct.py b/tests/_python_orig/test_complex_struct.py
new file mode 100644
index 000000000..cc698e8eb
--- /dev/null
+++ b/tests/_python_orig/test_complex_struct.py
@@ -0,0 +1,174 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_complex_dense():
+ a = ti.field(ti.i32, shape=(4, 4))
+ b = ti.field(ti.i32, shape=(16, 16))
+ c = ti.field(ti.i32, shape=(16, 4))
+ d = ti.field(ti.i32, shape=(4, 4, 4))
+
+ w = ti.field(ti.i32)
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ z = ti.field(ti.i32)
+
+ blk = ti.root.dense(ti.ij, 4)
+ blk.place(w)
+ blk.dense(ti.ij, 2).dense(ti.ij, 2).place(x)
+ blk.dense(ti.i, 4).place(y)
+ blk.dense(ti.k, 4).place(z)
+
+ @ti.kernel
+ def set_w():
+ for I in ti.grouped(ti.ndrange(4, 4)):
+ w[I] = 1
+
+ @ti.kernel
+ def set_x():
+ for I in ti.grouped(ti.ndrange(16, 16)):
+ x[I] = 2
+
+ @ti.kernel
+ def set_y():
+ for I in ti.grouped(ti.ndrange(16, 4)):
+ y[I] = 3
+
+ @ti.kernel
+ def set_z():
+ for I in ti.grouped(ti.ndrange(4, 4, 4)):
+ z[I] = 4
+
+ @ti.kernel
+ def set_a():
+ for I in ti.grouped(w):
+ a[I] = w[I]
+
+ @ti.kernel
+ def set_b():
+ for I in ti.grouped(x):
+ b[I] = x[I]
+
+ @ti.kernel
+ def set_c():
+ for I in ti.grouped(y):
+ c[I] = y[I]
+
+ @ti.kernel
+ def set_d():
+ for I in ti.grouped(z):
+ d[I] = z[I]
+
+ set_w()
+ set_x()
+ set_y()
+ set_z()
+
+ set_a()
+ set_b()
+ set_c()
+ set_d()
+
+ for i in range(4):
+ for j in range(4):
+ assert a[i, j] == 1
+
+ for i in range(16):
+ for j in range(16):
+ assert b[i, j] == 2
+
+ for i in range(16):
+ for j in range(4):
+ assert c[i, j] == 3
+
+ for i in range(4):
+ for j in range(4):
+ for k in range(4):
+ assert d[i, j, k] == 4
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_complex_pointer():
+ a = ti.field(ti.i32, shape=(4, 4))
+ b = ti.field(ti.i32, shape=(16, 16))
+ c = ti.field(ti.i32, shape=(16, 4))
+ d = ti.field(ti.i32, shape=(4, 4, 4))
+
+ w = ti.field(ti.i32)
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ z = ti.field(ti.i32)
+
+ blk = ti.root.pointer(ti.ij, 4)
+ blk.place(w)
+ blk.pointer(ti.ij, 2).dense(ti.ij, 2).place(x)
+ blk.dense(ti.i, 4).place(y)
+ blk.dense(ti.k, 4).place(z)
+
+ @ti.kernel
+ def set_w():
+ for I in ti.grouped(ti.ndrange(4, 4)):
+ w[I] = 1
+
+ @ti.kernel
+ def set_x():
+ for I in ti.grouped(ti.ndrange(16, 16)):
+ x[I] = 2
+
+ @ti.kernel
+ def set_y():
+ for I in ti.grouped(ti.ndrange(16, 4)):
+ y[I] = 3
+
+ @ti.kernel
+ def set_z():
+ for I in ti.grouped(ti.ndrange(4, 4, 4)):
+ z[I] = 4
+
+ @ti.kernel
+ def set_a():
+ for I in ti.grouped(w):
+ a[I] = w[I]
+
+ @ti.kernel
+ def set_b():
+ for I in ti.grouped(x):
+ b[I] = x[I]
+
+ @ti.kernel
+ def set_c():
+ for I in ti.grouped(y):
+ c[I] = y[I]
+
+ @ti.kernel
+ def set_d():
+ for I in ti.grouped(z):
+ d[I] = z[I]
+
+ set_w()
+ set_x()
+ set_y()
+ set_z()
+
+ set_a()
+ set_b()
+ set_c()
+ set_d()
+
+ for i in range(4):
+ for j in range(4):
+ assert a[i, j] == 1
+
+ for i in range(16):
+ for j in range(16):
+ assert b[i, j] == 2
+
+ for i in range(16):
+ for j in range(4):
+ assert c[i, j] == 3
+
+ for i in range(4):
+ for j in range(4):
+ for k in range(4):
+ assert d[i, j, k] == 4
diff --git a/tests/_python_orig/test_constant_fold.py b/tests/_python_orig/test_constant_fold.py
new file mode 100644
index 000000000..98aca30e9
--- /dev/null
+++ b/tests/_python_orig/test_constant_fold.py
@@ -0,0 +1,19 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_constant_fold():
+ n = 100
+
+ @ti.kernel
+ def series() -> int:
+ s = 0
+ for i in ti.static(range(n)):
+ a = i + 1
+ s += a * a
+ return s
+
+ # \sum_{i=1}^n (i^2) = n * (n + 1) * (2n + 1) / 6
+ expected = n * (n + 1) * (2 * n + 1) // 6
+ assert series() == expected
diff --git a/tests/_python_orig/test_continue.py b/tests/_python_orig/test_continue.py
new file mode 100644
index 000000000..cdb13d54c
--- /dev/null
+++ b/tests/_python_orig/test_continue.py
@@ -0,0 +1,149 @@
+import taichi as ti
+from tests import test_utils
+
+n = 1000
+
+
+@test_utils.test()
+def test_for_continue():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run():
+ # Launch just one thread
+ for _ in range(1):
+ for j in range(n):
+ if j % 2 == 0:
+ continue
+ x[j] = j
+
+ run()
+ xs = x.to_numpy()
+ for i in range(n):
+ expect = 0 if i % 2 == 0 else i
+ assert xs[i] == expect
+
+
+@test_utils.test()
+def test_while_continue():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run():
+ # Launch just one thread
+ for _ in range(1):
+ j = 0
+ while j < n:
+ oj = j
+ j += 1
+ if oj % 2 == 0:
+ continue
+ x[oj] = oj
+
+ run()
+ xs = x.to_numpy()
+ for i in range(n):
+ expect = 0 if i % 2 == 0 else i
+ assert xs[i] == expect
+
+
+@test_utils.test()
+def test_kernel_continue():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run():
+ for i in range(n):
+ if i % 2 == 0:
+ # At kernel level, this is the same as return
+ continue
+ x[i] = i
+
+ run()
+ xs = x.to_numpy()
+ for i in range(n):
+ expect = 0 if i % 2 == 0 else i
+ assert xs[i] == expect
+
+
+@test_utils.test()
+def test_unconditional_continue():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run():
+ # Launch just one thread
+ for _ in range(1):
+ for j in range(n):
+ continue
+ # pylint: disable=unreachable
+ x[j] = j
+
+ run()
+ xs = x.to_numpy()
+ for i in range(n):
+ assert xs[i] == 0
+
+
+@test_utils.test()
+def test_kernel_continue_in_nested_if():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run(a: ti.i32):
+ for i in range(1):
+ if a:
+ if a:
+ continue
+ if a:
+ if a:
+ continue
+ x[i] = i
+
+ x[0] = 1
+ run(1)
+ assert x[0] == 1
+ run(0)
+ assert x[0] == 0
+
+
+@test_utils.test()
+def test_kernel_continue_in_nested_if_2():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run(a: ti.i32):
+ for i in range(1):
+ if a:
+ if a:
+ continue
+ if a:
+ continue
+ x[i] = i
+
+ x[0] = 1
+ run(1)
+ assert x[0] == 1
+ run(0)
+ assert x[0] == 0
+
+
+@test_utils.test()
+def test_kernel_continue_in_nested_if_3():
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def run(a: ti.i32):
+ for i in range(1):
+ if a:
+ continue
+ if a:
+ if a:
+ continue
+ x[i] = i
+
+ x[0] = 1
+ run(1)
+ assert x[0] == 1
+ run(0)
+ assert x[0] == 0
diff --git a/tests/_python_orig/test_copy_from.py b/tests/_python_orig/test_copy_from.py
new file mode 100644
index 000000000..f546477e1
--- /dev/null
+++ b/tests/_python_orig/test_copy_from.py
@@ -0,0 +1,25 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_scalar():
+ n = 16
+
+ x = ti.field(ti.i32, shape=n)
+ y = ti.field(ti.i32, shape=n)
+
+ x[1] = 2
+
+ y[0] = 1
+ y[2] = 3
+
+ x.copy_from(y)
+
+ assert x[0] == 1
+ assert x[1] == 0
+ assert x[2] == 3
+
+ assert y[0] == 1
+ assert y[1] == 0
+ assert y[2] == 3
diff --git a/tests/_python_orig/test_cuda_internals.py b/tests/_python_orig/test_cuda_internals.py
new file mode 100644
index 000000000..1c2e3e3ae
--- /dev/null
+++ b/tests/_python_orig/test_cuda_internals.py
@@ -0,0 +1,37 @@
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+# TODO: these are not really tests...
+
+
+@test_utils.test(arch=ti.cuda)
+def test_do_nothing():
+ @ti.kernel
+ def test():
+ for i in range(10):
+ impl.call_internal("do_nothing")
+
+ test()
+
+
+@test_utils.test(arch=ti.cuda)
+def test_active_mask():
+ @ti.kernel
+ def test():
+ for i in range(48):
+ if i % 2 == 0:
+ impl.call_internal("test_active_mask")
+
+ test()
+
+
+@test_utils.test(arch=ti.cuda)
+def test_shfl_down():
+ @ti.kernel
+ def test():
+ for i in range(32):
+ impl.call_internal("test_shfl")
+
+ test()
diff --git a/tests/_python_orig/test_custom_float.py b/tests/_python_orig/test_custom_float.py
new file mode 100644
index 000000000..aae233620
--- /dev/null
+++ b/tests/_python_orig/test_custom_float.py
@@ -0,0 +1,92 @@
+import math
+
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_custom_float():
+ cft = ti.types.quantized_types.quant.fixed(frac=32, num_range=2)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ @ti.kernel
+ def foo():
+ x[None] = 0.7
+ print(x[None])
+ x[None] = x[None] + 0.4
+
+ foo()
+ assert x[None] == approx(1.1)
+ x[None] = 0.64
+ assert x[None] == approx(0.64)
+ x[None] = 0.66
+ assert x[None] == approx(0.66)
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_custom_matrix_rotation():
+ cft = ti.types.quantized_types.quant.fixed(frac=16, num_range=1.2)
+
+ x = ti.Matrix.field(2, 2, dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(0, 0),
+ x.get_scalar_field(0, 1))
+ ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(1, 0),
+ x.get_scalar_field(1, 1))
+
+ x[None] = [[1.0, 0.0], [0.0, 1.0]]
+
+ @ti.kernel
+ def rotate_18_degrees():
+ angle = math.pi / 10
+ x[None] = x[None] @ ti.Matrix(
+ [[ti.cos(angle), ti.sin(angle)], [-ti.sin(angle),
+ ti.cos(angle)]])
+
+ for i in range(5):
+ rotate_18_degrees()
+ assert x[None][0, 0] == approx(0, abs=1e-4)
+ assert x[None][0, 1] == approx(1, abs=1e-4)
+ assert x[None][1, 0] == approx(-1, abs=1e-4)
+ assert x[None][1, 1] == approx(0, abs=1e-4)
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_custom_float_implicit_cast():
+ ci13 = ti.types.quantized_types.quant.int(bits=13)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=ci13, scale=0.1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ @ti.kernel
+ def foo():
+ x[None] = 10
+
+ foo()
+ assert x[None] == approx(10.0)
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_cache_read_only():
+ ci15 = ti.types.quantized_types.quant.int(bits=15)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=ci15, scale=0.1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ @ti.kernel
+ def test(data: ti.f32):
+ ti.cache_read_only(x)
+ assert x[None] == data
+
+ x[None] = 0.7
+ test(0.7)
+ x[None] = 1.2
+ test(1.2)
diff --git a/tests/_python_orig/test_custom_float_exponents.py b/tests/_python_orig/test_custom_float_exponents.py
new file mode 100644
index 000000000..6ede9f03a
--- /dev/null
+++ b/tests/_python_orig/test_custom_float_exponents.py
@@ -0,0 +1,135 @@
+import numpy as np
+import pytest
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.quant)
+def test_custom_float_unsigned():
+ cu13 = ti.types.quantized_types.quant.int(13, False)
+ exp = ti.types.quantized_types.quant.int(6, False)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cu13, exponent_type=exp, scale=1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ tests = [
+ 0, 1 / 1024, 1.75 / 1024, 0.25, 0.5, 0.75, 1, 2, 3, 4, 5, 6, 7, 128,
+ 256, 512, 1024
+ ]
+
+ assert x[None] == 0
+
+ for v in tests:
+ x[None] = v
+ assert x[None] == v
+
+
+@test_utils.test(require=ti.extension.quant)
+def test_custom_float_signed():
+ cu13 = ti.types.quantized_types.quant.int(13, True)
+ exp = ti.types.quantized_types.quant.int(6, False)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cu13, exponent_type=exp, scale=1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ tests = [0, 0.125, 0.5, 2, 4, 6, 7, 8, 9]
+
+ assert x[None] == 0
+
+ for v in tests:
+ x[None] = v
+ assert x[None] == v
+
+ x[None] = -v
+ assert x[None] == -v
+
+ ftz_tests = [1e-30, 1e-20, 1e-10, 1e-2]
+ for v in ftz_tests:
+ x[None] = v
+ assert x[None] == approx(v, abs=1e-5)
+
+ x[None] = -v
+ assert x[None] == approx(-v, abs=1e-5)
+
+
+@pytest.mark.parametrize('digits_bits', [23, 24])
+@test_utils.test(require=ti.extension.quant)
+def test_custom_float_precision(digits_bits):
+ cu24 = ti.types.quantized_types.quant.int(digits_bits, True)
+ exp = ti.types.quantized_types.quant.int(8, False)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cu24, exponent_type=exp, scale=1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ tests = [np.float32(np.pi), np.float32(np.pi * (1 << 100))]
+
+ for v in tests:
+ x[None] = v
+ if digits_bits == 24:
+ # Sufficient digits
+ assert x[None] == v
+ else:
+ # The binary representation of np.float32(np.pi) ends with 1, so removing one digit will result in a different number.
+ assert x[None] != v
+ assert x[None] == pytest.approx(v, rel=3e-7)
+
+
+@pytest.mark.parametrize('signed', [True, False])
+@test_utils.test(require=ti.extension.quant)
+def test_custom_float_truncation(signed):
+ cit = ti.types.quantized_types.quant.int(2, signed)
+ exp = ti.types.quantized_types.quant.int(5, False)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit, exponent_type=exp, scale=1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ # Sufficient digits
+ for v in [1, 1.5]:
+ x[None] = v
+ assert x[None] == v
+
+ x[None] = 1.75
+ if signed:
+ # Insufficient digits
+ assert x[None] == 2
+ else:
+ # Sufficient digits
+ assert x[None] == 1.75
+
+ # Insufficient digits
+ x[None] = 1.625
+ if signed:
+ assert x[None] == 1.5
+ else:
+ assert x[None] == 1.75
+
+
+@test_utils.test(require=ti.extension.quant)
+def test_custom_float_atomic_demotion():
+ cit = ti.types.quantized_types.quant.int(2, True)
+ exp = ti.types.quantized_types.quant.int(5, False)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit, exponent_type=exp, scale=1)
+ x = ti.field(dtype=cft)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ @ti.kernel
+ def foo():
+ for i in range(1):
+ x[None] += 1
+
+ foo()
+ foo()
+
+ assert x[None] == 2
diff --git a/tests/_python_orig/test_custom_float_shared_exp.py b/tests/_python_orig/test_custom_float_shared_exp.py
new file mode 100644
index 000000000..02e9da00b
--- /dev/null
+++ b/tests/_python_orig/test_custom_float_shared_exp.py
@@ -0,0 +1,167 @@
+import pytest
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+@pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8])
+@test_utils.test(require=ti.extension.quant)
+def test_shared_exponents(exponent_bits):
+ exp = ti.types.quantized_types.quant.int(exponent_bits, False)
+ cit1 = ti.types.quantized_types.quant.int(10, False)
+ cit2 = ti.types.quantized_types.quant.int(14, False)
+ cft1 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit1, exponent_type=exp, scale=1)
+ cft2 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit2, exponent_type=exp, scale=1)
+ a = ti.field(dtype=cft1)
+ b = ti.field(dtype=cft2)
+ ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True)
+
+ assert a[None] == 0.0
+ assert b[None] == 0.0
+
+ a[None] = 10
+ assert a[None] == 10.0
+ assert b[None] == 0.0
+
+ a[None] = 0
+ assert a[None] == 0.0
+ assert b[None] == 0.0
+
+ @ti.kernel
+ def foo(x: ti.f32, y: ti.f32):
+ a[None] = x
+ b[None] = y
+
+ foo(3.2, 0.25)
+
+ assert a[None] == approx(3.2, rel=1e-3)
+ assert b[None] == approx(0.25, rel=2e-2)
+ a[None] = 0.27
+ assert a[None] == approx(0.27, rel=1e-2)
+ assert b[None] == approx(0.25, rel=2e-2)
+ a[None] = 100
+ assert a[None] == approx(100, rel=1e-3)
+ assert b[None] == approx(0.25, rel=1e-2)
+
+ b[None] = 0
+ assert a[None] == approx(100, rel=1e-3)
+ assert b[None] == 0
+
+ foo(0, 0)
+ assert a[None] == 0.0
+ assert b[None] == 0.0
+
+ # test flush to zero
+ foo(1000, 1e-6)
+ assert a[None] == 1000.0
+ assert b[None] == 0.0
+
+ foo(1000, 1000)
+ assert a[None] == 1000.0
+ assert b[None] == 1000.0
+
+ foo(1e-30, 1e-30)
+ if exponent_bits == 8:
+ assert a[None] == approx(1e-30, 1e-3)
+ assert b[None] == approx(1e-30, 1e-4)
+ else:
+ # Insufficient exponent bits: should flush to zero
+ assert a[None] == 0
+ assert b[None] == 0
+
+
+@pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8])
+@test_utils.test(require=ti.extension.quant)
+def test_shared_exponent_add(exponent_bits):
+ exp = ti.types.quantized_types.quant.int(exponent_bits, False)
+ cit1 = ti.types.quantized_types.quant.int(10, False)
+ cit2 = ti.types.quantized_types.quant.int(14, False)
+ cft1 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit1, exponent_type=exp, scale=1)
+ cft2 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit2, exponent_type=exp, scale=1)
+ a = ti.field(dtype=cft1)
+ b = ti.field(dtype=cft2)
+ ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True)
+
+ @ti.kernel
+ def foo(x: ti.f32, y: ti.f32):
+ a[None] = x
+ b[None] = y
+
+ a[None] = 4
+ assert a[None] == 4
+ assert b[None] == 0
+ b[None] = 3
+ assert a[None] == 4
+ assert b[None] == 3
+
+ b[None] += 1
+
+ assert a[None] == 4
+ assert b[None] == 4
+
+ for i in range(100):
+ a[None] += 4
+ b[None] += 1
+ assert a[None] == 4 + (i + 1) * 4
+ assert b[None] == 4 + (i + 1)
+
+
+@pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8])
+@test_utils.test(require=ti.extension.quant)
+def test_shared_exponent_borrow(exponent_bits):
+ exp = ti.types.quantized_types.quant.int(exponent_bits, False)
+ cit1 = ti.types.quantized_types.quant.int(10, False)
+ cit2 = ti.types.quantized_types.quant.int(14, False)
+ cft1 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit1, exponent_type=exp, scale=1)
+ cft2 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit2, exponent_type=exp, scale=1)
+ a = ti.field(dtype=cft1)
+ b = ti.field(dtype=cft2)
+ ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True)
+
+ @ti.kernel
+ def foo(x: ti.f32, y: ti.f32):
+ a[None] = x
+ b[None] = y
+
+ def inc():
+ a[None] += 1
+ b[None] -= 1
+
+ foo(0, 100)
+
+ for i in range(100):
+ assert a[None] == i
+ assert b[None] == 100 - i
+ inc()
+
+
+@pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8])
+@test_utils.test(require=ti.extension.quant)
+def test_negative(exponent_bits):
+ exp = ti.types.quantized_types.quant.int(exponent_bits, False)
+ cit1 = ti.types.quantized_types.quant.int(10, False)
+ cit2 = ti.types.quantized_types.quant.int(14, True)
+ cft1 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit1, exponent_type=exp, scale=1)
+ cft2 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit2, exponent_type=exp, scale=1)
+ a = ti.field(dtype=cft1)
+ b = ti.field(dtype=cft2)
+ ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True)
+
+ a[None] = 37
+ assert a[None] == 37
+ b[None] = -123
+ assert b[None] == -123
+
+
+# TODO: test precision
+# TODO: make sure unsigned has one more effective significand bit
+# TODO: test shared exponent floats with custom int in a single bit struct
diff --git a/tests/_python_orig/test_custom_float_time_integration.py b/tests/_python_orig/test_custom_float_time_integration.py
new file mode 100644
index 000000000..00906efec
--- /dev/null
+++ b/tests/_python_orig/test_custom_float_time_integration.py
@@ -0,0 +1,63 @@
+import math
+
+import pytest
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+@pytest.mark.parametrize('use_cft,use_exponent,use_shared_exp',
+ [(False, False, False), (True, False, False),
+ (True, True, False), (True, True, True)])
+@test_utils.test(require=ti.extension.quant)
+def test_custom_float_time_integration(use_cft, use_exponent, use_shared_exp):
+ if use_cft:
+ if use_exponent:
+ exp = ti.types.quantized_types.quant.int(6, False)
+ cit = ti.types.quantized_types.quant.int(13, True)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit, exponent_type=exp, scale=1)
+ x = ti.Vector.field(2, dtype=cft)
+ if use_shared_exp:
+ ti.root.bit_struct(num_bits=32).place(x, shared_exponent=True)
+ else:
+ ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(0))
+ ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(1))
+ else:
+ cit = ti.types.quantized_types.quant.int(16, True)
+ cft = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit, scale=1 / 2**14)
+ x = ti.Vector.field(2, dtype=cft)
+ ti.root.bit_struct(num_bits=32).place(x)
+ else:
+ x = ti.Vector.field(2, dtype=ti.f32, shape=())
+
+ @ti.func
+ def v_at(p):
+ return ti.Vector([-p[1], p[0]])
+
+ @ti.kernel
+ def advance(dt: ti.f32):
+ v_mid = v_at(x[None] + 0.5 * dt * v_at(x[None]))
+ x[None] = x[None] + v_mid * dt
+
+ x[None] = [1, 0]
+ num_steps = 1000
+ dt = math.pi * 2 / num_steps
+ px = []
+ py = []
+
+ N = 1
+
+ for i in range(num_steps * N):
+ advance(dt)
+ px.append(x[None][0])
+ py.append(x[None][1])
+
+ assert px[num_steps // 2 - 1] == approx(-1, abs=2e-2)
+ assert py[num_steps // 2 - 1] == approx(0, abs=2e-2)
+
+ assert px[-1] == approx(1, abs=2e-2)
+ # TODO: why large error here?
+ assert py[-1] == approx(0, abs=3e-2)
diff --git a/tests/_python_orig/test_custom_int.py b/tests/_python_orig/test_custom_int.py
new file mode 100644
index 000000000..b75d366b8
--- /dev/null
+++ b/tests/_python_orig/test_custom_int.py
@@ -0,0 +1,17 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_custom_int_implicit_cast():
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+ x = ti.field(dtype=ci13)
+
+ ti.root.bit_struct(num_bits=32).place(x)
+
+ @ti.kernel
+ def foo():
+ x[None] = 10.3
+
+ foo()
+ assert x[None] == 10
diff --git a/tests/_python_orig/test_custom_struct.py b/tests/_python_orig/test_custom_struct.py
new file mode 100644
index 000000000..657e8f682
--- /dev/null
+++ b/tests/_python_orig/test_custom_struct.py
@@ -0,0 +1,357 @@
+import numpy as np
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_struct_member_access():
+ n = 32
+
+ x = ti.Struct.field({"a": ti.f32, "b": ti.f32}, shape=(n, ))
+ y = ti.Struct.field({"a": ti.f32, "b": ti.f32})
+
+ ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(y)
+
+ @ti.kernel
+ def init():
+ for i in x:
+ x[i].a = i
+ y[i].a = i
+
+ @ti.kernel
+ def run_taichi_scope():
+ for i in x:
+ x[i].b = x[i].a
+
+ def run_python_scope():
+ for i in range(n):
+ y[i].b = y[i].a * 2 + 1
+
+ init()
+ run_taichi_scope()
+ for i in range(n):
+ assert x[i].b == i
+ run_python_scope()
+ for i in range(n):
+ assert y[i].b == i * 2 + 1
+
+
+@test_utils.test()
+def test_struct_whole_access():
+ n = 32
+
+ # also tests implicit cast
+ x = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+ y = ti.Struct.field({"a": ti.f32, "b": ti.i32})
+
+ ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(y)
+
+ @ti.kernel
+ def init():
+ for i in x:
+ x[i] = ti.Struct(a=2 * i, b=1.01 * i)
+
+ @ti.kernel
+ def run_taichi_scope():
+ for i in x:
+ # element-wise ops only work in Taichi scope
+ y[i] = x[i] * 2 + 1
+
+ def run_python_scope():
+ for i in range(n):
+ y[i] = ti.Struct(a=x[i].a, b=int(x[i].b))
+
+ init()
+ for i in range(n):
+ assert x[i].a == 2 * i
+ assert x[i].b == approx(1.01 * i, rel=1e-4)
+ run_taichi_scope()
+ for i in range(n):
+ assert y[i].a == 4 * i + 1
+ assert y[i].b == int((1.01 * i) * 2 + 1)
+ run_python_scope()
+ for i in range(n):
+ assert y[i].a == 2 * i
+ assert y[i].b == int(1.01 * i)
+
+
+@test_utils.test()
+def test_struct_fill():
+ n = 32
+
+ # also tests implicit cast
+ x = ti.Struct.field({
+ "a": ti.f32,
+ "b": ti.types.vector(3, ti.i32)
+ },
+ shape=(n, ))
+
+ def fill_each():
+ x.a.fill(1.0)
+ x.b.fill(1.5)
+
+ def fill_all():
+ x.fill(2.5)
+
+ @ti.kernel
+ def fill_elements():
+ for i in x:
+ x[i].fill(i + 0.5)
+
+ fill_each()
+ for i in range(n):
+ assert x[i].a == 1.0
+ assert x[i].b[0] == 1 and x[i].b[1] == 1 and x[i].b[2] == 1
+ fill_all()
+ for i in range(n):
+ assert x[i].a == 2.5
+ assert x[i].b[0] == 2 and x[i].b[1] == 2 and x[i].b[2] == 2
+ fill_elements()
+ for i in range(n):
+ assert x[i].a == i + 0.5
+ assert np.allclose(x[i].b.to_numpy(), int(x[i].a))
+
+
+@test_utils.test()
+def test_matrix_type():
+ n = 32
+ vec2f = ti.types.vector(2, ti.f32)
+ vec3i = ti.types.vector(3, ti.i32)
+ x = vec3i.field()
+ ti.root.dense(ti.i, n).place(x)
+
+ @ti.kernel
+ def run_taichi_scope():
+ for i in x:
+ v = vec2f(i + 0.2)
+ # also tests implicit cast
+ x[i] = vec3i(v, i + 1.2)
+
+ def run_python_scope():
+ for i in range(n):
+ v = vec2f(i + 0.2)
+ x[i] = vec3i(i + 1.8, v)
+
+ run_taichi_scope()
+ for i in range(n):
+ assert np.allclose(x[i].to_numpy(), np.array([i, i, i + 1]))
+ run_python_scope()
+ for i in range(n):
+ assert np.allclose(x[i].to_numpy(), np.array([i + 1, i, i]))
+
+
+@test_utils.test()
+def test_struct_type():
+ n = 32
+ vec3f = ti.types.vector(3, float)
+ line3f = ti.types.struct(linedir=vec3f, length=float)
+ mystruct = ti.types.struct(line=line3f, idx=int)
+ x = mystruct.field(shape=(n, ))
+
+ @ti.kernel
+ def init_taichi_scope():
+ for i in x:
+ x[i] = mystruct(1)
+
+ def init_python_scope():
+ for i in range(n):
+ x[i] = mystruct(3)
+
+ @ti.kernel
+ def run_taichi_scope():
+ for i in x:
+ v = vec3f(1)
+ line = line3f(linedir=v, length=i + 0.5)
+ x[i] = mystruct(line=line, idx=i)
+
+ def run_python_scope():
+ for i in range(n):
+ v = vec3f(1)
+ x[i] = ti.Struct({
+ "line": {
+ "linedir": v,
+ "length": i + 0.5
+ },
+ "idx": i
+ })
+
+ init_taichi_scope()
+ for i in range(n):
+ assert x[i].idx == 1
+ assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+ assert x[i].line.length == 1.0
+ run_taichi_scope()
+ for i in range(n):
+ assert x[i].idx == i
+ assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+ assert x[i].line.length == i + 0.5
+ init_python_scope()
+ for i in range(n):
+ assert x[i].idx == 3
+ assert np.allclose(x[i].line.linedir.to_numpy(), 3.0)
+ assert x[i].line.length == 3.0
+ run_python_scope()
+ for i in range(n):
+ assert x[i].idx == i
+ assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+ assert x[i].line.length == i + 0.5
+ x.fill(5)
+ for i in range(n):
+ assert x[i].idx == 5
+ assert np.allclose(x[i].line.linedir.to_numpy(), 5.0)
+ assert x[i].line.length == 5.0
+
+
+@test_utils.test()
+def test_struct_assign():
+ n = 32
+ vec3f = ti.types.vector(3, float)
+ line3f = ti.types.struct(linedir=vec3f, length=float)
+ mystruct = ti.types.struct(line=line3f, idx=int)
+ x = mystruct.field(shape=(n, ))
+ y = line3f.field(shape=(n, ))
+
+ @ti.kernel
+ def init():
+ for i in y:
+ y[i] = line3f(linedir=vec3f(1), length=i + 0.5)
+
+ @ti.kernel
+ def run_taichi_scope():
+ for i in x:
+ x[i].idx = i
+ x[i].line = y[i]
+
+ def run_python_scope():
+ for i in range(n):
+ x[i].idx = i
+ x[i].line = y[i]
+
+ init()
+ run_taichi_scope()
+ for i in range(n):
+ assert x[i].idx == i
+ assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+ assert x[i].line.length == i + 0.5
+ x.fill(5)
+ run_python_scope()
+ for i in range(n):
+ assert x[i].idx == i
+ assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+ assert x[i].line.length == i + 0.5
+
+
+@test_utils.test()
+def test_compound_type_implicit_cast():
+ vec2i = ti.types.vector(2, int)
+ vec2f = ti.types.vector(2, float)
+ structi = ti.types.struct(a=int, b=vec2i)
+ structf = ti.types.struct(a=float, b=vec2f)
+
+ @ti.kernel
+ def f2i_taichi_scope() -> int:
+ s = structi(2.5)
+ return s.a + s.b[0] + s.b[1]
+
+ def f2i_python_scope():
+ s = structi(2.5)
+ return s.a + s.b[0] + s.b[1]
+
+ @ti.kernel
+ def i2f_taichi_scope() -> float:
+ s = structf(2)
+ return s.a + s.b[0] + s.b[1]
+
+ def i2f_python_scope():
+ s = structf(2)
+ return s.a + s.b[0] + s.b[1]
+
+ int_value = f2i_taichi_scope()
+ assert type(int_value) == int and int_value == 6
+ int_value = f2i_python_scope()
+ assert type(int_value) == int and int_value == 6
+ float_value = i2f_taichi_scope()
+ assert type(float_value) == float and float_value == approx(6.0, rel=1e-4)
+ float_value = i2f_python_scope()
+ assert type(float_value) == float and float_value == approx(6.0, rel=1e-4)
+
+
+@test_utils.test()
+def test_local_struct_assign():
+ n = 32
+ vec3f = ti.types.vector(3, float)
+ line3f = ti.types.struct(linedir=vec3f, length=float)
+ mystruct = ti.types.struct(line=line3f, idx=int)
+
+ @ti.kernel
+ def run_taichi_scope():
+ y = line3f(0)
+ x = mystruct(0)
+ x.idx = 0
+ x.line = y
+
+ def run_python_scope():
+ y = line3f(0)
+ x = mystruct(0)
+ x.idx = 0
+ x.line = y
+
+ run_taichi_scope()
+ run_python_scope()
+
+
+@test_utils.test(debug=True)
+def test_copy_python_scope_struct_to_taichi_scope():
+ a = ti.Struct({'a': 2, 'b': 3})
+
+ @ti.kernel
+ def test():
+ b = a
+ assert b.a == 2
+ assert b.b == 3
+ b = ti.Struct({'a': 3, 'b': 4})
+ assert b.a == 3
+ assert b.b == 4
+
+ test()
+
+
+@test_utils.test(debug=True)
+def test_copy_struct_field_element_to_taichi_scope():
+ a = ti.Struct.field({'a': ti.i32, 'b': ti.i32}, shape=())
+ a[None].a = 2
+ a[None].b = 3
+
+ @ti.kernel
+ def test():
+ b = a[None]
+ assert b.a == 2
+ assert b.b == 3
+ b.a = 5
+ b.b = 9
+ assert b.a == 5
+ assert b.b == 9
+ assert a[None].a == 2
+ assert a[None].b == 3
+
+ test()
+
+
+@test_utils.test(debug=True)
+def test_copy_struct_in_taichi_scope():
+ @ti.kernel
+ def test():
+ a = ti.Struct({'a': 2, 'b': 3})
+ b = a
+ assert b.a == 2
+ assert b.b == 3
+ b.a = 5
+ b.b = 9
+ assert b.a == 5
+ assert b.b == 9
+ assert a.a == 2
+ assert a.b == 3
+
+ test()
diff --git a/tests/_python_orig/test_custom_type_atomics.py b/tests/_python_orig/test_custom_type_atomics.py
new file mode 100644
index 000000000..1e810963b
--- /dev/null
+++ b/tests/_python_orig/test_custom_type_atomics.py
@@ -0,0 +1,93 @@
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+# TODO: remove excluding of ti.metal.
+@test_utils.test(require=ti.extension.quant_basic,
+ exclude=[ti.metal],
+ debug=True)
+def test_custom_int_atomics():
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+ ci5 = ti.types.quantized_types.quant.int(5, True)
+ cu2 = ti.types.quantized_types.quant.int(2, False)
+
+ x = ti.field(dtype=ci13)
+ y = ti.field(dtype=ci5)
+ z = ti.field(dtype=cu2)
+
+ ti.root.bit_struct(num_bits=32).place(x, y, z)
+
+ x[None] = 3
+ y[None] = 2
+ z[None] = 0
+
+ @ti.kernel
+ def foo():
+ for i in range(10):
+ x[None] += 4
+
+ for j in range(5):
+ y[None] -= 1
+
+ for k in range(3):
+ z[None] += 1
+
+ foo()
+
+ assert x[None] == 43
+ assert y[None] == -3
+ assert z[None] == 3
+
+
+@test_utils.test(require=[ti.extension.quant_basic, ti.extension.data64],
+ debug=True)
+def test_custom_int_atomics_b64():
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+
+ x = ti.field(dtype=ci13)
+
+ ti.root.bit_array(ti.i, 4, num_bits=64).place(x)
+
+ x[0] = 100
+ x[1] = 200
+ x[2] = 300
+
+ @ti.kernel
+ def foo():
+ for i in range(9):
+ x[i % 3] += i
+
+ foo()
+
+ assert x[0] == 109
+ assert x[1] == 212
+ assert x[2] == 315
+
+
+@test_utils.test(require=ti.extension.quant_basic, debug=True)
+def test_custom_float_atomics():
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+ ci19 = ti.types.quantized_types.quant.int(19, False)
+ cft13 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=ci13, scale=0.1)
+ cft19 = ti.types.quantized_types.type_factory.custom_float(
+ significand_type=ci19, scale=0.1)
+
+ x = ti.field(dtype=cft13)
+ y = ti.field(dtype=cft19)
+
+ ti.root.bit_struct(num_bits=32).place(x, y)
+
+ @ti.kernel
+ def foo():
+ x[None] = 0.7
+ y[None] = 123.4
+ for _ in range(10):
+ x[None] -= 0.4
+ y[None] += 100.1
+
+ foo()
+ assert x[None] == approx(-3.3)
+ assert y[None] == approx(1124.4)
diff --git a/tests/_python_orig/test_customized_grad.py b/tests/_python_orig/test_customized_grad.py
new file mode 100644
index 000000000..d813c6a0e
--- /dev/null
+++ b/tests/_python_orig/test_customized_grad.py
@@ -0,0 +1,227 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_customized_kernels_tape():
+ x = ti.field(ti.f32)
+ total = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(total)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(mul: ti.f32):
+ for i in range(n):
+ ti.atomic_add(total[None], x[i] * mul)
+
+ @ti.ad.grad_replaced
+ def forward(mul):
+ func(mul)
+ func(mul)
+
+ @ti.ad.grad_for(forward)
+ def backward(mul):
+ func.grad(mul)
+
+ with ti.Tape(loss=total):
+ forward(4)
+ assert x.grad[0] == 4
+
+
+@test_utils.test()
+def test_customized_kernels_grad():
+ x = ti.field(ti.f32)
+ total = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(total)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(mul: ti.f32):
+ for i in range(n):
+ ti.atomic_add(total[None], x[i] * mul)
+
+ @ti.ad.grad_replaced
+ def forward(mul):
+ func(mul)
+ func(mul)
+
+ @ti.ad.grad_for(forward)
+ def backward(mul):
+ func.grad(mul)
+
+ total.grad[None] = 1
+ forward(4)
+ forward.grad(4)
+ assert x.grad[0] == 4
+
+
+@test_utils.test()
+def test_customized_kernels_indirect():
+ x = ti.field(ti.f32)
+ total = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(total)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(mul: ti.f32):
+ for i in range(n):
+ ti.atomic_add(total[None], x[i] * mul)
+
+ def func_proxy(mul):
+ func(mul)
+
+ @ti.ad.grad_replaced
+ def forward(mul):
+ func_proxy(mul)
+ func_proxy(mul)
+
+ @ti.ad.grad_for(forward)
+ def backward(mul):
+ func.grad(mul)
+
+ with ti.Tape(loss=total):
+ forward(4)
+ assert x.grad[0] == 4
+
+
+@test_utils.test()
+def test_customized_kernels_oop():
+ @ti.data_oriented
+ class A:
+ def __init__(self):
+ self.x = ti.field(ti.f32)
+ self.total = ti.field(ti.f32)
+ self.n = 128
+
+ ti.root.dense(ti.i, self.n).place(self.x)
+ ti.root.place(self.total)
+
+ @ti.kernel
+ def func(self, mul: ti.f32):
+ for i in range(self.n):
+ ti.atomic_add(self.total[None], self.x[i] * mul)
+
+ @ti.ad.grad_replaced
+ def forward(self, mul):
+ self.func(mul)
+ self.func(mul)
+
+ @ti.ad.grad_for(forward)
+ def backward(self, mul):
+ self.func.grad(mul)
+
+ a = A()
+
+ ti.root.lazy_grad()
+
+ with ti.Tape(loss=a.total):
+ a.forward(4)
+ assert a.x.grad[0] == 4
+
+
+@test_utils.test()
+def test_customized_kernels_oop2():
+ @ti.data_oriented
+ class A:
+ def __init__(self):
+ self.x = ti.field(ti.f32)
+ self.total = ti.field(ti.f32)
+ self.n = 128
+
+ ti.root.dense(ti.i, self.n).place(self.x)
+ ti.root.place(self.total)
+
+ @ti.kernel
+ def func(self, mul: ti.f32):
+ for i in range(self.n):
+ ti.atomic_add(self.total[None], self.x[i] * mul)
+
+ def func_proxy(self, mul):
+ self.func(mul)
+
+ @ti.ad.grad_replaced
+ def forward(self, mul):
+ self.func_proxy(mul)
+ self.func_proxy(mul)
+
+ @ti.ad.grad_for(forward)
+ def backward(self, mul):
+ self.func.grad(mul)
+
+ a = A()
+
+ ti.root.lazy_grad()
+
+ with ti.Tape(loss=a.total):
+ a.forward(4)
+ assert a.x.grad[0] == 4
+
+
+@test_utils.test()
+def test_decorated_primal_is_taichi_kernel():
+ x = ti.field(ti.f32)
+ total = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(total)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(mul: ti.f32):
+ for i in range(n):
+ ti.atomic_add(total[None], x[i] * mul)
+
+ with pytest.raises(RuntimeError):
+
+ @ti.ad.grad_for(func)
+ def backward(mul):
+ func.grad(mul)
+
+ with ti.Tape(loss=total):
+ func(4)
+
+
+@test_utils.test()
+def test_decorated_primal_missing_decorator():
+ x = ti.field(ti.f32)
+ total = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(total)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func(mul: ti.f32):
+ for i in range(n):
+ ti.atomic_add(total[None], x[i] * mul)
+
+ def foward(mul):
+ func(mul)
+ func(mul)
+
+ with pytest.raises(RuntimeError):
+
+ @ti.ad.grad_for(func)
+ def backward(mul):
+ func.grad(mul)
+
+ with ti.Tape(loss=total):
+ func(4)
diff --git a/tests/_python_orig/test_debug.py b/tests/_python_orig/test_debug.py
new file mode 100644
index 000000000..e60abf74e
--- /dev/null
+++ b/tests/_python_orig/test_debug.py
@@ -0,0 +1,118 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+def test_cpu_debug_snode_reader():
+ ti.init(arch=ti.x64, debug=True)
+
+ x = ti.field(ti.f32, shape=())
+ x[None] = 10.0
+
+ assert x[None] == 10.0
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_cpu_debug_snode_writer_out_of_bound():
+ x = ti.field(ti.f32, shape=3)
+
+ with pytest.raises(RuntimeError):
+ x[3] = 10.0
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_cpu_debug_snode_writer_out_of_bound_negative():
+ x = ti.field(ti.f32, shape=3)
+ with pytest.raises(RuntimeError):
+ x[-1] = 10.0
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_cpu_debug_snode_reader_out_of_bound():
+ x = ti.field(ti.f32, shape=3)
+
+ with pytest.raises(RuntimeError):
+ a = x[3]
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_cpu_debug_snode_reader_out_of_bound_negative():
+ x = ti.field(ti.f32, shape=3)
+ with pytest.raises(RuntimeError):
+ a = x[-1]
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_out_of_bound():
+ x = ti.field(ti.i32, shape=(8, 16))
+
+ @ti.kernel
+ def func():
+ x[3, 16] = 1
+
+ with pytest.raises(RuntimeError):
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_not_out_of_bound():
+ x = ti.field(ti.i32, shape=(8, 16))
+
+ @ti.kernel
+ def func():
+ x[7, 15] = 1
+
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_out_of_bound_dynamic():
+ x = ti.field(ti.i32)
+
+ ti.root.dynamic(ti.i, 16, 4).place(x)
+
+ @ti.kernel
+ def func():
+ x[17] = 1
+
+ with pytest.raises(RuntimeError):
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_not_out_of_bound_dynamic():
+ x = ti.field(ti.i32)
+
+ ti.root.dynamic(ti.i, 16, 4).place(x)
+
+ @ti.kernel
+ def func():
+ x[3] = 1
+
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_out_of_bound_with_offset():
+ x = ti.field(ti.i32, shape=(8, 16), offset=(-8, -8))
+
+ @ti.kernel
+ def func():
+ x[0, 0] = 1
+
+ with pytest.raises(RuntimeError):
+ func()
+ func()
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, gdb_trigger=False)
+def test_not_out_of_bound_with_offset():
+ x = ti.field(ti.i32, shape=(8, 16), offset=(-4, -8))
+
+ @ti.kernel
+ def func():
+ x[-4, -8] = 1
+ x[3, 7] = 2
+
+ func()
diff --git a/tests/_python_orig/test_div.py b/tests/_python_orig/test_div.py
new file mode 100644
index 000000000..0da0c2410
--- /dev/null
+++ b/tests/_python_orig/test_div.py
@@ -0,0 +1,88 @@
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def _test_floor_div(arg1, a, arg2, b, arg3, c):
+ z = ti.field(arg3, shape=())
+
+ @ti.kernel
+ def func(x: arg1, y: arg2):
+ z[None] = x // y
+
+ func(a, b)
+ assert z[None] == c
+
+
+@test_utils.test()
+def _test_true_div(arg1, a, arg2, b, arg3, c):
+ z = ti.field(arg3, shape=())
+
+ @ti.kernel
+ def func(x: arg1, y: arg2):
+ z[None] = x / y
+
+ func(a, b)
+ assert z[None] == c
+
+
+def test_floor_div():
+ _test_floor_div(ti.i32, 10, ti.i32, 3, ti.f32, 3)
+ _test_floor_div(ti.f32, 10, ti.f32, 3, ti.f32, 3)
+ _test_floor_div(ti.i32, 10, ti.f32, 3, ti.f32, 3)
+ _test_floor_div(ti.f32, 10, ti.i32, 3, ti.f32, 3)
+
+ _test_floor_div(ti.i32, -10, ti.i32, 3, ti.f32, -4)
+ _test_floor_div(ti.f32, -10, ti.f32, 3, ti.f32, -4)
+ _test_floor_div(ti.i32, -10, ti.f32, 3, ti.f32, -4)
+ _test_floor_div(ti.f32, -10, ti.i32, 3, ti.f32, -4)
+
+ _test_floor_div(ti.i32, 10, ti.i32, -3, ti.f32, -4)
+ _test_floor_div(ti.f32, 10, ti.f32, -3, ti.f32, -4)
+ _test_floor_div(ti.i32, 10, ti.f32, -3, ti.f32, -4)
+ _test_floor_div(ti.f32, 10, ti.i32, -3, ti.f32, -4)
+
+
+def test_true_div():
+ _test_true_div(ti.i32, 3, ti.i32, 2, ti.f32, 1.5)
+ _test_true_div(ti.f32, 3, ti.f32, 2, ti.f32, 1.5)
+ _test_true_div(ti.i32, 3, ti.f32, 2, ti.f32, 1.5)
+ _test_true_div(ti.f32, 3, ti.i32, 2, ti.f32, 1.5)
+ _test_true_div(ti.f32, 3, ti.i32, 2, ti.i32, 1)
+
+ _test_true_div(ti.i32, -3, ti.i32, 2, ti.f32, -1.5)
+ _test_true_div(ti.f32, -3, ti.f32, 2, ti.f32, -1.5)
+ _test_true_div(ti.i32, -3, ti.f32, 2, ti.f32, -1.5)
+ _test_true_div(ti.f32, -3, ti.i32, 2, ti.f32, -1.5)
+ _test_true_div(ti.f32, -3, ti.i32, 2, ti.i32, -1)
+
+
+@test_utils.test()
+def test_div_default_ip():
+ impl.get_runtime().set_default_ip(ti.i64)
+ z = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 1e15 + 1e9
+ z[None] = a // 1e10
+
+ func()
+ assert z[None] == 100000
+
+
+@test_utils.test()
+def test_floor_div_pythonic():
+ z = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func(x: ti.i32, y: ti.i32):
+ z[None] = x // y
+
+ for i in range(-10, 11):
+ for j in range(-10, 11):
+ if j != 0:
+ func(i, j)
+ assert z[None] == i // j
diff --git a/tests/_python_orig/test_dynamic.py b/tests/_python_orig/test_dynamic.py
new file mode 100644
index 000000000..a443a4d78
--- /dev/null
+++ b/tests/_python_orig/test_dynamic.py
@@ -0,0 +1,212 @@
+import pytest
+from taichi.lang.misc import serialize
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dynamic():
+ x = ti.field(ti.f32)
+ n = 128
+
+ ti.root.dynamic(ti.i, n, 32).place(x)
+
+ @ti.kernel
+ def func():
+ pass
+
+ for i in range(n):
+ x[i] = i
+
+ for i in range(n):
+ assert x[i] == i
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dynamic2():
+ x = ti.field(ti.f32)
+ n = 128
+
+ ti.root.dynamic(ti.i, n, 32).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ x[i] = i
+
+ func()
+
+ for i in range(n):
+ assert x[i] == i
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dynamic_matrix():
+ x = ti.Matrix.field(2, 1, dtype=ti.i32)
+ n = 8192
+
+ ti.root.dynamic(ti.i, n, chunk_size=128).place(x)
+
+ @ti.kernel
+ def func():
+ serialize()
+ for i in range(n // 4):
+ x[i * 4][1, 0] = i
+
+ func()
+
+ for i in range(n // 4):
+ a = x[i * 4][1, 0]
+ assert a == i
+ if i + 1 < n // 4:
+ b = x[i * 4 + 1][1, 0]
+ assert b == 0
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_append():
+ x = ti.field(ti.i32)
+ n = 128
+
+ ti.root.dynamic(ti.i, n, 32).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ ti.append(x.parent(), [], i)
+
+ func()
+
+ elements = []
+ for i in range(n):
+ elements.append(x[i])
+ elements.sort()
+ for i in range(n):
+ assert elements[i] == i
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_length():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.f32, shape=())
+ n = 128
+
+ ti.root.dynamic(ti.i, n, 32).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ ti.append(x.parent(), [], i)
+
+ func()
+
+ @ti.kernel
+ def get_len():
+ y[None] = ti.length(x.parent(), [])
+
+ get_len()
+
+ assert y[None] == n
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_append_ret_value():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ z = ti.field(ti.i32)
+ n = 128
+
+ ti.root.dynamic(ti.i, n, 32).place(x)
+ ti.root.dynamic(ti.i, n, 32).place(y)
+ ti.root.dynamic(ti.i, n, 32).place(z)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ u = ti.append(x.parent(), [], i)
+ y[u] = i + 1
+ z[u] = i + 3
+
+ func()
+
+ for i in range(n):
+ assert x[i] + 1 == y[i]
+ assert x[i] + 3 == z[i]
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dense_dynamic():
+ # The spin lock implementation has triggered a bug in CUDA, the end result
+ # being that appending to Taichi's dynamic node messes up its length. See
+ # https://stackoverflow.com/questions/65995357/cuda-spinlock-implementation-with-independent-thread-scheduling-supported
+ # CUDA 11.2 didn't fix this bug, unfortunately.
+ if ti.lang.impl.current_cfg().arch == ti.cuda:
+ pytest.skip('CUDA spinlock bug')
+
+ n = 128
+ x = ti.field(ti.i32)
+ l = ti.field(ti.i32, shape=n)
+
+ ti.root.dense(ti.i, n).dynamic(ti.j, n, 8).place(x)
+
+ @ti.kernel
+ def func():
+ serialize()
+ for i in range(n):
+ for j in range(n):
+ ti.append(x.parent(), j, i)
+
+ for i in range(n):
+ l[i] = ti.length(x.parent(), i)
+
+ func()
+
+ for i in range(n):
+ assert l[i] == n
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dense_dynamic_len():
+ n = 128
+ x = ti.field(ti.i32)
+ l = ti.field(ti.i32, shape=n)
+
+ ti.root.dense(ti.i, n).dynamic(ti.j, n, 32).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ l[i] = ti.length(x.parent(), i)
+
+ func()
+
+ for i in range(n):
+ assert l[i] == 0
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dynamic_activate():
+ # record the lengths
+ l = ti.field(ti.i32, 3)
+ x = ti.field(ti.i32)
+ xp = ti.root.dynamic(ti.i, 32, 32)
+ xp.place(x)
+
+ m = 5
+
+ @ti.kernel
+ def func():
+ for i in range(m):
+ ti.append(xp, [], i)
+ l[0] = ti.length(xp, [])
+ x[20] = 42
+ l[1] = ti.length(xp, [])
+ x[10] = 43
+ l[2] = ti.length(xp, [])
+
+ func()
+ l = l.to_numpy()
+ assert l[0] == m
+ assert l[1] == 21
+ assert l[2] == 21
diff --git a/tests/_python_orig/test_eig.py b/tests/_python_orig/test_eig.py
new file mode 100644
index 000000000..f0e883f01
--- /dev/null
+++ b/tests/_python_orig/test_eig.py
@@ -0,0 +1,136 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+def _eigen_vector_equal(v1, v2, tol):
+ if np.linalg.norm(v1) == 0.0:
+ assert np.linalg.norm(v2) == 0.0
+ else:
+ v1 = v1 / np.linalg.norm(v1)
+ v2 = v2 / np.linalg.norm(v2)
+ try:
+ np.testing.assert_allclose(v1, v2, atol=tol, rtol=tol)
+ except AssertionError:
+ assert np.allclose(v1, -v2, atol=tol, rtol=tol) or np.allclose(
+ v1, 1.j * v2, atol=tol, rtol=tol) or np.allclose(
+ v1, -1.j * v2, atol=tol, rtol=tol)
+
+
+def _test_eig2x2_real(dt):
+ A = ti.Matrix.field(2, 2, dtype=dt, shape=())
+ v = ti.Matrix.field(2, 2, dtype=dt, shape=())
+ w = ti.Matrix.field(4, 2, dtype=dt, shape=())
+
+ A[None] = [[1, 1], [2, 3]]
+
+ @ti.kernel
+ def eigen_solve():
+ v[None], w[None] = ti.eig(A[None])
+
+ tol = 1e-5 if dt == ti.f32 else 1e-12
+ dtype = np.float32 if dt == ti.f32 else np.float64
+
+ eigen_solve()
+ v_np, w_np = np.linalg.eig(A.to_numpy().astype(dtype))
+ v_ti = v.to_numpy()[:, 0].astype(dtype)
+ w_ti = w.to_numpy()[0::2, :].astype(dtype)
+
+ # sort by eigenvalues
+ idx_np = np.argsort(v_np)
+ idx_ti = np.argsort(v_ti)
+
+ np.testing.assert_allclose(v_ti[idx_ti], v_np[idx_np], atol=tol, rtol=tol)
+ _eigen_vector_equal(w_ti[:, idx_ti[0]], w_np[:, idx_np[0]], tol)
+ _eigen_vector_equal(w_ti[:, idx_ti[1]], w_np[:, idx_np[1]], tol)
+
+
+def _test_eig2x2_complex(dt):
+ A = ti.Matrix.field(2, 2, dtype=dt, shape=())
+ v = ti.Matrix.field(2, 2, dtype=dt, shape=())
+ w = ti.Matrix.field(4, 2, dtype=dt, shape=())
+
+ A[None] = [[1, -1], [1, 1]]
+
+ @ti.kernel
+ def eigen_solve():
+ v[None], w[None] = ti.eig(A[None])
+
+ tol = 1e-5 if dt == ti.f32 else 1e-12
+ dtype = np.float32 if dt == ti.f32 else np.float64
+
+ eigen_solve()
+ v_np, w_np = np.linalg.eig(A.to_numpy().astype(dtype))
+ v_ti = v.to_numpy().astype(dtype)
+ w_ti = w.to_numpy().astype(dtype)
+ v_ti_complex = v_ti[:, 0] + v_ti[:, 1] * 1.j
+ w_ti_complex = w_ti[0::2, :] + w_ti[1::2, :] * 1.j
+
+ # sort by eigenvalues
+ idx_np = np.argsort(v_np)
+ idx_ti = np.argsort(v_ti_complex)
+
+ np.testing.assert_allclose(v_ti_complex[idx_ti],
+ v_np[idx_np],
+ atol=tol,
+ rtol=tol)
+ _eigen_vector_equal(w_ti_complex[:, idx_ti[0]], w_np[:, idx_np[0]], tol)
+ _eigen_vector_equal(w_ti_complex[:, idx_ti[1]], w_np[:, idx_np[1]], tol)
+
+
+def _test_sym_eig2x2(dt):
+ A = ti.Matrix.field(2, 2, dtype=dt, shape=())
+ v = ti.Vector.field(2, dtype=dt, shape=())
+ w = ti.Matrix.field(2, 2, dtype=dt, shape=())
+
+ A[None] = [[5, 3], [3, 2]]
+
+ @ti.kernel
+ def eigen_solve():
+ v[None], w[None] = ti.sym_eig(A[None])
+
+ tol = 1e-5 if dt == ti.f32 else 1e-12
+ dtype = np.float32 if dt == ti.f32 else np.float64
+
+ eigen_solve()
+ v_np, w_np = np.linalg.eig(A.to_numpy().astype(dtype))
+ v_ti = v.to_numpy().astype(dtype)
+ w_ti = w.to_numpy().astype(dtype)
+
+ # sort by eigenvalues
+ idx_np = np.argsort(v_np)
+ idx_ti = np.argsort(v_ti)
+
+ np.testing.assert_allclose(v_ti[idx_ti], v_np[idx_np], atol=tol, rtol=tol)
+ _eigen_vector_equal(w_ti[:, idx_ti[0]], w_np[:, idx_np[0]], tol)
+ _eigen_vector_equal(w_ti[:, idx_ti[1]], w_np[:, idx_np[1]], tol)
+
+
+def test_eig2x2():
+ for func in [_test_eig2x2_real, _test_eig2x2_complex]:
+ for fp in [ti.f32, ti.f64]:
+
+ @test_utils.test(
+ require=ti.extension.data64 if fp == ti.f64 else [],
+ default_fp=fp,
+ fast_math=False)
+ def wrapped():
+ func(fp)
+
+ wrapped()
+
+
+def test_sym_eig2x2():
+ for func in [_test_sym_eig2x2]:
+ for fp in [ti.f32, ti.f64]:
+
+ @test_utils.test(
+ require=ti.extension.data64 if fp == ti.f64 else [],
+ default_fp=fp,
+ fast_math=False)
+ def wrapped():
+ func(fp)
+
+ wrapped()
diff --git a/tests/_python_orig/test_element_wise.py b/tests/_python_orig/test_element_wise.py
new file mode 100644
index 000000000..59354c96e
--- /dev/null
+++ b/tests/_python_orig/test_element_wise.py
@@ -0,0 +1,346 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+def _c_mod(a, b):
+ return a - b * int(float(a) / b)
+
+
@pytest.mark.parametrize('lhs_is_mat,rhs_is_mat', [(True, True), (True, False),
                                                   (False, True)])
@test_utils.test(fast_math=False, exclude=[ti.vulkan])
def test_binary_f(lhs_is_mat, rhs_is_mat):
    # Element-wise binary float ops on matrices, with each operand being
    # either a matrix field or a scalar field (matrix/scalar broadcasting).
    x = ti.Matrix.field(3, 2, ti.f32, 16)
    if lhs_is_mat:
        y = ti.Matrix.field(3, 2, ti.f32, ())
    else:
        y = ti.field(ti.f32, ())
    if rhs_is_mat:
        z = ti.Matrix.field(3, 2, ti.f32, ())
    else:
        z = ti.field(ti.f32, ())

    if lhs_is_mat:
        y.from_numpy(np.array([[0, 2], [9, 3.1], [7, 4]], np.float32))
    else:
        y[None] = 6.1
    if rhs_is_mat:
        z.from_numpy(np.array([[4, 5], [6, 3], [9, 2]], np.float32))
    else:
        z[None] = 5

    @ti.kernel
    def func():
        # One result slot per operator under test (16 in total).
        x[0] = y[None] + z[None]
        x[1] = y[None] - z[None]
        x[2] = y[None] * z[None]
        x[3] = y[None] / z[None]
        x[4] = y[None] // z[None]
        x[5] = y[None] % z[None]
        x[6] = y[None]**z[None]
        x[7] = y[None] == z[None]
        x[8] = y[None] != z[None]
        x[9] = y[None] > z[None]
        x[10] = y[None] >= z[None]
        x[11] = y[None] < z[None]
        x[12] = y[None] <= z[None]
        x[13] = ti.atan2(y[None], z[None])
        x[14] = ti.min(y[None], z[None])
        x[15] = ti.max(y[None], z[None])

    func()
    # Compare each slot against numpy's broadcasting semantics.
    x = x.to_numpy()
    y = y.to_numpy()
    z = z.to_numpy()
    assert test_utils.allclose(x[0], y + z)
    assert test_utils.allclose(x[1], y - z)
    assert test_utils.allclose(x[2], y * z)
    assert test_utils.allclose(x[3], y / z)
    assert test_utils.allclose(x[4], y // z)
    assert test_utils.allclose(x[5], y % z)
    assert test_utils.allclose(x[6], y**z)
    assert test_utils.allclose(x[7], y == z)
    assert test_utils.allclose(x[8], y != z)
    assert test_utils.allclose(x[9], y > z)
    assert test_utils.allclose(x[10], y >= z)
    assert test_utils.allclose(x[11], y < z)
    assert test_utils.allclose(x[12], y <= z)
    assert test_utils.allclose(x[13], np.arctan2(y, z))
    assert test_utils.allclose(x[14], np.minimum(y, z))
    assert test_utils.allclose(x[15], np.maximum(y, z))
+
+
@pytest.mark.parametrize('is_mat', [(True, True), (True, False),
                                    (False, True)])
@test_utils.test()
def test_binary_i(is_mat):
    # Element-wise binary integer ops on matrices; the (False, False) combo
    # is omitted since scalar-scalar does not exercise matrix code paths.
    lhs_is_mat, rhs_is_mat = is_mat

    x = ti.Matrix.field(3, 2, ti.i32, 20)
    if lhs_is_mat:
        y = ti.Matrix.field(3, 2, ti.i32, ())
    else:
        y = ti.field(ti.i32, ())
    if rhs_is_mat:
        z = ti.Matrix.field(3, 2, ti.i32, ())
    else:
        z = ti.field(ti.i32, ())

    if lhs_is_mat:
        y.from_numpy(np.array([[0, 2], [9, 3], [7, 4]], np.int32))
    else:
        y[None] = 6
    if rhs_is_mat:
        z.from_numpy(np.array([[4, 5], [6, 3], [9, 2]], np.int32))
    else:
        z[None] = 5

    @ti.kernel
    def func():
        # One result slot per operator under test (20 in total); raw_div and
        # raw_mod are C-style (truncating) variants.
        x[0] = y[None] + z[None]
        x[1] = y[None] - z[None]
        x[2] = y[None] * z[None]
        x[3] = y[None] // z[None]
        x[4] = ti.raw_div(y[None], z[None])
        x[5] = y[None] % z[None]
        x[6] = ti.raw_mod(y[None], z[None])
        x[7] = y[None]**z[None]
        x[8] = y[None] == z[None]
        x[9] = y[None] != z[None]
        x[10] = y[None] > z[None]
        x[11] = y[None] >= z[None]
        x[12] = y[None] < z[None]
        x[13] = y[None] <= z[None]
        x[14] = y[None] & z[None]
        x[15] = y[None] ^ z[None]
        x[16] = y[None] | z[None]
        x[17] = ti.min(y[None], z[None])
        x[18] = ti.max(y[None], z[None])
        x[19] = y[None] << z[None]

    func()
    # All test operands are non-negative, so raw_div/raw_mod agree with
    # numpy's floor-based // and % here.
    x = x.to_numpy()
    y = y.to_numpy()
    z = z.to_numpy()
    assert test_utils.allclose(x[0], y + z)
    assert test_utils.allclose(x[1], y - z)
    assert test_utils.allclose(x[2], y * z)
    assert test_utils.allclose(x[3], y // z)
    assert test_utils.allclose(x[4], y // z)
    assert test_utils.allclose(x[5], y % z)
    assert test_utils.allclose(x[6], y % z)
    assert test_utils.allclose(x[7], y**z, rel=1e-5)
    assert test_utils.allclose(x[8], y == z)
    assert test_utils.allclose(x[9], y != z)
    assert test_utils.allclose(x[10], y > z)
    assert test_utils.allclose(x[11], y >= z)
    assert test_utils.allclose(x[12], y < z)
    assert test_utils.allclose(x[13], y <= z)
    assert test_utils.allclose(x[14], y & z)
    assert test_utils.allclose(x[15], y ^ z)
    assert test_utils.allclose(x[16], y | z)
    assert test_utils.allclose(x[17], np.minimum(y, z))
    assert test_utils.allclose(x[18], np.maximum(y, z))
    assert test_utils.allclose(x[19], y << z)
+
+
@pytest.mark.parametrize('rhs_is_mat', [True, False])
@test_utils.test(fast_math=False)
def test_writeback_binary_f(rhs_is_mat):
    # Augmented-assignment (+=, -=, ...) and atomic min/max on an f32 matrix
    # field, with the rhs either a matrix field or a scalar field.
    x = ti.Matrix.field(3, 2, ti.f32, 9)
    y = ti.Matrix.field(3, 2, ti.f32, ())
    if rhs_is_mat:
        z = ti.Matrix.field(3, 2, ti.f32, ())
    else:
        z = ti.field(ti.f32, ())

    y.from_numpy(np.array([[0, 2], [9, 3.1], [7, 4]], np.float32))
    if rhs_is_mat:
        z.from_numpy(np.array([[4, 5], [6, 3], [9, 2]], np.float32))
    else:
        z[None] = 5

    @ti.kernel
    def func():
        # Seed every slot with y, then apply one writeback op per slot.
        for i in x:
            x[i] = y[None]
        # Plain assignment needs fill() when the rhs is a scalar.
        if ti.static(rhs_is_mat):
            x[0] = z[None]
        else:
            x[0].fill(z[None])
        x[1] += z[None]
        x[2] -= z[None]
        x[3] *= z[None]
        x[4] /= z[None]
        x[5] //= z[None]
        x[6] %= z[None]
        ti.atomic_min(x[7], z[None])
        ti.atomic_max(x[8], z[None])

    func()
    # x[0] (plain assignment) is not asserted; slots 1-8 are checked.
    x = x.to_numpy()
    y = y.to_numpy()
    z = z.to_numpy()
    assert test_utils.allclose(x[1], y + z)
    assert test_utils.allclose(x[2], y - z)
    assert test_utils.allclose(x[3], y * z)
    assert test_utils.allclose(x[4], y / z)
    assert test_utils.allclose(x[5], y // z)
    assert test_utils.allclose(x[6], y % z)
    assert test_utils.allclose(x[7], np.minimum(y, z))
    assert test_utils.allclose(x[8], np.maximum(y, z))
+
+
# Fix: the parametrize list was [(True, True), (True, False)] — tuples are
# always truthy, so `rhs_is_mat` never took the scalar path. Use plain
# booleans, mirroring test_writeback_binary_f above.
@pytest.mark.parametrize('rhs_is_mat', [True, False])
@test_utils.test()
def test_writeback_binary_i(rhs_is_mat):
    # Augmented-assignment (+=, //=, &=, ...) and atomic min/max on an i32
    # matrix field, with the rhs either a matrix field or a scalar field.
    x = ti.Matrix.field(3, 2, ti.i32, 12)
    y = ti.Matrix.field(3, 2, ti.i32, ())
    if rhs_is_mat:
        z = ti.Matrix.field(3, 2, ti.i32, ())
    else:
        z = ti.field(ti.i32, ())

    y.from_numpy(np.array([[0, 2], [9, 3], [7, 4]], np.int32))
    if rhs_is_mat:
        z.from_numpy(np.array([[4, 5], [6, 3], [9, 2]], np.int32))
    else:
        z[None] = 5

    @ti.kernel
    def func():
        # Seed every slot with y, then apply one writeback op per slot
        # (slot 9 intentionally unused).
        for i in x:
            x[i] = y[None]
        # Plain assignment needs fill() when the rhs is a scalar, same as
        # the f32 variant of this test.
        if ti.static(rhs_is_mat):
            x[0] = z[None]
        else:
            x[0].fill(z[None])
        x[1] += z[None]
        x[2] -= z[None]
        x[3] *= z[None]
        x[4] //= z[None]
        x[5] %= z[None]
        x[6] &= z[None]
        x[7] |= z[None]
        x[8] ^= z[None]
        ti.atomic_min(x[10], z[None])
        ti.atomic_max(x[11], z[None])

    func()
    # x[0] (plain assignment) is not asserted; remaining slots are checked.
    x = x.to_numpy()
    y = y.to_numpy()
    z = z.to_numpy()
    assert test_utils.allclose(x[1], y + z)
    assert test_utils.allclose(x[2], y - z)
    assert test_utils.allclose(x[3], y * z)
    assert test_utils.allclose(x[4], y // z)
    assert test_utils.allclose(x[5], y % z)
    assert test_utils.allclose(x[6], y & z)
    assert test_utils.allclose(x[7], y | z)
    assert test_utils.allclose(x[8], y ^ z)
    assert test_utils.allclose(x[10], np.minimum(y, z))
    assert test_utils.allclose(x[11], np.maximum(y, z))
+
+
@test_utils.test()
def test_unary():
    # Element-wise unary ops on integer and float matrix fields, checked
    # against the corresponding numpy functions.
    xi = ti.Matrix.field(3, 2, ti.i32, 4)
    yi = ti.Matrix.field(3, 2, ti.i32, ())
    xf = ti.Matrix.field(3, 2, ti.f32, 15)
    yf = ti.Matrix.field(3, 2, ti.f32, ())

    yi.from_numpy(np.array([[3, 2], [9, 0], [7, 4]], np.int32))
    yf.from_numpy(np.array([[0.3, 0.2], [0.9, 0.1], [0.7, 0.4]], np.float32))

    @ti.kernel
    def func():
        xi[0] = -yi[None]
        xi[1] = ~yi[None]
        xi[2] = not yi[None]
        xi[3] = abs(yi[None])
        xf[0] = -yf[None]
        xf[1] = abs(yf[None])
        xf[2] = ti.sqrt(yf[None])
        xf[3] = ti.sin(yf[None])
        xf[4] = ti.cos(yf[None])
        xf[5] = ti.tan(yf[None])
        xf[6] = ti.asin(yf[None])
        xf[7] = ti.acos(yf[None])
        xf[8] = ti.tanh(yf[None])
        xf[9] = ti.floor(yf[None])
        xf[10] = ti.ceil(yf[None])
        xf[11] = ti.exp(yf[None])
        xf[12] = ti.log(yf[None])
        xf[13] = ti.rsqrt(yf[None])
        xf[14] = ti.round(yf[None])

    func()
    # NOTE(review): xi[2] (`not` on a matrix) is computed but deliberately
    # not asserted below — presumably its semantics are backend-dependent;
    # confirm before adding an assertion.
    xi = xi.to_numpy()
    yi = yi.to_numpy()
    xf = xf.to_numpy()
    yf = yf.to_numpy()
    assert test_utils.allclose(xi[0], -yi)
    assert test_utils.allclose(xi[1], ~yi)
    assert test_utils.allclose(xi[3], np.abs(yi))
    assert test_utils.allclose(xf[0], -yf)
    assert test_utils.allclose(xf[1], np.abs(yf))
    assert test_utils.allclose(xf[2], np.sqrt(yf), rel=1e-5)
    assert test_utils.allclose(xf[3], np.sin(yf), rel=1e-4)
    assert test_utils.allclose(xf[4], np.cos(yf), rel=1e-4)
    assert test_utils.allclose(xf[5], np.tan(yf), rel=1e-4)
    assert test_utils.allclose(xf[6], np.arcsin(yf), rel=1e-4)
    assert test_utils.allclose(xf[7], np.arccos(yf), rel=1e-4)
    assert test_utils.allclose(xf[8], np.tanh(yf), rel=1e-4)
    assert test_utils.allclose(xf[9], np.floor(yf), rel=1e-5)
    assert test_utils.allclose(xf[10], np.ceil(yf), rel=1e-5)
    assert test_utils.allclose(xf[11], np.exp(yf), rel=1e-5)
    assert test_utils.allclose(xf[12], np.log(yf), rel=1e-5)
    assert test_utils.allclose(xf[13], 1 / np.sqrt(yf), rel=1e-5)
    assert test_utils.allclose(xf[14], np.round(yf), rel=1e-5)
+
+
@pytest.mark.parametrize('is_mat', [(True, True, True), (True, False, False),
                                    (False, True, False), (False, False, True),
                                    (False, True, True)])
@test_utils.test()
def test_ternary_i(is_mat):
    # Ternary select (a if cond else b) with every mix of matrix/scalar
    # operands for cond, lhs and rhs.
    cond_is_mat, lhs_is_mat, rhs_is_mat = is_mat
    x = ti.Matrix.field(3, 2, ti.i32, 1)
    if cond_is_mat:
        y = ti.Matrix.field(3, 2, ti.i32, ())
    else:
        y = ti.field(ti.i32, ())
    if lhs_is_mat:
        z = ti.Matrix.field(3, 2, ti.i32, ())
    else:
        z = ti.field(ti.i32, ())
    if rhs_is_mat:
        w = ti.Matrix.field(3, 2, ti.i32, ())
    else:
        w = ti.field(ti.i32, ())

    # Condition values mix zeros and non-zeros to hit both branches.
    if cond_is_mat:
        y.from_numpy(np.array([[0, 2], [9, 0], [7, 4]], np.int32))
    else:
        y[None] = 0
    if lhs_is_mat:
        z.from_numpy(np.array([[4, 5], [6, 3], [9, 2]], np.int32))
    else:
        z[None] = 5
    if rhs_is_mat:
        w.from_numpy(np.array([[4, 5], [6, 3], [9, 2]], np.int32))
    else:
        w[None] = 4

    @ti.kernel
    def func():
        x[0] = z[None] if y[None] else w[None]

    func()
    x = x.to_numpy()
    y = y.to_numpy()
    z = z.to_numpy()
    w = w.to_numpy()
    # Expected: element-wise select — z where cond is non-zero, else w.
    assert test_utils.allclose(
        x[0],
        np.int32(np.bool_(y)) * z + np.int32(1 - np.bool_(y)) * w)
diff --git a/tests/_python_orig/test_empty.py b/tests/_python_orig/test_empty.py
new file mode 100644
index 000000000..80ac246bb
--- /dev/null
+++ b/tests/_python_orig/test_empty.py
@@ -0,0 +1,21 @@
+import taichi as ti
+from tests import test_utils
+
+
@test_utils.test()
def test_empty():
    # A kernel with an empty body should compile and run without error.
    @ti.kernel
    def func():
        pass

    func()


@test_utils.test()
def test_empty_args():
    # An empty kernel taking a scalar and an external array should still
    # compile and accept the arguments.
    @ti.kernel
    def func(x: ti.i32, arr: ti.ext_arr()):
        pass

    import numpy as np
    func(42, np.arange(10, dtype=np.float32))
diff --git a/tests/_python_orig/test_exception.py b/tests/_python_orig/test_exception.py
new file mode 100644
index 000000000..2cace7487
--- /dev/null
+++ b/tests/_python_orig/test_exception.py
@@ -0,0 +1,158 @@
+from inspect import currentframe, getframeinfo
+from sys import version_info
+
+import pytest
+from tests import test_utils
+
+import taichi as ti
+
+
@test_utils.test()
def test_exception_multiline():
    # Error excerpts for a multi-line call should point at the first line.
    # NOTE: the assertions depend on the exact line offsets between the
    # getframeinfo() call and the kernel body — do not insert lines between.
    frameinfo = getframeinfo(currentframe())
    with pytest.raises(ti.TaichiNameError) as e:
        # yapf: disable
        @ti.kernel
        def foo():
            aaaa(111,
                 1211222,

                 23)
        foo()
    # yapf: enable

    # Python >= 3.8 adds a caret (^) marker line to the excerpt.
    if version_info < (3, 8):
        msg = f"""
On line {frameinfo.lineno + 5} of file "{frameinfo.filename}", in foo:
    aaaa(111,"""
    else:
        msg = f"""
On line {frameinfo.lineno + 5} of file "{frameinfo.filename}", in foo:
    aaaa(111,
    ^^^^"""
    print(e.value.args[0])
    assert e.value.args[0][:len(msg)] == msg
+
+
@test_utils.test()
def test_exception_from_func():
    # An error raised deep inside nested ti.func calls should produce a
    # full foo -> bar -> baz traceback in the message.
    # NOTE: the assertions depend on the exact line offsets below — do not
    # insert lines between the getframeinfo() call and foo().
    frameinfo = getframeinfo(currentframe())
    with pytest.raises(ti.TaichiNameError) as e:

        @ti.func
        def baz():
            t()

        @ti.func
        def bar():
            baz()

        @ti.kernel
        def foo():
            bar()

        foo()
    lineno = frameinfo.lineno
    file = frameinfo.filename
    # Python >= 3.8 adds caret (^) marker lines to each frame's excerpt.
    if version_info < (3, 8):
        msg = f"""
On line {lineno + 13} of file "{file}", in foo:
    bar()
On line {lineno + 9} of file "{file}", in bar:
    baz()
On line {lineno + 5} of file "{file}", in baz:
    t()"""
    else:
        msg = f"""
On line {lineno + 13} of file "{file}", in foo:
    bar()
    ^^^^^
On line {lineno + 9} of file "{file}", in bar:
    baz()
    ^^^^^
On line {lineno + 5} of file "{file}", in baz:
    t()
    ^"""
    print(e.value.args[0])
    assert e.value.args[0][:len(msg)] == msg
+
+
# Verifies that whitespace (tabs) in the offending source line is handled
# when rendering the error excerpt. Line-offset sensitive — do not insert
# lines between getframeinfo() and foo().
@test_utils.test()
def test_tab():
    frameinfo = getframeinfo(currentframe())
    with pytest.raises(ti.TaichiNameError) as e:
        # yapf: disable
        @ti.kernel
        def foo():
            a(11,  22,   3)
        foo()
    # yapf: enable
    lineno = frameinfo.lineno
    file = frameinfo.filename
    if version_info < (3, 8):
        msg = f"""
On line {lineno + 5} of file "{file}", in foo:
    a(11, 22, 3)"""
    else:
        msg = f"""
On line {lineno + 5} of file "{file}", in foo:
    a(11, 22, 3)
    ^"""
    print(e.value.args[0])
    assert e.value.args[0][:len(msg)] == msg
+
+
# Verifies that an over-long offending line is wrapped in the error excerpt,
# with caret markers under every wrapped segment. Line-offset sensitive —
# do not insert lines between getframeinfo() and foo().
@test_utils.test()
def test_super_long_line():
    frameinfo = getframeinfo(currentframe())
    with pytest.raises(ti.TaichiNameError) as e:
        # yapf: disable
        @ti.kernel
        def foo():
            aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(111)
        foo()
    # yapf: enable
    lineno = frameinfo.lineno
    file = frameinfo.filename
    if version_info < (3, 8):
        msg = f"""
On line {lineno + 5} of file "{file}", in foo:
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(111)
"""
    else:
        msg = f"""
On line {lineno + 5} of file "{file}", in foo:
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbaaaaaa
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
bbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(111)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"""
    print(e.value.args[0])
    assert e.value.args[0][:len(msg)] == msg
+
+
@pytest.mark.skipif(version_info < (3, 8), reason="This is a feature for python>=3.8")
@test_utils.test()
def test_exception_in_node_with_body():
    # A compile error on a statement that owns a body (the `for`) should
    # caret-mark the whole header line. Line-offset sensitive — do not
    # insert lines between getframeinfo() and the kernel.
    frameinfo = getframeinfo(currentframe())
    @ti.kernel
    def foo():
        for i in range(1, 2, 3):
            a = 1
            b = 1
            c = 1
            d = 1

    with pytest.raises(ti.TaichiCompilationError) as e:
        foo()
    lineno = frameinfo.lineno
    file = frameinfo.filename
    msg = f"""
On line {lineno + 3} of file "{file}", in foo:
    for i in range(1, 2, 3):
    ^^^^^^^^^^^^^^^^^^^^^^^^
Range should have 1 or 2 arguments, found 3"""
    print(e.value.args[0])
    assert e.value.args[0] == msg
+
diff --git a/tests/_python_orig/test_expr_dict.py b/tests/_python_orig/test_expr_dict.py
new file mode 100644
index 000000000..2cb32de83
--- /dev/null
+++ b/tests/_python_orig/test_expr_dict.py
@@ -0,0 +1,44 @@
+import taichi as ti
+from tests import test_utils
+
+
@test_utils.test(ti.cpu)
def test_expr_dict_basic():
    # Dict literals built from kernel arguments should work inside kernels.
    @ti.kernel
    def func(u: int, v: float) -> float:
        x = {'foo': 2 + u, 'bar': 3 + v}
        return x['foo'] * 100 + x['bar']

    # foo = 4, bar = 3.1 -> 403.1
    assert func(2, 0.1) == test_utils.approx(403.1)


@test_utils.test(ti.cpu)
def test_expr_dict_field():
    # Dict literals built from global field reads should work inside kernels.
    a = ti.field(ti.f32, shape=(4, ))

    @ti.kernel
    def func() -> float:
        x = {'foo': 2 + a[0], 'bar': 3 + a[1]}
        return x['foo'] * 100 + x['bar']

    a[0] = 2
    a[1] = 0.1
    assert func() == test_utils.approx(403.1)


@test_utils.test(ti.cpu)
def test_dictcomp_multiple_ifs():
    # Dict comprehensions with multiple `if` clauses should be supported.
    n = 8
    x = ti.field(ti.i32, shape=(n, ))

    @ti.kernel
    def test() -> ti.i32:
        # Taichi doesn't support global fields appearing anywhere after "for"
        # here.
        a = {x[j]: x[j] + j for j in range(100) if j > 2 if j < 5}
        return sum(a.values())

    for i in range(n):
        x[i] = i * 2

    # Only j = 3 and j = 4 survive the filters.
    assert test() == (3 * 2 + 3) + (4 * 2 + 4)
diff --git a/tests/_python_orig/test_expr_list.py b/tests/_python_orig/test_expr_list.py
new file mode 100644
index 000000000..5f9ea669c
--- /dev/null
+++ b/tests/_python_orig/test_expr_list.py
@@ -0,0 +1,28 @@
+import taichi as ti
+from tests import test_utils
+
+
@test_utils.test(ti.cpu)
def test_expr_list_basic():
    # List literals built from kernel arguments should work inside kernels.
    @ti.kernel
    def func(u: int, v: float) -> float:
        x = [2 + u, 3 + v]
        return x[0] * 100 + x[1]

    # x = [3, 4.1] -> 304.1
    assert func(1, 1.1) == test_utils.approx(304.1)


@test_utils.test()
def test_listcomp_multiple_ifs():
    # List comprehensions with multiple `if` clauses should be supported.
    x = ti.field(ti.i32, shape=(4, ))

    @ti.kernel
    def test() -> ti.i32:
        # Taichi doesn't support global fields appearing anywhere after "for"
        # here.
        a = [x[0] for j in range(100) if j > 2 if j < 5]
        return sum(a)

    # Only j = 3 and j = 4 pass the filters, so the sum is x[0] * 2.
    for i in range(6):
        x[0] = i
        assert test() == i * 2
diff --git a/tests/_python_orig/test_external_func.py b/tests/_python_orig/test_external_func.py
new file mode 100644
index 000000000..b7c4bb22f
--- /dev/null
+++ b/tests/_python_orig/test_external_func.py
@@ -0,0 +1,89 @@
+import ctypes
+import os
+import shutil
+import tempfile
+
+import pytest
+from taichi.lang.util import has_clangpp
+
+import taichi as ti
+from tests import test_utils
+
+
@pytest.mark.skipif(not has_clangpp(), reason='Clang not installed.')
@test_utils.test(arch=[ti.cpu, ti.cuda])
def test_source_builder_from_source():
    # Compile an inline C++ snippet with SourceBuilder and call its
    # extern "C" functions from inside a kernel.
    source_bc = '''
    extern "C" {
        void add_and_mul(float *a, float *b, float *c, float *d, int *e) {
            *c = (*a) + (*b);
            *d = (*a) * (*b);
            *e = int((*a) * (*b) + (*a));
        }
        void pow_int(int *a, int *b, int *c) {
            int ret = 1;
            for (int i = 0; i < (*b); i++)
                ret = ret * (*a);
            *c = ret;
        }
    }
    '''
    sb_bc = ti.lang.source_builder.SourceBuilder.from_source(source_bc)

    @ti.kernel
    def func_bc() -> ti.i32:
        a = 2.0
        b = 3.0
        c = 0.0
        d = 0.0
        e = 3
        sb_bc.add_and_mul(a, b, c, d, e)
        p = 0
        c_plus_d = int(c + d)
        sb_bc.pow_int(c_plus_d, e, p)
        return p

    # c = 5, d = 6, e = 8 -> pow_int(11, 8) = 11**8
    assert func_bc() == 11**8
+
+
@pytest.mark.skipif(not has_clangpp(), reason='Clang not installed.')
@test_utils.test(arch=[ti.cpu, ti.cuda])
def test_source_builder_from_file():
    # Same as test_source_builder_from_source, but the C++ snippet is
    # loaded from a file on disk via SourceBuilder.from_file.
    source_code = '''
    extern "C" {
        void add_and_mul(float *a, float *b, float *c, float *d, int *e) {
            *c = (*a) + (*b);
            *d = (*a) * (*b);
            *e = int((*a) * (*b) + (*a));
        }
        void pow_int(int *a, int *b, int *c) {
            int ret = 1;
            for (int i = 0; i < (*b); i++)
                ret = ret * (*a);
            *c = ret;
        }
    }
    '''

    td = tempfile.mkdtemp()
    # Fix: clean the temp dir up even when compilation or the kernel fails;
    # previously rmtree only ran on success, leaking the directory.
    try:
        fn = os.path.join(td, 'source.cpp')
        with open(fn, 'w') as f:
            f.write(source_code)
        sb_bc = ti.lang.source_builder.SourceBuilder.from_file(fn)

        @ti.kernel
        def func_bc() -> ti.i32:
            a = 2.0
            b = 3.0
            c = 0.0
            d = 0.0
            e = 3
            sb_bc.add_and_mul(a, b, c, d, e)
            p = 0
            c_plus_d = int(c + d)
            sb_bc.pow_int(c_plus_d, e, p)
            return p

        # c = 5, d = 6, e = 8 -> pow_int(11, 8) = 11**8
        assert func_bc() == 11**8
    finally:
        shutil.rmtree(td)
diff --git a/tests/_python_orig/test_f16.py b/tests/_python_orig/test_f16.py
new file mode 100644
index 000000000..00709f0a9
--- /dev/null
+++ b/tests/_python_orig/test_f16.py
@@ -0,0 +1,303 @@
+import math
+
+import numpy as np
+import pytest
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+archs_support_f16 = [ti.cpu, ti.cuda, ti.vulkan]
+
+
@test_utils.test(arch=archs_support_f16)
def test_snode_read_write():
    # Round-trip a value through an f16 field via Python-side read/write.
    dtype = ti.f16
    x = ti.field(dtype, shape=())
    x[None] = 0.3
    print(x[None])
    # Half precision only gives ~3 decimal digits.
    assert (x[None] == test_utils.approx(0.3, rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_float16():
    # ti.float16 alias behaves the same as ti.f16.
    dtype = ti.float16
    x = ti.field(dtype, shape=())
    x[None] = 0.3
    print(x[None])
    assert (x[None] == test_utils.approx(0.3, rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_to_numpy():
    # f16 field -> numpy export preserves exactly-representable values.
    n = 16
    x = ti.field(ti.f16, shape=n)

    @ti.kernel
    def init():
        for i in x:
            x[i] = i * 2

    init()
    y = x.to_numpy()
    for i in range(n):
        assert (y[i] == 2 * i)


@test_utils.test(arch=archs_support_f16)
def test_from_numpy():
    # numpy half array -> f16 field import, then overwrite in a kernel.
    n = 16
    y = ti.field(dtype=ti.f16, shape=n)
    x = np.arange(n, dtype=np.half)
    y.from_numpy(x)

    @ti.kernel
    def init():
        for i in y:
            y[i] = 3 * i

    init()
    z = y.to_numpy()
    for i in range(n):
        assert (z[i] == i * 3)


@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
@test_utils.test(arch=archs_support_f16)
def test_to_torch():
    # f16 field -> torch tensor export preserves exactly-representable values.
    n = 16
    x = ti.field(ti.f16, shape=n)

    @ti.kernel
    def init():
        for i in x:
            x[i] = i * 2

    init()
    y = x.to_torch()
    print(y)
    for i in range(n):
        assert (y[i] == 2 * i)
+
+
@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
@test_utils.test(arch=archs_support_f16)
def test_from_torch():
    # torch half tensor -> f16 field import, then overwrite in a kernel.
    import torch
    n = 16
    y = ti.field(dtype=ti.f16, shape=n)
    # torch doesn't have rand implementation for float16 so we need to create float first and then convert.
    # Fix: torch.range is deprecated (and end-inclusive); torch.arange(0, n)
    # yields the identical 0..n-1 sequence.
    x = torch.arange(0, n).to(torch.float16)
    y.from_torch(x)

    @ti.kernel
    def init():
        for i in y:
            y[i] = 3 * i

    init()
    z = y.to_torch()
    for i in range(n):
        assert (z[i] == i * 3)
+
+
@test_utils.test(arch=archs_support_f16)
def test_binary_op():
    # f16 add/mul inside a kernel: x = (y + z) * z.
    dtype = ti.f16
    x = ti.field(dtype, shape=())
    y = ti.field(dtype, shape=())
    z = ti.field(dtype, shape=())

    @ti.kernel
    def add():
        x[None] = y[None] + z[None]
        x[None] = x[None] * z[None]

    y[None] = 0.2
    z[None] = 0.72
    add()
    u = x.to_numpy()
    # u is a 0-d array; u[None] indexes it with np.newaxis — presumably
    # relied on for the approx comparison (TODO confirm).
    assert (u[None] == test_utils.approx(0.6624, rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_rand_promote():
    # ti.random with an f16 dtype should be generated/stored without error.
    dtype = ti.f16
    x = ti.field(dtype, shape=(4, 4))

    @ti.kernel
    def init():
        for i, j in x:
            x[i, j] = ti.random(dtype=dtype)
            print(x[i, j])

    init()


@test_utils.test(arch=archs_support_f16)
def test_unary_op():
    # f16 negation, floor and ceil.
    dtype = ti.f16
    x = ti.field(dtype, shape=())
    y = ti.field(dtype, shape=())

    @ti.kernel
    def foo():
        x[None] = -y[None]
        x[None] = ti.floor(x[None])
        y[None] = ti.ceil(y[None])

    y[None] = -1.4
    foo()
    # floor(1.4) == 1, ceil(-1.4) == -1.
    assert (x[None] == test_utils.approx(1, rel=1e-3))
    assert (y[None] == test_utils.approx(-1, rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_extra_unary_promote():
    # abs() on f16 values.
    dtype = ti.f16
    x = ti.field(dtype, shape=())
    y = ti.field(dtype, shape=())

    @ti.kernel
    def foo():
        x[None] = abs(y[None])

    y[None] = -0.3
    foo()
    assert (x[None] == test_utils.approx(0.3, rel=1e-3))


@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan)
def test_binary_extra_promote():
    # pow and atan2 with f16 operands (atan2 mixes an f16 with a literal).
    x = ti.field(dtype=ti.f16, shape=())
    y = ti.field(dtype=ti.f16, shape=())
    z = ti.field(dtype=ti.f16, shape=())

    @ti.kernel
    def foo():
        y[None] = x[None]**2
        z[None] = ti.atan2(y[None], 0.3)

    x[None] = 0.1
    foo()
    assert (z[None] == test_utils.approx(math.atan2(0.1**2, 0.3), rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_arg_f16():
    # f16 kernel arguments should be accepted and promoted correctly.
    dtype = ti.f16
    x = ti.field(dtype, shape=())
    y = ti.field(dtype, shape=())

    @ti.kernel
    def foo(a: ti.f16):
        x[None] = y[None] + a

    y[None] = -0.3
    foo(1.2)
    assert (x[None] == test_utils.approx(0.9, rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_fractal_f16():
    # Smoke test: run the Julia-set fractal demo entirely in f16.
    n = 320
    pixels = ti.field(dtype=ti.f16, shape=(n * 2, n))

    @ti.func
    def complex_sqr(z):
        return ti.Vector([z[0]**2 - z[1]**2, z[1] * z[0] * 2], dt=ti.f16)

    @ti.kernel
    def paint(t: float):
        for i, j in pixels:  # Parallelized over all pixels
            c = ti.Vector([-0.8, ti.cos(t) * 0.2], dt=ti.f16)
            z = ti.Vector([i / n - 1, j / n - 0.5], dt=ti.f16) * 2
            iterations = 0
            while z.norm() < 20 and iterations < 50:
                z = complex_sqr(z) + c
                iterations += 1
            pixels[i, j] = 1 - iterations * 0.02

    paint(0.03)


# TODO(): Vulkan support
@test_utils.test(arch=[ti.cpu, ti.cuda])
def test_atomic_add_f16():
    # Parallel f16 atomic add must agree with the serial sum.
    f = ti.field(dtype=ti.f16, shape=(2))

    @ti.kernel
    def foo():
        # Parallel sum
        for i in range(1000):
            f[0] += 1.12

        # Serial sum
        for _ in range(1):
            for i in range(1000):
                f[1] = f[1] + 1.12

    foo()
    assert (f[0] == test_utils.approx(f[1], rel=1e-3))


# TODO(): Vulkan support
@test_utils.test(arch=[ti.cpu, ti.cuda])
def test_atomic_max_f16():
    # Parallel f16 atomic max must agree with the serial max.
    f = ti.field(dtype=ti.f16, shape=(2))

    @ti.kernel
    def foo():
        # Parallel max
        for i in range(1000):
            ti.atomic_max(f[0], 1.12 * i)

        # Serial max
        for _ in range(1):
            for i in range(1000):
                f[1] = ti.max(1.12 * i, f[1])

    foo()
    assert (f[0] == test_utils.approx(f[1], rel=1e-3))


# TODO(): Vulkan support
@test_utils.test(arch=[ti.cpu, ti.cuda])
def test_atomic_min_f16():
    # Parallel f16 atomic min must agree with the serial min.
    f = ti.field(dtype=ti.f16, shape=(2))

    @ti.kernel
    def foo():
        # Parallel min
        for i in range(1000):
            ti.atomic_min(f[0], -3.13 * i)

        # Serial min
        for _ in range(1):
            for i in range(1000):
                f[1] = ti.min(-3.13 * i, f[1])

    foo()
    assert (f[0] == test_utils.approx(f[1], rel=1e-3))


@test_utils.test(arch=archs_support_f16)
def test_cast_f32_to_f16():
    # Narrowing cast f32 -> f16 of an exactly-representable product.
    @ti.kernel
    def func() -> ti.f16:
        a = ti.cast(23.0, ti.f32)
        b = ti.cast(4.0, ti.f32)
        return ti.cast(a * b, ti.f16)

    assert func() == pytest.approx(23.0 * 4.0, 1e-4)


@test_utils.test(arch=archs_support_f16, require=ti.extension.data64)
def test_cast_f64_to_f16():
    # Narrowing cast f64 -> f16 of an exactly-representable product.
    @ti.kernel
    def func() -> ti.f16:
        a = ti.cast(23.0, ti.f64)
        b = ti.cast(4.0, ti.f64)
        return ti.cast(a * b, ti.f16)

    assert func() == pytest.approx(23.0 * 4.0, 1e-4)
diff --git a/tests/_python_orig/test_field.py b/tests/_python_orig/test_field.py
new file mode 100644
index 000000000..7f1bcd1fb
--- /dev/null
+++ b/tests/_python_orig/test_field.py
@@ -0,0 +1,183 @@
+'''
+To test our new `ti.field` API is functional (#1500)
+'''
+
+import pytest
+from taichi.lang import impl
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+data_types = [ti.i32, ti.f32, ti.i64, ti.f64]
+field_shapes = [(), 8, (6, 12)]
+vector_dims = [3]
+matrix_dims = [(1, 2), (2, 3)]
+
+
@pytest.mark.parametrize('dtype', data_types)
@pytest.mark.parametrize('shape', field_shapes)
@test_utils.test(arch=get_host_arch_list())
def test_scalar_field(dtype, shape):
    # Scalar fields report their shape (int shapes normalize to 1-tuples)
    # and dtype.
    x = ti.field(dtype, shape)

    if isinstance(shape, tuple):
        assert x.shape == shape
    else:
        assert x.shape == (shape, )

    assert x.dtype == dtype


@pytest.mark.parametrize('n', vector_dims)
@pytest.mark.parametrize('dtype', data_types)
@pytest.mark.parametrize('shape', field_shapes)
@test_utils.test(arch=get_host_arch_list())
def test_vector_field(n, dtype, shape):
    # Vector fields expose n rows and a single column (m == 1).
    x = ti.Vector.field(n, dtype, shape)

    if isinstance(shape, tuple):
        assert x.shape == shape
    else:
        assert x.shape == (shape, )

    assert x.dtype == dtype
    assert x.n == n
    assert x.m == 1


@pytest.mark.parametrize('n,m', matrix_dims)
@pytest.mark.parametrize('dtype', data_types)
@pytest.mark.parametrize('shape', field_shapes)
@test_utils.test(arch=get_host_arch_list())
def test_matrix_field(n, m, dtype, shape):
    # Matrix fields expose their n x m element dimensions.
    x = ti.Matrix.field(n, m, dtype=dtype, shape=shape)

    if isinstance(shape, tuple):
        assert x.shape == shape
    else:
        assert x.shape == (shape, )

    assert x.dtype == dtype
    assert x.n == n
    assert x.m == m


# NOTE(review): 'scalr' is a typo for 'scalar' in the next two test names;
# kept as-is since renaming tests is a separate change.
@pytest.mark.parametrize('dtype', data_types)
@pytest.mark.parametrize('shape', field_shapes)
@test_utils.test(arch=get_host_arch_list())
def test_scalr_field_from_numpy(dtype, shape):
    # from_numpy succeeds when the array's dtype and shape match the field.
    import numpy as np
    x = ti.field(dtype, shape)
    # use the corresponding dtype for the numpy array.
    numpy_dtypes = {
        ti.i32: np.int32,
        ti.f32: np.float32,
        ti.f64: np.float64,
        ti.i64: np.int64,
    }
    arr = np.empty(shape, dtype=numpy_dtypes[dtype])
    x.from_numpy(arr)


@pytest.mark.parametrize('dtype', data_types)
@pytest.mark.parametrize('shape', field_shapes)
@test_utils.test(arch=get_host_arch_list())
def test_scalr_field_from_numpy_with_mismatch_shape(dtype, shape):
    # from_numpy must reject an array whose shape differs from the field's.
    import numpy as np
    x = ti.field(dtype, shape)
    numpy_dtypes = {
        ti.i32: np.int32,
        ti.f32: np.float32,
        ti.f64: np.float64,
        ti.i64: np.int64,
    }
    # compose the mismatch shape for every ti.field.
    # set the shape to (2, 3) by default, if the ti.field shape is a tuple, set it to 1.
    mismatch_shape = (2, 3)
    if isinstance(shape, tuple):
        mismatch_shape = 1
    arr = np.empty(mismatch_shape, dtype=numpy_dtypes[dtype])
    with pytest.raises(ValueError):
        x.from_numpy(arr)


@test_utils.test(arch=get_host_arch_list())
def test_field_needs_grad():
    # Just make sure the usage doesn't crash, see #1545
    n = 8
    m1 = ti.field(dtype=ti.f32, shape=n, needs_grad=True)
    m2 = ti.field(dtype=ti.f32, shape=n, needs_grad=True)
    gr = ti.field(dtype=ti.f32, shape=n)

    @ti.kernel
    def func():
        for i in range(n):
            gr[i] = m1.grad[i] + m2.grad[i]

    func()


@pytest.mark.parametrize('dtype', [ti.f32, ti.f64])
def test_default_fp(dtype):
    # `float` in field declarations resolves to the configured default_fp.
    # Calls ti.init itself, so no @test_utils.test decorator here.
    ti.init(default_fp=dtype)

    x = ti.Vector.field(2, float, ())

    assert x.dtype == impl.get_runtime().default_fp


@pytest.mark.parametrize('dtype', [ti.i32, ti.i64])
def test_default_ip(dtype):
    # `int` in field declarations resolves to the configured default_ip.
    ti.init(default_ip=dtype)

    x = ti.Vector.field(2, int, ())

    assert x.dtype == impl.get_runtime().default_ip


@test_utils.test()
def test_field_name():
    # The name= argument is surfaced via _name on fields and their snodes.
    a = ti.field(dtype=ti.f32, shape=(2, 3), name='a')
    b = ti.Vector.field(3, dtype=ti.f32, shape=(2, 3), name='b')
    c = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(5, 4), name='c')
    assert a._name == 'a'
    assert b._name == 'b'
    assert c._name == 'c'
    assert b.snode._name == 'b'
    d = []
    for i in range(10):
        d.append(ti.field(dtype=ti.f32, shape=(2, 3), name=f'd{i}'))
        assert d[i]._name == f'd{i}'


@test_utils.test()
@pytest.mark.parametrize('shape', field_shapes)
@pytest.mark.parametrize('dtype', [ti.i32, ti.f32])
def test_field_copy_from(shape, dtype):
    # copy_from transfers values (with dtype conversion) between fields of
    # the same shape; the destination's shape and dtype are unchanged.
    x = ti.field(dtype=ti.f32, shape=shape)
    other = ti.field(dtype=dtype, shape=shape)
    other.fill(1)
    x.copy_from(other)
    # Normalize 1-tuples back to the scalar form used in the parametrize.
    convert = lambda arr: arr[0] if len(arr) == 1 else arr
    assert (convert(x.shape) == shape)
    assert (x.dtype == ti.f32)
    assert ((x.to_numpy() == 1).all())


@test_utils.test()
def test_field_copy_from_with_mismatch_shape():
    # copy_from must reject fields of a different shape or rank.
    x = ti.field(dtype=ti.f32, shape=(2, 3))
    for other_shape in [(2, ), (2, 2), (2, 3, 4)]:
        other = ti.field(dtype=ti.f16, shape=other_shape)
        with pytest.raises(ValueError):
            x.copy_from(other)


@test_utils.test()
def test_field_copy_from_with_non_filed_object():
    # copy_from only accepts Taichi fields, not raw numpy arrays.
    import numpy as np
    x = ti.field(dtype=ti.f32, shape=(2, 3))
    other = np.zeros((2, 3))
    with pytest.raises(TypeError):
        x.copy_from(other)
diff --git a/tests/_python_orig/test_fields_builder.py b/tests/_python_orig/test_fields_builder.py
new file mode 100644
index 000000000..80e7db5a6
--- /dev/null
+++ b/tests/_python_orig/test_fields_builder.py
@@ -0,0 +1,180 @@
+import pytest
+from taichi.lang.exception import TaichiRuntimeError
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal])
+def test_fields_with_shape():
+ shape = 5
+ x = ti.field(ti.f32, shape=shape)
+
+ @ti.kernel
+ def assign_field_single():
+ for i in range(shape):
+ x[i] = i
+
+ assign_field_single()
+ for i in range(shape):
+ assert x[i] == i
+
+ y = ti.field(ti.f32, shape=shape)
+
+ @ti.kernel
+ def assign_field_multiple():
+ for i in range(shape):
+ y[i] = i * 2
+ for i in range(shape):
+ x[i] = i * 3
+
+ assign_field_multiple()
+ for i in range(shape):
+ assert x[i] == i * 3
+ assert y[i] == i * 2
+
+ assign_field_single()
+ for i in range(shape):
+ assert x[i] == i
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal])
+def test_fields_builder_dense():
+ shape = 5
+ fb1 = ti.FieldsBuilder()
+ x = ti.field(ti.f32)
+ fb1.dense(ti.i, shape).place(x)
+ fb1.finalize()
+
+ @ti.kernel
+ def assign_field_single():
+ for i in range(shape):
+ x[i] = i * 3
+
+ assign_field_single()
+ for i in range(shape):
+ assert x[i] == i * 3
+
+ fb2 = ti.FieldsBuilder()
+ y = ti.field(ti.f32)
+ fb2.dense(ti.i, shape).place(y)
+ z = ti.field(ti.f32)
+ fb2.dense(ti.i, shape).place(z)
+ fb2.finalize()
+
+ @ti.kernel
+ def assign_field_multiple():
+ for i in range(shape):
+ x[i] = i * 2
+ for i in range(shape):
+ y[i] = i + 5
+ for i in range(shape):
+ z[i] = i + 10
+
+ assign_field_multiple()
+ for i in range(shape):
+ assert x[i] == i * 2
+ assert y[i] == i + 5
+ assert z[i] == i + 10
+
+ assign_field_single()
+ for i in range(shape):
+ assert x[i] == i * 3
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.metal])
+def test_fields_builder_pointer():
+ shape = 5
+ fb1 = ti.FieldsBuilder()
+ x = ti.field(ti.f32)
+ fb1.pointer(ti.i, shape).place(x)
+ fb1.finalize()
+
+ @ti.kernel
+ def assign_field_single():
+ for i in range(shape):
+ x[i] = i * 3
+
+ assign_field_single()
+ for i in range(shape):
+ assert x[i] == i * 3
+
+ fb2 = ti.FieldsBuilder()
+ y = ti.field(ti.f32)
+ fb2.pointer(ti.i, shape).place(y)
+ z = ti.field(ti.f32)
+ fb2.pointer(ti.i, shape).place(z)
+ fb2.finalize()
+
+ @ti.kernel
+ def assign_field_multiple_range_for():
+ for i in range(shape):
+ x[i] = i * 2
+ for i in range(shape):
+ y[i] = i + 5
+ for i in range(shape):
+ z[i] = i + 10
+
+ assign_field_multiple_range_for()
+ for i in range(shape):
+ assert x[i] == i * 2
+ assert y[i] == i + 5
+ assert z[i] == i + 10
+
+ @ti.kernel
+ def assign_field_multiple_struct_for():
+ for i in y:
+ y[i] += 5
+ for i in z:
+ z[i] -= 5
+
+ assign_field_multiple_struct_for()
+ for i in range(shape):
+ assert y[i] == i + 10
+ assert z[i] == i + 5
+
+ assign_field_single()
+ for i in range(shape):
+ assert x[i] == i * 3
+
+
+# We currently only consider data types that all platforms support.
+# See https://docs.taichi.graphics/lang/articles/basic/type#supported-primitive-types for more details.
+@pytest.mark.parametrize('test_1d_size', [1, 10, 100])
+@pytest.mark.parametrize('field_type', [ti.f32, ti.i32])
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal])
+def test_fields_builder_destroy(test_1d_size, field_type):
+ def test_for_single_destroy_multi_fields():
+ fb = ti.FieldsBuilder()
+ for create_field_idx in range(10):
+ field = ti.field(field_type)
+ fb.dense(ti.i, test_1d_size).place(field)
+ fb_snode_tree = fb.finalize()
+ fb_snode_tree.destroy()
+
+ def test_for_multi_destroy_multi_fields():
+ fb0 = ti.FieldsBuilder()
+ fb1 = ti.FieldsBuilder()
+
+ for create_field_idx in range(10):
+ field0 = ti.field(field_type)
+ field1 = ti.field(field_type)
+
+ fb0.dense(ti.i, test_1d_size).place(field0)
+ fb1.pointer(ti.i, test_1d_size).place(field1)
+
+ fb0_snode_tree = fb0.finalize()
+ fb1_snode_tree = fb1.finalize()
+
+ fb0_snode_tree.destroy()
+ fb1_snode_tree.destroy()
+
+ def test_for_raise_destroy_twice():
+ fb = ti.FieldsBuilder()
+ a = ti.field(ti.f32)
+ fb.dense(ti.i, test_1d_size).place(a)
+ c = fb.finalize()
+
+ with pytest.raises(TaichiRuntimeError):
+ c.destroy()
+ c.destroy()
diff --git a/tests/_python_orig/test_fill.py b/tests/_python_orig/test_fill.py
new file mode 100644
index 000000000..5692fe388
--- /dev/null
+++ b/tests/_python_orig/test_fill.py
@@ -0,0 +1,71 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_fill_scalar():
+ val = ti.field(ti.i32)
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ for i in range(n):
+ for j in range(m):
+ val[i, j] = i + j * 3
+
+ val.fill(2)
+
+ for i in range(n):
+ for j in range(m):
+ assert val[i, j] == 2
+
+
+@test_utils.test()
+def test_fill_matrix_scalar():
+ val = ti.Matrix.field(2, 3, ti.i32)
+
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ for i in range(n):
+ for j in range(m):
+ for p in range(2):
+ for q in range(3):
+ val[i, j][p, q] = i + j * 3
+
+ val.fill(2)
+
+ for i in range(n):
+ for j in range(m):
+ for p in range(2):
+ for q in range(3):
+ assert val[i, j][p, q] == 2
+
+
+@test_utils.test()
+def test_fill_matrix_matrix():
+ val = ti.Matrix.field(2, 3, ti.i32)
+
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ for i in range(n):
+ for j in range(m):
+ for p in range(2):
+ for q in range(3):
+ val[i, j][p, q] = i + j * 3
+
+ mat = ti.Matrix([[0, 1, 2], [2, 3, 4]])
+
+ val.fill(mat)
+
+ for i in range(n):
+ for j in range(m):
+ for p in range(2):
+ for q in range(3):
+ assert val[i, j][p, q] == mat(p, q)
diff --git a/tests/_python_orig/test_for_break.py b/tests/_python_orig/test_for_break.py
new file mode 100644
index 000000000..ba858f9f0
--- /dev/null
+++ b/tests/_python_orig/test_for_break.py
@@ -0,0 +1,94 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_for_break():
+ x = ti.field(ti.i32)
+ N, M = 4, 4
+ ti.root.dense(ti.ij, (N, M)).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ for j in range(M):
+ if j > i:
+ break
+ x[i, j] = 100 * i + j
+
+ func()
+ for i in range(N):
+ for j in range(M):
+ if j > i:
+ assert x[i, j] == 0
+ else:
+ assert x[i, j] == 100 * i + j
+
+
+@test_utils.test()
+def test_for_break2():
+ x = ti.field(ti.i32)
+ N, M = 8, 8
+ ti.root.dense(ti.ij, (N, M)).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ for j in range(M):
+ x[i, j] = 100 * i + j
+ if j > i:
+ break
+
+ func()
+ for i in range(N):
+ for j in range(M):
+ if j > i + 1:
+ assert x[i, j] == 0
+ else:
+ assert x[i, j] == 100 * i + j
+
+
+@test_utils.test(exclude=ti.vulkan)
+def test_for_break3():
+ x = ti.field(ti.i32)
+ N, M = 8, 8
+ ti.root.dense(ti.ij, (N, M)).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ for j in range(i, M - i):
+ if i == 0:
+ break
+ x[i, j] = 100 * i + j
+
+ func()
+ for i in range(N):
+ for j in range(M):
+ if j < i or j >= M - i or i == 0:
+ assert x[i, j] == 0
+ else:
+ assert x[i, j] == 100 * i + j
+
+
+@test_utils.test()
+def test_for_break_complex():
+ x = ti.field(ti.i32)
+ N, M = 16, 32
+ ti.root.dense(ti.ij, (N, M)).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(1, N):
+ for j in range(3, M):
+ if j > i:
+ break
+ x[i, j] = 100 * i + j
+
+ func()
+ for i in range(N):
+ for j in range(M):
+ if i < 1 or j < 3 or j > i:
+ assert x[i, j] == 0
+ else:
+ assert x[i, j] == 100 * i + j
diff --git a/tests/_python_orig/test_for_group_mismatch.py b/tests/_python_orig/test_for_group_mismatch.py
new file mode 100644
index 000000000..e25cea450
--- /dev/null
+++ b/tests/_python_orig/test_for_group_mismatch.py
@@ -0,0 +1,100 @@
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_struct_for_mismatch():
+ x = ti.field(ti.f32, (3, 4))
+
+ @ti.kernel
+ def func():
+ for i in x:
+ print(i)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_struct_for_mismatch2():
+ x = ti.field(ti.f32, (3, 4))
+
+ @ti.kernel
+ def func():
+ for i, j, k in x:
+ print(i, j, k)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def _test_grouped_struct_for_mismatch():
+ # doesn't work for now
+ # need grouped refactor
+    # for now, it just throws an unfriendly message:
+ # AssertionError: __getitem__ cannot be called in Python-scope
+ x = ti.field(ti.f32, (3, 4))
+
+ @ti.kernel
+ def func():
+ for i, j in ti.grouped(x):
+ print(i, j)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def _test_ndrange_for_mismatch():
+ # doesn't work for now
+ # need ndrange refactor
+ @ti.kernel
+ def func():
+ for i in ti.ndrange(3, 4):
+ print(i)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def _test_ndrange_for_mismatch2():
+ # doesn't work for now
+ # need ndrange and grouped refactor
+ @ti.kernel
+ def func():
+ for i, j, k in ti.ndrange(3, 4):
+ print(i, j, k)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def _test_grouped_ndrange_for_mismatch():
+ # doesn't work for now
+ # need ndrange and grouped refactor
+ @ti.kernel
+ def func():
+ for i in ti.grouped(ti.ndrange(3, 4)):
+ print(i)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def _test_static_ndrange_for_mismatch():
+ # doesn't work for now
+ # need ndrange and static refactor
+ @ti.kernel
+ def func():
+ for i in ti.static(ti.ndrange(3, 4)):
+ print(i)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
diff --git a/tests/_python_orig/test_fp_flush_to_zero.py b/tests/_python_orig/test_fp_flush_to_zero.py
new file mode 100644
index 000000000..d3c06e4c9
--- /dev/null
+++ b/tests/_python_orig/test_fp_flush_to_zero.py
@@ -0,0 +1,32 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_ftz_f32():
+ a = ti.field(dtype=ti.f32, shape=2)
+
+ @ti.kernel
+ def foo():
+ a[0] = 1e-45
+ a[1] = 1e-10 * 1e-35
+
+ foo()
+ assert a[0] == 0
+ assert a[1] == 0
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_ftz_f64():
+ a = ti.field(dtype=ti.f64, shape=2)
+
+ @ti.kernel
+ def foo():
+ a[0] = 1e-323
+ x = 1e-300
+ y = 1e-23
+ a[1] = x * y
+
+ foo()
+ assert a[0] == 0
+ assert a[1] == 0
diff --git a/tests/_python_orig/test_function.py b/tests/_python_orig/test_function.py
new file mode 100644
index 000000000..a0101e375
--- /dev/null
+++ b/tests/_python_orig/test_function.py
@@ -0,0 +1,289 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(experimental_real_function=True)
+def test_function_without_return():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.func
+ def foo(val: ti.i32):
+ x[None] += val
+
+ @ti.kernel
+ def run():
+ foo(40)
+ foo(2)
+
+ x[None] = 0
+ run()
+ assert x[None] == 42
+
+
+@test_utils.test(experimental_real_function=True)
+def test_function_with_return():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.func
+ def foo(val: ti.i32) -> ti.i32:
+ x[None] += val
+ return val
+
+ @ti.kernel
+ def run():
+ a = foo(40)
+ foo(2)
+ assert a == 40
+
+ x[None] = 0
+ run()
+ assert x[None] == 42
+
+
+@test_utils.test(experimental_real_function=True, exclude=[ti.opengl, ti.cc])
+def test_function_with_multiple_last_return():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.func
+ def foo(val: ti.i32) -> ti.i32:
+ if x[None]:
+ x[None] += val * 2
+ return val * 2
+ else:
+ x[None] += val
+ return val
+
+ @ti.kernel
+ def run():
+ a = foo(40)
+ foo(1)
+ assert a == 40
+
+ x[None] = 0
+ run()
+ assert x[None] == 42
+
+
+@test_utils.test(experimental_real_function=True)
+def test_call_expressions():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.func
+ def foo(val: ti.i32) -> ti.i32:
+ if x[None] > 10:
+ x[None] += 1
+ x[None] += val
+ return 0
+
+ @ti.kernel
+ def run():
+ assert foo(15) == 0
+ assert foo(10) == 0
+
+ x[None] = 0
+ run()
+ assert x[None] == 26
+
+
+@test_utils.test(arch=ti.cpu, experimental_real_function=True)
+def test_failing_multiple_return():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.func
+ def foo(val: ti.i32) -> ti.i32:
+ if x[None] > 10:
+ if x[None] > 20:
+ return 1
+ x[None] += 1
+ x[None] += val
+ return 0
+
+ @ti.kernel
+ def run():
+ assert foo(15) == 0
+ assert foo(10) == 0
+ assert foo(100) == 1
+
+ with pytest.raises(AssertionError):
+ x[None] = 0
+ run()
+ assert x[None] == 26
+
+
+@test_utils.test(experimental_real_function=True)
+def test_python_function():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.func
+ def inc(val: ti.i32):
+ x[None] += val
+
+ def identity(x):
+ return x
+
+ @ti.data_oriented
+ class A:
+ def __init__(self):
+ self.count = ti.field(ti.i32, shape=())
+ self.count[None] = 0
+
+ @ti.lang.kernel_impl.pyfunc
+ def dec(self, val: ti.i32) -> ti.i32:
+ self.count[None] += 1
+ x[None] -= val
+ return self.count[None]
+
+ @ti.kernel
+ def run(self) -> ti.i32:
+ a = self.dec(1)
+ identity(2)
+ inc(identity(3))
+ return a
+
+ a = A()
+ x[None] = 0
+ assert a.run() == 1
+ assert a.run() == 2
+ assert x[None] == 4
+ assert a.dec(4) == 3
+ assert x[None] == 0
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda], debug=True)
+def test_default_templates():
+ @ti.func
+ def func1(x: ti.template()):
+ x = 1
+
+ @ti.func
+ def func2(x: ti.template()):
+ x += 1
+
+ @ti.func
+ def func3(x):
+ x = 1
+
+ @ti.func
+ def func4(x):
+ x += 1
+
+ @ti.func
+ def func1_field(x: ti.template()):
+ x[None] = 1
+
+ @ti.func
+ def func2_field(x: ti.template()):
+ x[None] += 1
+
+ @ti.func
+ def func3_field(x):
+ x[None] = 1
+
+ @ti.func
+ def func4_field(x):
+ x[None] += 1
+
+ v = ti.field(dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def run_func():
+ a = 0
+ func1(a)
+ assert a == 1
+ b = 0
+ func2(b)
+ assert b == 1
+ c = 0
+ func3(c)
+ assert c == 0
+ d = 0
+ func4(d)
+ assert d == 0
+
+ v[None] = 0
+ func1_field(v)
+ assert v[None] == 1
+ v[None] = 0
+ func2_field(v)
+ assert v[None] == 1
+ v[None] = 0
+ func3_field(v)
+ assert v[None] == 1
+ v[None] = 0
+ func4_field(v)
+ assert v[None] == 1
+
+ run_func()
+
+
+@test_utils.test(experimental_real_function=True)
+def test_experimental_templates():
+ x = ti.field(ti.i32, shape=())
+ y = ti.field(ti.i32, shape=())
+ answer = ti.field(ti.i32, shape=8)
+
+ @ti.kernel
+ def kernel_inc(x: ti.template()):
+ x[None] += 1
+
+ def run_kernel():
+ x[None] = 10
+ y[None] = 20
+ kernel_inc(x)
+ assert x[None] == 11
+ assert y[None] == 20
+ kernel_inc(y)
+ assert x[None] == 11
+ assert y[None] == 21
+
+ @ti.func
+ def inc(x: ti.template()):
+ x[None] += 1
+
+ @ti.kernel
+ def run_func():
+ x[None] = 10
+ y[None] = 20
+ inc(x)
+ answer[0] = x[None]
+ answer[1] = y[None]
+ inc(y)
+ answer[2] = x[None]
+ answer[3] = y[None]
+
+ def verify():
+ assert answer[0] == 11
+ assert answer[1] == 20
+ assert answer[2] == 11
+ assert answer[3] == 21
+
+ run_kernel()
+ run_func()
+ verify()
+
+
+@test_utils.test(experimental_real_function=True)
+def test_missing_arg_annotation():
+ with pytest.raises(ti.TaichiSyntaxError, match='must be type annotated'):
+
+ @ti.func
+ def add(a, b: ti.i32) -> ti.i32:
+ return a + b
+
+
+@test_utils.test(experimental_real_function=True)
+def test_missing_return_annotation():
+ with pytest.raises(ti.TaichiCompilationError,
+ match='return value must be annotated'):
+
+ @ti.func
+ def add(a: ti.i32, b: ti.i32):
+ return a + b
+
+ @ti.kernel
+ def run():
+ add(30, 2)
+
+ run()
diff --git a/tests/_python_orig/test_function_parameter_by_value.py b/tests/_python_orig/test_function_parameter_by_value.py
new file mode 100644
index 000000000..c27d5988c
--- /dev/null
+++ b/tests/_python_orig/test_function_parameter_by_value.py
@@ -0,0 +1,18 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_pass_by_value():
+ @ti.func
+ def set_val(x, i):
+ x = i
+
+ ret = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def task():
+ set_val(ret[None], 112)
+
+ task()
+ assert ret[None] == 0
diff --git a/tests/_python_orig/test_fuse_dense.py b/tests/_python_orig/test_fuse_dense.py
new file mode 100644
index 000000000..a08a0fe47
--- /dev/null
+++ b/tests/_python_orig/test_fuse_dense.py
@@ -0,0 +1,40 @@
+import taichi as ti
+from tests import test_utils
+
+from .fuse_test_template import (template_fuse_dense_x2y2z,
+ template_fuse_reduction)
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_fuse_dense_x2y2z():
+ template_fuse_dense_x2y2z(size=10 * 1024**2)
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_fuse_reduction():
+ template_fuse_reduction(size=10 * 1024**2)
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_no_fuse_sigs_mismatch():
+ n = 4096
+ x = ti.field(ti.i32, shape=(n, ))
+
+ @ti.kernel
+ def inc_i():
+ for i in x:
+ x[i] += i
+
+ @ti.kernel
+ def inc_by(k: ti.i32):
+ for i in x:
+ x[i] += k
+
+ repeat = 5
+ for i in range(repeat):
+ inc_i()
+ inc_by(i)
+
+ x = x.to_numpy()
+ for i in range(n):
+ assert x[i] == i * repeat + ((repeat - 1) * repeat // 2)
diff --git a/tests/_python_orig/test_fuse_dynamic.py b/tests/_python_orig/test_fuse_dynamic.py
new file mode 100644
index 000000000..e514fd05b
--- /dev/null
+++ b/tests/_python_orig/test_fuse_dynamic.py
@@ -0,0 +1,62 @@
+import time
+
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+def benchmark_fuse_dynamic_x2y2z(size=1024**2, repeat=10, first_n=100):
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ z = ti.field(ti.i32)
+
+ ti.root.dynamic(ti.i, size, chunk_size=2048).place(x, y, z)
+
+ @ti.kernel
+ def x_to_y():
+ for i in x:
+ y[i] = x[i] + 1
+
+ @ti.kernel
+ def y_to_z():
+ for i in x:
+ z[i] = y[i] + 4
+
+ first_n = min(first_n, size)
+
+ for i in range(first_n):
+ x[i] = i * 10
+
+ for _ in range(repeat):
+ t = time.time()
+ x_to_y()
+ ti.sync()
+ print('x_to_y', time.time() - t)
+ print('')
+
+ for _ in range(repeat):
+ t = time.time()
+ y_to_z()
+ ti.sync()
+ print('y_to_z', time.time() - t)
+ print('')
+
+ for _ in range(repeat):
+ t = time.time()
+ x_to_y()
+ y_to_z()
+ ti.sync()
+ print('fused x->y->z', time.time() - t)
+ print('')
+
+ for i in range(first_n):
+ assert x[i] == i * 10
+ assert y[i] == x[i] + 1
+ assert z[i] == x[i] + 5
+
+
+@test_utils.test(require=[ti.extension.async_mode, ti.extension.sparse],
+ async_mode=True)
+def test_fuse_dynamic_x2y2z():
+ benchmark_fuse_dynamic_x2y2z()
diff --git a/tests/_python_orig/test_gc.py b/tests/_python_orig/test_gc.py
new file mode 100644
index 000000000..afa050ca5
--- /dev/null
+++ b/tests/_python_orig/test_gc.py
@@ -0,0 +1,135 @@
+import taichi as ti
+from tests import test_utils
+
+
+def _test_block_gc():
+ N = 100000
+
+ dx = 1 / 128
+ inv_dx = 1.0 / dx
+
+ x = ti.Vector.field(2, dtype=ti.f32)
+
+ indices = ti.ij
+
+ grid_m = ti.field(dtype=ti.i32)
+
+ grid = ti.root.pointer(indices, 64)
+ grid.pointer(indices, 32).dense(indices, 8).place(grid_m)
+
+ ti.root.dense(ti.i, N).place(x)
+
+ @ti.kernel
+ def init():
+ for i in x:
+ x[i] = ti.Vector(
+ [ti.random() * 0.1 + 0.5,
+ ti.random() * 0.1 + 0.5])
+
+ init()
+
+ @ti.kernel
+ def build_grid():
+ for p in x:
+ base = int(ti.floor(x[p] * inv_dx - 0.5))
+ grid_m[base] += 1
+
+ @ti.kernel
+ def move():
+ for p in x:
+ x[p] += ti.Vector([0.0, 0.1])
+
+ assert grid._num_dynamically_allocated == 0
+ for _ in range(100):
+ grid.deactivate_all()
+ # Scatter the particles to the sparse grid
+ build_grid()
+ # Move the block of particles
+ move()
+
+ ti.sync()
+ # The block of particles can occupy at most two blocks on the sparse grid.
+ # It's fine to run 100 times and do just one final check, because
+ # num_dynamically_allocated stores the number of slots *ever* allocated.
+ assert 1 <= grid._num_dynamically_allocated <= 2, grid._num_dynamically_allocated
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_block():
+ _test_block_gc()
+
+
+#TODO: Remove exclude of ti.metal.
+@test_utils.test(require=[ti.extension.sparse, ti.extension.async_mode],
+ exclude=[ti.metal],
+ async_mode=True)
+def test_block_async():
+ _test_block_gc()
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dynamic_gc():
+ x = ti.field(dtype=ti.i32)
+
+ L = ti.root.dynamic(ti.i, 1024 * 1024, chunk_size=1024)
+ L.place(x)
+
+ assert L._num_dynamically_allocated == 0
+
+ for i in range(100):
+ x[1024] = 1
+ L.deactivate_all()
+ assert L._num_dynamically_allocated <= 2
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer_gc():
+ x = ti.field(dtype=ti.i32)
+
+ L = ti.root.pointer(ti.ij, 32)
+ L.pointer(ti.ij, 32).dense(ti.ij, 8).place(x)
+
+ assert L._num_dynamically_allocated == 0
+
+ for i in range(1024):
+ x[i * 8, i * 8] = 1
+ assert L._num_dynamically_allocated == 1
+ L.deactivate_all()
+
+ # Note that being inactive doesn't mean it's not allocated.
+ assert L._num_dynamically_allocated == 1
+
+
+@test_utils.test(require=[ti.extension.sparse, ti.extension.async_mode],
+ async_mode=True)
+def test_fuse_allocator_state():
+ N = 16
+ x = ti.field(dtype=ti.i32, shape=N)
+ y = ti.field(dtype=ti.i32)
+
+ y_parent = ti.root.pointer(ti.i, N * 2)
+ y_parent.place(y)
+
+ # https://github.com/taichi-dev/taichi/pull/1973#pullrequestreview-511154376
+
+ @ti.kernel
+ def activate_y():
+ for i in x:
+ idx = i + 1
+ y[idx] = idx
+
+ @ti.kernel
+ def deactivate_y():
+ for i in x:
+ ti.deactivate(y_parent, i)
+
+ activate_y()
+ deactivate_y()
+ ti.sync()
+
+ # TODO: assert that activate_y and deactivate_y are not fused.
+ assert y_parent._num_dynamically_allocated == N
+ ys = y.to_numpy()
+ for i, y in enumerate(ys):
+ expected = N if i == N else 0
+ assert y == expected
diff --git a/tests/_python_orig/test_get_external_tensor_shape.py b/tests/_python_orig/test_get_external_tensor_shape.py
new file mode 100644
index 000000000..fa5578fc0
--- /dev/null
+++ b/tests/_python_orig/test_get_external_tensor_shape.py
@@ -0,0 +1,72 @@
+import numpy as np
+import pytest
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+if has_pytorch():
+ import torch
+
+
+@pytest.mark.parametrize('size', [[1], [1, 2, 3, 4]])
+@test_utils.test()
+def test_get_external_tensor_shape_access_numpy(size):
+ @ti.kernel
+ def func(x: ti.ext_arr(), index: ti.template()) -> ti.i32:
+ return x.shape[index]
+
+ x_hat = np.ones(size, dtype=np.int32)
+ for idx, y_ref in enumerate(size):
+ y_hat = func(x_hat, idx)
+ assert y_ref == y_hat, "Size of axis {} should equal {} and not {}.".format(
+ idx, y_ref, y_hat)
+
+
+@pytest.mark.parametrize('size', [[1, 1], [2, 2]])
+@test_utils.test()
+def test_get_external_tensor_shape_sum_numpy(size):
+ @ti.kernel
+ def func(x: ti.ext_arr()) -> ti.i32:
+ y = 0
+ for i in range(x.shape[0]):
+ for j in range(x.shape[1]):
+ y += x[i, j]
+ return y
+
+ x_hat = np.ones(size, dtype=np.int32)
+ x_ref = x_hat
+ y_hat = func(x_hat)
+ y_ref = x_ref.sum()
+ assert y_ref == y_hat, "Output should equal {} and not {}.".format(
+ y_ref, y_hat)
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@pytest.mark.parametrize('size', [[1, 2, 3, 4]])
+@test_utils.test(exclude=ti.opengl)
+def test_get_external_tensor_shape_access_torch(size):
+ @ti.kernel
+ def func(x: ti.ext_arr(), index: ti.template()) -> ti.i32:
+ return x.shape[index]
+
+ x_hat = torch.ones(size, dtype=torch.int32, device='cpu')
+ for idx, y_ref in enumerate(size):
+ y_hat = func(x_hat, idx)
+ assert y_ref == y_hat, "Size of axis {} should equal {} and not {}.".format(
+ idx, y_ref, y_hat)
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@pytest.mark.parametrize('size', [[1, 2, 3, 4]])
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.opengl])
+def test_get_external_tensor_shape_access_ndarray(size):
+ @ti.kernel
+ def func(x: ti.any_arr(), index: ti.template()) -> ti.i32:
+ return x.shape[index]
+
+ x_hat = ti.ndarray(ti.i32, shape=size)
+ for idx, y_ref in enumerate(size):
+ y_hat = func(x_hat, idx)
+ assert y_ref == y_hat, "Size of axis {} should equal {} and not {}.".format(
+ idx, y_ref, y_hat)
diff --git a/tests/_python_orig/test_ggui.py b/tests/_python_orig/test_ggui.py
new file mode 100644
index 000000000..1865cd830
--- /dev/null
+++ b/tests/_python_orig/test_ggui.py
@@ -0,0 +1,291 @@
+import os
+import pathlib
+import platform
+import tempfile
+
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+REGENERATE_GROUNDTRUTH_IMAGES = False
+RENDER_REPEAT = 5
+supported_archs = [ti.vulkan, ti.cuda]
+
+
+def get_temp_png():
+ f, name = tempfile.mkstemp(suffix='.png')
+ os.close(f)
+ return name
+
+
+def write_temp_image(window):
+ f = get_temp_png()
+ window.write_image(f)
+ try:
+ os.remove(f)
+ except OSError:
+ pass
+
+
+def verify_image(window, image_name, tolerence=0.1):
+ if REGENERATE_GROUNDTRUTH_IMAGES:
+ ground_truth_name = f"tests/python/expected/{image_name}.png"
+ window.write_image(ground_truth_name)
+ else:
+ ground_truth_name = str(
+ pathlib.Path(__file__).parent) + f"/expected/{image_name}.png"
+ actual_name = get_temp_png()
+ window.write_image(actual_name)
+ ground_truth_np = ti.imread(ground_truth_name)
+ actual_np = ti.imread(actual_name)
+ assert len(ground_truth_np.shape) == len(actual_np.shape)
+ for i in range(len(ground_truth_np.shape)):
+ assert ground_truth_np.shape[i] == actual_np.shape[i]
+ diff = ground_truth_np - actual_np
+ mse = np.mean(diff * diff)
+ assert mse <= tolerence # the pixel values are 0~255
+ os.remove(actual_name)
+
+
+@pytest.mark.skipif(not ti.ui.GGUI_AVAILABLE, reason="GGUI Not Available")
+@test_utils.test(arch=supported_archs)
+def test_geometry_2d():
+ window = ti.ui.Window('test', (640, 480), show_window=False)
+ canvas = window.get_canvas()
+
+ # simple circles
+ n_circles_0 = 10
+ circle_positions_0 = ti.Vector.field(2, ti.f32, shape=n_circles_0)
+ for i in range(n_circles_0):
+ circle_positions_0[i] = ti.Vector([0.1, i * 0.1])
+
+ # circles with per vertex colors
+ n_circles_1 = 10
+ circle_positions_1 = ti.Vector.field(2, ti.f32, shape=n_circles_1)
+ circle_colors_1 = ti.Vector.field(3, ti.f32, shape=n_circles_1)
+ for i in range(n_circles_0):
+ circle_positions_1[i] = ti.Vector([0.2, i * 0.1])
+ circle_colors_1[i] = ti.Vector([i * 0.1, 1.0 - i * 0.1, 0.5])
+
+ # simple triangles
+ n_triangles_0 = 10
+ triangles_positions_0 = ti.Vector.field(2, ti.f32, shape=3 * n_triangles_0)
+ for i in range(n_triangles_0):
+ triangles_positions_0[3 * i] = ti.Vector([0.3, i * 0.1])
+ triangles_positions_0[3 * i + 1] = ti.Vector([0.35, i * 0.1])
+ triangles_positions_0[3 * i + 2] = ti.Vector([0.35, i * 0.1 + 0.05])
+
+ # triangles with per vertex colors and indices
+ triangles_positions_1 = ti.Vector.field(2, ti.f32, shape=4)
+ triangles_colors_1 = ti.Vector.field(3, ti.f32, shape=4)
+ triangles_positions_1[0] = ti.Vector([0.4, 0])
+ triangles_positions_1[1] = ti.Vector([0.4, 1])
+ triangles_positions_1[2] = ti.Vector([0.45, 0])
+ triangles_positions_1[3] = ti.Vector([0.45, 1])
+ triangles_colors_1[0] = ti.Vector([0, 0, 0])
+ triangles_colors_1[1] = ti.Vector([1, 0, 0])
+ triangles_colors_1[2] = ti.Vector([0, 1, 0])
+ triangles_colors_1[3] = ti.Vector([1, 1, 0])
+ triangle_indices_1 = ti.Vector.field(3, ti.i32, shape=2)
+ triangle_indices_1[0] = ti.Vector([0, 1, 3])
+ triangle_indices_1[1] = ti.Vector([0, 2, 3])
+
+ # simple lines
+ n_lines_0 = 10
+ lines_positions_0 = ti.Vector.field(2, ti.f32, shape=2 * n_lines_0)
+ for i in range(n_lines_0):
+ lines_positions_0[2 * i] = ti.Vector([0.5, i * 0.1])
+ lines_positions_0[2 * i + 1] = ti.Vector([0.5, i * 0.1 + 0.05])
+
+ # lines with per vertex colors and indices
+ lines_positions_1 = ti.Vector.field(2, ti.f32, shape=4)
+ lines_colors_1 = ti.Vector.field(3, ti.f32, shape=4)
+ lines_positions_1[0] = ti.Vector([0.6, 0])
+ lines_positions_1[1] = ti.Vector([0.6, 1])
+ lines_positions_1[2] = ti.Vector([0.65, 0])
+ lines_positions_1[3] = ti.Vector([0.65, 1])
+ lines_colors_1[0] = ti.Vector([0, 0, 0])
+ lines_colors_1[1] = ti.Vector([1, 0, 0])
+ lines_colors_1[2] = ti.Vector([0, 1, 0])
+ lines_colors_1[3] = ti.Vector([1, 1, 0])
+ lines_indices_1 = ti.Vector.field(2, ti.i32, shape=6)
+ line_id = 0
+ for i in range(4):
+ for j in range(i + 1, 4):
+ lines_indices_1[line_id] = ti.Vector([i, j])
+ line_id += 1
+
+ def render():
+
+ canvas.circles(circle_positions_0, radius=0.05, color=(1, 0, 0))
+
+ canvas.circles(circle_positions_1,
+ radius=0.05,
+ per_vertex_color=circle_colors_1)
+
+ canvas.triangles(triangles_positions_0, color=(0, 0, 1))
+
+ canvas.triangles(triangles_positions_1,
+ per_vertex_color=triangles_colors_1,
+ indices=triangle_indices_1)
+
+ canvas.lines(lines_positions_0, width=0.01, color=(0, 1, 0))
+
+ canvas.lines(lines_positions_1,
+ width=0.01,
+ per_vertex_color=lines_colors_1,
+ indices=lines_indices_1)
+
+ for _ in range(RENDER_REPEAT):
+ render()
+ write_temp_image(window)
+ render()
+ if (platform.system() == 'Darwin'):
+        # FIXME: Use lower tolerance when macOS ggui supports wide lines
+ verify_image(window, 'test_geometry_2d', 1.0)
+ else:
+ verify_image(window, 'test_geometry_2d')
+ window.destroy()
+
+
+@pytest.mark.skipif(not ti.ui.GGUI_AVAILABLE, reason="GGUI Not Available")
+@test_utils.test(arch=supported_archs)
+def test_geometry_3d():
+ window = ti.ui.Window('test', (640, 480), show_window=False)
+ canvas = window.get_canvas()
+ scene = ti.ui.Scene()
+ camera = ti.ui.make_camera()
+ camera.position(0.0, 0.0, 1.5)
+ camera.lookat(0.0, 0.0, 0)
+ scene.set_camera(camera)
+
+ # simple particles
+ num_per_dim = 32
+ num_particles_0 = int(num_per_dim**3)
+ particles_positions_0 = ti.Vector.field(3, ti.f32, shape=num_particles_0)
+
+ @ti.kernel
+ def init_particles_0():
+ for x, y, z in ti.ndrange(num_per_dim, num_per_dim, num_per_dim):
+ i = x * (num_per_dim**2) + y * num_per_dim + z
+ gap = 0.01
+ particles_positions_0[i] = ti.Vector(
+ [-0.4, 0, 0.0],
+ dt=ti.f32) + ti.Vector([x, y, z], dt=ti.f32) * gap
+
+ init_particles_0()
+
+ # particles with individual colors
+ num_per_dim = 32
+ num_particles_1 = int(num_per_dim**3)
+ particles_positions_1 = ti.Vector.field(3, ti.f32, shape=num_particles_1)
+ particles_colors_1 = ti.Vector.field(3, ti.f32, shape=num_particles_1)
+
+ @ti.kernel
+ def init_particles_1():
+ for x, y, z in ti.ndrange(num_per_dim, num_per_dim, num_per_dim):
+ i = x * (num_per_dim**2) + y * num_per_dim + z
+ gap = 0.01
+ particles_positions_1[i] = ti.Vector(
+ [0.2, 0, 0.0],
+ dt=ti.f32) + ti.Vector([x, y, z], dt=ti.f32) * gap
+ particles_colors_1[i] = ti.Vector([x, y, z],
+ dt=ti.f32) / num_per_dim
+
+ init_particles_1()
+
+ # mesh
+ vertices = ti.Vector.field(3, ti.f32, shape=8)
+ colors = ti.Vector.field(3, ti.f32, shape=8)
+
+ @ti.kernel
+ def init_mesh():
+ for i, j, k in ti.ndrange(2, 2, 2):
+ index = i * 4 + j * 2 + k
+ vertices[index] = ti.Vector(
+ [-0.1, -0.3, 0.0],
+ dt=ti.f32) + ti.Vector([i, j, k], dt=ti.f32) * 0.25
+ colors[index] = ti.Vector([i, j, k], dt=ti.f32)
+
+ init_mesh()
+ indices = ti.field(ti.i32, shape=36)
+ indices_np = np.array([
+ 0, 1, 2, 3, 1, 2, 4, 5, 6, 7, 5, 6, 0, 1, 4, 5, 1, 4, 2, 3, 6, 7, 3, 6,
+ 0, 2, 4, 6, 2, 4, 1, 3, 5, 7, 3, 5
+ ],
+ dtype=np.int32)
+ indices.from_numpy(indices_np)
+
+ def render():
+ scene.point_light(pos=(2, 2, 2), color=(1, 1, 1))
+
+ scene.particles(particles_positions_0, radius=0.01, color=(0.5, 0, 0))
+
+ scene.particles(particles_positions_1,
+ radius=0.01,
+ per_vertex_color=particles_colors_1)
+
+ scene.mesh(vertices,
+ per_vertex_color=colors,
+ indices=indices,
+ two_sided=True)
+
+ canvas.scene(scene)
+
+ for _ in range(RENDER_REPEAT):
+ render()
+ write_temp_image(window)
+ render()
+ verify_image(window, 'test_geometry_3d')
+ window.destroy()
+
+
+@pytest.mark.skipif(not ti.ui.GGUI_AVAILABLE, reason="GGUI Not Available")
+@test_utils.test(arch=supported_archs)
+def test_set_image():
+ window = ti.ui.Window('test', (640, 480), show_window=False)
+ canvas = window.get_canvas()
+
+ img = ti.Vector.field(4, ti.f32, (512, 512))
+
+ @ti.kernel
+ def init_img():
+ for i, j in img:
+ img[i, j] = ti.Vector([i, j, 0, 512], dt=ti.f32) / 512
+
+ init_img()
+
+ def render():
+ canvas.set_image(img)
+
+ for _ in range(RENDER_REPEAT):
+ render()
+ write_temp_image(window)
+ render()
+ verify_image(window, 'test_set_image')
+ window.destroy()
+
+
+@pytest.mark.skipif(not ti.ui.GGUI_AVAILABLE, reason="GGUI Not Available")
+@test_utils.test(arch=supported_archs)
+def test_imgui():
+ window = ti.ui.Window('test', (640, 480), show_window=False)
+
+ def render():
+ with window.GUI.sub_window("window 0", 0.1, 0.1, 0.8, 0.2) as w:
+ w.text("Hello Taichi!")
+ w.text("Hello Again!")
+ with window.GUI.sub_window("window 1", 0.1, 0.4, 0.8, 0.2) as w:
+ w.button("Press to unlease creativity")
+ w.slider_float('creativity level', 100.0, 0.0, 100.0)
+ with window.GUI.sub_window("window 2", 0.1, 0.7, 0.8, 0.2) as w:
+ w.color_edit_3('Heyy', (0, 0, 1))
+
+ for _ in range(RENDER_REPEAT):
+ render()
+ write_temp_image(window)
+ render()
+ verify_image(window, 'test_imgui')
+ window.destroy()
diff --git a/tests/_python_orig/test_global_buffer_misalined.py b/tests/_python_orig/test_global_buffer_misalined.py
new file mode 100644
index 000000000..eab4524d2
--- /dev/null
+++ b/tests/_python_orig/test_global_buffer_misalined.py
@@ -0,0 +1,15 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_global_buffer_misalignment():
+ @ti.kernel
+ def test(x: ti.f32):
+ a = x
+ b = ti.cast(0.12, ti.f64)
+ for i in range(8):
+ b += a
+
+ for i in range(8):
+ test(0.1)
diff --git a/tests/_python_orig/test_global_store_grad.py b/tests/_python_orig/test_global_store_grad.py
new file mode 100644
index 000000000..6f98e9f01
--- /dev/null
+++ b/tests/_python_orig/test_global_store_grad.py
@@ -0,0 +1,32 @@
+"""
+import taichi as ti
+
+ti.lang.impl.current_cfg().print_ir = True
+
+
+def test_global_store_branching():
+ # ti.reset()
+
+ N = 16
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, N).place(x)
+ ti.root.dense(ti.i, N).place(y)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def oldeven():
+ for i in range(N):
+ if i % 2 == 0:
+ x[i] = y[i]
+
+ for i in range(N):
+ x.grad[i] = 1
+
+ oldeven()
+ oldeven.grad()
+
+ for i in range(N):
+ assert y.grad[i] == (i % 2 == 0)
+"""
diff --git a/tests/_python_orig/test_global_thread_idx.py b/tests/_python_orig/test_global_thread_idx.py
new file mode 100644
index 000000000..6e0d58690
--- /dev/null
+++ b/tests/_python_orig/test_global_thread_idx.py
@@ -0,0 +1,19 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cuda)
+def test_global_thread_idx():
+ n = 2048
+ x = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ tid = ti.global_thread_idx()
+ x[tid] = tid
+
+ func()
+ assert np.arange(n).sum() == x.to_numpy().sum()
diff --git a/tests/_python_orig/test_grouped.py b/tests/_python_orig/test_grouped.py
new file mode 100644
index 000000000..0757c7f92
--- /dev/null
+++ b/tests/_python_orig/test_grouped.py
@@ -0,0 +1,182 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_vector_index():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 7
+ p = 11
+
+ ti.root.dense(ti.i, n).dense(ti.j, m).dense(ti.k, p).place(val)
+
+ @ti.kernel
+ def test():
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ I = ti.Vector([i, j, k])
+ val[I] = i + j * 2 + k * 3
+
+ test()
+
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ assert val[i, j, k] == i + j * 2 + k * 3
+
+
+@test_utils.test()
+def test_grouped():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 8
+ p = 16
+
+ ti.root.dense(ti.i, n).dense(ti.j, m).dense(ti.k, p).place(val)
+
+ @ti.kernel
+ def test():
+ for I in ti.grouped(val):
+ val[I] = I[0] + I[1] * 2 + I[2] * 3
+
+ test()
+
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ assert val[i, j, k] == i + j * 2 + k * 3
+
+
+@test_utils.test()
+def test_grouped_ndrange():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 8
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ x0 = 2
+ y0 = 3
+ x1 = 1
+ y1 = 6
+
+ @ti.kernel
+ def test():
+ for I in ti.grouped(ti.ndrange((x0, y0), (x1, y1))):
+ val[I] = I[0] + I[1] * 2
+
+ test()
+
+ for i in range(n):
+ for j in range(m):
+ assert val[i, j] == (i +
+ j * 2 if x0 <= i < y0 and x1 <= j < y1 else 0)
+
+
+@test_utils.test()
+def test_static_grouped_ndrange():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 8
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ x0 = 2
+ y0 = 3
+ x1 = 1
+ y1 = 6
+
+ @ti.kernel
+ def test():
+ for I in ti.static(ti.grouped(ti.ndrange((x0, y0), (x1, y1)))):
+ val[I] = I[0] + I[1] * 2
+
+ test()
+
+ for i in range(n):
+ for j in range(m):
+ assert val[i, j] == (i +
+ j * 2 if x0 <= i < y0 and x1 <= j < y1 else 0)
+
+
+@test_utils.test()
+def test_grouped_ndrange_starred():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 8
+ p = 16
+ dim = 3
+
+ ti.root.dense(ti.ijk, (n, m, p)).place(val)
+
+ @ti.kernel
+ def test():
+ for I in ti.grouped(ti.ndrange(*(((0, n), ) * dim))):
+ val[I] = I[0] + I[1] * 2 + I[2] * 3
+
+ test()
+
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ assert val[i, j,
+ k] == (i + j * 2 + k * 3 if j < n and k < n else 0)
+
+
+@test_utils.test()
+def test_grouped_ndrange_0d():
+ val = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def test():
+ for I in ti.grouped(ti.ndrange()):
+ val[I] = 42
+
+ test()
+
+ assert val[None] == 42
+
+
+@test_utils.test()
+def test_static_grouped_ndrange_0d():
+ val = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def test():
+ for I in ti.static(ti.grouped(ti.ndrange())):
+ val[I] = 42
+
+ test()
+
+ assert val[None] == 42
+
+
+@test_utils.test()
+def test_static_grouped_func():
+
+ K = 3
+ dim = 2
+
+ v = ti.Vector.field(K, dtype=ti.i32, shape=((K, ) * dim))
+
+ def stencil_range():
+ return ti.ndrange(*((K, ) * (dim + 1)))
+
+ @ti.kernel
+ def p2g():
+ for I in ti.static(ti.grouped(stencil_range())):
+ v[I[0], I[1]][I[2]] = I[0] + I[1] * 3 + I[2] * 10
+
+ p2g()
+
+ for i in range(K):
+ for j in range(K):
+ for k in range(K):
+ assert v[i, j][k] == i + j * 3 + k * 10
diff --git a/tests/_python_orig/test_gui.py b/tests/_python_orig/test_gui.py
new file mode 100644
index 000000000..dbb0bae7a
--- /dev/null
+++ b/tests/_python_orig/test_gui.py
@@ -0,0 +1,32 @@
+import numpy as np
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@pytest.mark.parametrize('dtype', [ti.u8, ti.f32])
+@test_utils.test(arch=get_host_arch_list())
+def test_save_image_without_window(dtype):
+ n = 255
+ pixels = ti.field(dtype=dtype, shape=(n, n, 3))
+
+ @ti.kernel
+ def paint(c: dtype):
+ for i, j, k in pixels:
+ pixels[i, j, k] = c
+
+ gui = ti.GUI("Test", res=(n, n), show_gui=False)
+ for i in [0, 32, 64, 128, 255]:
+ if dtype is ti.u8:
+ paint(i)
+ else:
+ paint(i * 1.0 / n)
+ gui.set_image(pixels)
+ image_path = test_utils.make_temp_file(suffix='.png')
+ gui.show(image_path)
+ image = ti.imread(image_path)
+ delta = (image - i).sum()
+ assert delta == 0, "Expected image difference to be 0 but got {} instead.".format(
+ delta)
diff --git a/tests/_python_orig/test_image_io.py b/tests/_python_orig/test_image_io.py
new file mode 100644
index 000000000..fd04ea19b
--- /dev/null
+++ b/tests/_python_orig/test_image_io.py
@@ -0,0 +1,91 @@
+import os
+
+import numpy as np
+import pytest
+from taichi.lang.misc import get_host_arch_list
+from taichi.lang.util import to_numpy_type
+
+import taichi as ti
+from tests import test_utils
+
+
+# jpg is also supported but hard to test here since it's lossy:
+@pytest.mark.parametrize('comp,ext', [(3, 'bmp'), (1, 'png'), (3, 'png'),
+ (4, 'png')])
+@pytest.mark.parametrize('resx,resy', [(201, 173)])
+@pytest.mark.parametrize('is_field', [False, True])
+@pytest.mark.parametrize('dt', [ti.u8])
+@test_utils.test(arch=get_host_arch_list())
+def test_image_io(resx, resy, comp, ext, is_field, dt):
+ if comp != 1:
+ shape = (resx, resy, comp)
+ else:
+ shape = (resx, resy)
+ if is_field:
+ pixel_t = ti.field(dt, shape)
+ pixel = np.random.randint(256, size=shape, dtype=to_numpy_type(dt))
+ if is_field:
+ pixel_t.from_numpy(pixel)
+ fn = test_utils.make_temp_file(suffix='.' + ext)
+ if is_field:
+ ti.imwrite(pixel_t, fn)
+ else:
+ ti.imwrite(pixel, fn)
+ pixel_r = ti.imread(fn)
+ if comp == 1:
+ # from (resx, resy, 1) to (resx, resy)
+ pixel_r = pixel_r.reshape((resx, resy))
+ assert (pixel_r == pixel).all()
+ os.remove(fn)
+
+
+@pytest.mark.parametrize('comp,ext', [(3, 'png'), (4, 'png')])
+@pytest.mark.parametrize('resx,resy', [(91, 81)])
+@pytest.mark.parametrize('dt', [ti.f32, ti.f64])
+@test_utils.test(arch=get_host_arch_list())
+def test_image_io_vector(resx, resy, comp, ext, dt):
+ shape = (resx, resy)
+ pixel = np.random.rand(*shape, comp).astype(to_numpy_type(dt))
+ pixel_t = ti.Vector.field(comp, dt, shape)
+ pixel_t.from_numpy(pixel)
+ fn = test_utils.make_temp_file(suffix='.' + ext)
+ ti.imwrite(pixel_t, fn)
+ pixel_r = (ti.imread(fn).astype(to_numpy_type(dt)) + 0.5) / 256.0
+ assert np.allclose(pixel_r, pixel, atol=2e-2)
+ os.remove(fn)
+
+
+@pytest.mark.parametrize('comp,ext', [(3, 'png')])
+@pytest.mark.parametrize('resx,resy', [(91, 81)])
+@pytest.mark.parametrize('dt', [ti.u16, ti.u32, ti.u64])
+@test_utils.test(arch=get_host_arch_list())
+def test_image_io_uint(resx, resy, comp, ext, dt):
+ shape = (resx, resy)
+ np_type = to_numpy_type(dt)
+ # When saving to disk, pixel data will be truncated into 8 bits.
+ # Be careful here if you want lossless saving.
+ np_max = np.iinfo(np_type).max // 256
+ pixel = np.random.randint(256, size=(*shape, comp), dtype=np_type) * np_max
+ pixel_t = ti.Vector.field(comp, dt, shape)
+ pixel_t.from_numpy(pixel)
+ fn = test_utils.make_temp_file(suffix='.' + ext)
+ ti.imwrite(pixel_t, fn)
+ pixel_r = ti.imread(fn).astype(np_type) * np_max
+ assert (pixel_r == pixel).all()
+ os.remove(fn)
+
+
+@pytest.mark.parametrize('comp', [1, 3])
+@pytest.mark.parametrize('resx,resy', [(91, 81)])
+@pytest.mark.parametrize('scale', [1, 2, 3])
+@test_utils.test(arch=get_host_arch_list())
+def test_image_resize_sum(resx, resy, comp, scale):
+ shape = (resx, resy)
+ if comp != 1:
+ shape = shape + (comp, )
+ old_img = np.random.rand(*shape).astype(np.float32)
+ if resx == resy:
+ new_img = ti.imresize(old_img, resx * scale)
+ else:
+ new_img = ti.imresize(old_img, resx * scale, resy * scale)
+ assert np.sum(old_img) * scale**2 == test_utils.approx(np.sum(new_img))
diff --git a/tests/_python_orig/test_immediate_layout.py b/tests/_python_orig/test_immediate_layout.py
new file mode 100644
index 000000000..065b3b04c
--- /dev/null
+++ b/tests/_python_orig/test_immediate_layout.py
@@ -0,0 +1,13 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_1D():
+ N = 2
+ x = ti.field(ti.f32)
+ ti.root.dense(ti.i, N).place(x)
+
+ x[0] = 42
+ assert x[0] == 42
+ assert x[1] == 0
diff --git a/tests/_python_orig/test_indices.py b/tests/_python_orig/test_indices.py
new file mode 100644
index 000000000..fd676fae9
--- /dev/null
+++ b/tests/_python_orig/test_indices.py
@@ -0,0 +1,56 @@
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_indices():
+ a = ti.field(ti.f32, shape=(128, 32, 8))
+
+ b = ti.field(ti.f32)
+ ti.root.dense(ti.j, 32).dense(ti.i, 16).place(b)
+
+ mapping_a = a.snode._physical_index_position()
+
+ assert mapping_a == {0: 0, 1: 1, 2: 2}
+
+ mapping_b = b.snode._physical_index_position()
+
+ assert mapping_b == {0: 0, 1: 1}
+ # Note that b is column-major:
+ # the virtual first index exposed to the user comes second in memory layout.
+
+ @ti.kernel
+ def fill():
+ for i, j in b:
+ b[i, j] = i * 10 + j
+
+ @ti.kernel
+ def get_field_addr(i: ti.i32, j: ti.i32) -> ti.u64:
+ return ti.get_addr(b, [i, j])
+
+ fill()
+ for i in range(16):
+ for j in range(32):
+ assert b[i, j] == i * 10 + j
+ assert get_field_addr(0, 1) + 4 == get_field_addr(1, 1)
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_float_as_index():
+ a = ti.field(ti.f32, (8, 5))
+
+ @ti.kernel
+ def func():
+ i = 6.66
+ j = 3
+ I = ti.Vector([2, 1])
+ for _ in range(1): # prevent constant fold
+ a[i, j] = 233
+ a[I + ti.Vector([1, 3.0])] = 666
+
+ func()
+
+ assert a[6, 3] == 233
+ assert a[3, 4] == 666
diff --git a/tests/_python_orig/test_indices_assert.py b/tests/_python_orig/test_indices_assert.py
new file mode 100644
index 000000000..2be507fd8
--- /dev/null
+++ b/tests/_python_orig/test_indices_assert.py
@@ -0,0 +1,23 @@
+import platform
+
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@pytest.mark.skipif(platform.system() == 'Windows',
+ reason="Too much virtual memory for github windows env.")
+@test_utils.test(debug=True, gdb_trigger=False, packed=False, arch=[ti.cpu])
+def test_indices_assert():
+
+ overflow = ti.field(ti.i32, (334, 334, 334, 2 * 10))
+
+ @ti.kernel
+ def access_overflow():
+ overflow[0, 0, 0, 0] = 10
+ print(overflow[333, 333, 333, 0])
+
+ with pytest.raises(RuntimeError,
+ match='The indices provided are too big!'):
+ access_overflow()
diff --git a/tests/_python_orig/test_internal_func.py b/tests/_python_orig/test_internal_func.py
new file mode 100644
index 000000000..78dcd8eef
--- /dev/null
+++ b/tests/_python_orig/test_internal_func.py
@@ -0,0 +1,69 @@
+import time
+
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.metal, ti.opengl, ti.cuda, ti.vulkan, ti.cc])
+def test_basic():
+ @ti.kernel
+ def test():
+ for _ in range(10):
+ impl.call_internal("do_nothing")
+
+ test()
+
+
+@test_utils.test(exclude=[ti.metal, ti.opengl, ti.cuda, ti.vulkan, ti.cc])
+def test_host_polling():
+ return
+
+ @ti.kernel
+ def test():
+ impl.call_internal("refresh_counter")
+
+ for i in range(10):
+ print('updating tail to', i)
+ test()
+ time.sleep(0.1)
+
+
+@test_utils.test(exclude=[ti.metal, ti.opengl, ti.cuda, ti.vulkan, ti.cc])
+def test_list_manager():
+ @ti.kernel
+ def test():
+ impl.call_internal("test_list_manager")
+
+ test()
+ test()
+
+
+@test_utils.test(exclude=[ti.metal, ti.opengl, ti.cuda, ti.vulkan, ti.cc])
+def test_node_manager():
+ @ti.kernel
+ def test():
+ impl.call_internal("test_node_allocator")
+
+ test()
+ test()
+
+
+@test_utils.test(exclude=[ti.metal, ti.opengl, ti.cuda, ti.vulkan, ti.cc])
+def test_node_manager_gc():
+ @ti.kernel
+ def test_cpu():
+ impl.call_internal("test_node_allocator_gc_cpu")
+
+ test_cpu()
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda], debug=True)
+def test_return():
+ @ti.kernel
+ def test_cpu():
+ ret = impl.call_internal("test_internal_func_args", 1.0, 2.0, 3)
+ assert ret == 9
+
+ test_cpu()
diff --git a/tests/_python_orig/test_kernel_arg_errors.py b/tests/_python_orig/test_kernel_arg_errors.py
new file mode 100644
index 000000000..2e0797795
--- /dev/null
+++ b/tests/_python_orig/test_kernel_arg_errors.py
@@ -0,0 +1,17 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cpu)
+def test_pass_float_as_i32():
+ @ti.kernel
+ def foo(a: ti.i32):
+ pass
+
+ with pytest.raises(ti.TaichiRuntimeTypeError) as e:
+ foo(1.2)
+
+ assert e.value.args[
+ 0] == "Argument 0 (type=) cannot be converted into required type i32"
diff --git a/tests/_python_orig/test_kernel_templates.py b/tests/_python_orig/test_kernel_templates.py
new file mode 100644
index 000000000..09637436c
--- /dev/null
+++ b/tests/_python_orig/test_kernel_templates.py
@@ -0,0 +1,129 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_kernel_template_basic():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.f32)
+
+ n = 16
+
+ ti.root.dense(ti.i, n).place(x, y)
+
+ @ti.kernel
+ def inc(a: ti.template(), b: ti.template()):
+ for i in a:
+ a[i] += b
+
+ inc(x, 1)
+ inc(y, 2)
+
+ for i in range(n):
+ assert x[i] == 1
+ assert y[i] == 2
+
+ @ti.kernel
+ def inc2(z: ti.i32, a: ti.template(), b: ti.i32):
+ for i in a:
+ a[i] += b + z
+
+ inc2(10, x, 1)
+ for i in range(n):
+ assert x[i] == 12
+
+
+@test_utils.test()
+def test_kernel_template_gradient():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+ z = ti.field(ti.f32)
+ loss = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 16).place(x, y, z)
+ ti.root.place(loss)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def double(a: ti.template(), b: ti.template()):
+ for i in range(16):
+ b[i] = a[i] * 2 + 1
+
+ @ti.kernel
+ def compute_loss():
+ for i in range(16):
+ ti.atomic_add(loss[None], z[i])
+
+ for i in range(16):
+ x[i] = i
+
+ with ti.Tape(loss):
+ double(x, y)
+ double(y, z)
+ compute_loss()
+
+ for i in range(16):
+ assert z[i] == i * 4 + 3
+ assert x.grad[i] == 4
+
+
+@test_utils.test()
+def test_func_template():
+ a = [ti.field(dtype=ti.f32) for _ in range(2)]
+ b = [ti.field(dtype=ti.f32) for _ in range(2)]
+
+ for l in range(2):
+ ti.root.dense(ti.ij, 16).place(a[l], b[l])
+
+ @ti.func
+ def sample(x: ti.template(), l: ti.template(), I):
+ return x[l][I]
+
+ @ti.kernel
+ def fill(l: ti.template()):
+ for I in ti.grouped(a[l]):
+ a[l][I] = l
+
+ @ti.kernel
+ def aTob(l: ti.template()):
+ for I in ti.grouped(b[l]):
+ b[l][I] = sample(a, l, I)
+
+ for l in range(2):
+ fill(l)
+ aTob(l)
+
+ for l in range(2):
+ for i in range(16):
+ for j in range(16):
+ assert b[l][i, j] == l
+
+
+@test_utils.test()
+def test_func_template2():
+ a = ti.field(dtype=ti.f32)
+ b = ti.field(dtype=ti.f32)
+
+ ti.root.dense(ti.ij, 16).place(a, b)
+
+ @ti.func
+ def sample(x: ti.template(), I):
+ return x[I]
+
+ @ti.kernel
+ def fill():
+ for I in ti.grouped(a):
+ a[I] = 1.0
+
+ @ti.kernel
+ def aTob():
+ for I in ti.grouped(b):
+ b[I] = sample(a, I)
+
+ for l in range(2):
+ fill()
+ aTob()
+
+ for i in range(16):
+ for j in range(16):
+ assert b[i, j] == 1.0
diff --git a/tests/_python_orig/test_lang.py b/tests/_python_orig/test_lang.py
new file mode 100644
index 000000000..1ce3dcde8
--- /dev/null
+++ b/tests/_python_orig/test_lang.py
@@ -0,0 +1,162 @@
+import numpy as np
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_nested_subscript():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ ti.root.dense(ti.i, 1).place(x)
+ ti.root.dense(ti.i, 1).place(y)
+
+ x[0] = 0
+
+ @ti.kernel
+ def inc():
+ for i in range(1):
+ x[x[i]] += 1
+
+ inc()
+
+ assert x[0] == 1
+
+
+@test_utils.test()
+def test_norm():
+ val = ti.field(ti.i32)
+ f = ti.field(ti.f32)
+
+ n = 1024
+
+ ti.root.dense(ti.i, n).dense(ti.i, 2).place(val, f)
+
+ @ti.kernel
+ def test():
+ for i in range(n):
+ s = 0
+ for j in range(10):
+ s += j
+ a = ti.Vector([0.4, 0.3])
+ val[i] = s + ti.cast(a.norm() * 100, ti.i32) + i
+
+ test()
+
+ @ti.kernel
+ def test2():
+ for i in range(n):
+ val[i] += 1
+
+ test2()
+
+ for i in range(n):
+ assert val[i] == 96 + i
+
+
+@test_utils.test()
+def test_simple2():
+ val = ti.field(ti.i32)
+ f = ti.field(ti.f32)
+
+ n = 16
+
+ ti.root.dense(ti.i, n).place(val, f)
+
+ @ti.kernel
+ def test():
+ for i in range(n):
+ val[i] = i * 2
+
+ test()
+
+ @ti.kernel
+ def test2():
+ for i in range(n):
+ val[i] += 1
+
+ test2()
+
+ for i in range(n):
+ assert val[i] == 1 + i * 2
+
+
+@test_utils.test()
+def test_recreate():
+ @ti.kernel
+ def test():
+ a = 0
+ a, b = 1, 2
+
+ test()
+
+
+@test_utils.test()
+def test_local_atomics():
+ n = 32
+ val = ti.field(ti.i32, shape=n)
+
+ @ti.kernel
+ def test():
+ for i in range(n):
+ s = 0
+ s += 45
+ print(s)
+ val[i] = s + i
+ print(val[i])
+
+ test()
+
+ for i in range(n):
+ assert val[i] == i + 45
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_loop_var_life():
+ @ti.kernel
+ def test():
+ for i in ti.static(range(8)):
+ pass
+ print(i)
+
+ with pytest.raises(Exception):
+ test()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_loop_var_life_double_iters():
+ @ti.kernel
+ def test():
+ for i, v in ti.static(enumerate(range(8))):
+ pass
+ print(i)
+
+ with pytest.raises(Exception):
+ test()
+
+
+@pytest.mark.parametrize('dtype', [ti.i32, ti.f32, ti.i64, ti.f64])
+@pytest.mark.parametrize('ti_zero,zero', [(ti.zero, 0), (ti.one, 1)])
+@pytest.mark.parametrize('is_mat', [False, True])
+@test_utils.test(arch=ti.cpu)
+def test_meta_zero_one(dtype, ti_zero, zero, is_mat):
+ if is_mat:
+ x = ti.Matrix.field(2, 3, dtype, ())
+ y = ti.Matrix.field(2, 3, dtype, ())
+ else:
+ x = ti.field(dtype, ())
+ y = ti.field(dtype, ())
+
+ @ti.kernel
+ def func():
+ y[None] = ti_zero(x[None])
+
+ for a in [-1, -2.3, -1, -0.3, 0, 1, 1.9, 2, 3]:
+ if ti.types.is_integral(dtype):
+ a = int(a)
+ x.fill(a)
+ func()
+ assert np.all(y.to_numpy() == zero)
diff --git a/tests/_python_orig/test_lexical_scope.py b/tests/_python_orig/test_lexical_scope.py
new file mode 100644
index 000000000..4c5909c93
--- /dev/null
+++ b/tests/_python_orig/test_lexical_scope.py
@@ -0,0 +1,28 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(ti.cpu)
+def test_func_closure():
+ def my_test():
+ a = 32
+
+ @ti.func
+ def foo():
+ ti.static_assert(a == 32)
+
+ @ti.kernel
+ def func():
+ ti.static_assert(a == 32)
+ foo()
+
+ def dummy():
+ func()
+
+ func()
+ dummy()
+ return dummy, func
+
+ dummy, func = my_test()
+ func()
+ dummy()
diff --git a/tests/_python_orig/test_linalg.py b/tests/_python_orig/test_linalg.py
new file mode 100644
index 000000000..3a882ce69
--- /dev/null
+++ b/tests/_python_orig/test_linalg.py
@@ -0,0 +1,460 @@
+import math
+
+import numpy as np
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_const_init():
+ a = ti.Matrix.field(2, 3, dtype=ti.i32, shape=())
+ b = ti.Vector.field(3, dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def init():
+ a[None] = ti.Matrix([[0, 1, 2], [3, 4, 5]])
+ b[None] = ti.Vector([0, 1, 2])
+
+ init()
+
+ for i in range(2):
+ for j in range(3):
+ assert a[None][i, j] == i * 3 + j
+
+ for j in range(3):
+ assert b[None][j] == j
+
+
+@test_utils.test()
+def test_basic_utils():
+ a = ti.Vector.field(3, dtype=ti.f32)
+ b = ti.Vector.field(2, dtype=ti.f32)
+ abT = ti.Matrix.field(3, 2, dtype=ti.f32)
+ aNormalized = ti.Vector.field(3, dtype=ti.f32)
+
+ normA = ti.field(ti.f32)
+ normSqrA = ti.field(ti.f32)
+ normInvA = ti.field(ti.f32)
+
+ ti.root.place(a, b, abT, aNormalized, normA, normSqrA, normInvA)
+
+ @ti.kernel
+ def init():
+ a[None] = ti.Vector([1.0, 2.0, -3.0])
+ b[None] = ti.Vector([4.0, 5.0])
+ abT[None] = a[None].outer_product(b[None])
+
+ normA[None] = a[None].norm()
+ normSqrA[None] = a[None].norm_sqr()
+ normInvA[None] = a[None].norm_inv()
+
+ aNormalized[None] = a[None].normalized()
+
+ init()
+
+ for i in range(3):
+ for j in range(2):
+ assert abT[None][i, j] == a[None][i] * b[None][j]
+
+ sqrt14 = np.sqrt(14.0)
+ invSqrt14 = 1.0 / sqrt14
+ assert normSqrA[None] == test_utils.approx(14.0)
+ assert normInvA[None] == test_utils.approx(invSqrt14)
+ assert normA[None] == test_utils.approx(sqrt14)
+ assert aNormalized[None][0] == test_utils.approx(1.0 * invSqrt14)
+ assert aNormalized[None][1] == test_utils.approx(2.0 * invSqrt14)
+ assert aNormalized[None][2] == test_utils.approx(-3.0 * invSqrt14)
+
+
+@test_utils.test()
+def test_cross():
+ a = ti.Vector.field(3, dtype=ti.f32)
+ b = ti.Vector.field(3, dtype=ti.f32)
+ c = ti.Vector.field(3, dtype=ti.f32)
+
+ a2 = ti.Vector.field(2, dtype=ti.f32)
+ b2 = ti.Vector.field(2, dtype=ti.f32)
+ c2 = ti.field(dtype=ti.f32)
+
+ ti.root.place(a, b, c, a2, b2, c2)
+
+ @ti.kernel
+ def init():
+ a[None] = ti.Vector([1.0, 2.0, 3.0])
+ b[None] = ti.Vector([4.0, 5.0, 6.0])
+ c[None] = a[None].cross(b[None])
+
+ a2[None] = ti.Vector([1.0, 2.0])
+ b2[None] = ti.Vector([4.0, 5.0])
+ c2[None] = a2[None].cross(b2[None])
+
+ init()
+ assert c[None][0] == -3.0
+ assert c[None][1] == 6.0
+ assert c[None][2] == -3.0
+ assert c2[None] == -3.0
+
+
+@test_utils.test()
+def test_dot():
+ a = ti.Vector.field(3, dtype=ti.f32)
+ b = ti.Vector.field(3, dtype=ti.f32)
+ c = ti.field(dtype=ti.f32)
+
+ a2 = ti.Vector.field(2, dtype=ti.f32)
+ b2 = ti.Vector.field(2, dtype=ti.f32)
+ c2 = ti.field(dtype=ti.f32)
+
+ ti.root.place(a, b, c, a2, b2, c2)
+
+ @ti.kernel
+ def init():
+ a[None] = ti.Vector([1.0, 2.0, 3.0])
+ b[None] = ti.Vector([4.0, 5.0, 6.0])
+ c[None] = a[None].dot(b[None])
+
+ a2[None] = ti.Vector([1.0, 2.0])
+ b2[None] = ti.Vector([4.0, 5.0])
+ c2[None] = a2[None].dot(b2[None])
+
+ init()
+ assert c[None] == 32.0
+ assert c2[None] == 14.0
+
+
+@test_utils.test()
+def test_transpose():
+ dim = 3
+ m = ti.Matrix.field(dim, dim, ti.f32)
+
+ ti.root.place(m)
+
+ @ti.kernel
+ def transpose():
+ mat = m[None].transpose()
+ m[None] = mat
+
+ for i in range(dim):
+ for j in range(dim):
+ m[None][i, j] = i * 2 + j * 7
+
+ transpose()
+
+ for i in range(dim):
+ for j in range(dim):
+ assert m[None][j, i] == test_utils.approx(i * 2 + j * 7)
+
+
+def _test_polar_decomp(dim, dt):
+ m = ti.Matrix.field(dim, dim, dt)
+ r = ti.Matrix.field(dim, dim, dt)
+ s = ti.Matrix.field(dim, dim, dt)
+ I = ti.Matrix.field(dim, dim, dt)
+ D = ti.Matrix.field(dim, dim, dt)
+
+ ti.root.place(m, r, s, I, D)
+
+ @ti.kernel
+ def polar():
+ R, S = ti.polar_decompose(m[None], dt)
+ r[None] = R
+ s[None] = S
+ m[None] = R @ S
+ I[None] = R @ R.transpose()
+ D[None] = S - S.transpose()
+
+ def V(i, j):
+ return i * 2 + j * 7 + int(i == j) * 3
+
+ for i in range(dim):
+ for j in range(dim):
+ m[None][i, j] = V(i, j)
+
+ polar()
+
+ tol = 5e-5 if dt == ti.f32 else 1e-12
+
+ for i in range(dim):
+ for j in range(dim):
+ assert m[None][i, j] == test_utils.approx(V(i, j), abs=tol)
+ assert I[None][i, j] == test_utils.approx(int(i == j), abs=tol)
+ assert D[None][i, j] == test_utils.approx(0, abs=tol)
+
+
+def test_polar_decomp():
+ for dim in [2, 3]:
+ for dt in [ti.f32, ti.f64]:
+
+ @test_utils.test(
+ require=ti.extension.data64 if dt == ti.f64 else [],
+ default_fp=dt)
+ def wrapped():
+ _test_polar_decomp(dim, dt)
+
+ wrapped()
+
+
+@test_utils.test()
+def test_matrix():
+ x = ti.Matrix.field(2, 2, dtype=ti.i32)
+
+ ti.root.dense(ti.i, 16).place(x)
+
+ @ti.kernel
+ def inc():
+ for i in x:
+ delta = ti.Matrix([[3, 0], [0, 0]])
+ x[i][1, 1] = x[i][0, 0] + 1
+ x[i] = x[i] + delta
+ x[i] += delta
+
+ for i in range(10):
+ x[i][0, 0] = i
+
+ inc()
+
+ for i in range(10):
+ assert x[i][0, 0] == 6 + i
+ assert x[i][1, 1] == 1 + i
+
+
+@test_utils.test()
+def _test_mat_inverse_size(n):
+ m = ti.Matrix.field(n, n, dtype=ti.f32, shape=())
+ M = np.empty(shape=(n, n), dtype=np.float32)
+ for i in range(n):
+ for j in range(n):
+ M[i, j] = i * j + i * 3 + j + 1 + int(i == j) * 4
+ assert np.linalg.det(M) != 0
+
+ m.from_numpy(M)
+
+ @ti.kernel
+ def invert():
+ m[None] = m[None].inverse()
+
+ invert()
+
+ m_np = m.to_numpy(keep_dims=True)
+ np.testing.assert_almost_equal(m_np, np.linalg.inv(M))
+
+
+def test_mat_inverse():
+ for n in range(1, 5):
+ _test_mat_inverse_size(n)
+
+
+@test_utils.test()
+def test_matrix_factories():
+ a = ti.Vector.field(3, dtype=ti.i32, shape=3)
+ b = ti.Matrix.field(2, 2, dtype=ti.f32, shape=2)
+ c = ti.Matrix.field(2, 3, dtype=ti.f32, shape=2)
+
+ @ti.kernel
+ def fill():
+ b[0] = ti.Matrix.identity(ti.f32, 2)
+ b[1] = ti.Matrix.rotation2d(math.pi / 3)
+ c[0] = ti.Matrix.zero(ti.f32, 2, 3)
+ c[1] = ti.Matrix.one(ti.f32, 2, 3)
+ for i in ti.static(range(3)):
+ a[i] = ti.Vector.unit(3, i)
+
+ fill()
+
+ for i in range(3):
+ for j in range(3):
+ assert a[i][j] == int(i == j)
+
+ sqrt3o2 = math.sqrt(3) / 2
+ assert b[0].to_numpy() == test_utils.approx(np.eye(2))
+ assert b[1].to_numpy() == test_utils.approx(
+ np.array([[0.5, -sqrt3o2], [sqrt3o2, 0.5]]))
+ assert c[0].to_numpy() == test_utils.approx(np.zeros((2, 3)))
+ assert c[1].to_numpy() == test_utils.approx(np.ones((2, 3)))
+
+
+# TODO: move codes below to test_matrix.py:
+
+
+@test_utils.test()
+def test_init_matrix_from_vectors():
+ m1 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+ m2 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+ m3 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+ m4 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+
+ @ti.kernel
+ def fill():
+ for i in range(3):
+ a = ti.Vector([1.0, 4.0, 7.0])
+ b = ti.Vector([2.0, 5.0, 8.0])
+ c = ti.Vector([3.0, 6.0, 9.0])
+ m1[i] = ti.Matrix.rows([a, b, c])
+ m2[i] = ti.Matrix.cols([a, b, c])
+ m3[i] = ti.Matrix.rows([[1.0, 4.0, 7.0], [2.0, 5.0, 8.0],
+ [3.0, 6.0, 9.0]])
+ m4[i] = ti.Matrix.cols([[1.0, 4.0, 7.0], [2.0, 5.0, 8.0],
+ [3.0, 6.0, 9.0]])
+
+ fill()
+
+ for j in range(3):
+ for i in range(3):
+ assert m1[0][i, j] == int(i + 3 * j + 1)
+ assert m2[0][j, i] == int(i + 3 * j + 1)
+ assert m3[0][i, j] == int(i + 3 * j + 1)
+ assert m4[0][j, i] == int(i + 3 * j + 1)
+
+
+# TODO: Remove this once the APIs are obsolete.
+@pytest.mark.filterwarnings('ignore')
+@test_utils.test(arch=get_host_arch_list())
+def test_init_matrix_from_vectors_deprecated():
+ m1 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+ m2 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+ m3 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+ m4 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(3))
+
+ @ti.kernel
+ def fill():
+ for i in range(3):
+ a = ti.Vector([1.0, 4.0, 7.0])
+ b = ti.Vector([2.0, 5.0, 8.0])
+ c = ti.Vector([3.0, 6.0, 9.0])
+ m1[i] = ti.Matrix.rows([a, b, c])
+ m2[i] = ti.Matrix.cols([a, b, c])
+ m3[i] = ti.Matrix.rows([[1.0, 4.0, 7.0], [2.0, 5.0, 8.0],
+ [3.0, 6.0, 9.0]])
+ m4[i] = ti.Matrix.cols([[1.0, 4.0, 7.0], [2.0, 5.0, 8.0],
+ [3.0, 6.0, 9.0]])
+
+ fill()
+
+ for j in range(3):
+ for i in range(3):
+ assert m1[0][i, j] == int(i + 3 * j + 1)
+ assert m2[0][j, i] == int(i + 3 * j + 1)
+ assert m3[0][i, j] == int(i + 3 * j + 1)
+ assert m4[0][j, i] == int(i + 3 * j + 1)
+
+
+@test_utils.test()
+def test_any_all():
+ a = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ b = ti.field(dtype=ti.i32, shape=())
+ c = ti.field(dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ b[None] = any(a[None])
+ c[None] = all(a[None])
+
+ for i in range(2):
+ for j in range(2):
+ a[None][0, 0] = i
+ a[None][1, 0] = j
+ a[None][1, 1] = i
+ a[None][0, 1] = j
+
+ func()
+ if i == 1 or j == 1:
+ assert b[None] == 1
+ else:
+ assert b[None] == 0
+
+ if i == 1 and j == 1:
+ assert c[None] == 1
+ else:
+ assert c[None] == 0
+
+
+@test_utils.test()
+def test_min_max():
+ a = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ b = ti.field(dtype=ti.i32, shape=())
+ c = ti.field(dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ b[None] = a[None].max()
+ c[None] = a[None].min()
+
+ for i in range(2):
+ for j in range(2):
+ a[None][0, 0] = i
+ a[None][1, 0] = j
+ a[None][1, 1] = i
+ a[None][0, 1] = j
+
+ func()
+ assert b[None] == max(i, j)
+ assert c[None] == min(i, j)
+
+
+# must not throw any error:
+@test_utils.test()
+def test_matrix_list_assign():
+
+ m = ti.Matrix.field(2, 2, dtype=ti.i32, shape=(2, 2, 1))
+ v = ti.Vector.field(2, dtype=ti.i32, shape=(2, 2, 1))
+
+ m[1, 0, 0] = [[4, 3], [6, 7]]
+ v[1, 0, 0] = [8, 4]
+
+ assert np.allclose(m.to_numpy()[1, 0, 0, :, :], np.array([[4, 3], [6, 7]]))
+ assert np.allclose(v.to_numpy()[1, 0, 0, :], np.array([8, 4]))
+
+ @ti.kernel
+ def func():
+ m[1, 0, 0] = [[1, 2], [3, 4]]
+ v[1, 0, 0] = [5, 6]
+ m[1, 0, 0] += [[1, 2], [3, 4]]
+ v[1, 0, 0] += [5, 6]
+
+ func()
+ assert np.allclose(m.to_numpy()[1, 0, 0, :, :], np.array([[2, 4], [6, 8]]))
+ assert np.allclose(v.to_numpy()[1, 0, 0, :], np.array([10, 12]))
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_vector_xyzw_accessor():
+ u = ti.Vector.field(2, dtype=ti.i32, shape=(2, 2, 1))
+ v = ti.Vector.field(4, dtype=ti.i32, shape=(2, 2, 1))
+
+ u[1, 0, 0].y = 3
+ v[1, 0, 0].z = 0
+ v[1, 0, 0].w = 4
+
+ @ti.kernel
+ def func():
+ u[1, 0, 0].x = 8 * u[1, 0, 0].y
+ v[1, 0, 0].z = 1 - v[1, 0, 0].w
+ v[1, 0, 0].x = 6
+
+ func()
+ assert u[1, 0, 0].x == 24
+ assert u[1, 0, 0].y == 3
+ assert v[1, 0, 0].z == -3
+ assert v[1, 0, 0].w == 4
+ assert np.allclose(v.to_numpy()[1, 0, 0, :], np.array([6, 0, -3, 4]))
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_diag():
+ m1 = ti.Matrix.field(3, 3, dtype=ti.f32, shape=())
+
+ @ti.kernel
+ def fill():
+ m1[None] = ti.Matrix.diag(dim=3, val=1.4)
+
+ fill()
+
+ for i in range(3):
+ for j in range(3):
+ if i == j:
+ assert m1[None][i, j] == test_utils.approx(1.4)
+ else:
+ assert m1[None][i, j] == 0.0
diff --git a/tests/_python_orig/test_listgen.py b/tests/_python_orig/test_listgen.py
new file mode 100644
index 000000000..eb6194624
--- /dev/null
+++ b/tests/_python_orig/test_listgen.py
@@ -0,0 +1,62 @@
+from random import randrange
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_listgen():
+ x = ti.field(ti.i32)
+ n = 1024
+
+ ti.root.dense(ti.ij, 4).dense(ti.ij, 4).dense(ti.ij,
+ 4).dense(ti.ij,
+ 4).dense(ti.ij,
+ 4).place(x)
+
+ @ti.kernel
+ def fill(c: ti.i32):
+ for i, j in x:
+ x[i, j] = i * 10 + j + c
+
+ for c in range(2):
+ print('Testing c=%d' % c)
+ fill(c)
+ # read it out once to avoid launching too many operator[] kernels
+ xnp = x.to_numpy()
+ for i in range(n):
+ for j in range(n):
+ assert xnp[i, j] == i * 10 + j + c
+
+ # Randomly check 1000 items to ensure [] work as well
+ for _ in range(1000):
+ i, j = randrange(n), randrange(n)
+ assert x[i, j] == i * 10 + j + c
+
+
+@test_utils.test()
+def test_nested_3d():
+ x = ti.field(ti.i32)
+ n = 128
+
+ ti.root.dense(ti.ijk, 4).dense(ti.ijk, 4).dense(ti.ijk,
+ 4).dense(ti.ijk,
+ 2).place(x)
+
+ @ti.kernel
+ def fill():
+ for i, j, k in x:
+ x[i, j, k] = (i * n + j) * n + k
+
+ fill()
+ # read it out once to avoid launching too many operator[] kernels
+ xnp = x.to_numpy()
+ for i in range(n):
+ for j in range(n):
+ for k in range(n):
+ assert xnp[i, j, k] == (i * n + j) * n + k
+
+ # Randomly check 1000 items to ensure [] work as well
+ for _ in range(1000):
+ i, j, k = randrange(n), randrange(n), randrange(n)
+ assert x[i, j, k] == (i * n + j) * n + k
diff --git a/tests/_python_orig/test_local_atomic_opt.py b/tests/_python_orig/test_local_atomic_opt.py
new file mode 100644
index 000000000..560fb9071
--- /dev/null
+++ b/tests/_python_orig/test_local_atomic_opt.py
@@ -0,0 +1,32 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_cse():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ a += 10
+ a = a + 123
+ A[None] = a
+
+ func()
+ assert A[None] == 133
+
+
+@test_utils.test()
+def test_store_forward():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ a = 123
+ a += 10
+ A[None] = a
+
+ func()
+ assert A[None] == 133
diff --git a/tests/_python_orig/test_local_atomics.py b/tests/_python_orig/test_local_atomics.py
new file mode 100644
index 000000000..272834d97
--- /dev/null
+++ b/tests/_python_orig/test_local_atomics.py
@@ -0,0 +1,184 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_explicit_local_atomic_add():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ for i in range(10):
+ ti.atomic_add(a, i)
+ A[None] = a
+
+ func()
+ assert A[None] == 45
+
+
+@test_utils.test()
+def test_implicit_local_atomic_add():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ for i in range(10):
+ a += i
+ A[None] = a
+
+ func()
+ assert A[None] == 45
+
+
+@test_utils.test()
+def test_explicit_local_atomic_sub():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ for i in range(10):
+ ti.atomic_sub(a, i)
+ A[None] = a
+
+ func()
+ assert A[None] == -45
+
+
+@test_utils.test()
+def test_implicit_local_atomic_sub():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ for i in range(10):
+ a -= i
+ A[None] = a
+
+ func()
+ assert A[None] == -45
+
+
+@test_utils.test()
+def test_explicit_local_atomic_min():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 1000
+ for i in range(10):
+ ti.atomic_min(a, i)
+ A[None] = a
+
+ func()
+ assert A[None] == 0
+
+
+@test_utils.test()
+def test_explicit_local_atomic_max():
+ A = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def func():
+ a = -1000
+ for i in range(10):
+ ti.atomic_max(a, i)
+ A[None] = a
+
+ func()
+ assert A[None] == 9
+
+
+@test_utils.test()
+def test_explicit_local_atomic_and():
+ A = ti.field(ti.i32, shape=())
+ max_int = 2147483647
+
+ @ti.kernel
+ def func():
+ a = 1023
+ for i in range(10):
+ ti.atomic_and(a, max_int - 2**i)
+ A[None] = a
+
+ func()
+ assert A[None] == 0
+
+
+@test_utils.test()
+def test_implicit_local_atomic_and():
+ A = ti.field(ti.i32, shape=())
+ max_int = 2147483647
+
+ @ti.kernel
+ def func():
+ a = 1023
+ for i in range(10):
+ a &= max_int - 2**i
+ A[None] = a
+
+ func()
+ assert A[None] == 0
+
+
+@test_utils.test()
+def test_explicit_local_atomic_or():
+ A = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ for i in range(10):
+ ti.atomic_or(a, 2**i)
+ A[None] = a
+
+ func()
+ assert A[None] == 1023
+
+
+@test_utils.test()
+def test_implicit_local_atomic_or():
+ A = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 0
+ for i in range(10):
+ a |= 2**i
+ A[None] = a
+
+ func()
+ assert A[None] == 1023
+
+
+@test_utils.test()
+def test_explicit_local_atomic_xor():
+ A = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 1023
+ for i in range(10):
+ ti.atomic_xor(a, 2**i)
+ A[None] = a
+
+ func()
+ assert A[None] == 0
+
+
+@test_utils.test()
+def test_implicit_local_atomic_xor():
+ A = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = 1023
+ for i in range(10):
+ a ^= 2**i
+ A[None] = a
+
+ func()
+ assert A[None] == 0
diff --git a/tests/_python_orig/test_loop_grad.py b/tests/_python_orig/test_loop_grad.py
new file mode 100644
index 000000000..ab2096134
--- /dev/null
+++ b/tests/_python_orig/test_loop_grad.py
@@ -0,0 +1,65 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.vulkan])
+def test_loop_grad():
+ x = ti.field(ti.f32)
+
+ n = 16
+ m = 8
+
+ ti.root.dense(ti.ij, (n, m)).place(x)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for k in range(n):
+ for i in range(m - 1):
+ x[k, i + 1] = x[k, i] * 2
+
+ for k in range(n):
+ x[k, 0] = k
+ func()
+
+ for k in range(n):
+ x.grad[k, m - 1] = 1
+ func.grad()
+
+ for k in range(n):
+ for i in range(m):
+ assert x[k, i] == 2**i * k
+ assert x.grad[k, i] == 2**(m - 1 - i)
+
+
+@test_utils.test(exclude=[ti.vulkan])
+def test_loop_grad_complex():
+ return # This case is not supported yet
+ x = ti.field(ti.f32)
+
+ n = 16
+ m = 8
+
+ ti.root.dense(ti.ij, (n, m)).place(x)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for k in range(n):
+ t = k * k
+ tt = t * 2
+ for i in range(m - 1):
+ x[k, i + 1] = x[k, i] * 2 + tt
+
+ for k in range(n):
+ x[k, 0] = k
+ func()
+
+ for k in range(n):
+ x.grad[k, m - 1] = 1
+ func.grad()
+
+ for k in range(n):
+ for i in range(m):
+ assert x[k, i] == i**2 + 2 * k**2
+ assert x.grad[k, i] == 2**(m - 1 - i)
diff --git a/tests/_python_orig/test_loop_unique.py b/tests/_python_orig/test_loop_unique.py
new file mode 100644
index 000000000..8c439b3bd
--- /dev/null
+++ b/tests/_python_orig/test_loop_unique.py
@@ -0,0 +1,161 @@
+from taichi.lang.misc import loop_unique
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_loop_unique_simple_1d():
+ x, y = ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 16
+ ti.root.pointer(ti.i, N).place(x)
+ ti.root.pointer(ti.i, N).place(y)
+
+ @ti.kernel
+ def inc_y():
+ for i in x:
+ a = loop_unique(x[i])
+ y[a] += 1
+
+ x[1] = 2
+ x[2] = 3
+ x[7] = 5
+ y[3] = 2
+ y[4] = 3
+ inc_y()
+ expected_result = {2: 1, 3: 3, 4: 3, 5: 1}
+ for i in range(N):
+ assert y[i] == expected_result.get(i, 0)
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_loop_unique_binary_op_1d():
+ x, y = ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 16
+ ti.root.pointer(ti.i, N).place(x)
+ ti.root.pointer(ti.i, N).place(y)
+
+ @ti.kernel
+ def inc_y():
+ for i in x:
+ a = loop_unique(x[i])
+ y[a + 1] += 1
+
+ x[1] = 2
+ x[2] = 3
+ x[7] = 5
+ y[3] = 2
+ y[4] = 3
+ inc_y()
+ expected_result = {3: 3, 4: 4, 6: 1}
+ for i in range(N):
+ assert y[i] == expected_result.get(i, 0)
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_loop_unique_nested_1d():
+ x, y = ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 16
+ ti.root.pointer(ti.i, N).place(x)
+ ti.root.pointer(ti.i, N).place(y)
+
+ @ti.kernel
+ def inc_y():
+ for i in x:
+ for j in range(i):
+ a = loop_unique(x[i])
+ y[a] += 1
+
+ x[1] = 2
+ x[2] = 3
+ x[7] = 5
+ y[3] = 2
+ y[4] = 3
+ inc_y()
+ expected_result = {2: 1, 3: 4, 4: 3, 5: 7}
+ for i in range(N):
+ assert y[i] == expected_result.get(i, 0)
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_loop_unique_2d():
+ x, y, z = ti.field(ti.i32), ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 8
+ ti.root.pointer(ti.ij, N).place(x)
+ ti.root.pointer(ti.ij, N).place(y)
+ ti.root.pointer(ti.ij, N).place(z)
+
+ @ti.kernel
+ def inc_y_z():
+ for i, j in x:
+ a = loop_unique(x[i, j])
+ y[a, j] += 1
+ z[i, i] += 1 # cannot demote this
+
+ x[1, 1] = 2
+ x[1, 2] = 4
+ x[1, 3] = 5
+ x[1, 4] = 7
+ x[1, 5] = 0
+ x[1, 6] = 1
+ x[2, 5] = 3
+ x[2, 7] = 6
+ y[3, 5] = 3
+ y[6, 6] = 8
+ z[2, 2] = 5
+ inc_y_z()
+ expected_result_y = {
+ (0, 5): 1,
+ (1, 6): 1,
+ (2, 1): 1,
+ (3, 5): 4,
+ (4, 2): 1,
+ (5, 3): 1,
+ (6, 6): 8,
+ (6, 7): 1,
+ (7, 4): 1
+ }
+ expected_result_z = {(1, 1): 6, (2, 2): 7}
+ for i in range(N):
+ for j in range(N):
+ assert y[i, j] == expected_result_y.get((i, j), 0)
+ assert z[i, j] == expected_result_z.get((i, j), 0)
+
+
+@test_utils.test()
+def test_loop_unique_ndrange():
+ x, y, z = ti.field(ti.i32), ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 8
+ M = 32
+ ti.root.dense(ti.ij, N).place(x)
+ ti.root.dense(ti.i, M).place(y)
+ ti.root.dense(ti.ij, N).place(z)
+
+ a = 3
+ b = 5
+
+ @ti.kernel
+ def prepare_x():
+ for i, j in ti.ndrange(a, b):
+ x[i, j] = i * (b + 1) + j + 1
+
+ @ti.kernel
+ def inc_y_z():
+ for i, j in ti.ndrange(a, b):
+ u = loop_unique(x[i, j])
+ y[u] += i
+ z[i, j + 1] += 10 # TODO: demote this
+
+ prepare_x()
+ inc_y_z()
+ for i in range(a * (b + 1)):
+ assert y[i] == (0 if i % (b + 1) == 0 else i // (b + 1))
+
+ for i in range(a):
+ for j in range(b + 1):
+ assert z[i, j] == (0 if j == 0 else 10)
diff --git a/tests/_python_orig/test_loops.py b/tests/_python_orig/test_loops.py
new file mode 100644
index 000000000..588bf302d
--- /dev/null
+++ b/tests/_python_orig/test_loops.py
@@ -0,0 +1,174 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_loops():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ N = 512
+
+ ti.root.dense(ti.i, N).place(x)
+ ti.root.dense(ti.i, N).place(y)
+ ti.root.lazy_grad()
+
+ for i in range(N // 2, N):
+ y[i] = i - 300
+
+ @ti.kernel
+ def func():
+ for i in range(ti.static(N // 2 + 3), N):
+ x[i] = abs(y[i])
+
+ func()
+
+ for i in range(N // 2 + 3):
+ assert x[i] == 0
+
+ for i in range(N // 2 + 3, N):
+ assert x[i] == abs(y[i])
+
+
+@test_utils.test()
+def test_numpy_loops():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+
+ N = 512
+
+ ti.root.dense(ti.i, N).place(x)
+ ti.root.dense(ti.i, N).place(y)
+ ti.root.lazy_grad()
+
+ for i in range(N // 2, N):
+ y[i] = i - 300
+
+ import numpy as np
+ begin = (np.ones(1) * (N // 2 + 3)).astype(np.int32).reshape(())
+ end = (np.ones(1) * N).astype(np.int32).reshape(())
+
+ @ti.kernel
+ def func():
+ for i in range(begin, end):
+ x[i] = abs(y[i])
+
+ func()
+
+ for i in range(N // 2 + 3):
+ assert x[i] == 0
+
+ for i in range(N // 2 + 3, N):
+ assert x[i] == abs(y[i])
+
+
+@test_utils.test()
+def test_nested_loops():
+ # this may crash if any LLVM allocas are called in the loop body
+ x = ti.field(ti.i32)
+
+ n = 2048
+
+ ti.root.dense(ti.ij, n).place(x)
+
+ @ti.kernel
+ def paint():
+ for i in range(n):
+ for j in range(n):
+ x[0, 0] = i
+
+ paint()
+
+
+@test_utils.test()
+def test_zero_outer_loop():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def test():
+ for i in range(0):
+ x[None] = 1
+
+ test()
+
+ assert x[None] == 0
+
+
+@test_utils.test()
+def test_zero_inner_loop():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def test():
+ for i in range(1):
+ for j in range(0):
+ x[None] = 1
+
+ test()
+
+ assert x[None] == 0
+
+
+@test_utils.test()
+def test_dynamic_loop_range():
+ x = ti.field(ti.i32)
+ c = ti.field(ti.i32)
+ n = 2000
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(c)
+
+ @ti.kernel
+ def test():
+ for i in x:
+ x[i] = ti.atomic_add(c[None], 1)
+ for i in range(c[None], c[None] * 2):
+ x[i - n] += c[None]
+
+ test()
+ assert c[None] == n
+ assert sum(x.to_numpy()) == (n * (n - 1) // 2) + n * n
+
+
+@test_utils.test()
+def test_loop_arg_as_range():
+ # Dynamic range loops are intended to make sure global tmps work
+ x = ti.field(ti.i32)
+ n = 1000
+
+ ti.root.dense(ti.i, n).place(x)
+
+ @ti.kernel
+ def test(b: ti.i32, e: ti.i32):
+ for i in range(b, e):
+ x[i - b] = i
+
+ pairs = [
+ (0, n // 2),
+ (n // 2, n),
+ (-n // 2, -n // 3),
+ ]
+ for b, e in pairs:
+ test(b, e)
+ for i in range(b, e):
+ assert x[i - b] == i
+
+
+@test_utils.test()
+def test_assignment_in_nested_loops():
+ # https://github.com/taichi-dev/taichi/issues/1109
+ m = ti.field(ti.f32, 3)
+ x = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a = x[None]
+ for i in m:
+ b = a
+ for j in range(1):
+ b = b
+ x[None] = b
+
+ x[None] = 1
+ func()
+ assert x[None] == 1
diff --git a/tests/_python_orig/test_materialize_check.py b/tests/_python_orig/test_materialize_check.py
new file mode 100644
index 000000000..e7acc6689
--- /dev/null
+++ b/tests/_python_orig/test_materialize_check.py
@@ -0,0 +1,35 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_check_field_not_placed():
+ a = ti.field(ti.i32)
+
+ @ti.kernel
+ def foo():
+ pass
+
+ with pytest.raises(RuntimeError,
+ match=r"These field\(s\) are not placed.*"):
+ foo()
+
+
+@test_utils.test()
+def test_check_matrix_field_member_shape():
+ a = ti.Matrix.field(2, 2, ti.i32)
+ ti.root.dense(ti.i, 10).place(a.get_scalar_field(0, 0))
+ ti.root.dense(ti.i, 11).place(a.get_scalar_field(0, 1))
+ ti.root.dense(ti.i, 10).place(a.get_scalar_field(1, 0))
+ ti.root.dense(ti.i, 11).place(a.get_scalar_field(1, 1))
+
+ @ti.kernel
+ def foo():
+ pass
+
+ with pytest.raises(
+ RuntimeError,
+ match=r"Members of the following field have different shapes.*"):
+ foo()
diff --git a/tests/_python_orig/test_matrix.py b/tests/_python_orig/test_matrix.py
new file mode 100644
index 000000000..22d56d630
--- /dev/null
+++ b/tests/_python_orig/test_matrix.py
@@ -0,0 +1,526 @@
+import math
+import operator
+
+import numpy as np
+import pytest
+from taichi.lang import impl
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+operation_types = [operator.add, operator.sub, operator.matmul]
+test_matrix_arrays = [
+ np.array([[1, 2], [3, 4]]),
+ np.array([[5, 6], [7, 8]]),
+ np.array([[2, 8], [-1, 3]])
+]
+
+vector_operation_types = [operator.add, operator.sub]
+test_vector_arrays = [
+ np.array([42, 42]),
+ np.array([24, 24]),
+ np.array([83, 12])
+]
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_python_scope_vector_operations():
+ for ops in vector_operation_types:
+ a, b = test_vector_arrays[:2]
+ m1, m2 = ti.Vector(a), ti.Vector(b)
+ c = ops(m1, m2)
+ assert np.allclose(c.to_numpy(), ops(a, b))
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_python_scope_matrix_operations():
+ for ops in operation_types:
+ a, b = test_matrix_arrays[:2]
+ m1, m2 = ti.Matrix(a), ti.Matrix(b)
+ c = ops(m1, m2)
+ assert np.allclose(c.to_numpy(), ops(a, b))
+
+
+# TODO: Loops inside the function will cause AssertionError:
+# No new variables can be declared after kernel invocations
+# or Python-scope field accesses.
+# ideally we should use pytest.fixture to parameterize the tests
+# over explicit loops
+@pytest.mark.parametrize('ops', vector_operation_types)
+@test_utils.test(arch=get_host_arch_list())
+def test_python_scope_vector_field(ops):
+ t1 = ti.Vector.field(2, dtype=ti.i32, shape=())
+ t2 = ti.Vector.field(2, dtype=ti.i32, shape=())
+ a, b = test_vector_arrays[:2]
+ t1[None], t2[None] = a.tolist(), b.tolist()
+
+ c = ops(t1[None], t2[None])
+ assert np.allclose(c.to_numpy(), ops(a, b))
+
+
+@pytest.mark.parametrize('ops', vector_operation_types)
+@test_utils.test(arch=get_host_arch_list())
+def test_python_scope_matrix_field(ops):
+ t1 = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ t2 = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ a, b = test_matrix_arrays[:2]
+ # ndarray not supported here
+ t1[None], t2[None] = a.tolist(), b.tolist()
+
+ c = ops(t1[None], t2[None])
+ print(c)
+
+ assert np.allclose(c.to_numpy(), ops(a, b))
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_constant_matrices():
+ assert ti.cos(math.pi / 3) == test_utils.approx(0.5)
+ assert np.allclose((-ti.Vector([2, 3])).to_numpy(), np.array([-2, -3]))
+ assert ti.cos(ti.Vector([2, 3])).to_numpy() == test_utils.approx(
+ np.cos(np.array([2, 3])))
+ assert ti.max(2, 3) == 3
+ res = ti.max(4, ti.Vector([3, 4, 5]))
+ assert np.allclose(res.to_numpy(), np.array([4, 4, 5]))
+ res = ti.Vector([2, 3]) + ti.Vector([3, 4])
+ assert np.allclose(res.to_numpy(), np.array([5, 7]))
+ res = ti.atan2(ti.Vector([2, 3]), ti.Vector([3, 4]))
+ assert res.to_numpy() == test_utils.approx(
+ np.arctan2(np.array([2, 3]), np.array([3, 4])))
+ res = ti.Matrix([[2, 3], [4, 5]]) @ ti.Vector([2, 3])
+ assert np.allclose(res.to_numpy(), np.array([13, 23]))
+ v = ti.Vector([3, 4])
+ w = ti.Vector([5, -12])
+ r = ti.Vector([1, 2, 3, 4])
+ s = ti.Matrix([[1, 2], [3, 4]])
+ assert v.normalized().to_numpy() == test_utils.approx(np.array([0.6, 0.8]))
+ assert v.cross(w) == test_utils.approx(-12 * 3 - 4 * 5)
+ w.y = v.x * w[0]
+ r.x = r.y
+ r.y = r.z
+ r.z = r.w
+ r.w = r.x
+ assert np.allclose(w.to_numpy(), np.array([5, 15]))
+ assert ti.select(ti.Vector([1, 0]), ti.Vector([2, 3]),
+ ti.Vector([4, 5])) == ti.Vector([2, 5])
+ s[0, 1] = 2
+ assert s[0, 1] == 2
+
+ @ti.kernel
+ def func(t: ti.i32):
+ m = ti.Matrix([[2, 3], [4, t]])
+ print(m @ ti.Vector([2, 3]))
+ m += ti.Matrix([[3, 4], [5, t]])
+ print(m @ v)
+ print(r.x, r.y, r.z, r.w)
+ s = w.transpose() @ m
+ print(s)
+ print(m)
+
+ func(5)
+
+
+@pytest.mark.parametrize('ops', vector_operation_types)
+@test_utils.test(arch=get_host_arch_list())
+def test_taichi_scope_vector_operations_with_global_vectors(ops):
+ a, b, c = test_vector_arrays[:3]
+ m1, m2 = ti.Vector(a), ti.Vector(b)
+ r1 = ti.Vector.field(2, dtype=ti.i32, shape=())
+ r2 = ti.Vector.field(2, dtype=ti.i32, shape=())
+ m3 = ti.Vector.field(2, dtype=ti.i32, shape=())
+ m3.from_numpy(c)
+
+ @ti.kernel
+ def run():
+ r1[None] = ops(m1, m2)
+ r2[None] = ops(m1, m3[None])
+
+ run()
+
+ assert np.allclose(r1[None].to_numpy(), ops(a, b))
+ assert np.allclose(r2[None].to_numpy(), ops(a, c))
+
+
+@pytest.mark.parametrize('ops', vector_operation_types)
+@test_utils.test(arch=get_host_arch_list())
+def test_taichi_scope_matrix_operations_with_global_matrices(ops):
+ a, b, c = test_matrix_arrays[:3]
+ m1, m2 = ti.Matrix(a), ti.Matrix(b)
+ r1 = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ r2 = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ m3 = ti.Matrix.field(2, 2, dtype=ti.i32, shape=())
+ m3.from_numpy(c)
+
+ @ti.kernel
+ def run():
+ r1[None] = ops(m1, m2)
+ r2[None] = ops(m1, m3[None])
+
+ run()
+
+ assert np.allclose(r1[None].to_numpy(), ops(a, b))
+ assert np.allclose(r2[None].to_numpy(), ops(a, c))
+
+
+@test_utils.test()
+def test_matrix_non_constant_index_numpy():
+ @ti.kernel
+ def func1(a: ti.any_arr(element_dim=2)):
+ for i in range(5):
+ for j, k in ti.ndrange(2, 2):
+ a[i][j, k] = j * j + k * k
+
+ m = np.empty((5, 2, 2), dtype=np.int32)
+ func1(m)
+ assert m[1][0, 1] == 1
+ assert m[2][1, 0] == 1
+ assert m[3][1, 1] == 2
+ assert m[4][0, 1] == 1
+
+ @ti.kernel
+ def func2(b: ti.any_arr(element_dim=1, layout=ti.Layout.SOA)):
+ for i in range(5):
+ for j in range(4):
+ b[i][j * j] = j * j
+
+ v = np.empty((10, 5), dtype=np.int32)
+ func2(v)
+ assert v[0][1] == 0
+ assert v[1][1] == 1
+ assert v[4][1] == 4
+ assert v[9][1] == 9
+
+
+@test_utils.test(require=ti.extension.dynamic_index,
+ dynamic_index=True,
+ debug=True)
+def test_matrix_non_constant_index():
+ m = ti.Matrix.field(2, 2, ti.i32, 5)
+ v = ti.Vector.field(10, ti.i32, 5)
+
+ @ti.kernel
+ def func1():
+ for i in range(5):
+ for j, k in ti.ndrange(2, 2):
+ m[i][j, k] = j * j + k * k
+ assert m[1][0, 1] == 1
+ assert m[2][1, 0] == 1
+ assert m[3][1, 1] == 2
+
+ func1()
+ assert m[4][0, 1] == 1
+
+ @ti.kernel
+ def func2():
+ for i in range(5):
+ for j in range(4):
+ v[i][j * j] = j * j
+ assert v[1][0] == 0
+ assert v[1][1] == 1
+ assert v[1][4] == 4
+
+ func2()
+ assert v[1][9] == 9
+
+ @ti.kernel
+ def func3():
+ tmp = ti.Vector([1, 2, 3])
+ for i in range(3):
+ tmp[i] = i * i
+ vec = ti.Vector([4, 5, 6])
+ for j in range(3):
+ vec[tmp[i] % 3] += vec[j % 3]
+ assert tmp[0] == 0
+ assert tmp[1] == 1
+ assert tmp[2] == 4
+
+ func3()
+
+ @ti.kernel
+ def func4(k: ti.i32):
+ tmp = ti.Vector([k, k * 2, k * 3])
+ assert tmp[0] == k
+ assert tmp[1] == k * 2
+ assert tmp[2] == k * 3
+
+ func4(10)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_matrix_constant_index():
+ m = ti.Matrix.field(2, 2, ti.i32, 5)
+
+ @ti.kernel
+ def func():
+ for i in range(5):
+ for j, k in ti.static(ti.ndrange(2, 2)):
+ m[i][j, k] = 12
+
+ func()
+
+ assert np.allclose(m.to_numpy(), np.ones((5, 2, 2), np.int32) * 12)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_vector_to_list():
+ a = ti.Vector.field(2, float, ())
+
+ data = [2, 3]
+ b = ti.Vector(data)
+ assert list(b) == data
+ assert len(b) == len(data)
+
+ a[None] = b
+ assert all(a[None] == ti.Vector(data))
+
+
+@test_utils.test(arch=ti.cpu)
+def test_matrix_to_list():
+ a = ti.Matrix.field(2, 3, float, ())
+
+ data = [[2, 3, 4], [5, 6, 7]]
+ b = ti.Matrix(data)
+ assert list(b) == data
+ assert len(b) == len(data)
+
+ a[None] = b
+ assert all(a[None] == ti.Matrix(data))
+
+
+@test_utils.test()
+def test_matrix_needs_grad():
+ # Just make sure the usage doesn't crash, see https://github.com/taichi-dev/taichi/pull/1545
+ n = 8
+ m1 = ti.Matrix.field(2, 2, ti.f32, n, needs_grad=True)
+ m2 = ti.Matrix.field(2, 2, ti.f32, n, needs_grad=True)
+ gr = ti.Matrix.field(2, 2, ti.f32, n)
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ gr[i] = m1.grad[i] + m2.grad[i]
+
+ func()
+
+
+@test_utils.test(debug=True)
+def test_copy_python_scope_matrix_to_taichi_scope():
+ a = ti.Vector([1, 2, 3])
+
+ @ti.kernel
+ def test():
+ b = a
+ assert b[0] == 1
+ assert b[1] == 2
+ assert b[2] == 3
+ b = ti.Vector([4, 5, 6])
+ assert b[0] == 4
+ assert b[1] == 5
+ assert b[2] == 6
+
+ test()
+
+
+@test_utils.test(debug=True)
+def test_copy_matrix_field_element_to_taichi_scope():
+ a = ti.Vector.field(3, ti.i32, shape=())
+ a[None] = ti.Vector([1, 2, 3])
+
+ @ti.kernel
+ def test():
+ b = a[None]
+ assert b[0] == 1
+ assert b[1] == 2
+ assert b[2] == 3
+ b[0] = 5
+ b[1] = 9
+ b[2] = 7
+ assert b[0] == 5
+ assert b[1] == 9
+ assert b[2] == 7
+ assert a[None][0] == 1
+ assert a[None][1] == 2
+ assert a[None][2] == 3
+
+ test()
+
+
+@test_utils.test(debug=True)
+def test_copy_matrix_in_taichi_scope():
+ @ti.kernel
+ def test():
+ a = ti.Vector([1, 2, 3])
+ b = a
+ assert b[0] == 1
+ assert b[1] == 2
+ assert b[2] == 3
+ b[0] = 5
+ b[1] = 9
+ b[2] = 7
+ assert b[0] == 5
+ assert b[1] == 9
+ assert b[2] == 7
+ assert a[0] == 1
+ assert a[1] == 2
+ assert a[2] == 3
+
+ test()
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda], dynamic_index=True, debug=True)
+def test_matrix_field_dynamic_index_stride():
+ # placeholders
+ temp_a = ti.field(ti.f32)
+ temp_b = ti.field(ti.f32)
+ temp_c = ti.field(ti.f32)
+ # target
+ v = ti.Vector.field(3, ti.i32)
+ x = v.get_scalar_field(0)
+ y = v.get_scalar_field(1)
+ z = v.get_scalar_field(2)
+
+ S0 = ti.root
+ S1 = S0.pointer(ti.i, 4)
+ S2 = S1.dense(ti.i, 2)
+ S3 = S2.pointer(ti.i, 8)
+ S3.place(temp_a)
+ S4 = S2.dense(ti.i, 16)
+ S4.place(x)
+ S5 = S1.dense(ti.i, 2)
+ S6 = S5.pointer(ti.i, 8)
+ S6.place(temp_b)
+ S7 = S5.dense(ti.i, 16)
+ S7.place(y)
+ S8 = S1.dense(ti.i, 2)
+ S9 = S8.dense(ti.i, 32)
+ S9.place(temp_c)
+ S10 = S8.dense(ti.i, 16)
+ S10.place(z)
+
+ @ti.kernel
+ def check_stride():
+ for i in range(128):
+ assert ti.get_addr(y, i) - ti.get_addr(x,
+ i) == v.dynamic_index_stride
+ assert ti.get_addr(z, i) - ti.get_addr(y,
+ i) == v.dynamic_index_stride
+
+ check_stride()
+
+ @ti.kernel
+ def run():
+ for i in range(128):
+ for j in range(3):
+ v[i][j] = i * j
+
+ run()
+ for i in range(128):
+ for j in range(3):
+ assert v[i][j] == i * j
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_matrix_field_dynamic_index_different_path_length():
+ v = ti.Vector.field(2, ti.i32)
+ x = v.get_scalar_field(0)
+ y = v.get_scalar_field(1)
+
+ ti.root.dense(ti.i, 8).place(x)
+ ti.root.dense(ti.i, 2).dense(ti.i, 4).place(y)
+
+ impl.get_runtime().materialize()
+ assert v.dynamic_index_stride is None
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_matrix_field_dynamic_index_not_pure_dense():
+ v = ti.Vector.field(2, ti.i32)
+ x = v.get_scalar_field(0)
+ y = v.get_scalar_field(1)
+
+ ti.root.dense(ti.i, 2).pointer(ti.i, 4).place(x)
+ ti.root.dense(ti.i, 2).dense(ti.i, 4).place(y)
+
+ impl.get_runtime().materialize()
+ assert v.dynamic_index_stride is None
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_matrix_field_dynamic_index_different_cell_size_bytes():
+ temp = ti.field(ti.f32)
+
+ v = ti.Vector.field(2, ti.i32)
+ x = v.get_scalar_field(0)
+ y = v.get_scalar_field(1)
+
+ ti.root.dense(ti.i, 8).place(x, temp)
+ ti.root.dense(ti.i, 8).place(y)
+
+ impl.get_runtime().materialize()
+ assert v.dynamic_index_stride is None
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_matrix_field_dynamic_index_different_offset_bytes_in_parent_cell():
+ temp_a = ti.field(ti.f32)
+ temp_b = ti.field(ti.f32)
+
+ v = ti.Vector.field(2, ti.i32)
+ x = v.get_scalar_field(0)
+ y = v.get_scalar_field(1)
+
+ ti.root.dense(ti.i, 8).place(temp_a, x)
+ ti.root.dense(ti.i, 8).place(y, temp_b)
+
+ impl.get_runtime().materialize()
+ assert v.dynamic_index_stride is None
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_matrix_field_dynamic_index_different_stride():
+ temp = ti.field(ti.f32)
+
+ v = ti.Vector.field(3, ti.i32)
+ x = v.get_scalar_field(0)
+ y = v.get_scalar_field(1)
+ z = v.get_scalar_field(2)
+
+ ti.root.dense(ti.i, 8).place(x, y, temp, z)
+
+ impl.get_runtime().materialize()
+ assert v.dynamic_index_stride is None
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda], dynamic_index=True)
+def test_matrix_field_dynamic_index_multiple_materialize():
+ @ti.kernel
+ def empty():
+ pass
+
+ empty()
+
+ n = 5
+ a = ti.Vector.field(3, dtype=ti.i32, shape=n)
+
+ @ti.kernel
+ def func():
+ for i in a:
+ a[i][i % 3] = i
+
+ func()
+ for i in range(n):
+ for j in range(3):
+ assert a[i][j] == (i if j == i % 3 else 0)
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda], dynamic_index=True, debug=True)
+def test_local_vector_initialized_in_a_loop():
+ @ti.kernel
+ def foo():
+ for c in range(10):
+ p = ti.Vector([c, c * 2])
+ for i in range(2):
+ assert p[i] == c * (i + 1)
+
+ foo()
diff --git a/tests/_python_orig/test_matrix_arg.py b/tests/_python_orig/test_matrix_arg.py
new file mode 100644
index 000000000..f07722f5f
--- /dev/null
+++ b/tests/_python_orig/test_matrix_arg.py
@@ -0,0 +1,37 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_matrix_arg():
+ mat1 = ti.Matrix([[1, 2, 3], [4, 5, 6]])
+
+ @ti.kernel
+ def foo(mat: ti.types.matrix(2, 3, ti.i32)) -> ti.i32:
+ return mat[0, 0] + mat[1, 2]
+
+ assert foo(mat1) == 7
+
+ mat3 = ti.Matrix([[1, 2], [3, 4], [5, 6]])
+
+ @ti.kernel
+ def foo2(var: ti.i32, mat: ti.types.matrix(3, 2, ti.i32)) -> ti.i32:
+ for i in ti.static(range(3)):
+ for j in ti.static(range(2)):
+ mat[i, j] += var
+ return mat[2, 1]
+
+ assert foo2(3, mat3) == 9
+
+
+@test_utils.test()
+def test_vector_arg():
+ vec1 = ti.Vector([1, 2, 3])
+
+ @ti.kernel
+ def foo(vec: ti.types.vector(3, ti.i32)) -> int:
+ return vec[0] + vec[1] + vec[2]
+
+ assert foo(vec1) == 6
diff --git a/tests/_python_orig/test_matrix_different_type.py b/tests/_python_orig/test_matrix_different_type.py
new file mode 100644
index 000000000..5fd00d78e
--- /dev/null
+++ b/tests/_python_orig/test_matrix_different_type.py
@@ -0,0 +1,113 @@
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+
+# TODO: test more matrix operations
+@test_utils.test()
+def test_vector():
+ type_list = [ti.f32, ti.i32]
+
+ a = ti.Vector.field(len(type_list), dtype=type_list, shape=())
+ b = ti.Vector.field(len(type_list), dtype=type_list, shape=())
+ c = ti.Vector.field(len(type_list), dtype=type_list, shape=())
+
+ @ti.kernel
+ def init():
+ a[None] = [1.0, 3]
+ b[None] = [2.0, 4]
+ c[None] = a[None] + b[None]
+
+ def verify():
+ assert isinstance(a[None][0], float)
+ assert isinstance(a[None][1], int)
+ assert isinstance(b[None][0], float)
+ assert isinstance(b[None][1], int)
+ assert c[None][0] == 3.0
+ assert c[None][1] == 7
+
+ init()
+ verify()
+
+
+# TODO: Support different element types of Matrix on opengl
+@test_utils.test(require=ti.extension.data64, exclude=ti.opengl)
+def test_matrix():
+ type_list = [[ti.f32, ti.i32], [ti.i64, ti.f32]]
+ a = ti.Matrix.field(len(type_list),
+ len(type_list[0]),
+ dtype=type_list,
+ shape=())
+ b = ti.Matrix.field(len(type_list),
+ len(type_list[0]),
+ dtype=type_list,
+ shape=())
+ c = ti.Matrix.field(len(type_list),
+ len(type_list[0]),
+ dtype=type_list,
+ shape=())
+
+ @ti.kernel
+ def init():
+ a[None] = [[1.0, 3], [1, 3.0]]
+ b[None] = [[2.0, 4], [-2, -3.0]]
+ c[None] = a[None] + b[None]
+
+ def verify():
+ assert isinstance(a[None][0], float)
+ assert isinstance(a[None][1], int)
+ assert isinstance(b[None][0], float)
+ assert isinstance(b[None][1], int)
+ assert c[None][0, 0] == 3.0
+ assert c[None][0, 1] == 7
+ assert c[None][1, 0] == -1
+ assert c[None][1, 1] == 0.0
+
+ init()
+ verify()
+
+
+@test_utils.test(require=ti.extension.quant_basic)
+def test_custom_type():
+ cit1 = ti.types.quantized_types.quant.int(bits=10, signed=True)
+ cft1 = ti.types.quantized_types.type_factory.custom_float(cit1, scale=0.1)
+ cit2 = ti.types.quantized_types.quant.int(bits=22, signed=False)
+ cft2 = ti.types.quantized_types.type_factory.custom_float(cit2, scale=0.1)
+ type_list = [[cit1, cft2], [cft1, cit2]]
+ a = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list)
+ b = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list)
+ c = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list)
+ ti.root.dense(ti.i,
+ 1).bit_struct(num_bits=32).place(a.get_scalar_field(0, 0),
+ a.get_scalar_field(0, 1))
+ ti.root.dense(ti.i,
+ 1).bit_struct(num_bits=32).place(a.get_scalar_field(1, 0),
+ a.get_scalar_field(1, 1))
+ ti.root.dense(ti.i,
+ 1).bit_struct(num_bits=32).place(b.get_scalar_field(0, 0),
+ b.get_scalar_field(0, 1))
+ ti.root.dense(ti.i,
+ 1).bit_struct(num_bits=32).place(b.get_scalar_field(1, 0),
+ b.get_scalar_field(1, 1))
+ ti.root.dense(ti.i,
+ 1).bit_struct(num_bits=32).place(c.get_scalar_field(0, 0),
+ c.get_scalar_field(0, 1))
+ ti.root.dense(ti.i,
+ 1).bit_struct(num_bits=32).place(c.get_scalar_field(1, 0),
+ c.get_scalar_field(1, 1))
+
+ @ti.kernel
+ def init():
+ a[0] = [[1, 3.], [2., 1]]
+ b[0] = [[2, 4.], [-2., 1]]
+ c[0] = a[0] + b[0]
+
+ def verify():
+ assert c[0][0, 0] == approx(3, 1e-3)
+ assert c[0][0, 1] == approx(7.0, 1e-3)
+ assert c[0][1, 0] == approx(0, 1e-3)
+ assert c[0][1, 1] == approx(2, 1e-3)
+
+ init()
+ verify()
diff --git a/tests/_python_orig/test_memory.py b/tests/_python_orig/test_memory.py
new file mode 100644
index 000000000..f90c2fc30
--- /dev/null
+++ b/tests/_python_orig/test_memory.py
@@ -0,0 +1,10 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cuda)
+def test_memory_allocate():
+ HUGE_SIZE = 1024**2 * 128
+ x = ti.field(ti.i32, shape=(HUGE_SIZE, ))
+ for i in range(10):
+ x[i] = i
diff --git a/tests/_python_orig/test_mesh.py b/tests/_python_orig/test_mesh.py
new file mode 100644
index 000000000..19a1b178e
--- /dev/null
+++ b/tests/_python_orig/test_mesh.py
@@ -0,0 +1,344 @@
+import os
+
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+this_dir = os.path.dirname(os.path.abspath(__file__))
+model_file_path = os.path.join(this_dir, 'ell.json')
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_mesh_patch_idx():
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'idx': ti.i32})
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+
+ @ti.kernel
+ def foo():
+ for v in model.verts:
+ v.idx = ti.mesh_patch_idx()
+
+ foo()
+ idx = model.verts.idx.to_numpy()
+ assert idx[0] == 6
+ assert idx.sum() == 89
+
+
+def _test_mesh_for(cell_reorder=False, vert_reorder=False, extra_tests=True):
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'t': ti.i32}, reorder=vert_reorder)
+ mesh_builder.cells.place({'t': ti.i32}, reorder=cell_reorder)
+ mesh_builder.cells.link(mesh_builder.verts)
+ mesh_builder.verts.link(mesh_builder.cells)
+ mesh_builder.cells.link(mesh_builder.cells)
+ mesh_builder.verts.link(mesh_builder.verts)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+
+ @ti.kernel
+ def cell_vert():
+ for c in model.cells:
+ for j in range(c.verts.size):
+ c.t += c.verts[j].id
+
+ cell_vert()
+ total = model.cells.t.to_numpy().sum()
+ model.cells.t.fill(0)
+ assert total == 892
+
+ @ti.kernel
+ def vert_cell():
+ for v in model.verts:
+ for j in range(v.cells.size):
+ v.t += v.cells[j].id
+
+ vert_cell()
+ total = model.verts.t.to_numpy().sum()
+ model.verts.t.fill(0)
+ assert total == 1104
+
+ if not extra_tests:
+ return
+
+ @ti.kernel
+ def cell_cell():
+ for c in model.cells:
+ for j in range(c.cells.size):
+ c.t += c.cells[j].id
+
+ cell_cell()
+ total = model.cells.t.to_numpy().sum()
+ model.cells.t.fill(0)
+ assert total == 690
+
+ @ti.kernel
+ def vert_vert():
+ for v in model.verts:
+ for j in range(v.verts.size):
+ v.t += v.verts[j].id
+
+ vert_vert()
+ total = model.verts.t.to_numpy().sum()
+ model.verts.t.fill(0)
+ assert total == 1144
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_mesh_for():
+ _test_mesh_for(False, False)
+ _test_mesh_for(False, True)
+
+
+@test_utils.test(require=ti.extension.mesh,
+ optimize_mesh_reordered_mapping=False)
+def test_mesh_reordered_opt():
+ _test_mesh_for(True, True, False)
+
+
+@test_utils.test(require=ti.extension.mesh, mesh_localize_to_end_mapping=False)
+def test_mesh_localize_mapping0():
+ _test_mesh_for(False, False, False)
+ _test_mesh_for(True, True, False)
+
+
+@test_utils.test(require=ti.extension.mesh,
+ mesh_localize_from_end_mapping=True)
+def test_mesh_localize_mapping1():
+ _test_mesh_for(False, False, False)
+ _test_mesh_for(True, True, False)
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_mesh_reorder():
+ vec3i = ti.types.vector(3, ti.i32)
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'s': ti.i32, 's3': vec3i}, reorder=True)
+ mesh_builder.cells.link(mesh_builder.verts)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+
+ id2 = np.array([x**2 for x in range(len(model.verts))])
+ id123 = np.array([[x**1, x**2, x**3] for x in range(len(model.verts))])
+ model.verts.s.from_numpy(id2)
+ model.verts.s3.from_numpy(id123)
+
+ @ti.kernel
+ def foo():
+ for v in model.verts:
+ assert v.s == v.id**2
+ assert v.s3[0] == v.id**1 and v.s3[1] == v.id**2 and v.s3[
+ 2] == v.id**3
+ v.s = v.id**3
+ v.s3 *= v.id
+
+ foo()
+
+ id3 = model.verts.s.to_numpy()
+ id234 = model.verts.s3.to_numpy()
+
+ for i in range(len(model.verts)):
+ assert model.verts.s[i] == i**3
+ assert id3[i] == i**3
+ assert model.verts.s3[i][0] == i**2
+ assert model.verts.s3[i][1] == i**3
+ assert model.verts.s3[i][2] == i**4
+ assert id234[i][0] == i**2
+ assert id234[i][1] == i**3
+ assert id234[i][2] == i**4
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_mesh_minor_relations():
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'y': ti.i32})
+ mesh_builder.edges.place({'x': ti.i32})
+ mesh_builder.cells.link(mesh_builder.edges)
+ mesh_builder.verts.link(mesh_builder.cells)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+ model.edges.x.fill(1)
+
+ @ti.kernel
+ def foo():
+ for v in model.verts:
+ for i in range(v.cells.size):
+ c = v.cells[i]
+ for j in range(c.edges.size):
+ e = c.edges[j]
+ v.y += e.x
+
+ foo()
+ total = model.verts.y.to_numpy().sum()
+ assert total == 576
+
+
+@test_utils.test(require=ti.extension.mesh, demote_no_access_mesh_fors=True)
+def test_multiple_meshes():
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'y': ti.i32})
+ meta = ti.Mesh.load_meta(model_file_path)
+ model1 = mesh_builder.build(meta)
+ model2 = mesh_builder.build(meta)
+
+ model1.verts.y.from_numpy(
+ np.array([x**2 for x in range(len(model1.verts))]))
+
+ @ti.kernel
+ def foo():
+ for v in model1.verts:
+ model2.verts.y[v.id] = v.y
+
+ foo()
+ out = model2.verts.y.to_numpy()
+ for i in range(len(out)):
+ assert out[i] == i**2
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_mesh_local():
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'a': ti.i32})
+ mesh_builder.faces.link(mesh_builder.verts)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+ ext_a = ti.field(ti.i32, shape=len(model.verts))
+
+ @ti.kernel
+ def foo(cache: ti.template()):
+ if ti.static(cache):
+ ti.mesh_local(ext_a, model.verts.a)
+ for f in model.faces:
+ m = f.verts[0].id + f.verts[1].id + f.verts[2].id
+ f.verts[0].a += m
+ f.verts[1].a += m
+ f.verts[2].a += m
+ ext_a[f.verts[0].id] += m
+ ext_a[f.verts[1].id] += m
+ ext_a[f.verts[2].id] += m
+
+ foo(False)
+ res1 = model.verts.a.to_numpy()
+ res2 = ext_a.to_numpy()
+ model.verts.a.fill(0)
+ ext_a.fill(0)
+ foo(True)
+ res3 = model.verts.a.to_numpy()
+ res4 = ext_a.to_numpy()
+
+ for i in range(len(model.verts)):
+ assert res1[i] == res2[i]
+ assert res1[i] == res3[i]
+ assert res1[i] == res4[i]
+
+
+@test_utils.test(require=ti.extension.mesh, experimental_auto_mesh_local=True)
+def test_auto_mesh_local():
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.verts.place({'a': ti.i32, 's': ti.i32})
+ mesh_builder.faces.link(mesh_builder.verts)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+ ext_a = ti.field(ti.i32, shape=len(model.verts))
+
+ @ti.kernel
+ def foo(cache: ti.template()):
+ for v in model.verts:
+ v.s = v.id
+ if ti.static(cache):
+ ti.mesh_local(ext_a, model.verts.a)
+ for f in model.faces:
+ m = f.verts[0].s + f.verts[1].s + f.verts[2].s
+ f.verts[0].a += m
+ f.verts[1].a += m
+ f.verts[2].a += m
+ for i in range(3):
+ ext_a[f.verts[i].id] += m
+
+ foo(False)
+ res1 = model.verts.a.to_numpy()
+ res2 = ext_a.to_numpy()
+ model.verts.a.fill(0)
+ ext_a.fill(0)
+ foo(True)
+ res3 = model.verts.a.to_numpy()
+ res4 = ext_a.to_numpy()
+
+ for i in range(len(model.verts)):
+ assert res1[i] == res2[i]
+ assert res1[i] == res3[i]
+ assert res1[i] == res4[i]
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_nested_mesh_for():
+ mesh_builder = ti.Mesh.Tet()
+ mesh_builder.faces.place({'a': ti.i32, 'b': ti.i32})
+ mesh_builder.faces.link(mesh_builder.verts)
+ model = mesh_builder.build(ti.Mesh.load_meta(model_file_path))
+
+ @ti.kernel
+ def foo():
+ for f in model.faces:
+ for i in range(f.verts.size):
+ f.a += f.verts[i].id
+ for v in f.verts:
+ f.b += v.id
+
+ a = model.faces.a.to_numpy()
+ b = model.faces.b.to_numpy()
+ assert (a == b).all() == 1
+
+
+@test_utils.test(require=ti.extension.mesh)
+def test_multiple_mesh_major_relations():
+ mesh = ti.TetMesh()
+ mesh.verts.place({
+ 's': ti.i32,
+ 's_': ti.i32,
+ 's1': ti.i32,
+ 'a': ti.i32,
+ 'b': ti.i32,
+ 'c': ti.i32
+ })
+ mesh.edges.place({'s2': ti.i32})
+ mesh.cells.place({'s3': ti.i32})
+ mesh.verts.link(mesh.verts)
+ mesh.verts.link(mesh.edges)
+ mesh.verts.link(mesh.cells)
+
+ model = mesh.build(ti.Mesh.load_meta(model_file_path))
+
+ @ti.kernel
+ def foo():
+ for u in model.verts:
+ u.s1 = u.id
+ for e in model.edges:
+ e.s2 = e.id
+ for c in model.cells:
+ c.s3 = c.id
+
+ ti.mesh_local(model.verts.s1, model.edges.s2, model.cells.s3)
+ for u in model.verts:
+ a, b, c = 0, 0, 0
+ for i in range(u.verts.size):
+ a += u.verts[i].s1
+ for i in range(u.edges.size):
+ b += u.edges[i].s2
+ for i in range(u.cells.size):
+ c += u.cells[i].s3
+ u.s = a * b * c
+
+ for u in model.verts:
+ for i in range(u.verts.size):
+ u.a += u.verts[i].s1
+ for u in model.verts:
+ for i in range(u.edges.size):
+ u.b += u.edges[i].s2
+ for u in model.verts:
+ for i in range(u.cells.size):
+ u.c += u.cells[i].s3
+ for u in model.verts:
+ u.s_ = u.a * u.b * u.c
+
+ foo()
+
+ sum1 = model.verts.s.to_numpy().sum()
+ sum2 = model.verts.s_.to_numpy().sum()
+ assert sum1 == sum2
diff --git a/tests/_python_orig/test_mod.py b/tests/_python_orig/test_mod.py
new file mode 100644
index 000000000..987b1d7c4
--- /dev/null
+++ b/tests/_python_orig/test_mod.py
@@ -0,0 +1,68 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def _test_py_style_mod(arg1, a, arg2, b, arg3, c):
+ z = ti.field(arg3, shape=())
+
+ @ti.kernel
+ def func(x: arg1, y: arg2):
+ z[None] = x % y
+
+ func(a, b)
+ assert z[None] == c
+
+
+@test_utils.test()
+def _test_c_style_mod(arg1, a, arg2, b, arg3, c):
+ z = ti.field(arg3, shape=())
+
+ @ti.kernel
+ def func(x: arg1, y: arg2):
+ z[None] = ti.raw_mod(x, y)
+
+ func(a, b)
+ assert z[None] == c
+
+
+def test_py_style_mod():
+ def func(a, b):
+ _test_py_style_mod(ti.i32, a, ti.i32, b, ti.i32, a % b)
+
+ func(10, 3)
+ func(-10, 3)
+ func(10, -3)
+ func(-10, -3)
+
+
+def _c_mod(a, b):
+ return a - b * int(float(a) / b)
+
+
+def test_c_style_mod():
+ def func(a, b):
+ _test_c_style_mod(ti.i32, a, ti.i32, b, ti.i32, _c_mod(a, b))
+
+ func(10, 3)
+ func(-10, 3)
+ func(10, -3)
+ func(-10, -3)
+
+
+@test_utils.test()
+def test_mod_scan():
+ z = ti.field(ti.i32, shape=())
+ w = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func(x: ti.i32, y: ti.i32):
+ z[None] = x % y
+ w[None] = ti.raw_mod(x, y)
+
+ for i in range(-10, 11):
+ for j in range(-10, 11):
+ if j != 0:
+ func(i, j)
+ assert z[None] == i % j
+ assert w[None] == _c_mod(i, j)
diff --git a/tests/_python_orig/test_module_import.py b/tests/_python_orig/test_module_import.py
new file mode 100644
index 000000000..35f4ff4a9
--- /dev/null
+++ b/tests/_python_orig/test_module_import.py
@@ -0,0 +1,12 @@
+import taichi as myowntaichi
+from tests import test_utils
+
+
+@test_utils.test()
+def test_module_import():
+ @myowntaichi.kernel
+ def func():
+ for _ in myowntaichi.static(range(8)):
+ pass
+
+ func()
diff --git a/tests/_python_orig/test_mpm88.py b/tests/_python_orig/test_mpm88.py
new file mode 100644
index 000000000..c52d30acc
--- /dev/null
+++ b/tests/_python_orig/test_mpm88.py
@@ -0,0 +1,241 @@
+import os
+
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+def run_mpm88_test():
+ dim = 2
+ N = 64
+ n_particles = N * N
+ n_grid = 128
+ dx = 1 / n_grid
+ inv_dx = 1 / dx
+ dt = 2.0e-4
+ p_vol = (dx * 0.5)**2
+ p_rho = 1
+ p_mass = p_vol * p_rho
+ E = 400
+
+ x = ti.Vector.field(dim, dtype=ti.f32, shape=n_particles)
+ v = ti.Vector.field(dim, dtype=ti.f32, shape=n_particles)
+ C = ti.Matrix.field(dim, dim, dtype=ti.f32, shape=n_particles)
+ J = ti.field(dtype=ti.f32, shape=n_particles)
+ grid_v = ti.Vector.field(dim, dtype=ti.f32, shape=(n_grid, n_grid))
+ grid_m = ti.field(dtype=ti.f32, shape=(n_grid, n_grid))
+
+ @ti.kernel
+ def substep():
+ for p in x:
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * p_vol * (J[p] - 1) * 4 * inv_dx * inv_dx * E
+ affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
+ for i in ti.static(range(3)):
+ for j in ti.static(range(3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset.cast(float) - fx) * dx
+ weight = w[i][0] * w[j][1]
+ ti.atomic_add(grid_v[base + offset],
+ weight * (p_mass * v[p] + affine @ dpos))
+ ti.atomic_add(grid_m[base + offset], weight * p_mass)
+
+ for i, j in grid_m:
+ if grid_m[i, j] > 0:
+ bound = 3
+ inv_m = 1 / grid_m[i, j]
+ grid_v[i, j] = inv_m * grid_v[i, j]
+ grid_v[i, j][1] -= dt * 9.8
+ if i < bound and grid_v[i, j][0] < 0:
+ grid_v[i, j][0] = 0
+ if i > n_grid - bound and grid_v[i, j][0] > 0:
+ grid_v[i, j][0] = 0
+ if j < bound and grid_v[i, j][1] < 0:
+ grid_v[i, j][1] = 0
+ if j > n_grid - bound and grid_v[i, j][1] > 0:
+ grid_v[i, j][1] = 0
+
+ for p in x:
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [
+ 0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2
+ ]
+ new_v = ti.Vector.zero(ti.f32, 2)
+ new_C = ti.Matrix.zero(ti.f32, 2, 2)
+ for i in ti.static(range(3)):
+ for j in ti.static(range(3)):
+ dpos = ti.Vector([i, j]).cast(float) - fx
+ g_v = grid_v[base + ti.Vector([i, j])]
+ weight = w[i][0] * w[j][1]
+ new_v += weight * g_v
+ new_C += 4 * weight * g_v.outer_product(dpos) * inv_dx
+ v[p] = new_v
+ x[p] += dt * v[p]
+ J[p] *= 1 + dt * new_C.trace()
+ C[p] = new_C
+
+ for i in range(n_particles):
+ x[i] = [i % N / N * 0.4 + 0.2, i / N / N * 0.4 + 0.05]
+ v[i] = [0, -3]
+ J[i] = 1
+
+ for frame in range(10):
+ for s in range(50):
+ grid_v.fill([0, 0])
+ grid_m.fill(0)
+ substep()
+
+ pos = x.to_numpy()
+ pos[:, 1] *= 2
+ regression = [
+ 0.31722742,
+ 0.15826741,
+ 0.10224003,
+ 0.07810827,
+ ]
+ for i in range(4):
+ assert (pos**(i + 1)).mean() == test_utils.approx(regression[i],
+ rel=1e-2)
+
+
+@test_utils.test()
+def test_mpm88():
+ run_mpm88_test()
+
+
+def _is_appveyor():
+ # AppVeyor adds `APPVEYOR=True` ('true' on Ubuntu)
+ # https://www.appveyor.com/docs/environment-variables/
+ return os.getenv('APPVEYOR', '').lower() == 'true'
+
+
+#TODO: Remove exclude of ti.metal
+@pytest.mark.skipif(_is_appveyor(), reason='Stuck on Appveyor.')
+@test_utils.test(require=ti.extension.async_mode,
+ exclude=[ti.metal],
+ async_mode=True)
+def test_mpm88_async():
+ # It seems that all async tests on Appveyor run super slow. For example,
+ # on Appveyor, 10+ tests have passed during the execution of
+ # test_fuse_dense_x2y2z. Maybe thread synchronizations are expensive?
+ run_mpm88_test()
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.opengl])
+def test_mpm88_numpy_and_ndarray():
+ import numpy as np
+
+ dim = 2
+ N = 64
+ n_particles = N * N
+ n_grid = 128
+ dx = 1 / n_grid
+ inv_dx = 1 / dx
+ dt = 2.0e-4
+ p_vol = (dx * 0.5)**2
+ p_rho = 1
+ p_mass = p_vol * p_rho
+ E = 400
+
+ @ti.kernel
+ def substep(x: ti.any_arr(element_dim=1), v: ti.any_arr(element_dim=1),
+ C: ti.any_arr(element_dim=2), J: ti.any_arr(),
+ grid_v: ti.any_arr(element_dim=1), grid_m: ti.any_arr()):
+ for p in x:
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+ stress = -dt * p_vol * (J[p] - 1) * 4 * inv_dx * inv_dx * E
+ affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
+ for i in ti.static(range(3)):
+ for j in ti.static(range(3)):
+ offset = ti.Vector([i, j])
+ dpos = (offset.cast(float) - fx) * dx
+ weight = w[i][0] * w[j][1]
+ ti.atomic_add(grid_v[base + offset],
+ weight * (p_mass * v[p] + affine @ dpos))
+ ti.atomic_add(grid_m[base + offset], weight * p_mass)
+
+ for i, j in grid_m:
+ if grid_m[i, j] > 0:
+ bound = 3
+ inv_m = 1 / grid_m[i, j]
+ grid_v[i, j] = inv_m * grid_v[i, j]
+ grid_v[i, j][1] -= dt * 9.8
+ if i < bound and grid_v[i, j][0] < 0:
+ grid_v[i, j][0] = 0
+ if i > n_grid - bound and grid_v[i, j][0] > 0:
+ grid_v[i, j][0] = 0
+ if j < bound and grid_v[i, j][1] < 0:
+ grid_v[i, j][1] = 0
+ if j > n_grid - bound and grid_v[i, j][1] > 0:
+ grid_v[i, j][1] = 0
+
+ for p in x:
+ base = (x[p] * inv_dx - 0.5).cast(int)
+ fx = x[p] * inv_dx - base.cast(float)
+ w = [
+ 0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2
+ ]
+ new_v = ti.Vector.zero(ti.f32, 2)
+ new_C = ti.Matrix.zero(ti.f32, 2, 2)
+ for i in ti.static(range(3)):
+ for j in ti.static(range(3)):
+ dpos = ti.Vector([i, j]).cast(float) - fx
+ g_v = grid_v[base + ti.Vector([i, j])]
+ weight = w[i][0] * w[j][1]
+ new_v += weight * g_v
+ new_C += 4 * weight * g_v.outer_product(dpos) * inv_dx
+ v[p] = new_v
+ x[p] += dt * v[p]
+ J[p] *= 1 + dt * new_C.trace()
+ C[p] = new_C
+
+ def run_test(x, v, C, J, grid_v, grid_m):
+ for i in range(n_particles):
+ x[i] = [i % N / N * 0.4 + 0.2, i / N / N * 0.4 + 0.05]
+ v[i] = [0, -3]
+ J[i] = 1
+
+ for frame in range(10):
+ for s in range(50):
+ grid_v.fill(0)
+ grid_m.fill(0)
+ substep(x, v, C, J, grid_v, grid_m)
+
+ pos = x if isinstance(x, np.ndarray) else x.to_numpy()
+ pos[:, 1] *= 2
+ regression = [
+ 0.31722742,
+ 0.15826741,
+ 0.10224003,
+ 0.07810827,
+ ]
+ for i in range(4):
+ assert (pos**(i + 1)).mean() == test_utils.approx(regression[i],
+ rel=1e-2)
+
+ def test_numpy():
+ x = np.zeros((n_particles, dim), dtype=np.float32)
+ v = np.zeros((n_particles, dim), dtype=np.float32)
+ C = np.zeros((n_particles, dim, dim), dtype=np.float32)
+ J = np.zeros(n_particles, dtype=np.float32)
+ grid_v = np.zeros((n_grid, n_grid, dim), dtype=np.float32)
+ grid_m = np.zeros((n_grid, n_grid), dtype=np.float32)
+ run_test(x, v, C, J, grid_v, grid_m)
+
+ def test_ndarray():
+ x = ti.Vector.ndarray(dim, ti.f32, n_particles)
+ v = ti.Vector.ndarray(dim, ti.f32, n_particles)
+ C = ti.Matrix.ndarray(dim, dim, ti.f32, n_particles)
+ J = ti.ndarray(ti.f32, n_particles)
+ grid_v = ti.Vector.ndarray(dim, ti.f32, (n_grid, n_grid))
+ grid_m = ti.ndarray(ti.f32, (n_grid, n_grid))
+ run_test(x, v, C, J, grid_v, grid_m)
+
+ test_numpy()
+ test_ndarray()
diff --git a/tests/_python_orig/test_mpm_particle_list.py b/tests/_python_orig/test_mpm_particle_list.py
new file mode 100644
index 000000000..372471367
--- /dev/null
+++ b/tests/_python_orig/test_mpm_particle_list.py
@@ -0,0 +1,63 @@
+import random
+
+import taichi as ti
+from tests import test_utils
+
+
+@ti.data_oriented
+class MPMSolver:
+ def __init__(self, res):
+ dim = len(res)
+ self.dx = 1 / res[0]
+ self.inv_dx = 1.0 / self.dx
+ self.pid = ti.field(ti.i32)
+ self.x = ti.Vector.field(dim, dtype=ti.f32)
+ self.grid_m = ti.field(dtype=ti.f32)
+
+ indices = ti.ij
+
+ self.grid = ti.root.pointer(indices, 32)
+ block = self.grid.pointer(indices, 16)
+ voxel = block.dense(indices, 8)
+
+ voxel.place(self.grid_m)
+ block.dynamic(ti.axes(dim), 1024 * 1024,
+ chunk_size=4096).place(self.pid)
+
+ ti.root.dynamic(ti.i, 2**25, 2**20).place(self.x)
+ self.substeps = 0
+
+ for i in range(10000):
+ self.x[i] = [random.random() * 0.5, random.random() * 0.5]
+
+ @ti.kernel
+ def build_pid(self):
+ ti.block_dim(256)
+ for p in self.x:
+ base = ti.floor(self.x[p] * self.inv_dx - 0.5).cast(int) + 1
+ ti.append(self.pid.parent(), base, p)
+
+ def step(self):
+ for i in range(1000):
+ self.substeps += 1
+ self.grid.deactivate_all()
+ self.build_pid()
+
+
+@test_utils.test(require=ti.extension.sparse,
+ exclude=[ti.metal],
+ device_memory_GB=1.0)
+def test_mpm_particle_list_no_leakage():
+ # By default Taichi will allocate 0.5 GB for testing.
+ mpm = MPMSolver(res=(128, 128))
+ mpm.step()
+
+
+@test_utils.test(require=[ti.extension.sparse, ti.extension.packed],
+ exclude=[ti.metal],
+ device_memory_GB=1.0,
+ packed=True)
+def test_mpm_particle_list_no_leakage_packed():
+ # By default Taichi will allocate 0.5 GB for testing.
+ mpm = MPMSolver(res=(128, 128))
+ mpm.step()
diff --git a/tests/_python_orig/test_name_error.py b/tests/_python_orig/test_name_error.py
new file mode 100644
index 000000000..e604c436d
--- /dev/null
+++ b/tests/_python_orig/test_name_error.py
@@ -0,0 +1,15 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_name_error():
+ with pytest.raises(ti.TaichiNameError, match='Name "a" is not defined'):
+
+ @ti.kernel
+ def foo():
+ a + 1
+
+ foo()
diff --git a/tests/_python_orig/test_native_functions.py b/tests/_python_orig/test_native_functions.py
new file mode 100644
index 000000000..a0d537d01
--- /dev/null
+++ b/tests/_python_orig/test_native_functions.py
@@ -0,0 +1,81 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_abs():
+ x = ti.field(ti.f32)
+
+ N = 16
+
+ ti.root.dense(ti.i, N).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ x[i] = abs(-i)
+ print(x[i])
+ ti.static_print(x[i])
+
+ func()
+
+ for i in range(N):
+ assert x[i] == i
+
+
+@test_utils.test()
+def test_int():
+ x = ti.field(ti.f32)
+
+ N = 16
+
+ ti.root.dense(ti.i, N).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ x[i] = int(x[i])
+ x[i] = float(int(x[i]) // 2)
+
+ for i in range(N):
+ x[i] = i + 0.4
+
+ func()
+
+ for i in range(N):
+ assert x[i] == i // 2
+
+
+@test_utils.test()
+def test_minmax():
+ x = ti.field(ti.f32)
+ y = ti.field(ti.f32)
+ z = ti.field(ti.f32)
+ minimum = ti.field(ti.f32)
+ maximum = ti.field(ti.f32)
+
+ N = 16
+
+ ti.root.dense(ti.i, N).place(x, y, z, minimum, maximum)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ minimum[i] = min(x[i], y[i], z[i])
+ maximum[i] = max(x[i], y[i], z[i])
+
+ for i in range(N):
+ x[i] = i
+ y[i] = N - i
+ z[i] = i - 2 if i % 2 else i + 2
+
+ func()
+
+ assert np.allclose(
+ minimum.to_numpy(),
+ np.minimum(np.minimum(x.to_numpy(), y.to_numpy()), z.to_numpy()))
+ assert np.allclose(
+ maximum.to_numpy(),
+ np.maximum(np.maximum(x.to_numpy(), y.to_numpy()), z.to_numpy()))
diff --git a/tests/_python_orig/test_ndarray.py b/tests/_python_orig/test_ndarray.py
new file mode 100644
index 000000000..4cb6ebd70
--- /dev/null
+++ b/tests/_python_orig/test_ndarray.py
@@ -0,0 +1,579 @@
+import copy
+
+import numpy as np
+import pytest
+from taichi.lang import impl
+from taichi.lang.misc import get_host_arch_list
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+if has_pytorch():
+ import torch
+
+# properties
+
+data_types = [ti.i32, ti.f32, ti.i64, ti.f64]
+ndarray_shapes = [(), 8, (6, 12)]
+vector_dims = [3]
+matrix_dims = [(1, 2), (2, 3)]
+supported_archs_taichi_ndarray = [ti.cpu, ti.cuda, ti.opengl, ti.vulkan]
+
+
+def _test_scalar_ndarray(dtype, shape):
+ x = ti.ndarray(dtype, shape)
+
+ if isinstance(shape, tuple):
+ assert x.shape == shape
+ else:
+ assert x.shape == (shape, )
+ assert x.element_shape == ()
+
+ assert x.dtype == dtype
+
+
+@pytest.mark.parametrize('dtype', data_types)
+@pytest.mark.parametrize('shape', ndarray_shapes)
+@test_utils.test(arch=get_host_arch_list())
+def test_scalar_ndarray(dtype, shape):
+ _test_scalar_ndarray(dtype, shape)
+
+
+def _test_vector_ndarray(n, dtype, shape):
+ x = ti.Vector.ndarray(n, dtype, shape)
+
+ if isinstance(shape, tuple):
+ assert x.shape == shape
+ else:
+ assert x.shape == (shape, )
+ assert x.element_shape == (n, )
+
+ assert x.dtype == dtype
+ assert x.n == n
+
+
+@pytest.mark.parametrize('n', vector_dims)
+@pytest.mark.parametrize('dtype', data_types)
+@pytest.mark.parametrize('shape', ndarray_shapes)
+@test_utils.test(arch=get_host_arch_list())
+def test_vector_ndarray(n, dtype, shape):
+ _test_vector_ndarray(n, dtype, shape)
+
+
+def _test_matrix_ndarray(n, m, dtype, shape):
+ x = ti.Matrix.ndarray(n, m, dtype, shape)
+
+ if isinstance(shape, tuple):
+ assert x.shape == shape
+ else:
+ assert x.shape == (shape, )
+ assert x.element_shape == (n, m)
+
+ assert x.dtype == dtype
+ assert x.n == n
+ assert x.m == m
+
+
+@pytest.mark.parametrize('n,m', matrix_dims)
+@pytest.mark.parametrize('dtype', data_types)
+@pytest.mark.parametrize('shape', ndarray_shapes)
+@test_utils.test(arch=get_host_arch_list())
+def test_matrix_ndarray(n, m, dtype, shape):
+ _test_matrix_ndarray(n, m, dtype, shape)
+
+
+@pytest.mark.parametrize('dtype', [ti.f32, ti.f64])
+def test_default_fp_ndarray(dtype):
+ ti.init(arch=supported_archs_taichi_ndarray, default_fp=dtype)
+
+ x = ti.Vector.ndarray(2, float, ())
+
+ assert x.dtype == impl.get_runtime().default_fp
+
+
+@pytest.mark.parametrize('dtype', [ti.i32, ti.i64])
+def test_default_ip_ndarray(dtype):
+ ti.init(arch=supported_archs_taichi_ndarray, default_ip=dtype)
+
+ x = ti.Vector.ndarray(2, int, ())
+
+ assert x.dtype == impl.get_runtime().default_ip
+
+
+# access
+
+layouts = [ti.Layout.SOA, ti.Layout.AOS]
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_1d():
+ n = 4
+
+ @ti.kernel
+ def run(x: ti.any_arr(), y: ti.any_arr()):
+ for i in range(n):
+ x[i] += i + y[i]
+
+ a = ti.ndarray(ti.i32, shape=(n, ))
+ for i in range(n):
+ a[i] = i * i
+ b = np.ones((n, ), dtype=np.int32)
+ run(a, b)
+ for i in range(n):
+ assert a[i] == i * i + i + 1
+ run(b, a)
+ for i in range(n):
+ assert b[i] == i * i + (i + 1) * 2
+
+
+def _test_ndarray_2d():
+ n = 4
+ m = 7
+
+ @ti.kernel
+ def run(x: ti.any_arr(), y: ti.any_arr()):
+ for i in range(n):
+ for j in range(m):
+ x[i, j] += i + j + y[i, j]
+
+ a = ti.ndarray(ti.i32, shape=(n, m))
+ for i in range(n):
+ for j in range(m):
+ a[i, j] = i * j
+ b = np.ones((n, m), dtype=np.int32)
+ run(a, b)
+ for i in range(n):
+ for j in range(m):
+ assert a[i, j] == i * j + i + j + 1
+ run(b, a)
+ for i in range(n):
+ for j in range(m):
+ assert b[i, j] == i * j + (i + j + 1) * 2
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_2d():
+ _test_ndarray_2d()
+
+
+def _test_ndarray_copy_from_ndarray():
+ n = 16
+ a = ti.ndarray(ti.i32, shape=n)
+ b = ti.ndarray(ti.i32, shape=n)
+ a[0] = 1
+ a[4] = 2
+ b[0] = 4
+ b[4] = 5
+
+ a.copy_from(b)
+
+ assert a[0] == 4
+ assert a[4] == 5
+
+ x = ti.Vector.ndarray(10, ti.i32, 5, layout=ti.Layout.SOA)
+ y = ti.Vector.ndarray(10, ti.i32, 5, layout=ti.Layout.SOA)
+ x[1][0] = 1
+ x[2][4] = 2
+ y[1][0] = 4
+ y[2][4] = 5
+
+ x.copy_from(y)
+
+ assert x[1][0] == 4
+ assert x[2][4] == 5
+
+ x = ti.Matrix.ndarray(2, 2, ti.i32, 5, layout=ti.Layout.AOS)
+ y = ti.Matrix.ndarray(2, 2, ti.i32, 5, layout=ti.Layout.AOS)
+ x[0][0, 0] = 1
+ x[4][1, 0] = 3
+ y[0][0, 0] = 4
+ y[4][1, 0] = 6
+
+ x.copy_from(y)
+
+ assert x[0][0, 0] == 4
+ assert x[4][1, 0] == 6
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_copy_from_ndarray():
+ _test_ndarray_copy_from_ndarray()
+
+
+def _test_ndarray_deepcopy():
+ n = 16
+ x = ti.ndarray(ti.i32, shape=n)
+ x[0] = 1
+ x[4] = 2
+
+ y = copy.deepcopy(x)
+
+ assert y.shape == x.shape
+ assert y.dtype == x.dtype
+ assert y[0] == 1
+ assert y[4] == 2
+ x[0] = 4
+ x[4] = 5
+ assert y[0] == 1
+ assert y[4] == 2
+
+ x = ti.Vector.ndarray(10, ti.i32, 5, layout=ti.Layout.SOA)
+ x[1][0] = 4
+ x[2][4] = 5
+
+ y = copy.deepcopy(x)
+
+ assert y.shape == x.shape
+ assert y.dtype == x.dtype
+ assert y.n == x.n
+ assert y.layout == x.layout
+ assert y[1][0] == 4
+ assert y[2][4] == 5
+ x[1][0] = 1
+ x[2][4] = 2
+ assert y[1][0] == 4
+ assert y[2][4] == 5
+
+ x = ti.Matrix.ndarray(2, 2, ti.i32, 5, layout=ti.Layout.AOS)
+ x[0][0, 0] = 7
+ x[4][1, 0] = 9
+
+ y = copy.deepcopy(x)
+
+ assert y.shape == x.shape
+ assert y.dtype == x.dtype
+ assert y.m == x.m
+ assert y.n == x.n
+ assert y.layout == x.layout
+ assert y[0][0, 0] == 7
+ assert y[4][1, 0] == 9
+ x[0][0, 0] = 3
+ x[4][1, 0] = 5
+ assert y[0][0, 0] == 7
+ assert y[4][1, 0] == 9
+
+
+def test_ndarray_cuda_caching_allocator():
+ ti.init(arch=ti.cuda, ndarray_use_cached_allocator=True)
+ n = 8
+ a = ti.ndarray(ti.i32, shape=(n))
+ a.fill(2)
+ a = 1
+ b = ti.ndarray(ti.i32, shape=(n))
+ b.fill(2)
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_fill():
+ n = 8
+ a = ti.ndarray(ti.i32, shape=(n))
+ anp = np.ones((n, ), dtype=np.int32)
+ a.fill(2)
+ anp.fill(2)
+ assert (a.to_numpy() == anp).all()
+
+ b = ti.Vector.ndarray(4, ti.f32, shape=(n))
+ bnp = np.ones(shape=b.arr.shape, dtype=np.float32)
+ b.fill(2.5)
+ bnp.fill(2.5)
+ assert (b.to_numpy() == bnp).all()
+
+ c = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n))
+ cnp = np.ones(shape=c.arr.shape, dtype=np.float32)
+ c.fill(1.5)
+ cnp.fill(1.5)
+ assert (c.to_numpy() == cnp).all()
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_rw_cache():
+ a = ti.Vector.ndarray(3, ti.f32, ())
+ b = ti.Vector.ndarray(3, ti.f32, 12)
+
+ n = 1000
+ for i in range(n):
+ c_a = copy.deepcopy(a)
+ c_b = copy.deepcopy(b)
+ c_a[None] = c_b[10]
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_deepcopy():
+ _test_ndarray_deepcopy()
+
+
+def _test_ndarray_numpy_io():
+ n = 7
+ m = 4
+ a = ti.ndarray(ti.i32, shape=(n, m))
+ a.fill(2)
+ b = ti.ndarray(ti.i32, shape=(n, m))
+ b.from_numpy(np.ones((n, m), dtype=np.int32) * 2)
+ assert (a.to_numpy() == b.to_numpy()).all()
+
+ d = 2
+ p = 4
+ x = ti.Vector.ndarray(d, ti.f32, p)
+ x.fill(2)
+ y = ti.Vector.ndarray(d, ti.f32, p)
+ y.from_numpy(np.ones((p, d), dtype=np.int32) * 2)
+ assert (x.to_numpy() == y.to_numpy()).all()
+
+ c = 2
+ d = 2
+ p = 4
+ x = ti.Matrix.ndarray(c, d, ti.f32, p)
+ x.fill(2)
+ y = ti.Matrix.ndarray(c, d, ti.f32, p)
+ y.from_numpy(np.ones((p, c, d), dtype=np.int32) * 2)
+ assert (x.to_numpy() == y.to_numpy()).all()
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_ndarray_numpy_io():
+ _test_ndarray_numpy_io()
+
+
+def _test_matrix_ndarray_python_scope(layout):
+ a = ti.Matrix.ndarray(2, 2, ti.i32, 5, layout=layout)
+ for i in range(5):
+ for j, k in ti.ndrange(2, 2):
+ a[i][j, k] = j * j + k * k
+ assert a[0][0, 0] == 0
+ assert a[1][0, 1] == 1
+ assert a[2][1, 0] == 1
+ assert a[3][1, 1] == 2
+ assert a[4][0, 1] == 1
+
+
+@pytest.mark.parametrize('layout', layouts)
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_matrix_ndarray_python_scope(layout):
+ _test_matrix_ndarray_python_scope(layout)
+
+
+def _test_matrix_ndarray_taichi_scope(layout):
+ @ti.kernel
+ def func(a: ti.any_arr()):
+ for i in range(5):
+ for j, k in ti.ndrange(2, 2):
+ a[i][j, k] = j * j + k * k
+
+ m = ti.Matrix.ndarray(2, 2, ti.i32, 5, layout=layout)
+ func(m)
+ assert m[0][0, 0] == 0
+ assert m[1][0, 1] == 1
+ assert m[2][1, 0] == 1
+ assert m[3][1, 1] == 2
+ assert m[4][0, 1] == 1
+
+
+@pytest.mark.parametrize('layout', layouts)
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_matrix_ndarray_taichi_scope(layout):
+ _test_matrix_ndarray_taichi_scope(layout)
+
+
+def _test_matrix_ndarray_taichi_scope_struct_for(layout):
+ @ti.kernel
+ def func(a: ti.any_arr()):
+ for i in a:
+ for j, k in ti.ndrange(2, 2):
+ a[i][j, k] = j * j + k * k
+
+ m = ti.Matrix.ndarray(2, 2, ti.i32, 5, layout=layout)
+ func(m)
+ assert m[0][0, 0] == 0
+ assert m[1][0, 1] == 1
+ assert m[2][1, 0] == 1
+ assert m[3][1, 1] == 2
+ assert m[4][0, 1] == 1
+
+
+@pytest.mark.parametrize('layout', layouts)
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_matrix_ndarray_taichi_scope_struct_for(layout):
+ _test_matrix_ndarray_taichi_scope_struct_for(layout)
+
+
+@pytest.mark.parametrize('layout', layouts)
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_vector_ndarray_python_scope(layout):
+ a = ti.Vector.ndarray(10, ti.i32, 5, layout=layout)
+ for i in range(5):
+ for j in range(4):
+ a[i][j * j] = j * j
+ assert a[0][9] == 9
+ assert a[1][0] == 0
+ assert a[2][1] == 1
+ assert a[3][4] == 4
+ assert a[4][9] == 9
+
+
+@pytest.mark.parametrize('layout', layouts)
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_vector_ndarray_taichi_scope(layout):
+ @ti.kernel
+ def func(a: ti.any_arr()):
+ for i in range(5):
+ for j in range(4):
+ a[i][j * j] = j * j
+
+ v = ti.Vector.ndarray(10, ti.i32, 5, layout=layout)
+ func(v)
+ assert v[0][9] == 9
+ assert v[1][0] == 0
+ assert v[2][1] == 1
+ assert v[3][4] == 4
+ assert v[4][9] == 9
+
+
+# number of compiled functions
+
+
+def _test_compiled_functions():
+ @ti.kernel
+ def func(a: ti.any_arr(element_dim=1)):
+ for i in range(5):
+ for j in range(4):
+ a[i][j * j] = j * j
+
+ v = ti.Vector.ndarray(10, ti.i32, 5)
+ func(v)
+ assert impl.get_runtime().get_num_compiled_functions() == 1
+ v = np.zeros((6, 10), dtype=np.int32)
+ func(v)
+ assert impl.get_runtime().get_num_compiled_functions() == 1
+ v = np.zeros((6, 11), dtype=np.int32)
+ func(v)
+ assert impl.get_runtime().get_num_compiled_functions() == 2
+ v = ti.Vector.ndarray(10, ti.i32, 5, layout=ti.Layout.SOA)
+ func(v)
+ assert impl.get_runtime().get_num_compiled_functions() == 3
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_compiled_functions():
+ _test_compiled_functions()
+
+
+# annotation compatibility
+
+
+def _test_arg_not_match():
+ @ti.kernel
+ def func1(a: ti.any_arr(element_dim=1)):
+ pass
+
+ x = ti.Matrix.ndarray(2, 3, ti.i32, shape=(4, 7))
+ with pytest.raises(
+ ValueError,
+ match=
+ r'Invalid argument into ti\.any_arr\(\) - required element_dim=1, but .* is provided'
+ ):
+ func1(x)
+
+ @ti.kernel
+ def func2(a: ti.any_arr(element_dim=2)):
+ pass
+
+ x = ti.Vector.ndarray(2, ti.i32, shape=(4, 7))
+ with pytest.raises(
+ ValueError,
+ match=
+ r'Invalid argument into ti\.any_arr\(\) - required element_dim=2, but .* is provided'
+ ):
+ func2(x)
+
+ @ti.kernel
+ def func3(a: ti.any_arr(layout=ti.Layout.AOS)):
+ pass
+
+ x = ti.Matrix.ndarray(2, 3, ti.i32, shape=(4, 7), layout=ti.Layout.SOA)
+ with pytest.raises(
+ ValueError,
+ match=
+ r'Invalid argument into ti\.any_arr\(\) - required layout=Layout\.AOS, but .* is provided'
+ ):
+ func3(x)
+
+ @ti.kernel
+ def func4(a: ti.any_arr(layout=ti.Layout.SOA)):
+ pass
+
+ x = ti.Vector.ndarray(2, ti.i32, shape=(4, 7))
+ with pytest.raises(
+ ValueError,
+ match=
+ r'Invalid argument into ti\.any_arr\(\) - required layout=Layout\.SOA, but .* is provided'
+ ):
+ func4(x)
+
+ @ti.kernel
+ def func5(a: ti.any_arr(element_shape=(2, 3))):
+ pass
+
+ x = ti.Vector.ndarray(2, ti.i32, shape=(4, 7))
+ with pytest.raises(
+ ValueError,
+ match=
+ r'Invalid argument into ti\.any_arr\(\) - required element_dim'):
+ func5(x)
+
+ with pytest.raises(
+ ValueError,
+ match=r'Both element_shape and element_dim are specified'):
+
+ @ti.kernel
+ def func6(a: ti.any_arr(element_dim=1, element_shape=(2, 3))):
+ pass
+
+ @ti.kernel
+ def func7(a: ti.any_arr(field_dim=2)):
+ pass
+
+ x = ti.ndarray(ti.i32, shape=(3, ))
+ with pytest.raises(
+ ValueError,
+ match=r'Invalid argument into ti\.any_arr\(\) - required field_dim'
+ ):
+ func7(x)
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_arg_not_match():
+ _test_arg_not_match()
+
+
+def _test_size_in_bytes():
+ a = ti.ndarray(ti.i32, 8)
+ assert a._get_element_size() == 4
+ assert a._get_nelement() == 8
+
+ b = ti.Vector.ndarray(10, ti.f64, 5)
+ assert b._get_element_size() == 8
+ assert b._get_nelement() == 50
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_size_in_bytes():
+ _test_size_in_bytes()
+
+
+@test_utils.test(arch=supported_archs_taichi_ndarray)
+def test_different_shape():
+ n1 = 4
+ x = ti.ndarray(dtype=ti.f32, shape=(n1, n1))
+
+ @ti.kernel
+ def init(d: ti.i32, arr: ti.any_arr()):
+ for i, j in arr:
+ arr[i, j] = d
+
+ init(2, x)
+ assert (x.to_numpy() == (np.ones(shape=(n1, n1)) * 2)).all()
+ n2 = 8
+ y = ti.ndarray(dtype=ti.f32, shape=(n2, n2))
+ init(3, y)
+ assert (y.to_numpy() == (np.ones(shape=(n2, n2)) * 3)).all()
diff --git a/tests/_python_orig/test_ndrange.py b/tests/_python_orig/test_ndrange.py
new file mode 100644
index 000000000..22def2d89
--- /dev/null
+++ b/tests/_python_orig/test_ndrange.py
@@ -0,0 +1,248 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_1d():
+ x = ti.field(ti.f32, shape=(16))
+
+ @ti.kernel
+ def func():
+ for i in ti.ndrange((4, 10)):
+ x[i] = i
+
+ func()
+
+ for i in range(16):
+ if 4 <= i < 10:
+ assert x[i] == i
+ else:
+ assert x[i] == 0
+
+
+@test_utils.test()
+def test_2d():
+ x = ti.field(ti.f32, shape=(16, 32))
+
+ t = 8
+
+ @ti.kernel
+ def func():
+ for i, j in ti.ndrange((4, 10), (3, t)):
+ val = i + j * 10
+ x[i, j] = val
+
+ func()
+ for i in range(16):
+ for j in range(32):
+ if 4 <= i < 10 and 3 <= j < 8:
+ assert x[i, j] == i + j * 10
+ else:
+ assert x[i, j] == 0
+
+
+@test_utils.test()
+def test_3d():
+ x = ti.field(ti.f32, shape=(16, 32, 64))
+
+ @ti.kernel
+ def func():
+ for i, j, k in ti.ndrange((4, 10), (3, 8), 17):
+ x[i, j, k] = i + j * 10 + k * 100
+
+ func()
+ for i in range(16):
+ for j in range(32):
+ for k in range(64):
+ if 4 <= i < 10 and 3 <= j < 8 and k < 17:
+ assert x[i, j, k] == i + j * 10 + k * 100
+ else:
+ assert x[i, j, k] == 0
+
+
+@test_utils.test()
+def test_tensor_based_3d():
+ x = ti.field(ti.i32, shape=(6, 6, 6))
+ y = ti.field(ti.i32, shape=(6, 6, 6))
+
+ @ti.kernel
+ def func():
+ lower = ti.Vector([0, 1, 2])
+ upper = ti.Vector([3, 4, 5])
+ for I in ti.grouped(
+ ti.ndrange((lower[0], upper[0]), (lower[1], upper[1]),
+ (lower[2], upper[2]))):
+ x[I] = I[0] + I[1] + I[2]
+ for i in range(0, 3):
+ for j in range(1, 4):
+ for k in range(2, 5):
+ y[i, j, k] = i + j + k
+
+ func()
+
+ for i in range(6):
+ for j in range(6):
+ for k in range(6):
+ assert x[i, j, k] == y[i, j, k]
+
+
+@test_utils.test()
+def test_static_grouped():
+ x = ti.field(ti.f32, shape=(16, 32, 64))
+
+ @ti.kernel
+ def func():
+ for I in ti.static(ti.grouped(ti.ndrange((4, 5), (3, 5), 5))):
+ x[I] = I[0] + I[1] * 10 + I[2] * 100
+
+ func()
+ for i in range(16):
+ for j in range(32):
+ for k in range(64):
+ if 4 <= i < 5 and 3 <= j < 5 and k < 5:
+ assert x[i, j, k] == i + j * 10 + k * 100
+ else:
+ assert x[i, j, k] == 0
+
+
+@test_utils.test()
+def test_static_grouped_static():
+ x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(16, 4))
+
+ @ti.kernel
+ def func():
+ for i, j in ti.ndrange(16, 4):
+ for I in ti.static(ti.grouped(ti.ndrange(2, 3))):
+ x[i, j][I] = I[0] + I[1] * 10 + i + j * 4
+
+ func()
+ for i in range(16):
+ for j in range(4):
+ for k in range(2):
+ for l in range(3):
+ assert x[i, j][k, l] == k + l * 10 + i + j * 4
+
+
+@test_utils.test()
+def test_field_init_eye():
+ # https://github.com/taichi-dev/taichi/issues/1824
+
+ n = 32
+
+ A = ti.field(ti.f32, (n, n))
+
+ @ti.kernel
+ def init():
+ for i, j in ti.ndrange(n, n):
+ if i == j:
+ A[i, j] = 1
+
+ init()
+ assert np.allclose(A.to_numpy(), np.eye(n, dtype=np.float32))
+
+
+@test_utils.test()
+def test_ndrange_index_floordiv():
+ # https://github.com/taichi-dev/taichi/issues/1829
+
+ n = 10
+
+ A = ti.field(ti.f32, (n, n))
+
+ @ti.kernel
+ def init():
+ for i, j in ti.ndrange(n, n):
+ if i // 2 == 0:
+ A[i, j] = i
+
+ init()
+ for i in range(n):
+ for j in range(n):
+ if i // 2 == 0:
+ assert A[i, j] == i
+ else:
+ assert A[i, j] == 0
+
+
+@test_utils.test()
+def test_nested_ndrange():
+ # https://github.com/taichi-dev/taichi/issues/1829
+
+ n = 2
+
+ A = ti.field(ti.i32, (n, n, n, n))
+
+ @ti.kernel
+ def init():
+ for i, j in ti.ndrange(n, n):
+ for k, l in ti.ndrange(n, n):
+ r = i * n**3 + j * n**2 + k * n + l
+ A[i, j, k, l] = r
+
+ init()
+ for i in range(n):
+ for j in range(n):
+ for k in range(n):
+ for l in range(n):
+ r = i * n**3 + j * n**2 + k * n + l
+ assert A[i, j, k, l] == r
+
+
+@test_utils.test(ti.cpu)
+def test_ndrange_ast_transform():
+ n, u, v = 4, 3, 2
+
+ a = ti.field(ti.i32, ())
+ b = ti.field(ti.i32, ())
+ A = ti.field(ti.i32, (n, n))
+
+ @ti.kernel
+ def func():
+ # `__getitem__ cannot be called from Python-scope` will be raised if
+ # `a[None]` is not transformed to `ti.subscript(a, None)` in ti.ndrange:
+ for i, j in ti.ndrange(a[None], b[None]):
+ r = i * n + j + 1
+ A[i, j] = r
+
+ a[None] = u
+ b[None] = v
+
+ func()
+
+ for i in range(n):
+ for j in range(n):
+ if i < u and j < v:
+ r = i * n + j + 1
+ else:
+ r = 0
+ assert A[i, j] == r
+
+
+@test_utils.test()
+def test_grouped_ndrange_star():
+ @ti.kernel
+ def foo() -> ti.i32:
+ ret = 0
+ for I in ti.grouped(ti.ndrange(*[[1, 3]] * 3)):
+ ret += I[0] + I[1] + I[2]
+ return ret
+
+ assert foo() == 36
+
+
+@test_utils.test()
+def test_ndrange_three_arguments():
+ @ti.kernel
+ def foo():
+ for i in ti.ndrange((1, 2, 3)):
+ pass
+
+ with pytest.raises(
+ ti.TaichiSyntaxError,
+ match=
+ r"Every argument of ndrange should be a scalar or a tuple/list like \(begin, end\)"
+ ):
+ foo()
diff --git a/tests/_python_orig/test_nested_kernel_error.py b/tests/_python_orig/test_nested_kernel_error.py
new file mode 100644
index 000000000..605432263
--- /dev/null
+++ b/tests/_python_orig/test_nested_kernel_error.py
@@ -0,0 +1,18 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_nested_kernel_error():
+ @ti.kernel
+ def B():
+ pass
+
+ @ti.kernel
+ def A():
+ B()
+
+ with pytest.raises(ti.TaichiCompilationError):
+ A()
diff --git a/tests/_python_orig/test_new_allocator.py b/tests/_python_orig/test_new_allocator.py
new file mode 100644
index 000000000..361f9af04
--- /dev/null
+++ b/tests/_python_orig/test_new_allocator.py
@@ -0,0 +1,82 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_1d():
+ N = 16
+
+ x = ti.field(ti.f32, shape=(N, ))
+ y = ti.field(ti.f32, shape=(N, ))
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ y[i] = x[i]
+
+ for i in range(N):
+ x[i] = i * 2
+
+ func()
+
+ for i in range(N):
+ assert y[i] == i * 2
+
+
+@test_utils.test()
+def test_3d():
+ N = 2
+ M = 2
+
+ x = ti.field(ti.f32, shape=(N, M))
+ y = ti.field(ti.f32, shape=(N, M))
+
+ @ti.kernel
+ def func():
+ for I in ti.grouped(x):
+ y[I] = x[I]
+
+ for i in range(N):
+ for j in range(M):
+ x[i, j] = i * 10 + j
+
+ func()
+
+ for i in range(N):
+ for j in range(M):
+ assert y[i, j] == i * 10 + j
+
+
+@test_utils.test()
+def test_matrix():
+ N = 16
+
+ x = ti.Matrix.field(2, 2, dtype=ti.f32, shape=(N, ), layout=ti.Layout.AOS)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ x[i][1, 1] = x[i][0, 0]
+
+ for i in range(N):
+ x[i][0, 0] = i + 3
+
+ func()
+
+ for i in range(N):
+ assert x[i][1, 1] == i + 3
+
+
+@test_utils.test()
+def test_alloc_in_kernel():
+    return  # build bots may not have this much memory to run this test...
+ x = ti.field(ti.f32)
+
+ ti.root.pointer(ti.i, 8192).dense(ti.i, 1024 * 1024).place(x)
+
+ @ti.kernel
+ def touch():
+ for i in range(4096):
+ x[i * 1024 * 1024] = 1
+
+ touch()
diff --git a/tests/_python_orig/test_no_activate.py b/tests/_python_orig/test_no_activate.py
new file mode 100644
index 000000000..43311f2c1
--- /dev/null
+++ b/tests/_python_orig/test_no_activate.py
@@ -0,0 +1,30 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_no_activate():
+ x = ti.field(ti.f32)
+
+ n = 1024
+
+ d = ti.root.dynamic(ti.i, n, chunk_size=32)
+ d.place(x)
+
+ @ti.kernel
+ def initialize():
+ for i in range(n):
+ x[i] = 1
+
+ @ti.kernel
+ def func():
+ ti.no_activate(d)
+ for i in range(n // 2):
+ x[i * 2 + 1] += 1
+
+ initialize()
+
+ func()
+
+ for i in range(n):
+ assert x[i] == i % 2 + 1
diff --git a/tests/_python_orig/test_no_grad.py b/tests/_python_orig/test_no_grad.py
new file mode 100644
index 000000000..e89275e0b
--- /dev/null
+++ b/tests/_python_orig/test_no_grad.py
@@ -0,0 +1,45 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_no_grad():
+ x = ti.field(ti.f32)
+ loss = ti.field(ti.f32)
+
+ N = 1
+
+ # no gradients allocated for x
+ ti.root.dense(ti.i, N).place(x)
+ ti.root.place(loss, loss.grad)
+
+ @ti.kernel
+ def func():
+ for i in range(N):
+ ti.atomic_add(loss[None], x[i]**2)
+
+ with ti.Tape(loss):
+ func()
+
+
+@test_utils.test()
+def test_raise_no_gradient():
+ y = ti.field(shape=(), name='y', dtype=ti.f32, needs_grad=True)
+ x = ti.field(shape=(), name='x', dtype=ti.f32)
+ z = np.array([1.0])
+
+ @ti.kernel
+ def func(x: ti.template()):
+ y[None] = x.grad[None] * x.grad[None]
+ z[0] = x.grad[None]
+
+ x[None] = 5.
+ with pytest.raises(
+ ti.TaichiCompilationError,
+ match=
+ 'Gradient x.grad has not been placed, check whether `needs_grad=True`'
+ ):
+ func(x)
diff --git a/tests/_python_orig/test_non_taichi_types_in_kernel.py b/tests/_python_orig/test_non_taichi_types_in_kernel.py
new file mode 100644
index 000000000..96ed9032e
--- /dev/null
+++ b/tests/_python_orig/test_non_taichi_types_in_kernel.py
@@ -0,0 +1,20 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_subscript_user_classes_in_kernel():
+ class MyList:
+ def __init__(self, elements):
+ self.elements = elements
+
+ def __getitem__(self, index):
+ return self.elements[index]
+
+ @ti.kernel
+ def func():
+ for i in ti.static(range(3)):
+ print(a[i])
+
+ a = MyList([1, 2, 3])
+ func()
diff --git a/tests/_python_orig/test_numpy.py b/tests/_python_orig/test_numpy.py
new file mode 100644
index 000000000..58019da6e
--- /dev/null
+++ b/tests/_python_orig/test_numpy.py
@@ -0,0 +1,222 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+def with_data_type(dt):
+ val = ti.field(ti.i32)
+
+ n = 4
+
+ ti.root.dense(ti.i, n).place(val)
+
+ @ti.kernel
+ def test_numpy(arr: ti.ext_arr()):
+ for i in range(n):
+ arr[i] = arr[i]**2
+
+ a = np.array([4, 8, 1, 24], dtype=dt)
+
+ for i in range(n):
+ a[i] = i * 2
+
+ test_numpy(a)
+
+ for i in range(n):
+ assert a[i] == i * i * 4
+
+
+@test_utils.test()
+def test_numpy_f32():
+ with_data_type(np.float32)
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_numpy_f64():
+ with_data_type(np.float64)
+
+
+@test_utils.test()
+def test_numpy_i32():
+ with_data_type(np.int32)
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_numpy_i64():
+ with_data_type(np.int64)
+
+
+@test_utils.test()
+def test_numpy_2d():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.i, n).dense(ti.j, m).place(val)
+
+ @ti.kernel
+ def test_numpy(arr: ti.ext_arr()):
+ for i in range(n):
+ for j in range(m):
+ arr[i, j] += i + j
+
+ a = np.empty(shape=(n, m), dtype=np.int32)
+
+ for i in range(n):
+ for j in range(m):
+ a[i, j] = i * j
+
+ test_numpy(a)
+
+ for i in range(n):
+ for j in range(m):
+ assert a[i, j] == i * j + i + j
+
+
+@test_utils.test()
+def test_numpy_2d_transpose():
+ val = ti.field(ti.i32)
+
+ n = 8
+ m = 8
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ @ti.kernel
+ def test_numpy(arr: ti.ext_arr()):
+ for i in ti.grouped(val):
+ val[i] = arr[i]
+
+ a = np.empty(shape=(n, m), dtype=np.int32)
+
+ for i in range(n):
+ for j in range(m):
+ a[i, j] = i * j + i * 4
+
+ test_numpy(a.transpose())
+
+ for i in range(n):
+ for j in range(m):
+ assert val[i, j] == i * j + j * 4
+
+
+@test_utils.test()
+def test_numpy_3d():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 7
+ p = 11
+
+ ti.root.dense(ti.i, n).dense(ti.j, m).dense(ti.k, p).place(val)
+
+ @ti.kernel
+ def test_numpy(arr: ti.ext_arr()):
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ arr[i, j, k] += i + j + k * 2
+
+ a = np.empty(shape=(n, m, p), dtype=np.int32)
+
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ a[i, j, k] = i * j * (k + 1)
+
+ test_numpy(a)
+
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ assert a[i, j, k] == i * j * (k + 1) + i + j + k * 2
+
+
+@test_utils.test()
+def test_numpy_3d_error():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 7
+ p = 11
+
+ ti.root.dense(ti.i, n).dense(ti.j, m).dense(ti.k, p).place(val)
+
+ @ti.kernel
+ def test_numpy(arr: ti.ext_arr()):
+ for i in range(n):
+ for j in range(m):
+ for k in range(p):
+ arr[i, j] += i + j + k * 2
+
+ a = np.empty(shape=(n, m, p), dtype=np.int32)
+
+ with pytest.raises(ti.TaichiCompilationError):
+ test_numpy(a)
+
+
+@test_utils.test()
+def test_numpy_multiple_external_arrays():
+
+ n = 4
+
+ @ti.kernel
+ def test_numpy(a: ti.ext_arr(), b: ti.ext_arr()):
+ for i in range(n):
+ a[i] = a[i] * b[i]
+ b[i] = a[i] + b[i]
+
+ a = np.array([4, 8, 1, 24], dtype=np.int32)
+ b = np.array([5, 6, 12, 3], dtype=np.int32)
+ c = a * b
+ d = c + b
+
+ test_numpy(a, b)
+ for i in range(n):
+ assert a[i] == c[i]
+ assert b[i] == d[i]
+
+
+@test_utils.test()
+def test_index_mismatch():
+ with pytest.raises(AssertionError):
+ val = ti.field(ti.i32, shape=(1, 2, 3))
+ val[0, 0] = 1
+
+
+@test_utils.test()
+def test_numpy_zero():
+ @ti.kernel
+ def test_numpy(arr: ti.ext_arr()):
+ pass
+
+ test_numpy(np.empty(shape=(0), dtype=np.int32))
+ test_numpy(np.empty(shape=(0, 5), dtype=np.int32))
+ test_numpy(np.empty(shape=(5, 0), dtype=np.int32))
+
+
+@test_utils.test()
+def test_numpy_struct_for():
+ @ti.kernel
+ def func1(a: ti.any_arr()):
+ for i, j in a:
+ a[i, j] = i + j
+
+ m = np.zeros((123, 456), dtype=np.int32)
+ func1(m)
+ for i in range(123):
+ for j in range(456):
+ assert m[i, j] == i + j
+
+ @ti.kernel
+ def func2(a: ti.any_arr()):
+ for I in ti.grouped(a):
+ a[I] = I.sum()
+
+ n = np.zeros((98, 76, 54), dtype=np.int32)
+ func2(n)
+ for i, j, k in ti.ndrange(98, 76, 54):
+ assert n[i, j, k] == i + j + k
diff --git a/tests/_python_orig/test_numpy_io.py b/tests/_python_orig/test_numpy_io.py
new file mode 100644
index 000000000..e2795ff8a
--- /dev/null
+++ b/tests/_python_orig/test_numpy_io.py
@@ -0,0 +1,162 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_to_numpy_2d():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ for i in range(n):
+ for j in range(m):
+ val[i, j] = i + j * 3
+
+ arr = val.to_numpy()
+
+ assert arr.shape == (4, 7)
+ for i in range(n):
+ for j in range(m):
+ assert arr[i, j] == i + j * 3
+
+
+@test_utils.test()
+def test_from_numpy_2d():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ arr = np.empty(shape=(n, m), dtype=np.int32)
+
+ for i in range(n):
+ for j in range(m):
+ arr[i, j] = i + j * 3
+
+ val.from_numpy(arr)
+
+ for i in range(n):
+ for j in range(m):
+ assert val[i, j] == i + j * 3
+
+
+@test_utils.test()
+def test_to_numpy_struct():
+ n = 16
+ f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+
+ for i in range(n):
+ f[i].a = i
+ f[i].b = f[i].a * 2
+
+ arr_dict = f.to_numpy()
+
+ for i in range(n):
+ assert arr_dict["a"][i] == i
+ assert arr_dict["b"][i] == i * 2
+
+
+@test_utils.test()
+def test_from_numpy_struct():
+ n = 16
+ f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+
+ arr_dict = {
+ "a": np.arange(n, dtype=np.int32),
+ "b": np.arange(n, dtype=np.int32) * 2,
+ }
+
+ f.from_numpy(arr_dict)
+
+ for i in range(n):
+ assert f[i].a == i
+ assert f[i].b == i * 2
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_f64():
+ val = ti.field(ti.f64)
+
+ n = 4
+ m = 7
+
+ ti.root.dense(ti.ij, (n, m)).place(val)
+
+ for i in range(n):
+ for j in range(m):
+ val[i, j] = (i + j * 3) * 1e100
+
+ val.from_numpy(val.to_numpy() * 2)
+
+ for i in range(n):
+ for j in range(m):
+ assert val[i, j] == (i + j * 3) * 2e100
+
+
+@test_utils.test()
+def test_matrix():
+ n = 4
+ m = 7
+ val = ti.Matrix.field(2, 3, ti.f32, shape=(n, m))
+
+ nparr = np.empty(shape=(n, m, 2, 3), dtype=np.float32)
+ for i in range(n):
+ for j in range(m):
+ for k in range(2):
+ for l in range(3):
+ nparr[i, j, k, l] = i + j * 2 - k - l * 3
+
+ val.from_numpy(nparr)
+ new_nparr = val.to_numpy()
+ assert (nparr == new_nparr).all()
+
+
+@test_utils.test()
+def test_numpy_io_example():
+ n = 4
+ m = 7
+
+ # Taichi tensors
+ val = ti.field(ti.i32, shape=(n, m))
+ vec = ti.Vector.field(3, dtype=ti.i32, shape=(n, m))
+ mat = ti.Matrix.field(3, 4, dtype=ti.i32, shape=(n, m))
+
+ # Scalar
+ arr = np.ones(shape=(n, m), dtype=np.int32)
+ val.from_numpy(arr)
+ arr = val.to_numpy()
+
+ # Vector
+ arr = np.ones(shape=(n, m, 3), dtype=np.int32)
+ vec.from_numpy(arr)
+
+ arr = np.ones(shape=(n, m, 3, 1), dtype=np.int32)
+ vec.from_numpy(arr)
+
+ arr = np.ones(shape=(n, m, 1, 3), dtype=np.int32)
+ vec.from_numpy(arr)
+
+ arr = vec.to_numpy()
+ assert arr.shape == (n, m, 3)
+
+ arr = vec.to_numpy(keep_dims=True)
+ assert arr.shape == (n, m, 3, 1)
+
+ # Matrix
+ arr = np.ones(shape=(n, m, 3, 4), dtype=np.int32)
+ mat.from_numpy(arr)
+
+ arr = mat.to_numpy()
+ assert arr.shape == (n, m, 3, 4)
+
+ arr = mat.to_numpy(keep_dims=True)
+ assert arr.shape == (n, m, 3, 4)
+
+ # For PyTorch tensors, use to_torch/from_torch instead
diff --git a/tests/_python_orig/test_offload.py b/tests/_python_orig/test_offload.py
new file mode 100644
index 000000000..f1ebfe7bd
--- /dev/null
+++ b/tests/_python_orig/test_offload.py
@@ -0,0 +1,84 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_running_loss():
+ return
+ steps = 16
+
+ total_loss = ti.field(ti.f32)
+ running_loss = ti.field(ti.f32)
+ additional_loss = ti.field(ti.f32)
+
+ ti.root.place(total_loss)
+ ti.root.dense(ti.i, steps).place(running_loss)
+ ti.root.place(additional_loss)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def compute_loss():
+ total_loss[None] = 0.0
+ for i in range(steps):
+ ti.atomic_add(total_loss[None], running_loss[i] * 2)
+ ti.atomic_add(total_loss[None], additional_loss[None] * 3)
+
+ compute_loss()
+
+ assert total_loss.grad[None] == 1
+ for i in range(steps):
+ assert running_loss[i] == 2
+ assert additional_loss.grad[None] == 3
+
+
+@test_utils.test()
+def test_reduce_separate():
+ a = ti.field(ti.f32, shape=(16))
+ b = ti.field(ti.f32, shape=(4))
+ c = ti.field(ti.f32, shape=())
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def reduce1():
+ for i in range(16):
+ b[i // 4] += a[i]
+
+ @ti.kernel
+ def reduce2():
+ for i in range(4):
+ c[None] += b[i]
+
+ c.grad[None] = 1
+ reduce2.grad()
+ reduce1.grad()
+
+ for i in range(4):
+ assert b.grad[i] == 1
+ for i in range(16):
+ assert a.grad[i] == 1
+
+
+@test_utils.test()
+def test_reduce_merged():
+ a = ti.field(ti.f32, shape=(16))
+ b = ti.field(ti.f32, shape=(4))
+ c = ti.field(ti.f32, shape=())
+
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def reduce():
+ for i in range(16):
+ b[i // 4] += a[i]
+
+ for i in range(4):
+ c[None] += b[i]
+
+ c.grad[None] = 1
+ reduce.grad()
+
+ for i in range(4):
+ assert b.grad[i] == 1
+ for i in range(16):
+ assert a.grad[i] == 1
diff --git a/tests/_python_orig/test_offload_cross.py b/tests/_python_orig/test_offload_cross.py
new file mode 100644
index 000000000..4a2b6d313
--- /dev/null
+++ b/tests/_python_orig/test_offload_cross.py
@@ -0,0 +1,135 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_offload_with_cross_block_locals():
+ ret = ti.field(ti.f32)
+
+ ti.root.place(ret)
+
+ @ti.kernel
+ def ker():
+ s = 0
+ for i in range(10):
+ s += i
+ ret[None] = s
+
+ ker()
+
+ assert ret[None] == 45
+
+
+@test_utils.test()
+def test_offload_with_cross_block_locals2():
+ ret = ti.field(ti.f32)
+
+ ti.root.place(ret)
+
+ @ti.kernel
+ def ker():
+ s = 0
+ for i in range(10):
+ s += i
+ ret[None] = s
+ s = ret[None] * 2
+ for i in range(10):
+ ti.atomic_add(ret[None], s)
+
+ ker()
+
+ assert ret[None] == 45 * 21
+
+
+@test_utils.test()
+def test_offload_with_cross_block_locals3():
+ ret = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def ker():
+ s = 1
+ t = s
+ for i in range(10):
+ s += i
+ ret[None] = t
+
+ ker()
+
+ assert ret[None] == 1
+
+
+@test_utils.test()
+def test_offload_with_cross_block_locals4():
+ ret = ti.field(ti.f32, shape=())
+
+ @ti.kernel
+ def ker():
+ a = 1
+ b = 0
+ for i in range(10):
+ b += a
+ ret[None] = b
+
+ ker()
+
+ assert ret[None] == 10
+
+
+@test_utils.test()
+def test_offload_with_flexible_bounds():
+ s = ti.field(ti.i32, shape=())
+ lower = ti.field(ti.i32, shape=())
+ upper = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def ker():
+ for i in range(lower[None], upper[None]):
+ s[None] += i
+
+ lower[None] = 10
+ upper[None] = 20
+ ker()
+
+ assert s[None] == 29 * 10 // 2
+
+
+@test_utils.test()
+def test_offload_with_cross_block_globals():
+ ret = ti.field(ti.f32)
+
+ ti.root.place(ret)
+
+ @ti.kernel
+ def ker():
+ ret[None] = 0
+ for i in range(10):
+ ret[None] += i
+ ret[None] += 1
+
+ ker()
+
+ assert ret[None] == 46
+
+
+@test_utils.test()
+def test_offload_with_cross_nested_for():
+ @ti.kernel
+ def run(a: ti.i32):
+ b = a + 1
+ for x in range(1):
+ for i in range(b):
+ print('OK')
+
+ run(2)
+
+
+@test_utils.test()
+def test_offload_with_cross_if_inside_for():
+ @ti.kernel
+ def run(a: ti.i32):
+ b = a > 2
+ for x in range(1):
+ if b:
+ print('OK')
+
+ run(2)
diff --git a/tests/_python_orig/test_offset.py b/tests/_python_orig/test_offset.py
new file mode 100644
index 000000000..fd41b732a
--- /dev/null
+++ b/tests/_python_orig/test_offset.py
@@ -0,0 +1,141 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_accessor():
+ a = ti.field(dtype=ti.i32)
+
+ ti.root.dense(ti.ijk, 128).place(a, offset=(1024, 2048, 2100))
+
+ a[1029, 2100, 2200] = 1
+ assert a[1029, 2100, 2200] == 1
+
+
+@test_utils.test()
+def test_struct_for_huge_offsets():
+ a = ti.field(dtype=ti.i32)
+
+ offset = 1024, 2048, 2100, 2200
+ ti.root.dense(ti.ijkl, 4).place(a, offset=offset)
+
+ @ti.kernel
+ def test():
+ for i, j, k, l in a:
+ a[i, j, k, l] = i + j * 10 + k * 100 + l * 1000
+
+ test()
+
+ for i in range(offset[0], offset[0] + 4):
+ for j in range(offset[1], offset[1] + 4):
+ for k in range(offset[2], offset[2] + 4):
+ for l in range(offset[3], offset[3] + 4):
+ assert a[i, j, k, l] == i + j * 10 + k * 100 + l * 1000
+
+
+@test_utils.test()
+def test_struct_for_negative():
+ a = ti.field(dtype=ti.i32)
+
+ offset = 16, -16
+ ti.root.dense(ti.ij, 32).place(a, offset=offset)
+
+ @ti.kernel
+ def test():
+ for i, j in a:
+ a[i, j] = i + j * 10
+
+ test()
+
+ for i in range(16, 48):
+ for j in range(-16, 16):
+ assert a[i, j] == i + j * 10
+
+
+@test_utils.test()
+def test_offset_for_var():
+ a = ti.field(dtype=ti.i32, shape=16, offset=-48)
+ b = ti.field(dtype=ti.i32, shape=(16, ), offset=(16, ))
+ c = ti.field(dtype=ti.i32, shape=(16, 64), offset=(-16, -64))
+ d = ti.field(dtype=ti.i32, shape=(16, 64), offset=None)
+
+ offset = 4, -4
+ shape = 16, 16
+ e = ti.field(dtype=ti.i32, shape=shape, offset=offset)
+
+ @ti.kernel
+ def test():
+ for i, j in e:
+ e[i, j] = i * j
+
+ test()
+ for i in range(4, 20):
+ for j in range(-4, 12):
+ assert e[i, j] == i * j
+
+
+@test_utils.test()
+def test_offset_for_vector():
+ a = ti.field(dtype=ti.i32, shape=16, offset=-48)
+ b = ti.field(dtype=ti.i32, shape=16, offset=None)
+
+ offset = 16
+ shape = 16
+ c = ti.Vector.field(n=1, dtype=ti.i32, shape=shape, offset=offset)
+
+ @ti.kernel
+ def test():
+ for i in c:
+ c[i][0] = 2 * i
+
+ test()
+ for i in range(offset, offset + shape, 1):
+ assert c[i][0] == 2 * i
+
+
+@test_utils.test()
+def test_offset_for_matrix():
+ a = ti.Matrix.field(3,
+ 3,
+ shape=(16, 16),
+ offset=(-16, 16),
+ dtype=ti.float32)
+
+ @ti.kernel
+ def test():
+ for i, j in a:
+ for m in range(3):
+ a[i, j][0, 0] = i + j
+
+ test()
+
+ for i in range(-16, 0):
+ for j in range(16, 32):
+ assert a[i, j][0, 0] == i + j
+
+
+@test_utils.test()
+def test_offset_must_throw_var():
+ with pytest.raises(AssertionError):
+ a = ti.field(dtype=ti.float32, shape=3, offset=(3, 4))
+ b = ti.field(dtype=ti.float32, shape=None, offset=(3, 4))
+
+
+@test_utils.test()
+def test_offset_must_throw_vector():
+ with pytest.raises(AssertionError):
+ a = ti.Vector.field(3, dtype=ti.float32, shape=3, offset=(3, 4))
+ b = ti.Vector.field(3, dtype=ti.float32, shape=None, offset=(3, ))
+
+
+@test_utils.test()
+def test_offset_must_throw_matrix():
+ with pytest.raises(AssertionError):
+ c = ti.Matrix.field(3,
+ 3,
+ dtype=ti.i32,
+ shape=(32, 16, 8),
+ offset=(32, 16))
+ d = ti.Matrix.field(3, 3, dtype=ti.i32, shape=None, offset=(32, 16))
diff --git a/tests/_python_orig/test_oop.py b/tests/_python_orig/test_oop.py
new file mode 100644
index 000000000..059d774f9
--- /dev/null
+++ b/tests/_python_orig/test_oop.py
@@ -0,0 +1,266 @@
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_classfunc():
+ @ti.data_oriented
+ class Array2D:
+ def __init__(self, n, m):
+ self.n = n
+ self.m = m
+ self.val = ti.field(ti.f32, shape=(n, m))
+
+ @ti.func
+ def inc(self, i, j):
+ self.val[i, j] += i * j
+
+ @ti.func
+ def mul(self, i, j):
+ return i * j
+
+ @ti.kernel
+ def fill(self):
+ for i, j in self.val:
+ self.inc(i, j)
+ self.val[i, j] += self.mul(i, j)
+
+ arr = Array2D(128, 128)
+
+ arr.fill()
+
+ for i in range(arr.n):
+ for j in range(arr.m):
+ assert arr.val[i, j] == i * j * 2
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_oop():
+ @ti.data_oriented
+ class Array2D:
+ def __init__(self, n, m, increment):
+ self.n = n
+ self.m = m
+ self.val = ti.field(ti.f32)
+ self.total = ti.field(ti.f32)
+ self.increment = increment
+
+ ti.root.dense(ti.ij, (self.n, self.m)).place(self.val)
+ ti.root.place(self.total)
+
+ @ti.kernel
+ def inc(self):
+ for i, j in self.val:
+ self.val[i, j] += self.increment
+
+ @ti.kernel
+ def inc2(self, increment: ti.i32):
+ for i, j in self.val:
+ self.val[i, j] += increment
+
+ @ti.kernel
+ def reduce(self):
+ for i, j in self.val:
+ self.total[None] += self.val[i, j] * 4
+
+ arr = Array2D(128, 128, 3)
+
+ double_total = ti.field(ti.f32)
+
+ ti.root.place(double_total)
+ ti.root.lazy_grad()
+
+ arr.inc()
+ arr.inc.grad()
+ assert arr.val[3, 4] == 3
+ arr.inc2(4)
+ assert arr.val[3, 4] == 7
+
+ with ti.Tape(loss=arr.total):
+ arr.reduce()
+
+ for i in range(arr.n):
+ for j in range(arr.m):
+ assert arr.val.grad[i, j] == 4
+
+ @ti.kernel
+ def double():
+ double_total[None] = 2 * arr.total[None]
+
+ with ti.Tape(loss=double_total):
+ arr.reduce()
+ double()
+
+ for i in range(arr.n):
+ for j in range(arr.m):
+ assert arr.val.grad[i, j] == 8
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_oop_two_items():
+ @ti.data_oriented
+ class Array2D:
+ def __init__(self, n, m, increment, multiplier):
+ self.n = n
+ self.m = m
+ self.val = ti.field(ti.f32)
+ self.total = ti.field(ti.f32)
+ self.increment = increment
+ self.multiplier = multiplier
+ ti.root.dense(ti.ij, (self.n, self.m)).place(self.val)
+ ti.root.place(self.total)
+
+ @ti.kernel
+ def inc(self):
+ for i, j in self.val:
+ self.val[i, j] += self.increment
+
+ @ti.kernel
+ def reduce(self):
+ for i, j in self.val:
+ self.total[None] += self.val[i, j] * self.multiplier
+
+ arr1_inc, arr1_mult = 3, 4
+ arr2_inc, arr2_mult = 6, 8
+ arr1 = Array2D(128, 128, arr1_inc, arr1_mult)
+ arr2 = Array2D(16, 32, arr2_inc, arr2_mult)
+
+ ti.root.lazy_grad()
+
+ arr1.inc()
+ arr1.inc.grad()
+ arr2.inc()
+ arr2.inc.grad()
+ assert arr1.val[3, 4] == arr1_inc
+ assert arr2.val[8, 6] == arr2_inc
+
+ with ti.Tape(loss=arr1.total):
+ arr1.reduce()
+ with ti.Tape(loss=arr2.total, clear_gradients=False):
+ arr2.reduce()
+ for i in range(arr1.n):
+ for j in range(arr1.m):
+ assert arr1.val.grad[i, j] == arr1_mult
+ for i in range(arr2.n):
+ for j in range(arr2.m):
+ assert arr2.val.grad[i, j] == arr2_mult
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_oop_inherit_ok():
+    # Array1D inherits from object, so the call stack shows 'class Array1D(object)'
+    # instead of '@ti.data_oriented'. Make sure this also works.
+ @ti.data_oriented
+ class Array1D(object):
+ def __init__(self, n, mul):
+ self.n = n
+ self.val = ti.field(ti.f32)
+ self.total = ti.field(ti.f32)
+ self.mul = mul
+ ti.root.dense(ti.ij, (self.n, )).place(self.val)
+ ti.root.place(self.total)
+
+ @ti.kernel
+ def reduce(self):
+ for i, j in self.val:
+ self.total[None] += self.val[i, j] * self.mul
+
+ arr = Array1D(128, 42)
+
+ ti.root.lazy_grad()
+
+ with ti.Tape(loss=arr.total):
+ arr.reduce()
+ for i in range(arr.n):
+ for j in range(arr.n):
+ assert arr.val.grad[i, j] == 42
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_oop_class_must_be_data_oriented():
+ class Array1D(object):
+ def __init__(self, n, mul):
+ self.n = n
+ self.val = ti.field(ti.f32)
+ self.total = ti.field(ti.f32)
+ self.mul = mul
+ ti.root.dense(ti.ij, (self.n, )).place(self.val)
+ ti.root.place(self.total)
+
+ @ti.kernel
+ def reduce(self):
+ for i, j in self.val:
+ self.total[None] += self.val[i, j] * self.mul
+
+ arr = Array1D(128, 42)
+
+ ti.root.lazy_grad()
+
+ # Array1D is not properly decorated, this will raise an Exception
+ with pytest.raises(ti.TaichiSyntaxError):
+ arr.reduce()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_hook():
+ @ti.data_oriented
+ class Solver:
+ def __init__(self, n, m, hook):
+ self.val = ti.field(ti.f32, shape=(n, m))
+ self.hook = hook
+
+ def run_hook(self):
+ self.hook(self.val)
+
+ @ti.kernel
+ def hook(x: ti.template()):
+ for i, j in x:
+ x[i, j] = 1.0
+
+ solver = Solver(32, 32, hook)
+ solver.run_hook()
+
+ for i in range(32):
+ for j in range(32):
+ assert (solver.val[i, j] == 1.0)
+
+
+@test_utils.test()
+def test_oop_with_portery_decorator():
+ @ti.data_oriented
+ class TestPortery:
+ @property
+ @ti.kernel
+ def kernel_property(self) -> ti.i32:
+ return 42
+
+ @property
+ def raw_proterty(self):
+ return 3
+
+ a = TestPortery()
+ assert a.kernel_property == 42
+
+ assert a.raw_proterty == 3
+
+
+@test_utils.test()
+def test_oop_with_static_decorator():
+ @ti.data_oriented
+ class TestStatic:
+ @staticmethod
+ @ti.kernel
+ def kernel_static() -> ti.i32:
+ return 42
+
+ @staticmethod
+ def raw_static():
+ return 3
+
+ a = TestStatic()
+ assert a.kernel_static() == 42
+
+ assert a.raw_static() == 3
diff --git a/tests/_python_orig/test_optimization.py b/tests/_python_orig/test_optimization.py
new file mode 100644
index 000000000..258fd41fa
--- /dev/null
+++ b/tests/_python_orig/test_optimization.py
@@ -0,0 +1,147 @@
+from taichi.lang.misc import serialize
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_advanced_store_forwarding_nested_loops():
+ val = ti.field(ti.i32)
+ ti.root.place(val)
+
+ @ti.kernel
+ def func():
+ # If we want to do store-forwarding to local loads inside loops,
+ # we should pass the last local store into the loop, rather than use
+ # an empty AllocaOptimize loop.
+ # See https://github.com/taichi-dev/taichi/pull/849.
+ a = val[None]
+ for i in range(1):
+ for j in range(1):
+ val[None] = a
+
+ val[None] = 10
+ func()
+ assert val[None] == 10
+
+
+@test_utils.test()
+def test_advanced_unused_store_elimination_if():
+ val = ti.field(ti.i32)
+ ti.root.place(val)
+
+ @ti.kernel
+ def func():
+ a = 1
+ if val[None]:
+ a = 2
+ if val[None]:
+ a = 3
+ else:
+ a = 4
+ val[None] = a
+ else:
+ val[None] = a
+
+ val[None] = 0
+ func()
+ assert val[None] == 1
+ func()
+ assert val[None] == 3
+
+
+@test_utils.test()
+def test_local_store_in_nested_for_and_if():
+ # See https://github.com/taichi-dev/taichi/pull/862.
+ val = ti.field(ti.i32, shape=(3, 3, 3))
+
+ @ti.kernel
+ def func():
+ serialize()
+ for i, j, k in val:
+ if i < 2 and j < 2 and k < 2:
+ a = 0
+ for di, dj, dk in ti.ndrange((0, 2), (0, 2), (0, 2)):
+ if val[i + di, j + dj, k + dk] == 1:
+ a = val[i + di, j + dj, k + dk]
+
+ for di, dj, dk in ti.ndrange((0, 2), (0, 2), (0, 2)):
+ val[i + di, j + dj, k + dk] = a
+
+ val[1, 1, 1] = 1
+ func()
+
+ for i in range(3):
+ for j in range(3):
+ for k in range(3):
+ assert (val[i, j, k] == 1)
+
+
+@test_utils.test()
+def test_advanced_store_forwarding_continue_in_if():
+ val = ti.field(ti.i32)
+ ti.root.place(val)
+
+ @ti.kernel
+ def func(n: ti.i32):
+ # Launch just one thread
+ for _ in range(1):
+ a = 10
+ b = 0
+ for i in range(n):
+ b += a
+ a = i
+ if i == 5:
+ continue
+ a = 100
+ a = 1000
+ val[None] = a + b
+
+ func(1)
+ assert val[None] == 1010
+ func(5)
+ assert val[None] == 1410
+ func(6)
+ assert val[None] == 1510
+ func(7)
+ assert val[None] == 1515
+
+
+@test_utils.test()
+def test_advanced_store_elimination_in_loop():
+ val = ti.field(ti.i32)
+ ti.root.place(val)
+
+ @ti.kernel
+ def func():
+ # Launch just one thread
+ for _ in range(1):
+ a = 1
+ for i in range(5):
+ b = 1
+ val[None] = a + b
+ b = 0
+ a = 2
+ a = 3
+ a = 4
+ val[None] += a
+
+ func()
+ assert val[None] == 8
+
+
+@test_utils.test()
+def test_parallel_assignment():
+ mat = ti.field(ti.i32, shape=(3, 4))
+
+ @ti.kernel
+ def func():
+ c = 0
+ for i in ti.static(range(4)):
+ mat[0, c], mat[1, c], mat[2, c] = 1, 2, 3
+ c += 1
+
+ func()
+ for i in range(3):
+ for j in range(4):
+ assert mat[i, j] == i + 1
diff --git a/tests/_python_orig/test_packed_size.py b/tests/_python_orig/test_packed_size.py
new file mode 100644
index 000000000..e52c0e5e9
--- /dev/null
+++ b/tests/_python_orig/test_packed_size.py
@@ -0,0 +1,10 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.packed, packed=True)
+def test_packed_size():
+ x = ti.field(ti.i32)
+ ti.root.dense(ti.i, 17).dense(ti.ijk, 129).place(x)
+ assert x.shape == (17 * 129, 129, 129)
+ assert x.snode.parent().parent()._cell_size_bytes == 4 * 129**3
diff --git a/tests/_python_orig/test_parallel_range_for.py b/tests/_python_orig/test_parallel_range_for.py
new file mode 100644
index 000000000..429a28092
--- /dev/null
+++ b/tests/_python_orig/test_parallel_range_for.py
@@ -0,0 +1,22 @@
+import taichi as ti
+from tests import test_utils
+
+
+# such small block_dim will cause grid_dim too large for OpenGL...
+@test_utils.test(exclude=ti.opengl)
+def test_parallel_range_for():
+ n = 1024 * 1024
+ val = ti.field(ti.i32, shape=(n))
+
+ @ti.kernel
+ def fill():
+ ti.parallelize(8)
+ ti.block_dim(8)
+ for i in range(n):
+ val[i] = i
+
+ fill()
+ # To speed up
+ val_np = val.to_numpy()
+ for i in range(n):
+ assert val_np[i] == i
diff --git a/tests/_python_orig/test_pow.py b/tests/_python_orig/test_pow.py
new file mode 100644
index 000000000..8ee43894a
--- /dev/null
+++ b/tests/_python_orig/test_pow.py
@@ -0,0 +1,48 @@
+import taichi as ti
+from tests import test_utils
+
+
+def _test_pow_f(dt):
+ z = ti.field(dt, shape=())
+
+ @ti.kernel
+ def func(x: dt, y: dt):
+ z[None] = x**y
+
+ for x in [0.5, 1, 1.5, 2, 6.66]:
+ for y in [-2, -1, -0.3, 0, 0.5, 1, 1.4, 2.6]:
+ func(x, y)
+ assert abs(z[None] / x**y - 1) < 0.00001
+
+
+def _test_pow_i(dt):
+ z = ti.field(dt, shape=())
+
+ @ti.kernel
+ def func(x: dt, y: ti.template()):
+ z[None] = x**y
+
+ for x in range(-5, 5):
+ for y in range(0, 4):
+ func(x, y)
+ assert z[None] == x**y
+
+
+@test_utils.test()
+def test_pow_f32():
+ _test_pow_f(ti.f32)
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_pow_f64():
+ _test_pow_f(ti.f64)
+
+
+@test_utils.test()
+def test_pow_i32():
+ _test_pow_i(ti.i32)
+
+
+@test_utils.test(require=ti.extension.data64)
+def test_pow_i64():
+ _test_pow_i(ti.i64)
diff --git a/tests/_python_orig/test_print.py b/tests/_python_orig/test_print.py
new file mode 100644
index 000000000..3e62d1919
--- /dev/null
+++ b/tests/_python_orig/test_print.py
@@ -0,0 +1,158 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+# Not really testable..
+# Just making sure it does not crash
+# Metal doesn't support print() or 64-bit data
+# While OpenGL does support print, but not 64-bit data
+@pytest.mark.parametrize('dt', [ti.i32, ti.f32, ti.i64, ti.f64])
+@test_utils.test(exclude=[ti.metal, ti.opengl, ti.vulkan])
+def test_print(dt):
+ @ti.kernel
+ def func():
+ print(ti.cast(1234.5, dt))
+
+ func()
+ # Discussion: https://github.com/taichi-dev/taichi/issues/1063#issuecomment-636421904
+ # Synchronize to prevent cross-test failure of print:
+ ti.sync()
+
+
+# TODO: As described by @k-ye above, what we want to ensure
+# is that, the content shows on console is *correct*.
+@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan
+def test_multi_print():
+ @ti.kernel
+ def func(x: ti.i32, y: ti.f32):
+ print(x, 1234.5, y)
+
+ func(666, 233.3)
+ ti.sync()
+
+
+@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan
+def test_print_string():
+ @ti.kernel
+ def func(x: ti.i32, y: ti.f32):
+ # make sure `%` doesn't break vprintf:
+ print('hello, world! %s %d %f', 233, y)
+ print('cool', x, 'well', y)
+
+ func(666, 233.3)
+ ti.sync()
+
+
+@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan
+def test_print_matrix():
+ x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
+ y = ti.Vector.field(3, dtype=ti.f32, shape=3)
+
+ @ti.kernel
+ def func(k: ti.f32):
+ x[None][0, 0] = -1.0
+ y[2] += 1.0
+ print('hello', x[None], 'world!')
+ print(y[2] * k, x[None] / k, y[2])
+
+ func(233.3)
+ ti.sync()
+
+
+@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan
+def test_print_sep_end():
+ @ti.kernel
+ def func():
+ # hello 42 world!
+ print('hello', 42, 'world!')
+ # hello 42 Taichi 233 world!
+ print('hello', 42, 'Tai', end='')
+ print('chi', 233, 'world!')
+ # hello42world!
+ print('hello', 42, 'world!', sep='')
+ # ' ' (with no newline)
+ print(' ', end='')
+ # 'helloaswd42qwer'
+ print(' ', 42, sep='aswd', end='qwer')
+
+ func()
+ ti.sync()
+
+
+@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan
+def test_print_multiple_threads():
+ x = ti.field(dtype=ti.f32, shape=(128, ))
+
+ @ti.kernel
+ def func(k: ti.f32):
+ for i in x:
+ x[i] = i * k
+ print('x[', i, ']=', x[i])
+
+ func(0.1)
+ ti.sync()
+ func(10.0)
+ ti.sync()
+
+
+@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan
+def test_print_list():
+ x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3))
+ y = ti.Vector.field(3, dtype=ti.f32, shape=())
+
+ @ti.kernel
+ def func(k: ti.f32):
+ w = [k, x.shape]
+ print(w + [y.n]) # [233.3, [2, 3], 3]
+ print(x.shape) # [2, 3]
+ print(y.shape) # []
+ z = (1, )
+ print([1, k**2, k + 1]) # [1, 233.3, 234.3]
+ print(z) # [1]
+ print([y[None], z]) # [[0, 0, 0], [1]]
+ print([]) # []
+
+ func(233.3)
+ ti.sync()
+
+
+@test_utils.test(arch=ti.cpu)
+def test_python_scope_print_field():
+ x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
+ y = ti.Vector.field(3, dtype=ti.f32, shape=3)
+ z = ti.field(dtype=ti.f32, shape=3)
+
+ print(x)
+ print(y)
+ print(z)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_print_string_format():
+ @ti.kernel
+ def func(k: ti.f32):
+ print(123)
+ print("{} abc".format(123))
+ print("{} {} {}".format(1, 2, 3))
+ print("{} {name} {value}".format(k, name=999, value=123))
+ name = 123.4
+ value = 456.7
+ print("{} {name} {value}".format(k, name=name, value=value))
+
+ func(233.3)
+ ti.sync()
+
+
+@test_utils.test(arch=ti.cpu)
+def test_print_fstring():
+ def foo1(x):
+ return x + 1
+
+ @ti.kernel
+ def func(i: ti.i32, f: ti.f32):
+ print(f'qwe {foo1(1)} {foo1(2) * 2 - 1} {i} {f} {4} {True} {1.23}')
+
+ func(123, 4.56)
+ ti.sync()
diff --git a/tests/_python_orig/test_ptr_assign.py b/tests/_python_orig/test_ptr_assign.py
new file mode 100644
index 000000000..63e5a1e7d
--- /dev/null
+++ b/tests/_python_orig/test_ptr_assign.py
@@ -0,0 +1,107 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_ptr_scalar():
+ a = ti.field(dtype=ti.f32, shape=())
+
+ @ti.kernel
+ def func(t: ti.f32):
+ b = ti.static(a)
+ c = ti.static(b)
+ b[None] = b[None] * t
+ c[None] = a[None] + t
+
+ for x, y in zip(range(-5, 5), range(-4, 4)):
+ a[None] = x
+ func(y)
+ assert a[None] == x * y + y
+
+
+@test_utils.test()
+def test_ptr_matrix():
+ a = ti.Matrix.field(2, 2, dtype=ti.f32, shape=())
+
+ @ti.kernel
+ def func(t: ti.f32):
+ a[None] = [[2, 3], [4, 5]]
+ b = ti.static(a)
+ b[None][1, 0] = t
+
+ for x in range(-5, 5):
+ func(x)
+ assert a[None][1, 0] == x
+
+
+@test_utils.test()
+def test_ptr_field():
+ a = ti.field(dtype=ti.f32, shape=(3, 4))
+
+ @ti.kernel
+ def func(t: ti.f32):
+ b = ti.static(a)
+ b[1, 3] = b[1, 2] * t
+ b[2, 0] = b[2, 1] + t
+
+ for x, y in zip(range(-5, 5), range(-4, 4)):
+ a[1, 2] = x
+ a[2, 1] = x
+ func(y)
+ assert a[1, 3] == x * y
+ assert a[2, 0] == x + y
+
+
+@test_utils.test()
+def test_pythonish_tuple_assign():
+ a = ti.field(dtype=ti.f32, shape=())
+ b = ti.field(dtype=ti.f32, shape=())
+
+ @ti.kernel
+ def func(x: ti.f32, y: ti.f32):
+ u, v = ti.static(b, a)
+ u[None] = x
+ v[None] = y
+
+ for x, y in zip(range(-5, 5), range(-4, 4)):
+ func(x, y)
+ assert a[None] == y
+ assert b[None] == x
+
+
+@test_utils.test()
+def test_ptr_func():
+ a = ti.field(dtype=ti.f32, shape=())
+
+ @ti.func
+ def add2numbers(x, y):
+ return x + y
+
+ @ti.kernel
+ def func():
+ add = ti.static(add2numbers)
+ a[None] = add(2, 3)
+
+ func()
+ assert a[None] == 5.0
+
+
+@test_utils.test()
+def test_ptr_class_func():
+ @ti.data_oriented
+ class MyClass:
+ def __init__(self):
+ self.a = ti.field(dtype=ti.f32, shape=())
+
+ @ti.func
+ def add2numbers(self, x, y):
+ return x + y
+
+ @ti.kernel
+ def func(self):
+ a, add = ti.static(self.a, self.add2numbers)
+ a[None] = add(2, 3)
+
+ obj = MyClass()
+ obj.func()
+ assert obj.a[None] == 5.0
diff --git a/tests/_python_orig/test_random.py b/tests/_python_orig/test_random.py
new file mode 100644
index 000000000..fd609b1cb
--- /dev/null
+++ b/tests/_python_orig/test_random.py
@@ -0,0 +1,170 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_random_float():
+ for precision in [ti.f32, ti.f64]:
+ ti.init()
+ n = 1024
+ x = ti.field(ti.f32, shape=(n, n))
+
+ @ti.kernel
+ def fill():
+ for i in range(n):
+ for j in range(n):
+ x[i, j] = ti.random(precision)
+
+ fill()
+ X = x.to_numpy()
+ for i in range(1, 4):
+ assert (X**i).mean() == test_utils.approx(1 / (i + 1), rel=1e-2)
+
+
+@test_utils.test()
+def test_random_int():
+ for precision in [ti.i32, ti.i64]:
+ ti.init()
+ n = 1024
+ x = ti.field(ti.f32, shape=(n, n))
+
+ @ti.kernel
+ def fill():
+ for i in range(n):
+ for j in range(n):
+ v = ti.random(precision)
+ if precision == ti.i32:
+ x[i, j] = (float(v) + float(2**31)) / float(2**32)
+ else:
+ x[i, j] = (float(v) + float(2**63)) / float(2**64)
+
+ fill()
+ X = x.to_numpy()
+ for i in range(1, 4):
+ assert (X**i).mean() == test_utils.approx(1 / (i + 1), rel=1e-2)
+
+
+@test_utils.test()
+def test_random_independent_product():
+ n = 1024
+ x = ti.field(ti.f32, shape=n * n)
+
+ @ti.kernel
+ def fill():
+ for i in range(n * n):
+ a = ti.random()
+ b = ti.random()
+ x[i] = a * b
+
+ fill()
+ X = x.to_numpy()
+ for i in range(4):
+ assert X.mean() == test_utils.approx(1 / 4, rel=1e-2)
+
+
+@test_utils.test()
+def test_random_2d_dist():
+ n = 8192
+
+ x = ti.Vector.field(2, dtype=ti.f32, shape=n)
+
+ @ti.kernel
+ def gen():
+ for i in range(n):
+ x[i] = ti.Vector([ti.random(), ti.random()])
+
+ gen()
+
+ X = x.to_numpy()
+ counters = [0 for _ in range(4)]
+ for i in range(n):
+ c = int(X[i, 0] < 0.5) * 2 + int(X[i, 1] < 0.5)
+ counters[c] += 1
+
+ for c in range(4):
+ assert counters[c] / n == test_utils.approx(1 / 4, rel=0.2)
+
+
+@test_utils.test()
+def test_random_seed_per_launch():
+ n = 10
+ x = ti.field(ti.f32, shape=n)
+
+ @ti.kernel
+ def gen(i: ti.i32):
+ x[i] = ti.random()
+
+ count = 0
+ gen(0)
+ for i in range(1, n):
+ gen(i)
+ count += 1 if x[i] == x[i - 1] else 0
+
+ assert count <= n * 0.15
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.metal])
+def test_random_seed_per_program():
+ import numpy as np
+ n = 10
+ result = []
+ for s in [0, 1]:
+ ti.init(random_seed=s)
+ x = ti.field(ti.f32, shape=n)
+
+ @ti.kernel
+ def gen():
+ for i in x:
+ x[i] = ti.random()
+
+ gen()
+ result.append(x.to_numpy())
+ ti.reset()
+
+ assert not np.allclose(result[0], result[1])
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_random_f64():
+ '''
+ Tests the granularity of float64 random numbers.
+ See https://github.com/taichi-dev/taichi/issues/2251 for an explanation.
+ '''
+ import numpy as np
+ n = int(2**23)
+ x = ti.field(ti.f64, shape=n)
+
+ @ti.kernel
+ def foo():
+ for i in x:
+ x[i] = ti.random(dtype=ti.f64)
+
+ foo()
+ frac, _ = np.modf(x.to_numpy() * 4294967296)
+ assert np.max(frac) > 0
+
+
+@test_utils.test()
+def test_randn():
+ '''
+ Tests the generation of Gaussian random numbers.
+ '''
+ for precision in [ti.f32, ti.f64]:
+ ti.init()
+ n = 1024
+ x = ti.field(ti.f32, shape=(n, n))
+
+ @ti.kernel
+ def fill():
+ for i in range(n):
+ for j in range(n):
+ x[i, j] = ti.randn(precision)
+
+ fill()
+ X = x.to_numpy()
+
+ # https://en.wikipedia.org/wiki/Normal_distribution#Moments
+ moments = [0.0, 1.0, 0.0, 3.0]
+ for i in range(4):
+ assert (X**(i + 1)).mean() == test_utils.approx(moments[i],
+ abs=3e-2)
diff --git a/tests/_python_orig/test_reduction.py b/tests/_python_orig/test_reduction.py
new file mode 100644
index 000000000..434a1dc58
--- /dev/null
+++ b/tests/_python_orig/test_reduction.py
@@ -0,0 +1,146 @@
+import numpy as np
+import pytest
+from pytest import approx
+
+import taichi as ti
+from tests import test_utils
+
+OP_ADD = 0
+OP_MIN = 1
+OP_MAX = 2
+OP_AND = 3
+OP_OR = 4
+OP_XOR = 5
+
+ti_ops = {
+ OP_ADD: ti.atomic_add,
+ OP_MIN: ti.atomic_min,
+ OP_MAX: ti.atomic_max,
+ OP_AND: ti.atomic_and,
+ OP_OR: ti.atomic_or,
+ OP_XOR: ti.atomic_xor
+}
+
+np_ops = {
+ OP_ADD: np.sum,
+ OP_MIN: lambda a: a.min(),
+ OP_MAX: lambda a: a.max(),
+ OP_AND: np.bitwise_and.reduce,
+ OP_OR: np.bitwise_or.reduce,
+ OP_XOR: np.bitwise_xor.reduce
+}
+
+
+def _test_reduction_single(dtype, criterion, op):
+ N = 1024 * 1024
+ if (ti.lang.impl.current_cfg().arch == ti.opengl or
+ ti.lang.impl.current_cfg().arch == ti.vulkan) and dtype == ti.f32:
+ # OpenGL/Vulkan are not capable of such large number in its float32...
+ N = 1024 * 16
+
+ a = ti.field(dtype, shape=N)
+ tot = ti.field(dtype, shape=())
+
+ if dtype in [ti.f32, ti.f64]:
+
+ @ti.kernel
+ def fill():
+ for i in a:
+ a[i] = i + 0.5
+ else:
+
+ @ti.kernel
+ def fill():
+ for i in a:
+ a[i] = i + 1
+
+ ti_op = ti_ops[op]
+
+ @ti.kernel
+ def reduce():
+ for i in a:
+ ti_op(tot[None], a[i])
+
+ @ti.kernel
+ def reduce_tmp() -> dtype:
+ s = ti.zero(tot[None]) if op == OP_ADD or op == OP_XOR else a[0]
+ for i in a:
+ ti_op(s, a[i])
+ return s
+
+ fill()
+ tot[None] = 0 if op in [OP_ADD, OP_XOR] else a[0]
+ reduce()
+ tot2 = reduce_tmp()
+
+ np_arr = a.to_numpy()
+ ground_truth = np_ops[op](np_arr)
+
+ assert criterion(tot[None], ground_truth)
+ assert criterion(tot2, ground_truth)
+
+
+@pytest.mark.parametrize('op', [OP_ADD, OP_MIN, OP_MAX, OP_AND, OP_OR, OP_XOR])
+@test_utils.test()
+def test_reduction_single_i32(op):
+ _test_reduction_single(ti.i32, lambda x, y: x % 2**32 == y % 2**32, op)
+
+
+@pytest.mark.parametrize('op', [OP_ADD])
+@test_utils.test(exclude=ti.opengl)
+def test_reduction_single_u32(op):
+ _test_reduction_single(ti.u32, lambda x, y: x % 2**32 == y % 2**32, op)
+
+
+@pytest.mark.parametrize('op', [OP_ADD, OP_MIN, OP_MAX])
+@test_utils.test()
+def test_reduction_single_f32(op):
+ _test_reduction_single(ti.f32, lambda x, y: x == approx(y, 3e-4), op)
+
+
+@pytest.mark.parametrize('op', [OP_ADD])
+@test_utils.test(require=ti.extension.data64)
+def test_reduction_single_i64(op):
+ _test_reduction_single(ti.i64, lambda x, y: x % 2**64 == y % 2**64, op)
+
+
+@pytest.mark.parametrize('op', [OP_ADD])
+@test_utils.test(exclude=ti.opengl, require=ti.extension.data64)
+def test_reduction_single_u64(op):
+ _test_reduction_single(ti.u64, lambda x, y: x % 2**64 == y % 2**64, op)
+
+
+@pytest.mark.parametrize('op', [OP_ADD])
+@test_utils.test(require=ti.extension.data64)
+def test_reduction_single_f64(op):
+ _test_reduction_single(ti.f64, lambda x, y: x == approx(y, 1e-12), op)
+
+
+@test_utils.test()
+def test_reduction_different_scale():
+ @ti.kernel
+ def func(n: ti.template()) -> ti.i32:
+ x = 0
+ for i in range(n):
+ ti.atomic_add(x, 1)
+ return x
+
+ # 10 and 60 since OpenGL TLS stride size = 32
+ # 1024 and 100000 since OpenGL max threads per group ~= 1792
+ for n in [1, 10, 60, 1024, 100000]:
+ assert n == func(n)
+
+
+@test_utils.test()
+def test_reduction_any_arr():
+ @ti.kernel
+ def reduce(a: ti.any_arr()) -> ti.i32:
+ s = 0
+ for i in a:
+ ti.atomic_add(s, a[i])
+ ti.atomic_sub(s, 2)
+ return s
+
+ n = 1024
+ x = np.ones(n, dtype=np.int32)
+ assert reduce(x) == -n
diff --git a/tests/_python_orig/test_rescale.py b/tests/_python_orig/test_rescale.py
new file mode 100644
index 000000000..abf95da4e
--- /dev/null
+++ b/tests/_python_orig/test_rescale.py
@@ -0,0 +1,36 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_rescale():
+ a = ti.field(ti.f32)
+ b = ti.field(ti.f32)
+ ti.root.dense(ti.ij, 4).dense(ti.ij, 4).place(a)
+ ti.root.dense(ti.ij, 4).place(b)
+
+ @ti.kernel
+ def set_b():
+ for I in ti.grouped(a):
+ Ib = ti.rescale_index(a, b, I)
+ b[Ib] += 1.0
+
+ @ti.kernel
+ def set_a():
+ for I in ti.grouped(b):
+ Ia = ti.rescale_index(b, a, I)
+ a[Ia] = 1.0
+
+ set_a()
+ set_b()
+
+ for i in range(0, 4):
+ for j in range(0, 4):
+ assert b[i, j] == 16
+
+ for i in range(0, 16):
+ for j in range(0, 16):
+ if i % 4 == 0 and j % 4 == 0:
+ assert a[i, j] == 1
+ else:
+ assert a[i, j] == 0
diff --git a/tests/_python_orig/test_return.py b/tests/_python_orig/test_return.py
new file mode 100644
index 000000000..30bcc273d
--- /dev/null
+++ b/tests/_python_orig/test_return.py
@@ -0,0 +1,149 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_return_without_type_hint():
+ @ti.kernel
+ def kernel():
+ return 1
+
+ with pytest.raises(ti.TaichiCompilationError):
+ kernel()
+
+
+def test_const_func_ret():
+ ti.init()
+
+ @ti.kernel
+ def func1() -> ti.f32:
+ return 3
+
+ @ti.kernel
+ def func2() -> ti.i32:
+ return 3.3 # return type mismatch, will be auto-casted into ti.i32
+
+ assert func1() == test_utils.approx(3)
+ assert func2() == 3
+
+
+@test_utils.test()
+def _test_binary_func_ret(dt1, dt2, dt3, castor):
+ @ti.kernel
+ def func(a: dt1, b: dt2) -> dt3:
+ return a * b
+
+ if ti.types.is_integral(dt1):
+ xs = list(range(4))
+ else:
+ xs = [0.2, 0.4, 0.8, 1.0]
+
+ if ti.types.is_integral(dt2):
+ ys = list(range(4))
+ else:
+ ys = [0.2, 0.4, 0.8, 1.0]
+
+ for x, y in zip(xs, ys):
+ assert func(x, y) == test_utils.approx(castor(x * y))
+
+
+def test_binary_func_ret():
+ _test_binary_func_ret(ti.i32, ti.f32, ti.f32, float)
+ _test_binary_func_ret(ti.f32, ti.i32, ti.f32, float)
+ _test_binary_func_ret(ti.i32, ti.f32, ti.i32, int)
+ _test_binary_func_ret(ti.f32, ti.i32, ti.i32, int)
+
+
+@test_utils.test()
+def test_return_in_static_if():
+ @ti.kernel
+ def foo(a: ti.template()) -> ti.i32:
+ if ti.static(a == 1):
+ return 1
+ elif ti.static(a == 2):
+ return 2
+ return 3
+
+ assert foo(1) == 1
+ assert foo(2) == 2
+ assert foo(123) == 3
+
+
+@test_utils.test()
+def test_func_multiple_return():
+ @ti.func
+ def safe_sqrt(a):
+ if a > 0:
+ return ti.sqrt(a)
+ else:
+ return 0.0
+
+ @ti.kernel
+ def kern(a: float):
+ print(safe_sqrt(a))
+
+ with pytest.raises(
+ ti.TaichiCompilationError,
+ match='Return inside non-static if/for is not supported'):
+ kern(-233)
+
+
+@test_utils.test()
+def test_return_inside_static_for():
+ @ti.kernel
+ def foo() -> ti.i32:
+ a = 0
+ for i in ti.static(range(10)):
+ a += i * i
+ if ti.static(i == 8):
+ return a
+
+ assert foo() == 204
+
+
+@test_utils.test()
+def test_return_inside_non_static_for():
+ with pytest.raises(
+ ti.TaichiCompilationError,
+ match='Return inside non-static if/for is not supported'):
+
+ @ti.kernel
+ def foo() -> ti.i32:
+ for i in range(10):
+ return i
+
+ foo()
+
+
+@test_utils.test()
+def test_kernel_no_return():
+ with pytest.raises(
+ ti.TaichiSyntaxError,
+ match=
+ "Kernel has a return type but does not have a return statement"):
+
+ @ti.kernel
+ def foo() -> ti.i32:
+ pass
+
+ foo()
+
+
+@test_utils.test()
+def test_func_no_return():
+ with pytest.raises(
+ ti.TaichiCompilationError,
+ match=
+ "Function has a return type but does not have a return statement"):
+
+ @ti.func
+ def bar() -> ti.i32:
+ pass
+
+ @ti.kernel
+ def foo() -> ti.i32:
+ return bar()
+
+ foo()
diff --git a/tests/_python_orig/test_runtime.py b/tests/_python_orig/test_runtime.py
new file mode 100644
index 000000000..a397e0c60
--- /dev/null
+++ b/tests/_python_orig/test_runtime.py
@@ -0,0 +1,163 @@
+import copy
+import os
+import sys
+from contextlib import contextmanager
+
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@contextmanager
+def patch_os_environ_helper(custom_environ: dict, excludes: dict):
+    """
+    Temporarily patch os.environ for testing.
+    Originally created by @rexwangcc in test_cli.py
+    @archibate tweaked this method to be an os.environ patcher.
+
+    The patched environ will be:
+    custom_environ + (os.environ - excludes - custom_environ).
+
+    I.e.:
+
+    1. custom_environ could override os.environ.
+    2. os.environ keys matching excludes will not be included.
+
+    :parameter custom_environ:
+        Specify the base environment of patch, these values must
+        be included.
+
+    :parameter excludes:
+        When copying from os.environ, specify keys to be excluded.
+    """
+    environ = {}
+    for key in os.environ.keys():
+        if key not in excludes:
+            environ[key] = os.environ[key]
+    for key in custom_environ.keys():
+        environ[key] = custom_environ[key]
+    try:
+        cached_environ = os.environ
+        os.environ = environ  # bug fix: was `custom_environ`, which discarded the merged dict built above
+        yield os.environ
+    finally:
+        os.environ = cached_environ
+
+
+TF = [True, False]
+init_args = {
+ # 'key': [default, choices],
+ 'log_level': ['info', ['error', 'warn', 'info', 'debug', 'trace']],
+ 'gdb_trigger': [False, TF],
+ 'advanced_optimization': [True, TF],
+ 'debug': [False, TF],
+ 'print_ir': [False, TF],
+ 'verbose': [True, TF],
+ 'fast_math': [True, TF],
+ 'async_mode': [False, TF],
+ 'flatten_if': [False, TF],
+ 'simplify_before_lower_access': [True, TF],
+ 'simplify_after_lower_access': [True, TF],
+ 'kernel_profiler': [False, TF],
+ 'check_out_of_bound': [False, TF],
+ 'print_accessor_ir': [False, TF],
+ 'print_evaluator_ir': [False, TF],
+ 'print_struct_llvm_ir': [False, TF],
+ 'print_kernel_llvm_ir': [False, TF],
+ 'print_kernel_llvm_ir_optimized': [False, TF],
+ # FIXME: figure out why these two failed test:
+ #'device_memory_fraction': [0.0, [0.5, 1, 0]],
+ #'device_memory_GB': [1.0, [0.5, 1, 1.5, 2]],
+}
+
+env_configs = ['TI_' + key.upper() for key in init_args.keys()]
+
+special_init_cfgs = [
+ 'log_level',
+ 'gdb_trigger',
+]
+
+
+@pytest.mark.parametrize('key,values', init_args.items())
+def test_init_arg(key, values):
+ default, values = values
+
+ # helper function:
+ def test_arg(key, value, kwargs={}):
+ if key in special_init_cfgs:
+ spec_cfg = ti.init(_test_mode=True, **kwargs)
+ cfg = spec_cfg
+ else:
+ ti.init(**kwargs)
+ cfg = ti.lang.impl.current_cfg()
+ assert getattr(cfg, key) == value
+
+ with patch_os_environ_helper({}, excludes=env_configs):
+ # test if default value is correct:
+ test_arg(key, default)
+
+ # test if specified in argument:
+ for value in values:
+ kwargs = {key: value}
+ test_arg(key, value, kwargs)
+
+ # test if specified in environment:
+ env_key = 'TI_' + key.upper()
+ for value in values:
+ env_value = str(int(value) if isinstance(value, bool) else value)
+ environ = {env_key: env_value}
+ with patch_os_environ_helper(environ, excludes=env_configs):
+ test_arg(key, value)
+
+
+@pytest.mark.parametrize('arch', test_utils.expected_archs())
+def test_init_arch(arch):
+ with patch_os_environ_helper({}, excludes=['TI_ARCH']):
+ ti.init(arch=arch)
+ assert ti.lang.impl.current_cfg().arch == arch
+ with patch_os_environ_helper({'TI_ARCH': ti._lib.core.arch_name(arch)},
+ excludes=['TI_ARCH']):
+ ti.init(arch=ti.cc)
+ assert ti.lang.impl.current_cfg().arch == arch
+
+
+def test_init_bad_arg():
+ with pytest.raises(KeyError):
+ ti.init(_test_mode=True, debug=True, foo_bar=233)
+
+
+def test_init_require_version():
+ ti_core = ti._lib.utils.import_ti_core()
+ require_version = '{}.{}.{}'.format(ti_core.get_version_major(),
+ ti_core.get_version_minor(),
+ ti_core.get_version_patch())
+ ti.init(_test_mode=True, debug=True, require_version=require_version)
+
+
+def test_init_bad_require_version():
+ with pytest.raises(Exception):
+ ti_core = ti._lib.utils.import_ti_core()
+ bad_require_version = '{}.{}.{}'.format(
+ ti_core.get_version_major(), ti_core.get_version_minor(),
+ ti_core.get_version_patch() + 1)
+ ti.init(_test_mode=True,
+ debug=True,
+ require_version=bad_require_version)
+
+
+@pytest.mark.parametrize(
+ 'level', [ti.DEBUG, ti.TRACE, ti.INFO, ti.WARN, ti.ERROR, ti.CRITICAL])
+@test_utils.test()
+def test_supported_log_levels(level):
+ spec_cfg = ti.init(_test_mode=True, log_level=level)
+ assert spec_cfg.log_level == level
+
+
+@pytest.mark.parametrize(
+    'level', [ti.DEBUG, ti.TRACE, ti.INFO, ti.WARN, ti.ERROR, ti.CRITICAL])
+@test_utils.test()
+def test_set_logging_level(level):  # renamed: duplicated `test_supported_log_levels` shadowed the test above
+    spec_cfg = ti.init(_test_mode=True)
+    ti.set_logging_level(level)
+    assert ti._logging.is_logging_effective(level)
diff --git a/tests/_python_orig/test_scalar_op.py b/tests/_python_orig/test_scalar_op.py
new file mode 100644
index 000000000..1be760630
--- /dev/null
+++ b/tests/_python_orig/test_scalar_op.py
@@ -0,0 +1,192 @@
+import operator as ops
+
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+binary_func_table = [
+ (ops.add, ) * 2,
+ (ops.sub, ) * 2,
+ (ops.mul, ) * 2,
+ (ops.truediv, ) * 2,
+ (ops.floordiv, ) * 2,
+ (ops.mod, ) * 2,
+ (ops.pow, ) * 2,
+ (ops.and_, ) * 2,
+ (ops.or_, ) * 2,
+ (ops.xor, ) * 2,
+ (ops.eq, ) * 2,
+ (ops.ne, ) * 2,
+ (ops.lt, ) * 2,
+ (ops.le, ) * 2,
+ (ops.gt, ) * 2,
+ (ops.ge, ) * 2,
+ (ti.max, np.maximum),
+ (ti.min, np.minimum),
+ (ti.atan2, np.arctan2),
+]
+
+unary_func_table = [
+ (ops.neg, ) * 2,
+ (ops.invert, ) * 2,
+ (ti.lang.ops.logical_not, np.logical_not),
+ (ti.lang.ops.abs, np.abs),
+ (ti.exp, np.exp),
+ (ti.log, np.log),
+ (ti.sin, np.sin),
+ (ti.cos, np.cos),
+ (ti.tan, np.tan),
+ (ti.asin, np.arcsin),
+ (ti.acos, np.arccos),
+ (ti.tanh, np.tanh),
+ (ti.round, np.round),
+ (ti.floor, np.floor),
+ (ti.ceil, np.ceil),
+]
+
+
+@pytest.mark.parametrize('ti_func,np_func', binary_func_table)
+def test_python_scope_vector_binary(ti_func, np_func):
+ ti.init()
+ x = ti.Vector([2, 3])
+ y = ti.Vector([5, 4])
+
+ result = ti_func(x, y).to_numpy()
+ if ti_func in [ops.eq, ops.ne, ops.lt, ops.le, ops.gt, ops.ge]:
+ result = result.astype(bool)
+ expected = np_func(x.to_numpy(), y.to_numpy())
+ assert test_utils.allclose(result, expected)
+
+
+@pytest.mark.parametrize('ti_func,np_func', unary_func_table)
+def test_python_scope_vector_unary(ti_func, np_func):
+ ti.init()
+ x = ti.Vector([2, 3] if ti_func in
+ [ops.invert, ti.lang.ops.logical_not] else [0.2, 0.3])
+
+ result = ti_func(x).to_numpy()
+ if ti_func in [ti.lang.ops.logical_not]:
+ result = result.astype(bool)
+ expected = np_func(x.to_numpy())
+ assert test_utils.allclose(result, expected)
+
+
+def test_python_scope_matmul():
+ ti.init()
+ a = np.array([[1, 2], [3, 4]])
+ b = np.array([[5, 6], [7, 8]])
+ x = ti.Vector(a)
+ y = ti.Vector(b)
+
+ result = (x @ y).to_numpy()
+ expected = a @ b
+ assert test_utils.allclose(result, expected)
+
+
+def test_python_scope_linalg():
+    ti.init()
+    a = np.array([3, 4, -2])
+    b = np.array([-5, 0, 6])
+    x = ti.Vector(a)
+    y = ti.Vector(b)
+
+    assert test_utils.allclose(x.dot(y), np.dot(a, b))
+    assert test_utils.allclose(x.norm(), np.sqrt(np.dot(a, a)))
+    assert test_utils.allclose(x.normalized(), a / np.sqrt(np.dot(a, a)))
+    assert x.any() == 1  # NOTE(review): assertion expects 1, but old comment claimed -1 for True (Taichi IR convention) — verify intended python-scope value
+    assert y.all() == 0
+
+
+@test_utils.test(arch=[ti.x64, ti.cuda, ti.metal])
+def test_16_min_max():
+ @ti.kernel
+ def min_u16(a: ti.u16, b: ti.u16) -> ti.u16:
+ return ti.min(a, b)
+
+ @ti.kernel
+ def min_i16(a: ti.i16, b: ti.i16) -> ti.i16:
+ return ti.min(a, b)
+
+ @ti.kernel
+ def max_u16(a: ti.u16, b: ti.u16) -> ti.u16:
+ return ti.max(a, b)
+
+ @ti.kernel
+ def max_i16(a: ti.i16, b: ti.i16) -> ti.i16:
+ return ti.max(a, b)
+
+ a, b = 4, 2
+ assert min_u16(a, b) == min(a, b)
+ assert min_i16(a, b) == min(a, b)
+ assert max_u16(a, b) == max(a, b)
+ assert max_i16(a, b) == max(a, b)
+
+
+@test_utils.test(exclude=[ti.opengl, ti.cc])
+def test_32_min_max():
+ @ti.kernel
+ def min_u32(a: ti.u32, b: ti.u32) -> ti.u32:
+ return ti.min(a, b)
+
+ @ti.kernel
+ def min_i32(a: ti.i32, b: ti.i32) -> ti.i32:
+ return ti.min(a, b)
+
+ @ti.kernel
+ def max_u32(a: ti.u32, b: ti.u32) -> ti.u32:
+ return ti.max(a, b)
+
+ @ti.kernel
+ def max_i32(a: ti.i32, b: ti.i32) -> ti.i32:
+ return ti.max(a, b)
+
+ a, b = 4, 2
+ assert min_u32(a, b) == min(a, b)
+ assert min_i32(a, b) == min(a, b)
+ assert max_u32(a, b) == max(a, b)
+ assert max_i32(a, b) == max(a, b)
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_64_min_max():
+ @ti.kernel
+ def min_u64(a: ti.u64, b: ti.u64) -> ti.u64:
+ return ti.min(a, b)
+
+ @ti.kernel
+ def min_i64(a: ti.i64, b: ti.i64) -> ti.i64:
+ return ti.min(a, b)
+
+ @ti.kernel
+ def max_u64(a: ti.u64, b: ti.u64) -> ti.u64:
+ return ti.max(a, b)
+
+ @ti.kernel
+ def max_i64(a: ti.i64, b: ti.i64) -> ti.i64:
+ return ti.max(a, b)
+
+ a, b = 4, 2
+ assert min_u64(a, b) == min(a, b)
+ assert min_i64(a, b) == min(a, b)
+ assert max_u64(a, b) == max(a, b)
+ assert max_i64(a, b) == max(a, b)
+
+
+@test_utils.test()
+def test_min_max_vector_starred():
+ @ti.kernel
+ def min_starred() -> ti.i32:
+ a = ti.Vector([1, 2, 3])
+ b = ti.Vector([4, 5, 6])
+ return ti.min(*a, *b)
+
+ @ti.kernel
+ def max_starred() -> ti.i32:
+ a = ti.Vector([1, 2, 3])
+ b = ti.Vector([4, 5, 6])
+ return ti.max(*a, *b)
+
+ assert min_starred() == 1
+ assert max_starred() == 6
diff --git a/tests/_python_orig/test_scope_errors.py b/tests/_python_orig/test_scope_errors.py
new file mode 100644
index 000000000..ce6ef9231
--- /dev/null
+++ b/tests/_python_orig/test_scope_errors.py
@@ -0,0 +1,54 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_if():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ if True:
+ a = 0
+ else:
+ a = 1
+ print(a)
+
+ with pytest.raises(Exception):
+ func()
+
+
+@test_utils.test()
+def test_for():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(10):
+ a = i
+ print(a)
+
+ with pytest.raises(Exception):
+ func()
+
+
+@test_utils.test()
+def test_while():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ while True:
+ a = 0
+ print(a)
+
+ with pytest.raises(Exception):
+ func()
diff --git a/tests/_python_orig/test_serial_execution.py b/tests/_python_orig/test_serial_execution.py
new file mode 100644
index 000000000..5acd81502
--- /dev/null
+++ b/tests/_python_orig/test_serial_execution.py
@@ -0,0 +1,38 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cpu, cpu_max_num_threads=1)
+def test_serial_range_for():
+ n = 1024 * 32
+ s = ti.field(dtype=ti.i32, shape=n)
+ counter = ti.field(dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def fill_range():
+ counter[None] = 0
+ for i in range(n):
+ s[ti.atomic_add(counter[None], 1)] = i
+
+ fill_range()
+
+ for i in range(n):
+ assert s[i] == i
+
+
+@test_utils.test(arch=ti.cpu, cpu_max_num_threads=1)
+def test_serial_struct_for():
+ n = 1024 * 32
+ s = ti.field(dtype=ti.i32, shape=n)
+ counter = ti.field(dtype=ti.i32, shape=())
+
+ @ti.kernel
+ def fill_struct():
+ counter[None] = 0
+ for i in s:
+ s[ti.atomic_add(counter[None], 1)] = i
+
+ fill_struct()
+
+ for i in range(n):
+ assert s[i] == i
diff --git a/tests/_python_orig/test_sfg.py b/tests/_python_orig/test_sfg.py
new file mode 100644
index 000000000..edee8e287
--- /dev/null
+++ b/tests/_python_orig/test_sfg.py
@@ -0,0 +1,124 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=[ti.extension.async_mode, ti.extension.sparse],
+ async_mode=True)
+def test_remove_clear_list_from_fused_serial():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+ z = ti.field(ti.i32, shape=())
+
+ n = 32
+ ti.root.pointer(ti.i, n).dense(ti.i, 1).place(x)
+ ti.root.pointer(ti.i, n).dense(ti.i, 1).place(y)
+
+ @ti.kernel
+ def init_xy():
+ for i in range(n):
+ if i & 1:
+ x[i] = i
+ else:
+ y[i] = i
+
+ init_xy()
+ ti.sync()
+
+ stats = ti.tools.async_utils.get_kernel_stats()
+ stats.clear()
+
+ @ti.kernel
+ def inc(f: ti.template()):
+ for i in f:
+ f[i] += 1
+
+ @ti.kernel
+ def serial_z():
+ z[None] = 40
+ z[None] += 2
+
+ inc(x)
+ inc(y)
+ serial_z()
+ inc(x)
+ inc(y)
+ ti.sync()
+
+ counters = stats.get_counters()
+ # each of x and y has two listgens: root -> pointer -> dense
+ assert int(counters['launched_tasks_list_gen']) == 4
+ # clear list tasks have been fused into serial_z
+ assert int(counters['launched_tasks_serial']) == 1
+
+ xs = x.to_numpy()
+ ys = y.to_numpy()
+ for i in range(n):
+ if i & 1:
+ assert xs[i] == i + 2
+ assert ys[i] == 0
+ else:
+ assert ys[i] == i + 2
+ assert xs[i] == 0
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_sfg_dead_store_elimination():
+ n = 32
+
+ x = ti.field(dtype=float, shape=n, needs_grad=True)
+ total_energy = ti.field(dtype=float, shape=(), needs_grad=True)
+ unused = ti.field(dtype=float, shape=())
+
+ @ti.kernel
+ def gather():
+ for i in x:
+ e = x[i]**2
+ total_energy[None] += e
+
+ @ti.kernel
+ def scatter():
+ for i in x:
+ unused[None] += x[i]
+
+ xnp = np.arange(n, dtype=np.float32)
+ x.from_numpy(xnp)
+ ti.sync()
+
+ stats = ti.tools.async_utils.get_kernel_stats()
+ stats.clear()
+
+ for _ in range(5):
+ with ti.Tape(total_energy):
+ gather()
+ scatter()
+
+ ti.sync()
+ counters = stats.get_counters()
+
+ # gather() should be DSE'ed
+ assert counters['sfg_dse_tasks'] > 0
+
+ x_grad = x.grad.to_numpy()
+ for i in range(n):
+ assert test_utils.approx(x_grad[i]) == 2.0 * i
+
+
+@test_utils.test(require=ti.extension.async_mode, async_mode=True)
+def test_global_tmp_value_state():
+ # https://github.com/taichi-dev/taichi/issues/2024
+ n = 10
+ x = ti.field(ti.f32, shape=(n, ))
+
+ @ti.kernel
+ def compute_mean_of_boundary_edges() -> ti.i32:
+ total = 0.0
+ for i in range(n):
+ total += x[i] + x[i] * x[i]
+ result = total / ti.cast(n, ti.i32)
+ return result
+
+ x.from_numpy(np.arange(0, n, dtype=np.float32))
+ mean = compute_mean_of_boundary_edges()
+ assert test_utils.approx(mean) == 33
diff --git a/tests/_python_orig/test_snode_layout_inspection.py b/tests/_python_orig/test_snode_layout_inspection.py
new file mode 100644
index 000000000..ced2699a3
--- /dev/null
+++ b/tests/_python_orig/test_snode_layout_inspection.py
@@ -0,0 +1,58 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cpu)
+def test_primitives():
+ x = ti.field(dtype=ti.i16)
+ y = ti.field(dtype=ti.f32)
+ z = ti.field(dtype=ti.f64)
+
+ p = ti.field(dtype=ti.f32)
+ q = ti.field(dtype=ti.f32)
+ r = ti.field(dtype=ti.f64)
+
+ n1 = ti.root.dense(ti.i, 32)
+ n1.place(x)
+
+ n2 = ti.root.dense(ti.i, 32)
+ n2.place(y, z)
+
+ n3 = ti.root.dense(ti.i, 1)
+ n3.place(p, q, r)
+
+ assert n1._cell_size_bytes == 2
+ assert n2._cell_size_bytes in [12, 16]
+ assert n3._cell_size_bytes == 16
+
+ assert n1._offset_bytes_in_parent_cell == 0
+ assert n2._offset_bytes_in_parent_cell == 2 * 32
+ assert n3._offset_bytes_in_parent_cell in [
+ 2 * 32 + 12 * 32, 2 * 32 + 16 * 32
+ ]
+
+ assert x.snode._offset_bytes_in_parent_cell == 0
+ assert y.snode._offset_bytes_in_parent_cell == 0
+ assert z.snode._offset_bytes_in_parent_cell in [4, 8]
+ assert p.snode._offset_bytes_in_parent_cell == 0
+ assert q.snode._offset_bytes_in_parent_cell == 4
+ assert r.snode._offset_bytes_in_parent_cell == 8
+
+
+@test_utils.test(arch=ti.cpu)
+def test_bit_struct():
+ cit = ti.types.quantized_types.quant.int(16, False)
+ x = ti.field(dtype=cit)
+ y = ti.field(dtype=ti.types.quantized_types.type_factory.custom_float(
+ significand_type=cit))
+ z = ti.field(dtype=ti.f32)
+
+ n1 = ti.root.dense(ti.i, 32)
+ n1.bit_struct(num_bits=32).place(x)
+
+ n2 = ti.root.dense(ti.i, 4)
+ n2.bit_struct(num_bits=32).place(y)
+ n2.place(z)
+
+ assert n1._cell_size_bytes == 4
+ assert n2._cell_size_bytes == 8
diff --git a/tests/_python_orig/test_sort.py b/tests/_python_orig/test_sort.py
new file mode 100644
index 000000000..1eb3647e0
--- /dev/null
+++ b/tests/_python_orig/test_sort.py
@@ -0,0 +1,33 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.cc])
+def test_sort():
+ def test_sort_for_dtype(dtype, N):
+ keys = ti.field(dtype, N)
+ values = ti.field(dtype, N)
+
+ @ti.kernel
+ def fill():
+ for i in keys:
+ keys[i] = ti.random() * N
+ values[i] = keys[i]
+
+ fill()
+ ti._kernels.parallel_sort(keys, values)
+
+ keys_host = keys.to_numpy()
+ values_host = values.to_numpy()
+
+ for i in range(N):
+ if i < N - 1:
+ assert keys_host[i] <= keys_host[i + 1]
+ assert keys_host[i] == values_host[i]
+
+ test_sort_for_dtype(ti.i32, 1)
+ test_sort_for_dtype(ti.i32, 256)
+ test_sort_for_dtype(ti.i32, 100001)
+ test_sort_for_dtype(ti.f32, 1)
+ test_sort_for_dtype(ti.f32, 256)
+ test_sort_for_dtype(ti.f32, 100001)
diff --git a/tests/_python_orig/test_sparse_activate.py b/tests/_python_orig/test_sparse_activate.py
new file mode 100644
index 000000000..3c5d49820
--- /dev/null
+++ b/tests/_python_orig/test_sparse_activate.py
@@ -0,0 +1,57 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 16
+
+ ptr = ti.root.pointer(ti.i, n)
+ ptr.dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ s[None] = 0
+
+ @ti.kernel
+ def activate():
+ ti.activate(ptr, 1)
+ ti.activate(ptr, 32)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ activate()
+ func()
+ assert s[None] == 32
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_non_dfs_snode_order():
+ x = ti.field(dtype=ti.i32)
+ y = ti.field(dtype=ti.i32)
+
+ grid1 = ti.root.dense(ti.i, 1)
+ grid2 = ti.root.dense(ti.i, 1)
+ ptr = grid1.pointer(ti.i, 1)
+ ptr.place(x)
+ grid2.place(y)
+ '''
+ This SNode tree has node ids that do not follow DFS order:
+ S0root
+ S1dense
+ S3pointer
+ S4place
+ S2dense
+ S5place
+ '''
+ @ti.kernel
+ def foo():
+ ti.activate(ptr, [0])
+
+ foo() # Just make sure it doesn't crash
+ ti.sync()
diff --git a/tests/_python_orig/test_sparse_basics.py b/tests/_python_orig/test_sparse_basics.py
new file mode 100644
index 000000000..118f3bfbb
--- /dev/null
+++ b/tests/_python_orig/test_sparse_basics.py
@@ -0,0 +1,106 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.pointer(ti.i, n).dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ x[0] = 1
+ x[127] = 1
+ x[256] = 1
+
+ func()
+ assert s[None] == 256
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer_is_active():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.pointer(ti.i, n).dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def func():
+ for i in range(n * n):
+ s[None] += ti.is_active(x.parent().parent(), i)
+
+ x[0] = 1
+ x[127] = 1
+ x[256] = 1
+
+ func()
+ assert s[None] == 256
+
+
+def _test_pointer2():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.pointer(ti.i, n).pointer(ti.i, n).dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ x[0] = 1
+ x[127] = 1
+ x[254] = 1
+ x[256 + n * n] = 1
+
+ x[257 + n * n] = 1
+ x[257 + n * n * 2] = 1
+ x[257 + n * n * 5] = 1
+
+ func()
+ assert s[None] == 5 * n
+ print(x[257 + n * n * 7])
+ assert s[None] == 5 * n
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer2():
+ _test_pointer2()
+
+
+@test_utils.test(require=[ti.extension.sparse, ti.extension.packed],
+ packed=True)
+def test_pointer2_packed():
+ _test_pointer2()
+
+
+@pytest.mark.skip(reason='https://github.com/taichi-dev/taichi/issues/2520')
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer_direct_place():
+ x, y = ti.field(ti.i32), ti.field(ti.i32)
+
+ N = 1
+ ti.root.pointer(ti.i, N).place(x)
+ ti.root.pointer(ti.i, N).place(y)
+
+ @ti.kernel
+ def foo():
+ pass
+
+ foo()
diff --git a/tests/_python_orig/test_sparse_deactivate.py b/tests/_python_orig/test_sparse_deactivate.py
new file mode 100644
index 000000000..e2bce91b4
--- /dev/null
+++ b/tests/_python_orig/test_sparse_deactivate.py
@@ -0,0 +1,218 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32, shape=())
+
+ n = 16
+
+ ptr = ti.root.pointer(ti.i, n)
+ ptr.dense(ti.i, n).place(x)
+
+ s[None] = 0
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ x[0] = 1
+ x[19] = 1
+ func()
+ assert s[None] == 32
+
+ @ti.kernel
+ def deactivate():
+ ti.deactivate(ptr, 0)
+
+ deactivate()
+ s[None] = 0
+ func()
+ assert s[None] == 16
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer1():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 16
+
+ ptr = ti.root.pointer(ti.i, n)
+ ptr.dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ s[None] = 0
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ x[0] = 1
+ x[19] = 1
+ x[20] = 1
+ x[45] = 1
+ func()
+ assert s[None] == 48
+
+ @ti.kernel
+ def deactivate():
+ ti.deactivate(ptr, 0)
+
+ deactivate()
+ s[None] = 0
+ func()
+ assert s[None] == 32
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer2():
+ x = ti.field(ti.f32)
+
+ n = 16
+
+ ptr = ti.root.pointer(ti.i, n)
+ ptr.dense(ti.i, n).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(n * n):
+ x[i] = 1.0
+
+ @ti.kernel
+ def set10():
+ x[10] = 10.0
+
+ @ti.kernel
+ def clear():
+ for i in ptr:
+ ti.deactivate(ptr, i)
+
+ func()
+ clear()
+
+ for i in range(n * n):
+ assert x[i] == 0.0
+
+ set10()
+
+ for i in range(n * n):
+ if i != 10:
+ assert x[i] == 0.0
+ else:
+ assert x[i] == 10.0
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer3():
+ x = ti.field(ti.f32)
+ x_temp = ti.field(ti.f32)
+
+ n = 16
+
+ ptr1 = ti.root.pointer(ti.ij, n)
+ ptr1.dense(ti.ij, n).place(x)
+ ptr2 = ti.root.pointer(ti.ij, n)
+ ptr2.dense(ti.ij, n).place(x_temp)
+
+ @ti.kernel
+ def fill():
+ for j in range(n * n):
+ for i in range(n * n):
+ x[i, j] = i + j
+
+ @ti.kernel
+ def fill2():
+ for i, j in x_temp:
+ if x_temp[i, j] < 100:
+ x[i, j] = x_temp[i, j]
+
+ @ti.kernel
+ def copy_to_temp():
+ for i, j in x:
+ x_temp[i, j] = x[i, j]
+
+ @ti.kernel
+ def copy_from_temp():
+ for i, j in x_temp:
+ x[i, j] = x_temp[i, j]
+
+ @ti.kernel
+ def clear():
+ for i, j in ptr1:
+ ti.deactivate(ptr1, [i, j])
+
+ @ti.kernel
+ def clear_temp():
+ for i, j in ptr2:
+ ti.deactivate(ptr2, [i, j])
+
+ fill()
+ copy_to_temp()
+ clear()
+ fill2()
+ clear_temp()
+
+ for itr in range(100):
+ copy_to_temp()
+ clear()
+ copy_from_temp()
+ clear_temp()
+
+ xn = x.to_numpy()
+ for j in range(n * n):
+ for i in range(n * n):
+ if i + j < 100:
+ assert xn[i, j] == i + j
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_dynamic():
+ x = ti.field(ti.i32)
+ s = ti.field(ti.i32)
+
+ n = 16
+
+ lst = ti.root.dense(ti.i, n).dynamic(ti.j, 4096)
+ lst.place(x)
+ ti.root.dense(ti.i, n).place(s)
+
+ @ti.kernel
+ def func(mul: ti.i32):
+ for i in range(n):
+ for j in range(i * i * mul):
+ ti.append(lst, i, j)
+
+ @ti.kernel
+ def fetch_length():
+ for i in range(n):
+ s[i] = ti.length(lst, i)
+
+ func(1)
+ fetch_length()
+ for i in range(n):
+ assert s[i] == i * i
+
+ @ti.kernel
+ def clear():
+ for i in range(n):
+ ti.deactivate(lst, [i])
+
+ func(2)
+ fetch_length()
+ for i in range(n):
+ assert s[i] == i * i * 3
+
+ clear()
+ fetch_length()
+ for i in range(n):
+ assert s[i] == 0
+
+ func(4)
+ fetch_length()
+ for i in range(n):
+ assert s[i] == i * i * 4
diff --git a/tests/_python_orig/test_sparse_linear_solver.py b/tests/_python_orig/test_sparse_linear_solver.py
new file mode 100644
index 000000000..a3a2a3057
--- /dev/null
+++ b/tests/_python_orig/test_sparse_linear_solver.py
@@ -0,0 +1,59 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+"""
+The symmetric positive definite matrix is created in matlab using the following script:
+ A = diag([1,2,3,4]);
+ OrthM = [1 0 1 0; -1 -2 0 1; 0 1 -1 0; 0, 1, 0 1];
+ U = orth(OrthM);
+ Aarray = U * A * U';
+ b = [1,2,3,4]';
+ res = inv(A) * b;
+"""
+Aarray = np.array([[
+ 2.73999501130921, 0.518002544441220, 0.745119303009342, 0.0508907745638859
+], [0.518002544441220, 1.45111665837647, 0.757997555750432, 0.290885785873098],
+ [
+ 0.745119303009342, 0.757997555750432, 2.96711176987733,
+ -0.518002544441220
+ ],
+ [
+ 0.0508907745638859, 0.290885785873098,
+ -0.518002544441220, 2.84177656043698
+ ]])
+
+res = np.array([
+ -0.0754984396447588, 0.469972700892492, 1.18527357933586, 1.57686870529319
+])
+
+
+@pytest.mark.parametrize("dtype", [ti.f32])
+@pytest.mark.parametrize("solver_type", ["LLT", "LDLT", "LU"])
+@pytest.mark.parametrize("ordering", ["AMD", "COLAMD"])
+@test_utils.test(arch=ti.cpu)
+def test_sparse_LLT_solver(dtype, solver_type, ordering):
+ n = 4
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ b = ti.field(ti.f32, shape=n)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(),
+ InputArray: ti.ext_arr(), b: ti.template()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += InputArray[i, j]
+ for i in range(n):
+ b[i] = i + 1
+
+ fill(Abuilder, Aarray, b)
+ A = Abuilder.build()
+ solver = ti.linalg.SparseSolver(dtype=dtype,
+ solver_type=solver_type,
+ ordering=ordering)
+ solver.analyze_pattern(A)
+ solver.factorize(A)
+ x = solver.solve(b)
+ for i in range(n):
+ assert x[i] == test_utils.approx(res[i])
diff --git a/tests/_python_orig/test_sparse_matrix.py b/tests/_python_orig/test_sparse_matrix.py
new file mode 100644
index 000000000..64576cefd
--- /dev/null
+++ b/tests/_python_orig/test_sparse_matrix.py
@@ -0,0 +1,232 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_builder_deprecated_anno():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.linalg.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ for i in range(n):
+ for j in range(n):
+ assert A[i, j] == i + j
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_builder():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ for i in range(n):
+ for j in range(n):
+ assert A[i, j] == i + j
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_shape():
+ n, m = 8, 9
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, m, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, m):
+ Abuilder[i, j] += i + j
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ assert A.shape() == (n, m)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_element_access():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i in range(n):
+ Abuilder[i, i] += i
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ for i in range(n):
+ assert A[i, i] == i
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_element_modify():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i in range(n):
+ Abuilder[i, i] += i
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ A[0, 0] = 1024.0
+ assert A[0, 0] == 1024.0
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_addition():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(),
+ Bbuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+ Bbuilder[i, j] += i - j
+
+ fill(Abuilder, Bbuilder)
+ A = Abuilder.build()
+ B = Bbuilder.build()
+ C = A + B
+ for i in range(n):
+ for j in range(n):
+ assert C[i, j] == 2 * i
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_subtraction():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(),
+ Bbuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+ Bbuilder[i, j] += i - j
+
+ fill(Abuilder, Bbuilder)
+ A = Abuilder.build()
+ B = Bbuilder.build()
+ C = A - B
+ for i in range(n):
+ for j in range(n):
+ assert C[i, j] == 2 * j
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_scalar_multiplication():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ B = A * 3.0
+ for i in range(n):
+ for j in range(n):
+ assert B[i, j] == 3 * (i + j)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_transpose():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+
+ fill(Abuilder)
+ A = Abuilder.build()
+ B = A.transpose()
+ for i in range(n):
+ for j in range(n):
+ assert B[i, j] == A[j, i]
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_elementwise_multiplication():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(),
+ Bbuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+ Bbuilder[i, j] += i - j
+
+ fill(Abuilder, Bbuilder)
+ A = Abuilder.build()
+ B = Bbuilder.build()
+ C = A * B
+ for i in range(n):
+ for j in range(n):
+ assert C[i, j] == (i + j) * (i - j)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_multiplication():
+ n = 2
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(),
+ Bbuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+ Bbuilder[i, j] += i - j
+
+ fill(Abuilder, Bbuilder)
+ A = Abuilder.build()
+ B = Bbuilder.build()
+ C = A @ B
+ assert C[0, 0] == 1.0
+ assert C[0, 1] == 0.0
+ assert C[1, 0] == 2.0
+ assert C[1, 1] == -1.0
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_nonsymmetric_multiplication():
+ n, k, m = 2, 3, 4
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, k, max_num_triplets=100)
+ Bbuilder = ti.linalg.SparseMatrixBuilder(k, m, max_num_triplets=100)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(),
+ Bbuilder: ti.types.sparse_matrix_builder()):
+ for i, j in ti.ndrange(n, k):
+ Abuilder[i, j] += i + j
+ for i, j in ti.ndrange(k, m):
+ Bbuilder[i, j] -= i + j
+
+ fill(Abuilder, Bbuilder)
+ A = Abuilder.build()
+ B = Bbuilder.build()
+ C = A @ B
+ GT = [[-5, -8, -11, -14], [-8, -14, -20, -26]]
+ for i in range(n):
+ for j in range(m):
+ assert C[i, j] == GT[i][j]
diff --git a/tests/_python_orig/test_sparse_multi_tree.py b/tests/_python_orig/test_sparse_multi_tree.py
new file mode 100644
index 000000000..89ebe6a3c
--- /dev/null
+++ b/tests/_python_orig/test_sparse_multi_tree.py
@@ -0,0 +1,29 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=[ti.cpu, ti.cuda])
+def test_pointer():
+ e = ti.Vector.field(2, dtype=int, shape=16)
+
+ e[0] = ti.Vector([0, 0])
+
+ a = ti.field(float, shape=512)
+ b = ti.field(dtype=float)
+ ti.root.pointer(ti.i, 32).dense(ti.i, 16).place(b)
+
+ @ti.kernel
+ def test():
+ for i in a:
+ a[i] = i
+ for i in a:
+ b[i] += a[i]
+
+ test()
+ ti.sync()
+
+ b_np = b.to_numpy()
+ for i in range(512):
+ assert (b_np[i] == i)
diff --git a/tests/_python_orig/test_sparse_parallel.py b/tests/_python_orig/test_sparse_parallel.py
new file mode 100644
index 000000000..1a797c15c
--- /dev/null
+++ b/tests/_python_orig/test_sparse_parallel.py
@@ -0,0 +1,79 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.pointer(ti.i, n).dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def activate():
+ for i in range(n):
+ x[i * n] = 0
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += 1
+
+ activate()
+ func()
+ assert s[None] == n * n
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_pointer2():
+ x = ti.field(ti.f32)
+ s = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.pointer(ti.i, n).dense(ti.i, n).place(x)
+ ti.root.place(s)
+
+ @ti.kernel
+ def activate():
+ for i in range(n * n):
+ x[i] = i
+
+ @ti.kernel
+ def func():
+ for i in x:
+ s[None] += i
+
+ activate()
+ func()
+ N = n * n
+ assert s[None] == N * (N - 1) / 2
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_nested_struct_fill_and_clear():
+ a = ti.field(dtype=ti.f32)
+ N = 512
+
+ ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+
+ @ti.kernel
+ def fill():
+ for i, j in ti.ndrange(N * 8, N * 8):
+ a[i, j] = 2.0
+
+ @ti.kernel
+ def clear():
+ for i, j in a.parent():
+ ti.deactivate(a.parent().parent(), [i, j])
+
+ def task():
+ fill()
+ clear()
+
+ for i in range(10):
+ task()
+ ti.sync()
diff --git a/tests/_python_orig/test_spmv.py b/tests/_python_orig/test_spmv.py
new file mode 100644
index 000000000..ee0c78b4f
--- /dev/null
+++ b/tests/_python_orig/test_spmv.py
@@ -0,0 +1,71 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_vector_multiplication1():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ b = ti.field(ti.f32, shape=n)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(), b: ti.template()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i
+
+ for i in range(n):
+ b[i] = 1.0
+
+ fill(Abuilder, b)
+ A = Abuilder.build()
+ x = A @ b
+ for i in range(n):
+ assert x[i] == 8 * i
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_vector_multiplication2():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ b = ti.field(ti.f32, shape=n)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(), b: ti.template()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i - j
+
+ for i in range(n):
+ b[i] = 1.0
+
+ fill(Abuilder, b)
+ A = Abuilder.build()
+
+ x = A @ b
+ import numpy as np
+ res = np.array([-28, -20, -12, -4, 4, 12, 20, 28])
+ for i in range(n):
+ assert x[i] == res[i]
+
+
+@test_utils.test(arch=ti.cpu)
+def test_sparse_matrix_vector_multiplication3():
+ n = 8
+ Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+ b = ti.field(ti.f32, shape=n)
+
+ @ti.kernel
+ def fill(Abuilder: ti.types.sparse_matrix_builder(), b: ti.template()):
+ for i, j in ti.ndrange(n, n):
+ Abuilder[i, j] += i + j
+
+ for i in range(n):
+ b[i] = 1.0
+
+ fill(Abuilder, b)
+ A = Abuilder.build()
+
+ x = A @ b
+ import numpy as np
+ res = np.array([28, 36, 44, 52, 60, 68, 76, 84])
+ for i in range(n):
+ assert x[i] == res[i]
diff --git a/tests/_python_orig/test_ssa.py b/tests/_python_orig/test_ssa.py
new file mode 100644
index 000000000..585af0439
--- /dev/null
+++ b/tests/_python_orig/test_ssa.py
@@ -0,0 +1,75 @@
+'''
+SSA violation edge-case regression test.
+1. Ensure working well when computation result is assigned to self.
+2. Prevent duplicate-evaluation on expression with side-effect like random.
+'''
+import math
+
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_matrix_self_assign():
+ a = ti.Vector.field(2, ti.f32, ())
+ b = ti.Matrix.field(2, 2, ti.f32, ())
+ c = ti.Vector.field(2, ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None] = a[None].normalized()
+ b[None] = b[None].transpose()
+ c[None] = ti.Vector([c[None][1], c[None][0]])
+
+ inv_sqrt2 = 1 / math.sqrt(2)
+
+ a[None] = [1, 1]
+ b[None] = [[1, 2], [3, 4]]
+ c[None] = [2, 3]
+ func()
+ assert a[None] == ti.Vector([inv_sqrt2, inv_sqrt2])
+ assert b[None] == ti.Matrix([[1, 3], [2, 4]])
+ assert c[None] == ti.Vector([3, 2])
+
+
+@test_utils.test()
+def test_random_vector_dup_eval():
+ a = ti.Vector.field(2, ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None] = ti.Vector([ti.random(), 1]).normalized()
+
+ for i in range(4):
+ func()
+ assert a[None].norm_sqr() == test_utils.approx(1)
+
+
+@test_utils.test()
+def test_func_argument_dup_eval():
+ @ti.func
+ def func(a, t):
+ return a * t - a
+
+ @ti.kernel
+ def kern(t: ti.f32) -> ti.f32:
+ return func(ti.random(), t)
+
+ for i in range(4):
+ assert kern(1.0) == 0.0
+
+
+@test_utils.test()
+def test_func_random_argument_dup_eval():
+ @ti.func
+ def func(a):
+ return ti.Vector([ti.cos(a), ti.sin(a)])
+
+ @ti.kernel
+ def kern() -> ti.f32:
+ return func(ti.random()).norm_sqr()
+
+ for i in range(4):
+ assert kern() == test_utils.approx(1.0, rel=5e-5)
diff --git a/tests/_python_orig/test_static.py b/tests/_python_orig/test_static.py
new file mode 100644
index 000000000..767cf31ab
--- /dev/null
+++ b/tests/_python_orig/test_static.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@pytest.mark.parametrize('val', [0, 1])
+@test_utils.test(ti.cpu)
+def test_static_if(val):
+ x = ti.field(ti.i32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def static():
+ if ti.static(val > 0.5):
+ x[0] = 1
+ else:
+ x[0] = 0
+
+ static()
+ assert x[0] == val
+
+
+@test_utils.test(ti.cpu)
+def test_static_if_error():
+ x = ti.field(ti.i32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def static(val: float):
+ if ti.static(val > 0.5):
+ x[0] = 1
+ else:
+ x[0] = 0
+
+ with pytest.raises(ti.TaichiCompilationError,
+ match='must be compile-time constants'):
+ static(42)
+
+
+@test_utils.test()
+def test_static_ndrange():
+ n = 3
+ x = ti.Matrix.field(n, n, dtype=ti.f32, shape=(n, n))
+
+ @ti.kernel
+ def fill():
+ w = [0, 1, 2]
+ for i, j in ti.static(ti.ndrange(3, 3)):
+ x[i, j][i, j] = w[i] + w[j] * 2
+
+ fill()
+ for i in range(3):
+ for j in range(3):
+ assert x[i, j][i, j] == i + j * 2
+
+
+@test_utils.test(ti.cpu)
+def test_static_break():
+ x = ti.field(ti.i32, 5)
+
+ @ti.kernel
+ def func():
+ for i in ti.static(range(5)):
+ x[i] = 1
+ if ti.static(i == 2):
+ break
+
+ func()
+
+ assert np.allclose(x.to_numpy(), np.array([1, 1, 1, 0, 0]))
+
+
+@test_utils.test(ti.cpu)
+def test_static_continue():
+ x = ti.field(ti.i32, 5)
+
+ @ti.kernel
+ def func():
+ for i in ti.static(range(5)):
+ if ti.static(i == 2):
+ continue
+ x[i] = 1
+
+ func()
+
+ assert np.allclose(x.to_numpy(), np.array([1, 1, 0, 1, 1]))
diff --git a/tests/_python_orig/test_stencils.py b/tests/_python_orig/test_stencils.py
new file mode 100644
index 000000000..3f34a6d5c
--- /dev/null
+++ b/tests/_python_orig/test_stencils.py
@@ -0,0 +1,25 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_simple():
+ # Note: access simplification does not work in this case. Maybe worth fixing.
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x, y)
+
+ @ti.kernel
+ def run():
+ for i in range(n - 1):
+ x[i] = 1
+ y[i + 1] = 2
+
+ run()
+
+ for i in range(n - 1):
+ assert x[i] == 1
+ assert y[i + 1] == 2
diff --git a/tests/_python_orig/test_stop_grad.py b/tests/_python_orig/test_stop_grad.py
new file mode 100644
index 000000000..450782b37
--- /dev/null
+++ b/tests/_python_orig/test_stop_grad.py
@@ -0,0 +1,86 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_normal_grad():
+ x = ti.field(ti.f32)
+ loss = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(loss)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ loss[None] += x[i]**2
+
+ for i in range(n):
+ x[i] = i
+
+ with ti.Tape(loss):
+ func()
+
+ for i in range(n):
+ assert x.grad[i] == i * 2
+
+
+@test_utils.test()
+def test_stop_grad():
+ x = ti.field(ti.f32)
+ loss = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(loss)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+ for i in range(n):
+ ti.stop_grad(x)
+ loss[None] += x[i]**2
+
+ for i in range(n):
+ x[i] = i
+
+ with ti.Tape(loss):
+ func()
+
+ for i in range(n):
+ assert x.grad[i] == 0
+
+
+@test_utils.test()
+def test_stop_grad2():
+ x = ti.field(ti.f32)
+ loss = ti.field(ti.f32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.place(loss)
+ ti.root.lazy_grad()
+
+ @ti.kernel
+ def func():
+        # Two loops, one with stop_grad and one without
+ for i in range(n):
+ ti.stop_grad(x)
+ loss[None] += x[i]**2
+ for i in range(n):
+ loss[None] += x[i]**2
+
+ for i in range(n):
+ x[i] = i
+
+ with ti.Tape(loss):
+ func()
+
+    # If without stop_grad, x.grad[i] = i * 4
+ for i in range(n):
+ assert x.grad[i] == i * 2
diff --git a/tests/_python_orig/test_struct.py b/tests/_python_orig/test_struct.py
new file mode 100644
index 000000000..4a1343045
--- /dev/null
+++ b/tests/_python_orig/test_struct.py
@@ -0,0 +1,80 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_linear():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.dense(ti.i, n).place(y)
+
+ for i in range(n):
+ x[i] = i
+ y[i] = i + 123
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i + 123
+
+
+def test_linear_repeated():
+ for i in range(10):
+ test_linear()
+
+
+@test_utils.test()
+def test_linear_nested():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n // 16).dense(ti.i, 16).place(x)
+ ti.root.dense(ti.i, n // 16).dense(ti.i, 16).place(y)
+
+ for i in range(n):
+ x[i] = i
+ y[i] = i + 123
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i + 123
+
+
+@test_utils.test()
+def test_linear_nested_aos():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n // 16).dense(ti.i, 16).place(x, y)
+
+ for i in range(n):
+ x[i] = i
+ y[i] = i + 123
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i + 123
+
+
+@test_utils.test(exclude=[ti.vulkan])
+def test_2d_nested():
+ x = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.ij, n // 16).dense(ti.ij, (32, 16)).place(x)
+
+ for i in range(n * 2):
+ for j in range(n):
+ x[i, j] = i + j * 10
+
+ for i in range(n * 2):
+ for j in range(n):
+ assert x[i, j] == i + j * 10
diff --git a/tests/_python_orig/test_struct_for.py b/tests/_python_orig/test_struct_for.py
new file mode 100644
index 000000000..95ce85ab2
--- /dev/null
+++ b/tests/_python_orig/test_struct_for.py
@@ -0,0 +1,315 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_singleton():
+ x = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def fill():
+ for I in ti.grouped(x):
+ x[I] = 3
+
+ fill()
+
+ assert x[None] == 3
+
+
+@test_utils.test()
+def test_singleton2():
+ x = ti.field(ti.i32)
+
+ ti.root.place(x)
+
+ @ti.kernel
+ def fill():
+ for I in ti.grouped(x):
+ x[I] = 3
+
+ fill()
+
+ assert x[None] == 3
+
+
+@test_utils.test()
+def test_linear():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n).place(x)
+ ti.root.dense(ti.i, n).place(y)
+
+ @ti.kernel
+ def fill():
+ for i in x:
+ x[i] = i
+ y[i] = i * 2
+
+ fill()
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i * 2
+
+
+@test_utils.test()
+def test_nested():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(x)
+ ti.root.dense(ti.i, n).place(y)
+
+ @ti.kernel
+ def fill():
+ for i in x:
+ x[i] = i
+ y[i] = i * 2
+
+ fill()
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i * 2
+
+
+@test_utils.test()
+def test_nested2():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 2048
+
+ ti.root.dense(ti.i, n // 512).dense(ti.i, 16).dense(ti.i,
+ 8).dense(ti.i,
+ 4).place(x)
+ ti.root.dense(ti.i, n).place(y)
+
+ @ti.kernel
+ def fill():
+ for i in x:
+ x[i] = i
+ y[i] = i * 2
+
+ fill()
+
+ for i in range(n):
+ assert x[i] == i
+ assert y[i] == i * 2
+
+
+@test_utils.test()
+def test_2d():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n, m = 32, 16
+
+ ti.root.dense(ti.ij, n).place(x, y)
+
+ @ti.kernel
+ def fill():
+ for i, j in x:
+ x[i, j] = i + j * 2
+
+ fill()
+
+ for i in range(n):
+ for j in range(m):
+ assert x[i, j] == i + j * 2
+
+
+@test_utils.test()
+def test_2d_non_POT():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32, shape=())
+
+ n, m = 13, 17
+
+ ti.root.dense(ti.ij, (n, m)).place(x)
+
+ @ti.kernel
+ def fill():
+ for i, j in x:
+ y[None] += i + j * j
+
+ fill()
+
+ tot = 0
+ for i in range(n):
+ for j in range(m):
+ tot += i + j * j
+ assert y[None] == tot
+
+
+@test_utils.test()
+def test_nested_2d():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 32
+
+ ti.root.dense(ti.ij, n // 4).dense(ti.ij, 4).place(x, y)
+
+ @ti.kernel
+ def fill():
+ for i, j in x:
+ x[i, j] = i + j * 2
+
+ fill()
+
+ for i in range(n):
+ for j in range(n):
+ assert x[i, j] == i + j * 2
+
+
+@test_utils.test()
+def test_nested_2d_more_nests():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32)
+
+ n = 64
+
+ ti.root.dense(ti.ij, n // 16).dense(ti.ij,
+ 2).dense(ti.ij,
+ 4).dense(ti.ij,
+ 2).place(x, y)
+
+ @ti.kernel
+ def fill():
+ for i, j in x:
+ x[i, j] = i + j * 2
+
+ fill()
+
+ for i in range(n):
+ for j in range(n):
+ assert x[i, j] == i + j * 2
+
+
+@test_utils.test()
+def test_linear_k():
+ x = ti.field(ti.i32)
+
+ n = 128
+
+ ti.root.dense(ti.k, n).place(x)
+
+ @ti.kernel
+ def fill():
+ for i in x:
+ x[i] = i
+
+ fill()
+
+ for i in range(n):
+ assert x[i] == i
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_struct_for_branching():
+ # Related issue: https://github.com/taichi-dev/taichi/issues/704
+ x = ti.field(dtype=ti.i32)
+ y = ti.field(dtype=ti.i32)
+ ti.root.pointer(ti.ij, 128 // 4).dense(ti.ij, 4).place(x, y)
+
+ @ti.kernel
+ def func1():
+ for i, j in x:
+ if x[i, j] & 2 == 2:
+ y[i, j] = 1
+
+ @ti.kernel
+ def func2():
+ for i, j in x:
+ if x[i, j] == 2 or x[i, j] == 4:
+ y[i, j] = 1
+
+ @ti.kernel
+ def func3():
+ for i, j in x:
+ if x[i, j] & 2 == 2 or x[i, j] & 4 == 4:
+ y[i, j] = 1
+
+ func1()
+ func2()
+ func3()
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_struct_for_pointer_block():
+ n = 16
+ block_size = 8
+
+ f = ti.field(dtype=ti.f32)
+
+ block = ti.root.pointer(ti.ijk, n // block_size)
+ block.dense(ti.ijk, block_size).place(f)
+
+ f[0, 2, 3] = 1
+
+ @ti.kernel
+ def count() -> int:
+ tot = 0
+ for I in ti.grouped(block):
+ tot += 1
+ return tot
+
+ assert count() == 1
+
+
+@test_utils.test(require=ti.extension.quant)
+def test_struct_for_quant():
+ n = 8
+
+ ci13 = ti.types.quantized_types.quant.int(13, True)
+ x = ti.field(dtype=ci13)
+
+ ti.root.dense(ti.i, n).bit_struct(num_bits=32).place(x)
+
+ @ti.kernel
+ def count() -> int:
+ tot = 0
+ for i in x:
+ tot += i
+ return tot
+
+ assert count() == 28
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_struct_for_continue():
+ # Related issue: https://github.com/taichi-dev/taichi/issues/3272
+ x = ti.field(dtype=ti.i32)
+ n = 4
+ ti.root.pointer(ti.i, n).dense(ti.i, n).place(x)
+
+ @ti.kernel
+ def init():
+ for i in range(n):
+ x[i * n + i] = 1
+
+ @ti.kernel
+ def struct_for_continue() -> ti.i32:
+ cnt = 0
+ for i in x:
+ if x[i]: continue
+ cnt += 1
+ return cnt
+
+ @ti.kernel
+ def range_for_continue() -> ti.i32:
+ cnt = 0
+ for i in range(n * n):
+ if x[i]: continue
+ cnt += 1
+ return cnt
+
+ init()
+ assert struct_for_continue() == n * (n - 1)
+ assert range_for_continue() == n * (n - 1)
diff --git a/tests/_python_orig/test_struct_for_dynamic.py b/tests/_python_orig/test_struct_for_dynamic.py
new file mode 100644
index 000000000..67595a6e0
--- /dev/null
+++ b/tests/_python_orig/test_struct_for_dynamic.py
@@ -0,0 +1,44 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(exclude=[ti.opengl, ti.cc, ti.vulkan])
+def test_dynamic():
+ x = ti.field(ti.i32)
+ y = ti.field(ti.i32, shape=())
+
+ n = 128
+
+ ti.root.dynamic(ti.i, n).place(x)
+
+ @ti.kernel
+ def count():
+ for i in x:
+ y[None] += 1
+
+ x[n // 3] = 1
+
+ count()
+
+ assert y[None] == n // 3 + 1
+
+
+@test_utils.test(exclude=[ti.opengl, ti.cc, ti.vulkan])
+def test_dense_dynamic():
+ n = 128
+
+ x = ti.field(ti.i32)
+
+ ti.root.dense(ti.i, n).dynamic(ti.j, n, 128).place(x)
+
+ @ti.kernel
+ def append():
+ for i in range(n):
+ for j in range(i):
+ ti.append(x.parent(), i, j * 2)
+
+ append()
+
+ for i in range(n):
+ for j in range(i):
+ assert x[i, j] == j * 2
diff --git a/tests/_python_orig/test_struct_for_intermediate.py b/tests/_python_orig/test_struct_for_intermediate.py
new file mode 100644
index 000000000..4971f20f1
--- /dev/null
+++ b/tests/_python_orig/test_struct_for_intermediate.py
@@ -0,0 +1,47 @@
+import taichi as ti
+from tests import test_utils
+
+
+def _test_nested():
+ x = ti.field(ti.i32)
+
+ p, q = 3, 7
+ n, m = 2, 4
+
+ ti.root.dense(ti.ij, (p, q)).dense(ti.ij, (n, m)).place(x)
+
+ @ti.kernel
+ def iterate():
+ for i, j in x.parent():
+ x[i, j] += 1
+
+ iterate()
+ for i in range(p):
+ for j in range(q):
+ assert x[i * n, j * m] == 1, (i, j)
+
+
+@test_utils.test(require=ti.extension.sparse,
+ demote_dense_struct_fors=False,
+ packed=False)
+def test_nested():
+ _test_nested()
+
+
+@test_utils.test(demote_dense_struct_fors=True, packed=False)
+def test_nested_demote():
+ _test_nested()
+
+
+@test_utils.test(require=[ti.extension.sparse, ti.extension.packed],
+ demote_dense_struct_fors=False,
+ packed=True)
+def test_nested_packed():
+ _test_nested()
+
+
+@test_utils.test(require=ti.extension.packed,
+ demote_dense_struct_fors=True,
+ packed=True)
+def test_nested_demote_packed():
+ _test_nested()
diff --git a/tests/_python_orig/test_struct_for_non_pot.py b/tests/_python_orig/test_struct_for_non_pot.py
new file mode 100644
index 000000000..c8eefbe3d
--- /dev/null
+++ b/tests/_python_orig/test_struct_for_non_pot.py
@@ -0,0 +1,68 @@
+import taichi as ti
+from tests import test_utils
+
+
+def _test_1d():
+ x = ti.field(ti.i32)
+ sum = ti.field(ti.i32)
+
+ n = 100
+
+ ti.root.dense(ti.k, n).place(x)
+ ti.root.place(sum)
+
+ @ti.kernel
+ def accumulate():
+ for i in x:
+ ti.atomic_add(sum[None], i)
+
+ accumulate()
+
+ for i in range(n):
+ assert sum[None] == 4950
+
+
+@test_utils.test()
+def test_1d():
+ _test_1d()
+
+
+@test_utils.test(require=ti.extension.packed, packed=True)
+def test_1d_packed():
+ _test_1d()
+
+
+def _test_2d():
+ x = ti.field(ti.i32)
+ sum = ti.field(ti.i32)
+
+ n = 100
+ m = 19
+
+ ti.root.dense(ti.k, n).dense(ti.i, m).place(x)
+ ti.root.place(sum)
+
+ @ti.kernel
+ def accumulate():
+ for i, k in x:
+ ti.atomic_add(sum[None], i + k * 2)
+
+ gt = 0
+ for k in range(n):
+ for i in range(m):
+ gt += i + k * 2
+
+ accumulate()
+
+ for i in range(n):
+ assert sum[None] == gt
+
+
+@test_utils.test()
+def test_2d():
+ _test_2d()
+
+
+@test_utils.test(require=ti.extension.packed, packed=True)
+def test_2d_packed():
+ _test_2d()
diff --git a/tests/_python_orig/test_svd.py b/tests/_python_orig/test_svd.py
new file mode 100644
index 000000000..988dc764a
--- /dev/null
+++ b/tests/_python_orig/test_svd.py
@@ -0,0 +1,90 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.data64, fast_math=False)
+def test_precision():
+ u = ti.field(ti.f64, shape=())
+ v = ti.field(ti.f64, shape=())
+ w = ti.field(ti.f64, shape=())
+
+ @ti.kernel
+ def forward():
+ v[None] = ti.sqrt(ti.cast(u[None] + 3.25, ti.f64))
+ w[None] = ti.cast(u[None] + 7, ti.f64) / ti.cast(u[None] + 3, ti.f64)
+
+ forward()
+ assert v[None]**2 == test_utils.approx(3.25, abs=1e-12)
+ assert w[None] * 3 == test_utils.approx(7, abs=1e-12)
+
+
+def mat_equal(A, B, tol=1e-6):
+ return np.max(np.abs(A - B)) < tol
+
+
+def _test_svd(dt, n):
+ print(
+ f'arch={ti.lang.impl.current_cfg().arch} default_fp={ti.lang.impl.current_cfg().default_fp} fast_math={ti.lang.impl.current_cfg().fast_math} dim={n}'
+ )
+ A = ti.Matrix.field(n, n, dtype=dt, shape=())
+ A_reconstructed = ti.Matrix.field(n, n, dtype=dt, shape=())
+ U = ti.Matrix.field(n, n, dtype=dt, shape=())
+ UtU = ti.Matrix.field(n, n, dtype=dt, shape=())
+ sigma = ti.Matrix.field(n, n, dtype=dt, shape=())
+ V = ti.Matrix.field(n, n, dtype=dt, shape=())
+ VtV = ti.Matrix.field(n, n, dtype=dt, shape=())
+
+ @ti.kernel
+ def run():
+ U[None], sigma[None], V[None] = ti.svd(A[None], dt)
+ UtU[None] = U[None].transpose() @ U[None]
+ VtV[None] = V[None].transpose() @ V[None]
+ A_reconstructed[None] = U[None] @ sigma[None] @ V[None].transpose()
+
+ if n == 3:
+ A[None] = [[1, 1, 3], [9, -3, 2], [-3, 4, 2]]
+ else:
+ A[None] = [[1, 1], [2, 3]]
+
+ run()
+
+ tol = 1e-5 if dt == ti.f32 else 1e-12
+
+ assert mat_equal(UtU.to_numpy(), np.eye(n), tol=tol)
+ assert mat_equal(VtV.to_numpy(), np.eye(n), tol=tol)
+ assert mat_equal(A_reconstructed.to_numpy(), A.to_numpy(), tol=tol)
+ for i in range(n):
+ for j in range(n):
+ if i != j:
+ assert sigma[None][i, j] == test_utils.approx(0)
+
+
+def test_svd():
+ for fp in [ti.f32, ti.f64]:
+ for d in [2, 3]:
+
+ @test_utils.test(
+ require=ti.extension.data64 if fp == ti.f64 else [],
+ default_fp=fp,
+ fast_math=False)
+ def wrapped():
+ _test_svd(fp, d)
+
+ wrapped()
+
+
+@test_utils.test()
+def test_transpose_no_loop():
+ A = ti.Matrix.field(3, 3, dtype=ti.f32, shape=())
+ U = ti.Matrix.field(3, 3, dtype=ti.f32, shape=())
+ sigma = ti.Matrix.field(3, 3, dtype=ti.f32, shape=())
+ V = ti.Matrix.field(3, 3, dtype=ti.f32, shape=())
+
+ @ti.kernel
+ def run():
+ U[None], sigma[None], V[None] = ti.svd(A[None])
+
+ run()
+ # As long as it passes compilation we are good
diff --git a/tests/_python_orig/test_sync.py b/tests/_python_orig/test_sync.py
new file mode 100644
index 000000000..feec84008
--- /dev/null
+++ b/tests/_python_orig/test_sync.py
@@ -0,0 +1,27 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_kernel_sync():
+ n = 128
+ x = ti.field(ti.i32, shape=(3, ))
+ y = ti.field(ti.i32, shape=(n, ))
+ # These [] calls are all on CPU, so no synchronization needed
+ x[0] = 42
+ assert x[0] == 42
+ x[1] = 233
+ x[2] = -1
+
+ @ti.kernel
+ def func():
+ for i in y:
+ y[i] = x[i % 3]
+
+ # Kernel *may* run on GPU
+ # Note that the previous kernel is a write, which didn't do a sync. But that
+ # should be fine -- we only need to sync the memory after GPU -> CPU.
+ func()
+ # These [] calls are on CPU. They should be smart enough to sync only once.
+ for i in range(n):
+ assert y[i] == x[i % 3]
diff --git a/tests/_python_orig/test_syntax_errors.py b/tests/_python_orig/test_syntax_errors.py
new file mode 100644
index 000000000..6ecc89809
--- /dev/null
+++ b/tests/_python_orig/test_syntax_errors.py
@@ -0,0 +1,330 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_try():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ try:
+ a = 0
+ except:
+ a = 1
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_for_else():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ for i in range(10):
+ pass
+ else:
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_while_else():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ while True:
+ pass
+ else:
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_raise():
+ @ti.kernel
+ def foo():
+ raise Exception()
+
+ with pytest.raises(ti.TaichiSyntaxError,
+ match='Unsupported node "Raise"') as e:
+ foo()
+
+
+@test_utils.test()
+def test_loop_var_range():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ i = 0
+ for i in range(10):
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_loop_var_struct():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ i = 0
+ for i in x:
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_loop_var_struct():
+ x = ti.field(ti.f32)
+
+ ti.root.dense(ti.i, 1).place(x)
+
+ @ti.kernel
+ def func():
+ j = 0
+ for i, j in x:
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_func_def_in_kernel():
+ @ti.kernel
+ def kernel():
+ @ti.func
+ def func():
+ return 1
+
+ print(func())
+
+ with pytest.raises(ti.TaichiCompilationError):
+ kernel()
+
+
+@test_utils.test()
+def test_func_def_in_func():
+ @ti.func
+ def func():
+ @ti.func
+ def func2():
+ return 1
+
+ return func2()
+
+ @ti.kernel
+ def kernel():
+ print(func())
+
+ with pytest.raises(ti.TaichiCompilationError):
+ kernel()
+
+
+@test_utils.test(arch=ti.cpu)
+def test_kernel_bad_argument_annotation():
+ with pytest.raises(ti.TaichiSyntaxError, match='annotation'):
+
+ @ti.kernel
+ def kernel(x: 'bar'):
+ print(x)
+
+
+@test_utils.test(arch=ti.cpu)
+def test_func_bad_argument_annotation():
+ with pytest.raises(ti.TaichiSyntaxError, match='annotation'):
+
+ @ti.func
+ def func(x: 'foo'):
+ print(x)
+
+
+@test_utils.test()
+def test_nested_static():
+ @ti.kernel
+ def func():
+ for i in ti.static(ti.static(range(1))):
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_nested_grouped():
+ @ti.kernel
+ def func():
+ for i in ti.grouped(ti.grouped(range(1))):
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_nested_ndrange():
+ @ti.kernel
+ def func():
+ for i in ti.ndrange(ti.ndrange(1)):
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_static_grouped_struct_for():
+ val = ti.field(ti.i32)
+
+ ti.root.dense(ti.ij, (1, 1)).place(val)
+
+ @ti.kernel
+ def test():
+ for I in ti.static(ti.grouped(val)):
+ pass
+
+ with pytest.raises(ti.TaichiCompilationError):
+ test()
+
+
+@test_utils.test()
+def test_is():
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = b is c
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_is_not():
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = b is not c
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_in():
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = b in c
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_not_in():
+ b = ti.field(ti.i32, shape=())
+ c = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ a = b not in c
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_expr_set():
+ @ti.kernel
+ def func():
+ x = {2, 4, 6}
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test()
+def test_redefining_template_args():
+ @ti.kernel
+ def foo(a: ti.template()):
+ a = 5
+
+ with pytest.raises(
+ ti.TaichiSyntaxError,
+ match=
+ "Variable 'a' cannot be assigned. Maybe it is not a Taichi object?"
+ ):
+ foo(1)
+
+
+@test_utils.test()
+def test_break_in_outermost_for():
+ @ti.kernel
+ def foo():
+ for i in range(10):
+ break
+
+ with pytest.raises(ti.TaichiSyntaxError,
+ match="Cannot break in the outermost loop"):
+ foo()
+
+
+@test_utils.test()
+def test_funcdef_in_kernel():
+ @ti.kernel
+ def foo():
+ def bar():
+ pass
+
+ with pytest.raises(
+ ti.TaichiSyntaxError,
+ match="Function definition is not allowed in 'ti.kernel'"):
+ foo()
+
+
+@test_utils.test()
+def test_funcdef_in_func():
+ @ti.func
+ def foo():
+ def bar():
+ pass
+
+ @ti.kernel
+ def baz():
+ foo()
+
+ with pytest.raises(
+ ti.TaichiSyntaxError,
+ match="Function definition is not allowed in 'ti.func'"):
+ baz()
diff --git a/tests/_python_orig/test_tensor_dimensionality.py b/tests/_python_orig/test_tensor_dimensionality.py
new file mode 100644
index 000000000..5eb079a09
--- /dev/null
+++ b/tests/_python_orig/test_tensor_dimensionality.py
@@ -0,0 +1,32 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def _test_dimensionality(d):
+ x = ti.Vector.field(2, dtype=ti.i32, shape=(2, ) * d)
+
+ @ti.kernel
+ def fill():
+ for I in ti.grouped(x):
+ x[I] += ti.Vector([I.sum(), I[0]])
+
+ for i in range(2**d):
+ indices = []
+ for j in range(d):
+ indices.append(i // (2**j) % 2)
+ x.__getitem__(tuple(indices))[0] = sum(indices) * 2
+ fill()
+ # FIXME(yuanming-hu): snode_writer needs 9 arguments actually..
+ if ti.lang.impl.current_cfg().arch == ti.cc and d >= 8:
+ return
+ for i in range(2**d):
+ indices = []
+ for j in range(d):
+ indices.append(i // (2**j) % 2)
+ assert x.__getitem__(tuple(indices))[0] == sum(indices) * 3
+
+
+def test_dimensionality():
+ for i in range(2, ti._lib.core.get_max_num_indices() + 1):
+ _test_dimensionality(i)
diff --git a/tests/_python_orig/test_tensor_reflection.py b/tests/_python_orig/test_tensor_reflection.py
new file mode 100644
index 000000000..ae662418b
--- /dev/null
+++ b/tests/_python_orig/test_tensor_reflection.py
@@ -0,0 +1,115 @@
+import pytest
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_POT():
+ val = ti.field(ti.i32)
+
+ n = 4
+ m = 8
+ p = 16
+
+ ti.root.dense(ti.i, n).dense(ti.j, m).dense(ti.k, p).place(val)
+
+ assert val.shape == (n, m, p)
+ assert val.dtype == ti.i32
+
+
+@test_utils.test()
+def test_non_POT():
+ val = ti.field(ti.i32)
+
+ n = 3
+ m = 7
+ p = 11
+
+ blk1 = ti.root.dense(ti.i, n)
+ blk2 = blk1.dense(ti.j, m)
+ blk3 = blk2.dense(ti.k, p)
+ blk3.place(val)
+
+ assert val.shape == (n, m, p)
+ assert val.dtype == ti.i32
+
+
+@test_utils.test()
+def test_unordered():
+ val = ti.field(ti.i32)
+
+ n = 3
+ m = 7
+ p = 11
+
+ blk1 = ti.root.dense(ti.k, n)
+ blk2 = blk1.dense(ti.i, m)
+ blk3 = blk2.dense(ti.j, p)
+ blk3.place(val)
+
+ assert val.dtype == ti.i32
+ assert val.shape == (m, p, n)
+ assert val.snode.parent(0) == val.snode
+ assert val.snode.parent() == blk3
+ assert val.snode.parent(1) == blk3
+ assert val.snode.parent(2) == blk2
+ assert val.snode.parent(3) == blk1
+ assert val.snode.parent(4) == ti.root
+
+ assert val.snode in blk3._get_children()
+ assert blk3 in blk2._get_children()
+ assert blk2 in blk1._get_children()
+ impl.get_runtime().materialize_root_fb(False)
+ assert blk1 in ti.FieldsBuilder._finalized_roots()[0]._get_children()
+
+ expected_str = f'ti.root => dense {[n]} => dense {[m, n]}' \
+ f' => dense {[m, p, n]} => place {[m, p, n]}'
+ assert str(val.snode) == expected_str
+
+
+@test_utils.test()
+def test_unordered_matrix():
+ val = ti.Matrix.field(3, 2, ti.i32)
+
+ n = 3
+ m = 7
+ p = 11
+
+ blk1 = ti.root.dense(ti.k, n)
+ blk2 = blk1.dense(ti.i, m)
+ blk3 = blk2.dense(ti.j, p)
+ blk3.place(val)
+
+ assert val.shape == (m, p, n)
+ assert val.dtype == ti.i32
+ assert val.snode.parent(0) == val.snode
+ assert val.snode.parent() == blk3
+ assert val.snode.parent(1) == blk3
+ assert val.snode.parent(2) == blk2
+ assert val.snode.parent(3) == blk1
+ assert val.snode.parent(4) == ti.root
+ assert val.snode._path_from_root() == [
+ ti.root, blk1, blk2, blk3, val.snode
+ ]
+
+
+@test_utils.test()
+def test_parent_exceeded():
+ val = ti.field(ti.f32)
+
+ m = 7
+ n = 3
+
+ blk1 = ti.root.dense(ti.i, m)
+ blk2 = blk1.dense(ti.j, n)
+ blk2.place(val)
+
+ assert val.snode.parent() == blk2
+ assert val.snode.parent(2) == blk1
+ assert val.snode.parent(3) == ti.root
+ assert val.snode.parent(4) == None
+ assert val.snode.parent(42) == None
+
+ assert ti.root.parent() == None
diff --git a/tests/_python_orig/test_test.py b/tests/_python_orig/test_test.py
new file mode 100644
index 000000000..c5444904c
--- /dev/null
+++ b/tests/_python_orig/test_test.py
@@ -0,0 +1,141 @@
+'''
+This file tests if Taichi's testing utilities are functional.
+
+TODO: Skips these tests after all tests are using @ti.test
+'''
+import os
+
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+### `ti.test`
+
+
+@test_utils.test()
+def test_all_archs():
+ assert ti.lang.impl.current_cfg().arch in test_utils.expected_archs()
+
+
+@test_utils.test(arch=ti.cpu)
+def test_arch_cpu():
+ assert ti.lang.impl.current_cfg().arch in [ti.cpu]
+
+
+@test_utils.test(arch=[ti.cpu])
+def test_arch_list_cpu():
+ assert ti.lang.impl.current_cfg().arch in [ti.cpu]
+
+
+@test_utils.test(exclude=ti.cpu)
+def test_exclude_cpu():
+ assert ti.lang.impl.current_cfg().arch not in [ti.cpu]
+
+
+@test_utils.test(exclude=[ti.cpu])
+def test_exclude_list_cpu():
+ assert ti.lang.impl.current_cfg().arch not in [ti.cpu]
+
+
+@test_utils.test(arch=ti.opengl)
+def test_arch_opengl():
+ assert ti.lang.impl.current_cfg().arch in [ti.opengl]
+
+
+@test_utils.test(arch=[ti.cpu, ti.opengl, ti.metal])
+def test_multiple_archs():
+ assert ti.lang.impl.current_cfg().arch in [ti.cpu, ti.opengl, ti.metal]
+
+
+@test_utils.test(arch=ti.cpu, debug=True, advanced_optimization=False)
+def test_init_args():
+ assert ti.lang.impl.current_cfg().debug == True
+ assert ti.lang.impl.current_cfg().advanced_optimization == False
+
+
+@test_utils.test(require=ti.extension.sparse)
+def test_require_extensions_1():
+ assert ti.lang.impl.current_cfg().arch in [ti.cpu, ti.cuda, ti.metal]
+
+
+@test_utils.test(arch=[ti.cpu, ti.opengl], require=ti.extension.sparse)
+def test_require_extensions_2():
+ assert ti.lang.impl.current_cfg().arch in [ti.cpu]
+
+
+@test_utils.test(arch=[ti.cpu, ti.opengl],
+ require=[ti.extension.sparse, ti.extension.bls])
+def test_require_extensions_2():
+ assert ti.lang.impl.current_cfg().arch in [ti.cuda]
+
+
+### `test_utils.approx` and `test_utils.allclose`
+
+
+@pytest.mark.parametrize('x', [0.1, 3])
+@pytest.mark.parametrize(
+ 'allclose', [test_utils.allclose, lambda x, y: x == test_utils.approx(y)])
+@test_utils.test()
+def test_allclose_rel(x, allclose):
+ rel = test_utils.get_rel_eps()
+ assert not allclose(x + x * rel * 3.0, x)
+ assert not allclose(x + x * rel * 1.2, x)
+ assert allclose(x + x * rel * 0.9, x)
+ assert allclose(x + x * rel * 0.5, x)
+ assert allclose(x, x)
+ assert allclose(x - x * rel * 0.5, x)
+ assert allclose(x - x * rel * 0.9, x)
+ assert not allclose(x - x * rel * 1.2, x)
+ assert not allclose(x - x * rel * 3.0, x)
+
+
+@pytest.mark.parametrize('x', [0.1, 3])
+@pytest.mark.parametrize(
+ 'allclose', [test_utils.allclose, lambda x, y: x == test_utils.approx(y)])
+@test_utils.test()
+def test_allclose_rel_reordered1(x, allclose):
+ rel = test_utils.get_rel_eps()
+ assert not allclose(x + x * rel * 3.0, x)
+ assert not allclose(x + x * rel * 1.2, x)
+ assert allclose(x + x * rel * 0.9, x)
+ assert allclose(x + x * rel * 0.5, x)
+ assert allclose(x, x)
+ assert allclose(x - x * rel * 0.5, x)
+ assert allclose(x - x * rel * 0.9, x)
+ assert not allclose(x - x * rel * 1.2, x)
+ assert not allclose(x - x * rel * 3.0, x)
+
+
+@pytest.mark.parametrize('x', [0.1, 3])
+@pytest.mark.parametrize(
+ 'allclose', [test_utils.allclose, lambda x, y: x == test_utils.approx(y)])
+@test_utils.test()
+def test_allclose_rel_reordered2(x, allclose):
+ rel = test_utils.get_rel_eps()
+ assert not allclose(x + x * rel * 3.0, x)
+ assert not allclose(x + x * rel * 1.2, x)
+ assert allclose(x + x * rel * 0.9, x)
+ assert allclose(x + x * rel * 0.5, x)
+ assert allclose(x, x)
+ assert allclose(x - x * rel * 0.5, x)
+ assert allclose(x - x * rel * 0.9, x)
+ assert not allclose(x - x * rel * 1.2, x)
+ assert not allclose(x - x * rel * 3.0, x)
+
+
+@pytest.mark.skipif(ti._lib.core.with_metal(),
+ reason="Skip metal because metal is used as the example")
+def test_disable_fallback():
+ old_environ = os.environ.get('TI_WANTED_ARCHS', '')
+ os.environ['TI_WANTED_ARCHS'] = "metal"
+
+ with pytest.raises(RuntimeError):
+
+ @test_utils.test(ti.metal)
+ def test():
+ pass
+
+ test()
+ os.environ['TI_WANTED_ARCHS'] = old_environ
+ os.environ['TI_WANTED_ARCHS'] = old_environ
diff --git a/tests/_python_orig/test_threading.py b/tests/_python_orig/test_threading.py
new file mode 100644
index 000000000..67980c6a8
--- /dev/null
+++ b/tests/_python_orig/test_threading.py
@@ -0,0 +1,9 @@
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_while():
+ assert ti._lib.core.test_threading()
diff --git a/tests/_python_orig/test_torch_ad.py b/tests/_python_orig/test_torch_ad.py
new file mode 100644
index 000000000..d2422b6ff
--- /dev/null
+++ b/tests/_python_orig/test_torch_ad.py
@@ -0,0 +1,97 @@
+import sys
+
+import numpy as np
+import pytest
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+if has_pytorch():
+ import torch
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=ti.opengl)
+def test_torch_ad():
+ n = 32
+
+ x = ti.field(ti.f32, shape=n, needs_grad=True)
+ y = ti.field(ti.f32, shape=n, needs_grad=True)
+
+ @ti.kernel
+ def torch_kernel():
+ for i in range(n):
+ # Do whatever complex operations here
+ y[n - i - 1] = x[i] * x[i]
+
+ class Sqr(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp):
+ x.from_torch(inp)
+ torch_kernel()
+ outp = y.to_torch()
+ return outp
+
+ @staticmethod
+ def backward(ctx, outp_grad):
+ ti.clear_all_gradients()
+ y.grad.from_torch(outp_grad)
+ torch_kernel.grad()
+ inp_grad = x.grad.to_torch()
+ return inp_grad
+
+ sqr = Sqr.apply
+ for i in range(10):
+ X = torch.tensor(2 * np.ones((n, ), dtype=np.float32),
+ requires_grad=True)
+ sqr(X).sum().backward()
+ ret = X.grad.cpu().numpy()
+ for j in range(n):
+ assert ret[j] == 4
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@pytest.mark.skipif(sys.platform == 'win32', reason='not working on Windows.')
+@test_utils.test(exclude=ti.opengl)
+def test_torch_ad_gpu():
+ if not torch.cuda.is_available():
+ return
+
+ device = torch.device('cuda:0')
+ n = 32
+
+ x = ti.field(ti.f32, shape=n, needs_grad=True)
+ y = ti.field(ti.f32, shape=n, needs_grad=True)
+
+ @ti.kernel
+ def torch_kernel():
+ for i in range(n):
+ # Do whatever complex operations here
+ y[n - i - 1] = x[i] * x[i]
+
+ class Sqr(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp):
+ x.from_torch(inp)
+ torch_kernel()
+ outp = y.to_torch(device=device)
+ return outp
+
+ @staticmethod
+ def backward(ctx, outp_grad):
+ ti.clear_all_gradients()
+ y.grad.from_torch(outp_grad)
+ torch_kernel.grad()
+ inp_grad = x.grad.to_torch(device=device)
+ return inp_grad
+
+ sqr = Sqr.apply
+ for i in range(10):
+ X = torch.tensor(2 * np.ones((n, ), dtype=np.float32),
+ requires_grad=True,
+ device=device)
+ sqr(X).sum().backward()
+ ret = X.grad.cpu().numpy()
+ for j in range(n):
+ assert ret[j] == 4
diff --git a/tests/_python_orig/test_torch_io.py b/tests/_python_orig/test_torch_io.py
new file mode 100644
index 000000000..fc6182efb
--- /dev/null
+++ b/tests/_python_orig/test_torch_io.py
@@ -0,0 +1,288 @@
+import numpy as np
+import pytest
+from taichi.lang import impl
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+if has_pytorch():
+ import torch
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io_devices():
+ n = 32
+ x = ti.field(dtype=ti.i32, shape=n)
+
+ @ti.kernel
+ def load(y: ti.ext_arr()):
+ for i in x:
+ x[i] = y[i] + 10
+
+ @ti.kernel
+ def inc():
+ for i in x:
+ x[i] += i
+
+ @ti.kernel
+ def store(y: ti.ext_arr()):
+ for i in x:
+ y[i] = x[i] * 2
+
+ devices = ['cpu']
+ if torch.cuda.is_available():
+ devices.append('cuda:0')
+ for device in devices:
+ y = torch.Tensor(np.ones(shape=n, dtype=np.int32)).to(device)
+
+ load(y)
+ inc()
+ store(y)
+
+ y = y.cpu().numpy()
+
+ for i in range(n):
+ assert y[i] == (11 + i) * 2
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io():
+ n = 32
+
+ @ti.kernel
+ def torch_kernel(t: ti.ext_arr(), o: ti.ext_arr()):
+ for i in range(n):
+ o[i] = t[i] * t[i]
+
+ @ti.kernel
+ def torch_kernel_2(t_grad: ti.ext_arr(), t: ti.ext_arr(),
+ o_grad: ti.ext_arr()):
+ for i in range(n):
+ t_grad[i] = 2 * t[i] * o_grad[i]
+
+ class Sqr(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp):
+ outp = torch.zeros_like(inp)
+ ctx.save_for_backward(inp)
+ torch_kernel(inp, outp)
+ return outp
+
+ @staticmethod
+ def backward(ctx, outp_grad):
+ outp_grad = outp_grad.contiguous()
+ inp_grad = torch.zeros_like(outp_grad)
+ inp, = ctx.saved_tensors
+ torch_kernel_2(inp_grad, inp, outp_grad)
+ return inp_grad
+
+ sqr = Sqr.apply
+ X = torch.tensor(2 * np.ones((n, ), dtype=np.float32), requires_grad=True)
+ sqr(X).sum().backward()
+ ret = X.grad.cpu()
+ for i in range(n):
+ assert ret[i] == 4
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io_2d():
+ n = 32
+
+ @ti.kernel
+ def torch_kernel(t: ti.ext_arr(), o: ti.ext_arr()):
+ for i in range(n):
+ for j in range(n):
+ o[i, j] = t[i, j] * t[i, j]
+
+ class Sqr(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp):
+ outp = torch.zeros_like(inp)
+ torch_kernel(inp, outp)
+ return outp
+
+ sqr = Sqr.apply
+ X = torch.tensor(2 * np.ones((n, n), dtype=np.float32), requires_grad=True)
+ val = sqr(X).sum()
+ assert val == 2 * 2 * n * n
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io_3d():
+ n = 16
+
+ @ti.kernel
+ def torch_kernel(t: ti.ext_arr(), o: ti.ext_arr()):
+ for i in range(n):
+ for j in range(n):
+ for k in range(n):
+ o[i, j, k] = t[i, j, k] * t[i, j, k]
+
+ class Sqr(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp):
+ outp = torch.zeros_like(inp)
+ torch_kernel(inp, outp)
+ return outp
+
+ sqr = Sqr.apply
+ X = torch.tensor(2 * np.ones((n, n, n), dtype=np.float32),
+ requires_grad=True)
+ val = sqr(X).sum()
+ assert val == 2 * 2 * n * n * n
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io_simple():
+ n = 32
+
+ x1 = ti.field(ti.f32, shape=(n, n))
+ t1 = torch.tensor(2 * np.ones((n, n), dtype=np.float32))
+
+ x2 = ti.Matrix.field(2, 3, ti.f32, shape=(n, n))
+ t2 = torch.tensor(2 * np.ones((n, n, 2, 3), dtype=np.float32))
+
+ x1.from_torch(t1)
+ for i in range(n):
+ for j in range(n):
+ assert x1[i, j] == 2
+
+ x2.from_torch(t2)
+ for i in range(n):
+ for j in range(n):
+ for k in range(2):
+ for l in range(3):
+ assert x2[i, j][k, l] == 2
+
+ t3 = x2.to_torch()
+ assert (t2 == t3).all()
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io_zeros():
+ mat = ti.Matrix.field(2, 6, dtype=ti.f32, shape=(), needs_grad=True)
+ zeros = torch.zeros((2, 6))
+ zeros[1, 2] = 3
+ mat.from_torch(zeros + 1)
+
+ assert mat[None][1, 2] == 4
+
+ zeros = mat.to_torch()
+ assert zeros[1, 2] == 4
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_io_struct():
+ n = 16
+ x1 = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+ t1 = {
+ "a": torch.tensor(2 * np.ones(n, dtype=np.int32)),
+ "b": torch.tensor(3 * np.ones(n, dtype=np.float32))
+ }
+
+ x1.from_torch(t1)
+ for i in range(n):
+ assert x1[i].a == 2
+ assert x1[i].b == 3
+
+ t2 = x1.to_torch()
+ for k in t1:
+ assert (t1[k] == t2[k]).all()
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_fused_kernels():
+ n = 12
+ X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n))
+ s = impl.get_runtime().get_num_compiled_functions()
+ t = X.to_torch()
+ assert impl.get_runtime().get_num_compiled_functions() == s + 1
+ X.from_torch(t)
+ assert impl.get_runtime().get_num_compiled_functions() == s + 2
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_device():
+ n = 12
+ X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n))
+ assert X.to_torch(device='cpu').device == torch.device('cpu')
+
+ if torch.cuda.is_available():
+ assert X.to_torch(device='cuda:0').device == torch.device('cuda:0')
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_shape_matrix():
+ n = 12
+ x = ti.Matrix.field(3, 2, ti.f32, shape=(n, n))
+ X = x.to_torch()
+ for i in range(n):
+ for j in range(n):
+ for k in range(3):
+ for l in range(2):
+ X[i, j, k, l] = i * 10 + j + k * 100 + l * 1000
+
+ x.from_torch(X)
+ X1 = x.to_torch()
+ x.from_torch(X1)
+ X1 = x.to_torch()
+
+ assert (X == X1).all()
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_shape_vector():
+ n = 12
+ x = ti.Vector.field(3, ti.f32, shape=(n, n))
+ X = x.to_torch()
+ for i in range(n):
+ for j in range(n):
+ for k in range(3):
+ X[i, j, k] = i * 10 + j + k * 100
+
+ x.from_torch(X)
+ X1 = x.to_torch()
+ x.from_torch(X1)
+ X1 = x.to_torch()
+
+ assert (X == X1).all()
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_torch_zero():
+ @ti.kernel
+ def test_torch(arr: ti.ext_arr()):
+ pass
+
+ test_torch(torch.zeros((0), dtype=torch.int32))
+ test_torch(torch.zeros((0, 5), dtype=torch.int32))
+ test_torch(torch.zeros((5, 0, 5), dtype=torch.int32))
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_torch_view():
+ @ti.kernel
+ def copy(x: ti.any_arr(), y: ti.any_arr()):
+ for i, j in x:
+ y[i, j] = x[i, j]
+
+ x = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).T
+ y = ti.ndarray(int, (3, 3))
+
+ with pytest.raises(ValueError,
+ match=r'Non contiguous tensors are not supported'):
+ copy(x, y)
diff --git a/tests/_python_orig/test_tuple_assign.py b/tests/_python_orig/test_tuple_assign.py
new file mode 100644
index 000000000..03dc05bec
--- /dev/null
+++ b/tests/_python_orig/test_tuple_assign.py
@@ -0,0 +1,229 @@
+import pytest
+from taichi.lang.misc import get_host_arch_list
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_fibonacci():
+ @ti.kernel
+ def ti_fibonacci(n: ti.i32) -> ti.i32:
+ a, b = 0, 1
+ # This is to make the inner for loop serial on purpose...
+ for _ in range(1):
+ for i in range(n):
+ a, b = b, a + b
+ return b
+
+ def py_fibonacci(n):
+ a, b = 0, 1
+ for i in range(n):
+ a, b = b, a + b
+ return b
+
+ for n in range(5):
+ assert ti_fibonacci(n) == py_fibonacci(n)
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_assign2():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None], b[None] = 2, 3
+
+ func()
+ assert a[None] == 2
+ assert b[None] == 3
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_assign2_mismatch3():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None], b[None] = 2, 3, 4
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_assign2_mismatch1():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None], b[None] = 2
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_swap2():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None], b[None] = b[None], a[None]
+
+ a[None] = 2
+ b[None] = 3
+ func()
+ assert a[None] == 3
+ assert b[None] == 2
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_assign2_static():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ # XXX: why a, b = ti.static(b, a) doesn't work?
+ c, d = ti.static(b, a)
+ c[None], d[None] = 2, 3
+
+ func()
+ assert a[None] == 3
+ assert b[None] == 2
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_swap3():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+ c = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ a[None], b[None], c[None] = b[None], c[None], a[None]
+
+ a[None] = 2
+ b[None] = 3
+ c[None] = 4
+ func()
+ assert a[None] == 3
+ assert b[None] == 4
+ assert c[None] == 2
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_from_tuple():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+ c = ti.field(ti.f32, ())
+
+ list = [2, 3, 4]
+
+ @ti.kernel
+ def func():
+ a[None], b[None], c[None] = list
+
+ func()
+ assert a[None] == 2
+ assert b[None] == 3
+ assert c[None] == 4
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_mismatch_tuple():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ list = [2, 3, 4]
+
+ @ti.kernel
+ def func():
+ a[None], b[None] = list
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_from_vector():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+ c = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ vector = ti.Vector([2, 3, 4])
+ a[None], b[None], c[None] = vector
+
+ func()
+ assert a[None] == 2
+ assert b[None] == 3
+ assert c[None] == 4
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_mismatch_vector():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ vector = ti.Vector([2, 3, 4])
+ a[None], b[None] = vector
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_mismatch_type():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+
+ bad = 12
+
+ @ti.kernel
+ def func():
+ a[None], b[None] = bad
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_mismatch_matrix():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+ c = ti.field(ti.f32, ())
+ d = ti.field(ti.f32, ())
+
+ @ti.kernel
+ def func():
+ bad = ti.Matrix([[2, 3], [4, 5]])
+ a[None], b[None], c[None], d[None] = bad
+
+ with pytest.raises(ti.TaichiCompilationError):
+ func()
+
+
+@test_utils.test(arch=get_host_arch_list())
+def test_unpack_from_shape():
+ a = ti.field(ti.f32, ())
+ b = ti.field(ti.f32, ())
+ c = ti.field(ti.f32, ())
+ d = ti.field(ti.f32, (2, 3, 4))
+
+ @ti.kernel
+ def func():
+ a[None], b[None], c[None] = d.shape
+
+ func()
+ assert a[None] == 2
+ assert b[None] == 3
+ assert c[None] == 4
diff --git a/tests/_python_orig/test_type_check.py b/tests/_python_orig/test_type_check.py
new file mode 100644
index 000000000..0330e7be7
--- /dev/null
+++ b/tests/_python_orig/test_type_check.py
@@ -0,0 +1,82 @@
+import numpy as np
+import pytest
+from taichi.lang.util import has_pytorch
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(arch=ti.cpu)
+def test_unary_op():
+ @ti.kernel
+ def floor():
+ a = 1
+ b = ti.floor(a)
+
+ with pytest.raises(ti.TaichiTypeError,
+ match="'floor' takes real inputs only"):
+ floor()
+
+
+@test_utils.test(arch=ti.cpu)
+def test_binary_op():
+ @ti.kernel
+ def bitwise_float():
+ a = 1
+ b = 3.1
+ c = a & b
+
+ with pytest.raises(ti.TaichiTypeError,
+ match=r"unsupported operand type\(s\) for '&'"):
+ bitwise_float()
+
+
+@test_utils.test(arch=ti.cpu)
+def test_ternary_op():
+ @ti.kernel
+ def select():
+ a = 1.1
+ b = 3
+ c = 3.6
+ d = b if a else c
+
+ with pytest.raises(TypeError,
+ match="`if` conditions must be of type int32"):
+ select()
+
+
+@pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
+@test_utils.test(arch=[ti.cpu, ti.opengl])
+def test_subscript():
+ a = ti.ndarray(ti.i32, shape=(10, 10))
+
+ @ti.kernel
+ def any_array(x: ti.any_arr()):
+ b = x[3, 1.1]
+
+ with pytest.raises(ti.TaichiTypeError, match="indices must be integers"):
+ any_array(a)
+
+
+@test_utils.test()
+def test_0d_ndarray():
+ @ti.kernel
+ def foo() -> ti.i32:
+ a = np.array(3, dtype=np.int32)
+ return a
+
+ assert foo() == 3
+
+
+@test_utils.test()
+def test_non_0d_ndarray():
+ @ti.kernel
+ def foo():
+ a = np.array([1])
+
+ with pytest.raises(
+ ti.TaichiTypeError,
+ match=
+ "Only 0-dimensional numpy array can be used to initialize a scalar expression"
+ ):
+ foo()
diff --git a/tests/_python_orig/test_types.py b/tests/_python_orig/test_types.py
new file mode 100644
index 000000000..2d1cb7851
--- /dev/null
+++ b/tests/_python_orig/test_types.py
@@ -0,0 +1,155 @@
+import pytest
+from taichi.lang import impl
+
+import taichi as ti
+from tests import test_utils
+
+_TI_TYPES = [ti.i8, ti.i16, ti.i32, ti.u8, ti.u16, ti.u32, ti.f32]
+_TI_64_TYPES = [ti.i64, ti.u64, ti.f64]
+
+
+def _test_type_assign_argument(dt):
+ x = ti.field(dt, shape=())
+
+ @ti.kernel
+ def func(value: dt):
+ x[None] = value
+
+ func(3)
+ assert x[None] == 3
+
+
+@pytest.mark.parametrize('dt', _TI_TYPES)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_type_assign_argument(dt):
+ _test_type_assign_argument(dt)
+
+
+@pytest.mark.parametrize('dt', _TI_64_TYPES)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+def test_type_assign_argument64(dt):
+ _test_type_assign_argument(dt)
+
+
+def _test_type_operator(dt):
+ x = ti.field(dt, shape=())
+ y = ti.field(dt, shape=())
+ add = ti.field(dt, shape=())
+ mul = ti.field(dt, shape=())
+
+ @ti.kernel
+ def func():
+ add[None] = x[None] + y[None]
+ mul[None] = x[None] * y[None]
+
+ for i in range(0, 3):
+ for j in range(0, 3):
+ x[None] = i
+ y[None] = j
+ func()
+ assert add[None] == x[None] + y[None]
+ assert mul[None] == x[None] * y[None]
+
+
+@pytest.mark.parametrize('dt', _TI_TYPES)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_type_operator(dt):
+ _test_type_operator(dt)
+
+
+@pytest.mark.parametrize('dt', _TI_64_TYPES)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+def test_type_operator64(dt):
+ _test_type_operator(dt)
+
+
+def _test_type_field(dt):
+ x = ti.field(dt, shape=(3, 2))
+
+ @ti.kernel
+ def func(i: ti.i32, j: ti.i32):
+ x[i, j] = 3
+
+ for i in range(0, 3):
+ for j in range(0, 2):
+ func(i, j)
+ assert x[i, j] == 3
+
+
+@pytest.mark.parametrize('dt', _TI_TYPES)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_type_field(dt):
+ _test_type_field(dt)
+
+
+@pytest.mark.parametrize('dt', _TI_64_TYPES)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+def test_type_field64(dt):
+ _test_type_field(dt)
+
+
+def _test_overflow(dt, n):
+ a = ti.field(dt, shape=())
+ b = ti.field(dt, shape=())
+ c = ti.field(dt, shape=())
+
+ @ti.kernel
+ def func():
+ c[None] = a[None] + b[None]
+
+ a[None] = 2**n // 3
+ b[None] = 2**n // 3
+
+ func()
+
+ assert a[None] == 2**n // 3
+ assert b[None] == 2**n // 3
+
+ if ti.types.is_signed(dt):
+ assert c[None] == 2**n // 3 * 2 - (2**n) # overflows
+ else:
+ assert c[None] == 2**n // 3 * 2 # does not overflow
+
+
+@pytest.mark.parametrize('dt,n', [
+ (ti.i8, 8),
+ (ti.u8, 8),
+ (ti.i16, 16),
+ (ti.u16, 16),
+ (ti.i32, 32),
+ (ti.u32, 32),
+])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+def test_overflow(dt, n):
+ _test_overflow(dt, n)
+
+
+@pytest.mark.parametrize('dt,n', [
+ (ti.i64, 64),
+ (ti.u64, 64),
+])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+def test_overflow64(dt, n):
+ _test_overflow(dt, n)
+
+
+@pytest.mark.parametrize('dt,val', [
+ (ti.u32, 0xffffffff),
+ (ti.u64, 0xffffffffffffffff),
+])
+@test_utils.test(require=ti.extension.data64)
+def test_uint_max(dt, val):
+ # https://github.com/taichi-dev/taichi/issues/2060
+ impl.get_runtime().default_ip = dt
+ N = 16
+ f = ti.field(dt, shape=N)
+
+ @ti.kernel
+ def run():
+ for i in f:
+ f[i] = val
+
+ run()
+ fs = f.to_numpy()
+ for f in fs:
+ assert f == val
diff --git a/tests/_python_orig/test_unary_ops.py b/tests/_python_orig/test_unary_ops.py
new file mode 100644
index 000000000..5aea17db3
--- /dev/null
+++ b/tests/_python_orig/test_unary_ops.py
@@ -0,0 +1,55 @@
+import numpy as np
+
+import taichi as ti
+from tests import test_utils
+
+
+def _test_op(dt, taichi_op, np_op):
+ print('arch={} default_fp={}'.format(
+ ti.lang.impl.current_cfg().arch,
+ ti.lang.impl.current_cfg().default_fp))
+ n = 4
+ val = ti.field(dt, shape=n)
+
+ def f(i):
+ return i * 0.1 + 0.4
+
+ @ti.kernel
+ def fill():
+ for i in range(n):
+ val[i] = taichi_op(f(ti.cast(i, dt)))
+
+ fill()
+
+ # check that it is double precision
+ for i in range(n):
+ if dt == ti.f64:
+ assert abs(np_op(float(f(i))) - val[i]) < 1e-15
+ else:
+ assert abs(np_op(float(f(i))) -
+ val[i]) < 1e-6 if ti.lang.impl.current_cfg(
+ ).arch != ti.opengl and ti.lang.impl.current_cfg(
+ ).arch != ti.vulkan else 1e-5
+
+
+def test_f64_trig():
+ op_pairs = [
+ (ti.sin, np.sin),
+ (ti.cos, np.cos),
+ (ti.asin, np.arcsin),
+ (ti.acos, np.arccos),
+ (ti.tan, np.tan),
+ (ti.tanh, np.tanh),
+ (ti.exp, np.exp),
+ (ti.log, np.log),
+ ]
+ for dt in [ti.f32, ti.f64]:
+ for taichi_op, np_op in op_pairs:
+
+ @test_utils.test(
+ require=ti.extension.data64 if dt == ti.f64 else [],
+ default_fp=dt)
+ def wrapped():
+ _test_op(dt, taichi_op, np_op)
+
+ wrapped()
diff --git a/tests/_python_orig/test_while.py b/tests/_python_orig/test_while.py
new file mode 100644
index 000000000..a3c53e797
--- /dev/null
+++ b/tests/_python_orig/test_while.py
@@ -0,0 +1,42 @@
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test()
+def test_while():
+ x = ti.field(ti.f32)
+
+ N = 1
+
+ ti.root.dense(ti.i, N).place(x)
+
+ @ti.kernel
+ def func():
+ i = 0
+ s = 0
+ while i < 10:
+ s += i
+ i += 1
+ x[0] = s
+
+ func()
+ assert x[0] == 45
+
+
+@test_utils.test()
+def test_break():
+ ret = ti.field(ti.i32, shape=())
+
+ @ti.kernel
+ def func():
+ i = 0
+ s = 0
+ while True:
+ s += i
+ i += 1
+ if i > 10:
+ break
+ ret[None] = s
+
+ func()
+ assert ret[None] == 55