diff --git a/.github/workflows/scripts/ti_build/alter.py b/.github/workflows/scripts/ti_build/alter.py
index cb599bf7ff39c..a1a165ff00d4e 100644
--- a/.github/workflows/scripts/ti_build/alter.py
+++ b/.github/workflows/scripts/ti_build/alter.py
@@ -33,11 +33,15 @@ def add_aot_env():
 
 def _write_ti_bashrc():
     path = get_cache_home() / "ti.bashrc"
+    envs = get_cache_home() / "ti-env.sh"  # env script that the generated rc file sources last
+    _write_env(envs)  # written before the rc file that references it
     with open(path, "w") as f:
         f.write(
             "[ -f /etc/bashrc ] && source /etc/bashrc\n"
             "[ -f ~/.bashrc ] && source ~/.bashrc\n"
             r'export PS1="\[\e]0;[Taichi Build Environment]\a\]\[\033[01;31m\][Taichi Build] \[\033[00m\]$PS1"'
+            "\n"  # the raw PS1 string above carries no newline of its own
+            f"source {envs}\n"  # NOTE(review): path is interpolated unquoted — confirm cache home has no spaces
         )
 
     return path
@@ -47,11 +51,15 @@ def _write_ti_zshrc():
     dotdir = get_cache_home() / "zdotdir"
     dotdir.mkdir(parents=True, exist_ok=True)
     path = dotdir / ".zshrc"
+    envs = get_cache_home() / "ti-env.sh"
+    _write_env(envs)
     with open(path, "w") as f:
         f.write(
             "[ -f /etc/zsh/zshrc ] && source /etc/zsh/zshrc\n"
             "[ -f $HOME/.zshrc ] && source $HOME/.zshrc\n"
             r"export PROMPT='%{$fg_bold[red]%}[Taichi Build] %{$reset_color%}'$PROMPT"
+            "\n"
+            f"source {envs}\n"
         )
     return dotdir
 
@@ -138,10 +146,13 @@ def enter_shell():
             os.execl(shell.exe, shell.exe)
 
 
-def write_env(path):
-    cmake_args.writeback()
+def _write_env(path):
     envs = os.environ.get_changed_envs()
     envstr = ""
+
+    if isinstance(path, Path):
+        path = str(path)
+
     if path.endswith(".ps1"):
         envstr = "\n".join([f'$env:{k}="{v}"' for k, v in envs.items()])
     elif path.endswith(".sh"):
@@ -156,6 +167,10 @@ def write_env(path):
     with open(path, "w") as f:
         f.write(envstr)
 
+
+def write_env(path):
+    cmake_args.writeback()  # NOTE(review): presumably syncs cmake args into the environment first — confirm
+    _write_env(path)  # the actual file dump; also used directly by the rc-file writers
     misc.info(f"Environment variables written to {path}")
 
 
diff --git a/.github/workflows/scripts/ti_build/bootstrap.py b/.github/workflows/scripts/ti_build/bootstrap.py
index 6423f2f086730..0018ac54ee5b8 100644
--- a/.github/workflows/scripts/ti_build/bootstrap.py
+++ b/.github/workflows/scripts/ti_build/bootstrap.py
@@ -2,6 +2,7 @@
 
 # -- stdlib --
 from pathlib import Path
+from types import ModuleType
 from typing import Optional
 import importlib
 import os
@@ -55,21 +56,32 @@ def restart():
         os.execl(sys.executable, sys.executable, "-S", *sys.argv)
 
 
+def _try_import(name: str) -> Optional[ModuleType]:  # import `name`, or return None if unavailable
+    try:
+        return importlib.import_module(name)
+    except ModuleNotFoundError:  # only "module missing" is swallowed; other import errors propagate
+        return None
+
+
 def ensure_dependencies(*deps: str):
     """
     Automatically install dependencies if they are not installed.
     """
 
+    pip = _try_import("pip")
+    ensurepip = _try_import("ensurepip")
+
     if not sys.flags.no_site:
-        # First run, do pip checks
-        try:
-            import pip
-        except ModuleNotFoundError:
-            print("!! pip not found, build.py needs at least a functional pip to work.", flush=True)
-            exit(1)
+        # First run, restart with no_site
+        if not pip and not ensurepip:
+            print(
+                "!! pip or ensurepip not found, build.py needs at least a functional pip/ensurepip to work.", flush=True
+            )
+            sys.exit(1)
 
         restart()
 
+    # Second run
     v = sys.version_info
     bootstrap_root = get_cache_home() / "bootstrap" / f"{v.major}.{v.minor}"
     bootstrap_root.mkdir(parents=True, exist_ok=True)
@@ -85,12 +97,18 @@ def ensure_dependencies(*deps: str):
 
     print("Installing dependencies...", flush=True)
     py = sys.executable
-
-    if run(py, "-m", "pip", "install", "pip", "--no-user", f"--target={bootstrap_root}"):
-        raise Exception("Unable to install pip!")
-
-    pipcmd = [py, "-S", "-m", "pip", "install", "--no-user", f"--target={bootstrap_root}", "-U"]
-    if run(*pipcmd, *deps, env={"PYTHONPATH": str(bootstrap_root)}):
+    pip_install = ["-m", "pip", "install", "--no-user", f"--target={bootstrap_root}", "-U"]
+
+    if ensurepip:
+        wheels = Path(ensurepip.__path__[0]).glob("**/*.whl")
+        wheels = os.pathsep.join(map(str, wheels))
+        if run(py, "-S", *pip_install, "pip", env={"PYTHONPATH": wheels}):
+            raise Exception("Unable to install pip! (ensurepip method)")
+    else:  # pip must exist
+        if run(py, *pip_install, "pip"):
+            raise Exception("Unable to install pip! (pip method)")
+
+    if run(py, "-S", *pip_install, *deps, env={"PYTHONPATH": str(bootstrap_root)}):
         raise Exception("Unable to install dependencies!")
 
     restart()
diff --git a/c_api/docs/taichi/taichi_core.h.md b/c_api/docs/taichi/taichi_core.h.md
index 8c8263562db87..54890de2738a8 100644
--- a/c_api/docs/taichi/taichi_core.h.md
+++ b/c_api/docs/taichi/taichi_core.h.md
@@ -305,6 +305,7 @@ Types of kernel and compute graph argument.
 - `enumeration.argument_type.ndarray`: ND-array wrapped around a `handle.memory`.
 - `enumeration.argument_type.texture`: Texture wrapped around a `handle.image`.
 - `enumeration.argument_type.scalar`: Typed scalar.
+- `enumeration.argument_type.tensor`: Typed tensor.
 
 
 `bit_field.memory_usage`
@@ -450,6 +451,23 @@ Scalar value represented by a power-of-two number of bits.
 
 A typed scalar value.
 
+`union.tensor_value`
+
+Tensor value represented by a power-of-two number of bits.
+
+- `union.tensor_value.x8`: Tensor value that fits into 8 bits.
+- `union.tensor_value.x16`: Tensor value that fits into 16 bits.
+- `union.tensor_value.x32`: Tensor value that fits into 32 bits.
+- `union.tensor_value.x64`: Tensor value that fits into 64 bits.
+
+`structure.tensor_value_with_length`
+
+A tensor value with a length.
+
+`structure.tensor`
+
+A typed tensor value.
+
 `union.argument_value`
 
 A scalar or structured argument value.
@@ -459,6 +477,7 @@ A scalar or structured argument value.
 - `union.argument_value.ndarray`: An ND-array to be bound.
 - `union.argument_value.texture`: A texture to be bound.
 - `union.argument_value.scalar`: An scalar to be bound.
+- `union.argument_value.tensor`: A tensor to be bound.
 
 `structure.argument`
 
diff --git a/c_api/include/taichi/cpp/taichi.hpp b/c_api/include/taichi/cpp/taichi.hpp
index 61f0b15e65290..8319f96493854 100644
--- a/c_api/include/taichi/cpp/taichi.hpp
+++ b/c_api/include/taichi/cpp/taichi.hpp
@@ -837,6 +837,25 @@ class ComputeGraph {
     return compute_graph_;
   }
 };
+template <typename T>
+struct DataTypeToEnum {
+  static constexpr TiDataType value = TI_DATA_TYPE_UNKNOWN;  // fallback for any T not specialized below
+};
+#define DEFINE_DATA_TYPE_ENUM(type, enumv)                    \
+  template <>                                                 \
+  struct DataTypeToEnum<type> {                               \
+    static constexpr TiDataType value = TI_DATA_TYPE_##enumv; \
+  };
+
+DEFINE_DATA_TYPE_ENUM(int32_t, I32);
+DEFINE_DATA_TYPE_ENUM(float, F32);
+DEFINE_DATA_TYPE_ENUM(uint16_t, U16);
+DEFINE_DATA_TYPE_ENUM(int16_t, I16);
+DEFINE_DATA_TYPE_ENUM(uint8_t, U8);
+DEFINE_DATA_TYPE_ENUM(int8_t, I8);
+DEFINE_DATA_TYPE_ENUM(uint64_t, U64);
+DEFINE_DATA_TYPE_ENUM(int64_t, I64);  // NOTE(review): no uint32_t entry, so it maps to UNKNOWN
+#undef DEFINE_DATA_TYPE_ENUM
 
 class Kernel {
  protected:
@@ -884,11 +903,12 @@ class Kernel {
   template <typename T>
   void push_arg(const std::vector<T> &v) {
     int idx = args_.size();
-    // Temporary workaround for setting vec/matrix arguments in a flattened way.
-    args_.resize(args_.size() + v.size());
-    for (int j = 0; j < v.size(); ++j) {
-      at(idx + j) = v[j];
-    }
+    args_.resize(idx + 1);  // the whole vector now occupies a single argument slot
+    args_[idx].type = TI_ARGUMENT_TYPE_TENSOR;
+    std::memcpy(args_[idx].value.tensor.contents.data.x32, v.data(),
+                v.size() * sizeof(T));  // NOTE(review): no bounds check; TiTensorValue holds 128 bytes
+    args_[idx].value.tensor.contents.length = v.size();  // element count, not byte count
+    args_[idx].value.tensor.type = DataTypeToEnum<T>::value;  // UNKNOWN when T has no specialization
   }
 
   template <typename T>
diff --git a/c_api/include/taichi/taichi_core.h b/c_api/include/taichi/taichi_core.h
index bae403a6ff64b..f3764822c35b0 100644
--- a/c_api/include/taichi/taichi_core.h
+++ b/c_api/include/taichi/taichi_core.h
@@ -227,7 +227,7 @@
 #pragma once
 
 #ifndef TI_C_API_VERSION
-#define TI_C_API_VERSION 1005000
+#define TI_C_API_VERSION 1007000
 #endif  // TI_C_API_VERSION
 
 #ifndef TAICHI_H
@@ -463,6 +463,8 @@ typedef enum TiArgumentType {
   TI_ARGUMENT_TYPE_TEXTURE = 3,
   // Typed scalar.
   TI_ARGUMENT_TYPE_SCALAR = 4,
+  // Typed tensor.
+  TI_ARGUMENT_TYPE_TENSOR = 5,
   TI_ARGUMENT_TYPE_MAX_ENUM = 0xffffffff,
 } TiArgumentType;
 
@@ -802,6 +804,36 @@ typedef struct TiScalar {
   TiScalarValue value;
 } TiScalar;
 
+// Union `TiTensorValue`
+//
+// Tensor value represented by a power-of-two number of bits.
+typedef union TiTensorValue {
+  // Tensor value that fits into 8 bits.
+  uint8_t x8[128];
+  // Tensor value that fits into 16 bits.
+  uint16_t x16[64];
+  // Tensor value that fits into 32 bits.
+  uint32_t x32[32];
+  // Tensor value that fits into 64 bits.
+  uint64_t x64[16];
+} TiTensorValue;
+
+// Structure `TiTensorValueWithLength`
+//
+// A tensor value with a length.
+typedef struct TiTensorValueWithLength {
+  uint32_t length;
+  TiTensorValue data;
+} TiTensorValueWithLength;
+
+// Structure `TiTensor`
+//
+// A typed tensor value.
+typedef struct TiTensor {
+  TiDataType type;
+  TiTensorValueWithLength contents;
+} TiTensor;
+
 // Union `TiArgumentValue` (1.4.0)
 //
 // A scalar or structured argument value.
@@ -818,6 +850,8 @@ typedef union TiArgumentValue {
   TiTexture texture;
   // An scalar to be bound.
   TiScalar scalar;
+  // A tensor to be bound.
+  TiTensor tensor;
 } TiArgumentValue;
 
 // Structure `TiArgument` (1.4.0)
diff --git a/c_api/src/taichi_core_impl.cpp b/c_api/src/taichi_core_impl.cpp
index fa91a5da9ec71..d8366bf30b067 100644
--- a/c_api/src/taichi_core_impl.cpp
+++ b/c_api/src/taichi_core_impl.cpp
@@ -792,6 +792,28 @@ void ti_launch_kernel(TiRuntime runtime,
         devallocs.emplace_back(std::move(devalloc));
         break;
       }
+      case TI_ARGUMENT_TYPE_TENSOR: {
+        auto &tensor = arg.value.tensor;
+        if (tensor.type == TI_DATA_TYPE_I16 ||
+            tensor.type == TI_DATA_TYPE_U16 ||
+            tensor.type == TI_DATA_TYPE_F16) {
+          for (int j = 0; j < tensor.contents.length; j++) {
+            builder.set_struct_arg_impl({(int)i, j},
+                                        tensor.contents.data.x16[j]);
+          }
+        } else if (tensor.type == TI_DATA_TYPE_I32 ||
+                   tensor.type == TI_DATA_TYPE_U32 ||
+                   tensor.type == TI_DATA_TYPE_F32) {
+          for (int j = 0; j < tensor.contents.length; j++) {
+            builder.set_struct_arg_impl({(int)i, j},
+                                        tensor.contents.data.x32[j]);
+          }
+        } else {
+          ti_set_last_error(TI_ERROR_NOT_SUPPORTED,
+                            ("args[" + std::to_string(i) + "].type").c_str());
+        }
+        break;
+      }
       default: {
         ti_set_last_error(TI_ERROR_ARGUMENT_OUT_OF_RANGE,
                           ("args[" + std::to_string(i) + "].type").c_str());
diff --git a/c_api/src/taichi_llvm_impl.cpp b/c_api/src/taichi_llvm_impl.cpp
index c39bf619a7ed6..a8b2fb8d8c6d2 100644
--- a/c_api/src/taichi_llvm_impl.cpp
+++ b/c_api/src/taichi_llvm_impl.cpp
@@ -52,14 +52,13 @@ taichi::lang::Device &LlvmRuntime::get() {
 
 TiMemory LlvmRuntime::allocate_memory(
     const taichi::lang::Device::AllocParams &params) {
-  const taichi::lang::CompileConfig &config = executor_->get_config();
   taichi::lang::LLVMRuntime *llvm_runtime = executor_->get_llvm_runtime();
   taichi::lang::LlvmDevice *llvm_device = executor_->llvm_device();
 
   taichi::lang::DeviceAllocation devalloc =
-      llvm_device->allocate_memory_runtime(
-          {params, config.ndarray_use_cached_allocator,
-           executor_->get_runtime_jit_module(), llvm_runtime, result_buffer});
+      llvm_device->allocate_memory_runtime({params,
+                                            executor_->get_runtime_jit_module(),
+                                            llvm_runtime, result_buffer});
   return devalloc2devmem(*this, devalloc);
 }
 
diff --git a/c_api/taichi.json b/c_api/taichi.json
index 7971c3327b84c..f726a5cea9b73 100644
--- a/c_api/taichi.json
+++ b/c_api/taichi.json
@@ -156,7 +156,8 @@
                         "f32": 1,
                         "ndarray": 2,
                         "texture": 3,
-                        "scalar": 4
+                        "scalar": 4,
+                        "tensor": 5
                     }
                 },
                 {
@@ -484,6 +485,60 @@
                         }
                     ]
                 },
+                {
+                    "name": "tensor_value",
+                    "type": "union",
+                    "variants": [
+                        {
+                            "name": "x8",
+                            "type": "uint8_t",
+                            "count": 128
+                        },
+                        {
+                            "name": "x16",
+                            "type": "uint16_t",
+                            "count": 64
+                        },
+                        {
+                            "name": "x32",
+                            "type": "uint32_t",
+                            "count": 32
+                        },
+                        {
+                            "name": "x64",
+                            "type": "uint64_t",
+                            "count": 16
+                        }
+                    ]
+                },
+                {
+                    "name": "tensor_value_with_length",
+                    "type": "structure",
+                    "fields": [
+                        {
+                            "name": "length",
+                            "type": "uint32_t"
+                        },
+                        {
+                            "name": "data",
+                            "type": "union.tensor_value"
+                        }
+                    ]
+                },
+                {
+                    "name": "tensor",
+                    "type": "structure",
+                    "fields": [
+                        {
+                            "name": "type",
+                            "type": "enumeration.data_type"
+                        },
+                        {
+                            "name": "contents",
+                            "type": "structure.tensor_value_with_length"
+                        }
+                    ]
+                },
                 {
                     "name": "argument_value",
                     "type": "union",
@@ -508,6 +563,10 @@
                         {
                             "name": "scalar",
                             "type": "structure.scalar"
+                        },
+                        {
+                            "name": "tensor",
+                            "type": "structure.tensor"
                         }
                     ]
                 },
diff --git a/docs/cover-in-ci.lst b/docs/cover-in-ci.lst
index 420ae0ee214c6..723b95147504a 100644
--- a/docs/cover-in-ci.lst
+++ b/docs/cover-in-ci.lst
@@ -31,3 +31,5 @@ docs/lang/articles/visualization/ggui.md
 docs/lang/articles/visualization/gui_system.md
 docs/rfcs/20220410-rfc-process.md
 docs/rfcs/yyyymmdd-rfc-template.md
+docs/lang/articles/math/linear_solver.md
+docs/lang/articles/math/sparse_matrix.md
diff --git a/docs/lang/articles/c-api/taichi_core.md b/docs/lang/articles/c-api/taichi_core.md
index 4f3b4c1facc91..9892305358a6b 100644
--- a/docs/lang/articles/c-api/taichi_core.md
+++ b/docs/lang/articles/c-api/taichi_core.md
@@ -498,6 +498,7 @@ typedef enum TiArgumentType {
   TI_ARGUMENT_TYPE_NDARRAY = 2,
   TI_ARGUMENT_TYPE_TEXTURE = 3,
   TI_ARGUMENT_TYPE_SCALAR = 4,
+  TI_ARGUMENT_TYPE_TENSOR = 5,
   TI_ARGUMENT_TYPE_MAX_ENUM = 0xffffffff,
 } TiArgumentType;
 ```
@@ -509,6 +510,7 @@ Types of kernel and compute graph argument.
 - `TI_ARGUMENT_TYPE_NDARRAY`: ND-array wrapped around a [`TiMemory`](#handle-timemory).
 - `TI_ARGUMENT_TYPE_TEXTURE`: Texture wrapped around a [`TiImage`](#handle-tiimage).
 - `TI_ARGUMENT_TYPE_SCALAR`: Typed scalar.
+- `TI_ARGUMENT_TYPE_TENSOR`: Typed tensor.
 
 
 ---
@@ -927,6 +929,7 @@ typedef union TiArgumentValue {
   TiNdArray ndarray;
   TiTexture texture;
   TiScalar scalar;
+  TiTensor tensor;
 } TiArgumentValue;
 ```
 
@@ -937,6 +940,7 @@ A scalar or structured argument value.
 - `ndarray`: An ND-array to be bound.
 - `texture`: A texture to be bound.
 - `scalar`: An scalar to be bound.
+- `tensor`: A tensor to be bound.
 
 ---
 ### Structure `TiArgument`
diff --git a/docs/lang/articles/math/linear_solver.md b/docs/lang/articles/math/linear_solver.md
new file mode 100644
index 0000000000000..cd852e4d48a93
--- /dev/null
+++ b/docs/lang/articles/math/linear_solver.md
@@ -0,0 +1,73 @@
+---
+sidebar_position: 3
+---
+
+# Linear Solver
+
+Solving linear equations is a common task in scientific computing. Taichi provides basic direct and iterative linear solvers for
+various simulation scenarios. Currently, there are two categories of linear solvers available:
+1. Solvers built for `SparseMatrix`
+2. Solvers built for `ti.field`
+
+## Sparse linear solver
+To solve a linear system whose coefficient matrix is a `SparseMatrix`,
+follow these steps:
+1. Create a `solver` using `ti.linalg.SparseSolver(solver_type, ordering)`. Currently, the factorization types supported on CPU backends are `LLT`, `LDLT`, and `LU`, and supported orderings include `AMD` and `COLAMD`. The sparse solver on CUDA supports the `LLT` factorization type only.
+2. Analyze and factorize the sparse matrix you want to solve using `solver.analyze_pattern(sparse_matrix)` and `solver.factorize(sparse_matrix)`.
+3. Call `x = solver.solve(b)`, where `x` is the solution and `b` is the right-hand side of the linear system. On CPU backends, `x` and `b` can be NumPy arrays, Taichi Ndarrays, or Taichi fields. On the CUDA backend, `x` and `b` *must* be Taichi Ndarrays.
+4. Call `solver.info()` to check if the solving process succeeds.
+
+Here's a full example.
+
+```python
+import taichi as ti
+
+arch = ti.cpu # or ti.cuda
+ti.init(arch=arch)
+
+n = 4
+
+K = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+b = ti.ndarray(ti.f32, shape=n)
+
+@ti.kernel
+def fill(A: ti.types.sparse_matrix_builder(), b: ti.types.ndarray(), interval: ti.i32):
+    for i in range(n):
+        A[i, i] += 2.0
+
+        if i % interval == 0:
+            b[i] += 1.0
+
+fill(K, b, 3)
+
+A = K.build()
+print(">>>> Matrix A:")
+print(A)
+print(">>>> Vector b:")
+print(b)
+# outputs:
+# >>>> Matrix A:
+# [2, 0, 0, 0]
+# [0, 2, 0, 0]
+# [0, 0, 2, 0]
+# [0, 0, 0, 2]
+# >>>> Vector b:
+# [1. 0. 0. 1.]
+solver = ti.linalg.SparseSolver(solver_type="LLT")
+solver.analyze_pattern(A)
+solver.factorize(A)
+x = solver.solve(b)
+success = solver.info()
+print(">>>> Solve sparse linear systems Ax = b with the solution x:")
+print(x)
+print(f">>>> Computation was successful?: {success}")
+# outputs:
+# >>>> Solve sparse linear systems Ax = b with the solution x:
+# [0.5 0.  0.  0.5]
+# >>>> Computation was successful?: True
+```
+## Examples
+
+Please have a look at our two demos for more information:
++ [Stable fluid](https://github.com/taichi-dev/taichi/blob/master/python/taichi/examples/simulation/stable_fluid.py): A 2D fluid simulation using a sparse Laplacian matrix to solve Poisson's pressure equation.
++ [Implicit mass spring](https://github.com/taichi-dev/taichi/blob/master/python/taichi/examples/simulation/implicit_mass_spring.py): A 2D cloth simulation demo using sparse matrices to solve the linear systems.
diff --git a/docs/lang/articles/math/sparse_matrix.md b/docs/lang/articles/math/sparse_matrix.md
index 2d4df0a61c73d..780ae4c08d390 100644
--- a/docs/lang/articles/math/sparse_matrix.md
+++ b/docs/lang/articles/math/sparse_matrix.md
@@ -56,7 +56,7 @@ print(A)
 
 The basic operations like `+`, `-`, `*`, `@` and transpose of sparse matrices are supported now.
 
-```python
+```python cont
 print(">>>> Summation: C = A + A")
 C = A + A
 print(C)
@@ -131,66 +131,3 @@ print(f">>>> Element Access: A[0,0] = {A[0,0]}")
 # outputs:
 # >>>> Element Access: A[0,0] = 1.0
 ```
-
-## Sparse linear solver
-You may want to solve some linear equations using sparse matrices.
-Then, the following steps could help:
-1. Create a `solver` using `ti.linalg.SparseSolver(solver_type, ordering)`. Currently, the factorization types supported on CPU backends are `LLT`, `LDLT`, and `LU`, and supported orderings include `AMD` and `COLAMD`. The sparse solver on CUDA supports the `LLT` factorization type only.
-2. Analyze and factorize the sparse matrix you want to solve using `solver.analyze_pattern(sparse_matrix)` and `solver.factorize(sparse_matrix)`
-3. Call `x = solver.solve(b)`, where `x` is the solution and `b` is the right-hand side of the linear system. On CPU backends, `x` and `b` can be NumPy arrays, Taichi Ndarrays, or Taichi fields. On the CUDA backend, `x` and `b` *must* be Taichi Ndarrays.
-4. Call `solver.info()` to check if the solving process succeeds.
-
-Here's a full example.
-
-```python
-import taichi as ti
-
-arch = ti.cpu # or ti.cuda
-ti.init(arch=arch)
-
-n = 4
-
-K = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
-b = ti.ndarray(ti.f32, shape=n)
-
-@ti.kernel
-def fill(A: ti.types.sparse_matrix_builder(), b: ti.template(), interval: ti.i32):
-    for i in range(n):
-        A[i, i] += 2.0
-
-        if i % interval == 0:
-            b[i] += 1.0
-
-fill(K, b, 3)
-
-A = K.build()
-print(">>>> Matrix A:")
-print(A)
-print(">>>> Vector b:")
-print(b)
-# outputs:
-# >>>> Matrix A:
-# [2, 0, 0, 0]
-# [0, 2, 0, 0]
-# [0, 0, 2, 0]
-# [0, 0, 0, 2]
-# >>>> Vector b:
-# [1. 0. 0. 1.]
-solver = ti.linalg.SparseSolver(solver_type="LLT")
-solver.analyze_pattern(A)
-solver.factorize(A)
-x = solver.solve(b)
-isSuccess = solver.info()
-print(">>>> Solve sparse linear systems Ax = b with the solution x:")
-print(x)
-print(f">>>> Computation was successful?: {isSuccess}")
-# outputs:
-# >>>> Solve sparse linear systems Ax = b with the solution x:
-# [0.5 0.  0.  0.5]
-# >>>> Computation was successful?: True
-```
-## Examples
-
-Please have a look at our two demos for more information:
-+ [Stable fluid](https://github.com/taichi-dev/taichi/blob/master/python/taichi/examples/simulation/stable_fluid.py): A 2D fluid simulation using a sparse Laplacian matrix to solve Poisson's pressure equation.
-+ [Implicit mass spring](https://github.com/taichi-dev/taichi/blob/master/python/taichi/examples/simulation/implicit_mass_spring.py): A 2D cloth simulation demo using sparse matrices to solve the linear systems.
diff --git a/misc/make_changelog.py b/misc/make_changelog.py
index f41ff81269449..a7e23e593ece4 100644
--- a/misc/make_changelog.py
+++ b/misc/make_changelog.py
@@ -34,16 +34,20 @@ def main(ver=None, repo_dir="."):
     # We need to find out the latest common commit among base and ver,
     # everything after this commit should be listed in the changelog.
 
-    base_commit = find_latest_tag_commit(g.tags)
-    commits_in_base_tag = list(g.iter_commits(base_commit, max_count=500))
-    commits = list(g.iter_commits(ver, max_count=500))
-    begin, end = -1, 0
+    latest_release = find_latest_tag_commit(g.tags)
+    head = g.head.commit
+    mb = g.merge_base(latest_release, head)
+    assert len(mb) == 1
+    mb = mb[0]
+    commits_in_base_tag = list(g.iter_commits(latest_release, max_count=500))
+    commits = list(g.iter_commits((mb, head)))
 
     def format(c):
         return f"{c.summary} (by **{c.author}**)"
 
     notable_changes = {}
     all_changes = []
+    by_author = {}
 
     details = load_pr_tags()
 
@@ -75,6 +79,7 @@ def format(c):
                     f'** Warning: tag {tag.lower()} undefined in the "details" dict. Please include the tag into "details", unless the tag is a typo.'
                 )
         all_changes.append(format(c))
+        by_author.setdefault(str(c.author), []).append(s)
 
     res = "Highlights:\n"
     for tag in sorted(notable_changes.keys()):
@@ -86,6 +91,13 @@ def format(c):
     for c in all_changes:
         res += f"   - {c}\n"
 
+    if args.show_per_author:
+        res += "\nContributors (in alphabetical order):\n"
+        for author in sorted(by_author.keys()):
+            res += f"   - {author}\n"
+            for item in by_author[author]:
+                res += f"      - {item}\n"
+
     return res
 
 
@@ -93,6 +105,7 @@ def format(c):
     parser = argparse.ArgumentParser()
     parser.add_argument("--ver")
     parser.add_argument("--repo_dir", type=str, default=".")
+    parser.add_argument("--show-per-author", action="store_true", default=False)
     parser.add_argument("--save", action="store_true", default=False)
     args = parser.parse_args()
     res = main(args.ver, args.repo_dir)
diff --git a/python/taichi/aot/utils.py b/python/taichi/aot/utils.py
index f85f132a8d12b..a29bda91fd43a 100644
--- a/python/taichi/aot/utils.py
+++ b/python/taichi/aot/utils.py
@@ -97,11 +97,8 @@ def produce_injected_args(kernel, symbolic_args=None):
             texture_shape = (2,) * anno.num_dimensions
             injected_args.append(Texture(Format.rgba8, texture_shape))
         elif isinstance(anno, MatrixType):
-            if not isinstance(symbolic_args[i], list):
-                raise RuntimeError("Expected a symbolic arg with Matrix type.")
-
-            symbolic_mat_n = len(symbolic_args[i])
-            symbolic_mat_m = len(symbolic_args[i][0])
+            symbolic_mat_n = symbolic_args[i].element_shape[0]
+            symbolic_mat_m = symbolic_args[i].element_shape[1]
 
             if symbolic_mat_m != anno.m or symbolic_mat_n != anno.n:
                 raise RuntimeError(
diff --git a/python/taichi/examples/machine_learning/differential_evolution.py b/python/taichi/examples/machine_learning/differential_evolution.py
new file mode 100644
index 0000000000000..02f877cff076b
--- /dev/null
+++ b/python/taichi/examples/machine_learning/differential_evolution.py
@@ -0,0 +1,282 @@
+# Authored by Erqi Chen.
+# This script shows the optimization process of differential evolution.
+# The black points are the search agents, and they finally find the minimum solution.
+
+
+import numpy as np
+import taichi as ti
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+
+ti.init(arch=ti.cpu)
+
+
+@ti.func
+def clip(_pop: ti.template(), _lb: ti.template(), _ub: ti.template()):  # clamp a 2-D population in place
+    _search_num, _dim = _pop.shape  # rows and columns of the population
+    for ii, j in ti.ndrange(_search_num, _dim):
+        if _pop[ii, j] > _ub[j]:  # above the upper bound of column j: snap to it
+            _pop[ii, j] = _ub[j]
+        elif _pop[ii, j] < _lb[j]:  # below the lower bound of column j: snap to it
+            _pop[ii, j] = _lb[j]
+
+
+@ti.func
+def clip_only(_trial: ti.template(), _lb: ti.template(), _ub: ti.template()):  # clamp one 1-D vector in place
+    _dim = _trial.shape[0]  # iterates the full length of _trial; _lb/_ub must be at least that long
+    for j in range(_dim):
+        if _trial[j] > _ub[j]:  # above the upper bound: snap to it
+            _trial[j] = _ub[j]
+        elif _trial[j] < _lb[j]:  # below the lower bound: snap to it
+            _trial[j] = _lb[j]
+
+
+@ti.func
+def f1(_fit: ti.template(), _pop: ti.template()):  # objective: sum of squares of every row of _pop
+    _search_num, _dim = _pop.shape
+    for ii in range(_search_num):
+        cur = 0.0  # running sum of squares for row ii
+        for j in range(_dim):
+            cur += ti.pow(_pop[ii, j], 2)
+
+        _fit[ii] = cur  # one fitness value per row
+
+
+@ti.func
+def f1_only(_trial: ti.template()) -> ti.float32:  # same objective as f1, for a single 1-D vector
+    _dim = _trial.shape[0]  # sums over the whole field, whatever its length
+    _res = 0.0
+    for j in range(_dim):
+        _res += ti.pow(_trial[j], 2)
+
+    return _res
+
+
+@ti.func
+def find_min(_fit: ti.template()) -> ti.i32:  # index of the smallest entry of the 1-D field _fit
+    _search_num = _fit.shape[0]
+    min_fit = _fit[0]  # best (lowest) value seen so far
+    min_pos = 0
+    for _ in ti.ndrange(1):  # NOTE(review): dummy outer loop, presumably to keep the scan below serial — confirm
+        for ii in ti.ndrange(_search_num):
+            if _fit[ii] < min_fit:  # strictly smaller wins (the reversed test tracked the maximum)
+                min_fit = _fit[ii]
+                min_pos = ii
+    return min_pos
+
+
+@ti.func
+def rand_int(low: ti.i32, high: ti.i32) -> ti.i32:  # random integer in [low, high)
+    r = ti.random(float)  # NOTE(review): documented as uniform in [0, 1) — confirm
+    _res = r * (high - low) + low
+    # floor keeps the result below `high`; rounding could return `high`, which callers use as a row index
+    return ti.floor(_res, dtype=ti.i32)
+
+
+@ti.func
+def copy_pop_to_field(_pop: ti.template(), _trial: ti.template(), ind: ti.i32):  # _trial[:] = _pop[ind, :]
+    _, _dim = _pop.shape  # only the column count is needed
+    for j in range(_dim):
+        _trial[j] = _pop[ind, j]
+
+
+@ti.func
+def copy_field_to_pop(_pop: ti.template(), _trial: ti.template(), ind: ti.i32):  # _pop[ind, :] = _trial[:]
+    _, _dim = _pop.shape  # column count taken from the argument, not from the module-level `dim`
+    for j in range(_dim):
+        _pop[ind, j] = _trial[j]
+
+
+@ti.func
+def copy_2d_to_3d(a: ti.template(), b: ti.template(), _iter: ti.i32):  # a[_iter, :, :] = b[:, :]
+    r, c = b.shape  # extent comes from the 2-D source
+    for ii, j in ti.ndrange(r, c):
+        a[_iter, ii, j] = b[ii, j]
+
+
+@ti.func
+def copy_field_a_to_b(a: ti.template(), b: ti.template()):  # element-wise copy of 1-D field a into b
+    _dim = a.shape[0]  # length is taken from the source; b must be at least as long
+    for j in range(_dim):
+        b[j] = a[j]
+
+
+@ti.func
+def de_crossover(_pop: ti.template(), _trial: ti.template(), a: ti.i32, b: ti.i32, c: ti.i32):
+    _, _dim = _pop.shape
+    CR = 0.5  # per-component probability of taking the mutated value
+    para_F = 0.7  # scale applied to the difference of rows a and b
+    for k in range(_dim):
+        r = ti.random(float)
+        if r < CR or k == _dim - 1:  # the last component is always overwritten
+            _trial[k] = _pop[c, k] + para_F * (_pop[a, k] - _pop[b, k])  # uses the argument, not the global `pop`
+
+
+@ti.func
+def de_loop(  # one generation: try to improve every agent, return the updated best fitness
+    _pop: ti.template(),
+    all_best: ti.float32,
+    _fit: ti.template(),
+    _trial: ti.template(),
+    _lb: ti.template(),
+    _ub: ti.template(),
+) -> ti.float32:
+    _search_num, _ = _pop.shape
+    for ii in range(_search_num):
+        copy_pop_to_field(_pop=_pop, _trial=_trial, ind=ii)  # trial starts as a copy of agent ii
+
+        a = rand_int(low=0, high=_search_num)  # a, b, c: re-drawn until distinct from ii and each other
+        while a == ii:
+            a = rand_int(low=0, high=_search_num)
+
+        b = rand_int(low=0, high=_search_num)
+        while b == ii or a == b:
+            b = rand_int(low=0, high=_search_num)
+
+        c = rand_int(low=0, high=_search_num)  # needs >= 4 agents, else these loops cannot terminate
+        while c == ii or c == a or c == b:
+            c = rand_int(low=0, high=_search_num)
+
+        de_crossover(_pop=_pop, _trial=_trial, a=a, b=b, c=c)
+        clip_only(_trial=_trial, _lb=_lb, _ub=_ub)  # keep the candidate inside the bounds
+        next_fit = f1_only(_trial=_trial)
+        if next_fit < _fit[ii]:  # accept the candidate only when it is strictly better
+            copy_field_to_pop(_pop=_pop, _trial=_trial, ind=ii)
+            _fit[ii] = next_fit
+            if next_fit < all_best:
+                all_best = next_fit
+                copy_field_a_to_b(a=_trial, b=best_pop)  # `best_pop` is the module-level field, not a parameter
+
+    return all_best
+
+
+@ti.kernel
+def DE(  # kernel entry point: evaluate the initial population, then run _max_iter generations
+    _pop: ti.template(),
+    _max_iter: ti.i32,
+    _lb: ti.template(),
+    _ub: ti.template(),
+    _fit: ti.template(),
+    _best_fit: ti.template(),
+    _trial: ti.template(),
+):
+    f1(_fit=_fit, _pop=_pop)  # fitness of the initial population
+    min_pos = find_min(_fit=_fit)
+    all_best = _fit[min_pos]
+    _best_fit[0] = all_best  # slot 0 records the state before any generation
+    copy_2d_to_3d(a=all_pop, b=_pop, _iter=0)  # `all_pop` is the module-level history field
+
+    for _ in range(1):  # NOTE(review): dummy outer loop, presumably to keep the generations serial — confirm
+        for cur_iter in range(1, _max_iter + 1):  # indices 1.._max_iter, so histories need _max_iter + 1 slots
+            all_best = de_loop(_pop=_pop, _fit=_fit, all_best=all_best, _trial=_trial, _lb=_lb, _ub=_ub)
+            _best_fit[cur_iter] = all_best
+            copy_2d_to_3d(a=all_pop, b=_pop, _iter=cur_iter)
+
+
+search_num = 20  # number of search agents
+dim = 2  # dimensionality of each agent
+max_iter = 50  # number of generations run by DE
+
+_lb = np.ones(dim).astype(np.int32) * (-100)
+lb = ti.field(ti.i32, shape=dim)
+lb.from_numpy(_lb)
+
+_ub = np.ones(dim).astype(np.int32) * 100
+ub = ti.field(ti.i32, shape=dim)
+ub.from_numpy(_ub)
+
+pop = ti.field(ti.float32, shape=(search_num, dim))
+pop.from_numpy((np.random.random((search_num, dim)) * (_ub - _lb) + _lb).astype(np.float32))
+
+fit = ti.field(ti.float32, shape=(search_num,))
+best_fit = ti.field(ti.float32, shape=(max_iter + 1,))  # DE writes indices 0..max_iter inclusive
+best_pop = ti.field(ti.float32, shape=(dim,))  # one position vector, so sized by dim
+all_pop = ti.field(ti.float32, shape=(max_iter + 1, search_num, dim))  # snapshot 0 plus one per generation
+
+trial = ti.field(ti.float32, shape=(dim,))  # clip_only/f1_only walk its full length against lb/ub
+
+DE(_pop=pop, _max_iter=max_iter, _lb=lb, _ub=ub, _fit=fit, _best_fit=best_fit, _trial=trial)
+
+res = best_fit.to_numpy()
+
+
+@ti.kernel
+def draw_contour():  # fills the module-level field z with the objective sampled on the x/y grid
+    for ii, j in ti.ndrange(201, 201):  # 201 matches the length of the x and y fields
+        z[ii, j] = x[ii] ** 2 + y[j] ** 2
+
+
+_x = np.arange(-100, 101, 1)
+x = ti.field(ti.float32, shape=201)
+x.from_numpy(_x)
+_y = np.arange(-100, 101, 1)
+y = ti.field(ti.float32, shape=201)
+y.from_numpy(_y)
+z = ti.field(ti.float32, shape=(201, 201))
+
+draw_contour()
+
+_z = z.to_numpy()
+_pop = all_pop.to_numpy()
+
+plt.ion()
+
+"""2d visualization"""
+plt.contourf(_x, _y, _z)
+plt.colorbar()
+
+for i in range(max_iter):
+    plt.cla()
+    plt.contourf(_x, _y, _z)
+    plt.scatter(_pop[i, :, 0], _pop[i, :, 1], color="black")
+    plt.title(f"cur_iter: {i}, best_fit: {best_fit[i]:.2f}")
+    # plt.savefig(f"./2dimg/iter-{i}.png")
+    plt.pause(0.5)
+
+
+# import imageio.v2 as imageio
+# import os
+#
+# png_ls = os.listdir("./img")
+# f = []
+# for i in png_ls:
+#     f.append(imageio.imread("./img/" + i))
+#
+# imageio.mimsave("res.gif", f, "GIF", duration=0.5)
+
+
+"""3d visualization"""
+mesh_x, mesh_y = np.meshgrid(_x, _y)
+
+fig = plt.figure()
+ax = Axes3D(fig, auto_add_to_figure=False)
+ax.view_init(elev=51, azim=-70)
+fig.add_axes(ax)
+ax.plot_surface(mesh_x, mesh_y, _z, cmap="viridis", alpha=0.7)
+
+for i in range(max_iter):  # one frame per recorded generation
+    ax.cla()
+    ax.plot_surface(mesh_x, mesh_y, _z, cmap="viridis", alpha=0.7)
+
+    row = []
+    col = []
+    val = []
+    nr, _ = _pop[i, :, :].shape
+    for _i in range(nr):  # every agent, including the last (was range(nr - 1))
+        row.append(np.round(_pop[i, _i, 0]).astype(int))
+        col.append(np.round(_pop[i, _i, 1]).astype(int))
+        val.append(_z[np.round(_pop[i, _i, 0]).astype(int) + 100, np.round(_pop[i, _i, 1]).astype(int) + 100])
+
+    ax.scatter3D(row, col, val, color="black")  # +100 above maps coordinates in [-100, 100] to grid indices
+    # plt.savefig(f"./3dimg/iter-{i}.png")
+    plt.pause(0.5)
+#
+#
+# import imageio.v2 as imageio
+# import os
+#
+# png_ls = os.listdir("./3dimg")
+# f = []
+# for i in png_ls:
+#     f.append(imageio.imread("./3dimg/" + i))
+# imageio.mimsave("3dres.gif", f, "GIF", duration=0.5)
diff --git a/python/taichi/graph/_graph.py b/python/taichi/graph/_graph.py
index bcdef533e392a..f33ceb5e526a2 100644
--- a/python/taichi/graph/_graph.py
+++ b/python/taichi/graph/_graph.py
@@ -235,15 +235,7 @@ def _make_arg_matrix(kwargs: Dict[str, Any]):
     dtype = kwargs["dtype"]
     if not isinstance(dtype, MatrixType):
         raise TaichiRuntimeError(f"Tag ArgKind.MATRIX must specify matrix type, but got {dtype}.")
-    arg_list = []
-    i = 0
-    for _ in range(dtype.n):
-        arg_sublist = []
-        for _ in range(dtype.m):
-            arg_sublist.append(_ti_core.Arg(ArgKind.MATRIX, f"{name}_mat_arg_{i}", dtype.dtype, 0, []))
-            i += 1
-        arg_list.append(arg_sublist)
-    return arg_list
+    return _ti_core.Arg(ArgKind.MATRIX, f"{name}_mat_arg", dtype.dtype, 0, [dtype.n, dtype.m])
 
 
 def _make_arg_texture(kwargs: Dict[str, Any]):
diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py
index d211f9e410ecf..8dc92ad81614e 100644
--- a/python/taichi/lang/ast/ast_transformer.py
+++ b/python/taichi/lang/ast/ast_transformer.py
@@ -434,17 +434,6 @@ def build_call_if_is_builtin(ctx, node, args, keywords):
 
         if id(func) in replace_func:
             node.ptr = replace_func[id(func)](*args, **keywords)
-            if func is min or func is max:
-                name = "min" if func is min else "max"
-                warnings.warn_explicit(
-                    f'Calling builtin function "{name}" in Taichi scope is deprecated, '
-                    f"and it will be removed in Taichi v1.6.0."
-                    f'Please use "ti.{name}" instead.',
-                    DeprecationWarning,
-                    ctx.file,
-                    node.lineno + ctx.lineno_offset,
-                    module="taichi",
-                )
             return True
         return False
 
@@ -1023,28 +1012,19 @@ def build_Compare(ctx, node):
         ops_static = {
             ast.In: lambda l, r: l in r,
             ast.NotIn: lambda l, r: l not in r,
-            ast.Is: lambda l, r: l is r,
-            ast.IsNot: lambda l, r: l is not r,
         }
         if ctx.is_in_static_scope():
             ops = {**ops, **ops_static}
         operands = [node.left.ptr] + [comparator.ptr for comparator in node.comparators]
         val = True
         for i, node_op in enumerate(node.ops):
+            if isinstance(node_op, (ast.Is, ast.IsNot)):
+                name = "is" if isinstance(node_op, ast.Is) else "is not"
+                raise TaichiSyntaxError(f'Operator "{name}" in Taichi scope is not supported.')
             l = operands[i]
             r = operands[i + 1]
             op = ops.get(type(node_op))
-            if isinstance(node_op, (ast.Is, ast.IsNot)):
-                name = "is" if isinstance(node_op, ast.Is) else "is not"
-                warnings.warn_explicit(
-                    f'Operator "{name}" in Taichi scope is deprecated, '
-                    f"and it will be removed in Taichi v1.6.0. "
-                    f"Please avoid using it.",
-                    DeprecationWarning,
-                    ctx.file,
-                    node.lineno + ctx.lineno_offset,
-                    module="taichi",
-                )
+
             if op is None:
                 if type(node_op) in ops_static:
                     raise TaichiSyntaxError(f'"{type(node_op).__name__}" is only supported inside `ti.static`.')
@@ -1165,16 +1145,11 @@ def build_ndrange_for(ctx, node):
             I = impl.expr_init(ndrange_loop_var)
             targets = ASTTransformer.get_for_loop_targets(node)
             if len(targets) != len(ndrange_var.dimensions):
-                warnings.warn_explicit(
+                raise TaichiSyntaxError(
                     "Ndrange for loop with number of the loop variables not equal to "
-                    "the dimension of the ndrange is deprecated, "
-                    "and it will be removed in Taichi 1.6.0. "
+                    "the dimension of the ndrange is not supported. "
                     "Please check if the number of arguments of ti.ndrange() is equal to "
-                    "the number of the loop variables.",
-                    DeprecationWarning,
-                    ctx.file,
-                    node.lineno + ctx.lineno_offset,
-                    module="taichi",
+                    "the number of the loop variables."
                 )
             for i, target in enumerate(targets):
                 if i + 1 < len(targets):
diff --git a/python/taichi/lang/common_ops.py b/python/taichi/lang/common_ops.py
index 630d853d858f4..e5ef364cb8795 100644
--- a/python/taichi/lang/common_ops.py
+++ b/python/taichi/lang/common_ops.py
@@ -1,30 +1,16 @@
-import warnings
-
 from taichi.lang import ops
 from taichi.lang.util import in_python_scope
 from taichi.types import primitive_types
+from typing import TYPE_CHECKING
 
 
 class TaichiOperations:
     """The base class of taichi operations of expressions. Subclasses: :class:`~taichi.lang.expr.Expr`, :class:`~taichi.lang.matrix.Matrix`"""
 
-    __deprecated_atomic_ops__ = {
-        "atomic_add": "_atomic_add",
-        "atomic_mul": "_atomic_mul",
-        "atomic_and": "_atomic_and",
-        "atomic_or": "_atomic_or",
-        "atomic_sub": "_atomic_sub",
-        "atomic_xor": "_atomic_xor",
-    }
-
-    def __getattr__(self, item):
-        if item in TaichiOperations.__deprecated_atomic_ops__:
-            warnings.warn(
-                f"a.{item}(b) is deprecated, and it will be removed in Taichi v1.6.0. Please use ti.{item}(a, b) instead.",
-                DeprecationWarning,
-            )
-            return getattr(self, TaichiOperations.__deprecated_atomic_ops__[item])
-        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'")
+    if TYPE_CHECKING:
+        # Make pylint happy
+        def __getattr__(self, item):
+            pass
 
     def __neg__(self):
         return ops.neg(self)
diff --git a/python/taichi/lang/kernel_arguments.py b/python/taichi/lang/kernel_arguments.py
index 1f1089090308a..c15b8555a8f34 100644
--- a/python/taichi/lang/kernel_arguments.py
+++ b/python/taichi/lang/kernel_arguments.py
@@ -7,7 +7,7 @@
 from taichi.lang.any_array import AnyArray
 from taichi.lang.enums import Layout
 from taichi.lang.expr import Expr
-from taichi.lang.matrix import MatrixType, VectorType, make_matrix
+from taichi.lang.matrix import MatrixType
 from taichi.lang.struct import StructType
 from taichi.lang.util import cook_dtype
 from taichi.types.primitive_types import RefType, u64
@@ -82,14 +82,10 @@ def get_type_for_kernel_args(dtype, name):
 
 
 def decl_matrix_arg(matrixtype, name):
-    if isinstance(matrixtype, VectorType):
-        return make_matrix([decl_scalar_arg(matrixtype.dtype, f"{name}_{i}") for i in range(matrixtype.n)])
-    return make_matrix(
-        [
-            [decl_scalar_arg(matrixtype.dtype, f"{name}_{i}_{j}") for i in range(matrixtype.m)]
-            for j in range(matrixtype.n)
-        ]
-    )
+    arg_type = get_type_for_kernel_args(matrixtype, name)
+    arg_id = impl.get_runtime().compiling_callable.insert_scalar_param(arg_type, name)
+    arg_load = Expr(_ti_core.make_arg_load_expr(arg_id, arg_type, create_load=False))
+    return matrixtype.from_taichi_object(arg_load)
 
 
 def decl_struct_arg(structtype, name):
diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py
index 226f8ef9ae5c8..6467174deb64f 100644
--- a/python/taichi/lang/kernel_impl.py
+++ b/python/taichi/lang/kernel_impl.py
@@ -757,39 +757,33 @@ def call_back():
                         )
                     else:
                         raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
-
                 else:
                     raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
 
             elif isinstance(needed, MatrixType):
                 if needed.dtype in primitive_types.real_types:
-                    for a in range(needed.n):
-                        for b in range(needed.m):
-                            if actual_argument_slot >= max_arg_num:
-                                exceed_max_arg_num = True
-                                break
-                            val = v[a, b] if needed.ndim == 2 else v[a]
-                            if not isinstance(val, (int, float, np.integer, np.floating)):
-                                raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(val))
-                            launch_ctx.set_arg_float(actual_argument_slot, float(val))
-                            actual_argument_slot += 1
+
+                    def cast_func(x):
+                        if not isinstance(x, (int, float, np.integer, np.floating)):
+                            raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x))
+                        return float(x)
+
                 elif needed.dtype in primitive_types.integer_types:
-                    for a in range(needed.n):
-                        for b in range(needed.m):
-                            if actual_argument_slot >= max_arg_num:
-                                exceed_max_arg_num = True
-                                break
-                            val = v[a, b] if needed.ndim == 2 else v[a]
-                            if not isinstance(val, (int, np.integer)):
-                                raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(val))
-                            if is_signed(needed.dtype):
-                                launch_ctx.set_arg_int(actual_argument_slot, int(val))
-                            else:
-                                launch_ctx.set_arg_uint(actual_argument_slot, int(val))
-                            actual_argument_slot += 1
+
+                    def cast_func(x):
+                        if not isinstance(x, (int, np.integer)):
+                            raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x))
+                        return int(x)
+
                 else:
                     raise ValueError(f"Matrix dtype {needed.dtype} is not integer type or real type.")
-                continue
+
+                if needed.ndim == 2:
+                    v = [cast_func(v[i, j]) for i in range(needed.n) for j in range(needed.m)]
+                else:
+                    v = [cast_func(v[i]) for i in range(needed.n)]
+                v = needed(*v)
+                needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,))
             elif isinstance(needed, StructType):
                 needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,))
             else:
diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index 7d7fb13e531d9..0aa5a9954169c 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -1482,7 +1482,8 @@ def _instantiate_in_python_scope(self, entries):
                     for j in range(self.m)
                 ]
                 for i in range(self.n)
-            ]
+            ],
+            dt=self.dtype,
         )
 
     def _instantiate(self, entries):
@@ -1569,7 +1570,8 @@ def _instantiate_in_python_scope(self, entries):
             [
                 int(entries[i]) if self.dtype in primitive_types.integer_types else float(entries[i])
                 for i in range(self.n)
-            ]
+            ],
+            dt=self.dtype,
         )
 
     def _instantiate(self, entries):
diff --git a/python/taichi/linalg/__init__.py b/python/taichi/linalg/__init__.py
index 2bb594ffe273f..c4e28878b1b59 100644
--- a/python/taichi/linalg/__init__.py
+++ b/python/taichi/linalg/__init__.py
@@ -1,6 +1,6 @@
 """Taichi support module for sparse matrix operations.
 """
-from taichi.linalg.cg import CG
+from taichi.linalg.sparse_cg import SparseCG
 from taichi.linalg.sparse_matrix import *
 from taichi.linalg.sparse_solver import SparseSolver
-from taichi.linalg.taichi_cg import *
+from taichi.linalg.matrixfree_cg import *
diff --git a/python/taichi/linalg/taichi_cg.py b/python/taichi/linalg/matrixfree_cg.py
similarity index 82%
rename from python/taichi/linalg/taichi_cg.py
rename to python/taichi/linalg/matrixfree_cg.py
index bb39c0af734f0..0d333f4bbcea0 100644
--- a/python/taichi/linalg/taichi_cg.py
+++ b/python/taichi/linalg/matrixfree_cg.py
@@ -16,7 +16,21 @@ def matvec(self, x, Ax):
         self._matvec(x, Ax)
 
 
-def taichi_cg_solver(A, b, x, tol=1e-6, maxiter=5000, quiet=True):
+def MatrixFreeCG(A, b, x, tol=1e-6, maxiter=5000, quiet=True):
+    """Matrix-free conjugate-gradient solver.
+
+    Use conjugate-gradient method to solve the linear system Ax = b, where A is implicitly
+    represented as a LinearOperator.
+
+    Args:
+        A (LinearOperator): The coefficient matrix A of the linear system.
+        b (Field): The right-hand side of the linear system.
+        x (Field): The initial guess for the solution.
+        maxiter (int): Maximum number of iterations.
+        tol (float): Tolerance(absolute) for convergence.
+        quiet (bool): Switch to turn on/off iteration log.
+    """
+
     if b.dtype != x.dtype:
         raise TaichiTypeError(f"Dtype mismatch b.dtype({b.dtype}) != x.dtype({x.dtype}).")
     if str(b.dtype) == "f32":
diff --git a/python/taichi/linalg/cg.py b/python/taichi/linalg/sparse_cg.py
similarity index 78%
rename from python/taichi/linalg/cg.py
rename to python/taichi/linalg/sparse_cg.py
index 21973641e4db3..e0aba8112c690 100644
--- a/python/taichi/linalg/cg.py
+++ b/python/taichi/linalg/sparse_cg.py
@@ -6,7 +6,19 @@
 from taichi.types import f32, f64
 
 
-class CG:
+class SparseCG:
+    """Conjugate-gradient solver built for SparseMatrix.
+
+    Use conjugate-gradient method to solve the linear system Ax = b, where A is SparseMatrix.
+
+    Args:
+        A (SparseMatrix): The coefficient matrix A of the linear system.
+        b (numpy ndarray, taichi Ndarray): The right-hand side of the linear system.
+        x0 (numpy ndarray, taichi Ndarray): The initial guess for the solution.
+        max_iter (int): Maximum number of iterations.
+        atol (float): Tolerance(absolute) for convergence.
+    """
+
     def __init__(self, A, b, x0=None, max_iter=50, atol=1e-6):
         self.dtype = A.dtype
         self.ti_arch = get_runtime().prog.config().arch
diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py
index 9cd064d62aac9..443afd933aae3 100644
--- a/python/taichi/linalg/sparse_matrix.py
+++ b/python/taichi/linalg/sparse_matrix.py
@@ -1,4 +1,3 @@
-import warnings
 from functools import reduce
 
 import numpy as np
@@ -7,7 +6,7 @@
 from taichi.lang.exception import TaichiRuntimeError
 from taichi.lang.field import Field
 from taichi.lang.impl import get_runtime
-from taichi.types import annotations, f32
+from taichi.types import f32
 
 
 class SparseMatrix:
@@ -294,14 +293,4 @@ def build(self, dtype=f32, _format="CSR"):
         raise TaichiRuntimeError("Sparse matrix only supports CPU and CUDA backends.")
 
 
-# TODO: remove this in 1.0 release
-class sparse_matrix_builder(annotations.sparse_matrix_builder):
-    def __init__(self):
-        warnings.warn(
-            "ti.linalg.sparse_matrix_builder is deprecated, and it will be removed in Taichi v1.6.0. "
-            "Please use ti.types.sparse_matrix_builder instead.",
-            DeprecationWarning,
-        )
-
-
-__all__ = ["SparseMatrix", "SparseMatrixBuilder", "sparse_matrix_builder"]
+__all__ = ["SparseMatrix", "SparseMatrixBuilder"]
diff --git a/python/taichi/math/mathimpl.py b/python/taichi/math/mathimpl.py
index f445e4fd556a0..947cea1fb71e9 100644
--- a/python/taichi/math/mathimpl.py
+++ b/python/taichi/math/mathimpl.py
@@ -2,7 +2,7 @@
 """
 Math functions for glsl-like functions and other stuff.
 """
-from math import e, inf, nan, pi
+import math
 
 from taichi.lang import impl, ops
 from taichi.lang.impl import static, zero
@@ -31,6 +31,26 @@
 
 cfg = impl.default_cfg
 
+e = math.e
+"""The mathematical constant e = 2.718281….
+Directly imported from the Python standard library `math`.
+"""
+
+pi = math.pi
+"""The mathematical constant π = 3.141592….
+Directly imported from the Python standard library `math`.
+"""
+
+inf = math.inf
+"""A floating-point positive infinity. (For negative infinity, use `-inf`).
+Directly imported from the Python standard library `math`.
+"""
+
+nan = math.nan
+"""A floating-point "not a number" (NaN) value.
+Directly imported from the Python standard library `math`.
+"""
+
 vec2 = vector(2, cfg().default_fp)
 """2D floating vector type.
 """
@@ -84,7 +104,7 @@
 def mix(x, y, a):
     """Performs a linear interpolation between `x` and `y` using
     `a` to weight between them. The return value is computed as
-    :math:`x\times a + (1-a)\times y`.
+    `x * (1 - a) + a * y`.
 
     The arguments can be scalars or :class:`~taichi.Matrix`,
     as long as the operation can be performed.
diff --git a/python/taichi/ui/ui.py b/python/taichi/ui/ui.py
index 8ed96100e9b4b..2252057ec9bfc 100644
--- a/python/taichi/ui/ui.py
+++ b/python/taichi/ui/ui.py
@@ -1,8 +1,6 @@
-import warnings
-
 from taichi._lib import core as _ti_core
 
-from .camera import Camera
+from .camera import Camera  # pylint: disable=unused-import
 from .canvas import Canvas  # pylint: disable=unused-import
 from .constants import *  # pylint: disable=unused-import,wildcard-import
 from .imgui import Gui  # pylint: disable=unused-import
@@ -11,22 +9,6 @@
 from .window import Window  # pylint: disable=unused-import
 
 
-def make_camera():
-    """Return an instance of :class:`~taichi.ui.Camera`. This is an deprecated
-    interface, please construct `~taichi.ui.Camera` directly.
-
-    Example::
-
-        >>> camera = ti.ui.make_camera()
-    """
-    warnings.warn(
-        "`ti.ui.make_camera()` is deprecated, and will be removed in Taichi v1.6.0. "
-        "Please use `ti.ui.Camera()` instead",
-        DeprecationWarning,
-    )
-    return Camera()
-
-
 # ----------------------
 ProjectionMode = _ti_core.ProjectionMode if _ti_core.GGUI_AVAILABLE else None
 """Camera projection mode, 0 for perspective and 1 for orthogonal.
diff --git a/python/taichi/ui/window.py b/python/taichi/ui/window.py
index 18670d4dd8506..397695ddd5b36 100644
--- a/python/taichi/ui/window.py
+++ b/python/taichi/ui/window.py
@@ -1,5 +1,4 @@
 import pathlib
-import warnings
 
 import numpy
 from taichi._kernels import (
@@ -145,20 +144,6 @@ def get_window_shape(self):
         """
         return self.window.get_window_shape()
 
-    def write_image(self, filename):
-        """Save the window content to an image file. This is an deprecated
-        interface; please use `save_image` instead.
-
-        Args:
-            filename (str): output filename.
-        """
-        warnings.warn(
-            "`Window.write_image()` is deprecated, and it will be removed in Taichi v1.6.0. "
-            "Please use `Window.save_image()` instead.",
-            DeprecationWarning,
-        )
-        return self.save_image(filename)
-
     def save_image(self, filename):
         """Save the window content to an image file.
 
diff --git a/taichi/aot/graph_data.cpp b/taichi/aot/graph_data.cpp
index 2423f864e3cc9..caa03f5e2fb97 100644
--- a/taichi/aot/graph_data.cpp
+++ b/taichi/aot/graph_data.cpp
@@ -44,6 +44,41 @@ void CompiledGraph::init_runtime_context(
     LaunchContextBuilder &ctx) {
   for (int i = 0; i < paramter_list.size(); ++i) {
     auto &symbolic_arg = paramter_list[i];
+    if (symbolic_arg.tag == aot::ArgKind::kMatrix) {
+      int size = symbolic_arg.element_shape[0] * symbolic_arg.element_shape[1];
+      const int type_size = data_type_size(symbolic_arg.dtype());
+      for (int j = 0; j < size; j++) {  // one scalar value per element
+        const std::string key = symbolic_arg.name + "_" + std::to_string(j);
+        auto found = args.find(key);
+        TI_ERROR_IF(found == args.end(), "Missing runtime value for {}", key);
+        const aot::IValue &ival = found->second;
+        TI_ASSERT(ival.tag == aot::ArgKind::kScalar);
+        switch (type_size) {
+          case 1:
+            ctx.set_struct_arg_impl(
+                {i, j}, taichi_union_cast_with_different_sizes<int8>(ival.val));
+            break;
+          case 2:
+            ctx.set_struct_arg_impl(
+                {i, j},
+                taichi_union_cast_with_different_sizes<int16>(ival.val));
+            break;
+          case 4:
+            ctx.set_struct_arg_impl(
+                {i, j},
+                taichi_union_cast_with_different_sizes<int32>(ival.val));
+            break;
+          case 8:
+            ctx.set_struct_arg_impl(
+                {i, j},
+                taichi_union_cast_with_different_sizes<int64>(ival.val));
+            break;
+          default:
+            TI_ERROR("Unsupported type size {}", type_size);
+        }
+      }
+      continue;  // matrix fully handled; skip the single-value lookup below
+    }
     auto found = args.find(symbolic_arg.name);
     TI_ERROR_IF(found == args.end(), "Missing runtime value for {}",
                 symbolic_arg.name);
@@ -89,8 +124,7 @@ void CompiledGraph::init_runtime_context(
                   symbolic_arg.name, symbolic_arg_primitive_dtype.to_string(),
                   arr_primitive_dtype.to_string());
       ctx.set_arg_ndarray(i, *arr);
-    } else if (symbolic_arg.tag == aot::ArgKind::kScalar ||
-               symbolic_arg.tag == aot::ArgKind::kMatrix) {
+    } else if (symbolic_arg.tag == aot::ArgKind::kScalar) {
       TI_ASSERT(ival.tag == aot::ArgKind::kScalar);
       // Matrix args are flattened so they're same as scalars.
       int type_size = data_type_size(symbolic_arg.dtype());
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 28e95d8f729e5..2c2bea8285827 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -1898,10 +1898,11 @@ void TaskCodeGenLLVM::visit(ExternalPtrStmt *stmt) {
       (layout == ExternalArrayLayout::kAOS) ? num_array_args : 0;
 
   for (int i = 0; i < num_array_args; i++) {
-    auto raw_arg =
-        builder->CreateGEP(struct_type, llvm_val[stmt->base_ptr],
-                           {tlctx->get_constant(0), tlctx->get_constant(0),
-                            tlctx->get_constant(i)});
+    auto raw_arg = builder->CreateGEP(
+        struct_type, llvm_val[stmt->base_ptr],
+        {tlctx->get_constant(0),
+         tlctx->get_constant(TypeFactory::SHAPE_POS_IN_NDARRAY),
+         tlctx->get_constant(i)});
     raw_arg =
         builder->CreateLoad(tlctx->get_data_type(PrimitiveType::i32), raw_arg);
     sizes[i] = raw_arg;
@@ -1971,16 +1972,8 @@ void TaskCodeGenLLVM::visit(ExternalPtrStmt *stmt) {
 void TaskCodeGenLLVM::visit(ExternalTensorShapeAlongAxisStmt *stmt) {
   const auto arg_id = stmt->arg_id;
   const auto axis = stmt->axis;
-  if (auto struct_type = current_callable->args_type->get_element_type({arg_id})
-                             ->cast<StructType>()) {
-    // Is ndarray
-    llvm_val[stmt] = get_struct_arg({arg_id, 0, axis}, /*create_load=*/true);
-  } else {
-    // Is texture
-    llvm_val[stmt] =
-        call("RuntimeContext_get_extra_args", get_context(),
-             tlctx->get_constant(arg_id), tlctx->get_constant(axis));
-  }
+  llvm_val[stmt] = get_struct_arg(
+      {arg_id, TypeFactory::SHAPE_POS_IN_NDARRAY, axis}, /*create_load=*/true);
 }
 
 std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt,
diff --git a/taichi/codegen/spirv/kernel_utils.cpp b/taichi/codegen/spirv/kernel_utils.cpp
index e7cd29c3ab95e..d485e3269c2cb 100644
--- a/taichi/codegen/spirv/kernel_utils.cpp
+++ b/taichi/codegen/spirv/kernel_utils.cpp
@@ -50,9 +50,7 @@ std::string TaskAttributes::BufferBind::debug_string() const {
 KernelContextAttributes::KernelContextAttributes(
     const Kernel &kernel,
     const DeviceCapabilityConfig *caps)
-    : args_bytes_(0),
-      rets_bytes_(0),
-      extra_args_bytes_(RuntimeContext::extra_args_size) {
+    : args_bytes_(0), rets_bytes_(0) {
   arr_access.resize(kernel.parameter_list.size(), irpass::ExternalPtrAccess(0));
   arg_attribs_vec_.reserve(kernel.parameter_list.size());
   // TODO: We should be able to limit Kernel args and rets to be primitive types
diff --git a/taichi/codegen/spirv/kernel_utils.h b/taichi/codegen/spirv/kernel_utils.h
index 53990125b8900..16fce6aa35fb0 100644
--- a/taichi/codegen/spirv/kernel_utils.h
+++ b/taichi/codegen/spirv/kernel_utils.h
@@ -229,23 +229,6 @@ class KernelContextAttributes {
     return rets_bytes_;
   }
 
-  /**
-   * Number of bytes needed by the extra arguments.
-   *
-   * Extra argument region is used to store some metadata, like the shape of the
-   * external array.
-   */
-  inline size_t extra_args_bytes() const {
-    return extra_args_bytes_;
-  }
-
-  /**
-   * Offset (in bytes) of the extra arguments in the memory.
-   */
-  inline size_t extra_args_mem_offset() const {
-    return args_bytes();
-  }
-
   /**
    * The type of the struct that contains all the arguments.
    */
@@ -266,7 +249,6 @@ class KernelContextAttributes {
             ret_attribs_vec_,
             args_bytes_,
             rets_bytes_,
-            extra_args_bytes_,
             arr_access,
             args_type_,
             rets_type_);
@@ -277,7 +259,6 @@ class KernelContextAttributes {
 
   size_t args_bytes_{0};
   size_t rets_bytes_{0};
-  size_t extra_args_bytes_{0};
 
   const lang::StructType *args_type_{nullptr};
   const lang::StructType *rets_type_{nullptr};
diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp
index 8d266326ddeef..26c3cffd41447 100644
--- a/taichi/codegen/spirv/spirv_codegen.cpp
+++ b/taichi/codegen/spirv/spirv_codegen.cpp
@@ -633,30 +633,18 @@ class TaskCodegen : public IRVisitor {
     const auto arg_id = stmt->arg_id;
     const auto axis = stmt->axis;
 
-    const auto extra_args_member_index = ctx_attribs_->args().size();
-
-    const auto extra_arg_index = (arg_id * taichi_max_num_indices) + axis;
     spirv::Value var_ptr;
-    if (ctx_attribs_->args_type()
-            ->get_element_type({arg_id})
-            ->is<lang::StructType>()) {
-      // Is ndarray
-      var_ptr = ir_->make_value(
-          spv::OpAccessChain,
-          ir_->get_pointer_type(ir_->i32_type(), spv::StorageClassUniform),
-          get_buffer_value(BufferType::Args, PrimitiveType::i32),
-          ir_->int_immediate_number(ir_->i32_type(), arg_id),
-          ir_->int_immediate_number(ir_->i32_type(), 0),
-          ir_->int_immediate_number(ir_->i32_type(), axis));
-    } else {
-      // Is texture
-      var_ptr = ir_->make_value(
-          spv::OpAccessChain,
-          ir_->get_pointer_type(ir_->i32_type(), spv::StorageClassUniform),
-          get_buffer_value(BufferType::Args, PrimitiveType::i32),
-          ir_->int_immediate_number(ir_->i32_type(),
-                                    extra_args_member_index + extra_arg_index));
-    }
+    TI_ASSERT(ctx_attribs_->args_type()
+                  ->get_element_type({arg_id})
+                  ->is<lang::StructType>());
+    var_ptr = ir_->make_value(
+        spv::OpAccessChain,
+        ir_->get_pointer_type(ir_->i32_type(), spv::StorageClassUniform),
+        get_buffer_value(BufferType::Args, PrimitiveType::i32),
+        ir_->int_immediate_number(ir_->i32_type(), arg_id),
+        ir_->int_immediate_number(ir_->i32_type(),
+                                  TypeFactory::SHAPE_POS_IN_NDARRAY),
+        ir_->int_immediate_number(ir_->i32_type(), axis));
     spirv::Value var = ir_->load_variable(var_ptr, ir_->i32_type());
 
     ir_->register_value(name, var);
@@ -685,7 +673,8 @@ class TaskCodegen : public IRVisitor {
             ir_->get_pointer_type(ir_->i32_type(), spv::StorageClassUniform),
             get_buffer_value(BufferType::Args, PrimitiveType::i32),
             ir_->int_immediate_number(ir_->i32_type(), arg_id),
-            ir_->int_immediate_number(ir_->i32_type(), 0),
+            ir_->int_immediate_number(ir_->i32_type(),
+                                      TypeFactory::SHAPE_POS_IN_NDARRAY),
             ir_->int_immediate_number(ir_->i32_type(), i));
         spirv::Value var = ir_->load_variable(var_ptr, ir_->i32_type());
         ir_->register_value(var_name, var);
@@ -2236,11 +2225,6 @@ class TaskCodegen : public IRVisitor {
       element_types.push_back(
           translate_ti_type(blk, element.type, has_buffer_ptr));
     }
-    const tinyir::Type *i32_type =
-        blk.emplace_back<IntType>(/*num_bits=*/32, /*is_signed=*/true);
-    for (int i = 0; i < ctx_attribs_->extra_args_bytes() / 4; i++) {
-      element_types.push_back(i32_type);
-    }
     const tinyir::Type *struct_type =
         blk.emplace_back<StructType>(element_types);
 
diff --git a/taichi/ir/type_factory.cpp b/taichi/ir/type_factory.cpp
index 3e3e51e877c78..b63aee0662767 100644
--- a/taichi/ir/type_factory.cpp
+++ b/taichi/ir/type_factory.cpp
@@ -188,6 +188,10 @@ const Type *TypeFactory::get_ndarray_struct_type(DataType dt,
   return get_struct_type(members);
 }
 
+const Type *TypeFactory::get_rwtexture_struct_type() {
+  return get_ndarray_struct_type(PrimitiveType::f32, 3);
+}
+
 namespace {
 static bool compare_types(DataType x, DataType y) {
   // Is the first type "bigger" than the second type?
diff --git a/taichi/ir/type_factory.h b/taichi/ir/type_factory.h
index 897644fa3fded..f73343c78519f 100644
--- a/taichi/ir/type_factory.h
+++ b/taichi/ir/type_factory.h
@@ -29,6 +29,8 @@ class TypeFactory {
                                       int total_dim,
                                       bool needs_grad = false);
 
+  const Type *get_rwtexture_struct_type();
+
   Type *get_pointer_type(Type *element, bool is_bit_pointer = false);
 
   Type *get_quant_int_type(int num_bits, bool is_signed, Type *compute_type);
@@ -54,6 +56,7 @@ class TypeFactory {
 
   static DataType create_tensor_type(std::vector<int> shape, DataType element);
 
+  constexpr static int SHAPE_POS_IN_NDARRAY = 0;
   constexpr static int DATA_PTR_POS_IN_NDARRAY = 1;
   constexpr static int GRAD_PTR_POS_IN_NDARRAY = 2;
 
diff --git a/taichi/program/callable.cpp b/taichi/program/callable.cpp
index b3af705acad64..384a3c31059a6 100644
--- a/taichi/program/callable.cpp
+++ b/taichi/program/callable.cpp
@@ -46,8 +46,11 @@ int Callable::insert_ndarray_param(const DataType &dt,
 
 int Callable::insert_texture_param(int total_dim, const std::string &name) {
   // FIXME: we shouldn't abuse is_array for texture parameters
-  parameter_list.emplace_back(PrimitiveType::f32, /*is_array=*/true, 0,
-                              total_dim, std::vector<int>{});
+  // FIXME: using rwtexture struct type for texture parameters because C-API
+  // does not distinguish between texture and rwtexture.
+  auto *type = TypeFactory::get_instance().get_rwtexture_struct_type();
+  parameter_list.emplace_back(type, /*is_array=*/true, 0, total_dim,
+                              std::vector<int>{});
   parameter_list.back().name = name;
   return (int)parameter_list.size() - 1;
 }
@@ -63,8 +66,9 @@ int Callable::insert_rw_texture_param(int total_dim,
                                       BufferFormat format,
                                       const std::string &name) {
   // FIXME: we shouldn't abuse is_array for texture parameters
-  parameter_list.emplace_back(PrimitiveType::f32, /*is_array=*/true, 0,
-                              total_dim, std::vector<int>{}, format);
+  auto *type = TypeFactory::get_instance().get_rwtexture_struct_type();
+  parameter_list.emplace_back(type, /*is_array=*/true, 0, total_dim,
+                              std::vector<int>{}, format);
   parameter_list.back().name = name;
   return (int)parameter_list.size() - 1;
 }
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index 64c04371a44a2..bb6c102041d7d 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -42,7 +42,6 @@ CompileConfig::CompileConfig() {
   make_thread_local = true;
   make_block_local = true;
   detect_read_only = true;
-  ndarray_use_cached_allocator = true;
   real_matrix_scalarize = true;
   half2_vectorization = false;
   make_cpu_multithreading_loop = true;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index df1650ec163a3..619109f4e86d8 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -37,7 +37,6 @@ struct CompileConfig {
   bool make_thread_local;
   bool make_block_local;
   bool detect_read_only;
-  bool ndarray_use_cached_allocator;
   bool real_matrix_scalarize;
   bool half2_vectorization;
   bool make_cpu_multithreading_loop;
diff --git a/taichi/program/context.h b/taichi/program/context.h
index 3b5a8412292f1..71ab23bc9644f 100644
--- a/taichi/program/context.h
+++ b/taichi/program/context.h
@@ -17,7 +17,6 @@ struct RuntimeContext {
 
   LLVMRuntime *runtime{nullptr};
 
-  int32_t extra_args[taichi_max_num_args_extra][taichi_max_num_indices];
   int32_t cpu_thread_id;
 
   // We move the pointer of result buffer from LLVMRuntime to RuntimeContext
@@ -25,8 +24,6 @@ struct RuntimeContext {
   // LLVMRuntime is shared among functions. So we moved the pointer to
   // RuntimeContext which each function have one.
   uint64_t *result_buffer;
-
-  static constexpr size_t extra_args_size = sizeof(extra_args);
 };
 
 #if defined(TI_RUNTIME_HOST)
diff --git a/taichi/program/launch_context_builder.cpp b/taichi/program/launch_context_builder.cpp
index e57c8f21b4be0..1458ae81d6203 100644
--- a/taichi/program/launch_context_builder.cpp
+++ b/taichi/program/launch_context_builder.cpp
@@ -135,10 +135,6 @@ void LaunchContextBuilder::set_arg<TypedConstant>(int i, TypedConstant d) {
   }
 }
 
-void LaunchContextBuilder::set_extra_arg_int(int i, int j, int32 d) {
-  ctx_->extra_args[i][j] = d;
-}
-
 template <typename T>
 void LaunchContextBuilder::set_struct_arg_impl(std::vector<int> arg_indices,
                                                T v) {
@@ -257,8 +253,8 @@ void LaunchContextBuilder::set_arg_rw_texture_impl(
   array_ptrs[{arg_id}] = (void *)alloc_ptr;
   set_array_device_allocation_type(arg_id, DevAllocType::kRWTexture);
   TI_ASSERT(shape.size() <= taichi_max_num_indices);
-  for (int i = 0; i < shape.size(); i++) {
-    ctx_->extra_args[arg_id][i] = shape[i];
+  for (int i = 0; i < shape.size(); ++i) {
+    set_struct_arg({arg_id, 0, i}, shape[i]);
   }
 }
 
diff --git a/taichi/program/launch_context_builder.h b/taichi/program/launch_context_builder.h
index 55d9f212fe746..ee0687d490b2c 100644
--- a/taichi/program/launch_context_builder.h
+++ b/taichi/program/launch_context_builder.h
@@ -57,7 +57,6 @@ class LaunchContextBuilder {
 
   template <typename T>
   T get_ret(int i);
-  void set_extra_arg_int(int i, int j, int32 d);
 
   void set_arg_external_array_with_shape(int arg_id,
                                          uintptr_t ptr,
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 4f34d0e3976a0..79a3833736609 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -193,8 +193,6 @@ void export_lang(py::module &m) {
       .def_readwrite("make_thread_local", &CompileConfig::make_thread_local)
       .def_readwrite("make_block_local", &CompileConfig::make_block_local)
       .def_readwrite("detect_read_only", &CompileConfig::detect_read_only)
-      .def_readwrite("ndarray_use_cached_allocator",
-                     &CompileConfig::ndarray_use_cached_allocator)
       .def_readwrite("real_matrix_scalarize",
                      &CompileConfig::real_matrix_scalarize)
       .def_readwrite("half2_vectorization", &CompileConfig::half2_vectorization)
@@ -618,66 +616,55 @@ void export_lang(py::module &m) {
       .def("seq", &GraphBuilder::seq, py::return_value_policy::reference);
 
   py::class_<aot::CompiledGraph>(m, "CompiledGraph")
-      .def("jit_run", [](aot::CompiledGraph *self,
-                         const CompileConfig &compile_config,
-                         const py::dict &pyargs) {
-        std::unordered_map<std::string, aot::IValue> args;
-        for (auto it : pyargs) {
-          std::string arg_name = py::cast<std::string>(it.first);
-          auto tag = self->args[arg_name].tag;
-          if (tag == aot::ArgKind::kNdarray) {
-            auto &val = it.second.cast<Ndarray &>();
-            args.insert(
-                {py::cast<std::string>(it.first), aot::IValue::create(val)});
-          } else if (tag == aot::ArgKind::kTexture ||
-                     tag == aot::ArgKind::kRWTexture) {
-            auto &val = it.second.cast<Texture &>();
-            args.insert(
-                {py::cast<std::string>(it.first), aot::IValue::create(val)});
-
-          } else if (tag == aot::ArgKind::kScalar ||
-                     tag == aot::ArgKind::kMatrix) {
-            std::string arg_name = py::cast<std::string>(it.first);
-            auto expected_dtype = self->args[arg_name].dtype();
-            if (expected_dtype == PrimitiveType::i32) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<int>(it.second))});
-            } else if (expected_dtype == PrimitiveType::i64) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<int64>(it.second))});
-            } else if (expected_dtype == PrimitiveType::f32) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<float>(it.second))});
-            } else if (expected_dtype == PrimitiveType::f64) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<double>(it.second))});
-            } else if (expected_dtype == PrimitiveType::i16) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<int16>(it.second))});
-            } else if (expected_dtype == PrimitiveType::u32) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<uint32>(it.second))});
-            } else if (expected_dtype == PrimitiveType::u64) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<uint64>(it.second))});
-            } else if (expected_dtype == PrimitiveType::u16) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<uint16>(it.second))});
-            } else if (expected_dtype == PrimitiveType::u8) {
-              args.insert({arg_name,
-                           aot::IValue::create(py::cast<uint8_t>(it.second))});
-            } else if (expected_dtype == PrimitiveType::i8) {
-              args.insert(
-                  {arg_name, aot::IValue::create(py::cast<int8_t>(it.second))});
-            } else {
-              TI_NOT_IMPLEMENTED;
-            }
-          } else {
-            TI_NOT_IMPLEMENTED;
-          }
-        }
-        self->jit_run(compile_config, args);
-      });
+      .def("jit_run",
+           [](aot::CompiledGraph *self, const CompileConfig &compile_config,
+              const py::dict &pyargs) {
+             std::unordered_map<std::string, aot::IValue> args;
+             auto insert_scalar_arg = [&args](std::string arg_name,
+                                              DataType expected_dtype,
+                                              py::object pyarg) {
+               auto type_id = expected_dtype->as<PrimitiveType>()->type;
+               switch (type_id) {
+#define PER_C_TYPE(type, ctype)                                           \
+  case PrimitiveTypeID::type:                                             \
+    args.insert({arg_name, aot::IValue::create(py::cast<ctype>(pyarg))}); \
+    break;
+#include "taichi/inc/data_type_with_c_type.inc.h"
+#undef PER_C_TYPE
+                 default:
+                   TI_ERROR("Unsupported scalar type {}", type_id);
+               }
+             };
+             for (const auto &[arg_name, arg] : self->args) {
+               auto tag = arg.tag;
+               if (tag == aot::ArgKind::kMatrix) {
+                 int size = arg.element_shape[0] * arg.element_shape[1];
+                 for (int i = 0; i < size; i++) {
+                   auto name = fmt::format("{}_{}", arg_name, i);
+                   TI_ASSERT(pyargs.contains(name.c_str()));
+                   auto pyarg = pyargs[name.c_str()];
+                   insert_scalar_arg(name, arg.dtype(), pyarg);
+                 }
+                 continue;
+               }
+               TI_ASSERT(pyargs.contains(arg_name.c_str()));
+               auto pyarg = pyargs[arg_name.c_str()];
+               if (tag == aot::ArgKind::kNdarray) {
+                 auto &val = pyarg.cast<Ndarray &>();
+                 args.insert({arg_name, aot::IValue::create(val)});
+               } else if (tag == aot::ArgKind::kTexture ||
+                          tag == aot::ArgKind::kRWTexture) {
+                 auto &val = pyarg.cast<Texture &>();
+                 args.insert({arg_name, aot::IValue::create(val)});
+               } else if (tag == aot::ArgKind::kScalar) {
+                 auto expected_dtype = arg.dtype();
+                 insert_scalar_arg(arg_name, expected_dtype, pyarg);
+               } else {
+                 TI_NOT_IMPLEMENTED;
+               }
+             }
+             self->jit_run(compile_config, args);
+           });
 
   py::class_<Kernel>(m, "Kernel")
       .def("no_activate",
@@ -723,7 +710,6 @@ void export_lang(py::module &m) {
            &LaunchContextBuilder::set_arg_ndarray_with_grad)
       .def("set_arg_texture", &LaunchContextBuilder::set_arg_texture)
       .def("set_arg_rw_texture", &LaunchContextBuilder::set_arg_rw_texture)
-      .def("set_extra_arg_int", &LaunchContextBuilder::set_extra_arg_int)
       .def("get_struct_ret_int", &LaunchContextBuilder::get_struct_ret_int)
       .def("get_struct_ret_uint", &LaunchContextBuilder::get_struct_ret_uint)
       .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float);
diff --git a/taichi/rhi/amdgpu/amdgpu_device.cpp b/taichi/rhi/amdgpu/amdgpu_device.cpp
index f2d94a9c06883..6107904c9f644 100644
--- a/taichi/rhi/amdgpu/amdgpu_device.cpp
+++ b/taichi/rhi/amdgpu/amdgpu_device.cpp
@@ -56,17 +56,15 @@ DeviceAllocation AmdgpuDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   if (params.host_read || params.host_write) {
     TI_NOT_IMPLEMENTED
-  } else if (params.use_cached) {
+  } else {
     info.ptr =
         DeviceMemoryPool::get_instance().allocate_with_cache(this, params);
     TI_ASSERT(info.ptr != nullptr);
 
     AMDGPUDriver::get_instance().memset((void *)info.ptr, 0, info.size);
-  } else {
-    info.ptr = allocate_llvm_runtime_memory_jit(params);
   }
   info.is_imported = false;
-  info.use_cached = params.use_cached;
+  info.use_cached = true;
   info.use_preallocated = true;
 
   DeviceAllocation alloc;
diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp
index bb4bded239eb2..f75f657972cbf 100644
--- a/taichi/rhi/cuda/cuda_device.cpp
+++ b/taichi/rhi/cuda/cuda_device.cpp
@@ -51,18 +51,16 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   if (info.size == 0) {
     info.ptr = nullptr;
-  } else if (params.use_cached) {
+  } else {
     info.ptr =
         DeviceMemoryPool::get_instance().allocate_with_cache(this, params);
 
     TI_ASSERT(info.ptr != nullptr);
 
     CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size);
-  } else {
-    info.ptr = allocate_llvm_runtime_memory_jit(params);
   }
   info.is_imported = false;
-  info.use_cached = params.use_cached;
+  info.use_cached = true;
   info.use_preallocated = true;
 
   DeviceAllocation alloc;
diff --git a/taichi/rhi/llvm/llvm_device.h b/taichi/rhi/llvm/llvm_device.h
index e7104f94a20fb..7997a5c3fe964 100644
--- a/taichi/rhi/llvm/llvm_device.h
+++ b/taichi/rhi/llvm/llvm_device.h
@@ -10,7 +10,6 @@ struct LLVMRuntime;
 class LlvmDevice : public Device {
  public:
   struct LlvmRuntimeAllocParams : AllocParams {
-    bool use_cached{true};
     JITModule *runtime_jit{nullptr};
     LLVMRuntime *runtime{nullptr};
     uint64 *result_buffer{nullptr};
diff --git a/taichi/runtime/gfx/runtime.cpp b/taichi/runtime/gfx/runtime.cpp
index 1c028a1924979..c49a912226a36 100644
--- a/taichi/runtime/gfx/runtime.cpp
+++ b/taichi/runtime/gfx/runtime.cpp
@@ -93,11 +93,6 @@ class HostDeviceContextBlitter {
     std::memcpy(device_base, host_ctx_.get_context().arg_buffer,
                 ctx_attribs_->args_bytes());
 
-    void *device_ptr =
-        (uint8_t *)device_base + ctx_attribs_->extra_args_mem_offset();
-    std::memcpy(device_ptr, host_ctx_.get_context().extra_args,
-                ctx_attribs_->extra_args_bytes());
-
     device_->unmap(*device_args_buffer_);
   }
 
@@ -258,10 +253,6 @@ CompiledTaichiKernel::CompiledTaichiKernel(const Params &ti_params)
   args_buffer_size_ = arg_sz;
   ret_buffer_size_ = ret_sz;
 
-  if (arg_sz) {
-    args_buffer_size_ += ti_kernel_attribs_.ctx_attribs.extra_args_bytes();
-  }
-
   const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
   const auto &spirv_bins = ti_params.spirv_bins;
   TI_ASSERT(task_attribs.size() == spirv_bins.size());
@@ -802,15 +793,14 @@ std::pair<const lang::StructType *, size_t>
 GfxRuntime::get_struct_type_with_data_layout(const lang::StructType *old_ty,
                                              const std::string &layout) {
   auto [new_ty, size, align] =
-      get_struct_type_with_data_layout_impl(old_ty, layout, true);
+      get_struct_type_with_data_layout_impl(old_ty, layout);
   return {new_ty, size};
 }
 
 std::tuple<const lang::StructType *, size_t, size_t>
 GfxRuntime::get_struct_type_with_data_layout_impl(
     const lang::StructType *old_ty,
-    const std::string &layout,
-    bool is_outmost) {
+    const std::string &layout) {
   TI_TRACE("get_struct_type_with_data_layout: {}", layout);
   TI_ASSERT(layout.size() == 2);
   auto is_430 = layout[0] == '4';
@@ -824,7 +814,7 @@ GfxRuntime::get_struct_type_with_data_layout_impl(
     size_t member_size;
     if (auto struct_type = member.type->cast<lang::StructType>()) {
       auto [new_ty, size, member_align_] =
-          get_struct_type_with_data_layout_impl(struct_type, layout, false);
+          get_struct_type_with_data_layout_impl(struct_type, layout);
       members[i].type = new_ty;
       member_align = member_align_;
       member_size = size;
@@ -863,7 +853,7 @@ GfxRuntime::get_struct_type_with_data_layout_impl(
 
   if (!is_430) {
     align = align_up(align, sizeof(float) * 4);
-    bytes = align_up(bytes, is_outmost ? 4 : 4 * sizeof(float));
+    bytes = align_up(bytes, 4 * sizeof(float));
   }
   TI_TRACE("  total_bytes={}", bytes);
   return {TypeFactory::get_instance()
diff --git a/taichi/runtime/gfx/runtime.h b/taichi/runtime/gfx/runtime.h
index d52de3c2ae320..725ef0aa81394 100644
--- a/taichi/runtime/gfx/runtime.h
+++ b/taichi/runtime/gfx/runtime.h
@@ -135,8 +135,7 @@ class TI_DLL_EXPORT GfxRuntime {
 
   static std::tuple<const lang::StructType *, size_t, size_t>
   get_struct_type_with_data_layout_impl(const lang::StructType *old_ty,
-                                        const std::string &layout,
-                                        bool is_outmost);
+                                        const std::string &layout);
 
  private:
   friend class taichi::lang::gfx::SNodeTreeManager;
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index c7ad98b5201e6..ffb35055618a0 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -475,7 +475,6 @@ DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
   return llvm_device()->allocate_memory_runtime(
       {{alloc_size, /*host_write=*/false, /*host_read=*/false,
         /*export_sharing=*/false, AllocUsage::Storage},
-       config_.ndarray_use_cached_allocator,
        get_runtime_jit_module(),
        get_llvm_runtime(),
        result_buffer});
diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp
index f7cb28443d824..8be6e4f69e8c4 100644
--- a/taichi/runtime/llvm/runtime_module/runtime.cpp
+++ b/taichi/runtime/llvm/runtime_module/runtime.cpp
@@ -288,10 +288,6 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val);
 STRUCT_FIELD(RuntimeContext, runtime);
 STRUCT_FIELD(RuntimeContext, result_buffer)
 
-int32 RuntimeContext_get_extra_args(RuntimeContext *ctx, int32 i, int32 j) {
-  return ctx->extra_args[i][j];
-}
-
 #include "taichi/runtime/llvm/runtime_module/atomic.h"
 
 // These structures are accessible by both the LLVM backend and this C++ runtime
diff --git a/taichi/transforms/alg_simp.cpp b/taichi/transforms/alg_simp.cpp
index 792f865410a0d..44ff6e9408185 100644
--- a/taichi/transforms/alg_simp.cpp
+++ b/taichi/transforms/alg_simp.cpp
@@ -60,8 +60,10 @@ class AlgSimp : public BasicStmtVisitor {
              data_type_bits(second_cast) <= data_type_bits(first_cast);
     }
     if (is_integral(first_cast)) {
-      // int(int(a))
-      return data_type_bits(second_cast) <= data_type_bits(first_cast);
+      // int(int(a)): not always equivalent when the signedness differs,
+      // see #7915
+      return data_type_bits(second_cast) <= data_type_bits(first_cast) &&
+             is_signed(second_cast) == is_signed(first_cast);
     }
     // int(float(a))
     if (data_type_bits(second_cast) <= data_type_bits(first_cast) * 2) {
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index c93fd9298b1d8..92700327cfb0f 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -127,14 +127,6 @@ void compile_to_offloads(IRNode *ir,
   print("Access flagged I");
   irpass::analysis::verify(ir);
 
-  if (config.real_matrix_scalarize) {
-    irpass::scalarize(ir);
-
-    // Remove redundant MatrixInitStmt inserted during scalarization
-    irpass::die(ir);
-    print("Scalarized");
-  }
-
   irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false});
   print("Simplified II");
   irpass::analysis::verify(ir);
@@ -187,6 +179,14 @@ void offload_to_executable(IRNode *ir,
     print("Detect read-only accesses");
   }
 
+  if (config.real_matrix_scalarize) {
+    irpass::scalarize(ir);
+
+    // Remove redundant MatrixInitStmt inserted during scalarization
+    irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false});
+    print("Scalarized");
+  }
+
   irpass::demote_atomics(ir, config);
   print("Atomics demoted I");
   irpass::analysis::verify(ir);
diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp
index fe675b0c362bf..d9f904b443caf 100644
--- a/taichi/transforms/offload.cpp
+++ b/taichi/transforms/offload.cpp
@@ -17,6 +17,8 @@ bool demotable_axis_load(Stmt *stmt) {
   // Stmt involving simple arithmetic of ExternalTensorShapeAlongAxisStmt
   // shouldn't be saved in global tmp, just clone them to each shader
   // separately.
+  if (stmt->is<GlobalLoadStmt>())
+    return false;
   int n_op = stmt->num_operands();
   if (n_op == 0) {
     return stmt->is<ExternalTensorShapeAlongAxisStmt>() ||
@@ -442,8 +444,8 @@ class PromoteIntermediateToGlobalTmp : public BasicStmtVisitor {
 
  private:
   explicit PromoteIntermediateToGlobalTmp(
-      const StmtToOffsetMap &local_to_global_offset)
-      : local_to_global_offset_(local_to_global_offset) {
+      const StmtToOffsetMap *local_to_global_offset)
+      : local_to_global_offset_(*local_to_global_offset) {
     allow_undefined_visitor = true;
     invoke_default_visitor = true;
   }
@@ -454,20 +456,20 @@ class PromoteIntermediateToGlobalTmp : public BasicStmtVisitor {
         local_to_global_offset_.find(stmt) != local_to_global_offset_.end() &&
         stored_to_global_.find(stmt) == stored_to_global_.end()) {
       stored_to_global_.insert(stmt);
-      auto offset = local_to_global_offset_[stmt];
+      auto offset = local_to_global_offset_.at(stmt);
       auto ptr = stmt->insert_after_me(
           Stmt::make<GlobalTemporaryStmt>(offset, stmt->ret_type));
       ptr->insert_after_me(Stmt::make<GlobalStoreStmt>(ptr, stmt));
     }
   }
 
-  static void run(IRNode *root, const StmtToOffsetMap &local_to_global_offset) {
+  static void run(IRNode *root, const StmtToOffsetMap *local_to_global_offset) {
     PromoteIntermediateToGlobalTmp pass(local_to_global_offset);
     root->accept(&pass);
   }
 
  private:
-  StmtToOffsetMap local_to_global_offset_;
+  const StmtToOffsetMap &local_to_global_offset_;
   std::set<Stmt *> stored_to_global_;
 };
 
@@ -477,11 +479,11 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
  private:
   FixCrossOffloadReferences(
       const CompileConfig &config,
-      const StmtToOffsetMap &local_to_global_offset,
-      const std::unordered_map<Stmt *, Stmt *> &stmt_to_offloaded,
+      const StmtToOffsetMap *local_to_global_offset,
+      std::unordered_map<Stmt *, Stmt *> &stmt_to_offloaded,
       OffloadedRanges *offloaded_ranges)
       : config_(config),
-        local_to_global_offset_(local_to_global_offset),
+        local_to_global_offset_(*local_to_global_offset),
         stmt_to_offloaded_(stmt_to_offloaded),
         offloaded_ranges_(offloaded_ranges) {
     allow_undefined_visitor = true;
@@ -499,9 +501,8 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
                            offloaded_ranges_->begin_stmts.find(stmt)->second) !=
                            local_to_global_offset_.end(),
                        "Begin fails.")
-        stmt->begin_offset =
-            local_to_global_offset_[offloaded_ranges_->begin_stmts.find(stmt)
-                                        ->second];
+        stmt->begin_offset = local_to_global_offset_.at(
+            offloaded_ranges_->begin_stmts.find(stmt)->second);
       }
       if (!stmt->const_end) {
         if (stmt->end_stmt) {
@@ -514,9 +515,8 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
                              offloaded_ranges_->end_stmts.find(stmt)->second) !=
                              local_to_global_offset_.end(),
                          "End fails.")
-          stmt->end_offset =
-              local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt)
-                                          ->second];
+          stmt->end_offset = local_to_global_offset_.at(
+              offloaded_ranges_->end_stmts.find(stmt)->second);
         }
       }
     }
@@ -530,27 +530,22 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
     auto ret_type = stmt->ret_type;
     local_to_global_vector_type_[stmt] = ret_type;
     auto ptr = replacement.push_back<GlobalTemporaryStmt>(
-        local_to_global_offset_[stmt], ret_type);
+        local_to_global_offset_.at(stmt), ret_type);
     auto offloaded = stmt_to_offloaded_[stmt];
     stmt_to_offloaded_[ptr] = offloaded;
+
+    TypedConstant zero(stmt->ret_type.get_element_type());
+    auto const_zero_stmt = replacement.push_back<ConstStmt>(zero);
     if (auto tensor_type = stmt->ret_type->cast<TensorType>()) {
-      TypedConstant zero(tensor_type->get_element_type());
-      auto const_zero_stmt = replacement.push_back<ConstStmt>(zero);
-      stmt_to_offloaded_[const_zero_stmt] = offloaded;
-      for (int i = 0; i < tensor_type->get_num_elements(); ++i) {
-        auto const_offset_stmt =
-            replacement.push_back<ConstStmt>(TypedConstant(i));
-        auto ptr_offset_stmt =
-            replacement.push_back<MatrixPtrStmt>(ptr, const_offset_stmt);
-        auto global_store_stmt = replacement.push_back<GlobalStoreStmt>(
-            ptr_offset_stmt, const_zero_stmt);
-        stmt_to_offloaded_[const_offset_stmt] = offloaded;
-        stmt_to_offloaded_[ptr_offset_stmt] = offloaded;
-        stmt_to_offloaded_[global_store_stmt] = offloaded;
-      }
+      std::vector<Stmt *> zero_values(tensor_type->get_num_elements(),
+                                      const_zero_stmt);
+      auto zero_matrix_init_stmt =
+          replacement.push_back<MatrixInitStmt>(zero_values);
+      zero_matrix_init_stmt->ret_type = stmt->ret_type.ptr_removed();
+      auto global_store_stmt =
+          replacement.push_back<GlobalStoreStmt>(ptr, zero_matrix_init_stmt);
+      stmt_to_offloaded_[global_store_stmt] = offloaded;
     } else {
-      TypedConstant zero(stmt->ret_type);
-      auto const_zero_stmt = replacement.push_back<ConstStmt>(zero);
       auto global_store_stmt =
           replacement.push_back<GlobalStoreStmt>(ptr, const_zero_stmt);
       stmt_to_offloaded_[global_store_stmt] = offloaded;
@@ -623,7 +618,7 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
       generic_visit(pcopy);
     } else {
       auto global_temporary = Stmt::make<GlobalTemporaryStmt>(
-          local_to_global_offset_[op], op->ret_type);
+          local_to_global_offset_.at(op), op->ret_type);
       stmt_to_offloaded_[global_temporary.get()] = offloaded;
       stmt->set_operand(index, global_temporary.get());
       if (op->is<AllocaStmt>() || op->ret_type.is_pointer()) {
@@ -660,8 +655,8 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
  public:
   static void run(IRNode *root,
                   const CompileConfig &config,
-                  const StmtToOffsetMap &local_to_global_offset,
-                  const std::unordered_map<Stmt *, Stmt *> &stmt_to_offloaded,
+                  const StmtToOffsetMap *local_to_global_offset,
+                  std::unordered_map<Stmt *, Stmt *> &stmt_to_offloaded,
                   OffloadedRanges *offloaded_ranges) {
     FixCrossOffloadReferences pass(config, local_to_global_offset,
                                    stmt_to_offloaded, offloaded_ranges);
@@ -670,8 +665,8 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
 
  private:
   [[maybe_unused]] const CompileConfig &config_;
-  StmtToOffsetMap local_to_global_offset_;
-  std::unordered_map<Stmt *, Stmt *> stmt_to_offloaded_;
+  const StmtToOffsetMap &local_to_global_offset_;
+  std::unordered_map<Stmt *, Stmt *> &stmt_to_offloaded_;
   OffloadedRanges *const offloaded_ranges_;
   std::unordered_map<Stmt *, DataType> local_to_global_vector_type_;
 };
@@ -783,9 +778,9 @@ void offload(IRNode *root, const CompileConfig &config) {
     auto stmt_to_offloaded = StmtToOffloaded::run(root);
     const auto local_to_global_offset = IdentifyValuesUsedInOtherOffloads::run(
         root, config, stmt_to_offloaded, &offloaded_ranges);
-    PromoteIntermediateToGlobalTmp::run(root, local_to_global_offset);
+    PromoteIntermediateToGlobalTmp::run(root, &local_to_global_offset);
     stmt_to_offloaded = StmtToOffloaded::run(root);
-    FixCrossOffloadReferences::run(root, config, local_to_global_offset,
+    FixCrossOffloadReferences::run(root, config, &local_to_global_offset,
                                    stmt_to_offloaded, &offloaded_ranges);
   }
   insert_gc(root, config);
diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp
index 40fa40f8ba9f6..69073000b5329 100644
--- a/taichi/transforms/scalarize.cpp
+++ b/taichi/transforms/scalarize.cpp
@@ -881,7 +881,7 @@ class GatherScalarizableLocalPointers : public BasicStmtVisitor {
   }
 };
 
-class ScalarizeLocalPointers : public BasicStmtVisitor {
+class ScalarizePointers : public BasicStmtVisitor {
  public:
   ImmediateIRModifier immediate_modifier_;
   DelayedIRModifier delayed_modifier_;
@@ -890,7 +890,7 @@ class ScalarizeLocalPointers : public BasicStmtVisitor {
   // { original_alloca_stmt : [scalarized_alloca_stmt0, ...] }
   std::unordered_map<Stmt *, std::vector<Stmt *>> scalarized_local_tensor_map_;
 
-  explicit ScalarizeLocalPointers(
+  explicit ScalarizePointers(
       IRNode *node,
       const std::unordered_set<Stmt *> &scalarizable_allocas)
       : immediate_modifier_(node), scalarizable_allocas_(scalarizable_allocas) {
@@ -948,16 +948,16 @@ class ScalarizeLocalPointers : public BasicStmtVisitor {
     }
   }
 
-  /*
-    Before:
-      MatrixPtrStmt(TensorType<4 x i32>* alloca_stmt, int offset)
-
-    After:
-      scalarized_alloca_stmt =
-    scalarized_local_tensor_map_[alloca_stmt][offset]
-      stmt->replace_all_usages_with(scalarized_alloca_stmt)
-  */
   void visit(MatrixPtrStmt *stmt) override {
+    /*
+      Before:
+        MatrixPtrStmt(TensorType<4 x i32>* alloca_stmt, int offset)
+
+      After:
+        scalarized_alloca_stmt =
+            scalarized_local_tensor_map_[alloca_stmt][offset]
+        stmt->replace_all_usages_with(scalarized_alloca_stmt)
+    */
     if (stmt->origin->is<AllocaStmt>() &&
         scalarizable_allocas_.count(stmt->origin) == 1) {
       auto alloca_stmt = stmt->origin->cast<AllocaStmt>();
@@ -979,6 +979,34 @@ class ScalarizeLocalPointers : public BasicStmtVisitor {
 
       immediate_modifier_.replace_usages_with(stmt, new_stmt);
       delayed_modifier_.erase(stmt);
+      return;
+    }
+
+    /*
+      Before:
+        TensorType<4 x i32>* ptr = GlobalTempStmt(offset_0)
+        i32* ptr_1 = MatrixPtrStmt(ptr, offset_1)
+
+      After:
+        i32* $1 = GlobalTempStmt(offset_0 + offset_1 * sizeof(i32))
+        replace_all_usages_with(ptr_1, $1)
+    */
+    if (stmt->origin->is<GlobalTemporaryStmt>() &&
+        stmt->offset->is<ConstStmt>()) {
+      auto global_temp_stmt = stmt->origin->as<GlobalTemporaryStmt>();
+      auto offset_0 = global_temp_stmt->offset;
+      auto offset_1 = stmt->offset->as<ConstStmt>()->val.val_int32();
+      auto new_offset =
+          offset_0 + offset_1 * data_type_size(stmt->ret_type.ptr_removed());
+
+      auto new_global_temp_stmt = std::make_unique<GlobalTemporaryStmt>(
+          new_offset, stmt->ret_type.ptr_removed().get_element_type());
+      new_global_temp_stmt->ret_type.set_is_pointer(true);
+
+      stmt->replace_usages_with(new_global_temp_stmt.get());
+      delayed_modifier_.insert_before(stmt, std::move(new_global_temp_stmt));
+      delayed_modifier_.erase(stmt);
+      return;
     }
   }
 
@@ -1021,12 +1049,24 @@ class ExtractLocalPointers : public BasicStmtVisitor {
   Block *top_level_;
 
   explicit ExtractLocalPointers(IRNode *root) : immediate_modifier_(root) {
-    TI_ASSERT(root->is<Block>());
-    top_level_ = root->as<Block>();
+    if (root->is<OffloadedStmt>()) {
+      top_level_ = root->as<OffloadedStmt>()->body.get();
+    } else {
+      TI_ASSERT(root->is<Block>());
+      top_level_ = root->as<Block>();
+    }
     root->accept(this);
     delayed_modifier_.modify_ir();
   }
 
+  void visit(OffloadedStmt *stmt) override {
+    // Extract into this OffloadedStmt's body
+    Block *orig_top_level = top_level_;
+    top_level_ = stmt->body.get();
+    stmt->all_blocks_accept(this);
+    top_level_ = orig_top_level;
+  }
+
   void visit(MatrixPtrStmt *stmt) override {
     if (stmt->origin->is<AllocaStmt>()) {
       auto alloca_stmt = stmt->origin->cast<AllocaStmt>();
@@ -1118,7 +1158,7 @@ void scalarize(IRNode *root) {
   TI_AUTO_PROF;
   Scalarize scalarize_pass(root);
   auto scalarizable_allocas = GatherScalarizableLocalPointers::run(root);
-  ScalarizeLocalPointers scalarize_pointers_pass(root, scalarizable_allocas);
+  ScalarizePointers scalarize_pointers_pass(root, scalarizable_allocas);
   ExtractLocalPointers extract_pointers_pass(root);
   MergeExternalAndMatrixPtr::run(root);
 }
diff --git a/tests/cpp/aot/gfx_utils.cpp b/tests/cpp/aot/gfx_utils.cpp
index e7e15b5fde434..9a0d965321316 100644
--- a/tests/cpp/aot/gfx_utils.cpp
+++ b/tests/cpp/aot/gfx_utils.cpp
@@ -172,7 +172,7 @@ void run_kernel_test1(Arch arch, taichi::lang::Device *device) {
   // Hack to set vector/matrix args
   std::vector<int> vec = {1, 2, 3};
   for (int i = 0; i < vec.size(); ++i) {
-    builder.set_arg(/*arg_id=*/i + 2, vec[i]);
+    builder.set_struct_arg(/*arg_indices=*/{2, i}, vec[i]);
   }
   k_run->launch(builder);
   gfx_runtime->synchronize();
diff --git a/tests/cpp/aot/llvm/kernel_aot_test.cpp b/tests/cpp/aot/llvm/kernel_aot_test.cpp
index 4dcdce34fc23d..5446cb8219cff 100644
--- a/tests/cpp/aot/llvm/kernel_aot_test.cpp
+++ b/tests/cpp/aot/llvm/kernel_aot_test.cpp
@@ -54,7 +54,7 @@ TEST(LlvmAotTest, CpuKernel) {
   builder.set_arg_ndarray(/*arg_id=*/1, arr);
   std::vector<int> vec = {1, 2, 3};
   for (int i = 0; i < vec.size(); ++i) {
-    builder.set_arg(/*arg_id=*/i + 2, vec[i]);
+    builder.set_struct_arg(/*arg_indices=*/{2, i}, vec[i]);
   }
   k_run->launch(builder);
 
@@ -100,7 +100,7 @@ TEST(LlvmAotTest, CudaKernel) {
     builder.set_arg_ndarray(/*arg_id=*/1, arr);
     std::vector<int> vec = {1, 2, 3};
     for (int i = 0; i < vec.size(); ++i) {
-      builder.set_arg(/*arg_id=*/i + 2, vec[i]);
+      builder.set_struct_arg(/*arg_indices=*/{2, i}, vec[i]);
     }
     k_run->launch(builder);
 
diff --git a/tests/python/test_argument.py b/tests/python/test_argument.py
index 4a7139b1f328a..9d1c0d078e9ef 100644
--- a/tests/python/test_argument.py
+++ b/tests/python/test_argument.py
@@ -4,32 +4,6 @@
 from tests import test_utils
 
 
-@test_utils.test(exclude=[ti.opengl, ti.gles])
-def test_exceed_max_64():
-    N = 64
-
-    @ti.kernel
-    def foo1(a: ti.types.vector(N, ti.i32)) -> ti.i32:
-        return a.sum()
-
-    A = ti.Vector([1] * N)
-    assert foo1(A) == 64
-
-    N = 65
-
-    @ti.kernel
-    def foo2(a: ti.types.vector(N, ti.i32)) -> ti.i32:
-        return a.sum()
-
-    A = ti.Vector([1] * N)
-
-    with pytest.raises(
-        ti.TaichiRuntimeError,
-        match=f"The number of elements in kernel arguments is too big! Do not exceed 64 on {ti._lib.core.arch_name(ti.lang.impl.current_cfg().arch)} backend.",
-    ):
-        foo2(A)
-
-
 @test_utils.test(debug=True)
 def test_kernel_keyword_args():
     @ti.kernel
diff --git a/tests/python/test_ast_refactor.py b/tests/python/test_ast_refactor.py
index 669c419dd724a..deab2b0310c44 100644
--- a/tests/python/test_ast_refactor.py
+++ b/tests/python/test_ast_refactor.py
@@ -164,17 +164,6 @@ def foo(a: ti.template()):
         assert a[i] == b[i]
 
 
-@test_utils.test()
-def test_compare_fail():
-    with pytest.raises(ti.TaichiCompilationError, match='"Is" is only supported inside `ti.static`.'):
-
-        @ti.kernel
-        def foo():
-            None is None
-
-        foo()
-
-
 @test_utils.test()
 def test_single_compare():
     @ti.kernel
diff --git a/tests/python/test_compare.py b/tests/python/test_compare.py
index 487e88c70f7ee..84a335bc4c054 100644
--- a/tests/python/test_compare.py
+++ b/tests/python/test_compare.py
@@ -171,33 +171,6 @@ def foo(a: ti.template()) -> ti.i32:
         foo(ti.i32)
 
 
-@test_utils.test()
-def test_static_is():
-    @ti.kernel
-    def is_f32(tp: ti.template()) -> ti.i32:
-        return ti.static(tp is ti.f32)
-
-    @ti.kernel
-    def is_not_f32(tp: ti.template()) -> ti.i32:
-        return ti.static(tp is not ti.f32)
-
-    assert is_f32(ti.f32) == 1
-    assert is_f32(ti.i32) == 0
-    assert is_not_f32(ti.f32) == 0
-    assert is_not_f32(ti.i32) == 1
-
-
-@test_utils.test()
-def test_non_static_is():
-    with pytest.raises(ti.TaichiCompilationError, match='"Is" is only supported inside `ti.static`.'):
-
-        @ti.kernel
-        def is_f32(tp: ti.template()) -> ti.i32:
-            return tp is ti.f32
-
-        is_f32(ti.f32)
-
-
 @test_utils.test(default_ip=ti.i64, require=ti.extension.data64)
 def test_compare_ret_type():
     # The purpose of this test is to make sure a comparison returns i32
diff --git a/tests/python/test_deprecation.py b/tests/python/test_deprecation.py
index 83b8d8b9a507f..358b3c9d3b55d 100644
--- a/tests/python/test_deprecation.py
+++ b/tests/python/test_deprecation.py
@@ -8,21 +8,6 @@
 from tests import test_utils
 
 
-@test_utils.test()
-def test_deprecate_a_atomic_b():
-    with pytest.warns(
-        DeprecationWarning,
-        match=r"a\.atomic_add\(b\) is deprecated, and it will be removed in Taichi v1.6.0.",
-    ):
-
-        @ti.kernel
-        def func():
-            a = 1
-            a.atomic_add(2)
-
-        func()
-
-
 @test_utils.test()
 def test_deprecate_element_shape_scalar():
     with pytest.warns(
@@ -84,26 +69,8 @@ def test_deprecate_rwtexture_ndim():
 
 
 @test_utils.test()
-def test_deprecate_builtin_min_max():
-    with pytest.warns(
-        DeprecationWarning,
-        match='Calling builtin function "max" in Taichi scope is deprecated, '
-        "and it will be removed in Taichi v1.6.0.",
-    ):
-
-        @ti.kernel
-        def func():
-            max(1, 2)
-
-        func()
-
-
-@test_utils.test()
-def test_deprecate_is_is_not():
-    with pytest.warns(
-        DeprecationWarning,
-        match='Operator "is" in Taichi scope is deprecated, ' "and it will be removed in Taichi v1.6.0.",
-    ):
+def test_remove_is_is_not():
+    with pytest.raises(ti.TaichiSyntaxError, match='Operator "is" in Taichi scope is not supported'):
 
         @ti.kernel
         def func():
@@ -112,44 +79,6 @@ def func():
         func()
 
 
-@test_utils.test()
-def test_deprecate_ndrange():
-    with pytest.warns(
-        DeprecationWarning,
-        match="Ndrange for loop with number of the loop variables not equal to "
-        "the dimension of the ndrange is deprecated, "
-        "and it will be removed in Taichi 1.6.0. ",
-    ):
-
-        @ti.kernel
-        def func():
-            for i in ti.ndrange(4, 4):
-                pass
-
-        func()
-
-
-@pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available")
-@test_utils.test(arch=ti.cpu)
-def test_deprecate_ti_ui_window():
-    window = ti.ui.Window("Diff SPH", (256, 256), show_window=False)
-    with pytest.warns(
-        DeprecationWarning,
-        match=r"`Window\.write_image\(\)` is deprecated, and it will be removed in Taichi v1\.6\.0\. ",
-    ):
-        window.write_image("deprecate.png")
-
-
-@pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available")
-@test_utils.test(arch=ti.cpu)
-def test_deprecate_ti_ui_make_camera():
-    with pytest.warns(
-        DeprecationWarning,
-        match=r"`ti\.ui\.make_camera\(\)` is deprecated, and will be removed in Taichi v1\.6\.0\. ",
-    ):
-        ti.ui.make_camera()
-
-
 @test_utils.test()
 def test_deprecation_in_taichi_init_py():
     with pytest.warns(
@@ -157,12 +86,3 @@ def test_deprecation_in_taichi_init_py():
         match="ti.SOA is deprecated, and it will be removed in Taichi v1.6.0.",
     ):
         ti.SOA
-
-
-@test_utils.test()
-def test_deprecate_sparse_matrix_builder():
-    with pytest.warns(
-        DeprecationWarning,
-        match=r"ti\.linalg\.sparse_matrix_builder is deprecated, and it will be removed in Taichi v1\.6\.0\.",
-    ):
-        ti.linalg.sparse_matrix_builder()
diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py
index 18931a1146daa..0da6ed756c9ae 100644
--- a/tests/python/test_matrix.py
+++ b/tests/python/test_matrix.py
@@ -1333,3 +1333,12 @@ def mat_equal(A, B, tol=1e-6):
 
     tol = 1e-5 if dtype == ti.f32 else 1e-12
     assert mat_equal(x.to_numpy(), arr, tol=tol)
+
+
+@test_utils.test()
+def test_matrix_dtype():
+    a = ti.types.vector(3, dtype=ti.f32)([0, 1, 2])  # f32 vector type fed Python ints
+    assert a.entries.dtype == np.float32  # entries are expected to be stored as float32
+
+    b = ti.types.matrix(2, 2, dtype=ti.i32)([[0, 1], [2, 3]])  # i32 2x2 matrix type
+    assert b.entries.dtype == np.int32  # entries are expected to be stored as int32
diff --git a/tests/python/test_taichi_cg.py b/tests/python/test_matrixfree_cg.py
similarity index 91%
rename from tests/python/test_taichi_cg.py
rename to tests/python/test_matrixfree_cg.py
index 122669931aa62..733a0ea24c0d2 100644
--- a/tests/python/test_taichi_cg.py
+++ b/tests/python/test_matrixfree_cg.py
@@ -1,7 +1,7 @@
 import math
 
 import pytest
-from taichi.linalg import LinearOperator, taichi_cg_solver
+from taichi.linalg import LinearOperator, MatrixFreeCG
 
 import taichi as ti
 from tests import test_utils
@@ -11,7 +11,7 @@
 
 @pytest.mark.parametrize("ti_dtype", [ti.f32, ti.f64])
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan], exclude=[vk_on_mac])
-def test_taichi_cg(ti_dtype):
+def test_matrixfree_cg(ti_dtype):
     GRID = 32
     Ax = ti.field(dtype=ti_dtype, shape=(GRID, GRID))
     x = ti.field(dtype=ti_dtype, shape=(GRID, GRID))
@@ -47,7 +47,7 @@ def check_solution(sol: ti.template(), ans: ti.template(), tol: ti_dtype) -> boo
 
     A = LinearOperator(compute_Ax)
     init()
-    taichi_cg_solver(A, b, x, maxiter=10 * GRID * GRID, tol=1e-18, quiet=True)
+    MatrixFreeCG(A, b, x, maxiter=10 * GRID * GRID, tol=1e-18, quiet=True)
     compute_Ax(x, Ax)
     # `tol` can't be < 1e-6 for ti.f32 because of accumulating round-off error;
     # see https://en.wikipedia.org/wiki/Conjugate_gradient_method#cite_note-6
diff --git a/tests/python/test_native_functions.py b/tests/python/test_native_functions.py
index 2176f19aecb9b..fa60291bfdb09 100644
--- a/tests/python/test_native_functions.py
+++ b/tests/python/test_native_functions.py
@@ -72,9 +72,7 @@ def func():
         y[i] = N - i
         z[i] = i - 2 if i % 2 else i + 2
 
-    with pytest.warns(DeprecationWarning, match="Calling builtin function") as records:
-        func()
-    assert len(records) > 0
+    func()
 
     assert np.allclose(
         minimum.to_numpy(),
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index e3c89fa3e0f00..f139d44917c75 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -278,7 +278,7 @@ def test_ndarray_deepcopy():
     assert y[4][1, 0] == 9
 
 
-@test_utils.test(arch=[ti.cuda], ndarray_use_cached_allocator=True)
+@test_utils.test(arch=[ti.cuda])
 def test_ndarray_caching_allocator():
     n = 8
     a = ti.ndarray(ti.i32, shape=(n))
diff --git a/tests/python/test_ndrange.py b/tests/python/test_ndrange.py
index 311f9cc770e80..925473566b9c5 100644
--- a/tests/python/test_ndrange.py
+++ b/tests/python/test_ndrange.py
@@ -312,20 +312,6 @@ def example():
     example()
 
 
-@test_utils.test(exclude=[ti.amdgpu])
-def test_n_loop_var_neq_dimension():
-    @ti.kernel
-    def iter():
-        for i in ti.ndrange(1, 4):
-            print(i)
-
-    with pytest.warns(
-        DeprecationWarning,
-        match="Ndrange for loop with number of the loop variables not equal to",
-    ):
-        iter()
-
-
 @test_utils.test()
 def test_2d_loop_over_ndarray():
     @ti.kernel
@@ -336,3 +322,19 @@ def foo(arr: ti.types.ndarray(dtype=ti.i32, ndim=1)):
 
     array = ti.ndarray(ti.i32, shape=(16,))
     foo(array)
+
+
+@test_utils.test()
+def test_dimension_error():
+    with pytest.raises(  # the mismatch below must be rejected with a syntax error
+        ti.TaichiSyntaxError,
+        match="Ndrange for loop with number of the loop variables not equal to "
+        "the dimension of the ndrange is not supported",
+    ):
+
+        @ti.kernel
+        def func():
+            for i in ti.ndrange(4, 4):  # one loop variable, two ndrange arguments
+                pass
+
+        func()
diff --git a/tests/python/test_offload_cross.py b/tests/python/test_offload_cross.py
index 916fcf01acf16..a28c2e96d3d19 100644
--- a/tests/python/test_offload_cross.py
+++ b/tests/python/test_offload_cross.py
@@ -133,3 +133,24 @@ def run(a: ti.i32):
                 print("OK")
 
     run(2)
+
+
+@test_utils.test(exclude=ti.amdgpu)
+def test_offload_with_save():
+    a = ti.Vector.field(2, dtype=ti.f32, shape=1)
+    b = ti.Vector.field(2, dtype=ti.f32, shape=1)
+    c = ti.Vector.field(2, dtype=ti.f32, shape=1)
+
+    @ti.kernel
+    def test():
+        a[0] = ti.Vector([1, 1])  # a[0] = [1, 1]
+        b[0] = ti.Vector([0, 0])
+        c[0] = ti.Vector([0, 0])
+        b[0] += a[0]  # b[0] = [1, 1]
+        b[0] /= 2  # b[0] = [0.5, 0.5]
+        for i in c:  # NOTE(review): presumably a separate offload that reads b[0] from above; confirm
+            c[i] += b[0]  # c[0] = [0.5, 0.5]
+
+    test()
+    assert c[0][0] == 0.5  # both components must see the updated b[0]
+    assert c[0][1] == 0.5
diff --git a/tests/python/test_optimization.py b/tests/python/test_optimization.py
index fa7147d7282c1..c966464647da4 100644
--- a/tests/python/test_optimization.py
+++ b/tests/python/test_optimization.py
@@ -143,3 +143,13 @@ def func():
     for i in range(3):
         for j in range(4):
             assert mat[i, j] == i + 1
+
+
+@test_utils.test()
+def test_casts_int_uint():
+    @ti.kernel
+    def my_cast(x: ti.f32) -> ti.u32:
+        y = ti.floor(x, ti.i32)  # floor with ti.i32 as the requested result type
+        return ti.cast(y, ti.u32)  # then cast the i32 value to u32
+
+    assert my_cast(-1) == 4294967295  # expected wrap-around: 2**32 - 1
diff --git a/tests/python/test_cg.py b/tests/python/test_sparse_cg.py
similarity index 93%
rename from tests/python/test_cg.py
rename to tests/python/test_sparse_cg.py
index bd0eb38445f20..ad6afa4c071f4 100644
--- a/tests/python/test_cg.py
+++ b/tests/python/test_sparse_cg.py
@@ -28,7 +28,7 @@ def fill(
 
     fill(Abuilder, A_psd, b)
     A = Abuilder.build(dtype=ti_dtype)
-    cg = ti.linalg.CG(A, b, x0, max_iter=50, atol=1e-6)
+    cg = ti.linalg.SparseCG(A, b, x0, max_iter=50, atol=1e-6)
     x, exit_code = cg.solve()
     res = np.linalg.solve(A_psd, b.to_numpy())
     assert exit_code == True
@@ -59,7 +59,7 @@ def fill(
 
     fill(Abuilder, A_psd, b)
     A = Abuilder.build(dtype=ti_dtype)
-    cg = ti.linalg.CG(A, b, x0, max_iter=50, atol=1e-6)
+    cg = ti.linalg.SparseCG(A, b, x0, max_iter=50, atol=1e-6)
     x, exit_code = cg.solve()
     res = np.linalg.solve(A_psd, b.to_numpy())
     assert exit_code == True