diff --git a/.github/workflows/persubmit.yml b/.github/workflows/persubmit.yml index ff7a621ffcd9f..6d3b156368806 100644 --- a/.github/workflows/persubmit.yml +++ b/.github/workflows/persubmit.yml @@ -11,6 +11,7 @@ jobs: matrix: os: [ubuntu-latest] python: [3.6, 3.7, 3.8] + with_cc: [OFF] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -30,7 +31,7 @@ jobs: export TAICHI_REPO_DIR=`pwd` export PATH=$TAICHI_REPO_DIR/taichi-llvm/bin/:$PATH export CXX=clang++ - export CI_SETUP_CMAKE_ARGS=-DTI_WITH_OPENGL:BOOL=OFF + export CI_SETUP_CMAKE_ARGS="-DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{matrix.with_cc}}" # GLFW dependencies: #export CI_SETUP_CMAKE_ARGS=-DTI_WITH_OPENGL:BOOL=ON #sudo apt-get install libx11-dev libxcursor-dev libxi-dev diff --git a/.gitignore b/.gitignore index 50ccd25a77145..6e219c6c33fb2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *.swp *.swo /.vs +/tags +/.*_localrc /Debug *.sdf /x64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 628e779bf23e1..1d838bd3c848e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(taichi) SET(TI_VERSION_MAJOR 0) SET(TI_VERSION_MINOR 6) -SET(TI_VERSION_PATCH 14) +SET(TI_VERSION_PATCH 15) execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} diff --git a/README.md b/README.md index c2eca75c9f678..d2f226ae09b21 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,16 @@ **Taichi** (太极) is a programming language designed for *high-performance computer graphics*. It is deeply embedded in **Python**, and its **just-in-time compiler** offloads compute-intensive tasks to multi-core CPUs and massively parallel GPUs. - + Advanced features of Taichi include [spatially sparse computing](https://taichi.readthedocs.io/en/latest/sparse.html) and [differentiable programming](https://taichi.readthedocs.io/en/latest/differentiable_programming.html) [[examples]](https://github.com/yuanming-hu/difftaichi). ## Gallery - + - + ## Installation [![Downloads](https://pepy.tech/badge/taichi/month)](https://pepy.tech/project/taichi/month) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 95307ea6bcaf1..5634f5e0b7551 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -3,15 +3,27 @@ set(CORE_LIBRARY_NAME taichi_core) option(USE_STDCPP "Use -stdlib=libc++" OFF) option(TI_WITH_CUDA "Build with the CUDA backend" ON) option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) +option(TI_WITH_CC "Build with the C backend" OFF) if (APPLE) if (TI_WITH_CUDA) set(TI_WITH_CUDA OFF) - message(WARNING "CUDA not supported on OS X. Setting TI_WITH_CUDA to OFF.") + message(WARNING "CUDA backend not supported on OS X. Setting TI_WITH_CUDA to OFF.") endif() if (TI_WITH_OPENGL) set(TI_WITH_OPENGL OFF) - message(WARNING "OpenGL not supported on OS X. Setting TI_WITH_OPENGL to OFF.") + message(WARNING "OpenGL backend not supported on OS X. Setting TI_WITH_OPENGL to OFF.") + endif() + if (TI_WITH_CC) + set(TI_WITH_CC OFF) + message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.") + endif() +endif() + +if (WIN32) + if (TI_WITH_CC) + set(TI_WITH_CC OFF) + message(WARNING "C backend not supported on Windows. 
Setting TI_WITH_CC to OFF.") endif() endif() @@ -30,6 +42,7 @@ file(GLOB TAICHI_CPU_SOURCE "taichi/backends/cpu/*.cpp" "taichi/backends/cpu/*.h file(GLOB TAICHI_CUDA_SOURCE "taichi/backends/cuda/*.cpp" "taichi/backends/cuda/*.h") file(GLOB TAICHI_METAL_SOURCE "taichi/backends/metal/*.h" "taichi/backends/metal/*.cpp" "taichi/backends/metal/shaders/*") file(GLOB TAICHI_OPENGL_SOURCE "taichi/backends/opengl/*.h" "taichi/backends/opengl/*.cpp" "taichi/backends/opengl/shaders/*") +file(GLOB TAICHI_CC_SOURCE "taichi/backends/cc/*.h" "taichi/backends/cc/*.cpp") list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_BACKEND_SOURCE}) @@ -56,6 +69,11 @@ if (TI_WITH_OPENGL) list(APPEND TAICHI_CORE_SOURCE ${TAICHI_GLAD_SOURCE}) endif() +if (TI_WITH_CC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_CC") + list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CC_SOURCE}) +endif() + add_library(${CORE_LIBRARY_NAME} SHARED ${TAICHI_CORE_SOURCE} ${PROJECT_SOURCES}) if (APPLE) diff --git a/docs/compilation.rst b/docs/compilation.rst index 00496f90e26fa..88aa3f6d82236 100644 --- a/docs/compilation.rst +++ b/docs/compilation.rst @@ -1,20 +1,20 @@ .. _compilation: -The life of a Taichi kernel +Life of a Taichi kernel =============================================== Sometimes it is helpful to understand the life cycle of a Taichi kernel. In short, compilation will only happen on the first invocation of an instance of a kernel. -Life cycle of a Taichi kernel looks like this: +The life cycle of a Taichi kernel has the following stages: - Kernel registration - Template instantiation and caching - Python AST transforms - - Taichi IR compilation, optimization, and binary generation + - Taichi IR compilation, optimization, and executable generation - Launching -.. image:: life_of_kernel_lowres.jpg +.. image:: https://raw.githubusercontent.com/taichi-dev/public_files/6bd234694270c83baf97ba32e0c6278b8cf37e6e/taichi/life_of_kernel.jpg Let's consider the following simple kernel: @@ -26,7 +26,7 @@ Let's consider the following simple kernel: tensor[i] += delta -We also allocate two 1D tensors to simplify discussion: +We allocate two 1D tensors to simplify discussion: .. code-block:: python @@ -50,7 +50,7 @@ Template instantiation and caching When ``add`` is called for the first time, the Taichi frontend compiler will instantiate the kernel. -When you have a second call with the same template signature (explained later), e.g., +When you have a second call with the same **template signature** (explained later), e.g., .. code-block:: python @@ -69,12 +69,12 @@ will lead to a new instantiation of **add**. .. note:: **Template signatures** are what distinguish different instantiations of a kernel template. The signature of ``add(x, 42)`` is ``(x, ti.i32)``, which is the same as that of ``add(x, 1)``. Therefore, the latter can reuse the previously compiled binary. - The signature of ``add(y, 42)`` is ``(y, ti.i32)``, a different value from the previous signature, therefore a new instantiation and compilation will happen. + The signature of ``add(y, 42)`` is ``(y, ti.i32)``, a different value from the previous signature, hence a new kernel will be instantiated and compiled. .. note:: - Many basic operations in the Taichi standard library is implemented using Taichi kernels for performance, - with more or less metaprogramming tricks. Invoking them will incur **implicit kernel instantiations** + Many basic operations in the Taichi standard library are implemented using Taichi kernels + using metaprogramming tricks. 
Invoking them will incur **implicit kernel instantiations**. Examples include ``x.to_numpy()`` and ``y.from_torch(torch_tensor)``. When you invoke these functions, you will see kernel instantiations, as Taichi kernels will be generated to offload the hard work to multiple CPU cores/GPUs. @@ -84,7 +84,7 @@ will lead to a new instantiation of **add**. Code transformation and optimizations --------------------------------------- -When a new instantiation happens, the Taichi frontend compiler will transform the kernel body AST +When a new instantiation happens, the Taichi frontend compiler (i.e., the ``ASTTransformer`` Python class) will transform the kernel body AST into a Python script, which, when executed, emits a Taichi frontend AST. Basically, some patches are applied to the Python AST so that the Taichi frontend can recognize it. @@ -103,9 +103,10 @@ which allows a series of further IR passes to happen, such as The just-in-time (JIT) compilation engine ----------------------------------------- -Finally, the optimized SSA IR is fed into the LLVM IR codegen, and LLVM JIT generates high-performance executable CPU/GPU programs. +Finally, the optimized SSA IR is fed into backend compilers such as LLVM or Apple Metal/OpenGL shader compilers. +The backend compilers then generate high-performance executable CPU/GPU programs. Kernel launching ---------------- -Taichi kernels will be ultimately launched as multi-threaded CPU tasks or CUDA kernels. +Taichi kernels will be ultimately launched as multi-threaded CPU tasks or GPU kernels. diff --git a/docs/version b/docs/version index fcbaa8478100e..6769f67e2955a 100644 --- a/docs/version +++ b/docs/version @@ -1 +1 @@ -0.6.14 +0.6.15 diff --git a/examples/export_videos.py b/examples/export_videos.py index b6ddeeef6b50c..50d0738a4a1f3 100644 --- a/examples/export_videos.py +++ b/examples/export_videos.py @@ -13,8 +13,8 @@ def paint(): result_dir = "./results" video_manager = ti.VideoManager(output_dir=result_dir, - framerate=24, - automatic_build=False) + framerate=24, + automatic_build=False) for i in range(50): paint() diff --git a/misc/listgen_demo.py b/misc/listgen_demo.py index 5ae7213d05e8d..2d7459a8497c1 100644 --- a/misc/listgen_demo.py +++ b/misc/listgen_demo.py @@ -5,9 +5,11 @@ x = ti.var(ti.i32) ti.root.dense(ti.i, 4).bitmasked(ti.i, 4).place(x) + @ti.kernel def func(): for i in x: print(i) + func() diff --git a/misc/prtags.json b/misc/prtags.json index fe9b33ddcd70a..6ed8dd0a96e72 100644 --- a/misc/prtags.json +++ b/misc/prtags.json @@ -28,5 +28,6 @@ "windows" : "Windows", "perf" : "Performance improvements", "ipython" : "IPython and other shells", + "cc" : "C source backend", "release" : "Release" } diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py index 8cd8c17b36f63..c6f9e7c3ac03d 100644 --- a/python/taichi/__init__.py +++ b/python/taichi/__init__.py @@ -7,7 +7,7 @@ from taichi.misc import * from taichi.misc.gui import GUI from taichi.misc.np2ply import PLYWriter -from taichi.misc.image import * +from taichi.misc.image import imread, imwrite, imshow, imdisplay from taichi.misc.task import Task from taichi.misc.test import * from taichi.misc import settings as settings diff --git a/python/taichi/core/util.py b/python/taichi/core/util.py index 9d69334eb2a75..79ddc6531d14e 100644 --- a/python/taichi/core/util.py +++ b/python/taichi/core/util.py @@ -30,7 +30,7 @@ def import_ti_core(tmp_dir=None): global ti_core if get_os_name() != 'win': old_flags = sys.getdlopenflags() - sys.setdlopenflags(2) # 2 = RTLD_NOW + 
sys.setdlopenflags(2 | 8) # RTLD_NOW | RTLD_DEEPBIND else: pyddir = os.path.join(package_root(), 'lib') os.environ['PATH'] += ';' + pyddir diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 156756b64f717..a057be0f2a534 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -38,6 +38,7 @@ cuda = core.cuda metal = core.metal opengl = core.opengl +cc = core.cc gpu = [cuda, metal, opengl] cpu = core.host_arch() kernel_profiler_print = lambda: core.get_current_program( @@ -292,6 +293,8 @@ def is_arch_supported(arch): return core.with_metal() elif arch == opengl: return core.with_opengl() + elif arch == cc: + return core.with_cc() elif arch == cpu: return True else: @@ -299,7 +302,7 @@ def is_arch_supported(arch): def supported_archs(): - archs = [cpu, cuda, metal, opengl] + archs = [cpu, cuda, metal, opengl, cc] wanted_archs = os.environ.get('TI_WANTED_ARCHS', '') want_exclude = wanted_archs.startswith('^') diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 39babe2f9a74e..5bceab2ee4637 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -59,6 +59,16 @@ def expr_init_func(rhs): # temporary solution to allow passing in tensors as return expr_init(rhs) +def begin_frontend_struct_for(group, loop_range): + if group.size() != len(loop_range.shape): + raise IndexError( + 'Number of struct-for indices does not match loop variable dimensionality ' + f'({group.size()} != {len(loop_range.shape)}). Maybe you wanted to ' + 'use "for I in ti.grouped(x)" to group all indices into a single vector I?' + ) + taichi_lang_core.begin_frontend_struct_for(group, loop_range.ptr) + + def wrap_scalar(x): if type(x) in [int, float]: return Expr(x) diff --git a/python/taichi/lang/transformer.py b/python/taichi/lang/transformer.py index b8b62224f60b6..3f1b59d3afa88 100644 --- a/python/taichi/lang/transformer.py +++ b/python/taichi/lang/transformer.py @@ -437,7 +437,7 @@ def visit_struct_for(self, node, is_grouped): ___loop_var = 0 {} = ti.make_var_vector(size=len(___loop_var.loop_range().shape)) ___expr_group = ti.make_expr_group({}) - ti.core.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range().ptr) + ti.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range()) ti.core.end_frontend_range_for() '''.format(vars, vars) t = ast.parse(template).body[0] @@ -450,7 +450,7 @@ def visit_struct_for(self, node, is_grouped): {} ___loop_var = 0 ___expr_group = ti.make_expr_group({}) - ti.core.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range().ptr) + ti.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range()) ti.core.end_frontend_range_for() '''.format(var_decl, vars) t = ast.parse(template).body[0] diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py index 809a49ba9202a..0783f68ab2de9 100644 --- a/python/taichi/misc/gui.py +++ b/python/taichi/misc/gui.py @@ -85,8 +85,8 @@ def set_image(self, img): self.img = self.cook_image(img.to_numpy()) else: # Type matched! We can use an optimized copy kernel. - assert img.shape( - ) == self.res, "Image resolution does not match GUI resolution" + assert img.shape \ + == self.res, "Image resolution does not match GUI resolution" from taichi.lang.meta import tensor_to_image tensor_to_image(img, self.img) ti.sync() @@ -96,8 +96,8 @@ def set_image(self, img): self.img = self.cook_image(img.to_numpy()) else: # Type matched! We can use an optimized copy kernel. 
- assert img.shape( - ) == self.res, "Image resolution does not match GUI resolution" + assert img.shape \ + == self.res, "Image resolution does not match GUI resolution" assert img.n in [ 3, 4 ], "Only greyscale, RGB or RGBA images are supported in GUI.set_image" diff --git a/python/taichi/misc/image.py b/python/taichi/misc/image.py index b3ec797cb557b..7e2594e7fa1fc 100644 --- a/python/taichi/misc/image.py +++ b/python/taichi/misc/image.py @@ -2,7 +2,12 @@ import taichi as ti -def imwrite(img, filename): +def cook_image_to_bytes(img): + """ + Takes a NumPy array or Taichi tensor of any type. + Returns a NumPy array of uint8. + This is used by ti.imwrite and ti.imdisplay. + """ if not isinstance(img, np.ndarray): img = img.to_numpy() @@ -16,19 +21,47 @@ def imwrite(img, filename): assert len(img.shape) in [2, 3], "Image must be either RGB/RGBA or greyscale" - resx, resy = img.shape[:2] if len(img.shape) == 2: - comp = 1 + img = img.reshape(*img.shape, 1) + + assert img.shape[2] in [1, 3, + 4], "Image must be either RGB/RGBA or greyscale" + + return img.swapaxes(0, 1)[::-1, :] + + +def imdisplay(img): + """ + Try to display image in interactive shell. + """ + if ti.lang.shell.oinspect.name == ti.lang.shell.ShellType.JUPYTER: + import PIL.Image + from io import BytesIO + import IPython.display + import numpy as np + img = cook_image_to_bytes(img) + with BytesIO() as f: + PIL.Image.fromarray(img).save(f, 'png') + IPython.display.display(IPython.display.Image(data=f.getvalue())) else: - comp = img.shape[2] - assert comp in [1, 3, 4], "Image must be either RGB/RGBA or greyscale" + ti.imshow(img) + - img = np.ascontiguousarray(img.swapaxes(0, 1)[::-1, :]) +def imwrite(img, filename): + """ + Save image to a specific file. + """ + img = cook_image_to_bytes(img) + img = np.ascontiguousarray(img) ptr = img.ctypes.data + resy, resx, comp = img.shape ti.core.imwrite(filename, ptr, resx, resy, comp) def imread(filename, channels=0): + """ + Load image from a specific file. + """ ptr, resx, resy, comp = ti.core.imread(filename, channels) img = np.ndarray(shape=(resy, resx, comp), dtype=np.uint8) img = np.ascontiguousarray(img) @@ -39,6 +72,9 @@ def imread(filename, channels=0): def imshow(img, window_name='Taichi'): + """ + Show image in a Taichi GUI. 
+ """ if not isinstance(img, np.ndarray): img = img.to_numpy() assert len(img.shape) in [2, diff --git a/taichi/analysis/build_cfg.cpp b/taichi/analysis/build_cfg.cpp index c5f314b4023ea..b876356a09095 100644 --- a/taichi/analysis/build_cfg.cpp +++ b/taichi/analysis/build_cfg.cpp @@ -15,7 +15,8 @@ class CFGBuilder : public IRVisitor { int current_stmt_id; int begin_location; std::vector prev_nodes; - bool in_offloaded_for; + OffloadedStmt *current_offload; + bool in_parallel_for; public: CFGBuilder() @@ -23,7 +24,8 @@ class CFGBuilder : public IRVisitor { last_node_in_current_block(nullptr), current_stmt_id(-1), begin_location(-1), - in_offloaded_for(false) { + current_offload(nullptr), + in_parallel_for(false) { allow_undefined_visitor = true; invoke_default_visitor = true; graph = std::make_unique(); @@ -40,7 +42,7 @@ class CFGBuilder : public IRVisitor { CFGNode *new_node(int next_begin_location) { auto node = graph->push_back(current_block, begin_location, current_stmt_id, - in_offloaded_for, last_node_in_current_block); + in_parallel_for, last_node_in_current_block); for (auto &prev_node : prev_nodes) { CFGNode::add_edge(prev_node, node); } @@ -125,14 +127,23 @@ class CFGBuilder : public IRVisitor { } void visit(RangeForStmt *stmt) override { + auto old_in_parallel_for = in_parallel_for; + if (!current_offload) + in_parallel_for = true; visit_loop(stmt->body.get(), new_node(-1), false); + in_parallel_for = old_in_parallel_for; } void visit(StructForStmt *stmt) override { + auto old_in_parallel_for = in_parallel_for; + if (!current_offload) + in_parallel_for = true; visit_loop(stmt->body.get(), new_node(-1), false); + in_parallel_for = old_in_parallel_for; } void visit(OffloadedStmt *stmt) override { + current_offload = stmt; if (stmt->prologue) { auto before_offload = new_node(-1); int offload_stmt_id = current_stmt_id; @@ -149,10 +160,10 @@ class CFGBuilder : public IRVisitor { auto block_begin_index = graph->size(); if (stmt->task_type == OffloadedStmt::TaskType::range_for || stmt->task_type == OffloadedStmt::TaskType::struct_for) { - in_offloaded_for = true; + in_parallel_for = true; } stmt->body->accept(this); - in_offloaded_for = false; + in_parallel_for = false; prev_nodes.push_back(graph->back()); // Container statements don't belong to any CFGNodes. 
begin_location = offload_stmt_id + 1; @@ -168,6 +179,7 @@ class CFGBuilder : public IRVisitor { begin_location = offload_stmt_id + 1; CFGNode::add_edge(before_offload, graph->nodes[block_begin_index].get()); } + current_offload = nullptr; } void visit(Block *block) override { diff --git a/taichi/backends/cc/cc_configuation.h b/taichi/backends/cc/cc_configuation.h new file mode 100644 index 0000000000000..6306f36b58866 --- /dev/null +++ b/taichi/backends/cc/cc_configuation.h @@ -0,0 +1,20 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +struct CCConfiguation { + std::string compile_cmd; + + CCConfiguation() : compile_cmd("gcc -shared -fPIC -o '{}' '{}'") { + } +}; + +extern CCConfiguation cfg; + +bool is_c_backend_available(); + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_kernel.h b/taichi/backends/cc/cc_kernel.h new file mode 100644 index 0000000000000..390887808607e --- /dev/null +++ b/taichi/backends/cc/cc_kernel.h @@ -0,0 +1,27 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCProgram; + +class CCKernel { + public: + CCKernel(std::string const &source, std::string const &name) + : source(source), name(name) { + } + + void compile(); + void launch(CCProgram *program, Context *ctx); + + std::string source; + std::string name; + + std::string src_path; + std::string bin_path; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_layout.h b/taichi/backends/cc/cc_layout.h new file mode 100644 index 0000000000000..0d3081c0645e6 --- /dev/null +++ b/taichi/backends/cc/cc_layout.h @@ -0,0 +1,17 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCLayout { + public: + CCLayout() { + } + + std::string source; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_program.cpp b/taichi/backends/cc/cc_program.cpp new file mode 100644 index 0000000000000..234990d78dcc4 --- /dev/null +++ b/taichi/backends/cc/cc_program.cpp @@ -0,0 +1,50 @@ +#include "taichi/common/core.h" +#include "taichi/system/dynamic_loader.h" +#include "cc_program.h" +#include "cc_configuation.h" +#include "cc_kernel.h" +#include "cc_layout.h" +#include "cc_utils.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +CCConfiguation cfg; + +void CCKernel::compile() { + bin_path = fmt::format("{}/{}.so", runtime_tmp_dir, name); + src_path = fmt::format("{}/{}.c", runtime_tmp_dir, name); + + std::ofstream(src_path) << source; + TI_INFO("[cc] compiling kernel [{}]:\n{}\n", name, source); + execute(cfg.compile_cmd, bin_path, src_path); +} + +void CCKernel::launch(CCProgram *launcher, Context *ctx) { + using FuncEntryType = void(); + DynamicLoader dll(bin_path); + TI_ASSERT_INFO(dll.loaded(), "[cc] could not load shared object: {}", + bin_path); + auto main = + reinterpret_cast(dll.load_function(get_sym_name(name))); + TI_INFO("[cc] entering kernel [{}]", name); + (*main)(); + TI_INFO("[cc] leaving kernel [{}]", name); +} + +void CCProgram::launch(CCKernel *kernel, Context *ctx) { + kernel->launch(this, ctx); +} + +CCProgram::CCProgram() { +} + +CCProgram::~CCProgram() { +} + +bool is_c_backend_available() { + return true; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_program.h b/taichi/backends/cc/cc_program.h new file mode 100644 index 0000000000000..fc2888c30fb08 --- /dev/null +++ b/taichi/backends/cc/cc_program.h @@ -0,0 +1,24 @@ +#pragma once + +#include 
"taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCKernel; +class CCLayout; + +class CCProgram { + // Launch C compiler to compile generated source code, and run them + public: + CCProgram(); + ~CCProgram(); + + void launch(CCKernel *kernel, Context *ctx); + + std::vector> kernels; + std::unique_ptr layout; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_utils.h b/taichi/backends/cc/cc_utils.h new file mode 100644 index 0000000000000..0384850685430 --- /dev/null +++ b/taichi/backends/cc/cc_utils.h @@ -0,0 +1,66 @@ +#pragma once + +#include "taichi/lang_util.h" +#include "taichi/common/core.h" +#include +#include +#include +#include + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +inline std::string c_quoted(std::string const &str) { + // https://zh.cppreference.com/w/cpp/language/escape + std::stringstream ss; + ss << '"'; + for (auto const &c : str) { + switch (c) { +#define REG_ESC(x, y) \ + case x: \ + ss << "\\" y; \ + break; + REG_ESC('\n', "n"); + REG_ESC('\a', "a"); + REG_ESC('\b', "b"); + REG_ESC('\?', "?"); + REG_ESC('\v', "v"); + REG_ESC('\t', "t"); + REG_ESC('\f', "f"); + REG_ESC('\'', "'"); + REG_ESC('\"', "\""); + REG_ESC('\\', "\\"); + default: + ss << c; + } + } + ss << '"'; + return ss.str(); +} + +inline std::string cc_data_type_name(DataType dt) { + switch (dt) { + case DataType::i32: + return "int"; + case DataType::f32: + return "float"; + default: + TI_NOT_IMPLEMENTED + } +} + +inline std::string get_sym_name(std::string const &name) { + return fmt::format("Ti_{}", name); +} + +template +inline int execute(std::string fmt, Args &&... args) { + auto cmd = fmt::format(fmt, std::forward(args)...); + TI_INFO("Executing command: {}", cmd); + int ret = std::system(cmd.c_str()); + TI_INFO("Command exit status: {}", ret); + return ret; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/codegen_cc.cpp b/taichi/backends/cc/codegen_cc.cpp new file mode 100644 index 0000000000000..1979fab3084ea --- /dev/null +++ b/taichi/backends/cc/codegen_cc.cpp @@ -0,0 +1,148 @@ +#include "codegen_cc.h" +#include "cc_kernel.h" +#include "cc_layout.h" +#include "taichi/ir/ir.h" +#include "taichi/ir/transforms.h" +#include "taichi/util/line_appender.h" +#include "cc_utils.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { // Codegen for C Compiler Processor + +class CCTransformer : public IRVisitor { + private: + [[maybe_unused]] Kernel *kernel; + [[maybe_unused]] CCLayout *layout; + + LineAppender line_appender; + LineAppender line_appender_header; + bool is_top_level{true}; + + public: + CCTransformer(Kernel *kernel, CCLayout *layout) + : kernel(kernel), layout(layout) { + allow_undefined_visitor = true; + invoke_default_visitor = true; + } + + void run() { + this->lower_ast(); + emit_header("#include "); + kernel->ir->accept(this); + } + + void lower_ast() { + auto ir = kernel->ir.get(); + auto config = kernel->program.config; + config.demote_dense_struct_fors = true; + irpass::compile_to_offloads(ir, config, + /*vectorize=*/false, kernel->grad, + /*ad_use_stack=*/false, config.print_ir, + /*lower_global_access*/ true); + } + + std::string get_source() { + return line_appender_header.lines() + "\n" + line_appender.lines(); + } + + private: + void visit(Block *stmt) override { + if (!is_top_level) + line_appender.push_indent(); + for (auto &s : stmt->statements) { + s->accept(this); + } + if (!is_top_level) + line_appender.pop_indent(); + } + + void visit(Stmt *stmt) override { + TI_WARN("[cc] unsupported statement 
type {}", typeid(*stmt).name()); + } + + void visit(ConstStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + emit("{} {} = {};", cc_data_type_name(stmt->element_type()), + stmt->raw_name(), stmt->val[0].stringify()); + } + + void visit(PrintStmt *stmt) override { + std::string format; + std::vector values; + + for (int i = 0; i < stmt->contents.size(); i++) { + auto const &content = stmt->contents[i]; + + if (std::holds_alternative(content)) { + auto arg_stmt = std::get(content); + format += data_type_format(arg_stmt->ret_type.data_type); + values.push_back(arg_stmt->raw_name()); + + } else { + auto str = std::get(content); + format += "%s"; + values.push_back(c_quoted(str)); + } + } + + values.insert(values.begin(), c_quoted(format)); + emit("printf({});", fmt::join(values, ", ")); + } + + void generate_serial_kernel(OffloadedStmt *stmt) { + emit_header("void {}(void);", get_sym_name(kernel->name)); + emit("void {}(void) {{", get_sym_name(kernel->name)); + { + ScopedIndent _s(line_appender); + stmt->body->accept(this); + } + emit("}}"); + } + + void visit(OffloadedStmt *stmt) override { + TI_ASSERT(is_top_level); + is_top_level = false; + if (stmt->task_type == OffloadedStmt::TaskType::serial) { + generate_serial_kernel(stmt); + } else { + TI_ERROR("[glsl] Unsupported offload type={} on C backend", + stmt->task_name()); + } + is_top_level = true; + } + + template + void emit(std::string f, Args &&... args) { + line_appender.append(std::move(f), std::move(args)...); + } + + template + void emit_header(std::string f, Args &&... args) { + line_appender_header.append(std::move(f), std::move(args)...); + } +}; + +std::unique_ptr CCKernelGen::compile() { + auto layout = kernel->program.cc_program->layout.get(); + CCTransformer tran(kernel, layout); + + tran.run(); + auto source = tran.get_source(); + auto ker = std::make_unique(source, kernel->name); + ker->compile(); + return ker; +} + +FunctionType compile_kernel(Kernel *kernel) { + CCKernelGen codegen(kernel); + auto compiled = codegen.compile(); + auto compiled_ptr = compiled.get(); + auto program = kernel->program.cc_program.get(); + program->kernels.push_back(std::move(compiled)); + return [program, compiled_ptr](Context &ctx) { + return program->launch(compiled_ptr, &ctx); + }; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/codegen_cc.h b/taichi/backends/cc/codegen_cc.h new file mode 100644 index 0000000000000..de029e4f11e82 --- /dev/null +++ b/taichi/backends/cc/codegen_cc.h @@ -0,0 +1,26 @@ +#pragma once + +#include "taichi/lang_util.h" +#include "taichi/codegen/codegen.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCKernel; + +class CCKernelGen { + // Generate corresponding C Source Code for a Taichi Kernel + public: + CCKernelGen(Kernel *kernel) : kernel(kernel) { + } + + std::unique_ptr compile(); + + private: + Kernel *kernel; +}; + +FunctionType compile_kernel(Kernel *kernel); + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/struct_cc.cpp b/taichi/backends/cc/struct_cc.cpp new file mode 100644 index 0000000000000..252152d623d38 --- /dev/null +++ b/taichi/backends/cc/struct_cc.cpp @@ -0,0 +1,14 @@ +#include "struct_cc.h" +#include "cc_layout.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +std::unique_ptr CCLayoutGen::compile() { + auto lay = std::make_unique(); + // W.I.P. 
+ return lay; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/struct_cc.h b/taichi/backends/cc/struct_cc.h new file mode 100644 index 0000000000000..2b73b328052f9 --- /dev/null +++ b/taichi/backends/cc/struct_cc.h @@ -0,0 +1,24 @@ +#pragma once + +#include "taichi/lang_util.h" +#include "taichi/ir/snode.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCLayout; + +class CCLayoutGen { + // Generate corresponding C Source Code for Taichi Structures + public: + CCLayoutGen(SNode *root) : root(root) { + } + + std::unique_ptr compile(); + + private: + SNode *root; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/api.cpp b/taichi/backends/metal/api.cpp index ebfdde95ceb3c..6e8314310241a 100644 --- a/taichi/backends/metal/api.cpp +++ b/taichi/backends/metal/api.cpp @@ -1,4 +1,6 @@ #include "taichi/backends/metal/api.h" + +#include "taichi/backends/metal/constants.h" #include "taichi/util/environ_config.h" TLANG_NAMESPACE_BEGIN @@ -53,15 +55,18 @@ nsobj_unique_ptr new_compute_command_encoder( return wrap_as_nsobj_unique_ptr(encoder); } -nsobj_unique_ptr new_library_with_source( - MTLDevice *device, - const std::string &source) { +nsobj_unique_ptr new_library_with_source(MTLDevice *device, + const std::string &source, + int msl_version) { auto source_str = mac::wrap_string_as_ns_string(source); id options = clscall("MTLCompileOptions", "alloc"); options = call(options, "init"); auto options_cleanup = wrap_as_nsobj_unique_ptr(options); call(options, "setFastMathEnabled:", false); + if (msl_version != kMslVersionNone) { + call(options, "setLanguageVersion:", msl_version); + } auto *lib = cast_call( device, "newLibraryWithSource:options:error:", source_str.get(), options, diff --git a/taichi/backends/metal/api.h b/taichi/backends/metal/api.h index b905d70693176..7cd307eebf3c8 100644 --- a/taichi/backends/metal/api.h +++ b/taichi/backends/metal/api.h @@ -3,12 +3,12 @@ // Reference implementation: // https://github.com/halide/Halide/blob/master/src/runtime/metal.cpp +#include + #include "taichi/common/trait.h" #include "taichi/lang_util.h" #include "taichi/platform/mac/objc_api.h" -#include - TLANG_NAMESPACE_BEGIN namespace metal { @@ -42,8 +42,11 @@ nsobj_unique_ptr new_command_buffer(MTLCommandQueue *queue); nsobj_unique_ptr new_compute_command_encoder( MTLCommandBuffer *buffer); +// msl_version: Metal Shading Language version. 0 means not set.
+// See https://developer.apple.com/documentation/metal/mtllanguageversion nsobj_unique_ptr new_library_with_source(MTLDevice *device, - const std::string &source); + const std::string &source, + int msl_version); nsobj_unique_ptr new_function_with_name(MTLLibrary *library, const std::string &name); diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index a649a14bc7983..8a8a776b925ff 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ -8,6 +8,8 @@ #include "taichi/ir/ir.h" #include "taichi/ir/transforms.h" #include "taichi/util/line_appender.h" +#include "taichi/math/arithmetic.h" +#include "taichi/backends/metal/api.h" TLANG_NAMESPACE_BEGIN namespace metal { @@ -30,6 +32,7 @@ using BuffersEnum = KernelAttributes::Buffers; constexpr char kKernelThreadIdName[] = "utid_"; // 'u' for unsigned constexpr char kKernelGridSizeName[] = "ugrid_size_"; // 'u' for unsigned +constexpr char kKernelTidInSimdgroupName[] = "utid_in_simdg_"; constexpr char kRootBufferName[] = "root_addr"; constexpr char kGlobalTmpsBufferName[] = "global_tmps_addr"; constexpr char kContextBufferName[] = "ctx_addr"; @@ -43,6 +46,7 @@ constexpr char kListgenElemVarName[] = "listgen_elem_"; constexpr char kRandStateVarName[] = "rand_state_"; constexpr char kSNodeMetaVarName[] = "sn_meta_"; constexpr char kMemAllocVarName[] = "mem_alloc_"; +constexpr char kTlsBufferName[] = "tls_buffer_"; std::string buffer_to_name(BuffersEnum b) { switch (b) { @@ -79,24 +83,22 @@ class KernelCodegen : public IRVisitor { Section::Kernels, }; - struct UsedFeatures { - bool runtime_list_ops = false; - }; - public: // TODO(k-ye): Create a Params to hold these ctor params. KernelCodegen(const std::string &mtl_kernel_prefix, const std::string &root_snode_type_name, Kernel *kernel, const CompiledStructs *compiled_structs, - PrintStringTable *print_strtab) + PrintStringTable *print_strtab, + const CodeGen::Config &config) : mtl_kernel_prefix_(mtl_kernel_prefix), root_snode_type_name_(root_snode_type_name), kernel_(kernel), compiled_structs_(compiled_structs), needs_root_buffer_(compiled_structs_->root_size > 0), ctx_attribs_(*kernel_), - print_strtab_(print_strtab) { + print_strtab_(print_strtab), + cgen_config_(config) { // allow_undefined_visitor = true; for (const auto s : kAllSections) { section_appenders_[s] = LineAppender(); @@ -107,8 +109,8 @@ class KernelCodegen : public IRVisitor { return ctx_attribs_; } - const std::vector &kernels_attribs() const { - return mtl_kernels_attribs_; + const TaichiKernelAttributes &ti_kernels_attribs() const { + return ti_kernel_attribus_; } std::string run() { @@ -340,6 +342,13 @@ class KernelCodegen : public IRVisitor { stmt->raw_name(), dt, kGlobalTmpsBufferName, stmt->offset); } + void visit(ThreadLocalPtrStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + emit("thread auto* {} = reinterpret_cast({} + {});", + stmt->raw_name(), metal_data_type_name(stmt->element_type()), + kTlsBufferName, stmt->offset); + } + void visit(LoopIndexStmt *stmt) override { const auto stmt_name = stmt->raw_name(); if (stmt->loop->is()) { @@ -451,24 +460,36 @@ class KernelCodegen : public IRVisitor { TI_NOT_IMPLEMENTED; } + std::string val_var = stmt->val->raw_name(); + // TODO(k-ye): This is not a very reliable way to detect if we're in TLS + // xlogues... 
+ const bool is_tls_reduction = + (inside_tls_epilogue_ && (op_type == AtomicOpType::add)); + const bool use_simd_in_tls_reduction = + (is_tls_reduction && cgen_config_.allow_simdgroup); + if (use_simd_in_tls_reduction) { + val_var += "_simd_val_"; + emit("const auto {} = simd_sum({});", val_var, stmt->val->raw_name()); + emit("if ({} == 0) {{", kKernelTidInSimdgroupName); + current_appender().push_indent(); + } + if (dt == DataType::i32) { emit( "const auto {} = atomic_fetch_{}_explicit((device atomic_int*){}, " "{}, " "metal::memory_order_relaxed);", - stmt->raw_name(), op_name, stmt->dest->raw_name(), - stmt->val->raw_name()); + stmt->raw_name(), op_name, stmt->dest->raw_name(), val_var); } else if (dt == DataType::u32) { emit( "const auto {} = atomic_fetch_{}_explicit((device atomic_uint*){}, " "{}, " "metal::memory_order_relaxed);", - stmt->raw_name(), op_name, stmt->dest->raw_name(), - stmt->val->raw_name()); + stmt->raw_name(), op_name, stmt->dest->raw_name(), val_var); } else if (dt == DataType::f32) { if (handle_float) { emit("const float {} = fatomic_fetch_{}({}, {});", stmt->raw_name(), - op_name, stmt->dest->raw_name(), stmt->val->raw_name()); + op_name, stmt->dest->raw_name(), val_var); } else { TI_ERROR("Metal does not support atomic {} for floating points", op_name); @@ -476,6 +497,11 @@ class KernelCodegen : public IRVisitor { } else { TI_ERROR("Metal only supports 32-bit atomic data types"); } + + if (use_simd_in_tls_reduction) { + current_appender().pop_indent(); + emit("}}"); // closes `if (kKernelTidInSimdgroupName == 0) {` + } } void visit(IfStmt *if_stmt) override { @@ -561,7 +587,8 @@ class KernelCodegen : public IRVisitor { } void visit(PrintStmt *stmt) override { - mark_print_used(); + used_features()->print = true; + const auto &contents = stmt->contents; const int num_entries = contents.size(); const std::string msgbuf_var_name = stmt->raw_name() + "_msgbuf_"; @@ -654,6 +681,7 @@ class KernelCodegen : public IRVisitor { void emit_headers() { SectionGuard sg(this, Section::Headers); emit("#include "); + emit("#include "); emit("using namespace metal;"); } @@ -722,7 +750,7 @@ class KernelCodegen : public IRVisitor { SectionGuard sg(this, Section::Kernels); kernel_->ir->accept(this); - if (used_features_.runtime_list_ops) { + if (used_features()->sparse) { emit(""); current_appender().append_raw(shaders::kMetalRuntimeKernelsSourceCode); } @@ -768,7 +796,7 @@ class KernelCodegen : public IRVisitor { emit("}}\n"); current_kernel_attribs_ = nullptr; - mtl_kernels_attribs_.push_back(ka); + mtl_kernels_attribs()->push_back(ka); } void generate_range_for_kernel(OffloadedStmt *stmt) { @@ -779,9 +807,16 @@ class KernelCodegen : public IRVisitor { ka.task_type = stmt->task_type; ka.buffers = get_common_buffers(); - emit_mtl_kernel_sig(mtl_kernel_name, ka.buffers); + const bool used_tls = (stmt->prologue != nullptr); + KernelSigExtensions kernel_exts; + kernel_exts.use_simdgroup = (used_tls && cgen_config_.allow_simdgroup); + used_features()->simdgroup = + used_features()->simdgroup || kernel_exts.use_simdgroup; - auto &range_for_attribs = ka.range_for_attribs; + emit_mtl_kernel_sig(mtl_kernel_name, ka.buffers, kernel_exts); + + ka.range_for_attribs = KernelAttributes::RangeForAttributes(); + auto &range_for_attribs = ka.range_for_attribs.value(); range_for_attribs.const_begin = stmt->const_begin; range_for_attribs.const_end = stmt->const_end; range_for_attribs.begin = @@ -820,23 +855,51 @@ class KernelCodegen : public IRVisitor { emit("const int begin_ = {} + {};", 
kKernelThreadIdName, begin_expr); // end_ = total_elems + begin_expr emit("const int end_ = {} + {};", total_elems_name, begin_expr); + + if (used_tls) { + // Using |int32_t| because it aligns to 4bytes. + emit("// TLS prologue"); + const std::string tls_bufi32_name = "tls_bufi32_"; + emit("int32_t {}[{}];", tls_bufi32_name, (stmt->tls_size + 3) / 4); + emit("thread char* {} = reinterpret_cast({});", + kTlsBufferName, tls_bufi32_name); + stmt->prologue->accept(this); + } + emit("for (int ii = begin_; ii < end_; ii += {}) {{", kKernelGridSizeName); { ScopedIndent s2(current_appender()); current_kernel_attribs_ = &ka; const auto mtl_func_name = mtl_kernel_func_name(mtl_kernel_name); - emit_mtl_kernel_func_def(mtl_func_name, ka.buffers, stmt->body.get()); - emit_call_mtl_kernel_func(mtl_func_name, ka.buffers, + std::vector extra_func_params; + std::vector extra_args; + if (used_tls) { + extra_func_params.push_back({"thread char*", kTlsBufferName}); + extra_args.push_back(kTlsBufferName); + } + emit_mtl_kernel_func_def(mtl_func_name, ka.buffers, extra_func_params, + stmt->body.get()); + emit_call_mtl_kernel_func(mtl_func_name, ka.buffers, extra_args, /*loop_index_expr=*/"ii"); } emit("}}"); // closes for loop + + if (used_tls) { + TI_ASSERT(stmt->epilogue != nullptr); + inside_tls_epilogue_ = true; + emit("{{ // TLS epilogue"); + stmt->epilogue->accept(this); + inside_tls_epilogue_ = false; + emit("}}"); + } + current_appender().pop_indent(); // Close kernel emit("}}\n"); current_kernel_attribs_ = nullptr; - mtl_kernels_attribs_.push_back(ka); + mtl_kernels_attribs()->push_back(ka); } void generate_struct_for_kernel(OffloadedStmt *stmt) { @@ -919,7 +982,7 @@ class KernelCodegen : public IRVisitor { current_appender().pop_indent(); emit("}}\n"); // closes kernel - mtl_kernels_attribs_.push_back(ka); + mtl_kernels_attribs()->push_back(ka); } void add_runtime_list_op_kernel(OffloadedStmt *stmt, @@ -944,11 +1007,13 @@ class KernelCodegen : public IRVisitor { } else { TI_ERROR("Unsupported offload task type {}", stmt->task_name()); } - ka.runtime_list_op_attribs.snode = sn; + + ka.runtime_list_op_attribs = KernelAttributes::RuntimeListOpAttributes(); + ka.runtime_list_op_attribs->snode = sn; current_kernel_attribs_ = nullptr; - mtl_kernels_attribs_.push_back(ka); - used_features_.runtime_list_ops = true; + mtl_kernels_attribs()->push_back(ka); + used_features()->sparse = true; } std::string inject_load_global_tmp(int offset, DataType dt = DataType::i32) { @@ -1042,15 +1107,28 @@ class KernelCodegen : public IRVisitor { loop_index_expr); } + struct KernelSigExtensions { + // https://stackoverflow.com/a/44693603/12003165 + KernelSigExtensions() noexcept { + } + + bool use_simdgroup = false; + }; + void emit_mtl_kernel_sig( const std::string &kernel_name, - const std::vector &buffers) { + const std::vector &buffers, + const KernelSigExtensions &exts = {}) { emit("kernel void {}(", kernel_name); for (int i = 0; i < buffers.size(); ++i) { emit(" device byte* {} [[buffer({})]],", buffer_to_name(buffers[i]), i); } emit(" const uint {} [[threads_per_grid]],", kKernelGridSizeName); + if (exts.use_simdgroup) { + emit(" const uint {} [[thread_index_in_simdgroup]],", + kKernelTidInSimdgroupName); + } emit(" const uint {} [[thread_position_in_grid]]) {{", kKernelThreadIdName); } @@ -1089,11 +1167,6 @@ class KernelCodegen : public IRVisitor { return kernel_name + "_func"; } - void mark_print_used() { - TI_ASSERT(current_kernel_attribs_ != nullptr); - current_kernel_attribs_->uses_print = true; - } - class 
SectionGuard { public: SectionGuard(KernelCodegen *kg, Section new_sec) @@ -1124,6 +1197,14 @@ class KernelCodegen : public IRVisitor { current_appender().append(std::move(f), std::forward(args)...); } + std::vector *mtl_kernels_attribs() { + return &(ti_kernel_attribus_.mtl_kernels_attribs); + } + + TaichiKernelAttributes::UsedFeatures *used_features() { + return &(ti_kernel_attribus_.used_features); + } + const std::string mtl_kernel_prefix_; const std::string root_snode_type_name_; Kernel *const kernel_; @@ -1131,13 +1212,14 @@ class KernelCodegen : public IRVisitor { const bool needs_root_buffer_; const KernelContextAttributes ctx_attribs_; PrintStringTable *const print_strtab_; + const CodeGen::Config &cgen_config_; bool is_top_level_{true}; int mtl_kernel_count_{0}; - std::vector mtl_kernels_attribs_; - UsedFeatures used_features_; + TaichiKernelAttributes ti_kernel_attribus_; GetRootStmt *root_stmt_{nullptr}; KernelAttributes *current_kernel_attribs_{nullptr}; + bool inside_tls_epilogue_{false}; Section code_section_{Section::Structs}; std::unordered_map section_appenders_; }; @@ -1146,12 +1228,14 @@ class KernelCodegen : public IRVisitor { CodeGen::CodeGen(Kernel *kernel, KernelManager *kernel_mgr, - const CompiledStructs *compiled_structs) + const CompiledStructs *compiled_structs, + const Config &config) : kernel_(kernel), kernel_mgr_(kernel_mgr), compiled_structs_(compiled_structs), id_(Program::get_kernel_id()), - taichi_kernel_name_(fmt::format("mtl_k{:04d}_{}", id_, kernel_->name)) { + taichi_kernel_name_(fmt::format("mtl_k{:04d}_{}", id_, kernel_->name)), + config_(config) { } FunctionType CodeGen::compile() { @@ -1159,14 +1243,16 @@ FunctionType CodeGen::compile() { config.demote_dense_struct_fors = true; irpass::compile_to_offloads(kernel_->ir.get(), config, /*vectorize=*/false, kernel_->grad, - /*ad_use_stack=*/true, config.print_ir); + /*ad_use_stack=*/true, config.print_ir, + /*lower_global_access=*/true, + /*make_thread_local=*/true); - KernelCodegen codegen(taichi_kernel_name_, - kernel_->program.snode_root->node_type_name, kernel_, - compiled_structs_, kernel_mgr_->print_strtable()); + KernelCodegen codegen( + taichi_kernel_name_, kernel_->program.snode_root->node_type_name, kernel_, + compiled_structs_, kernel_mgr_->print_strtable(), config_); const auto source_code = codegen.run(); kernel_mgr_->register_taichi_kernel(taichi_kernel_name_, source_code, - codegen.kernels_attribs(), + codegen.ti_kernels_attribs(), codegen.kernel_ctx_attribs()); return [kernel_mgr = kernel_mgr_, kernel_name = taichi_kernel_name_](Context &ctx) { diff --git a/taichi/backends/metal/codegen_metal.h b/taichi/backends/metal/codegen_metal.h index 00e58a33a2aab..468180306d973 100644 --- a/taichi/backends/metal/codegen_metal.h +++ b/taichi/backends/metal/codegen_metal.h @@ -18,9 +18,14 @@ namespace metal { class CodeGen { public: + struct Config { + bool allow_simdgroup = true; + }; + CodeGen(Kernel *kernel, KernelManager *kernel_mgr, - const CompiledStructs *compiled_structs); + const CompiledStructs *compiled_structs, + const Config &config); FunctionType compile(); @@ -33,6 +38,7 @@ class CodeGen { const CompiledStructs *const compiled_structs_; const int id_; const std::string taichi_kernel_name_; + const Config config_; }; } // namespace metal diff --git a/taichi/backends/metal/constants.h b/taichi/backends/metal/constants.h index 9a8d244977b52..72244377abcc4 100644 --- a/taichi/backends/metal/constants.h +++ b/taichi/backends/metal/constants.h @@ -9,6 +9,7 @@ namespace metal { inline 
constexpr int kMaxNumThreadsGridStrideLoop = 64 * 1024; inline constexpr int kNumRandSeeds = 64 * 1024; // 256 KB is nothing +inline constexpr int kMslVersionNone = 0; } // namespace metal -TLANG_NAMESPACE_END \ No newline at end of file +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/env_config.cpp b/taichi/backends/metal/env_config.cpp new file mode 100644 index 0000000000000..d1d3d2da8426b --- /dev/null +++ b/taichi/backends/metal/env_config.cpp @@ -0,0 +1,21 @@ +#include "taichi/backends/metal/env_config.h" + +#include "taichi/lang_util.h" +#include "taichi/util/environ_config.h" + +TLANG_NAMESPACE_BEGIN +namespace metal { + +EnvConfig::EnvConfig() { + simdgroup_enabled_ = + get_environ_config("TI_USE_METAL_SIMDGROUP", /*default_value=*/1); +} + +const EnvConfig &EnvConfig::instance() { + static const EnvConfig c; + return c; +} + +} // namespace metal + +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/env_config.h b/taichi/backends/metal/env_config.h new file mode 100644 index 0000000000000..94268d7874714 --- /dev/null +++ b/taichi/backends/metal/env_config.h @@ -0,0 +1,30 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace metal { + +// Metal specific config inferred from the environment. +class EnvConfig { + public: + // Set TI_USE_METAL_SIMDGROUP=0 to disable SIMD group. + // This is an ad-hoc thing. Apple claims that SIMD group is supported in + // MSL 2.0, which isn't the case. According to my test, it's available in + // MSL 2.1. Since MSL 2.1 is released since macOS 10.14, we expect the major + // users would be able to use this feature. + inline bool is_simdgroup_enabled() const { + return simdgroup_enabled_; + } + + static const EnvConfig &instance(); + + private: + EnvConfig(); + + bool simdgroup_enabled_; +}; + +} // namespace metal + +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index ccd63af75f48d..b111951ad0326 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -33,6 +33,14 @@ namespace shaders { using KernelTaskType = OffloadedStmt::TaskType; using BufferEnum = KernelAttributes::Buffers; +inline int infer_msl_version(const TaichiKernelAttributes::UsedFeatures &f) { + if (f.simdgroup) { + // https://developer.apple.com/documentation/metal/mtllanguageversion/version2_1 + return 131073; + } + return kMslVersionNone; +} + // This class requests the Metal buffer memory of |size| bytes from |mem_pool|. // Once allocated, it does not own the memory (hence the name "view"). Instead, // GC is deferred to the memory pool. 
@@ -152,7 +160,7 @@ class RuntimeListOpsMtlKernel : public CompiledMtlKernelBase { const SNodeDescriptorsMap *snode_descriptors = nullptr; const SNode *snode() const { - return kernel_attribs->runtime_list_op_attribs.snode; + return kernel_attribs->runtime_list_op_attribs->snode; } }; @@ -208,7 +216,7 @@ class CompiledTaichiKernel { struct Params { std::string_view taichi_kernel_name; std::string mtl_source_code; - const std::vector *mtl_kernels_attribs; + const TaichiKernelAttributes *ti_kernel_attribs; const KernelContextAttributes *ctx_attribs; const SNodeDescriptorsMap *snode_descriptors; MTLDevice *device; @@ -216,14 +224,17 @@ class CompiledTaichiKernel { KernelProfilerBase *profiler; }; - CompiledTaichiKernel(Params params) : ctx_attribs(*params.ctx_attribs) { + CompiledTaichiKernel(Params params) + : ctx_attribs(*params.ctx_attribs), + used_features(params.ti_kernel_attribs->used_features) { auto *const device = params.device; - auto kernel_lib = new_library_with_source(device, params.mtl_source_code); + auto kernel_lib = new_library_with_source(device, params.mtl_source_code, + infer_msl_version(used_features)); if (kernel_lib == nullptr) { TI_ERROR("Failed to compile Metal kernel! Generated code:\n\n{}", params.mtl_source_code); } - for (const auto &ka : *(params.mtl_kernels_attribs)) { + for (const auto &ka : params.ti_kernel_attribs->mtl_kernels_attribs) { auto mtl_func = new_function_with_name(kernel_lib.get(), ka.name); TI_ASSERT(mtl_func != nullptr); // Note that CompiledMtlKernel doesn't own |kernel_func|. @@ -265,8 +276,7 @@ class CompiledTaichiKernel { KernelContextAttributes ctx_attribs; std::unique_ptr ctx_mem; nsobj_unique_ptr ctx_buffer; - - private: + TaichiKernelAttributes::UsedFeatures used_features; }; class HostMetalCtxBlitter { @@ -445,11 +455,10 @@ class KernelManager::Impl { init_print_buffer(); } - void register_taichi_kernel( - const std::string &taichi_kernel_name, - const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, - const KernelContextAttributes &ctx_attribs) { + void register_taichi_kernel(const std::string &taichi_kernel_name, + const std::string &mtl_kernel_source_code, + const TaichiKernelAttributes &ti_kernel_attribs, + const KernelContextAttributes &ctx_attribs) { TI_ASSERT(compiled_taichi_kernels_.find(taichi_kernel_name) == compiled_taichi_kernels_.end()); @@ -463,7 +472,7 @@ class KernelManager::Impl { CompiledTaichiKernel::Params params; params.taichi_kernel_name = taichi_kernel_name; params.mtl_source_code = mtl_kernel_source_code; - params.mtl_kernels_attribs = &kernels_attribs; + params.ti_kernel_attribs = &ti_kernel_attribs; params.ctx_attribs = &ctx_attribs; params.snode_descriptors = &compiled_structs_.snode_descriptors; params.device = device_.get(); @@ -493,13 +502,11 @@ class KernelManager::Impl { input_buffers[BufferEnum::Context] = ctk.ctx_buffer.get(); } - bool uses_print = false; for (const auto &mk : ctk.compiled_mtl_kernels) { mk->launch(input_buffers, cur_command_buffer_.get()); - uses_print = (uses_print || mk->kernel_attribs()->uses_print); } - - if (ctx_blitter || uses_print) { + const bool used_print = ctk.used_features.print; + if (ctx_blitter || used_print) { // TODO(k-ye): One optimization is to synchronize only when we absolutely // need to transfer the data back to host. This includes the cases where // an arg is 1) an array, or 2) used as return value. 
@@ -507,7 +514,7 @@ class KernelManager::Impl { if (ctx_blitter) { ctx_blitter->metal_to_host(); } - if (uses_print) { + if (used_print) { flush_print_buffers(); } } @@ -741,11 +748,10 @@ class KernelManager::Impl { TI_ERROR("Metal not supported on the current OS"); } - void register_taichi_kernel( - const std::string &taichi_kernel_name, - const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, - const KernelContextAttributes &ctx_attribs) { + void register_taichi_kernel(const std::string &taichi_kernel_name, + const std::string &mtl_kernel_source_code, + const TaichiKernelAttributes &ti_kernel_attribs, + const KernelContextAttributes &ctx_attribs) { TI_ERROR("Metal not supported on the current OS"); } @@ -776,10 +782,10 @@ KernelManager::~KernelManager() { void KernelManager::register_taichi_kernel( const std::string &taichi_kernel_name, const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, + const TaichiKernelAttributes &ti_kernel_attribs, const KernelContextAttributes &ctx_attribs) { impl_->register_taichi_kernel(taichi_kernel_name, mtl_kernel_source_code, - kernels_attribs, ctx_attribs); + ti_kernel_attribs, ctx_attribs); } void KernelManager::launch_taichi_kernel(const std::string &taichi_kernel_name, diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h index dc3502577bdd2..1c687cedec82b 100644 --- a/taichi/backends/metal/kernel_manager.h +++ b/taichi/backends/metal/kernel_manager.h @@ -39,11 +39,10 @@ class KernelManager { // * |mtl_kernel_source_code| is the complete source code compiled from a // Taichi kernel. It may include one or more Metal compute kernels. Each // Metal kernel is identified by one item in |kernels_attribs|. - void register_taichi_kernel( - const std::string &taichi_kernel_name, - const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, - const KernelContextAttributes &ctx_attribs); + void register_taichi_kernel(const std::string &taichi_kernel_name, + const std::string &mtl_kernel_source_code, + const TaichiKernelAttributes &ti_kernel_attribs, + const KernelContextAttributes &ctx_attribs); // Launch the given |taichi_kernel_name|. // Kernel launching is asynchronous, therefore the Metal memory is not valid diff --git a/taichi/backends/metal/kernel_util.cpp b/taichi/backends/metal/kernel_util.cpp index a4d3cc67190dd..5ef0bbc23879b 100644 --- a/taichi/backends/metal/kernel_util.cpp +++ b/taichi/backends/metal/kernel_util.cpp @@ -50,7 +50,7 @@ std::string KernelAttributes::debug_string() const { // TODO(k-ye): show range_for if (task_type == OffloadedStmt::TaskType::clear_list || task_type == OffloadedStmt::TaskType::listgen) { - result += fmt::format(" snode={}", runtime_list_op_attribs.snode->id); + result += fmt::format(" snode={}", runtime_list_op_attribs->snode->id); } result += ">"; return result; diff --git a/taichi/backends/metal/kernel_util.h b/taichi/backends/metal/kernel_util.h index a638702c149e2..fd5a7d47b6785 100644 --- a/taichi/backends/metal/kernel_util.h +++ b/taichi/backends/metal/kernel_util.h @@ -66,20 +66,31 @@ struct KernelAttributes { }; std::vector buffers; // Only valid when |task_type| is range_for. - // TODO(k-ye): Use std::optional to wrap |task_type| dependent attributes. - RangeForAttributes range_for_attribs; - // clear_list + listgen - RuntimeListOpAttributes runtime_list_op_attribs; - - // Whether print() is called inside this kernel. - // TODO(k-ye): Encapsulate this inside a UsedFeatures. 
However, we need a - // TaichiKernelAttributes before we can do this. - bool uses_print = false; + std::optional range_for_attribs; + // Only valid when |task_type| is {clear_list, listgen}. + std::optional runtime_list_op_attribs; static std::string buffers_name(Buffers b); std::string debug_string() const; }; +// Groups all the Metal kernels generated from a single ti.kernel +struct TaichiKernelAttributes { + struct UsedFeatures { + // Whether print() is called inside this kernel. + bool print = false; + // Whether this kernel accesses (read or write) sparse SNodes. + bool sparse = false; + // Whether [[thread_index_in_simdgroup]] is used. This is only supported + // since MSL 2.1 + bool simdgroup = false; + }; + + // Attributes of all the Metal kernels produced from this Taichi kernel. + std::vector mtl_kernels_attribs; + UsedFeatures used_features; +}; + // This class contains the attributes descriptors for both the input args and // the return values of a Taichi kernel. // diff --git a/taichi/backends/metal/shaders/print.metal.h b/taichi/backends/metal/shaders/print.metal.h index 1c8d2c74fc7d6..5cb3971540261 100644 --- a/taichi/backends/metal/shaders/print.metal.h +++ b/taichi/backends/metal/shaders/print.metal.h @@ -37,19 +37,21 @@ STR( constant constexpr int kMetalPrintMsgTypeWidthMask = ((1 << kMetalNumBitsPerPrintMsgType) - 1); - [[maybe_unused]] inline int mtl_compute_num_print_msg_typemasks( - int num_entries) { - return (num_entries + kMetalNumPrintMsgTypePerI32 - 1) / - kMetalNumPrintMsgTypePerI32; - } - - [[maybe_unused]] inline int mtl_compute_print_msg_bytes(int num_entries) { - // See PrintMsg's layout for how this is computed. - const int sz = - sizeof(int32_t) * - (1 + mtl_compute_num_print_msg_typemasks(num_entries) + num_entries); - return sz; - } + [[maybe_unused]] inline int mtl_compute_num_print_msg_typemasks( + int num_entries) { + return (num_entries + kMetalNumPrintMsgTypePerI32 - 1) / + kMetalNumPrintMsgTypePerI32; + } + + [[maybe_unused]] inline int mtl_compute_print_msg_bytes( + int num_entries) { + // See PrintMsg's layout for how this is computed. + const int sz = + sizeof(int32_t) * + (1 + mtl_compute_num_print_msg_typemasks(num_entries) + + num_entries); + return sz; + } class PrintMsg { public: diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index d466d1bc25ae4..1cdc0eb16648c 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -322,7 +322,7 @@ bool initialize_opengl(bool error_tolerance) { desc = "Unknown Error"; if (error_tolerance) { // error tolerated, returning false - TI_TRACE("[glsl] cannot create GLFW window: error {}: {}", status, desc); + TI_DEBUG("[glsl] cannot create GLFW window: error {}: {}", status, desc); supported = std::make_optional(false); return false; } diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h index 3c021debbfccd..6f618f7f58adb 100644 --- a/taichi/inc/archs.inc.h +++ b/taichi/inc/archs.inc.h @@ -2,12 +2,14 @@ // CPU archs PER_ARCH(x64) // a.k.a. AMD64/x86_64 -PER_ARCH(arm64) // a.k.a. Aarch64 -PER_ARCH(js) // Javascript +PER_ARCH(arm64) // a.k.a. 
diff --git a/taichi/ir/control_flow_graph.cpp b/taichi/ir/control_flow_graph.cpp
index f0a726d50a808..cb92bb04030ef 100644
--- a/taichi/ir/control_flow_graph.cpp
+++ b/taichi/ir/control_flow_graph.cpp
@@ -101,8 +101,7 @@ void CFGNode::reaching_definition_analysis(bool after_lower_access) {
     auto data_source_ptr = irpass::analysis::get_store_destination(stmt);
     if (data_source_ptr) {
       // stmt provides a data source
-      if (after_lower_access &&
-          !(stmt->is<AllocaStmt>() || stmt->is<LocalStoreStmt>())) {
+      if (after_lower_access && !(data_source_ptr->is<AllocaStmt>())) {
         // After lower_access, we only analyze local variables.
         continue;
       }
diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp
index 083b9fe6e1490..20ea0cd594fbe 100644
--- a/taichi/ir/expr.cpp
+++ b/taichi/ir/expr.cpp
@@ -179,7 +179,9 @@ Expr load(const Expr &ptr) {
 Expr ptr_if_global(const Expr &var) {
   if (var.is<GlobalVariableExpression>()) {
     // singleton global variable
-    TI_ASSERT(var.snode()->num_active_indices == 0);
+    TI_ASSERT_INFO(
+        var.snode()->num_active_indices == 0,
+        "Please always use 'x[None]' (instead of simply 'x') to access any 0-D tensor.");
     return var[ExprGroup()];
   } else {
     // may be any local or global expr
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index d1c38dbc64bf6..fc4938a2ef97c 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -35,6 +35,7 @@ CompileConfig::CompileConfig() {
   verbose = true;
   fast_math = true;
   async = false;
+  flatten_if = false;
 
 #if defined(TI_PLATFORM_WINDOWS) or defined(TI_ARCH_ARM)
   use_unified_memory = false;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 6ee00204fdc72..02c5cf64c15e3 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -37,6 +37,7 @@ struct CompileConfig {
   bool fast_math;
   bool use_unified_memory;
   bool async;
+  bool flatten_if;
   DataType default_fp;
   DataType default_ip;
   std::string extra_flags;
diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp
index 55df70d6f6967..b1a0cd971fb7d 100644
--- a/taichi/program/kernel_profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -148,13 +148,10 @@ class KernelProfilerCUDA : public KernelProfilerBase {
 }  // namespace
 
 std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch) {
-  if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal ||
-      arch == Arch::opengl) {
-    return std::make_unique<DefaultProfiler>(arch);
-  } else if (arch == Arch::cuda) {
+  if (arch == Arch::cuda) {
     return std::make_unique<KernelProfilerCUDA>();
   } else {
-    TI_NOT_IMPLEMENTED;
+    return std::make_unique<DefaultProfiler>(arch);
   }
 }
 
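The make_profiler rewrite inverts the dispatch: rather than whitelisting the archs that may use DefaultProfiler and raising TI_NOT_IMPLEMENTED for everything else, it special-cases CUDA and lets every other arch (including newly added ones such as cc) fall through to the default. A mock of that shape, with hypothetical minimal types:

// Mock types; the real KernelProfilerBase hierarchy lives in kernel_profiler.h.
#include <memory>

enum class Arch { x64, arm64, metal, opengl, cuda, cc };

struct KernelProfilerBase {
  virtual ~KernelProfilerBase() = default;
};
struct DefaultProfiler : KernelProfilerBase {
  explicit DefaultProfiler(Arch) {}
};
struct KernelProfilerCUDA : KernelProfilerBase {};

std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch) {
  if (arch == Arch::cuda)
    return std::make_unique<KernelProfilerCUDA>();
  // Any other arch, including ones added later, gets the default timer
  // instead of an immediate "not implemented" failure.
  return std::make_unique<DefaultProfiler>(arch);
}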
"taichi/backends/opengl/struct_opengl.h" +#include "taichi/backends/cc/struct_cc.h" +#include "taichi/backends/cc/cc_layout.h" #include "taichi/system/unified_allocator.h" #include "taichi/ir/snode.h" #include "taichi/ir/frontend_ir.h" @@ -152,13 +156,20 @@ FunctionType Program::compile(Kernel &kernel) { auto codegen = KernelCodeGen::create(kernel.arch, &kernel); ret = codegen->compile(); } else if (kernel.arch == Arch::metal) { + metal::CodeGen::Config cgen_config; + cgen_config.allow_simdgroup = + metal::EnvConfig::instance().is_simdgroup_enabled(); metal::CodeGen codegen(&kernel, metal_kernel_mgr_.get(), - &metal_compiled_structs_.value()); + &metal_compiled_structs_.value(), cgen_config); ret = codegen.compile(); } else if (kernel.arch == Arch::opengl) { opengl::OpenglCodeGen codegen(kernel.name, &opengl_struct_compiled_.value(), opengl_kernel_launcher_.get()); ret = codegen.compile(*this, kernel); +#ifdef TI_WITH_CC + } else if (kernel.arch == Arch::cc) { + ret = cccp::compile_kernel(&kernel); +#endif } else { TI_NOT_IMPLEMENTED; } @@ -315,6 +326,12 @@ void Program::materialize_layout() { opengl_struct_compiled_->root_size); opengl_kernel_launcher_ = std::make_unique( opengl_struct_compiled_->root_size); +#ifdef TI_WITH_CC + } else if (config.arch == Arch::cc) { + cc_program = std::make_unique(); + cccp::CCLayoutGen scomp(snode_root.get()); + cc_program->layout = scomp.compile(); +#endif } } diff --git a/taichi/program/program.h b/taichi/program/program.h index 68b103a27eec8..740ee01f0491d 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -14,6 +14,7 @@ #include "taichi/backends/metal/kernel_manager.h" #include "taichi/backends/opengl/opengl_kernel_launcher.h" #include "taichi/backends/opengl/opengl_kernel_util.h" +#include "taichi/backends/cc/cc_program.h" #include "taichi/program/kernel.h" #include "taichi/program/kernel_profiler.h" #include "taichi/runtime/llvm/context.h" @@ -245,6 +246,12 @@ class Program { // OpenGL related data structures std::optional opengl_struct_compiled_; std::unique_ptr opengl_kernel_launcher_; + + public: +#ifdef TI_WITH_CC + // C backend related data structures + std::unique_ptr cc_program; +#endif }; TLANG_NAMESPACE_END diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index b34949ac13667..08d0c041b0f76 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -109,7 +109,8 @@ void export_lang(py::module &m) { &CompileConfig::device_memory_fraction) .def_readwrite("fast_math", &CompileConfig::fast_math) .def_readwrite("ad_stack_size", &CompileConfig::ad_stack_size) - .def_readwrite("async", &CompileConfig::async); + .def_readwrite("async", &CompileConfig::async) + .def_readwrite("flatten_if", &CompileConfig::flatten_if); m.def("reset_default_compile_config", [&]() { default_compile_config = CompileConfig(); }); diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index e48a3b8fb9d7c..78136fb68f713 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -15,6 +15,7 @@ #include "taichi/system/dynamic_loader.h" #include "taichi/backends/metal/api.h" #include "taichi/backends/opengl/opengl_api.h" +#include "taichi/backends/cc/cc_configuation.h" #if defined(TI_WITH_CUDA) #include "taichi/backends/cuda/cuda_driver.h" #endif @@ -156,12 +157,15 @@ void export_misc(py::module &m) { } printf("test was successful.\n"); }); - m.def("pop_python_print_buffer", []() { - return py_cout.pop_content(); - }); + m.def("pop_python_print_buffer", []() { 
diff --git a/taichi/system/dynamic_loader.cpp b/taichi/system/dynamic_loader.cpp
index fbe54b7a66268..2923fa34c8642 100644
--- a/taichi/system/dynamic_loader.cpp
+++ b/taichi/system/dynamic_loader.cpp
@@ -21,7 +21,7 @@ void DynamicLoader::load_dll(const std::string &dll_path) {
 }
 
 void *DynamicLoader::load_function(const std::string &func_name) {
-  TI_ASSERT(loaded());
+  TI_ASSERT_INFO(loaded(), "DLL not opened");
 #ifdef WIN32
   auto func = (void *)GetProcAddress((HMODULE)dll, func_name.c_str());
 #else
@@ -34,7 +34,7 @@ void *DynamicLoader::load_function(const std::string &func_name) {
 }
 
 void DynamicLoader::close_dll() {
-  TI_ERROR_IF(!loaded(), "DLL not opened.");
+  TI_ASSERT_INFO(loaded(), "DLL not opened");
 #ifdef WIN32
   FreeLibrary((HMODULE)dll);
 #else
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 9ec3112753186..3b8ae0b064c9a 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -59,13 +59,10 @@ void compile_to_offloads(IRNode *ir,
     print("Loop Split");
     irpass::analysis::verify(ir);
   }
-  irpass::simplify(ir);
+  irpass::full_simplify(ir);
   print("Simplified I");
   irpass::analysis::verify(ir);
 
-  irpass::constant_fold(ir);
-  print("Constant folded I");
-
   if (grad) {
     // Remove local atomics here so that we don't have to handle their gradients
     irpass::demote_atomics(ir);
@@ -105,9 +102,6 @@ void compile_to_offloads(IRNode *ir,
   print("Simplified II");
   irpass::analysis::verify(ir);
 
-  irpass::constant_fold(ir);
-  print("Constant folded II");
-
   irpass::offload(ir);
   print("Offloaded");
   irpass::analysis::verify(ir);
diff --git a/taichi/transforms/constant_fold.cpp b/taichi/transforms/constant_fold.cpp
index 02d407dd61140..ec1d294728277 100644
--- a/taichi/transforms/constant_fold.cpp
+++ b/taichi/transforms/constant_fold.cpp
@@ -166,6 +166,12 @@ class ConstantFold : public BasicStmtVisitor {
   }
 
   void visit(UnaryOpStmt *stmt) override {
+    if (stmt->is_cast() &&
+        stmt->cast_type == stmt->operand->ret_type.data_type) {
+      stmt->replace_with(stmt->operand);
+      modifier.erase(stmt);
+      return;
+    }
     auto operand = stmt->operand->cast<ConstStmt>();
     if (!operand)
       return;
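The new branch in ConstantFold::visit(UnaryOpStmt *) fires before the constant-operand check: a cast whose target type already equals the operand's type is a no-op, so users of the cast are redirected to the operand and the cast is erased. The rule in isolation, with heavily simplified stand-ins for the IR classes:

// Stand-ins for the IR classes; the real pass works on Taichi's Stmt
// hierarchy and records the erasure through the pass's modifier.
#include <cassert>

enum class DataType { i32, f32 };

struct Stmt {
  DataType ret_type;
  explicit Stmt(DataType t) : ret_type(t) {}
  virtual ~Stmt() = default;
};

struct CastStmt : Stmt {
  Stmt *operand;
  DataType cast_type;
  CastStmt(Stmt *op, DataType to) : Stmt(to), operand(op), cast_type(to) {}
};

// Returns the statement users should read instead, or nullptr to keep the cast.
Stmt *fold_noop_cast(CastStmt *stmt) {
  if (stmt->cast_type == stmt->operand->ret_type)
    return stmt->operand;  // casting to the operand's own type is a no-op
  return nullptr;
}

int main() {
  Stmt x(DataType::i32);
  CastStmt same(&x, DataType::i32);   // i32 -> i32: foldable
  CastStmt widen(&x, DataType::f32);  // i32 -> f32: must stay
  assert(fold_noop_cast(&same) == &x);
  assert(fold_noop_cast(&widen) == nullptr);
  return 0;
}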
diff --git a/taichi/transforms/simplify.cpp b/taichi/transforms/simplify.cpp
index 37d9cb74527f0..8a62fb0554603 100644
--- a/taichi/transforms/simplify.cpp
+++ b/taichi/transforms/simplify.cpp
@@ -2,6 +2,8 @@
 #include "taichi/ir/transforms.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/visitors.h"
+#include "taichi/program/kernel.h"
+#include "taichi/program/program.h"
 #include <set>
 #include <unordered_set>
 #include <utility>
@@ -1062,13 +1064,15 @@ class BasicBlockSimplify : public IRVisitor {
       return false;
     };
 
-    if (if_stmt->true_statements &&
-        flatten(if_stmt->true_statements->statements, true)) {
-      throw IRModified();
-    }
-    if (if_stmt->false_statements &&
-        flatten(if_stmt->false_statements->statements, false)) {
-      throw IRModified();
+    if (kernel->program.config.flatten_if) {
+      if (if_stmt->true_statements &&
+          flatten(if_stmt->true_statements->statements, true)) {
+        throw IRModified();
+      }
+      if (if_stmt->false_statements &&
+          flatten(if_stmt->false_statements->statements, false)) {
+        throw IRModified();
+      }
     }
 
     if (if_stmt->true_statements) {
@@ -1141,6 +1145,9 @@ class Simplify : public IRVisitor {
   Kernel *kernel;
 
   Simplify(IRNode *node, Kernel *kernel) : kernel(kernel) {
+    if (!kernel)
+      this->kernel = node->get_kernel();
+    TI_ASSERT(this->kernel);
     modified = false;
     allow_undefined_visitor = true;
     invoke_default_visitor = true;
@@ -1176,7 +1183,9 @@ class Simplify : public IRVisitor {
   }
 
   void visit(StructForStmt *for_stmt) override {
-    TI_ASSERT(current_struct_for == nullptr);
+    TI_ASSERT_INFO(current_struct_for == nullptr,
+                   "Nested struct-fors are not supported for now. "
+                   "Please try to use range-fors for inner loops.");
     current_struct_for = for_stmt;
     for_stmt->body->accept(this);
    current_struct_for = nullptr;
@@ -1225,15 +1234,18 @@ void full_simplify(IRNode *root, Kernel *kernel) {
       modified = true;
     if (constant_fold(root))
       modified = true;
+    if (die(root))
+      modified = true;
     if (alg_simp(root))
       modified = true;
-    die(root);
-    if (whole_kernel_cse(root))
+    if (die(root))
       modified = true;
-    die(root);
     if (simplify(root, kernel))
       modified = true;
-    die(root);
+    if (die(root))
+      modified = true;
+    if (whole_kernel_cse(root))
+      modified = true;
     if (!modified)
       break;
   }
diff --git a/taichi/util/environ_config.h b/taichi/util/environ_config.h
index cf65e3dce4bee..9a4aae477eab4 100644
--- a/taichi/util/environ_config.h
+++ b/taichi/util/environ_config.h
@@ -7,8 +7,8 @@
 
 TLANG_NAMESPACE_BEGIN
 
-static inline int get_environ_config(const std::string &name, int default_value = 0)
-{
+static inline int get_environ_config(const std::string &name,
+                                     int default_value = 0) {
   char *res = std::getenv(name.c_str());
   if (res == nullptr)
     return default_value;
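get_environ_config is the primitive behind per-backend environment toggles such as metal::EnvConfig::instance().is_simdgroup_enabled(), used in program.cpp above. A sketch of that shape follows; the class body and the environment-variable name are assumptions for illustration, not the actual env_config.h contents:

#include "taichi/util/environ_config.h"

TLANG_NAMESPACE_BEGIN
namespace metal {

class EnvConfig {
 public:
  static const EnvConfig &instance() {
    static const EnvConfig config;  // read the environment once, lazily
    return config;
  }

  bool is_simdgroup_enabled() const {
    return simdgroup_enabled_;
  }

 private:
  EnvConfig()
      // "TI_EXAMPLE_USE_SIMDGROUP" is a made-up variable name.
      : simdgroup_enabled_(get_environ_config("TI_EXAMPLE_USE_SIMDGROUP",
                                              /*default_value=*/1) != 0) {
  }

  bool simdgroup_enabled_;
};

}  // namespace metal
TLANG_NAMESPACE_END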
diff --git a/taichi/util/logging.cpp b/taichi/util/logging.cpp
index 212cd95518933..35a090baa4e6e 100644
--- a/taichi/util/logging.cpp
+++ b/taichi/util/logging.cpp
@@ -155,7 +155,8 @@ std::string signal_name(int sig) {
 bool python_at_exit_called = false;
 
 void signal_handler(int signo) {
-  // It seems that there's no way to pass exception to Python in signal handlers?
+  // It seems that there's no way to pass an exception to Python in signal
+  // handlers?
   // @archibate found that there is in fact such a solution:
   // https://docs.python.org/3/library/faulthandler.html#module-faulthandler
   auto sig_name = signal_name(signo);
diff --git a/tests/python/test_for_group_mismatch.py b/tests/python/test_for_group_mismatch.py
new file mode 100644
index 0000000000000..6c84b44ccc6c2
--- /dev/null
+++ b/tests/python/test_for_group_mismatch.py
@@ -0,0 +1,96 @@
+import taichi as ti
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def test_struct_for_mismatch():
+    x = ti.var(ti.f32, (3, 4))
+
+    @ti.kernel
+    def func():
+        for i in x:
+            print(i)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def test_struct_for_mismatch2():
+    x = ti.var(ti.f32, (3, 4))
+
+    @ti.kernel
+    def func():
+        for i, j, k in x:
+            print(i, j, k)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_grouped_struct_for_mismatch():
+    # doesn't work for now
+    # need grouped refactor
+    # for now, it just throws an unfriendly message:
+    #   AssertionError: __getitem__ cannot be called in Python-scope
+    x = ti.var(ti.f32, (3, 4))
+
+    @ti.kernel
+    def func():
+        for i, j in ti.grouped(x):
+            print(i, j)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_ndrange_for_mismatch():
+    # doesn't work for now
+    # need ndrange refactor
+    @ti.kernel
+    def func():
+        for i in ti.ndrange(3, 4):
+            print(i)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_ndrange_for_mismatch2():
+    # doesn't work for now
+    # need ndrange and grouped refactor
+    @ti.kernel
+    def func():
+        for i, j, k in ti.ndrange(3, 4):
+            print(i, j, k)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_grouped_ndrange_for_mismatch():
+    # doesn't work for now
+    # need ndrange and grouped refactor
+    @ti.kernel
+    def func():
+        for i in ti.grouped(ti.ndrange(3, 4)):
+            print(i)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_static_ndrange_for_mismatch():
+    # doesn't work for now
+    # need ndrange and static refactor
+    @ti.kernel
+    def func():
+        for i in ti.static(ti.ndrange(3, 4)):
+            print(i)
+
+    func()
diff --git a/tests/python/test_return.py b/tests/python/test_return.py
index 71e383ec275a7..9400e6dfa905f 100644
--- a/tests/python/test_return.py
+++ b/tests/python/test_return.py
@@ -16,7 +16,6 @@ def kernel() -> ti.i32:
 
 @ti.must_throw(ti.TaichiSyntaxError)
 def test_return_without_type_hint():
-
     @ti.kernel
     def kernel():
         return 1
@@ -25,7 +24,6 @@ def kernel():
 
 
 def test_const_func_ret():
-
     @ti.kernel
     def func1() -> ti.f32:
         return 3
@@ -40,7 +38,6 @@ def func2() -> ti.i32:
 
 @ti.all_archs
 def _test_binary_func_ret(dt1, dt2, dt3, castor):
-
     @ti.kernel
     def func(a: dt1, b: dt2) -> dt3:
         return a * b
@@ -64,4 +61,3 @@ def test_binary_func_ret():
     _test_binary_func_ret(ti.f32, ti.i32, ti.f32, float)
     _test_binary_func_ret(ti.i32, ti.f32, ti.i32, int)
     _test_binary_func_ret(ti.f32, ti.i32, ti.i32, int)
-