diff --git a/.github/workflows/persubmit.yml b/.github/workflows/persubmit.yml
index ff7a621ffcd9f..6d3b156368806 100644
--- a/.github/workflows/persubmit.yml
+++ b/.github/workflows/persubmit.yml
@@ -11,6 +11,7 @@ jobs:
matrix:
os: [ubuntu-latest]
python: [3.6, 3.7, 3.8]
+ with_cc: [OFF]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
@@ -30,7 +31,7 @@ jobs:
export TAICHI_REPO_DIR=`pwd`
export PATH=$TAICHI_REPO_DIR/taichi-llvm/bin/:$PATH
export CXX=clang++
- export CI_SETUP_CMAKE_ARGS=-DTI_WITH_OPENGL:BOOL=OFF
+ export CI_SETUP_CMAKE_ARGS="-DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{matrix.with_cc}}"
# GLFW dependencies:
#export CI_SETUP_CMAKE_ARGS=-DTI_WITH_OPENGL:BOOL=ON
#sudo apt-get install libx11-dev libxcursor-dev libxi-dev
diff --git a/.gitignore b/.gitignore
index 50ccd25a77145..6e219c6c33fb2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
*.swp
*.swo
/.vs
+/tags
+/.*_localrc
/Debug
*.sdf
/x64
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 628e779bf23e1..1d838bd3c848e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(taichi)
SET(TI_VERSION_MAJOR 0)
SET(TI_VERSION_MINOR 6)
-SET(TI_VERSION_PATCH 14)
+SET(TI_VERSION_PATCH 15)
execute_process(
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
diff --git a/README.md b/README.md
index c2eca75c9f678..d2f226ae09b21 100644
--- a/README.md
+++ b/README.md
@@ -13,16 +13,16 @@
**Taichi** (太极) is a programming language designed for *high-performance computer graphics*. It is deeply embedded in **Python**, and its **just-in-time compiler** offloads compute-intensive tasks to multi-core CPUs and massively parallel GPUs.
-
+
Advanced features of Taichi include [spatially sparse computing](https://taichi.readthedocs.io/en/latest/sparse.html) and [differentiable programming](https://taichi.readthedocs.io/en/latest/differentiable_programming.html) [[examples]](https://github.com/yuanming-hu/difftaichi).
## Gallery
-
+
-
+
## Installation [![Downloads](https://pepy.tech/badge/taichi/month)](https://pepy.tech/project/taichi/month)
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index 95307ea6bcaf1..5634f5e0b7551 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -3,15 +3,27 @@ set(CORE_LIBRARY_NAME taichi_core)
option(USE_STDCPP "Use -stdlib=libc++" OFF)
option(TI_WITH_CUDA "Build with the CUDA backend" ON)
option(TI_WITH_OPENGL "Build with the OpenGL backend" ON)
+option(TI_WITH_CC "Build with the C backend" OFF)
if (APPLE)
if (TI_WITH_CUDA)
set(TI_WITH_CUDA OFF)
- message(WARNING "CUDA not supported on OS X. Setting TI_WITH_CUDA to OFF.")
+ message(WARNING "CUDA backend not supported on OS X. Setting TI_WITH_CUDA to OFF.")
endif()
if (TI_WITH_OPENGL)
set(TI_WITH_OPENGL OFF)
- message(WARNING "OpenGL not supported on OS X. Setting TI_WITH_OPENGL to OFF.")
+ message(WARNING "OpenGL backend not supported on OS X. Setting TI_WITH_OPENGL to OFF.")
+ endif()
+ if (TI_WITH_CC)
+ set(TI_WITH_CC OFF)
+ message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.")
+ endif()
+endif()
+
+if (WIN32)
+ if (TI_WITH_CC)
+ set(TI_WITH_CC OFF)
+ message(WARNING "C backend not supported on Windows. Setting TI_WITH_CC to OFF.")
endif()
endif()
@@ -30,6 +42,7 @@ file(GLOB TAICHI_CPU_SOURCE "taichi/backends/cpu/*.cpp" "taichi/backends/cpu/*.h
file(GLOB TAICHI_CUDA_SOURCE "taichi/backends/cuda/*.cpp" "taichi/backends/cuda/*.h")
file(GLOB TAICHI_METAL_SOURCE "taichi/backends/metal/*.h" "taichi/backends/metal/*.cpp" "taichi/backends/metal/shaders/*")
file(GLOB TAICHI_OPENGL_SOURCE "taichi/backends/opengl/*.h" "taichi/backends/opengl/*.cpp" "taichi/backends/opengl/shaders/*")
+file(GLOB TAICHI_CC_SOURCE "taichi/backends/cc/*.h" "taichi/backends/cc/*.cpp")
list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_BACKEND_SOURCE})
@@ -56,6 +69,11 @@ if (TI_WITH_OPENGL)
list(APPEND TAICHI_CORE_SOURCE ${TAICHI_GLAD_SOURCE})
endif()
+if (TI_WITH_CC)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_CC")
+ list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CC_SOURCE})
+endif()
+
add_library(${CORE_LIBRARY_NAME} SHARED ${TAICHI_CORE_SOURCE} ${PROJECT_SOURCES})
if (APPLE)
diff --git a/docs/compilation.rst b/docs/compilation.rst
index 00496f90e26fa..88aa3f6d82236 100644
--- a/docs/compilation.rst
+++ b/docs/compilation.rst
@@ -1,20 +1,20 @@
.. _compilation:
-The life of a Taichi kernel
+Life of a Taichi kernel
===============================================
Sometimes it is helpful to understand the life cycle of a Taichi kernel.
In short, compilation will only happen on the first invocation of an instance of a kernel.
-Life cycle of a Taichi kernel looks like this:
+The life cycle of a Taichi kernel has the following stages:
- Kernel registration
- Template instantiation and caching
- Python AST transforms
- - Taichi IR compilation, optimization, and binary generation
+ - Taichi IR compilation, optimization, and executable generation
- Launching
-.. image:: life_of_kernel_lowres.jpg
+.. image:: https://raw.githubusercontent.com/taichi-dev/public_files/6bd234694270c83baf97ba32e0c6278b8cf37e6e/taichi/life_of_kernel.jpg
Let's consider the following simple kernel:
@@ -26,7 +26,7 @@ Let's consider the following simple kernel:
tensor[i] += delta
-We also allocate two 1D tensors to simplify discussion:
+We allocate two 1D tensors to simplify discussion:
.. code-block:: python
@@ -50,7 +50,7 @@ Template instantiation and caching
When ``add`` is called for the first time, the Taichi frontend compiler will instantiate the kernel.
-When you have a second call with the same template signature (explained later), e.g.,
+When you have a second call with the same **template signature** (explained later), e.g.,
.. code-block:: python
@@ -69,12 +69,12 @@ will lead to a new instantiation of **add**.
.. note::
**Template signatures** are what distinguish different instantiations of a kernel template.
The signature of ``add(x, 42)`` is ``(x, ti.i32)``, which is the same as that of ``add(x, 1)``. Therefore, the latter can reuse the previously compiled binary.
- The signature of ``add(y, 42)`` is ``(y, ti.i32)``, a different value from the previous signature, therefore a new instantiation and compilation will happen.
+ The signature of ``add(y, 42)`` is ``(y, ti.i32)``, a different value from the previous signature, hence a new kernel will be instantiated and compiled.
.. note::
- Many basic operations in the Taichi standard library is implemented using Taichi kernels for performance,
- with more or less metaprogramming tricks. Invoking them will incur **implicit kernel instantiations**
+    Many basic operations in the Taichi standard library are implemented using Taichi kernels,
+    with the help of metaprogramming tricks. Invoking them will incur **implicit kernel instantiations**.
Examples include ``x.to_numpy()`` and ``y.from_torch(torch_tensor)``. When you invoke these functions,
you will see kernel instantiations, as Taichi kernels will be generated to offload the hard work to multiple CPU cores/GPUs.
@@ -84,7 +84,7 @@ will lead to a new instantiation of **add**.
Code transformation and optimizations
---------------------------------------
-When a new instantiation happens, the Taichi frontend compiler will transform the kernel body AST
+When a new instantiation happens, the Taichi frontend compiler (i.e., the ``ASTTransformer`` Python class) will transform the kernel body AST
into a Python script, which, when executed, emits a Taichi frontend AST.
Basically, some patches are applied to the Python AST so that the Taichi frontend can recognize it.
@@ -103,9 +103,10 @@ which allows a series of further IR passes to happen, such as
The just-in-time (JIT) compilation engine
-----------------------------------------
-Finally, the optimized SSA IR is fed into the LLVM IR codegen, and LLVM JIT generates high-performance executable CPU/GPU programs.
+Finally, the optimized SSA IR is fed into backend compilers such as LLVM or Apple Metal/OpenGL shader compilers.
+The backend compilers then generate high-performance executable CPU/GPU programs.
Kernel launching
----------------
-Taichi kernels will be ultimately launched as multi-threaded CPU tasks or CUDA kernels.
+Taichi kernels will be ultimately launched as multi-threaded CPU tasks or GPU kernels.
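The template-signature behavior described in the revised docs can be reproduced with a short script. This is a sketch against the 0.6.x API (`ti.var` tensors), not part of the patch:

```python
import taichi as ti
ti.init(arch=ti.cpu)

x = ti.var(ti.i32, shape=16)
y = ti.var(ti.i32, shape=16)

@ti.kernel
def add(tensor: ti.template(), delta: ti.i32):
    for i in tensor:
        tensor[i] += delta

add(x, 42)  # first call: instantiates and compiles for signature (x, ti.i32)
add(x, 1)   # same signature (x, ti.i32): reuses the compiled kernel
add(y, 42)  # signature (y, ti.i32): a new instantiation and compilation
```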
diff --git a/docs/version b/docs/version
index fcbaa8478100e..6769f67e2955a 100644
--- a/docs/version
+++ b/docs/version
@@ -1 +1 @@
-0.6.14
+0.6.15
diff --git a/examples/export_videos.py b/examples/export_videos.py
index b6ddeeef6b50c..50d0738a4a1f3 100644
--- a/examples/export_videos.py
+++ b/examples/export_videos.py
@@ -13,8 +13,8 @@ def paint():
result_dir = "./results"
video_manager = ti.VideoManager(output_dir=result_dir,
- framerate=24,
- automatic_build=False)
+ framerate=24,
+ automatic_build=False)
for i in range(50):
paint()
diff --git a/misc/listgen_demo.py b/misc/listgen_demo.py
index 5ae7213d05e8d..2d7459a8497c1 100644
--- a/misc/listgen_demo.py
+++ b/misc/listgen_demo.py
@@ -5,9 +5,11 @@
x = ti.var(ti.i32)
ti.root.dense(ti.i, 4).bitmasked(ti.i, 4).place(x)
+
@ti.kernel
def func():
for i in x:
print(i)
+
func()
diff --git a/misc/prtags.json b/misc/prtags.json
index fe9b33ddcd70a..6ed8dd0a96e72 100644
--- a/misc/prtags.json
+++ b/misc/prtags.json
@@ -28,5 +28,6 @@
"windows" : "Windows",
"perf" : "Performance improvements",
"ipython" : "IPython and other shells",
+ "cc" : "C source backend",
"release" : "Release"
}
diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 8cd8c17b36f63..c6f9e7c3ac03d 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -7,7 +7,7 @@
from taichi.misc import *
from taichi.misc.gui import GUI
from taichi.misc.np2ply import PLYWriter
-from taichi.misc.image import *
+from taichi.misc.image import imread, imwrite, imshow, imdisplay
from taichi.misc.task import Task
from taichi.misc.test import *
from taichi.misc import settings as settings
diff --git a/python/taichi/core/util.py b/python/taichi/core/util.py
index 9d69334eb2a75..79ddc6531d14e 100644
--- a/python/taichi/core/util.py
+++ b/python/taichi/core/util.py
@@ -30,7 +30,7 @@ def import_ti_core(tmp_dir=None):
global ti_core
if get_os_name() != 'win':
old_flags = sys.getdlopenflags()
- sys.setdlopenflags(2) # 2 = RTLD_NOW
+ sys.setdlopenflags(2 | 8) # RTLD_NOW | RTLD_DEEPBIND
else:
pyddir = os.path.join(package_root(), 'lib')
os.environ['PATH'] += ';' + pyddir
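The magic constant `2 | 8` matches glibc's `RTLD_NOW | RTLD_DEEPBIND`. A more self-documenting equivalent, sketched with the `os` module's constants (Linux-only; `taichi_core` is a hypothetical module name for illustration):

```python
import os
import sys

old_flags = sys.getdlopenflags()
# RTLD_NOW: resolve all symbols at load time.
# RTLD_DEEPBIND: prefer the shared object's own symbols over identically
# named symbols already loaded into the process (e.g., a second LLVM).
sys.setdlopenflags(os.RTLD_NOW | os.RTLD_DEEPBIND)
try:
    import taichi_core  # hypothetical, for illustration only
finally:
    sys.setdlopenflags(old_flags)
```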
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 156756b64f717..a057be0f2a534 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -38,6 +38,7 @@
cuda = core.cuda
metal = core.metal
opengl = core.opengl
+cc = core.cc
gpu = [cuda, metal, opengl]
cpu = core.host_arch()
kernel_profiler_print = lambda: core.get_current_program(
@@ -292,6 +293,8 @@ def is_arch_supported(arch):
return core.with_metal()
elif arch == opengl:
return core.with_opengl()
+ elif arch == cc:
+ return core.with_cc()
elif arch == cpu:
return True
else:
@@ -299,7 +302,7 @@ def is_arch_supported(arch):
def supported_archs():
- archs = [cpu, cuda, metal, opengl]
+ archs = [cpu, cuda, metal, opengl, cc]
wanted_archs = os.environ.get('TI_WANTED_ARCHS', '')
want_exclude = wanted_archs.startswith('^')
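With these hooks in place, the new backend participates in arch selection like the existing ones. A usage sketch, assuming the binary was built with `-DTI_WITH_CC:BOOL=ON`:

```python
import os

# Optional: restrict the archs that supported_archs() sweeps over.
# A leading '^' excludes the listed archs instead.
os.environ['TI_WANTED_ARCHS'] = 'cpu,cc'

import taichi as ti
ti.init(arch=ti.cc)  # is_arch_supported(ti.cc) reflects core.with_cc()
```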
diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index 39babe2f9a74e..5bceab2ee4637 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -59,6 +59,16 @@ def expr_init_func(rhs): # temporary solution to allow passing in tensors as
return expr_init(rhs)
+def begin_frontend_struct_for(group, loop_range):
+ if group.size() != len(loop_range.shape):
+ raise IndexError(
+ 'Number of struct-for indices does not match loop variable dimensionality '
+ f'({group.size()} != {len(loop_range.shape)}). Maybe you wanted to '
+ 'use "for I in ti.grouped(x)" to group all indices into a single vector I?'
+ )
+ taichi_lang_core.begin_frontend_struct_for(group, loop_range.ptr)
+
+
def wrap_scalar(x):
if type(x) in [int, float]:
return Expr(x)
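`begin_frontend_struct_for` now validates the index count before handing off to the C++ frontend, so a mismatch fails with a readable `IndexError` instead of a core assertion. A minimal reproduction sketch:

```python
import taichi as ti
ti.init(arch=ti.cpu)

x = ti.var(ti.f32)
ti.root.dense(ti.ij, (4, 4)).place(x)

@ti.kernel
def bad():
    for i in x:  # 1 loop index vs. a 2-D tensor: IndexError on first call
        x[i] = 0.0

@ti.kernel
def good():
    for I in ti.grouped(x):  # I is a single 2-D vector index
        x[I] = 0.0

good()
```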
diff --git a/python/taichi/lang/transformer.py b/python/taichi/lang/transformer.py
index b8b62224f60b6..3f1b59d3afa88 100644
--- a/python/taichi/lang/transformer.py
+++ b/python/taichi/lang/transformer.py
@@ -437,7 +437,7 @@ def visit_struct_for(self, node, is_grouped):
___loop_var = 0
{} = ti.make_var_vector(size=len(___loop_var.loop_range().shape))
___expr_group = ti.make_expr_group({})
- ti.core.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range().ptr)
+ ti.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range())
ti.core.end_frontend_range_for()
'''.format(vars, vars)
t = ast.parse(template).body[0]
@@ -450,7 +450,7 @@ def visit_struct_for(self, node, is_grouped):
{}
___loop_var = 0
___expr_group = ti.make_expr_group({})
- ti.core.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range().ptr)
+ ti.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range())
ti.core.end_frontend_range_for()
'''.format(var_decl, vars)
t = ast.parse(template).body[0]
diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py
index 809a49ba9202a..0783f68ab2de9 100644
--- a/python/taichi/misc/gui.py
+++ b/python/taichi/misc/gui.py
@@ -85,8 +85,8 @@ def set_image(self, img):
self.img = self.cook_image(img.to_numpy())
else:
# Type matched! We can use an optimized copy kernel.
- assert img.shape(
- ) == self.res, "Image resolution does not match GUI resolution"
+ assert img.shape \
+ == self.res, "Image resolution does not match GUI resolution"
from taichi.lang.meta import tensor_to_image
tensor_to_image(img, self.img)
ti.sync()
@@ -96,8 +96,8 @@ def set_image(self, img):
self.img = self.cook_image(img.to_numpy())
else:
# Type matched! We can use an optimized copy kernel.
- assert img.shape(
- ) == self.res, "Image resolution does not match GUI resolution"
+ assert img.shape \
+ == self.res, "Image resolution does not match GUI resolution"
assert img.n in [
3, 4
], "Only greyscale, RGB or RGBA images are supported in GUI.set_image"
diff --git a/python/taichi/misc/image.py b/python/taichi/misc/image.py
index b3ec797cb557b..7e2594e7fa1fc 100644
--- a/python/taichi/misc/image.py
+++ b/python/taichi/misc/image.py
@@ -2,7 +2,12 @@
import taichi as ti
-def imwrite(img, filename):
+def cook_image_to_bytes(img):
+ """
+ Takes a NumPy array or Taichi tensor of any type.
+ Returns a NumPy array of uint8.
+ This is used by ti.imwrite and ti.imdisplay.
+ """
if not isinstance(img, np.ndarray):
img = img.to_numpy()
@@ -16,19 +21,47 @@ def imwrite(img, filename):
assert len(img.shape) in [2,
3], "Image must be either RGB/RGBA or greyscale"
- resx, resy = img.shape[:2]
if len(img.shape) == 2:
- comp = 1
+ img = img.reshape(*img.shape, 1)
+
+ assert img.shape[2] in [1, 3,
+ 4], "Image must be either RGB/RGBA or greyscale"
+
+ return img.swapaxes(0, 1)[::-1, :]
+
+
+def imdisplay(img):
+ """
+ Try to display image in interactive shell.
+ """
+ if ti.lang.shell.oinspect.name == ti.lang.shell.ShellType.JUPYTER:
+ import PIL.Image
+ from io import BytesIO
+ import IPython.display
+ import numpy as np
+ img = cook_image_to_bytes(img)
+ with BytesIO() as f:
+ PIL.Image.fromarray(img).save(f, 'png')
+ IPython.display.display(IPython.display.Image(data=f.getvalue()))
else:
- comp = img.shape[2]
- assert comp in [1, 3, 4], "Image must be either RGB/RGBA or greyscale"
+ ti.imshow(img)
+
- img = np.ascontiguousarray(img.swapaxes(0, 1)[::-1, :])
+def imwrite(img, filename):
+ """
+  Save an image to the specified file.
+ """
+ img = cook_image_to_bytes(img)
+ img = np.ascontiguousarray(img)
ptr = img.ctypes.data
+ resy, resx, comp = img.shape
ti.core.imwrite(filename, ptr, resx, resy, comp)
def imread(filename, channels=0):
+ """
+  Load an image from the specified file.
+ """
ptr, resx, resy, comp = ti.core.imread(filename, channels)
img = np.ndarray(shape=(resy, resx, comp), dtype=np.uint8)
img = np.ascontiguousarray(img)
@@ -39,6 +72,9 @@ def imread(filename, channels=0):
def imshow(img, window_name='Taichi'):
+ """
+ Show image in a Taichi GUI.
+ """
if not isinstance(img, np.ndarray):
img = img.to_numpy()
assert len(img.shape) in [2,
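Consolidating the conversion in `cook_image_to_bytes` keeps `imwrite` and the new `imdisplay` consistent. Typical usage, as a sketch (assuming float inputs are scaled to `uint8` by the shared path, per its docstring):

```python
import numpy as np
import taichi as ti
ti.init(arch=ti.cpu)

img = np.random.rand(512, 512, 3).astype(np.float32)
ti.imwrite(img, 'noise.png')   # goes through cook_image_to_bytes
loaded = ti.imread('noise.png')
ti.imshow(loaded, 'preview')   # in IPython/Jupyter, ti.imdisplay(img) renders inline
```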
diff --git a/taichi/analysis/build_cfg.cpp b/taichi/analysis/build_cfg.cpp
index c5f314b4023ea..b876356a09095 100644
--- a/taichi/analysis/build_cfg.cpp
+++ b/taichi/analysis/build_cfg.cpp
@@ -15,7 +15,8 @@ class CFGBuilder : public IRVisitor {
int current_stmt_id;
int begin_location;
std::vector<CFGNode *> prev_nodes;
- bool in_offloaded_for;
+ OffloadedStmt *current_offload;
+ bool in_parallel_for;
public:
CFGBuilder()
@@ -23,7 +24,8 @@ class CFGBuilder : public IRVisitor {
last_node_in_current_block(nullptr),
current_stmt_id(-1),
begin_location(-1),
- in_offloaded_for(false) {
+ current_offload(nullptr),
+ in_parallel_for(false) {
allow_undefined_visitor = true;
invoke_default_visitor = true;
graph = std::make_unique<ControlFlowGraph>();
@@ -40,7 +42,7 @@ class CFGBuilder : public IRVisitor {
CFGNode *new_node(int next_begin_location) {
auto node = graph->push_back(current_block, begin_location, current_stmt_id,
- in_offloaded_for, last_node_in_current_block);
+ in_parallel_for, last_node_in_current_block);
for (auto &prev_node : prev_nodes) {
CFGNode::add_edge(prev_node, node);
}
@@ -125,14 +127,23 @@ class CFGBuilder : public IRVisitor {
}
void visit(RangeForStmt *stmt) override {
+ auto old_in_parallel_for = in_parallel_for;
+ if (!current_offload)
+ in_parallel_for = true;
visit_loop(stmt->body.get(), new_node(-1), false);
+ in_parallel_for = old_in_parallel_for;
}
void visit(StructForStmt *stmt) override {
+ auto old_in_parallel_for = in_parallel_for;
+ if (!current_offload)
+ in_parallel_for = true;
visit_loop(stmt->body.get(), new_node(-1), false);
+ in_parallel_for = old_in_parallel_for;
}
void visit(OffloadedStmt *stmt) override {
+ current_offload = stmt;
if (stmt->prologue) {
auto before_offload = new_node(-1);
int offload_stmt_id = current_stmt_id;
@@ -149,10 +160,10 @@ class CFGBuilder : public IRVisitor {
auto block_begin_index = graph->size();
if (stmt->task_type == OffloadedStmt::TaskType::range_for ||
stmt->task_type == OffloadedStmt::TaskType::struct_for) {
- in_offloaded_for = true;
+ in_parallel_for = true;
}
stmt->body->accept(this);
- in_offloaded_for = false;
+ in_parallel_for = false;
prev_nodes.push_back(graph->back());
// Container statements don't belong to any CFGNodes.
begin_location = offload_stmt_id + 1;
@@ -168,6 +179,7 @@ class CFGBuilder : public IRVisitor {
begin_location = offload_stmt_id + 1;
CFGNode::add_edge(before_offload, graph->nodes[block_begin_index].get());
}
+ current_offload = nullptr;
}
void visit(Block *block) override {
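The rename captures the loop semantics the CFG must track: before offloading, every top-level `for` in a kernel body is a parallel loop, while loops nested inside another loop body stay serial; recording the enclosing `OffloadedStmt` lets the builder tell the two cases apart. In user-facing terms, a sketch:

```python
import taichi as ti
ti.init(arch=ti.cpu)

val = ti.var(ti.i32, shape=(4, 4))

@ti.kernel
def fill():
    for i in range(4):        # top level: offloaded and parallelized
        for j in range(4):    # nested: runs serially within each outer iteration
            val[i, j] = i * 4 + j

fill()
```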
diff --git a/taichi/backends/cc/cc_configuation.h b/taichi/backends/cc/cc_configuation.h
new file mode 100644
index 0000000000000..6306f36b58866
--- /dev/null
+++ b/taichi/backends/cc/cc_configuation.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+struct CCConfiguation {
+ std::string compile_cmd;
+
+ CCConfiguation() : compile_cmd("gcc -shared -fPIC -o '{}' '{}'") {
+ }
+};
+
+extern CCConfiguation cfg;
+
+bool is_c_backend_available();
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/cc_kernel.h b/taichi/backends/cc/cc_kernel.h
new file mode 100644
index 0000000000000..390887808607e
--- /dev/null
+++ b/taichi/backends/cc/cc_kernel.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+class CCProgram;
+
+class CCKernel {
+ public:
+ CCKernel(std::string const &source, std::string const &name)
+ : source(source), name(name) {
+ }
+
+ void compile();
+ void launch(CCProgram *program, Context *ctx);
+
+ std::string source;
+ std::string name;
+
+ std::string src_path;
+ std::string bin_path;
+};
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/cc_layout.h b/taichi/backends/cc/cc_layout.h
new file mode 100644
index 0000000000000..0d3081c0645e6
--- /dev/null
+++ b/taichi/backends/cc/cc_layout.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+class CCLayout {
+ public:
+ CCLayout() {
+ }
+
+ std::string source;
+};
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/cc_program.cpp b/taichi/backends/cc/cc_program.cpp
new file mode 100644
index 0000000000000..234990d78dcc4
--- /dev/null
+++ b/taichi/backends/cc/cc_program.cpp
@@ -0,0 +1,50 @@
+#include "taichi/common/core.h"
+#include "taichi/system/dynamic_loader.h"
+#include "cc_program.h"
+#include "cc_configuation.h"
+#include "cc_kernel.h"
+#include "cc_layout.h"
+#include "cc_utils.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+CCConfiguation cfg;
+
+void CCKernel::compile() {
+ bin_path = fmt::format("{}/{}.so", runtime_tmp_dir, name);
+ src_path = fmt::format("{}/{}.c", runtime_tmp_dir, name);
+
+ std::ofstream(src_path) << source;
+ TI_INFO("[cc] compiling kernel [{}]:\n{}\n", name, source);
+ execute(cfg.compile_cmd, bin_path, src_path);
+}
+
+void CCKernel::launch(CCProgram *launcher, Context *ctx) {
+ using FuncEntryType = void();
+ DynamicLoader dll(bin_path);
+ TI_ASSERT_INFO(dll.loaded(), "[cc] could not load shared object: {}",
+ bin_path);
+ auto main =
+      reinterpret_cast<FuncEntryType *>(dll.load_function(get_sym_name(name)));
+ TI_INFO("[cc] entering kernel [{}]", name);
+ (*main)();
+ TI_INFO("[cc] leaving kernel [{}]", name);
+}
+
+void CCProgram::launch(CCKernel *kernel, Context *ctx) {
+ kernel->launch(this, ctx);
+}
+
+CCProgram::CCProgram() {
+}
+
+CCProgram::~CCProgram() {
+}
+
+bool is_c_backend_available() {
+ return true;
+}
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/cc_program.h b/taichi/backends/cc/cc_program.h
new file mode 100644
index 0000000000000..fc2888c30fb08
--- /dev/null
+++ b/taichi/backends/cc/cc_program.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+class CCKernel;
+class CCLayout;
+
+class CCProgram {
+  // Launches a C compiler on the generated source code, then loads and runs the result
+ public:
+ CCProgram();
+ ~CCProgram();
+
+ void launch(CCKernel *kernel, Context *ctx);
+
+  std::vector<std::unique_ptr<CCKernel>> kernels;
+  std::unique_ptr<CCLayout> layout;
+};
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/cc_utils.h b/taichi/backends/cc/cc_utils.h
new file mode 100644
index 0000000000000..0384850685430
--- /dev/null
+++ b/taichi/backends/cc/cc_utils.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+#include "taichi/common/core.h"
+#include <string>
+#include <vector>
+#include <sstream>
+#include <cstdlib>
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+inline std::string c_quoted(std::string const &str) {
+ // https://zh.cppreference.com/w/cpp/language/escape
+ std::stringstream ss;
+ ss << '"';
+ for (auto const &c : str) {
+ switch (c) {
+#define REG_ESC(x, y) \
+ case x: \
+ ss << "\\" y; \
+ break;
+ REG_ESC('\n', "n");
+ REG_ESC('\a', "a");
+ REG_ESC('\b', "b");
+ REG_ESC('\?', "?");
+ REG_ESC('\v', "v");
+ REG_ESC('\t', "t");
+ REG_ESC('\f', "f");
+ REG_ESC('\'', "'");
+ REG_ESC('\"', "\"");
+ REG_ESC('\\', "\\");
+ default:
+ ss << c;
+ }
+ }
+ ss << '"';
+ return ss.str();
+}
+
+inline std::string cc_data_type_name(DataType dt) {
+ switch (dt) {
+ case DataType::i32:
+ return "int";
+ case DataType::f32:
+ return "float";
+ default:
+ TI_NOT_IMPLEMENTED
+ }
+}
+
+inline std::string get_sym_name(std::string const &name) {
+ return fmt::format("Ti_{}", name);
+}
+
+template <typename... Args>
+inline int execute(std::string fmt, Args &&... args) {
+  auto cmd = fmt::format(fmt, std::forward<Args>(args)...);
+ TI_INFO("Executing command: {}", cmd);
+ int ret = std::system(cmd.c_str());
+ TI_INFO("Command exit status: {}", ret);
+ return ret;
+}
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/codegen_cc.cpp b/taichi/backends/cc/codegen_cc.cpp
new file mode 100644
index 0000000000000..1979fab3084ea
--- /dev/null
+++ b/taichi/backends/cc/codegen_cc.cpp
@@ -0,0 +1,148 @@
+#include "codegen_cc.h"
+#include "cc_kernel.h"
+#include "cc_layout.h"
+#include "taichi/ir/ir.h"
+#include "taichi/ir/transforms.h"
+#include "taichi/util/line_appender.h"
+#include "cc_utils.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp { // Codegen for C Compiler Processor
+
+class CCTransformer : public IRVisitor {
+ private:
+ [[maybe_unused]] Kernel *kernel;
+ [[maybe_unused]] CCLayout *layout;
+
+ LineAppender line_appender;
+ LineAppender line_appender_header;
+ bool is_top_level{true};
+
+ public:
+ CCTransformer(Kernel *kernel, CCLayout *layout)
+ : kernel(kernel), layout(layout) {
+ allow_undefined_visitor = true;
+ invoke_default_visitor = true;
+ }
+
+ void run() {
+ this->lower_ast();
+ emit_header("#include ");
+ kernel->ir->accept(this);
+ }
+
+ void lower_ast() {
+ auto ir = kernel->ir.get();
+ auto config = kernel->program.config;
+ config.demote_dense_struct_fors = true;
+ irpass::compile_to_offloads(ir, config,
+ /*vectorize=*/false, kernel->grad,
+ /*ad_use_stack=*/false, config.print_ir,
+ /*lower_global_access*/ true);
+ }
+
+ std::string get_source() {
+ return line_appender_header.lines() + "\n" + line_appender.lines();
+ }
+
+ private:
+ void visit(Block *stmt) override {
+ if (!is_top_level)
+ line_appender.push_indent();
+ for (auto &s : stmt->statements) {
+ s->accept(this);
+ }
+ if (!is_top_level)
+ line_appender.pop_indent();
+ }
+
+ void visit(Stmt *stmt) override {
+ TI_WARN("[cc] unsupported statement type {}", typeid(*stmt).name());
+ }
+
+ void visit(ConstStmt *stmt) override {
+ TI_ASSERT(stmt->width() == 1);
+ emit("{} {} = {};", cc_data_type_name(stmt->element_type()),
+ stmt->raw_name(), stmt->val[0].stringify());
+ }
+
+ void visit(PrintStmt *stmt) override {
+ std::string format;
+ std::vector values;
+
+ for (int i = 0; i < stmt->contents.size(); i++) {
+ auto const &content = stmt->contents[i];
+
+      if (std::holds_alternative<Stmt *>(content)) {
+        auto arg_stmt = std::get<Stmt *>(content);
+ format += data_type_format(arg_stmt->ret_type.data_type);
+ values.push_back(arg_stmt->raw_name());
+
+ } else {
+        auto str = std::get<std::string>(content);
+ format += "%s";
+ values.push_back(c_quoted(str));
+ }
+ }
+
+ values.insert(values.begin(), c_quoted(format));
+ emit("printf({});", fmt::join(values, ", "));
+ }
+
+ void generate_serial_kernel(OffloadedStmt *stmt) {
+ emit_header("void {}(void);", get_sym_name(kernel->name));
+ emit("void {}(void) {{", get_sym_name(kernel->name));
+ {
+ ScopedIndent _s(line_appender);
+ stmt->body->accept(this);
+ }
+ emit("}}");
+ }
+
+ void visit(OffloadedStmt *stmt) override {
+ TI_ASSERT(is_top_level);
+ is_top_level = false;
+ if (stmt->task_type == OffloadedStmt::TaskType::serial) {
+ generate_serial_kernel(stmt);
+ } else {
+ TI_ERROR("[glsl] Unsupported offload type={} on C backend",
+ stmt->task_name());
+ }
+ is_top_level = true;
+ }
+
+  template <typename... Args>
+ void emit(std::string f, Args &&... args) {
+ line_appender.append(std::move(f), std::move(args)...);
+ }
+
+  template <typename... Args>
+ void emit_header(std::string f, Args &&... args) {
+ line_appender_header.append(std::move(f), std::move(args)...);
+ }
+};
+
+std::unique_ptr<CCKernel> CCKernelGen::compile() {
+ auto layout = kernel->program.cc_program->layout.get();
+ CCTransformer tran(kernel, layout);
+
+ tran.run();
+ auto source = tran.get_source();
+ auto ker = std::make_unique(source, kernel->name);
+ ker->compile();
+ return ker;
+}
+
+FunctionType compile_kernel(Kernel *kernel) {
+ CCKernelGen codegen(kernel);
+ auto compiled = codegen.compile();
+ auto compiled_ptr = compiled.get();
+ auto program = kernel->program.cc_program.get();
+ program->kernels.push_back(std::move(compiled));
+ return [program, compiled_ptr](Context &ctx) {
+ return program->launch(compiled_ptr, &ctx);
+ };
+}
+
+} // namespace cccp
+TLANG_NAMESPACE_END
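At this stage the generated C covers serial offloads containing constants and `print`, enough for a smoke test. A sketch of the intended usage, assuming a Linux build with `-DTI_WITH_CC:BOOL=ON` and the arch fully wired through `ti.init`:

```python
import taichi as ti

ti.init(arch=ti.cc)  # requires a build with TI_WITH_CC=ON

@ti.kernel
def hello():
    print('hello from the C backend:', 42)

# The kernel body is emitted as C source under the runtime tmp dir, built
# with the configured command (default: gcc -shared -fPIC), then dlopen'ed
# and invoked through its Ti_-prefixed symbol.
hello()
```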
diff --git a/taichi/backends/cc/codegen_cc.h b/taichi/backends/cc/codegen_cc.h
new file mode 100644
index 0000000000000..de029e4f11e82
--- /dev/null
+++ b/taichi/backends/cc/codegen_cc.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+#include "taichi/codegen/codegen.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+class CCKernel;
+
+class CCKernelGen {
+  // Generates the corresponding C source code for a Taichi kernel
+ public:
+ CCKernelGen(Kernel *kernel) : kernel(kernel) {
+ }
+
+  std::unique_ptr<CCKernel> compile();
+
+ private:
+ Kernel *kernel;
+};
+
+FunctionType compile_kernel(Kernel *kernel);
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/struct_cc.cpp b/taichi/backends/cc/struct_cc.cpp
new file mode 100644
index 0000000000000..252152d623d38
--- /dev/null
+++ b/taichi/backends/cc/struct_cc.cpp
@@ -0,0 +1,14 @@
+#include "struct_cc.h"
+#include "cc_layout.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+std::unique_ptr<CCLayout> CCLayoutGen::compile() {
+  auto lay = std::make_unique<CCLayout>();
+ // W.I.P.
+ return lay;
+}
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/cc/struct_cc.h b/taichi/backends/cc/struct_cc.h
new file mode 100644
index 0000000000000..2b73b328052f9
--- /dev/null
+++ b/taichi/backends/cc/struct_cc.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+#include "taichi/ir/snode.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace cccp {
+
+class CCLayout;
+
+class CCLayoutGen {
+  // Generates the corresponding C source code for Taichi data structures
+ public:
+ CCLayoutGen(SNode *root) : root(root) {
+ }
+
+  std::unique_ptr<CCLayout> compile();
+
+ private:
+ SNode *root;
+};
+
+} // namespace cccp
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/metal/api.cpp b/taichi/backends/metal/api.cpp
index ebfdde95ceb3c..6e8314310241a 100644
--- a/taichi/backends/metal/api.cpp
+++ b/taichi/backends/metal/api.cpp
@@ -1,4 +1,6 @@
#include "taichi/backends/metal/api.h"
+
+#include "taichi/backends/metal/constants.h"
#include "taichi/util/environ_config.h"
TLANG_NAMESPACE_BEGIN
@@ -53,15 +55,18 @@ nsobj_unique_ptr<MTLComputeCommandEncoder> new_compute_command_encoder(
return wrap_as_nsobj_unique_ptr(encoder);
}
-nsobj_unique_ptr<MTLLibrary> new_library_with_source(
-    MTLDevice *device,
-    const std::string &source) {
+nsobj_unique_ptr<MTLLibrary> new_library_with_source(MTLDevice *device,
+                                                     const std::string &source,
+                                                     int msl_version) {
auto source_str = mac::wrap_string_as_ns_string(source);
id options = clscall("MTLCompileOptions", "alloc");
options = call(options, "init");
auto options_cleanup = wrap_as_nsobj_unique_ptr(options);
call(options, "setFastMathEnabled:", false);
+ if (msl_version != kMslVersionNone) {
+ call(options, "setLanguageVersion:", msl_version);
+ }
+  auto *lib = cast_call<MTLLibrary *>(
device, "newLibraryWithSource:options:error:", source_str.get(), options,
diff --git a/taichi/backends/metal/api.h b/taichi/backends/metal/api.h
index b905d70693176..7cd307eebf3c8 100644
--- a/taichi/backends/metal/api.h
+++ b/taichi/backends/metal/api.h
@@ -3,12 +3,12 @@
// Reference implementation:
// https://github.com/halide/Halide/blob/master/src/runtime/metal.cpp
+#include <string>
+
#include "taichi/common/trait.h"
#include "taichi/lang_util.h"
#include "taichi/platform/mac/objc_api.h"
-#include <string>
-
TLANG_NAMESPACE_BEGIN
namespace metal {
@@ -42,8 +42,11 @@ nsobj_unique_ptr<MTLCommandBuffer> new_command_buffer(MTLCommandQueue *queue);
nsobj_unique_ptr<MTLComputeCommandEncoder> new_compute_command_encoder(
    MTLCommandBuffer *buffer);
+// msl_version: Metal Shading Language version. 0 means not set.
+// See https://developer.apple.com/documentation/metal/mtllanguageversion
nsobj_unique_ptr<MTLLibrary> new_library_with_source(MTLDevice *device,
-                                                     const std::string &source);
+                                                     const std::string &source,
+                                                     int msl_version);
nsobj_unique_ptr<MTLFunction> new_function_with_name(MTLLibrary *library,
                                                     const std::string &name);
diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp
index a649a14bc7983..8a8a776b925ff 100644
--- a/taichi/backends/metal/codegen_metal.cpp
+++ b/taichi/backends/metal/codegen_metal.cpp
@@ -8,6 +8,8 @@
#include "taichi/ir/ir.h"
#include "taichi/ir/transforms.h"
#include "taichi/util/line_appender.h"
+#include "taichi/math/arithmetic.h"
+#include "taichi/backends/metal/api.h"
TLANG_NAMESPACE_BEGIN
namespace metal {
@@ -30,6 +32,7 @@ using BuffersEnum = KernelAttributes::Buffers;
constexpr char kKernelThreadIdName[] = "utid_"; // 'u' for unsigned
constexpr char kKernelGridSizeName[] = "ugrid_size_"; // 'u' for unsigned
+constexpr char kKernelTidInSimdgroupName[] = "utid_in_simdg_";
constexpr char kRootBufferName[] = "root_addr";
constexpr char kGlobalTmpsBufferName[] = "global_tmps_addr";
constexpr char kContextBufferName[] = "ctx_addr";
@@ -43,6 +46,7 @@ constexpr char kListgenElemVarName[] = "listgen_elem_";
constexpr char kRandStateVarName[] = "rand_state_";
constexpr char kSNodeMetaVarName[] = "sn_meta_";
constexpr char kMemAllocVarName[] = "mem_alloc_";
+constexpr char kTlsBufferName[] = "tls_buffer_";
std::string buffer_to_name(BuffersEnum b) {
switch (b) {
@@ -79,24 +83,22 @@ class KernelCodegen : public IRVisitor {
Section::Kernels,
};
- struct UsedFeatures {
- bool runtime_list_ops = false;
- };
-
public:
// TODO(k-ye): Create a Params to hold these ctor params.
KernelCodegen(const std::string &mtl_kernel_prefix,
const std::string &root_snode_type_name,
Kernel *kernel,
const CompiledStructs *compiled_structs,
- PrintStringTable *print_strtab)
+ PrintStringTable *print_strtab,
+ const CodeGen::Config &config)
: mtl_kernel_prefix_(mtl_kernel_prefix),
root_snode_type_name_(root_snode_type_name),
kernel_(kernel),
compiled_structs_(compiled_structs),
needs_root_buffer_(compiled_structs_->root_size > 0),
ctx_attribs_(*kernel_),
- print_strtab_(print_strtab) {
+ print_strtab_(print_strtab),
+ cgen_config_(config) {
// allow_undefined_visitor = true;
for (const auto s : kAllSections) {
section_appenders_[s] = LineAppender();
@@ -107,8 +109,8 @@ class KernelCodegen : public IRVisitor {
return ctx_attribs_;
}
-  const std::vector<KernelAttributes> &kernels_attribs() const {
- return mtl_kernels_attribs_;
+ const TaichiKernelAttributes &ti_kernels_attribs() const {
+ return ti_kernel_attribus_;
}
std::string run() {
@@ -340,6 +342,13 @@ class KernelCodegen : public IRVisitor {
stmt->raw_name(), dt, kGlobalTmpsBufferName, stmt->offset);
}
+ void visit(ThreadLocalPtrStmt *stmt) override {
+ TI_ASSERT(stmt->width() == 1);
+ emit("thread auto* {} = reinterpret_cast({} + {});",
+ stmt->raw_name(), metal_data_type_name(stmt->element_type()),
+ kTlsBufferName, stmt->offset);
+ }
+
void visit(LoopIndexStmt *stmt) override {
const auto stmt_name = stmt->raw_name();
if (stmt->loop->is<OffloadedStmt>()) {
@@ -451,24 +460,36 @@ class KernelCodegen : public IRVisitor {
TI_NOT_IMPLEMENTED;
}
+ std::string val_var = stmt->val->raw_name();
+ // TODO(k-ye): This is not a very reliable way to detect if we're in TLS
+ // xlogues...
+ const bool is_tls_reduction =
+ (inside_tls_epilogue_ && (op_type == AtomicOpType::add));
+ const bool use_simd_in_tls_reduction =
+ (is_tls_reduction && cgen_config_.allow_simdgroup);
+ if (use_simd_in_tls_reduction) {
+ val_var += "_simd_val_";
+ emit("const auto {} = simd_sum({});", val_var, stmt->val->raw_name());
+ emit("if ({} == 0) {{", kKernelTidInSimdgroupName);
+ current_appender().push_indent();
+ }
+
if (dt == DataType::i32) {
emit(
"const auto {} = atomic_fetch_{}_explicit((device atomic_int*){}, "
"{}, "
"metal::memory_order_relaxed);",
- stmt->raw_name(), op_name, stmt->dest->raw_name(),
- stmt->val->raw_name());
+ stmt->raw_name(), op_name, stmt->dest->raw_name(), val_var);
} else if (dt == DataType::u32) {
emit(
"const auto {} = atomic_fetch_{}_explicit((device atomic_uint*){}, "
"{}, "
"metal::memory_order_relaxed);",
- stmt->raw_name(), op_name, stmt->dest->raw_name(),
- stmt->val->raw_name());
+ stmt->raw_name(), op_name, stmt->dest->raw_name(), val_var);
} else if (dt == DataType::f32) {
if (handle_float) {
emit("const float {} = fatomic_fetch_{}({}, {});", stmt->raw_name(),
- op_name, stmt->dest->raw_name(), stmt->val->raw_name());
+ op_name, stmt->dest->raw_name(), val_var);
} else {
TI_ERROR("Metal does not support atomic {} for floating points",
op_name);
@@ -476,6 +497,11 @@ class KernelCodegen : public IRVisitor {
} else {
TI_ERROR("Metal only supports 32-bit atomic data types");
}
+
+ if (use_simd_in_tls_reduction) {
+ current_appender().pop_indent();
+ emit("}}"); // closes `if (kKernelTidInSimdgroupName == 0) {`
+ }
}
void visit(IfStmt *if_stmt) override {
@@ -561,7 +587,8 @@ class KernelCodegen : public IRVisitor {
}
void visit(PrintStmt *stmt) override {
- mark_print_used();
+ used_features()->print = true;
+
const auto &contents = stmt->contents;
const int num_entries = contents.size();
const std::string msgbuf_var_name = stmt->raw_name() + "_msgbuf_";
@@ -654,6 +681,7 @@ class KernelCodegen : public IRVisitor {
void emit_headers() {
SectionGuard sg(this, Section::Headers);
emit("#include ");
+ emit("#include ");
emit("using namespace metal;");
}
@@ -722,7 +750,7 @@ class KernelCodegen : public IRVisitor {
SectionGuard sg(this, Section::Kernels);
kernel_->ir->accept(this);
- if (used_features_.runtime_list_ops) {
+ if (used_features()->sparse) {
emit("");
current_appender().append_raw(shaders::kMetalRuntimeKernelsSourceCode);
}
@@ -768,7 +796,7 @@ class KernelCodegen : public IRVisitor {
emit("}}\n");
current_kernel_attribs_ = nullptr;
- mtl_kernels_attribs_.push_back(ka);
+ mtl_kernels_attribs()->push_back(ka);
}
void generate_range_for_kernel(OffloadedStmt *stmt) {
@@ -779,9 +807,16 @@ class KernelCodegen : public IRVisitor {
ka.task_type = stmt->task_type;
ka.buffers = get_common_buffers();
- emit_mtl_kernel_sig(mtl_kernel_name, ka.buffers);
+ const bool used_tls = (stmt->prologue != nullptr);
+ KernelSigExtensions kernel_exts;
+ kernel_exts.use_simdgroup = (used_tls && cgen_config_.allow_simdgroup);
+ used_features()->simdgroup =
+ used_features()->simdgroup || kernel_exts.use_simdgroup;
- auto &range_for_attribs = ka.range_for_attribs;
+ emit_mtl_kernel_sig(mtl_kernel_name, ka.buffers, kernel_exts);
+
+ ka.range_for_attribs = KernelAttributes::RangeForAttributes();
+ auto &range_for_attribs = ka.range_for_attribs.value();
range_for_attribs.const_begin = stmt->const_begin;
range_for_attribs.const_end = stmt->const_end;
range_for_attribs.begin =
@@ -820,23 +855,51 @@ class KernelCodegen : public IRVisitor {
emit("const int begin_ = {} + {};", kKernelThreadIdName, begin_expr);
// end_ = total_elems + begin_expr
emit("const int end_ = {} + {};", total_elems_name, begin_expr);
+
+ if (used_tls) {
+        // Using |int32_t| because it aligns to 4 bytes.
+ emit("// TLS prologue");
+ const std::string tls_bufi32_name = "tls_bufi32_";
+ emit("int32_t {}[{}];", tls_bufi32_name, (stmt->tls_size + 3) / 4);
+ emit("thread char* {} = reinterpret_cast({});",
+ kTlsBufferName, tls_bufi32_name);
+ stmt->prologue->accept(this);
+ }
+
emit("for (int ii = begin_; ii < end_; ii += {}) {{", kKernelGridSizeName);
{
ScopedIndent s2(current_appender());
current_kernel_attribs_ = &ka;
const auto mtl_func_name = mtl_kernel_func_name(mtl_kernel_name);
- emit_mtl_kernel_func_def(mtl_func_name, ka.buffers, stmt->body.get());
- emit_call_mtl_kernel_func(mtl_func_name, ka.buffers,
+      std::vector<FuncParamLiteral> extra_func_params;
+      std::vector<std::string> extra_args;
+ if (used_tls) {
+ extra_func_params.push_back({"thread char*", kTlsBufferName});
+ extra_args.push_back(kTlsBufferName);
+ }
+ emit_mtl_kernel_func_def(mtl_func_name, ka.buffers, extra_func_params,
+ stmt->body.get());
+ emit_call_mtl_kernel_func(mtl_func_name, ka.buffers, extra_args,
/*loop_index_expr=*/"ii");
}
emit("}}"); // closes for loop
+
+ if (used_tls) {
+ TI_ASSERT(stmt->epilogue != nullptr);
+ inside_tls_epilogue_ = true;
+ emit("{{ // TLS epilogue");
+ stmt->epilogue->accept(this);
+ inside_tls_epilogue_ = false;
+ emit("}}");
+ }
+
current_appender().pop_indent();
// Close kernel
emit("}}\n");
current_kernel_attribs_ = nullptr;
- mtl_kernels_attribs_.push_back(ka);
+ mtl_kernels_attribs()->push_back(ka);
}
void generate_struct_for_kernel(OffloadedStmt *stmt) {
@@ -919,7 +982,7 @@ class KernelCodegen : public IRVisitor {
current_appender().pop_indent();
emit("}}\n"); // closes kernel
- mtl_kernels_attribs_.push_back(ka);
+ mtl_kernels_attribs()->push_back(ka);
}
void add_runtime_list_op_kernel(OffloadedStmt *stmt,
@@ -944,11 +1007,13 @@ class KernelCodegen : public IRVisitor {
} else {
TI_ERROR("Unsupported offload task type {}", stmt->task_name());
}
- ka.runtime_list_op_attribs.snode = sn;
+
+ ka.runtime_list_op_attribs = KernelAttributes::RuntimeListOpAttributes();
+ ka.runtime_list_op_attribs->snode = sn;
current_kernel_attribs_ = nullptr;
- mtl_kernels_attribs_.push_back(ka);
- used_features_.runtime_list_ops = true;
+ mtl_kernels_attribs()->push_back(ka);
+ used_features()->sparse = true;
}
std::string inject_load_global_tmp(int offset, DataType dt = DataType::i32) {
@@ -1042,15 +1107,28 @@ class KernelCodegen : public IRVisitor {
loop_index_expr);
}
+ struct KernelSigExtensions {
+ // https://stackoverflow.com/a/44693603/12003165
+ KernelSigExtensions() noexcept {
+ }
+
+ bool use_simdgroup = false;
+ };
+
void emit_mtl_kernel_sig(
const std::string &kernel_name,
-      const std::vector<BuffersEnum> &buffers) {
+      const std::vector<BuffersEnum> &buffers,
+ const KernelSigExtensions &exts = {}) {
emit("kernel void {}(", kernel_name);
for (int i = 0; i < buffers.size(); ++i) {
emit(" device byte* {} [[buffer({})]],", buffer_to_name(buffers[i]),
i);
}
emit(" const uint {} [[threads_per_grid]],", kKernelGridSizeName);
+ if (exts.use_simdgroup) {
+ emit(" const uint {} [[thread_index_in_simdgroup]],",
+ kKernelTidInSimdgroupName);
+ }
emit(" const uint {} [[thread_position_in_grid]]) {{",
kKernelThreadIdName);
}
@@ -1089,11 +1167,6 @@ class KernelCodegen : public IRVisitor {
return kernel_name + "_func";
}
- void mark_print_used() {
- TI_ASSERT(current_kernel_attribs_ != nullptr);
- current_kernel_attribs_->uses_print = true;
- }
-
class SectionGuard {
public:
SectionGuard(KernelCodegen *kg, Section new_sec)
@@ -1124,6 +1197,14 @@ class KernelCodegen : public IRVisitor {
current_appender().append(std::move(f), std::forward<Args>(args)...);
}
+  std::vector<KernelAttributes> *mtl_kernels_attribs() {
+ return &(ti_kernel_attribus_.mtl_kernels_attribs);
+ }
+
+ TaichiKernelAttributes::UsedFeatures *used_features() {
+ return &(ti_kernel_attribus_.used_features);
+ }
+
const std::string mtl_kernel_prefix_;
const std::string root_snode_type_name_;
Kernel *const kernel_;
@@ -1131,13 +1212,14 @@ class KernelCodegen : public IRVisitor {
const bool needs_root_buffer_;
const KernelContextAttributes ctx_attribs_;
PrintStringTable *const print_strtab_;
+ const CodeGen::Config &cgen_config_;
bool is_top_level_{true};
int mtl_kernel_count_{0};
-  std::vector<KernelAttributes> mtl_kernels_attribs_;
- UsedFeatures used_features_;
+ TaichiKernelAttributes ti_kernel_attribus_;
GetRootStmt *root_stmt_{nullptr};
KernelAttributes *current_kernel_attribs_{nullptr};
+ bool inside_tls_epilogue_{false};
Section code_section_{Section::Structs};
std::unordered_map<Section, LineAppender> section_appenders_;
};
@@ -1146,12 +1228,14 @@ class KernelCodegen : public IRVisitor {
CodeGen::CodeGen(Kernel *kernel,
KernelManager *kernel_mgr,
- const CompiledStructs *compiled_structs)
+ const CompiledStructs *compiled_structs,
+ const Config &config)
: kernel_(kernel),
kernel_mgr_(kernel_mgr),
compiled_structs_(compiled_structs),
id_(Program::get_kernel_id()),
- taichi_kernel_name_(fmt::format("mtl_k{:04d}_{}", id_, kernel_->name)) {
+ taichi_kernel_name_(fmt::format("mtl_k{:04d}_{}", id_, kernel_->name)),
+ config_(config) {
}
FunctionType CodeGen::compile() {
@@ -1159,14 +1243,16 @@ FunctionType CodeGen::compile() {
config.demote_dense_struct_fors = true;
irpass::compile_to_offloads(kernel_->ir.get(), config,
/*vectorize=*/false, kernel_->grad,
- /*ad_use_stack=*/true, config.print_ir);
+ /*ad_use_stack=*/true, config.print_ir,
+ /*lower_global_access=*/true,
+ /*make_thread_local=*/true);
- KernelCodegen codegen(taichi_kernel_name_,
- kernel_->program.snode_root->node_type_name, kernel_,
- compiled_structs_, kernel_mgr_->print_strtable());
+ KernelCodegen codegen(
+ taichi_kernel_name_, kernel_->program.snode_root->node_type_name, kernel_,
+ compiled_structs_, kernel_mgr_->print_strtable(), config_);
const auto source_code = codegen.run();
kernel_mgr_->register_taichi_kernel(taichi_kernel_name_, source_code,
- codegen.kernels_attribs(),
+ codegen.ti_kernels_attribs(),
codegen.kernel_ctx_attribs());
return [kernel_mgr = kernel_mgr_,
kernel_name = taichi_kernel_name_](Context &ctx) {
diff --git a/taichi/backends/metal/codegen_metal.h b/taichi/backends/metal/codegen_metal.h
index 00e58a33a2aab..468180306d973 100644
--- a/taichi/backends/metal/codegen_metal.h
+++ b/taichi/backends/metal/codegen_metal.h
@@ -18,9 +18,14 @@ namespace metal {
class CodeGen {
public:
+ struct Config {
+ bool allow_simdgroup = true;
+ };
+
CodeGen(Kernel *kernel,
KernelManager *kernel_mgr,
- const CompiledStructs *compiled_structs);
+ const CompiledStructs *compiled_structs,
+ const Config &config);
FunctionType compile();
@@ -33,6 +38,7 @@ class CodeGen {
const CompiledStructs *const compiled_structs_;
const int id_;
const std::string taichi_kernel_name_;
+ const Config config_;
};
} // namespace metal
diff --git a/taichi/backends/metal/constants.h b/taichi/backends/metal/constants.h
index 9a8d244977b52..72244377abcc4 100644
--- a/taichi/backends/metal/constants.h
+++ b/taichi/backends/metal/constants.h
@@ -9,6 +9,7 @@ namespace metal {
inline constexpr int kMaxNumThreadsGridStrideLoop = 64 * 1024;
inline constexpr int kNumRandSeeds = 64 * 1024; // 256 KB is nothing
+inline constexpr int kMslVersionNone = 0;
} // namespace metal
-TLANG_NAMESPACE_END
\ No newline at end of file
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/metal/env_config.cpp b/taichi/backends/metal/env_config.cpp
new file mode 100644
index 0000000000000..d1d3d2da8426b
--- /dev/null
+++ b/taichi/backends/metal/env_config.cpp
@@ -0,0 +1,21 @@
+#include "taichi/backends/metal/env_config.h"
+
+#include "taichi/lang_util.h"
+#include "taichi/util/environ_config.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace metal {
+
+EnvConfig::EnvConfig() {
+ simdgroup_enabled_ =
+ get_environ_config("TI_USE_METAL_SIMDGROUP", /*default_value=*/1);
+}
+
+const EnvConfig &EnvConfig::instance() {
+ static const EnvConfig c;
+ return c;
+}
+
+} // namespace metal
+
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/metal/env_config.h b/taichi/backends/metal/env_config.h
new file mode 100644
index 0000000000000..94268d7874714
--- /dev/null
+++ b/taichi/backends/metal/env_config.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace metal {
+
+// Metal specific config inferred from the environment.
+class EnvConfig {
+ public:
+ // Set TI_USE_METAL_SIMDGROUP=0 to disable SIMD group.
+ // This is an ad-hoc thing. Apple claims that SIMD group is supported in
+ // MSL 2.0, which isn't the case. According to my test, it's available in
+  // MSL 2.1. Since MSL 2.1 has been available since macOS 10.14, we expect
+  // most users to be able to use this feature.
+ inline bool is_simdgroup_enabled() const {
+ return simdgroup_enabled_;
+ }
+
+ static const EnvConfig &instance();
+
+ private:
+ EnvConfig();
+
+ bool simdgroup_enabled_;
+};
+
+} // namespace metal
+
+TLANG_NAMESPACE_END
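The config is read once per process, so the environment variable has to be set before `ti.init`. A sketch:

```python
import os
# Opt out of the simdgroup reduction path, e.g. on macOS < 10.14
# where MSL 2.1 is unavailable.
os.environ['TI_USE_METAL_SIMDGROUP'] = '0'

import taichi as ti
ti.init(arch=ti.metal)
```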
diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
index ccd63af75f48d..b111951ad0326 100644
--- a/taichi/backends/metal/kernel_manager.cpp
+++ b/taichi/backends/metal/kernel_manager.cpp
@@ -33,6 +33,14 @@ namespace shaders {
using KernelTaskType = OffloadedStmt::TaskType;
using BufferEnum = KernelAttributes::Buffers;
+inline int infer_msl_version(const TaichiKernelAttributes::UsedFeatures &f) {
+ if (f.simdgroup) {
+ // https://developer.apple.com/documentation/metal/mtllanguageversion/version2_1
+ return 131073;
+ }
+ return kMslVersionNone;
+}
+
// This class requests the Metal buffer memory of |size| bytes from |mem_pool|.
// Once allocated, it does not own the memory (hence the name "view"). Instead,
// GC is deferred to the memory pool.
@@ -152,7 +160,7 @@ class RuntimeListOpsMtlKernel : public CompiledMtlKernelBase {
const SNodeDescriptorsMap *snode_descriptors = nullptr;
const SNode *snode() const {
- return kernel_attribs->runtime_list_op_attribs.snode;
+ return kernel_attribs->runtime_list_op_attribs->snode;
}
};
@@ -208,7 +216,7 @@ class CompiledTaichiKernel {
struct Params {
std::string_view taichi_kernel_name;
std::string mtl_source_code;
-    const std::vector<KernelAttributes> *mtl_kernels_attribs;
+ const TaichiKernelAttributes *ti_kernel_attribs;
const KernelContextAttributes *ctx_attribs;
const SNodeDescriptorsMap *snode_descriptors;
MTLDevice *device;
@@ -216,14 +224,17 @@ class CompiledTaichiKernel {
KernelProfilerBase *profiler;
};
- CompiledTaichiKernel(Params params) : ctx_attribs(*params.ctx_attribs) {
+ CompiledTaichiKernel(Params params)
+ : ctx_attribs(*params.ctx_attribs),
+ used_features(params.ti_kernel_attribs->used_features) {
auto *const device = params.device;
- auto kernel_lib = new_library_with_source(device, params.mtl_source_code);
+ auto kernel_lib = new_library_with_source(device, params.mtl_source_code,
+ infer_msl_version(used_features));
if (kernel_lib == nullptr) {
TI_ERROR("Failed to compile Metal kernel! Generated code:\n\n{}",
params.mtl_source_code);
}
- for (const auto &ka : *(params.mtl_kernels_attribs)) {
+ for (const auto &ka : params.ti_kernel_attribs->mtl_kernels_attribs) {
auto mtl_func = new_function_with_name(kernel_lib.get(), ka.name);
TI_ASSERT(mtl_func != nullptr);
// Note that CompiledMtlKernel doesn't own |kernel_func|.
@@ -265,8 +276,7 @@ class CompiledTaichiKernel {
KernelContextAttributes ctx_attribs;
std::unique_ptr<BufferMemoryView> ctx_mem;
nsobj_unique_ptr<MTLBuffer> ctx_buffer;
-
- private:
+ TaichiKernelAttributes::UsedFeatures used_features;
};
class HostMetalCtxBlitter {
@@ -445,11 +455,10 @@ class KernelManager::Impl {
init_print_buffer();
}
- void register_taichi_kernel(
- const std::string &taichi_kernel_name,
- const std::string &mtl_kernel_source_code,
-      const std::vector<KernelAttributes> &kernels_attribs,
- const KernelContextAttributes &ctx_attribs) {
+ void register_taichi_kernel(const std::string &taichi_kernel_name,
+ const std::string &mtl_kernel_source_code,
+ const TaichiKernelAttributes &ti_kernel_attribs,
+ const KernelContextAttributes &ctx_attribs) {
TI_ASSERT(compiled_taichi_kernels_.find(taichi_kernel_name) ==
compiled_taichi_kernels_.end());
@@ -463,7 +472,7 @@ class KernelManager::Impl {
CompiledTaichiKernel::Params params;
params.taichi_kernel_name = taichi_kernel_name;
params.mtl_source_code = mtl_kernel_source_code;
- params.mtl_kernels_attribs = &kernels_attribs;
+ params.ti_kernel_attribs = &ti_kernel_attribs;
params.ctx_attribs = &ctx_attribs;
params.snode_descriptors = &compiled_structs_.snode_descriptors;
params.device = device_.get();
@@ -493,13 +502,11 @@ class KernelManager::Impl {
input_buffers[BufferEnum::Context] = ctk.ctx_buffer.get();
}
- bool uses_print = false;
for (const auto &mk : ctk.compiled_mtl_kernels) {
mk->launch(input_buffers, cur_command_buffer_.get());
- uses_print = (uses_print || mk->kernel_attribs()->uses_print);
}
-
- if (ctx_blitter || uses_print) {
+ const bool used_print = ctk.used_features.print;
+ if (ctx_blitter || used_print) {
// TODO(k-ye): One optimization is to synchronize only when we absolutely
// need to transfer the data back to host. This includes the cases where
// an arg is 1) an array, or 2) used as return value.
@@ -507,7 +514,7 @@ class KernelManager::Impl {
if (ctx_blitter) {
ctx_blitter->metal_to_host();
}
- if (uses_print) {
+ if (used_print) {
flush_print_buffers();
}
}
@@ -741,11 +748,10 @@ class KernelManager::Impl {
TI_ERROR("Metal not supported on the current OS");
}
- void register_taichi_kernel(
- const std::string &taichi_kernel_name,
- const std::string &mtl_kernel_source_code,
-      const std::vector<KernelAttributes> &kernels_attribs,
- const KernelContextAttributes &ctx_attribs) {
+ void register_taichi_kernel(const std::string &taichi_kernel_name,
+ const std::string &mtl_kernel_source_code,
+ const TaichiKernelAttributes &ti_kernel_attribs,
+ const KernelContextAttributes &ctx_attribs) {
TI_ERROR("Metal not supported on the current OS");
}
@@ -776,10 +782,10 @@ KernelManager::~KernelManager() {
void KernelManager::register_taichi_kernel(
const std::string &taichi_kernel_name,
const std::string &mtl_kernel_source_code,
-    const std::vector<KernelAttributes> &kernels_attribs,
+ const TaichiKernelAttributes &ti_kernel_attribs,
const KernelContextAttributes &ctx_attribs) {
impl_->register_taichi_kernel(taichi_kernel_name, mtl_kernel_source_code,
- kernels_attribs, ctx_attribs);
+ ti_kernel_attribs, ctx_attribs);
}
void KernelManager::launch_taichi_kernel(const std::string &taichi_kernel_name,
diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h
index dc3502577bdd2..1c687cedec82b 100644
--- a/taichi/backends/metal/kernel_manager.h
+++ b/taichi/backends/metal/kernel_manager.h
@@ -39,11 +39,10 @@ class KernelManager {
// * |mtl_kernel_source_code| is the complete source code compiled from a
// Taichi kernel. It may include one or more Metal compute kernels. Each
// Metal kernel is identified by one item in |kernels_attribs|.
- void register_taichi_kernel(
- const std::string &taichi_kernel_name,
- const std::string &mtl_kernel_source_code,
-      const std::vector<KernelAttributes> &kernels_attribs,
- const KernelContextAttributes &ctx_attribs);
+ void register_taichi_kernel(const std::string &taichi_kernel_name,
+ const std::string &mtl_kernel_source_code,
+ const TaichiKernelAttributes &ti_kernel_attribs,
+ const KernelContextAttributes &ctx_attribs);
// Launch the given |taichi_kernel_name|.
// Kernel launching is asynchronous, therefore the Metal memory is not valid
diff --git a/taichi/backends/metal/kernel_util.cpp b/taichi/backends/metal/kernel_util.cpp
index a4d3cc67190dd..5ef0bbc23879b 100644
--- a/taichi/backends/metal/kernel_util.cpp
+++ b/taichi/backends/metal/kernel_util.cpp
@@ -50,7 +50,7 @@ std::string KernelAttributes::debug_string() const {
// TODO(k-ye): show range_for
if (task_type == OffloadedStmt::TaskType::clear_list ||
task_type == OffloadedStmt::TaskType::listgen) {
- result += fmt::format(" snode={}", runtime_list_op_attribs.snode->id);
+ result += fmt::format(" snode={}", runtime_list_op_attribs->snode->id);
}
result += ">";
return result;
diff --git a/taichi/backends/metal/kernel_util.h b/taichi/backends/metal/kernel_util.h
index a638702c149e2..fd5a7d47b6785 100644
--- a/taichi/backends/metal/kernel_util.h
+++ b/taichi/backends/metal/kernel_util.h
@@ -66,20 +66,31 @@ struct KernelAttributes {
};
std::vector buffers;
// Only valid when |task_type| is range_for.
- // TODO(k-ye): Use std::optional to wrap |task_type| dependent attributes.
- RangeForAttributes range_for_attribs;
- // clear_list + listgen
- RuntimeListOpAttributes runtime_list_op_attribs;
-
- // Whether print() is called inside this kernel.
- // TODO(k-ye): Encapsulate this inside a UsedFeatures. However, we need a
- // TaichiKernelAttributes before we can do this.
- bool uses_print = false;
+  std::optional<RangeForAttributes> range_for_attribs;
+ // Only valid when |task_type| is {clear_list, listgen}.
+  std::optional<RuntimeListOpAttributes> runtime_list_op_attribs;
static std::string buffers_name(Buffers b);
std::string debug_string() const;
};
+// Groups all the Metal kernels generated from a single ti.kernel
+struct TaichiKernelAttributes {
+ struct UsedFeatures {
+ // Whether print() is called inside this kernel.
+ bool print = false;
+ // Whether this kernel accesses (read or write) sparse SNodes.
+ bool sparse = false;
+ // Whether [[thread_index_in_simdgroup]] is used. This is only supported
+ // since MSL 2.1
+ bool simdgroup = false;
+ };
+
+ // Attributes of all the Metal kernels produced from this Taichi kernel.
+  std::vector<KernelAttributes> mtl_kernels_attribs;
+ UsedFeatures used_features;
+};
+
// This class contains the attributes descriptors for both the input args and
// the return values of a Taichi kernel.
//
diff --git a/taichi/backends/metal/shaders/print.metal.h b/taichi/backends/metal/shaders/print.metal.h
index 1c8d2c74fc7d6..5cb3971540261 100644
--- a/taichi/backends/metal/shaders/print.metal.h
+++ b/taichi/backends/metal/shaders/print.metal.h
@@ -37,19 +37,21 @@ STR(
constant constexpr int kMetalPrintMsgTypeWidthMask =
((1 << kMetalNumBitsPerPrintMsgType) - 1);
- [[maybe_unused]] inline int mtl_compute_num_print_msg_typemasks(
- int num_entries) {
- return (num_entries + kMetalNumPrintMsgTypePerI32 - 1) /
- kMetalNumPrintMsgTypePerI32;
- }
-
- [[maybe_unused]] inline int mtl_compute_print_msg_bytes(int num_entries) {
- // See PrintMsg's layout for how this is computed.
- const int sz =
- sizeof(int32_t) *
- (1 + mtl_compute_num_print_msg_typemasks(num_entries) + num_entries);
- return sz;
- }
+ [[maybe_unused]] inline int mtl_compute_num_print_msg_typemasks(
+ int num_entries) {
+ return (num_entries + kMetalNumPrintMsgTypePerI32 - 1) /
+ kMetalNumPrintMsgTypePerI32;
+ }
+
+ [[maybe_unused]] inline int mtl_compute_print_msg_bytes(
+ int num_entries) {
+ // See PrintMsg's layout for how this is computed.
+ const int sz =
+ sizeof(int32_t) *
+ (1 + mtl_compute_num_print_msg_typemasks(num_entries) +
+ num_entries);
+ return sz;
+ }
class PrintMsg {
public:
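
The re-indented helpers above encode a fixed message layout, and the size arithmetic is easy to sanity-check by hand. Below is a Python sketch of the same computation; the 4-bits-per-type constant is an assumption for illustration only (the real values come from kMetalNumBitsPerPrintMsgType earlier in this header):

    # Sketch of mtl_compute_print_msg_bytes with assumed constants.
    BITS_PER_MSG_TYPE = 4                    # assumption, for illustration
    TYPES_PER_I32 = 32 // BITS_PER_MSG_TYPE  # 8 type tags per 32-bit mask

    def num_typemasks(num_entries: int) -> int:
        # Ceiling division: one i32 mask covers TYPES_PER_I32 entries.
        return (num_entries + TYPES_PER_I32 - 1) // TYPES_PER_I32

    def print_msg_bytes(num_entries: int) -> int:
        # Per the formula: one leading i32, the typemask words, then
        # one i32 per entry.
        return 4 * (1 + num_typemasks(num_entries) + num_entries)

    assert print_msg_bytes(5) == 4 * (1 + 1 + 5)  # one typemask suffices
    assert print_msg_bytes(9) == 4 * (1 + 2 + 9)  # nine entries need two
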
diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index d466d1bc25ae4..1cdc0eb16648c 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -322,7 +322,7 @@ bool initialize_opengl(bool error_tolerance) {
desc = "Unknown Error";
if (error_tolerance) {
// error tolerated, returning false
- TI_TRACE("[glsl] cannot create GLFW window: error {}: {}", status, desc);
+ TI_DEBUG("[glsl] cannot create GLFW window: error {}: {}", status, desc);
supported = std::make_optional(false);
return false;
}
diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h
index 3c021debbfccd..6f618f7f58adb 100644
--- a/taichi/inc/archs.inc.h
+++ b/taichi/inc/archs.inc.h
@@ -2,12 +2,14 @@
// CPU archs
PER_ARCH(x64) // a.k.a. AMD64/x86_64
-PER_ARCH(arm64) // a.k.a. Aarch64
-PER_ARCH(js) // Javascript
+PER_ARCH(arm64) // a.k.a. Aarch64, WIP
+PER_ARCH(js) // Javascript, N/A
+PER_ARCH(cc) // C language, WIP
// GPU archs
PER_ARCH(cuda) // NVIDIA CUDA
-PER_ARCH(opencl) // OpenCL
PER_ARCH(metal) // Apple Metal
PER_ARCH(opengl) // OpenGL Compute Shaders
-PER_ARCH(dx) // Microsoft DirectX
+PER_ARCH(dx) // Microsoft DirectX, N/A
+PER_ARCH(opencl) // OpenCL, N/A
+PER_ARCH(amdgpu) // AMD GPU, N/A
diff --git a/taichi/ir/control_flow_graph.cpp b/taichi/ir/control_flow_graph.cpp
index f0a726d50a808..cb92bb04030ef 100644
--- a/taichi/ir/control_flow_graph.cpp
+++ b/taichi/ir/control_flow_graph.cpp
@@ -101,8 +101,7 @@ void CFGNode::reaching_definition_analysis(bool after_lower_access) {
auto data_source_ptr = irpass::analysis::get_store_destination(stmt);
if (data_source_ptr) {
// stmt provides a data source
- if (after_lower_access &&
- !(stmt->is<AllocaStmt>() || stmt->is<LocalStoreStmt>())) {
+ if (after_lower_access && !(data_source_ptr->is<AllocaStmt>())) {
// After lower_access, we only analyze local variables.
continue;
}
diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp
index 083b9fe6e1490..20ea0cd594fbe 100644
--- a/taichi/ir/expr.cpp
+++ b/taichi/ir/expr.cpp
@@ -179,7 +179,9 @@ Expr load(const Expr &ptr) {
Expr ptr_if_global(const Expr &var) {
if (var.is<GlobalVariableExpression>()) {
// singleton global variable
- TI_ASSERT(var.snode()->num_active_indices == 0);
+ TI_ASSERT_INFO(
+ var.snode()->num_active_indices == 0,
+ "Please always use 'x[None]' (instead of simply 'x') to access any 0-D tensor.");
return var[ExprGroup()];
} else {
// may be any local or global expr
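
The new assertion message deserves a concrete illustration. A minimal sketch using the 0.6.x Python API (assuming a CPU build); writing plain x where x[None] is meant now produces the friendlier message above:

    import taichi as ti

    x = ti.var(ti.f32, shape=())  # a 0-D tensor: no active indices

    @ti.kernel
    def set_x():
        x[None] = 42.0  # correct: index 0-D tensors with None

    set_x()
    print(x[None])  # likewise use x[None], not plain x, in Python scope
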
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index d1c38dbc64bf6..fc4938a2ef97c 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -35,6 +35,7 @@ CompileConfig::CompileConfig() {
verbose = true;
fast_math = true;
async = false;
+ flatten_if = false;
#if defined(TI_PLATFORM_WINDOWS) or defined(TI_ARCH_ARM)
use_unified_memory = false;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 6ee00204fdc72..02c5cf64c15e3 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -37,6 +37,7 @@ struct CompileConfig {
bool fast_math;
bool use_unified_memory;
bool async;
+ bool flatten_if;
DataType default_fp;
DataType default_ip;
std::string extra_flags;
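
flatten_if defaults to false (compile_config.cpp above) and is exported to Python in export_lang.cpp further down. A usage sketch, assuming the flag is reachable through ti.cfg like the neighboring options:

    import taichi as ti

    ti.init(arch=ti.x64)
    ti.cfg.flatten_if = True  # opt back in to if-flattening

    @ti.kernel
    def pick(c: ti.i32) -> ti.i32:
        r = 0
        if c != 0:  # with flatten_if on, cheap two-sided branches like
            r = 1   # this may be rewritten into select-style code by
        else:       # BasicBlockSimplify; with it off (the new default),
            r = 2   # the branch is kept as-is
        return r

    assert pick(3) == 1 and pick(0) == 2
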
diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp
index 55df70d6f6967..b1a0cd971fb7d 100644
--- a/taichi/program/kernel_profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -148,13 +148,10 @@ class KernelProfilerCUDA : public KernelProfilerBase {
} // namespace
std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch) {
- if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal ||
- arch == Arch::opengl) {
- return std::make_unique<DefaultProfiler>(arch);
- } else if (arch == Arch::cuda) {
+ if (arch == Arch::cuda) {
return std::make_unique<KernelProfilerCUDA>();
} else {
- TI_NOT_IMPLEMENTED;
+ return std::make_unique<DefaultProfiler>(arch);
}
}
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 9a6c94f416fb5..6567ab2d87d55 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -11,12 +11,16 @@
#include "taichi/backends/cuda/cuda_context.h"
#endif
#include "taichi/backends/metal/codegen_metal.h"
+#include "taichi/backends/metal/env_config.h"
#include "taichi/backends/opengl/codegen_opengl.h"
+#include "taichi/backends/cc/codegen_cc.h"
#include "taichi/backends/cpu/codegen_cpu.h"
#include "taichi/struct/struct.h"
#include "taichi/struct/struct_llvm.h"
#include "taichi/backends/metal/struct_metal.h"
#include "taichi/backends/opengl/struct_opengl.h"
+#include "taichi/backends/cc/struct_cc.h"
+#include "taichi/backends/cc/cc_layout.h"
#include "taichi/system/unified_allocator.h"
#include "taichi/ir/snode.h"
#include "taichi/ir/frontend_ir.h"
@@ -152,13 +156,20 @@ FunctionType Program::compile(Kernel &kernel) {
auto codegen = KernelCodeGen::create(kernel.arch, &kernel);
ret = codegen->compile();
} else if (kernel.arch == Arch::metal) {
+ metal::CodeGen::Config cgen_config;
+ cgen_config.allow_simdgroup =
+ metal::EnvConfig::instance().is_simdgroup_enabled();
metal::CodeGen codegen(&kernel, metal_kernel_mgr_.get(),
- &metal_compiled_structs_.value());
+ &metal_compiled_structs_.value(), cgen_config);
ret = codegen.compile();
} else if (kernel.arch == Arch::opengl) {
opengl::OpenglCodeGen codegen(kernel.name, &opengl_struct_compiled_.value(),
opengl_kernel_launcher_.get());
ret = codegen.compile(*this, kernel);
+#ifdef TI_WITH_CC
+ } else if (kernel.arch == Arch::cc) {
+ ret = cccp::compile_kernel(&kernel);
+#endif
} else {
TI_NOT_IMPLEMENTED;
}
@@ -315,6 +326,12 @@ void Program::materialize_layout() {
opengl_struct_compiled_->root_size);
opengl_kernel_launcher_ = std::make_unique<opengl::GLSLLauncher>(
opengl_struct_compiled_->root_size);
+#ifdef TI_WITH_CC
+ } else if (config.arch == Arch::cc) {
+ cc_program = std::make_unique<cccp::CCProgram>();
+ cccp::CCLayoutGen scomp(snode_root.get());
+ cc_program->layout = scomp.compile();
+#endif
}
}
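
With codegen and layout compilation wired up here, the C backend becomes selectable like any other arch. A sketch, assuming a Linux build configured with -DTI_WITH_CC:BOOL=ON and ti.cc exposed on the Python side like the other archs:

    import taichi as ti

    ti.init(arch=ti.cc)  # requires TI_WITH_CC; guarded by #ifdef above

    x = ti.var(ti.i32, shape=4)

    @ti.kernel
    def fill():
        for i in x:
            x[i] = i * i

    fill()
    print([x[i] for i in range(4)])  # [0, 1, 4, 9]
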
diff --git a/taichi/program/program.h b/taichi/program/program.h
index 68b103a27eec8..740ee01f0491d 100644
--- a/taichi/program/program.h
+++ b/taichi/program/program.h
@@ -14,6 +14,7 @@
#include "taichi/backends/metal/kernel_manager.h"
#include "taichi/backends/opengl/opengl_kernel_launcher.h"
#include "taichi/backends/opengl/opengl_kernel_util.h"
+#include "taichi/backends/cc/cc_program.h"
#include "taichi/program/kernel.h"
#include "taichi/program/kernel_profiler.h"
#include "taichi/runtime/llvm/context.h"
@@ -245,6 +246,12 @@ class Program {
// OpenGL related data structures
std::optional<opengl::StructCompiledResult> opengl_struct_compiled_;
std::unique_ptr<opengl::GLSLLauncher> opengl_kernel_launcher_;
+
+ public:
+#ifdef TI_WITH_CC
+ // C backend related data structures
+ std::unique_ptr<cccp::CCProgram> cc_program;
+#endif
};
TLANG_NAMESPACE_END
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index b34949ac13667..08d0c041b0f76 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -109,7 +109,8 @@ void export_lang(py::module &m) {
&CompileConfig::device_memory_fraction)
.def_readwrite("fast_math", &CompileConfig::fast_math)
.def_readwrite("ad_stack_size", &CompileConfig::ad_stack_size)
- .def_readwrite("async", &CompileConfig::async);
+ .def_readwrite("async", &CompileConfig::async)
+ .def_readwrite("flatten_if", &CompileConfig::flatten_if);
m.def("reset_default_compile_config",
[&]() { default_compile_config = CompileConfig(); });
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index e48a3b8fb9d7c..78136fb68f713 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -15,6 +15,7 @@
#include "taichi/system/dynamic_loader.h"
#include "taichi/backends/metal/api.h"
#include "taichi/backends/opengl/opengl_api.h"
+#include "taichi/backends/cc/cc_configuation.h"
#if defined(TI_WITH_CUDA)
#include "taichi/backends/cuda/cuda_driver.h"
#endif
@@ -156,12 +157,15 @@ void export_misc(py::module &m) {
}
printf("test was successful.\n");
});
- m.def("pop_python_print_buffer", []() {
- return py_cout.pop_content();
- });
+ m.def("pop_python_print_buffer", []() { return py_cout.pop_content(); });
m.def("with_cuda", is_cuda_api_available);
m.def("with_metal", taichi::lang::metal::is_metal_api_available);
m.def("with_opengl", taichi::lang::opengl::is_opengl_api_available);
+#ifdef TI_WITH_CC
+ m.def("with_cc", taichi::lang::cccp::is_c_backend_available);
+#else
+ m.def("with_cc", []() { return false; });
+#endif
}
TI_NAMESPACE_END
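
Mirroring with_cuda/with_metal/with_opengl, the new binding lets Python probe for the C backend, and the #else stub keeps the symbol defined in builds without it. A sketch, assuming the pybind module is reachable as ti.core as in contemporary Taichi:

    import taichi as ti

    # with_cc() returns False when TI_WITH_CC was not defined at build time.
    arch = ti.cc if ti.core.with_cc() else ti.x64
    ti.init(arch=arch)
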
diff --git a/taichi/system/dynamic_loader.cpp b/taichi/system/dynamic_loader.cpp
index fbe54b7a66268..2923fa34c8642 100644
--- a/taichi/system/dynamic_loader.cpp
+++ b/taichi/system/dynamic_loader.cpp
@@ -21,7 +21,7 @@ void DynamicLoader::load_dll(const std::string &dll_path) {
}
void *DynamicLoader::load_function(const std::string &func_name) {
- TI_ASSERT(loaded());
+ TI_ASSERT_INFO(loaded(), "DLL not opened");
#ifdef WIN32
auto func = (void *)GetProcAddress((HMODULE)dll, func_name.c_str());
#else
@@ -34,7 +34,7 @@ void *DynamicLoader::load_function(const std::string &func_name) {
}
void DynamicLoader::close_dll() {
- TI_ERROR_IF(!loaded(), "DLL not opened.");
+ TI_ASSERT_INFO(loaded(), "DLL not opened");
#ifdef WIN32
FreeLibrary((HMODULE)dll);
#else
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 9ec3112753186..3b8ae0b064c9a 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -59,13 +59,10 @@ void compile_to_offloads(IRNode *ir,
print("Loop Split");
irpass::analysis::verify(ir);
}
- irpass::simplify(ir);
+ irpass::full_simplify(ir);
print("Simplified I");
irpass::analysis::verify(ir);
- irpass::constant_fold(ir);
- print("Constant folded I");
-
if (grad) {
// Remove local atomics here so that we don't have to handle their gradients
irpass::demote_atomics(ir);
@@ -105,9 +102,6 @@ void compile_to_offloads(IRNode *ir,
print("Simplified II");
irpass::analysis::verify(ir);
- irpass::constant_fold(ir);
- print("Constant folded II");
-
irpass::offload(ir);
print("Offloaded");
irpass::analysis::verify(ir);
diff --git a/taichi/transforms/constant_fold.cpp b/taichi/transforms/constant_fold.cpp
index 02d407dd61140..ec1d294728277 100644
--- a/taichi/transforms/constant_fold.cpp
+++ b/taichi/transforms/constant_fold.cpp
@@ -166,6 +166,12 @@ class ConstantFold : public BasicStmtVisitor {
}
void visit(UnaryOpStmt *stmt) override {
+ if (stmt->is_cast() &&
+ stmt->cast_type == stmt->operand->ret_type.data_type) {
+ stmt->replace_with(stmt->operand);
+ modifier.erase(stmt);
+ return;
+ }
auto operand = stmt->operand->cast<ConstStmt>();
if (!operand)
return;
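
The new early-out drops casts whose target type already equals the operand's type, before any constant-operand handling. The observable effect, sketched in Python (the fold happens at the IR level; this merely shows the kind of expression it targets):

    import taichi as ti

    ti.init(arch=ti.x64)

    @ti.kernel
    def noop_cast(a: ti.i32) -> ti.i32:
        # a is already i32, so this UnaryOpStmt is now replaced by its
        # operand and erased rather than surviving to codegen.
        return ti.cast(a, ti.i32)

    assert noop_cast(7) == 7
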
diff --git a/taichi/transforms/simplify.cpp b/taichi/transforms/simplify.cpp
index 37d9cb74527f0..8a62fb0554603 100644
--- a/taichi/transforms/simplify.cpp
+++ b/taichi/transforms/simplify.cpp
@@ -2,6 +2,8 @@
#include "taichi/ir/transforms.h"
#include "taichi/ir/analysis.h"
#include "taichi/ir/visitors.h"
+#include "taichi/program/kernel.h"
+#include "taichi/program/program.h"
#include <set>
#include <unordered_set>
#include <utility>
@@ -1062,13 +1064,15 @@ class BasicBlockSimplify : public IRVisitor {
return false;
};
- if (if_stmt->true_statements &&
- flatten(if_stmt->true_statements->statements, true)) {
- throw IRModified();
- }
- if (if_stmt->false_statements &&
- flatten(if_stmt->false_statements->statements, false)) {
- throw IRModified();
+ if (kernel->program.config.flatten_if) {
+ if (if_stmt->true_statements &&
+ flatten(if_stmt->true_statements->statements, true)) {
+ throw IRModified();
+ }
+ if (if_stmt->false_statements &&
+ flatten(if_stmt->false_statements->statements, false)) {
+ throw IRModified();
+ }
}
if (if_stmt->true_statements) {
@@ -1141,6 +1145,9 @@ class Simplify : public IRVisitor {
Kernel *kernel;
Simplify(IRNode *node, Kernel *kernel) : kernel(kernel) {
+ if (!kernel)
+ this->kernel = node->get_kernel();
+ TI_ASSERT(this->kernel);
modified = false;
allow_undefined_visitor = true;
invoke_default_visitor = true;
@@ -1176,7 +1183,9 @@ class Simplify : public IRVisitor {
}
void visit(StructForStmt *for_stmt) override {
- TI_ASSERT(current_struct_for == nullptr);
+ TI_ASSERT_INFO(current_struct_for == nullptr,
+ "Nested struct-fors are not supported for now. "
+ "Please try to use range-fors for inner loops.");
current_struct_for = for_stmt;
for_stmt->body->accept(this);
current_struct_for = nullptr;
@@ -1225,15 +1234,18 @@ void full_simplify(IRNode *root, Kernel *kernel) {
modified = true;
if (constant_fold(root))
modified = true;
+ if (die(root))
+ modified = true;
if (alg_simp(root))
modified = true;
- die(root);
- if (whole_kernel_cse(root))
+ if (die(root))
modified = true;
- die(root);
if (simplify(root, kernel))
modified = true;
- die(root);
+ if (die(root))
+ modified = true;
+ if (whole_kernel_cse(root))
+ modified = true;
if (!modified)
break;
}
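
After this reordering, every die() pass also feeds the modified flag, so the loop only exits at a true fixed point. The control flow, as a self-contained Python sketch (a toy pass stands in for the irpass calls):

    from typing import Callable, List

    def run_to_fixed_point(root, passes: List[Callable]) -> None:
        # Mirrors the new full_simplify loop: run every pass each round,
        # OR together their "modified" results, stop when nothing changed.
        while True:
            results = [p(root) for p in passes]  # run all, in order
            if not any(results):                 # fixed point reached
                break

    # Toy pass: "simplify" a counter down to zero, reporting changes.
    def shrink(root: dict) -> bool:
        if root["n"] > 0:
            root["n"] -= 1
            return True
        return False

    state = {"n": 3}
    run_to_fixed_point(state, [shrink])
    assert state["n"] == 0
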
diff --git a/taichi/util/environ_config.h b/taichi/util/environ_config.h
index cf65e3dce4bee..9a4aae477eab4 100644
--- a/taichi/util/environ_config.h
+++ b/taichi/util/environ_config.h
@@ -7,8 +7,8 @@
TLANG_NAMESPACE_BEGIN
-static inline int get_environ_config(const std::string &name, int default_value = 0)
-{
+static inline int get_environ_config(const std::string &name,
+ int default_value = 0) {
char *res = std::getenv(name.c_str());
if (res == nullptr)
return default_value;
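
The reformatted helper is just an integer-valued environment lookup with a default; metal::EnvConfig (consulted in program.cpp above) presumably builds on helpers like this one. An equivalent Python sketch of its behavior (TI_DEMO_FLAG is a hypothetical variable name):

    import os

    def get_environ_config(name: str, default_value: int = 0) -> int:
        # Missing variable -> default; otherwise parse the value as an int.
        res = os.environ.get(name)
        return default_value if res is None else int(res)

    os.environ["TI_DEMO_FLAG"] = "1"  # hypothetical, for demonstration
    assert get_environ_config("TI_DEMO_FLAG") == 1
    assert get_environ_config("TI_UNSET_FLAG", 42) == 42
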
diff --git a/taichi/util/logging.cpp b/taichi/util/logging.cpp
index 212cd95518933..35a090baa4e6e 100644
--- a/taichi/util/logging.cpp
+++ b/taichi/util/logging.cpp
@@ -155,7 +155,8 @@ std::string signal_name(int sig) {
bool python_at_exit_called = false;
void signal_handler(int signo) {
- // It seems that there's no way to pass exception to Python in signal handlers?
+ // It seems that there's no way to pass exceptions to Python in signal
+ // handlers?
// @archibate found that in fact there is such a solution:
// https://docs.python.org/3/library/faulthandler.html#module-faulthandler
auto sig_name = signal_name(signo);
diff --git a/tests/python/test_for_group_mismatch.py b/tests/python/test_for_group_mismatch.py
new file mode 100644
index 0000000000000..6c84b44ccc6c2
--- /dev/null
+++ b/tests/python/test_for_group_mismatch.py
@@ -0,0 +1,96 @@
+import taichi as ti
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def test_struct_for_mismatch():
+ x = ti.var(ti.f32, (3, 4))
+
+ @ti.kernel
+ def func():
+ for i in x:
+ print(i)
+
+ func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def test_struct_for_mismatch2():
+ x = ti.var(ti.f32, (3, 4))
+
+ @ti.kernel
+ def func():
+ for i, j, k in x:
+ print(i, j, k)
+
+ func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_grouped_struct_for_mismatch():
+ # doesn't work for now
+ # needs a grouped refactor
+ # for now, it just throws an unfriendly message:
+ # AssertionError: __getitem__ cannot be called in Python-scope
+ x = ti.var(ti.f32, (3, 4))
+
+ @ti.kernel
+ def func():
+ for i, j in ti.grouped(x):
+ print(i, j)
+
+ func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_ndrange_for_mismatch():
+ # doesn't work for now
+ # needs an ndrange refactor
+ @ti.kernel
+ def func():
+ for i in ti.ndrange(3, 4):
+ print(i)
+
+ func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_ndrange_for_mismatch2():
+ # doesn't work for now
+ # needs ndrange and grouped refactors
+ @ti.kernel
+ def func():
+ for i, j, k in ti.ndrange(3, 4):
+ print(i, j, k)
+
+ func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_grouped_ndrange_for_mismatch():
+ # doesn't work for now
+ # needs ndrange and grouped refactors
+ @ti.kernel
+ def func():
+ for i in ti.grouped(ti.ndrange(3, 4)):
+ print(i)
+
+ func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_static_ndrange_for_mismatch():
+ # doesn't work for now
+ # needs ndrange and static refactors
+ @ti.kernel
+ def func():
+ for i in ti.static(ti.ndrange(3, 4)):
+ print(i)
+
+ func()
diff --git a/tests/python/test_return.py b/tests/python/test_return.py
index 71e383ec275a7..9400e6dfa905f 100644
--- a/tests/python/test_return.py
+++ b/tests/python/test_return.py
@@ -16,7 +16,6 @@ def kernel() -> ti.i32:
@ti.must_throw(ti.TaichiSyntaxError)
def test_return_without_type_hint():
-
@ti.kernel
def kernel():
return 1
@@ -25,7 +24,6 @@ def kernel():
def test_const_func_ret():
-
@ti.kernel
def func1() -> ti.f32:
return 3
@@ -40,7 +38,6 @@ def func2() -> ti.i32:
@ti.all_archs
def _test_binary_func_ret(dt1, dt2, dt3, castor):
-
@ti.kernel
def func(a: dt1, b: dt2) -> dt3:
return a * b
@@ -64,4 +61,3 @@ def test_binary_func_ret():
_test_binary_func_ret(ti.f32, ti.i32, ti.f32, float)
_test_binary_func_ret(ti.i32, ti.f32, ti.i32, int)
_test_binary_func_ret(ti.f32, ti.i32, ti.i32, int)
-