diff --git a/.github/workflows/persubmit.yml b/.github/workflows/persubmit.yml index ff7a621ffcd9f..6d3b156368806 100644 --- a/.github/workflows/persubmit.yml +++ b/.github/workflows/persubmit.yml @@ -11,6 +11,7 @@ jobs: matrix: os: [ubuntu-latest] python: [3.6, 3.7, 3.8] + with_cc: [OFF] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -30,7 +31,7 @@ jobs: export TAICHI_REPO_DIR=`pwd` export PATH=$TAICHI_REPO_DIR/taichi-llvm/bin/:$PATH export CXX=clang++ - export CI_SETUP_CMAKE_ARGS=-DTI_WITH_OPENGL:BOOL=OFF + export CI_SETUP_CMAKE_ARGS="-DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{matrix.with_cc}}" # GLFW dependencies: #export CI_SETUP_CMAKE_ARGS=-DTI_WITH_OPENGL:BOOL=ON #sudo apt-get install libx11-dev libxcursor-dev libxi-dev diff --git a/.gitignore b/.gitignore index 50ccd25a77145..6e219c6c33fb2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *.swp *.swo /.vs +/tags +/.*_localrc /Debug *.sdf /x64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 628e779bf23e1..1d838bd3c848e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(taichi) SET(TI_VERSION_MAJOR 0) SET(TI_VERSION_MINOR 6) -SET(TI_VERSION_PATCH 14) +SET(TI_VERSION_PATCH 15) execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} diff --git a/README.md b/README.md index c2eca75c9f678..d2f226ae09b21 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,16 @@ **Taichi** (太极) is a programming language designed for *high-performance computer graphics*. It is deeply embedded in **Python**, and its **just-in-time compiler** offloads compute-intensive tasks to multi-core CPUs and massively parallel GPUs. - + Advanced features of Taichi include [spatially sparse computing](https://taichi.readthedocs.io/en/latest/sparse.html) and [differentiable programming](https://taichi.readthedocs.io/en/latest/differentiable_programming.html) [[examples]](https://github.com/yuanming-hu/difftaichi). ## Gallery - + - + ## Installation [![Downloads](https://pepy.tech/badge/taichi/month)](https://pepy.tech/project/taichi/month) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 95307ea6bcaf1..5634f5e0b7551 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -3,15 +3,27 @@ set(CORE_LIBRARY_NAME taichi_core) option(USE_STDCPP "Use -stdlib=libc++" OFF) option(TI_WITH_CUDA "Build with the CUDA backend" ON) option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) +option(TI_WITH_CC "Build with the C backend" OFF) if (APPLE) if (TI_WITH_CUDA) set(TI_WITH_CUDA OFF) - message(WARNING "CUDA not supported on OS X. Setting TI_WITH_CUDA to OFF.") + message(WARNING "CUDA backend not supported on OS X. Setting TI_WITH_CUDA to OFF.") endif() if (TI_WITH_OPENGL) set(TI_WITH_OPENGL OFF) - message(WARNING "OpenGL not supported on OS X. Setting TI_WITH_OPENGL to OFF.") + message(WARNING "OpenGL backend not supported on OS X. Setting TI_WITH_OPENGL to OFF.") + endif() + if (TI_WITH_CC) + set(TI_WITH_CC OFF) + message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.") + endif() +endif() + +if (WIN32) + if (TI_WITH_CC) + set(TI_WITH_CC OFF) + message(WARNING "C backend not supported on Windows. 
Setting TI_WITH_CC to OFF.") endif() endif() @@ -30,6 +42,7 @@ file(GLOB TAICHI_CPU_SOURCE "taichi/backends/cpu/*.cpp" "taichi/backends/cpu/*.h file(GLOB TAICHI_CUDA_SOURCE "taichi/backends/cuda/*.cpp" "taichi/backends/cuda/*.h") file(GLOB TAICHI_METAL_SOURCE "taichi/backends/metal/*.h" "taichi/backends/metal/*.cpp" "taichi/backends/metal/shaders/*") file(GLOB TAICHI_OPENGL_SOURCE "taichi/backends/opengl/*.h" "taichi/backends/opengl/*.cpp" "taichi/backends/opengl/shaders/*") +file(GLOB TAICHI_CC_SOURCE "taichi/backends/cc/*.h" "taichi/backends/cc/*.cpp") list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_BACKEND_SOURCE}) @@ -56,6 +69,11 @@ if (TI_WITH_OPENGL) list(APPEND TAICHI_CORE_SOURCE ${TAICHI_GLAD_SOURCE}) endif() +if (TI_WITH_CC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_CC") + list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CC_SOURCE}) +endif() + add_library(${CORE_LIBRARY_NAME} SHARED ${TAICHI_CORE_SOURCE} ${PROJECT_SOURCES}) if (APPLE) diff --git a/docs/compilation.rst b/docs/compilation.rst index 00496f90e26fa..88aa3f6d82236 100644 --- a/docs/compilation.rst +++ b/docs/compilation.rst @@ -1,20 +1,20 @@ .. _compilation: -The life of a Taichi kernel +Life of a Taichi kernel =============================================== Sometimes it is helpful to understand the life cycle of a Taichi kernel. In short, compilation will only happen on the first invocation of an instance of a kernel. -Life cycle of a Taichi kernel looks like this: +The life cycle of a Taichi kernel has the following stages: - Kernel registration - Template instantiation and caching - Python AST transforms - - Taichi IR compilation, optimization, and binary generation + - Taichi IR compilation, optimization, and executable generation - Launching -.. image:: life_of_kernel_lowres.jpg +.. image:: https://raw.githubusercontent.com/taichi-dev/public_files/6bd234694270c83baf97ba32e0c6278b8cf37e6e/taichi/life_of_kernel.jpg Let's consider the following simple kernel: @@ -26,7 +26,7 @@ Let's consider the following simple kernel: tensor[i] += delta -We also allocate two 1D tensors to simplify discussion: +We allocate two 1D tensors to simplify discussion: .. code-block:: python @@ -50,7 +50,7 @@ Template instantiation and caching When ``add`` is called for the first time, the Taichi frontend compiler will instantiate the kernel. -When you have a second call with the same template signature (explained later), e.g., +When you have a second call with the same **template signature** (explained later), e.g., .. code-block:: python @@ -69,12 +69,12 @@ will lead to a new instantiation of **add**. .. note:: **Template signatures** are what distinguish different instantiations of a kernel template. The signature of ``add(x, 42)`` is ``(x, ti.i32)``, which is the same as that of ``add(x, 1)``. Therefore, the latter can reuse the previously compiled binary. - The signature of ``add(y, 42)`` is ``(y, ti.i32)``, a different value from the previous signature, therefore a new instantiation and compilation will happen. + The signature of ``add(y, 42)`` is ``(y, ti.i32)``, a different value from the previous signature, hence a new kernel will be instantiated and compiled. .. note:: - Many basic operations in the Taichi standard library is implemented using Taichi kernels for performance, - with more or less metaprogramming tricks. Invoking them will incur **implicit kernel instantiations** + Many basic operations in the Taichi standard library are implemented using Taichi kernels + using metaprogramming tricks. 
Invoking them will incur **implicit kernel instantiations**. Examples include ``x.to_numpy()`` and ``y.from_torch(torch_tensor)``. When you invoke these functions, you will see kernel instantiations, as Taichi kernels will be generated to offload the hard work to multiple CPU cores/GPUs. @@ -84,7 +84,7 @@ will lead to a new instantiation of **add**. Code transformation and optimizations --------------------------------------- -When a new instantiation happens, the Taichi frontend compiler will transform the kernel body AST +When a new instantiation happens, the Taichi frontend compiler (i.e., the ``ASTTransformer`` Python class) will transform the kernel body AST into a Python script, which, when executed, emits a Taichi frontend AST. Basically, some patches are applied to the Python AST so that the Taichi frontend can recognize it. @@ -103,9 +103,10 @@ which allows a series of further IR passes to happen, such as The just-in-time (JIT) compilation engine ----------------------------------------- -Finally, the optimized SSA IR is fed into the LLVM IR codegen, and LLVM JIT generates high-performance executable CPU/GPU programs. +Finally, the optimized SSA IR is fed into backend compilers such as LLVM or Apple Metal/OpenGL shader compilers. +The backend compilers then generate high-performance executable CPU/GPU programs. Kernel launching ---------------- -Taichi kernels will be ultimately launched as multi-threaded CPU tasks or CUDA kernels. +Taichi kernels will be ultimately launched as multi-threaded CPU tasks or GPU kernels. diff --git a/docs/version b/docs/version index fcbaa8478100e..6769f67e2955a 100644 --- a/docs/version +++ b/docs/version @@ -1 +1 @@ -0.6.14 +0.6.15 diff --git a/examples/export_videos.py b/examples/export_videos.py index b6ddeeef6b50c..50d0738a4a1f3 100644 --- a/examples/export_videos.py +++ b/examples/export_videos.py @@ -13,8 +13,8 @@ def paint(): result_dir = "./results" video_manager = ti.VideoManager(output_dir=result_dir, - framerate=24, - automatic_build=False) + framerate=24, + automatic_build=False) for i in range(50): paint() diff --git a/misc/listgen_demo.py b/misc/listgen_demo.py index 5ae7213d05e8d..2d7459a8497c1 100644 --- a/misc/listgen_demo.py +++ b/misc/listgen_demo.py @@ -5,9 +5,11 @@ x = ti.var(ti.i32) ti.root.dense(ti.i, 4).bitmasked(ti.i, 4).place(x) + @ti.kernel def func(): for i in x: print(i) + func() diff --git a/misc/prtags.json b/misc/prtags.json index fe9b33ddcd70a..6ed8dd0a96e72 100644 --- a/misc/prtags.json +++ b/misc/prtags.json @@ -28,5 +28,6 @@ "windows" : "Windows", "perf" : "Performance improvements", "ipython" : "IPython and other shells", + "cc" : "C source backend", "release" : "Release" } diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py index 8cd8c17b36f63..c6f9e7c3ac03d 100644 --- a/python/taichi/__init__.py +++ b/python/taichi/__init__.py @@ -7,7 +7,7 @@ from taichi.misc import * from taichi.misc.gui import GUI from taichi.misc.np2ply import PLYWriter -from taichi.misc.image import * +from taichi.misc.image import imread, imwrite, imshow, imdisplay from taichi.misc.task import Task from taichi.misc.test import * from taichi.misc import settings as settings diff --git a/python/taichi/core/util.py b/python/taichi/core/util.py index 9d69334eb2a75..79ddc6531d14e 100644 --- a/python/taichi/core/util.py +++ b/python/taichi/core/util.py @@ -30,7 +30,7 @@ def import_ti_core(tmp_dir=None): global ti_core if get_os_name() != 'win': old_flags = sys.getdlopenflags() - sys.setdlopenflags(2) # 2 = RTLD_NOW + 
sys.setdlopenflags(2 | 8) # RTLD_NOW | RTLD_DEEPBIND else: pyddir = os.path.join(package_root(), 'lib') os.environ['PATH'] += ';' + pyddir diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 156756b64f717..a057be0f2a534 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -38,6 +38,7 @@ cuda = core.cuda metal = core.metal opengl = core.opengl +cc = core.cc gpu = [cuda, metal, opengl] cpu = core.host_arch() kernel_profiler_print = lambda: core.get_current_program( @@ -292,6 +293,8 @@ def is_arch_supported(arch): return core.with_metal() elif arch == opengl: return core.with_opengl() + elif arch == cc: + return core.with_cc() elif arch == cpu: return True else: @@ -299,7 +302,7 @@ def is_arch_supported(arch): def supported_archs(): - archs = [cpu, cuda, metal, opengl] + archs = [cpu, cuda, metal, opengl, cc] wanted_archs = os.environ.get('TI_WANTED_ARCHS', '') want_exclude = wanted_archs.startswith('^') diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 39babe2f9a74e..5bceab2ee4637 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -59,6 +59,16 @@ def expr_init_func(rhs): # temporary solution to allow passing in tensors as return expr_init(rhs) +def begin_frontend_struct_for(group, loop_range): + if group.size() != len(loop_range.shape): + raise IndexError( + 'Number of struct-for indices does not match loop variable dimensionality ' + f'({group.size()} != {len(loop_range.shape)}). Maybe you wanted to ' + 'use "for I in ti.grouped(x)" to group all indices into a single vector I?' + ) + taichi_lang_core.begin_frontend_struct_for(group, loop_range.ptr) + + def wrap_scalar(x): if type(x) in [int, float]: return Expr(x) diff --git a/python/taichi/lang/transformer.py b/python/taichi/lang/transformer.py index b8b62224f60b6..3f1b59d3afa88 100644 --- a/python/taichi/lang/transformer.py +++ b/python/taichi/lang/transformer.py @@ -437,7 +437,7 @@ def visit_struct_for(self, node, is_grouped): ___loop_var = 0 {} = ti.make_var_vector(size=len(___loop_var.loop_range().shape)) ___expr_group = ti.make_expr_group({}) - ti.core.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range().ptr) + ti.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range()) ti.core.end_frontend_range_for() '''.format(vars, vars) t = ast.parse(template).body[0] @@ -450,7 +450,7 @@ def visit_struct_for(self, node, is_grouped): {} ___loop_var = 0 ___expr_group = ti.make_expr_group({}) - ti.core.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range().ptr) + ti.begin_frontend_struct_for(___expr_group, ___loop_var.loop_range()) ti.core.end_frontend_range_for() '''.format(var_decl, vars) t = ast.parse(template).body[0] diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py index 809a49ba9202a..0783f68ab2de9 100644 --- a/python/taichi/misc/gui.py +++ b/python/taichi/misc/gui.py @@ -85,8 +85,8 @@ def set_image(self, img): self.img = self.cook_image(img.to_numpy()) else: # Type matched! We can use an optimized copy kernel. - assert img.shape( - ) == self.res, "Image resolution does not match GUI resolution" + assert img.shape \ + == self.res, "Image resolution does not match GUI resolution" from taichi.lang.meta import tensor_to_image tensor_to_image(img, self.img) ti.sync() @@ -96,8 +96,8 @@ def set_image(self, img): self.img = self.cook_image(img.to_numpy()) else: # Type matched! We can use an optimized copy kernel. 
- assert img.shape( - ) == self.res, "Image resolution does not match GUI resolution" + assert img.shape \ + == self.res, "Image resolution does not match GUI resolution" assert img.n in [ 3, 4 ], "Only greyscale, RGB or RGBA images are supported in GUI.set_image" diff --git a/python/taichi/misc/image.py b/python/taichi/misc/image.py index b3ec797cb557b..7e2594e7fa1fc 100644 --- a/python/taichi/misc/image.py +++ b/python/taichi/misc/image.py @@ -2,7 +2,12 @@ import taichi as ti -def imwrite(img, filename): +def cook_image_to_bytes(img): + """ + Takes a NumPy array or Taichi tensor of any type. + Returns a NumPy array of uint8. + This is used by ti.imwrite and ti.imdisplay. + """ if not isinstance(img, np.ndarray): img = img.to_numpy() @@ -16,19 +21,47 @@ def imwrite(img, filename): assert len(img.shape) in [2, 3], "Image must be either RGB/RGBA or greyscale" - resx, resy = img.shape[:2] if len(img.shape) == 2: - comp = 1 + img = img.reshape(*img.shape, 1) + + assert img.shape[2] in [1, 3, + 4], "Image must be either RGB/RGBA or greyscale" + + return img.swapaxes(0, 1)[::-1, :] + + +def imdisplay(img): + """ + Try to display image in interactive shell. + """ + if ti.lang.shell.oinspect.name == ti.lang.shell.ShellType.JUPYTER: + import PIL.Image + from io import BytesIO + import IPython.display + import numpy as np + img = cook_image_to_bytes(img) + with BytesIO() as f: + PIL.Image.fromarray(img).save(f, 'png') + IPython.display.display(IPython.display.Image(data=f.getvalue())) else: - comp = img.shape[2] - assert comp in [1, 3, 4], "Image must be either RGB/RGBA or greyscale" + ti.imshow(img) + - img = np.ascontiguousarray(img.swapaxes(0, 1)[::-1, :]) +def imwrite(img, filename): + """ + Save image to a specific file. + """ + img = cook_image_to_bytes(img) + img = np.ascontiguousarray(img) ptr = img.ctypes.data + resy, resx, comp = img.shape ti.core.imwrite(filename, ptr, resx, resy, comp) def imread(filename, channels=0): + """ + Load image from a specific file. + """ ptr, resx, resy, comp = ti.core.imread(filename, channels) img = np.ndarray(shape=(resy, resx, comp), dtype=np.uint8) img = np.ascontiguousarray(img) @@ -39,6 +72,9 @@ def imread(filename, channels=0): def imshow(img, window_name='Taichi'): + """ + Show image in a Taichi GUI. 
+ """ if not isinstance(img, np.ndarray): img = img.to_numpy() assert len(img.shape) in [2, diff --git a/taichi/analysis/build_cfg.cpp b/taichi/analysis/build_cfg.cpp index c5f314b4023ea..b876356a09095 100644 --- a/taichi/analysis/build_cfg.cpp +++ b/taichi/analysis/build_cfg.cpp @@ -15,7 +15,8 @@ class CFGBuilder : public IRVisitor { int current_stmt_id; int begin_location; std::vector prev_nodes; - bool in_offloaded_for; + OffloadedStmt *current_offload; + bool in_parallel_for; public: CFGBuilder() @@ -23,7 +24,8 @@ class CFGBuilder : public IRVisitor { last_node_in_current_block(nullptr), current_stmt_id(-1), begin_location(-1), - in_offloaded_for(false) { + current_offload(nullptr), + in_parallel_for(false) { allow_undefined_visitor = true; invoke_default_visitor = true; graph = std::make_unique(); @@ -40,7 +42,7 @@ class CFGBuilder : public IRVisitor { CFGNode *new_node(int next_begin_location) { auto node = graph->push_back(current_block, begin_location, current_stmt_id, - in_offloaded_for, last_node_in_current_block); + in_parallel_for, last_node_in_current_block); for (auto &prev_node : prev_nodes) { CFGNode::add_edge(prev_node, node); } @@ -125,14 +127,23 @@ class CFGBuilder : public IRVisitor { } void visit(RangeForStmt *stmt) override { + auto old_in_parallel_for = in_parallel_for; + if (!current_offload) + in_parallel_for = true; visit_loop(stmt->body.get(), new_node(-1), false); + in_parallel_for = old_in_parallel_for; } void visit(StructForStmt *stmt) override { + auto old_in_parallel_for = in_parallel_for; + if (!current_offload) + in_parallel_for = true; visit_loop(stmt->body.get(), new_node(-1), false); + in_parallel_for = old_in_parallel_for; } void visit(OffloadedStmt *stmt) override { + current_offload = stmt; if (stmt->prologue) { auto before_offload = new_node(-1); int offload_stmt_id = current_stmt_id; @@ -149,10 +160,10 @@ class CFGBuilder : public IRVisitor { auto block_begin_index = graph->size(); if (stmt->task_type == OffloadedStmt::TaskType::range_for || stmt->task_type == OffloadedStmt::TaskType::struct_for) { - in_offloaded_for = true; + in_parallel_for = true; } stmt->body->accept(this); - in_offloaded_for = false; + in_parallel_for = false; prev_nodes.push_back(graph->back()); // Container statements don't belong to any CFGNodes. 
begin_location = offload_stmt_id + 1; @@ -168,6 +179,7 @@ class CFGBuilder : public IRVisitor { begin_location = offload_stmt_id + 1; CFGNode::add_edge(before_offload, graph->nodes[block_begin_index].get()); } + current_offload = nullptr; } void visit(Block *block) override { diff --git a/taichi/backends/cc/cc_configuation.h b/taichi/backends/cc/cc_configuation.h new file mode 100644 index 0000000000000..6306f36b58866 --- /dev/null +++ b/taichi/backends/cc/cc_configuation.h @@ -0,0 +1,20 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +struct CCConfiguation { + std::string compile_cmd; + + CCConfiguation() : compile_cmd("gcc -shared -fPIC -o '{}' '{}'") { + } +}; + +extern CCConfiguation cfg; + +bool is_c_backend_available(); + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_kernel.h b/taichi/backends/cc/cc_kernel.h new file mode 100644 index 0000000000000..390887808607e --- /dev/null +++ b/taichi/backends/cc/cc_kernel.h @@ -0,0 +1,27 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCProgram; + +class CCKernel { + public: + CCKernel(std::string const &source, std::string const &name) + : source(source), name(name) { + } + + void compile(); + void launch(CCProgram *program, Context *ctx); + + std::string source; + std::string name; + + std::string src_path; + std::string bin_path; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_layout.h b/taichi/backends/cc/cc_layout.h new file mode 100644 index 0000000000000..0d3081c0645e6 --- /dev/null +++ b/taichi/backends/cc/cc_layout.h @@ -0,0 +1,17 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCLayout { + public: + CCLayout() { + } + + std::string source; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_program.cpp b/taichi/backends/cc/cc_program.cpp new file mode 100644 index 0000000000000..234990d78dcc4 --- /dev/null +++ b/taichi/backends/cc/cc_program.cpp @@ -0,0 +1,50 @@ +#include "taichi/common/core.h" +#include "taichi/system/dynamic_loader.h" +#include "cc_program.h" +#include "cc_configuation.h" +#include "cc_kernel.h" +#include "cc_layout.h" +#include "cc_utils.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +CCConfiguation cfg; + +void CCKernel::compile() { + bin_path = fmt::format("{}/{}.so", runtime_tmp_dir, name); + src_path = fmt::format("{}/{}.c", runtime_tmp_dir, name); + + std::ofstream(src_path) << source; + TI_INFO("[cc] compiling kernel [{}]:\n{}\n", name, source); + execute(cfg.compile_cmd, bin_path, src_path); +} + +void CCKernel::launch(CCProgram *launcher, Context *ctx) { + using FuncEntryType = void(); + DynamicLoader dll(bin_path); + TI_ASSERT_INFO(dll.loaded(), "[cc] could not load shared object: {}", + bin_path); + auto main = + reinterpret_cast(dll.load_function(get_sym_name(name))); + TI_INFO("[cc] entering kernel [{}]", name); + (*main)(); + TI_INFO("[cc] leaving kernel [{}]", name); +} + +void CCProgram::launch(CCKernel *kernel, Context *ctx) { + kernel->launch(this, ctx); +} + +CCProgram::CCProgram() { +} + +CCProgram::~CCProgram() { +} + +bool is_c_backend_available() { + return true; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_program.h b/taichi/backends/cc/cc_program.h new file mode 100644 index 0000000000000..fc2888c30fb08 --- /dev/null +++ b/taichi/backends/cc/cc_program.h @@ -0,0 +1,24 @@ +#pragma once + +#include 
"taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCKernel; +class CCLayout; + +class CCProgram { + // Launch C compiler to compile generated source code, and run them + public: + CCProgram(); + ~CCProgram(); + + void launch(CCKernel *kernel, Context *ctx); + + std::vector> kernels; + std::unique_ptr layout; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/cc_utils.h b/taichi/backends/cc/cc_utils.h new file mode 100644 index 0000000000000..0384850685430 --- /dev/null +++ b/taichi/backends/cc/cc_utils.h @@ -0,0 +1,66 @@ +#pragma once + +#include "taichi/lang_util.h" +#include "taichi/common/core.h" +#include +#include +#include +#include + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +inline std::string c_quoted(std::string const &str) { + // https://zh.cppreference.com/w/cpp/language/escape + std::stringstream ss; + ss << '"'; + for (auto const &c : str) { + switch (c) { +#define REG_ESC(x, y) \ + case x: \ + ss << "\\" y; \ + break; + REG_ESC('\n', "n"); + REG_ESC('\a', "a"); + REG_ESC('\b', "b"); + REG_ESC('\?', "?"); + REG_ESC('\v', "v"); + REG_ESC('\t', "t"); + REG_ESC('\f', "f"); + REG_ESC('\'', "'"); + REG_ESC('\"', "\""); + REG_ESC('\\', "\\"); + default: + ss << c; + } + } + ss << '"'; + return ss.str(); +} + +inline std::string cc_data_type_name(DataType dt) { + switch (dt) { + case DataType::i32: + return "int"; + case DataType::f32: + return "float"; + default: + TI_NOT_IMPLEMENTED + } +} + +inline std::string get_sym_name(std::string const &name) { + return fmt::format("Ti_{}", name); +} + +template +inline int execute(std::string fmt, Args &&... args) { + auto cmd = fmt::format(fmt, std::forward(args)...); + TI_INFO("Executing command: {}", cmd); + int ret = std::system(cmd.c_str()); + TI_INFO("Command exit status: {}", ret); + return ret; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/codegen_cc.cpp b/taichi/backends/cc/codegen_cc.cpp new file mode 100644 index 0000000000000..1979fab3084ea --- /dev/null +++ b/taichi/backends/cc/codegen_cc.cpp @@ -0,0 +1,148 @@ +#include "codegen_cc.h" +#include "cc_kernel.h" +#include "cc_layout.h" +#include "taichi/ir/ir.h" +#include "taichi/ir/transforms.h" +#include "taichi/util/line_appender.h" +#include "cc_utils.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { // Codegen for C Compiler Processor + +class CCTransformer : public IRVisitor { + private: + [[maybe_unused]] Kernel *kernel; + [[maybe_unused]] CCLayout *layout; + + LineAppender line_appender; + LineAppender line_appender_header; + bool is_top_level{true}; + + public: + CCTransformer(Kernel *kernel, CCLayout *layout) + : kernel(kernel), layout(layout) { + allow_undefined_visitor = true; + invoke_default_visitor = true; + } + + void run() { + this->lower_ast(); + emit_header("#include "); + kernel->ir->accept(this); + } + + void lower_ast() { + auto ir = kernel->ir.get(); + auto config = kernel->program.config; + config.demote_dense_struct_fors = true; + irpass::compile_to_offloads(ir, config, + /*vectorize=*/false, kernel->grad, + /*ad_use_stack=*/false, config.print_ir, + /*lower_global_access*/ true); + } + + std::string get_source() { + return line_appender_header.lines() + "\n" + line_appender.lines(); + } + + private: + void visit(Block *stmt) override { + if (!is_top_level) + line_appender.push_indent(); + for (auto &s : stmt->statements) { + s->accept(this); + } + if (!is_top_level) + line_appender.pop_indent(); + } + + void visit(Stmt *stmt) override { + TI_WARN("[cc] unsupported statement 
type {}", typeid(*stmt).name()); + } + + void visit(ConstStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + emit("{} {} = {};", cc_data_type_name(stmt->element_type()), + stmt->raw_name(), stmt->val[0].stringify()); + } + + void visit(PrintStmt *stmt) override { + std::string format; + std::vector values; + + for (int i = 0; i < stmt->contents.size(); i++) { + auto const &content = stmt->contents[i]; + + if (std::holds_alternative(content)) { + auto arg_stmt = std::get(content); + format += data_type_format(arg_stmt->ret_type.data_type); + values.push_back(arg_stmt->raw_name()); + + } else { + auto str = std::get(content); + format += "%s"; + values.push_back(c_quoted(str)); + } + } + + values.insert(values.begin(), c_quoted(format)); + emit("printf({});", fmt::join(values, ", ")); + } + + void generate_serial_kernel(OffloadedStmt *stmt) { + emit_header("void {}(void);", get_sym_name(kernel->name)); + emit("void {}(void) {{", get_sym_name(kernel->name)); + { + ScopedIndent _s(line_appender); + stmt->body->accept(this); + } + emit("}}"); + } + + void visit(OffloadedStmt *stmt) override { + TI_ASSERT(is_top_level); + is_top_level = false; + if (stmt->task_type == OffloadedStmt::TaskType::serial) { + generate_serial_kernel(stmt); + } else { + TI_ERROR("[glsl] Unsupported offload type={} on C backend", + stmt->task_name()); + } + is_top_level = true; + } + + template + void emit(std::string f, Args &&... args) { + line_appender.append(std::move(f), std::move(args)...); + } + + template + void emit_header(std::string f, Args &&... args) { + line_appender_header.append(std::move(f), std::move(args)...); + } +}; + +std::unique_ptr CCKernelGen::compile() { + auto layout = kernel->program.cc_program->layout.get(); + CCTransformer tran(kernel, layout); + + tran.run(); + auto source = tran.get_source(); + auto ker = std::make_unique(source, kernel->name); + ker->compile(); + return ker; +} + +FunctionType compile_kernel(Kernel *kernel) { + CCKernelGen codegen(kernel); + auto compiled = codegen.compile(); + auto compiled_ptr = compiled.get(); + auto program = kernel->program.cc_program.get(); + program->kernels.push_back(std::move(compiled)); + return [program, compiled_ptr](Context &ctx) { + return program->launch(compiled_ptr, &ctx); + }; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/codegen_cc.h b/taichi/backends/cc/codegen_cc.h new file mode 100644 index 0000000000000..de029e4f11e82 --- /dev/null +++ b/taichi/backends/cc/codegen_cc.h @@ -0,0 +1,26 @@ +#pragma once + +#include "taichi/lang_util.h" +#include "taichi/codegen/codegen.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCKernel; + +class CCKernelGen { + // Generate corresponding C Source Code for a Taichi Kernel + public: + CCKernelGen(Kernel *kernel) : kernel(kernel) { + } + + std::unique_ptr compile(); + + private: + Kernel *kernel; +}; + +FunctionType compile_kernel(Kernel *kernel); + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/struct_cc.cpp b/taichi/backends/cc/struct_cc.cpp new file mode 100644 index 0000000000000..252152d623d38 --- /dev/null +++ b/taichi/backends/cc/struct_cc.cpp @@ -0,0 +1,14 @@ +#include "struct_cc.h" +#include "cc_layout.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +std::unique_ptr CCLayoutGen::compile() { + auto lay = std::make_unique(); + // W.I.P. 
+ return lay; +} + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/cc/struct_cc.h b/taichi/backends/cc/struct_cc.h new file mode 100644 index 0000000000000..2b73b328052f9 --- /dev/null +++ b/taichi/backends/cc/struct_cc.h @@ -0,0 +1,24 @@ +#pragma once + +#include "taichi/lang_util.h" +#include "taichi/ir/snode.h" + +TLANG_NAMESPACE_BEGIN +namespace cccp { + +class CCLayout; + +class CCLayoutGen { + // Generate corresponding C Source Code for Taichi Structures + public: + CCLayoutGen(SNode *root) : root(root) { + } + + std::unique_ptr compile(); + + private: + SNode *root; +}; + +} // namespace cccp +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/api.cpp b/taichi/backends/metal/api.cpp index ebfdde95ceb3c..6e8314310241a 100644 --- a/taichi/backends/metal/api.cpp +++ b/taichi/backends/metal/api.cpp @@ -1,4 +1,6 @@ #include "taichi/backends/metal/api.h" + +#include "taichi/backends/metal/constants.h" #include "taichi/util/environ_config.h" TLANG_NAMESPACE_BEGIN @@ -53,15 +55,18 @@ nsobj_unique_ptr new_compute_command_encoder( return wrap_as_nsobj_unique_ptr(encoder); } -nsobj_unique_ptr new_library_with_source( - MTLDevice *device, - const std::string &source) { +nsobj_unique_ptr new_library_with_source(MTLDevice *device, + const std::string &source, + int msl_version) { auto source_str = mac::wrap_string_as_ns_string(source); id options = clscall("MTLCompileOptions", "alloc"); options = call(options, "init"); auto options_cleanup = wrap_as_nsobj_unique_ptr(options); call(options, "setFastMathEnabled:", false); + if (msl_version != kMslVersionNone) { + call(options, "setLanguageVersion:", msl_version); + } auto *lib = cast_call( device, "newLibraryWithSource:options:error:", source_str.get(), options, diff --git a/taichi/backends/metal/api.h b/taichi/backends/metal/api.h index b905d70693176..7cd307eebf3c8 100644 --- a/taichi/backends/metal/api.h +++ b/taichi/backends/metal/api.h @@ -3,12 +3,12 @@ // Reference implementation: // https://github.com/halide/Halide/blob/master/src/runtime/metal.cpp +#include + #include "taichi/common/trait.h" #include "taichi/lang_util.h" #include "taichi/platform/mac/objc_api.h" -#include - TLANG_NAMESPACE_BEGIN namespace metal { @@ -42,8 +42,11 @@ nsobj_unique_ptr new_command_buffer(MTLCommandQueue *queue); nsobj_unique_ptr new_compute_command_encoder( MTLCommandBuffer *buffer); +// msl_version: Metal Shading Language version. 0 means not set.
+// See https://developer.apple.com/documentation/metal/mtllanguageversion nsobj_unique_ptr new_library_with_source(MTLDevice *device, - const std::string &source); + const std::string &source, + int msl_version); nsobj_unique_ptr new_function_with_name(MTLLibrary *library, const std::string &name); diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index a649a14bc7983..8a8a776b925ff 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ -8,6 +8,8 @@ #include "taichi/ir/ir.h" #include "taichi/ir/transforms.h" #include "taichi/util/line_appender.h" +#include "taichi/math/arithmetic.h" +#include "taichi/backends/metal/api.h" TLANG_NAMESPACE_BEGIN namespace metal { @@ -30,6 +32,7 @@ using BuffersEnum = KernelAttributes::Buffers; constexpr char kKernelThreadIdName[] = "utid_"; // 'u' for unsigned constexpr char kKernelGridSizeName[] = "ugrid_size_"; // 'u' for unsigned +constexpr char kKernelTidInSimdgroupName[] = "utid_in_simdg_"; constexpr char kRootBufferName[] = "root_addr"; constexpr char kGlobalTmpsBufferName[] = "global_tmps_addr"; constexpr char kContextBufferName[] = "ctx_addr"; @@ -43,6 +46,7 @@ constexpr char kListgenElemVarName[] = "listgen_elem_"; constexpr char kRandStateVarName[] = "rand_state_"; constexpr char kSNodeMetaVarName[] = "sn_meta_"; constexpr char kMemAllocVarName[] = "mem_alloc_"; +constexpr char kTlsBufferName[] = "tls_buffer_"; std::string buffer_to_name(BuffersEnum b) { switch (b) { @@ -79,24 +83,22 @@ class KernelCodegen : public IRVisitor { Section::Kernels, }; - struct UsedFeatures { - bool runtime_list_ops = false; - }; - public: // TODO(k-ye): Create a Params to hold these ctor params. KernelCodegen(const std::string &mtl_kernel_prefix, const std::string &root_snode_type_name, Kernel *kernel, const CompiledStructs *compiled_structs, - PrintStringTable *print_strtab) + PrintStringTable *print_strtab, + const CodeGen::Config &config) : mtl_kernel_prefix_(mtl_kernel_prefix), root_snode_type_name_(root_snode_type_name), kernel_(kernel), compiled_structs_(compiled_structs), needs_root_buffer_(compiled_structs_->root_size > 0), ctx_attribs_(*kernel_), - print_strtab_(print_strtab) { + print_strtab_(print_strtab), + cgen_config_(config) { // allow_undefined_visitor = true; for (const auto s : kAllSections) { section_appenders_[s] = LineAppender(); @@ -107,8 +109,8 @@ class KernelCodegen : public IRVisitor { return ctx_attribs_; } - const std::vector &kernels_attribs() const { - return mtl_kernels_attribs_; + const TaichiKernelAttributes &ti_kernels_attribs() const { + return ti_kernel_attribus_; } std::string run() { @@ -340,6 +342,13 @@ class KernelCodegen : public IRVisitor { stmt->raw_name(), dt, kGlobalTmpsBufferName, stmt->offset); } + void visit(ThreadLocalPtrStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + emit("thread auto* {} = reinterpret_cast({} + {});", + stmt->raw_name(), metal_data_type_name(stmt->element_type()), + kTlsBufferName, stmt->offset); + } + void visit(LoopIndexStmt *stmt) override { const auto stmt_name = stmt->raw_name(); if (stmt->loop->is()) { @@ -451,24 +460,36 @@ class KernelCodegen : public IRVisitor { TI_NOT_IMPLEMENTED; } + std::string val_var = stmt->val->raw_name(); + // TODO(k-ye): This is not a very reliable way to detect if we're in TLS + // xlogues... 
+ const bool is_tls_reduction = + (inside_tls_epilogue_ && (op_type == AtomicOpType::add)); + const bool use_simd_in_tls_reduction = + (is_tls_reduction && cgen_config_.allow_simdgroup); + if (use_simd_in_tls_reduction) { + val_var += "_simd_val_"; + emit("const auto {} = simd_sum({});", val_var, stmt->val->raw_name()); + emit("if ({} == 0) {{", kKernelTidInSimdgroupName); + current_appender().push_indent(); + } + if (dt == DataType::i32) { emit( "const auto {} = atomic_fetch_{}_explicit((device atomic_int*){}, " "{}, " "metal::memory_order_relaxed);", - stmt->raw_name(), op_name, stmt->dest->raw_name(), - stmt->val->raw_name()); + stmt->raw_name(), op_name, stmt->dest->raw_name(), val_var); } else if (dt == DataType::u32) { emit( "const auto {} = atomic_fetch_{}_explicit((device atomic_uint*){}, " "{}, " "metal::memory_order_relaxed);", - stmt->raw_name(), op_name, stmt->dest->raw_name(), - stmt->val->raw_name()); + stmt->raw_name(), op_name, stmt->dest->raw_name(), val_var); } else if (dt == DataType::f32) { if (handle_float) { emit("const float {} = fatomic_fetch_{}({}, {});", stmt->raw_name(), - op_name, stmt->dest->raw_name(), stmt->val->raw_name()); + op_name, stmt->dest->raw_name(), val_var); } else { TI_ERROR("Metal does not support atomic {} for floating points", op_name); @@ -476,6 +497,11 @@ class KernelCodegen : public IRVisitor { } else { TI_ERROR("Metal only supports 32-bit atomic data types"); } + + if (use_simd_in_tls_reduction) { + current_appender().pop_indent(); + emit("}}"); // closes `if (kKernelTidInSimdgroupName == 0) {` + } } void visit(IfStmt *if_stmt) override { @@ -561,7 +587,8 @@ class KernelCodegen : public IRVisitor { } void visit(PrintStmt *stmt) override { - mark_print_used(); + used_features()->print = true; + const auto &contents = stmt->contents; const int num_entries = contents.size(); const std::string msgbuf_var_name = stmt->raw_name() + "_msgbuf_"; @@ -654,6 +681,7 @@ class KernelCodegen : public IRVisitor { void emit_headers() { SectionGuard sg(this, Section::Headers); emit("#include "); + emit("#include "); emit("using namespace metal;"); } @@ -722,7 +750,7 @@ class KernelCodegen : public IRVisitor { SectionGuard sg(this, Section::Kernels); kernel_->ir->accept(this); - if (used_features_.runtime_list_ops) { + if (used_features()->sparse) { emit(""); current_appender().append_raw(shaders::kMetalRuntimeKernelsSourceCode); } @@ -768,7 +796,7 @@ class KernelCodegen : public IRVisitor { emit("}}\n"); current_kernel_attribs_ = nullptr; - mtl_kernels_attribs_.push_back(ka); + mtl_kernels_attribs()->push_back(ka); } void generate_range_for_kernel(OffloadedStmt *stmt) { @@ -779,9 +807,16 @@ class KernelCodegen : public IRVisitor { ka.task_type = stmt->task_type; ka.buffers = get_common_buffers(); - emit_mtl_kernel_sig(mtl_kernel_name, ka.buffers); + const bool used_tls = (stmt->prologue != nullptr); + KernelSigExtensions kernel_exts; + kernel_exts.use_simdgroup = (used_tls && cgen_config_.allow_simdgroup); + used_features()->simdgroup = + used_features()->simdgroup || kernel_exts.use_simdgroup; - auto &range_for_attribs = ka.range_for_attribs; + emit_mtl_kernel_sig(mtl_kernel_name, ka.buffers, kernel_exts); + + ka.range_for_attribs = KernelAttributes::RangeForAttributes(); + auto &range_for_attribs = ka.range_for_attribs.value(); range_for_attribs.const_begin = stmt->const_begin; range_for_attribs.const_end = stmt->const_end; range_for_attribs.begin = @@ -820,23 +855,51 @@ class KernelCodegen : public IRVisitor { emit("const int begin_ = {} + {};", 
kKernelThreadIdName, begin_expr); // end_ = total_elems + begin_expr emit("const int end_ = {} + {};", total_elems_name, begin_expr); + + if (used_tls) { + // Using |int32_t| because it aligns to 4bytes. + emit("// TLS prologue"); + const std::string tls_bufi32_name = "tls_bufi32_"; + emit("int32_t {}[{}];", tls_bufi32_name, (stmt->tls_size + 3) / 4); + emit("thread char* {} = reinterpret_cast({});", + kTlsBufferName, tls_bufi32_name); + stmt->prologue->accept(this); + } + emit("for (int ii = begin_; ii < end_; ii += {}) {{", kKernelGridSizeName); { ScopedIndent s2(current_appender()); current_kernel_attribs_ = &ka; const auto mtl_func_name = mtl_kernel_func_name(mtl_kernel_name); - emit_mtl_kernel_func_def(mtl_func_name, ka.buffers, stmt->body.get()); - emit_call_mtl_kernel_func(mtl_func_name, ka.buffers, + std::vector extra_func_params; + std::vector extra_args; + if (used_tls) { + extra_func_params.push_back({"thread char*", kTlsBufferName}); + extra_args.push_back(kTlsBufferName); + } + emit_mtl_kernel_func_def(mtl_func_name, ka.buffers, extra_func_params, + stmt->body.get()); + emit_call_mtl_kernel_func(mtl_func_name, ka.buffers, extra_args, /*loop_index_expr=*/"ii"); } emit("}}"); // closes for loop + + if (used_tls) { + TI_ASSERT(stmt->epilogue != nullptr); + inside_tls_epilogue_ = true; + emit("{{ // TLS epilogue"); + stmt->epilogue->accept(this); + inside_tls_epilogue_ = false; + emit("}}"); + } + current_appender().pop_indent(); // Close kernel emit("}}\n"); current_kernel_attribs_ = nullptr; - mtl_kernels_attribs_.push_back(ka); + mtl_kernels_attribs()->push_back(ka); } void generate_struct_for_kernel(OffloadedStmt *stmt) { @@ -919,7 +982,7 @@ class KernelCodegen : public IRVisitor { current_appender().pop_indent(); emit("}}\n"); // closes kernel - mtl_kernels_attribs_.push_back(ka); + mtl_kernels_attribs()->push_back(ka); } void add_runtime_list_op_kernel(OffloadedStmt *stmt, @@ -944,11 +1007,13 @@ class KernelCodegen : public IRVisitor { } else { TI_ERROR("Unsupported offload task type {}", stmt->task_name()); } - ka.runtime_list_op_attribs.snode = sn; + + ka.runtime_list_op_attribs = KernelAttributes::RuntimeListOpAttributes(); + ka.runtime_list_op_attribs->snode = sn; current_kernel_attribs_ = nullptr; - mtl_kernels_attribs_.push_back(ka); - used_features_.runtime_list_ops = true; + mtl_kernels_attribs()->push_back(ka); + used_features()->sparse = true; } std::string inject_load_global_tmp(int offset, DataType dt = DataType::i32) { @@ -1042,15 +1107,28 @@ class KernelCodegen : public IRVisitor { loop_index_expr); } + struct KernelSigExtensions { + // https://stackoverflow.com/a/44693603/12003165 + KernelSigExtensions() noexcept { + } + + bool use_simdgroup = false; + }; + void emit_mtl_kernel_sig( const std::string &kernel_name, - const std::vector &buffers) { + const std::vector &buffers, + const KernelSigExtensions &exts = {}) { emit("kernel void {}(", kernel_name); for (int i = 0; i < buffers.size(); ++i) { emit(" device byte* {} [[buffer({})]],", buffer_to_name(buffers[i]), i); } emit(" const uint {} [[threads_per_grid]],", kKernelGridSizeName); + if (exts.use_simdgroup) { + emit(" const uint {} [[thread_index_in_simdgroup]],", + kKernelTidInSimdgroupName); + } emit(" const uint {} [[thread_position_in_grid]]) {{", kKernelThreadIdName); } @@ -1089,11 +1167,6 @@ class KernelCodegen : public IRVisitor { return kernel_name + "_func"; } - void mark_print_used() { - TI_ASSERT(current_kernel_attribs_ != nullptr); - current_kernel_attribs_->uses_print = true; - } - class 
SectionGuard { public: SectionGuard(KernelCodegen *kg, Section new_sec) @@ -1124,6 +1197,14 @@ class KernelCodegen : public IRVisitor { current_appender().append(std::move(f), std::forward(args)...); } + std::vector *mtl_kernels_attribs() { + return &(ti_kernel_attribus_.mtl_kernels_attribs); + } + + TaichiKernelAttributes::UsedFeatures *used_features() { + return &(ti_kernel_attribus_.used_features); + } + const std::string mtl_kernel_prefix_; const std::string root_snode_type_name_; Kernel *const kernel_; @@ -1131,13 +1212,14 @@ class KernelCodegen : public IRVisitor { const bool needs_root_buffer_; const KernelContextAttributes ctx_attribs_; PrintStringTable *const print_strtab_; + const CodeGen::Config &cgen_config_; bool is_top_level_{true}; int mtl_kernel_count_{0}; - std::vector mtl_kernels_attribs_; - UsedFeatures used_features_; + TaichiKernelAttributes ti_kernel_attribus_; GetRootStmt *root_stmt_{nullptr}; KernelAttributes *current_kernel_attribs_{nullptr}; + bool inside_tls_epilogue_{false}; Section code_section_{Section::Structs}; std::unordered_map section_appenders_; }; @@ -1146,12 +1228,14 @@ class KernelCodegen : public IRVisitor { CodeGen::CodeGen(Kernel *kernel, KernelManager *kernel_mgr, - const CompiledStructs *compiled_structs) + const CompiledStructs *compiled_structs, + const Config &config) : kernel_(kernel), kernel_mgr_(kernel_mgr), compiled_structs_(compiled_structs), id_(Program::get_kernel_id()), - taichi_kernel_name_(fmt::format("mtl_k{:04d}_{}", id_, kernel_->name)) { + taichi_kernel_name_(fmt::format("mtl_k{:04d}_{}", id_, kernel_->name)), + config_(config) { } FunctionType CodeGen::compile() { @@ -1159,14 +1243,16 @@ FunctionType CodeGen::compile() { config.demote_dense_struct_fors = true; irpass::compile_to_offloads(kernel_->ir.get(), config, /*vectorize=*/false, kernel_->grad, - /*ad_use_stack=*/true, config.print_ir); + /*ad_use_stack=*/true, config.print_ir, + /*lower_global_access=*/true, + /*make_thread_local=*/true); - KernelCodegen codegen(taichi_kernel_name_, - kernel_->program.snode_root->node_type_name, kernel_, - compiled_structs_, kernel_mgr_->print_strtable()); + KernelCodegen codegen( + taichi_kernel_name_, kernel_->program.snode_root->node_type_name, kernel_, + compiled_structs_, kernel_mgr_->print_strtable(), config_); const auto source_code = codegen.run(); kernel_mgr_->register_taichi_kernel(taichi_kernel_name_, source_code, - codegen.kernels_attribs(), + codegen.ti_kernels_attribs(), codegen.kernel_ctx_attribs()); return [kernel_mgr = kernel_mgr_, kernel_name = taichi_kernel_name_](Context &ctx) { diff --git a/taichi/backends/metal/codegen_metal.h b/taichi/backends/metal/codegen_metal.h index 00e58a33a2aab..468180306d973 100644 --- a/taichi/backends/metal/codegen_metal.h +++ b/taichi/backends/metal/codegen_metal.h @@ -18,9 +18,14 @@ namespace metal { class CodeGen { public: + struct Config { + bool allow_simdgroup = true; + }; + CodeGen(Kernel *kernel, KernelManager *kernel_mgr, - const CompiledStructs *compiled_structs); + const CompiledStructs *compiled_structs, + const Config &config); FunctionType compile(); @@ -33,6 +38,7 @@ class CodeGen { const CompiledStructs *const compiled_structs_; const int id_; const std::string taichi_kernel_name_; + const Config config_; }; } // namespace metal diff --git a/taichi/backends/metal/constants.h b/taichi/backends/metal/constants.h index 9a8d244977b52..72244377abcc4 100644 --- a/taichi/backends/metal/constants.h +++ b/taichi/backends/metal/constants.h @@ -9,6 +9,7 @@ namespace metal { inline 
constexpr int kMaxNumThreadsGridStrideLoop = 64 * 1024; inline constexpr int kNumRandSeeds = 64 * 1024; // 256 KB is nothing +inline constexpr int kMslVersionNone = 0; } // namespace metal -TLANG_NAMESPACE_END \ No newline at end of file +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/env_config.cpp b/taichi/backends/metal/env_config.cpp new file mode 100644 index 0000000000000..d1d3d2da8426b --- /dev/null +++ b/taichi/backends/metal/env_config.cpp @@ -0,0 +1,21 @@ +#include "taichi/backends/metal/env_config.h" + +#include "taichi/lang_util.h" +#include "taichi/util/environ_config.h" + +TLANG_NAMESPACE_BEGIN +namespace metal { + +EnvConfig::EnvConfig() { + simdgroup_enabled_ = + get_environ_config("TI_USE_METAL_SIMDGROUP", /*default_value=*/1); +} + +const EnvConfig &EnvConfig::instance() { + static const EnvConfig c; + return c; +} + +} // namespace metal + +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/env_config.h b/taichi/backends/metal/env_config.h new file mode 100644 index 0000000000000..94268d7874714 --- /dev/null +++ b/taichi/backends/metal/env_config.h @@ -0,0 +1,30 @@ +#pragma once + +#include "taichi/lang_util.h" + +TLANG_NAMESPACE_BEGIN +namespace metal { + +// Metal specific config inferred from the environment. +class EnvConfig { + public: + // Set TI_USE_METAL_SIMDGROUP=0 to disable SIMD group. + // This is an ad-hoc thing. Apple claims that SIMD group is supported in + // MSL 2.0, which isn't the case. According to my test, it's available in + // MSL 2.1. Since MSL 2.1 is released since macOS 10.14, we expect the major + // users would be able to use this feature. + inline bool is_simdgroup_enabled() const { + return simdgroup_enabled_; + } + + static const EnvConfig &instance(); + + private: + EnvConfig(); + + bool simdgroup_enabled_; +}; + +} // namespace metal + +TLANG_NAMESPACE_END diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index ccd63af75f48d..b111951ad0326 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -33,6 +33,14 @@ namespace shaders { using KernelTaskType = OffloadedStmt::TaskType; using BufferEnum = KernelAttributes::Buffers; +inline int infer_msl_version(const TaichiKernelAttributes::UsedFeatures &f) { + if (f.simdgroup) { + // https://developer.apple.com/documentation/metal/mtllanguageversion/version2_1 + return 131073; + } + return kMslVersionNone; +} + // This class requests the Metal buffer memory of |size| bytes from |mem_pool|. // Once allocated, it does not own the memory (hence the name "view"). Instead, // GC is deferred to the memory pool. 
@@ -152,7 +160,7 @@ class RuntimeListOpsMtlKernel : public CompiledMtlKernelBase { const SNodeDescriptorsMap *snode_descriptors = nullptr; const SNode *snode() const { - return kernel_attribs->runtime_list_op_attribs.snode; + return kernel_attribs->runtime_list_op_attribs->snode; } }; @@ -208,7 +216,7 @@ class CompiledTaichiKernel { struct Params { std::string_view taichi_kernel_name; std::string mtl_source_code; - const std::vector *mtl_kernels_attribs; + const TaichiKernelAttributes *ti_kernel_attribs; const KernelContextAttributes *ctx_attribs; const SNodeDescriptorsMap *snode_descriptors; MTLDevice *device; @@ -216,14 +224,17 @@ class CompiledTaichiKernel { KernelProfilerBase *profiler; }; - CompiledTaichiKernel(Params params) : ctx_attribs(*params.ctx_attribs) { + CompiledTaichiKernel(Params params) + : ctx_attribs(*params.ctx_attribs), + used_features(params.ti_kernel_attribs->used_features) { auto *const device = params.device; - auto kernel_lib = new_library_with_source(device, params.mtl_source_code); + auto kernel_lib = new_library_with_source(device, params.mtl_source_code, + infer_msl_version(used_features)); if (kernel_lib == nullptr) { TI_ERROR("Failed to compile Metal kernel! Generated code:\n\n{}", params.mtl_source_code); } - for (const auto &ka : *(params.mtl_kernels_attribs)) { + for (const auto &ka : params.ti_kernel_attribs->mtl_kernels_attribs) { auto mtl_func = new_function_with_name(kernel_lib.get(), ka.name); TI_ASSERT(mtl_func != nullptr); // Note that CompiledMtlKernel doesn't own |kernel_func|. @@ -265,8 +276,7 @@ class CompiledTaichiKernel { KernelContextAttributes ctx_attribs; std::unique_ptr ctx_mem; nsobj_unique_ptr ctx_buffer; - - private: + TaichiKernelAttributes::UsedFeatures used_features; }; class HostMetalCtxBlitter { @@ -445,11 +455,10 @@ class KernelManager::Impl { init_print_buffer(); } - void register_taichi_kernel( - const std::string &taichi_kernel_name, - const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, - const KernelContextAttributes &ctx_attribs) { + void register_taichi_kernel(const std::string &taichi_kernel_name, + const std::string &mtl_kernel_source_code, + const TaichiKernelAttributes &ti_kernel_attribs, + const KernelContextAttributes &ctx_attribs) { TI_ASSERT(compiled_taichi_kernels_.find(taichi_kernel_name) == compiled_taichi_kernels_.end()); @@ -463,7 +472,7 @@ class KernelManager::Impl { CompiledTaichiKernel::Params params; params.taichi_kernel_name = taichi_kernel_name; params.mtl_source_code = mtl_kernel_source_code; - params.mtl_kernels_attribs = &kernels_attribs; + params.ti_kernel_attribs = &ti_kernel_attribs; params.ctx_attribs = &ctx_attribs; params.snode_descriptors = &compiled_structs_.snode_descriptors; params.device = device_.get(); @@ -493,13 +502,11 @@ class KernelManager::Impl { input_buffers[BufferEnum::Context] = ctk.ctx_buffer.get(); } - bool uses_print = false; for (const auto &mk : ctk.compiled_mtl_kernels) { mk->launch(input_buffers, cur_command_buffer_.get()); - uses_print = (uses_print || mk->kernel_attribs()->uses_print); } - - if (ctx_blitter || uses_print) { + const bool used_print = ctk.used_features.print; + if (ctx_blitter || used_print) { // TODO(k-ye): One optimization is to synchronize only when we absolutely // need to transfer the data back to host. This includes the cases where // an arg is 1) an array, or 2) used as return value. 
@@ -507,7 +514,7 @@ class KernelManager::Impl { if (ctx_blitter) { ctx_blitter->metal_to_host(); } - if (uses_print) { + if (used_print) { flush_print_buffers(); } } @@ -741,11 +748,10 @@ class KernelManager::Impl { TI_ERROR("Metal not supported on the current OS"); } - void register_taichi_kernel( - const std::string &taichi_kernel_name, - const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, - const KernelContextAttributes &ctx_attribs) { + void register_taichi_kernel(const std::string &taichi_kernel_name, + const std::string &mtl_kernel_source_code, + const TaichiKernelAttributes &ti_kernel_attribs, + const KernelContextAttributes &ctx_attribs) { TI_ERROR("Metal not supported on the current OS"); } @@ -776,10 +782,10 @@ KernelManager::~KernelManager() { void KernelManager::register_taichi_kernel( const std::string &taichi_kernel_name, const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, + const TaichiKernelAttributes &ti_kernel_attribs, const KernelContextAttributes &ctx_attribs) { impl_->register_taichi_kernel(taichi_kernel_name, mtl_kernel_source_code, - kernels_attribs, ctx_attribs); + ti_kernel_attribs, ctx_attribs); } void KernelManager::launch_taichi_kernel(const std::string &taichi_kernel_name, diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h index dc3502577bdd2..1c687cedec82b 100644 --- a/taichi/backends/metal/kernel_manager.h +++ b/taichi/backends/metal/kernel_manager.h @@ -39,11 +39,10 @@ class KernelManager { // * |mtl_kernel_source_code| is the complete source code compiled from a // Taichi kernel. It may include one or more Metal compute kernels. Each // Metal kernel is identified by one item in |kernels_attribs|. - void register_taichi_kernel( - const std::string &taichi_kernel_name, - const std::string &mtl_kernel_source_code, - const std::vector &kernels_attribs, - const KernelContextAttributes &ctx_attribs); + void register_taichi_kernel(const std::string &taichi_kernel_name, + const std::string &mtl_kernel_source_code, + const TaichiKernelAttributes &ti_kernel_attribs, + const KernelContextAttributes &ctx_attribs); // Launch the given |taichi_kernel_name|. // Kernel launching is asynchronous, therefore the Metal memory is not valid diff --git a/taichi/backends/metal/kernel_util.cpp b/taichi/backends/metal/kernel_util.cpp index a4d3cc67190dd..5ef0bbc23879b 100644 --- a/taichi/backends/metal/kernel_util.cpp +++ b/taichi/backends/metal/kernel_util.cpp @@ -50,7 +50,7 @@ std::string KernelAttributes::debug_string() const { // TODO(k-ye): show range_for if (task_type == OffloadedStmt::TaskType::clear_list || task_type == OffloadedStmt::TaskType::listgen) { - result += fmt::format(" snode={}", runtime_list_op_attribs.snode->id); + result += fmt::format(" snode={}", runtime_list_op_attribs->snode->id); } result += ">"; return result; diff --git a/taichi/backends/metal/kernel_util.h b/taichi/backends/metal/kernel_util.h index a638702c149e2..fd5a7d47b6785 100644 --- a/taichi/backends/metal/kernel_util.h +++ b/taichi/backends/metal/kernel_util.h @@ -66,20 +66,31 @@ struct KernelAttributes { }; std::vector buffers; // Only valid when |task_type| is range_for. - // TODO(k-ye): Use std::optional to wrap |task_type| dependent attributes. - RangeForAttributes range_for_attribs; - // clear_list + listgen - RuntimeListOpAttributes runtime_list_op_attribs; - - // Whether print() is called inside this kernel. - // TODO(k-ye): Encapsulate this inside a UsedFeatures. 
However, we need a - // TaichiKernelAttributes before we can do this. - bool uses_print = false; + std::optional range_for_attribs; + // Only valid when |task_type| is {clear_list, listgen}. + std::optional runtime_list_op_attribs; static std::string buffers_name(Buffers b); std::string debug_string() const; }; +// Groups all the Metal kernels generated from a single ti.kernel +struct TaichiKernelAttributes { + struct UsedFeatures { + // Whether print() is called inside this kernel. + bool print = false; + // Whether this kernel accesses (read or write) sparse SNodes. + bool sparse = false; + // Whether [[thread_index_in_simdgroup]] is used. This is only supported + // since MSL 2.1 + bool simdgroup = false; + }; + + // Attributes of all the Metal kernels produced from this Taichi kernel. + std::vector mtl_kernels_attribs; + UsedFeatures used_features; +}; + // This class contains the attributes descriptors for both the input args and // the return values of a Taichi kernel. // diff --git a/taichi/backends/metal/shaders/print.metal.h b/taichi/backends/metal/shaders/print.metal.h index 1c8d2c74fc7d6..5cb3971540261 100644 --- a/taichi/backends/metal/shaders/print.metal.h +++ b/taichi/backends/metal/shaders/print.metal.h @@ -37,19 +37,21 @@ STR( constant constexpr int kMetalPrintMsgTypeWidthMask = ((1 << kMetalNumBitsPerPrintMsgType) - 1); - [[maybe_unused]] inline int mtl_compute_num_print_msg_typemasks( - int num_entries) { - return (num_entries + kMetalNumPrintMsgTypePerI32 - 1) / - kMetalNumPrintMsgTypePerI32; - } - - [[maybe_unused]] inline int mtl_compute_print_msg_bytes(int num_entries) { - // See PrintMsg's layout for how this is computed. - const int sz = - sizeof(int32_t) * - (1 + mtl_compute_num_print_msg_typemasks(num_entries) + num_entries); - return sz; - } + [[maybe_unused]] inline int mtl_compute_num_print_msg_typemasks( + int num_entries) { + return (num_entries + kMetalNumPrintMsgTypePerI32 - 1) / + kMetalNumPrintMsgTypePerI32; + } + + [[maybe_unused]] inline int mtl_compute_print_msg_bytes( + int num_entries) { + // See PrintMsg's layout for how this is computed. + const int sz = + sizeof(int32_t) * + (1 + mtl_compute_num_print_msg_typemasks(num_entries) + + num_entries); + return sz; + } class PrintMsg { public: diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index d466d1bc25ae4..1cdc0eb16648c 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -322,7 +322,7 @@ bool initialize_opengl(bool error_tolerance) { desc = "Unknown Error"; if (error_tolerance) { // error tolerated, returning false - TI_TRACE("[glsl] cannot create GLFW window: error {}: {}", status, desc); + TI_DEBUG("[glsl] cannot create GLFW window: error {}: {}", status, desc); supported = std::make_optional(false); return false; } diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h index 3c021debbfccd..6f618f7f58adb 100644 --- a/taichi/inc/archs.inc.h +++ b/taichi/inc/archs.inc.h @@ -2,12 +2,14 @@ // CPU archs PER_ARCH(x64) // a.k.a. AMD64/x86_64 -PER_ARCH(arm64) // a.k.a. Aarch64 -PER_ARCH(js) // Javascript +PER_ARCH(arm64) // a.k.a. 
diff --git a/taichi/ir/control_flow_graph.cpp b/taichi/ir/control_flow_graph.cpp
index f0a726d50a808..cb92bb04030ef 100644
--- a/taichi/ir/control_flow_graph.cpp
+++ b/taichi/ir/control_flow_graph.cpp
@@ -101,8 +101,7 @@ void CFGNode::reaching_definition_analysis(bool after_lower_access) {
     auto data_source_ptr = irpass::analysis::get_store_destination(stmt);
     if (data_source_ptr) {
       // stmt provides a data source
-      if (after_lower_access &&
-          !(stmt->is<AllocaStmt>() || stmt->is<LocalStoreStmt>())) {
+      if (after_lower_access && !(data_source_ptr->is<AllocaStmt>())) {
         // After lower_access, we only analyze local variables.
         continue;
       }
diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp
index 083b9fe6e1490..20ea0cd594fbe 100644
--- a/taichi/ir/expr.cpp
+++ b/taichi/ir/expr.cpp
@@ -179,7 +179,9 @@ Expr load(const Expr &ptr) {
 Expr ptr_if_global(const Expr &var) {
   if (var.is<GlobalVariableExpression>()) {
     // singleton global variable
-    TI_ASSERT(var.snode()->num_active_indices == 0);
+    TI_ASSERT_INFO(
+        var.snode()->num_active_indices == 0,
+        "Please always use 'x[None]' (instead of simply 'x') to access any 0-D tensor.");
     return var[ExprGroup()];
   } else {
     // may be any local or global expr
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index d1c38dbc64bf6..fc4938a2ef97c 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -35,6 +35,7 @@ CompileConfig::CompileConfig() {
   verbose = true;
   fast_math = true;
   async = false;
+  flatten_if = false;
 
 #if defined(TI_PLATFORM_WINDOWS) or defined(TI_ARCH_ARM)
   use_unified_memory = false;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 6ee00204fdc72..02c5cf64c15e3 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -37,6 +37,7 @@ struct CompileConfig {
   bool fast_math;
   bool use_unified_memory;
   bool async;
+  bool flatten_if;
   DataType default_fp;
   DataType default_ip;
   std::string extra_flags;
diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp
index 55df70d6f6967..b1a0cd971fb7d 100644
--- a/taichi/program/kernel_profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -148,13 +148,10 @@ class KernelProfilerCUDA : public KernelProfilerBase {
 }  // namespace
 
 std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch) {
-  if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal ||
-      arch == Arch::opengl) {
-    return std::make_unique<DefaultProfiler>(arch);
-  } else if (arch == Arch::cuda) {
+  if (arch == Arch::cuda) {
     return std::make_unique<KernelProfilerCUDA>();
   } else {
-    TI_NOT_IMPLEMENTED;
+    return std::make_unique<DefaultProfiler>(arch);
   }
 }
 
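The make_profiler rewrite inverts the dispatch: rather than whitelisting the archs that may use DefaultProfiler and raising TI_NOT_IMPLEMENTED for everything else, it special-cases CUDA and lets every other arch (including newly added ones such as cc) fall through to the default. A mock of that shape, with hypothetical minimal types:

// Mock types; the real KernelProfilerBase hierarchy lives in kernel_profiler.h.
#include <memory>

enum class Arch { x64, arm64, metal, opengl, cuda, cc };

struct KernelProfilerBase {
  virtual ~KernelProfilerBase() = default;
};
struct DefaultProfiler : KernelProfilerBase {
  explicit DefaultProfiler(Arch) {}
};
struct KernelProfilerCUDA : KernelProfilerBase {};

std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch) {
  if (arch == Arch::cuda)
    return std::make_unique<KernelProfilerCUDA>();
  // Any other arch, including ones added later, gets the default timer
  // instead of an immediate "not implemented" failure.
  return std::make_unique<DefaultProfiler>(arch);
}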
"taichi/backends/opengl/struct_opengl.h" +#include "taichi/backends/cc/struct_cc.h" +#include "taichi/backends/cc/cc_layout.h" #include "taichi/system/unified_allocator.h" #include "taichi/ir/snode.h" #include "taichi/ir/frontend_ir.h" @@ -152,13 +156,20 @@ FunctionType Program::compile(Kernel &kernel) { auto codegen = KernelCodeGen::create(kernel.arch, &kernel); ret = codegen->compile(); } else if (kernel.arch == Arch::metal) { + metal::CodeGen::Config cgen_config; + cgen_config.allow_simdgroup = + metal::EnvConfig::instance().is_simdgroup_enabled(); metal::CodeGen codegen(&kernel, metal_kernel_mgr_.get(), - &metal_compiled_structs_.value()); + &metal_compiled_structs_.value(), cgen_config); ret = codegen.compile(); } else if (kernel.arch == Arch::opengl) { opengl::OpenglCodeGen codegen(kernel.name, &opengl_struct_compiled_.value(), opengl_kernel_launcher_.get()); ret = codegen.compile(*this, kernel); +#ifdef TI_WITH_CC + } else if (kernel.arch == Arch::cc) { + ret = cccp::compile_kernel(&kernel); +#endif } else { TI_NOT_IMPLEMENTED; } @@ -315,6 +326,12 @@ void Program::materialize_layout() { opengl_struct_compiled_->root_size); opengl_kernel_launcher_ = std::make_unique( opengl_struct_compiled_->root_size); +#ifdef TI_WITH_CC + } else if (config.arch == Arch::cc) { + cc_program = std::make_unique(); + cccp::CCLayoutGen scomp(snode_root.get()); + cc_program->layout = scomp.compile(); +#endif } } diff --git a/taichi/program/program.h b/taichi/program/program.h index 68b103a27eec8..740ee01f0491d 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -14,6 +14,7 @@ #include "taichi/backends/metal/kernel_manager.h" #include "taichi/backends/opengl/opengl_kernel_launcher.h" #include "taichi/backends/opengl/opengl_kernel_util.h" +#include "taichi/backends/cc/cc_program.h" #include "taichi/program/kernel.h" #include "taichi/program/kernel_profiler.h" #include "taichi/runtime/llvm/context.h" @@ -245,6 +246,12 @@ class Program { // OpenGL related data structures std::optional opengl_struct_compiled_; std::unique_ptr opengl_kernel_launcher_; + + public: +#ifdef TI_WITH_CC + // C backend related data structures + std::unique_ptr cc_program; +#endif }; TLANG_NAMESPACE_END diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index b34949ac13667..08d0c041b0f76 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -109,7 +109,8 @@ void export_lang(py::module &m) { &CompileConfig::device_memory_fraction) .def_readwrite("fast_math", &CompileConfig::fast_math) .def_readwrite("ad_stack_size", &CompileConfig::ad_stack_size) - .def_readwrite("async", &CompileConfig::async); + .def_readwrite("async", &CompileConfig::async) + .def_readwrite("flatten_if", &CompileConfig::flatten_if); m.def("reset_default_compile_config", [&]() { default_compile_config = CompileConfig(); }); diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index e48a3b8fb9d7c..78136fb68f713 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -15,6 +15,7 @@ #include "taichi/system/dynamic_loader.h" #include "taichi/backends/metal/api.h" #include "taichi/backends/opengl/opengl_api.h" +#include "taichi/backends/cc/cc_configuation.h" #if defined(TI_WITH_CUDA) #include "taichi/backends/cuda/cuda_driver.h" #endif @@ -156,12 +157,15 @@ void export_misc(py::module &m) { } printf("test was successful.\n"); }); - m.def("pop_python_print_buffer", []() { - return py_cout.pop_content(); - }); + m.def("pop_python_print_buffer", []() { 
diff --git a/taichi/system/dynamic_loader.cpp b/taichi/system/dynamic_loader.cpp
index fbe54b7a66268..2923fa34c8642 100644
--- a/taichi/system/dynamic_loader.cpp
+++ b/taichi/system/dynamic_loader.cpp
@@ -21,7 +21,7 @@ void DynamicLoader::load_dll(const std::string &dll_path) {
 }
 
 void *DynamicLoader::load_function(const std::string &func_name) {
-  TI_ASSERT(loaded());
+  TI_ASSERT_INFO(loaded(), "DLL not opened");
 #ifdef WIN32
   auto func = (void *)GetProcAddress((HMODULE)dll, func_name.c_str());
 #else
@@ -34,7 +34,7 @@ void *DynamicLoader::load_function(const std::string &func_name) {
 }
 
 void DynamicLoader::close_dll() {
-  TI_ERROR_IF(!loaded(), "DLL not opened.");
+  TI_ASSERT_INFO(loaded(), "DLL not opened");
 #ifdef WIN32
   FreeLibrary((HMODULE)dll);
 #else
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 9ec3112753186..3b8ae0b064c9a 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -59,13 +59,10 @@ void compile_to_offloads(IRNode *ir,
     print("Loop Split");
     irpass::analysis::verify(ir);
   }
-  irpass::simplify(ir);
+  irpass::full_simplify(ir);
   print("Simplified I");
   irpass::analysis::verify(ir);
 
-  irpass::constant_fold(ir);
-  print("Constant folded I");
-
   if (grad) {
     // Remove local atomics here so that we don't have to handle their gradients
     irpass::demote_atomics(ir);
@@ -105,9 +102,6 @@ void compile_to_offloads(IRNode *ir,
   print("Simplified II");
   irpass::analysis::verify(ir);
 
-  irpass::constant_fold(ir);
-  print("Constant folded II");
-
   irpass::offload(ir);
   print("Offloaded");
   irpass::analysis::verify(ir);
diff --git a/taichi/transforms/constant_fold.cpp b/taichi/transforms/constant_fold.cpp
index 02d407dd61140..ec1d294728277 100644
--- a/taichi/transforms/constant_fold.cpp
+++ b/taichi/transforms/constant_fold.cpp
@@ -166,6 +166,12 @@ class ConstantFold : public BasicStmtVisitor {
   }
 
   void visit(UnaryOpStmt *stmt) override {
+    if (stmt->is_cast() &&
+        stmt->cast_type == stmt->operand->ret_type.data_type) {
+      stmt->replace_with(stmt->operand);
+      modifier.erase(stmt);
+      return;
+    }
     auto operand = stmt->operand->cast<ConstStmt>();
     if (!operand)
       return;
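The new branch in ConstantFold::visit(UnaryOpStmt *) fires before the constant-operand check: a cast whose target type already equals the operand's type is a no-op, so users of the cast are redirected to the operand and the cast is erased. The rule in isolation, with heavily simplified stand-ins for the IR classes:

// Stand-ins for the IR classes; the real pass works on Taichi's Stmt
// hierarchy and records the erasure through the pass's modifier.
#include <cassert>

enum class DataType { i32, f32 };

struct Stmt {
  DataType ret_type;
  explicit Stmt(DataType t) : ret_type(t) {}
  virtual ~Stmt() = default;
};

struct CastStmt : Stmt {
  Stmt *operand;
  DataType cast_type;
  CastStmt(Stmt *op, DataType to) : Stmt(to), operand(op), cast_type(to) {}
};

// Returns the statement users should read instead, or nullptr to keep the cast.
Stmt *fold_noop_cast(CastStmt *stmt) {
  if (stmt->cast_type == stmt->operand->ret_type)
    return stmt->operand;  // casting to the operand's own type is a no-op
  return nullptr;
}

int main() {
  Stmt x(DataType::i32);
  CastStmt same(&x, DataType::i32);   // i32 -> i32: foldable
  CastStmt widen(&x, DataType::f32);  // i32 -> f32: must stay
  assert(fold_noop_cast(&same) == &x);
  assert(fold_noop_cast(&widen) == nullptr);
  return 0;
}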
diff --git a/taichi/transforms/simplify.cpp b/taichi/transforms/simplify.cpp
index 37d9cb74527f0..8a62fb0554603 100644
--- a/taichi/transforms/simplify.cpp
+++ b/taichi/transforms/simplify.cpp
@@ -2,6 +2,8 @@
 #include "taichi/ir/transforms.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/visitors.h"
+#include "taichi/program/kernel.h"
+#include "taichi/program/program.h"
 #include <set>
 #include <unordered_set>
 #include <utility>
@@ -1062,13 +1064,15 @@ class BasicBlockSimplify : public IRVisitor {
       return false;
     };
 
-    if (if_stmt->true_statements &&
-        flatten(if_stmt->true_statements->statements, true)) {
-      throw IRModified();
-    }
-    if (if_stmt->false_statements &&
-        flatten(if_stmt->false_statements->statements, false)) {
-      throw IRModified();
+    if (kernel->program.config.flatten_if) {
+      if (if_stmt->true_statements &&
+          flatten(if_stmt->true_statements->statements, true)) {
+        throw IRModified();
+      }
+      if (if_stmt->false_statements &&
+          flatten(if_stmt->false_statements->statements, false)) {
+        throw IRModified();
+      }
     }
 
     if (if_stmt->true_statements) {
@@ -1141,6 +1145,9 @@ class Simplify : public IRVisitor {
   Kernel *kernel;
 
   Simplify(IRNode *node, Kernel *kernel) : kernel(kernel) {
+    if (!kernel)
+      this->kernel = node->get_kernel();
+    TI_ASSERT(this->kernel);
     modified = false;
     allow_undefined_visitor = true;
     invoke_default_visitor = true;
@@ -1176,7 +1183,9 @@ class Simplify : public IRVisitor {
   }
 
   void visit(StructForStmt *for_stmt) override {
-    TI_ASSERT(current_struct_for == nullptr);
+    TI_ASSERT_INFO(current_struct_for == nullptr,
+                   "Nested struct-fors are not supported for now. "
+                   "Please try to use range-fors for inner loops.");
     current_struct_for = for_stmt;
     for_stmt->body->accept(this);
    current_struct_for = nullptr;
@@ -1225,15 +1234,18 @@ void full_simplify(IRNode *root, Kernel *kernel) {
       modified = true;
     if (constant_fold(root))
       modified = true;
+    if (die(root))
+      modified = true;
     if (alg_simp(root))
       modified = true;
-    die(root);
-    if (whole_kernel_cse(root))
+    if (die(root))
       modified = true;
-    die(root);
     if (simplify(root, kernel))
       modified = true;
-    die(root);
+    if (die(root))
+      modified = true;
+    if (whole_kernel_cse(root))
+      modified = true;
     if (!modified)
       break;
   }
diff --git a/taichi/util/environ_config.h b/taichi/util/environ_config.h
index cf65e3dce4bee..9a4aae477eab4 100644
--- a/taichi/util/environ_config.h
+++ b/taichi/util/environ_config.h
@@ -7,8 +7,8 @@
 
 TLANG_NAMESPACE_BEGIN
 
-static inline int get_environ_config(const std::string &name, int default_value = 0)
-{
+static inline int get_environ_config(const std::string &name,
+                                     int default_value = 0) {
   char *res = std::getenv(name.c_str());
   if (res == nullptr)
     return default_value;
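get_environ_config is the primitive behind per-backend environment toggles such as metal::EnvConfig::instance().is_simdgroup_enabled(), used in program.cpp above. A sketch of that shape follows; the class body and the environment-variable name are assumptions for illustration, not the actual env_config.h contents:

#include "taichi/util/environ_config.h"

TLANG_NAMESPACE_BEGIN
namespace metal {

class EnvConfig {
 public:
  static const EnvConfig &instance() {
    static const EnvConfig config;  // read the environment once, lazily
    return config;
  }

  bool is_simdgroup_enabled() const {
    return simdgroup_enabled_;
  }

 private:
  EnvConfig()
      // "TI_EXAMPLE_USE_SIMDGROUP" is a made-up variable name.
      : simdgroup_enabled_(get_environ_config("TI_EXAMPLE_USE_SIMDGROUP",
                                              /*default_value=*/1) != 0) {
  }

  bool simdgroup_enabled_;
};

}  // namespace metal
TLANG_NAMESPACE_END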
diff --git a/taichi/util/logging.cpp b/taichi/util/logging.cpp
index 212cd95518933..35a090baa4e6e 100644
--- a/taichi/util/logging.cpp
+++ b/taichi/util/logging.cpp
@@ -155,7 +155,8 @@ std::string signal_name(int sig) {
 bool python_at_exit_called = false;
 
 void signal_handler(int signo) {
-  // It seems that there's no way to pass exception to Python in signal handlers?
+  // It seems that there's no way to pass an exception to Python in signal
+  // handlers?
   // @archibate found that there is in fact such a solution:
   // https://docs.python.org/3/library/faulthandler.html#module-faulthandler
   auto sig_name = signal_name(signo);
diff --git a/tests/python/test_for_group_mismatch.py b/tests/python/test_for_group_mismatch.py
new file mode 100644
index 0000000000000..6c84b44ccc6c2
--- /dev/null
+++ b/tests/python/test_for_group_mismatch.py
@@ -0,0 +1,96 @@
+import taichi as ti
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def test_struct_for_mismatch():
+    x = ti.var(ti.f32, (3, 4))
+
+    @ti.kernel
+    def func():
+        for i in x:
+            print(i)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def test_struct_for_mismatch2():
+    x = ti.var(ti.f32, (3, 4))
+
+    @ti.kernel
+    def func():
+        for i, j, k in x:
+            print(i, j, k)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_grouped_struct_for_mismatch():
+    # doesn't work for now
+    # need grouped refactor
+    # for now, it just throws an unfriendly message:
+    #   AssertionError: __getitem__ cannot be called in Python-scope
+    x = ti.var(ti.f32, (3, 4))
+
+    @ti.kernel
+    def func():
+        for i, j in ti.grouped(x):
+            print(i, j)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_ndrange_for_mismatch():
+    # doesn't work for now
+    # need ndrange refactor
+    @ti.kernel
+    def func():
+        for i in ti.ndrange(3, 4):
+            print(i)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_ndrange_for_mismatch2():
+    # doesn't work for now
+    # need ndrange and grouped refactor
+    @ti.kernel
+    def func():
+        for i, j, k in ti.ndrange(3, 4):
+            print(i, j, k)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_grouped_ndrange_for_mismatch():
+    # doesn't work for now
+    # need ndrange and grouped refactor
+    @ti.kernel
+    def func():
+        for i in ti.grouped(ti.ndrange(3, 4)):
+            print(i)
+
+    func()
+
+
+@ti.must_throw(IndexError)
+@ti.host_arch_only
+def _test_static_ndrange_for_mismatch():
+    # doesn't work for now
+    # need ndrange and static refactor
+    @ti.kernel
+    def func():
+        for i in ti.static(ti.ndrange(3, 4)):
+            print(i)
+
+    func()
diff --git a/tests/python/test_return.py b/tests/python/test_return.py
index 71e383ec275a7..9400e6dfa905f 100644
--- a/tests/python/test_return.py
+++ b/tests/python/test_return.py
@@ -16,7 +16,6 @@ def kernel() -> ti.i32:
 
 @ti.must_throw(ti.TaichiSyntaxError)
 def test_return_without_type_hint():
-
     @ti.kernel
     def kernel():
         return 1
@@ -25,7 +24,6 @@ def kernel():
 
 
 def test_const_func_ret():
-
     @ti.kernel
     def func1() -> ti.f32:
         return 3
@@ -40,7 +38,6 @@ def func2() -> ti.i32:
 
 @ti.all_archs
 def _test_binary_func_ret(dt1, dt2, dt3, castor):
-
     @ti.kernel
     def func(a: dt1, b: dt2) -> dt3:
         return a * b
@@ -64,4 +61,3 @@ def test_binary_func_ret():
     _test_binary_func_ret(ti.f32, ti.i32, ti.f32, float)
     _test_binary_func_ret(ti.i32, ti.f32, ti.i32, int)
     _test_binary_func_ret(ti.f32, ti.i32, ti.i32, int)
-