diff --git a/python/taichi/lang/snode.py b/python/taichi/lang/snode.py index ff235c7124554..d4ceed47473f9 100644 --- a/python/taichi/lang/snode.py +++ b/python/taichi/lang/snode.py @@ -134,6 +134,12 @@ def num_dynamically_allocated(self): runtime.materialize() return runtime.prog.get_snode_num_dynamically_allocated(self.ptr) + @property + def cell_size_bytes(self): + runtime = impl.get_runtime() + runtime.materialize() + return self.ptr.cell_size_bytes + def deactivate_all(self): ch = self.get_children() for c in ch: diff --git a/python/taichi/tools/video.py b/python/taichi/tools/video.py index a7cfc49bfe99a..6ef27ea231689 100644 --- a/python/taichi/tools/video.py +++ b/python/taichi/tools/video.py @@ -154,9 +154,9 @@ def interpolate_frames(frame_dir, mul=4): cv2.imwrite('interpolated/{:05d}.png'.format(i), img * 255.0) -def ffmpeg_common_args(frame_rate, width, height, crf): - return f" -loglevel panic -framerate {frame_rate} -s:v {width}x{height} " + \ - f"-c:v libx264 -profile:v high -crf {crf} -pix_fmt yuv420p" +def ffmpeg_common_args(frame_rate, input, width, height, crf, output_path): + return f"{get_ffmpeg_path()} -y -loglevel panic -framerate {frame_rate} -i {input} -s:v {width}x{height} " + \ + f"-c:v libx264 -profile:v high -crf {crf} -pix_fmt yuv420p {output_path}" def make_video(input_files, @@ -180,7 +180,9 @@ def make_video(input_files, height -= 1 for i, inp in enumerate(input_files): shutil.copy(inp, os.path.join(tmp_dir, '%06d.png' % i)) - command = f"{get_ffmpeg_path()} -y -i {tmp_dir}/%06d.png {ffmpeg_common_args(frame_rate, width, height, crf)} {output_path} " + inputs = f'{tmp_dir}/%06d.png' + command = ffmpeg_common_args(frame_rate, inputs, width, height, crf, + output_path) ret = os.system(command) assert ret == 0, "ffmpeg failed to generate video file." for i in range(len(input_files)): @@ -188,8 +190,9 @@ def make_video(input_files, os.rmdir(tmp_dir) elif isinstance(input_files, str): assert width != 0 and height != 0 - command = f"{get_ffmpeg_path()} -i {input_files} {ffmpeg_common_args(frame_rate, width, height, crf)} {output_path}" + command = ffmpeg_common_args(frame_rate, input_files, width, height, + crf, output_path) ret = os.system(command) assert ret == 0, "ffmpeg failed to generate video file." else: - assert False, f'input_files should be list (of files) or str (of file template, like "%04d.png") instead of {type(input_files)}' + assert False, f'input_files should be list (of files) or str (of file template, e.g., "%04d.png") instead of {type(input_files)}' diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 81bb4e30f2ee1..82c89ba59ea2c 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1055,6 +1055,7 @@ llvm::Value *CodeGenLLVM::atomic_add_custom_float(AtomicOpStmt *stmt, auto cit = cft->get_digits_type()->as(); auto val_store = float_to_custom_int(cft, cit, llvm_val[stmt->val]); auto physical_type = cit->get_physical_type(); + val_store = builder->CreateSExt(val_store, llvm_type(physical_type)); return create_call( fmt::format("atomic_add_partial_bits_b{}", data_type_bits(physical_type)), @@ -2039,13 +2040,14 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) { llvm::Function *body = nullptr; auto leaf_block = stmt->snode; + // When looping over bit_arrays and bit_structs, we generate struct for on // their parent node (usually "dense") instead of itself for higher // performance. Also, note that the loop must be bit_vectorized for - // bit_arrays. - if ((leaf_block->type == SNodeType::bit_array || - leaf_block->type == SNodeType::bit_struct) && - leaf_block->parent) { + // bit_arrays, and their parent must be "dense". + if (leaf_block->type == SNodeType::bit_struct) { + leaf_block = leaf_block->parent; + } else if (leaf_block->type == SNodeType::bit_array) { if (leaf_block->parent->type == SNodeType::dense) { leaf_block = leaf_block->parent; } else { diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h index 2fab9f9dcd57a..586f868052513 100644 --- a/taichi/ir/snode.h +++ b/taichi/ir/snode.h @@ -76,6 +76,7 @@ class SNode { int64 n{}; int total_num_bits{}, total_bit_start{}; int chunk_size{}; + std::size_t cell_size_bytes{}; PrimitiveType *physical_type; // for bit_struct and bit_array only DataType dt; bool has_ambient{}; diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 49d60c2b293bf..39cdd931c6cda 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -327,9 +327,7 @@ void Program::initialize_runtime_system(StructCompiler *scomp) { for (int i = 0; i < (int)snodes.size(); i++) { if (is_gc_able(snodes[i]->type)) { std::size_t node_size; - auto element_size = - tlctx->get_type_size(StructCompilerLLVM::get_llvm_element_type( - tlctx->get_this_thread_struct_module(), snodes[i])); + auto element_size = snodes[i]->cell_size_bytes; if (snodes[i]->type == SNodeType::pointer) { // pointer. Allocators are for single elements node_size = element_size; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 573b2dc5f49bc..c8a93f253ff52 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -266,6 +266,7 @@ void export_lang(py::module &m) { }) .def("num_active_indices", [](SNode *snode) { return snode->num_active_indices; }) + .def_readonly("cell_size_bytes", &SNode::cell_size_bytes) .def("begin_shared_exp_placement", &SNode::begin_shared_exp_placement) .def("end_shared_exp_placement", &SNode::end_shared_exp_placement); diff --git a/taichi/struct/struct_llvm.cpp b/taichi/struct/struct_llvm.cpp index a7cc5fe4a760c..81f81718d23c6 100644 --- a/taichi/struct/struct_llvm.cpp +++ b/taichi/struct/struct_llvm.cpp @@ -46,6 +46,8 @@ void StructCompilerLLVM::generate_types(SNode &snode) { auto ch_type = llvm::StructType::create(*ctx, ch_types, snode.node_type_name + "_ch"); + snode.cell_size_bytes = tlctx->get_type_size(ch_type); + llvm::Type *body_type = nullptr, *aux_type = nullptr; if (type == SNodeType::dense || type == SNodeType::bitmasked) { TI_ASSERT(snode._morton == false); @@ -83,6 +85,10 @@ void StructCompilerLLVM::generate_types(SNode &snode) { } ch->bit_offset = total_offset; total_offset += component_cit->get_num_bits(); + auto bit_struct_size = data_type_bits(snode.physical_type); + TI_ERROR_IF(total_offset > bit_struct_size, + "Bit struct overflows: {} bits used out of {}.", total_offset, + bit_struct_size); } snode.dt = TypeFactory::get_instance().get_bit_struct_type( diff --git a/taichi/transforms/demote_atomics.cpp b/taichi/transforms/demote_atomics.cpp index b11a0f23f4ca6..0697266b2d895 100644 --- a/taichi/transforms/demote_atomics.cpp +++ b/taichi/transforms/demote_atomics.cpp @@ -62,6 +62,19 @@ class DemoteAtomics : public BasicStmtVisitor { demote = true; is_local = true; } + + if (auto dest_pointer_type = stmt->dest->ret_type->cast()) { + if (auto cft = + dest_pointer_type->get_pointee_type()->cast()) { + if (cft->get_exponent_type()) { + TI_WARN( + "AtomicOp on CustomFloatType with exponent is not supported. " + "Demoting to non-atomic RMW."); + demote = true; + } + } + } + if (demote) { // replace atomics with load, add, store auto bin_type = atomic_to_binary_op_type(stmt->op_type); diff --git a/tests/python/test_cell_size_inspection.py b/tests/python/test_cell_size_inspection.py new file mode 100644 index 0000000000000..2f66bf3385955 --- /dev/null +++ b/tests/python/test_cell_size_inspection.py @@ -0,0 +1,43 @@ +import taichi as ti + + +@ti.test(arch=ti.cpu) +def test_primitives(): + x = ti.field(dtype=ti.i16) + y = ti.field(dtype=ti.f32) + z = ti.field(dtype=ti.f64) + + p = ti.field(dtype=ti.f32) + q = ti.field(dtype=ti.f32) + r = ti.field(dtype=ti.f64) + + n1 = ti.root.dense(ti.i, 32) + n1.place(x) + + n2 = ti.root.dense(ti.i, 32) + n2.place(y, z) + + n3 = ti.root.dense(ti.i, 1) + n3.place(p, q, r) + + assert n1.cell_size_bytes == 2 + assert 12 <= n2.cell_size_bytes <= 16 + assert n3.cell_size_bytes == 16 + + +@ti.test(arch=ti.cpu) +def test_bit_struct(): + cit = ti.type_factory.custom_int(16, False) + x = ti.field(dtype=cit) + y = ti.field(dtype=ti.type_factory.custom_float(significand_type=cit)) + z = ti.field(dtype=ti.f32) + + n1 = ti.root.dense(ti.i, 32) + n1._bit_struct(num_bits=32).place(x) + + n2 = ti.root.dense(ti.i, 4) + n2._bit_struct(num_bits=32).place(y) + n2.place(z) + + assert n1.cell_size_bytes == 4 + assert n2.cell_size_bytes == 8 diff --git a/tests/python/test_custom_float_exponents.py b/tests/python/test_custom_float_exponents.py index 233604a5bcd20..b6269d12d2061 100644 --- a/tests/python/test_custom_float_exponents.py +++ b/tests/python/test_custom_float_exponents.py @@ -114,3 +114,25 @@ def test_custom_float_truncation(signed): assert x[None] == 1.5 else: assert x[None] == 1.75 + + +@ti.test(require=ti.extension.quant) +def test_custom_float_atomic_demotion(): + cit = ti.type_factory.custom_int(2, True) + exp = ti.type_factory.custom_int(5, False) + cft = ti.type_factory.custom_float(significand_type=cit, + exponent_type=exp, + scale=1) + x = ti.field(dtype=cft) + + ti.root._bit_struct(num_bits=32).place(x) + + @ti.kernel + def foo(): + for i in range(1): + x[None] += 1 + + foo() + foo() + + assert x[None] == 2