From 98aa16059abf5a9cc0d563225960b2eaa5260419 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Fri, 25 Dec 2020 22:45:49 +0800 Subject: [PATCH 1/4] add test --- misc/test_bit_array_vectorization.py | 59 -------------------- tests/python/test_bit_array_vectorization.py | 46 +++++++++++++++ 2 files changed, 46 insertions(+), 59 deletions(-) delete mode 100644 misc/test_bit_array_vectorization.py create mode 100644 tests/python/test_bit_array_vectorization.py diff --git a/misc/test_bit_array_vectorization.py b/misc/test_bit_array_vectorization.py deleted file mode 100644 index e58f3638059d1..0000000000000 --- a/misc/test_bit_array_vectorization.py +++ /dev/null @@ -1,59 +0,0 @@ -import taichi as ti - -ti.init(debug=True, cfg_optimization=False, kernel_profiler=True) - -vectorize = True - -ci1 = ti.type_factory_.get_custom_int_type(1, False) - -x = ti.field(dtype=ci1) -y = ti.field(dtype=ci1) - -N = 4096 -n_blocks = 4 -bits = 32 -boundary_offset = 1024 - -block = ti.root.pointer(ti.ij, (n_blocks, n_blocks)) -block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array( - ti.j, bits, num_bits=bits).place(x) -block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array( - ti.j, bits, num_bits=bits).place(y) - - -@ti.kernel -def init(): - for i, j in ti.ndrange((boundary_offset, N - boundary_offset), - (boundary_offset, N - boundary_offset)): - x[i, j] = ti.random(dtype=ti.i32) % 2 - - -@ti.kernel -def assign_vectorized(): - ti.bit_vectorize(32) - for i, j in x: - y[i, j] = x[i, j] - - -@ti.kernel -def assign_naive(): - for i, j in ti.ndrange((boundary_offset, N - boundary_offset), - (boundary_offset, N - boundary_offset)): - y[i, j] = x[i, j] - - -@ti.kernel -def verify(): - for i, j in ti.ndrange((boundary_offset, N - boundary_offset), - (boundary_offset, N - boundary_offset)): - assert y[i, j] == x[i, j] - - -init() -if vectorize: - assign_vectorized() -else: - assign_naive() -verify() - -ti.kernel_profiler_print() diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py new file mode 100644 index 0000000000000..a3457c25192df --- /dev/null +++ b/tests/python/test_bit_array_vectorization.py @@ -0,0 +1,46 @@ +import taichi as ti +import numpy as np + + +@ti.test(require=ti.extension.quant, debug=True, cfg_optimization=False) +def test_vectorized_struct_for(): + ci1 = ti.type_factory_.get_custom_int_type(1, False) + + x = ti.field(dtype=ci1) + y = ti.field(dtype=ci1) + + N = 4096 + n_blocks = 4 + bits = 32 + boundary_offset = 1024 + + block = ti.root.pointer(ti.ij, (n_blocks, n_blocks)) + block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array( + ti.j, bits, num_bits=bits).place(x) + block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array( + ti.j, bits, num_bits=bits).place(y) + + @ti.kernel + def init(): + for i, j in ti.ndrange((boundary_offset, N - boundary_offset), + (boundary_offset, N - boundary_offset)): + x[i, j] = ti.random(dtype=ti.i32) % 2 + + + @ti.kernel + def assign_vectorized(): + ti.bit_vectorize(32) + for i, j in x: + y[i, j] = x[i, j] + + + @ti.kernel + def verify(): + for i, j in ti.ndrange((boundary_offset, N - boundary_offset), + (boundary_offset, N - boundary_offset)): + assert y[i, j] == x[i, j] + + + init() + assign_vectorized() + verify() From 717fe0186c1ad0a11e24e71b63fe736756099b7a Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Sat, 26 Dec 2020 01:09:49 +0800 Subject: [PATCH 2/4] try fix cuda --- taichi/ir/frontend_ir.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index df42136088b8d..e98da372f7cde 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -40,8 +40,6 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var, auto cfg = get_current_program().config; if (cfg.arch == Arch::cuda) { vectorize = 1; - // TODO: temporally setting to 1 - bit_vectorize = 1; parallelize = 1; TI_ASSERT(block_dim <= taichi_max_gpu_block_dim); } else { @@ -79,8 +77,6 @@ FrontendForStmt::FrontendForStmt(const Expr &loop_var, auto cfg = get_current_program().config; if (cfg.arch == Arch::cuda) { vectorize = 1; - // TODO: temporally setting to 1 - bit_vectorize = 1; parallelize = 1; } else { if (parallelize == 0) From 354ea424ed675f95f68e47456c5f8b90ba62cfa0 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Sat, 26 Dec 2020 01:39:38 +0800 Subject: [PATCH 3/4] enforce bit vectorization for x86 and CUDA --- taichi/transforms/compile_to_offloads.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 5f06dc4b8f2ba..e0c52dd56e65a 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -65,15 +65,19 @@ void compile_to_offloads(IRNode *ir, print("Loop Vectorized"); irpass::analysis::verify(ir); - // TODO: create a separate CompileConfig flag for the new pass - irpass::bit_loop_vectorize(ir); - print("Bit Loop Vectorized"); - irpass::analysis::verify(ir); - irpass::vector_split(ir, config.max_vector_width, config.serial_schedule); print("Loop Split"); irpass::analysis::verify(ir); } + + // TODO: strictly enforce bit vectorization for x86 cpu and CUDA now + // create a separate CompileConfig flag for the new pass + if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) { + irpass::bit_loop_vectorize(ir); + print("Bit Loop Vectorized"); + irpass::analysis::verify(ir); + } + irpass::full_simplify(ir, false); print("Simplified I"); irpass::analysis::verify(ir); From 0d34f585e95e0c056e9cac87483ec63550d4c144 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Sat, 26 Dec 2020 01:44:11 +0800 Subject: [PATCH 4/4] format --- taichi/transforms/compile_to_offloads.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index e0c52dd56e65a..e5133ad653e77 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -72,7 +72,7 @@ void compile_to_offloads(IRNode *ir, // TODO: strictly enforce bit vectorization for x86 cpu and CUDA now // create a separate CompileConfig flag for the new pass - if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) { + if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) { irpass::bit_loop_vectorize(ir); print("Bit Loop Vectorized"); irpass::analysis::verify(ir);