diff --git a/benchmarks/fill_dense.py b/benchmarks/fill_dense.py index a7dd4130d44fa..674c5591df9df 100644 --- a/benchmarks/fill_dense.py +++ b/benchmarks/fill_dense.py @@ -128,9 +128,9 @@ def fill(): # ti.cfg.arch = ti.cuda # ti.cfg.print_kernel_llvm_ir_optimized = True # ti.cfg.print_kernel_llvm_ir = True -ti.cfg.enable_profiler = True +ti.cfg.kernel_profiler = True # ti.cfg.verbose_kernel_launches = True print(benchmark_nested_struct_listgen_8x8()) # print(benchmark_root_listgen()) -ti.profiler_print() +ti.kernel_profiler_print() ''' diff --git a/benchmarks/fill_sparse.py b/benchmarks/fill_sparse.py index 35bbfd9f77ba3..fe563cf3a5669 100644 --- a/benchmarks/fill_sparse.py +++ b/benchmarks/fill_sparse.py @@ -43,7 +43,7 @@ def task(): ''' -ti.init(arch=ti.cuda, enable_profiler=True) +ti.init(arch=ti.cuda, kernel_profiler=True) benchmark_nested_struct_fill_and_clear() -ti.profiler_print() +ti.kernel_profiler_print() ''' diff --git a/examples/mgpcg.py b/examples/mgpcg.py index 542c72f1d2bbd..cd9d264e476ab 100644 --- a/examples/mgpcg.py +++ b/examples/mgpcg.py @@ -2,7 +2,7 @@ import taichi as ti real = ti.f32 -ti.init(default_fp=real, arch=ti.x64, enable_profiler=True) +ti.init(default_fp=real, arch=ti.x64, kernel_profiler=True) # grid parameters N = 128 @@ -207,4 +207,4 @@ def paint(): gui.set_image(pixels) gui.show() -ti.profiler_print() +ti.kernel_profiler_print() diff --git a/examples/mgpcg_advanced.py b/examples/mgpcg_advanced.py index d122351c05d70..31a31cf35f0f6 100644 --- a/examples/mgpcg_advanced.py +++ b/examples/mgpcg_advanced.py @@ -3,7 +3,7 @@ import taichi as ti real = ti.f32 -ti.init(default_fp=real, arch=ti.x64, enable_profiler=True) +ti.init(default_fp=real, arch=ti.x64, kernel_profiler=True) @ti.data_oriented @@ -207,7 +207,7 @@ def run(self): gui.set_image(self.pixels) gui.show() - ti.profiler_print() + ti.kernel_profiler_print() solver = MGPCG() diff --git a/misc/benchmark_parallel_compilation.py b/misc/benchmark_parallel_compilation.py index 0d8ebdb4c1285..b4e53e69bdefb 100644 --- a/misc/benchmark_parallel_compilation.py +++ b/misc/benchmark_parallel_compilation.py @@ -65,5 +65,5 @@ def substep(): for i in range(32): substep() -ti.profiler_print() +ti.kernel_profiler_print() ti.core.print_profile_info() diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 4c942901bbfcc..cd76ba9ccd7d7 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -35,10 +35,10 @@ opengl = core.opengl gpu = [cuda, metal, opengl] cpu = core.host_arch() -profiler_print = lambda: core.get_current_program().profiler_print() -profiler_clear = lambda: core.get_current_program().profiler_clear() -profiler_start = lambda n: core.get_current_program().profiler_start(n) -profiler_stop = lambda: core.get_current_program().profiler_stop() +kernel_profiler_print = lambda: core.get_current_program( +).kernel_profiler_print() +kernel_profiler_clear = lambda: core.get_current_program( +).kernel_profiler_clear() class _Extension(object): diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py index 7369b8135f559..b9574c3b46a98 100644 --- a/python/taichi/misc/gui.py +++ b/python/taichi/misc/gui.py @@ -41,9 +41,6 @@ def __init__(self, name, res=512, background_color=0x0): self.key_pressed = set() self.event = None self.clear() - if ti.core.get_current_program(): - self.core.set_profiler( - ti.core.get_current_program().get_profiler()) def __enter__(self): return self diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp index 23002984ec6dc..3827a1446fb0d 100644 --- a/taichi/backends/cpu/codegen_cpu.cpp +++ b/taichi/backends/cpu/codegen_cpu.cpp @@ -50,7 +50,7 @@ class CodeGenLLVMCPU : public CodeGenLLVM { stat.add("codegen_offloaded_tasks"); using Type = OffloadedStmt::TaskType; auto offloaded_task_name = init_offloaded_task_function(stmt); - if (prog->config.enable_profiler) { + if (prog->config.kernel_profiler && arch_is_cpu(prog->config.arch)) { call( builder.get(), "LLVMRuntime_profiler_start", {get_runtime(), builder->CreateGlobalStringPtr(offloaded_task_name)}); @@ -72,7 +72,7 @@ class CodeGenLLVMCPU : public CodeGenLLVM { } else { TI_NOT_IMPLEMENTED } - if (prog->config.enable_profiler) { + if (prog->config.kernel_profiler && arch_is_cpu(prog->config.arch)) { call(builder.get(), "LLVMRuntime_profiler_stop", {get_runtime()}); } finalize_offloaded_task_function(); diff --git a/taichi/backends/cuda/cuda_context.h b/taichi/backends/cuda/cuda_context.h index 85a080afac868..9736c9ac94620 100644 --- a/taichi/backends/cuda/cuda_context.h +++ b/taichi/backends/cuda/cuda_context.h @@ -4,7 +4,7 @@ #include #include -#include "taichi/program/profiler.h" +#include "taichi/program/kernel_profiler.h" #include "taichi/backends/cuda/cuda_driver.h" TLANG_NAMESPACE_BEGIN @@ -23,7 +23,7 @@ class CUDAContext { int dev_count; std::string mcpu; std::mutex lock; - ProfilerBase *profiler; + KernelProfilerBase *profiler; CUDADriver &driver; static std::unordered_map> @@ -45,9 +45,10 @@ class CUDAContext { unsigned gridDim, unsigned blockDim); - void set_profiler(ProfilerBase *profiler) { + void set_profiler(KernelProfilerBase *profiler) { this->profiler = profiler; } + std::string get_mcpu() const { return mcpu; } diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index fc6ed5e1b26f0..f26dd001c1086 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -212,7 +212,7 @@ class CompiledTaichiKernel { const SNodeDescriptorsMap *snode_descriptors; MTLDevice *device; MemoryPool *mem_pool; - ProfilerBase *profiler; + KernelProfilerBase *profiler; }; CompiledTaichiKernel(Params params) : ctx_attribs(*params.ctx_attribs) { @@ -651,7 +651,7 @@ class KernelManager::Impl { CompileConfig *const config_; const CompiledStructs compiled_structs_; MemoryPool *const mem_pool_; - ProfilerBase *const profiler_; + KernelProfilerBase *const profiler_; nsobj_unique_ptr device_; nsobj_unique_ptr command_queue_; nsobj_unique_ptr cur_command_buffer_; diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h index 58782606b22d0..46e6ecf2c41e8 100644 --- a/taichi/backends/metal/kernel_manager.h +++ b/taichi/backends/metal/kernel_manager.h @@ -7,7 +7,7 @@ #include "taichi/backends/metal/kernel_util.h" #include "taichi/lang_util.h" -#include "taichi/program/profiler.h" +#include "taichi/program/kernel_profiler.h" #include "taichi/backends/metal/struct_metal.h" #include "taichi/system/memory_pool.h" @@ -27,7 +27,7 @@ class KernelManager { CompiledStructs compiled_structs; CompileConfig *config; MemoryPool *mem_pool; - ProfilerBase *profiler; + KernelProfilerBase *profiler; int root_id; }; diff --git a/taichi/gui/gui.h b/taichi/gui/gui.h index 3eb9c8fcd6d53..59687cce4bea2 100644 --- a/taichi/gui/gui.h +++ b/taichi/gui/gui.h @@ -2,7 +2,7 @@ #include "taichi/math/math.h" #include "taichi/system/timer.h" -#include "taichi/program/profiler.h" +#include "taichi/program/kernel_profiler.h" #include #include @@ -488,7 +488,6 @@ class GUI : public GUIBase { Vector2i cursor_pos; bool button_status[3]; int widget_height; - lang::ProfilerBase *profiler; void set_mouse_pos(int x, int y) { cursor_pos = Vector2i(x, y); @@ -886,10 +885,6 @@ class GUI : public GUIBase { } ~GUI(); - - void set_profiler(lang::ProfilerBase *profiler) { - this->profiler = profiler; - } }; TI_NAMESPACE_END diff --git a/taichi/jit/jit_module.h b/taichi/jit/jit_module.h index 9f979efbb15f7..82794c94955a0 100644 --- a/taichi/jit/jit_module.h +++ b/taichi/jit/jit_module.h @@ -6,7 +6,7 @@ #include "taichi/inc/constants.h" #include "taichi/llvm/llvm_fwd.h" #include "taichi/lang_util.h" -#include "taichi/program/profiler.h" +#include "taichi/program/kernel_profiler.h" TLANG_NAMESPACE_BEGIN diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp index 3627bc420f7f4..68fd71360eb95 100644 --- a/taichi/program/compile_config.cpp +++ b/taichi/program/compile_config.cpp @@ -29,7 +29,7 @@ CompileConfig::CompileConfig() { default_fp = DataType::f32; default_ip = DataType::i32; verbose_kernel_launches = false; - enable_profiler = false; + kernel_profiler = false; default_cpu_block_dim = 0; // 0 = adaptive default_gpu_block_dim = 64; verbose = true; diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h index e7a9f68e5bba4..4722ab4110633 100644 --- a/taichi/program/compile_config.h +++ b/taichi/program/compile_config.h @@ -32,7 +32,7 @@ struct CompileConfig { bool print_kernel_llvm_ir_optimized; bool print_kernel_nvptx; bool verbose_kernel_launches; - bool enable_profiler; + bool kernel_profiler; bool verbose; bool fast_math; bool use_unified_memory; diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 5b17e64a9ca34..6c438000d0d98 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -90,13 +90,17 @@ void Kernel::operator()() { } compiled(program.get_context()); program.sync = (program.sync && arch_is_cpu(arch)); - if (program.config.debug && arch_is_cpu(arch)) { + // Note that Kernel::arch may be different from program.config.arch + if (program.config.debug && arch_is_cpu(arch) && + arch_is_cpu(program.config.arch)) { program.check_runtime_error(); } } else { program.sync = false; program.async_engine->launch(this); - if (program.config.debug && arch_is_cpu(arch)) { + // Note that Kernel::arch may be different from program.config.arch + if (program.config.debug && arch_is_cpu(arch) && + arch_is_cpu(program.config.arch)) { program.check_runtime_error(); } } diff --git a/taichi/program/profiler.cpp b/taichi/program/kernel_profiler.cpp similarity index 77% rename from taichi/program/profiler.cpp rename to taichi/program/kernel_profiler.cpp index bda1ad04e4a62..55df70d6f6967 100644 --- a/taichi/program/profiler.cpp +++ b/taichi/program/kernel_profiler.cpp @@ -1,11 +1,11 @@ -#include "profiler.h" +#include "kernel_profiler.h" #include "taichi/system/timer.h" #include "taichi/backends/cuda/cuda_driver.h" TLANG_NAMESPACE_BEGIN -void ProfileRecord::insert_sample(double t) { +void KernelProfileRecord::insert_sample(double t) { if (counter == 0) { min = t; max = t; @@ -16,21 +16,23 @@ void ProfileRecord::insert_sample(double t) { total += t; } -void ProfilerBase::profiler_start(ProfilerBase *profiler, - const char *kernel_name) { +void KernelProfilerBase::profiler_start(KernelProfilerBase *profiler, + const char *kernel_name) { + TI_ASSERT(profiler); profiler->start(std::string(kernel_name)); } -void ProfilerBase::profiler_stop(ProfilerBase *profiler) { +void KernelProfilerBase::profiler_stop(KernelProfilerBase *profiler) { + TI_ASSERT(profiler); profiler->stop(); } -void ProfilerBase::print() { +void KernelProfilerBase::print() { sync(); printf("%s\n", title().c_str()); for (auto &rec : records) { printf( - "[%6.2f%%] %-40s min %7.3f ms avg %7.3f ms max %7.3f ms " + "[%6.2f%%] %-40s min %7.3f ms avg %7.3f ms max %7.3f ms " "total %7.3f s [%7dx]\n", rec.total / total_time * 100.0f, rec.name.c_str(), rec.min, rec.total / rec.counter, rec.max, rec.total / 1000.0f, rec.counter); @@ -39,7 +41,7 @@ void ProfilerBase::print() { namespace { // A simple profiler that uses Time::get_time() -class DefaultProfiler : public ProfilerBase { +class DefaultProfiler : public KernelProfilerBase { public: explicit DefaultProfiler(Arch arch) : title_(fmt::format("{} Profiler", arch_name(arch))) { @@ -60,9 +62,9 @@ class DefaultProfiler : public ProfilerBase { void stop() override { auto t = Time::get_time() - start_t_; auto ms = t * 1000.0; - auto it = - std::find_if(records.begin(), records.end(), - [&](ProfileRecord &r) { return r.name == event_name_; }); + auto it = std::find_if( + records.begin(), records.end(), + [&](KernelProfileRecord &r) { return r.name == event_name_; }); if (it == records.end()) { records.emplace_back(event_name_); it = std::prev(records.end()); @@ -78,7 +80,7 @@ class DefaultProfiler : public ProfilerBase { }; // A CUDA kernel profiler that uses CUDA timing events -class CUDAProfiler : public ProfilerBase { +class KernelProfilerCUDA : public KernelProfilerBase { public: #if defined(TI_WITH_CUDA) void *current_stop; @@ -123,7 +125,7 @@ class CUDAProfiler : public ProfilerBase { CUDADriver::get_instance().event_elapsed_time(&ms, start, stop); auto it = std::find_if( records.begin(), records.end(), - [&](ProfileRecord &r) { return r.name == map_elem.first; }); + [&](KernelProfileRecord &r) { return r.name == map_elem.first; }); if (it == records.end()) { records.emplace_back(map_elem.first); it = std::prev(records.end()); @@ -138,19 +140,19 @@ class CUDAProfiler : public ProfilerBase { #endif } - static CUDAProfiler &get_instance() { - static CUDAProfiler profiler; + static KernelProfilerCUDA &get_instance() { + static KernelProfilerCUDA profiler; return profiler; } }; } // namespace -std::unique_ptr make_profiler(Arch arch) { +std::unique_ptr make_profiler(Arch arch) { if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal || arch == Arch::opengl) { return std::make_unique(arch); } else if (arch == Arch::cuda) { - return std::make_unique(); + return std::make_unique(); } else { TI_NOT_IMPLEMENTED; } diff --git a/taichi/program/profiler.h b/taichi/program/kernel_profiler.h similarity index 62% rename from taichi/program/profiler.h rename to taichi/program/kernel_profiler.h index d02d7d59f2fb8..18c56669922e7 100644 --- a/taichi/program/profiler.h +++ b/taichi/program/kernel_profiler.h @@ -11,23 +11,23 @@ TLANG_NAMESPACE_BEGIN -struct ProfileRecord { +struct KernelProfileRecord { std::string name; int counter; double min; double max; double total; - ProfileRecord(const std::string &name) + KernelProfileRecord(const std::string &name) : name(name), counter(0), min(0), max(0), total(0) { } void insert_sample(double t); }; -class ProfilerBase { +class KernelProfilerBase { protected: - std::vector records; + std::vector records; double total_time; public: @@ -42,18 +42,19 @@ class ProfilerBase { virtual void start(const std::string &kernel_name) = 0; - static void profiler_start(ProfilerBase *profiler, const char *kernel_name); + static void profiler_start(KernelProfilerBase *profiler, + const char *kernel_name); virtual void stop() = 0; - static void profiler_stop(ProfilerBase *profiler); + static void profiler_stop(KernelProfilerBase *profiler); void print(); - virtual ~ProfilerBase() { + virtual ~KernelProfilerBase() { } }; -std::unique_ptr make_profiler(Arch arch); +std::unique_ptr make_profiler(Arch arch); TLANG_NAMESPACE_END diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 28e7dd0b05cac..5269759ea969d 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -8,7 +8,7 @@ #if defined(TI_WITH_CUDA) #include "taichi/backends/cuda/cuda_driver.h" #include "taichi/backends/cuda/codegen_cuda.h" -#include "taichi/backends/cuda/cuda_driver.h" +#include "taichi/backends/cuda/cuda_context.h" #endif #include "taichi/backends/metal/codegen_metal.h" #include "taichi/backends/opengl/codegen_opengl.h" @@ -98,9 +98,18 @@ Program::Program(Arch desired_arch) { preallocated_device_buffer = nullptr; - if (config.enable_profiler && runtime) { + if (config.kernel_profiler && runtime) { runtime->set_profiler(profiler.get()); } +#if defined(TI_WITH_CUDA) + if (config.arch == Arch::cuda) { + if (config.kernel_profiler) { + CUDAContext::get_instance().set_profiler(profiler.get()); + } else { + CUDAContext::get_instance().set_profiler(nullptr); + } + } +#endif result_buffer = nullptr; current_kernel = nullptr; @@ -255,14 +264,19 @@ void Program::initialize_runtime_system(StructCompiler *scomp) { runtime->call("LLVMRuntime_set_assert_failed", llvm_runtime, (void *)assert_failed_host); - // Profiler functions can only be called on host kernels + } + if (arch_is_cpu(config.arch)) { + // Profiler functions can only be called on CPU kernels runtime->call("LLVMRuntime_set_profiler", llvm_runtime, profiler.get()); runtime->call("LLVMRuntime_set_profiler_start", llvm_runtime, - (void *)&ProfilerBase::profiler_start); + (void *)&KernelProfilerBase::profiler_start); runtime->call("LLVMRuntime_set_profiler_stop", llvm_runtime, - (void *)&ProfilerBase::profiler_stop); + (void *)&KernelProfilerBase::profiler_stop); + } else { + runtime->call("LLVMRuntime_set_profiler", llvm_runtime, + nullptr); } } diff --git a/taichi/program/program.h b/taichi/program/program.h index 8774124022e06..68b103a27eec8 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -15,7 +15,7 @@ #include "taichi/backends/opengl/opengl_kernel_launcher.h" #include "taichi/backends/opengl/opengl_kernel_util.h" #include "taichi/program/kernel.h" -#include "taichi/program/profiler.h" +#include "taichi/program/kernel_profiler.h" #include "taichi/runtime/llvm/context.h" #include "taichi/runtime/runtime.h" #include "taichi/backends/metal/struct_metal.h" @@ -99,7 +99,7 @@ class Program { std::vector> kernels; - std::unique_ptr profiler; + std::unique_ptr profiler; std::unordered_map> jit_evaluator_cache; @@ -110,11 +110,11 @@ class Program { Program(Arch arch); - void profiler_print() { + void kernel_profiler_print() { profiler->print(); } - void profiler_clear() { + void kernel_profiler_clear() { profiler->clear(); } @@ -126,7 +126,7 @@ class Program { profiler->stop(); } - ProfilerBase *get_profiler() { + KernelProfilerBase *get_profiler() { return profiler.get(); } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index b8ae1ca957d01..388f19796304b 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -101,7 +101,7 @@ void export_lang(py::module &m) { .def_readwrite("demote_dense_struct_fors", &CompileConfig::demote_dense_struct_fors) .def_readwrite("use_unified_memory", &CompileConfig::use_unified_memory) - .def_readwrite("enable_profiler", &CompileConfig::enable_profiler) + .def_readwrite("kernel_profiler", &CompileConfig::kernel_profiler) .def_readwrite("default_fp", &CompileConfig::default_fp) .def_readwrite("default_ip", &CompileConfig::default_ip) .def_readwrite("device_memory_GB", &CompileConfig::device_memory_GB) @@ -120,17 +120,8 @@ void export_lang(py::module &m) { py::class_(m, "Program") .def(py::init<>()) .def_readonly("config", &Program::config) - .def("profiler_print", &Program::profiler_print) - .def("profiler_clear", &Program::profiler_clear) - .def("profiler_start", &Program::profiler_start) - .def("profiler_stop", &Program::profiler_stop) - .def("get_profiler", - [](Program *program) -> void * { - // We didn't expose the ProfilerBase interface, so the only purpose - // of this method is to expose the address of the profiler, so that - // other modules (e.g. GUI) can receive the profiler. - return (void *)(program->get_profiler()); - }) + .def("kernel_profiler_print", &Program::kernel_profiler_print) + .def("kernel_profiler_clear", &Program::kernel_profiler_clear) .def("finalize", &Program::finalize) .def("get_root", [&](Program *program) -> SNode * { diff --git a/taichi/python/export_visual.cpp b/taichi/python/export_visual.cpp index 3ad2d66a8534c..93128da96b254 100644 --- a/taichi/python/export_visual.cpp +++ b/taichi/python/export_visual.cpp @@ -32,10 +32,6 @@ void export_visual(py::module &m) { .def("pop_key_event_head", &GUI::pop_key_event_head) .def("get_cursor_pos", &GUI::get_cursor_pos) .def_readwrite("title", &GUI::window_name) - .def("set_profiler", - [](GUI *gui, void *profiler) -> void { - gui->set_profiler((lang::ProfilerBase *)profiler); - }) .def("update", &GUI::update); py::class_(m, "Canvas") .def("clear", static_cast(&Canvas::clear)) diff --git a/taichi/runtime/runtime.h b/taichi/runtime/runtime.h index 9329da2aab646..f2d5714af7f3d 100644 --- a/taichi/runtime/runtime.h +++ b/taichi/runtime/runtime.h @@ -2,7 +2,7 @@ #include "taichi/common/core.h" #include "taichi/program/arch.h" -#include "taichi/program/profiler.h" +#include "taichi/program/kernel_profiler.h" #include #include @@ -12,7 +12,7 @@ TLANG_NAMESPACE_BEGIN class Runtime { protected: - ProfilerBase *profiler; + KernelProfilerBase *profiler; public: Runtime() : profiler(nullptr) { @@ -21,7 +21,7 @@ class Runtime { // Does the machine really have the corresponding hardware? virtual bool detected() = 0; - void set_profiler(ProfilerBase *profiler) { + void set_profiler(KernelProfilerBase *profiler) { this->profiler = profiler; }