diff --git a/benchmarks/fill_dense.py b/benchmarks/fill_dense.py
index a7dd4130d44fa..674c5591df9df 100644
--- a/benchmarks/fill_dense.py
+++ b/benchmarks/fill_dense.py
@@ -128,9 +128,9 @@ def fill():
 # ti.cfg.arch = ti.cuda
 # ti.cfg.print_kernel_llvm_ir_optimized = True
 # ti.cfg.print_kernel_llvm_ir = True
-ti.cfg.enable_profiler = True
+ti.cfg.kernel_profiler = True
 # ti.cfg.verbose_kernel_launches = True
 print(benchmark_nested_struct_listgen_8x8())
 # print(benchmark_root_listgen())
-ti.profiler_print()
+ti.kernel_profiler_print()
 '''
diff --git a/benchmarks/fill_sparse.py b/benchmarks/fill_sparse.py
index 35bbfd9f77ba3..fe563cf3a5669 100644
--- a/benchmarks/fill_sparse.py
+++ b/benchmarks/fill_sparse.py
@@ -43,7 +43,7 @@ def task():
 
 
 '''
-ti.init(arch=ti.cuda, enable_profiler=True)
+ti.init(arch=ti.cuda, kernel_profiler=True)
 benchmark_nested_struct_fill_and_clear()
-ti.profiler_print()
+ti.kernel_profiler_print()
 '''
diff --git a/examples/mgpcg.py b/examples/mgpcg.py
index 542c72f1d2bbd..cd9d264e476ab 100644
--- a/examples/mgpcg.py
+++ b/examples/mgpcg.py
@@ -2,7 +2,7 @@
 import taichi as ti
 
 real = ti.f32
-ti.init(default_fp=real, arch=ti.x64, enable_profiler=True)
+ti.init(default_fp=real, arch=ti.x64, kernel_profiler=True)
 
 # grid parameters
 N = 128
@@ -207,4 +207,4 @@ def paint():
     gui.set_image(pixels)
     gui.show()
 
-ti.profiler_print()
+ti.kernel_profiler_print()
diff --git a/examples/mgpcg_advanced.py b/examples/mgpcg_advanced.py
index d122351c05d70..31a31cf35f0f6 100644
--- a/examples/mgpcg_advanced.py
+++ b/examples/mgpcg_advanced.py
@@ -3,7 +3,7 @@
 import taichi as ti
 
 real = ti.f32
-ti.init(default_fp=real, arch=ti.x64, enable_profiler=True)
+ti.init(default_fp=real, arch=ti.x64, kernel_profiler=True)
 
 
 @ti.data_oriented
@@ -207,7 +207,7 @@ def run(self):
             gui.set_image(self.pixels)
             gui.show()
 
-        ti.profiler_print()
+        ti.kernel_profiler_print()
 
 
 solver = MGPCG()
diff --git a/misc/benchmark_parallel_compilation.py b/misc/benchmark_parallel_compilation.py
index 0d8ebdb4c1285..b4e53e69bdefb 100644
--- a/misc/benchmark_parallel_compilation.py
+++ b/misc/benchmark_parallel_compilation.py
@@ -65,5 +65,5 @@ def substep():
 for i in range(32):
     substep()
 
-ti.profiler_print()
+ti.kernel_profiler_print()
 ti.core.print_profile_info()
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 4c942901bbfcc..cd76ba9ccd7d7 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -35,10 +35,10 @@
 opengl = core.opengl
 gpu = [cuda, metal, opengl]
 cpu = core.host_arch()
-profiler_print = lambda: core.get_current_program().profiler_print()
-profiler_clear = lambda: core.get_current_program().profiler_clear()
-profiler_start = lambda n: core.get_current_program().profiler_start(n)
-profiler_stop = lambda: core.get_current_program().profiler_stop()
+kernel_profiler_print = lambda: core.get_current_program(
+).kernel_profiler_print()
+kernel_profiler_clear = lambda: core.get_current_program(
+).kernel_profiler_clear()
 
 
 class _Extension(object):
diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py
index 7369b8135f559..b9574c3b46a98 100644
--- a/python/taichi/misc/gui.py
+++ b/python/taichi/misc/gui.py
@@ -41,9 +41,6 @@ def __init__(self, name, res=512, background_color=0x0):
         self.key_pressed = set()
         self.event = None
         self.clear()
-        if ti.core.get_current_program():
-            self.core.set_profiler(
-                ti.core.get_current_program().get_profiler())
 
     def __enter__(self):
         return self
diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp
index 23002984ec6dc..3827a1446fb0d 100644
--- a/taichi/backends/cpu/codegen_cpu.cpp
+++ b/taichi/backends/cpu/codegen_cpu.cpp
@@ -50,7 +50,7 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
     stat.add("codegen_offloaded_tasks");
     using Type = OffloadedStmt::TaskType;
     auto offloaded_task_name = init_offloaded_task_function(stmt);
-    if (prog->config.enable_profiler) {
+    if (prog->config.kernel_profiler && arch_is_cpu(prog->config.arch)) {
       call(
           builder.get(), "LLVMRuntime_profiler_start",
           {get_runtime(), builder->CreateGlobalStringPtr(offloaded_task_name)});
@@ -72,7 +72,7 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
     } else {
       TI_NOT_IMPLEMENTED
     }
-    if (prog->config.enable_profiler) {
+    if (prog->config.kernel_profiler && arch_is_cpu(prog->config.arch)) {
       call(builder.get(), "LLVMRuntime_profiler_stop", {get_runtime()});
     }
     finalize_offloaded_task_function();
diff --git a/taichi/backends/cuda/cuda_context.h b/taichi/backends/cuda/cuda_context.h
index 85a080afac868..9736c9ac94620 100644
--- a/taichi/backends/cuda/cuda_context.h
+++ b/taichi/backends/cuda/cuda_context.h
@@ -4,7 +4,7 @@
 #include <unordered_map>
 #include <thread>
 
-#include "taichi/program/profiler.h"
+#include "taichi/program/kernel_profiler.h"
 #include "taichi/backends/cuda/cuda_driver.h"
 
 TLANG_NAMESPACE_BEGIN
@@ -23,7 +23,7 @@ class CUDAContext {
   int dev_count;
   std::string mcpu;
   std::mutex lock;
-  ProfilerBase *profiler;
+  KernelProfilerBase *profiler;
   CUDADriver &driver;
 
   static std::unordered_map<std::thread::id, std::unique_ptr<CUDAContext>>
@@ -45,9 +45,10 @@ class CUDAContext {
               unsigned gridDim,
               unsigned blockDim);
 
-  void set_profiler(ProfilerBase *profiler) {
+  void set_profiler(KernelProfilerBase *profiler) {  // may be nullptr
     this->profiler = profiler;
   }
+
   std::string get_mcpu() const {
     return mcpu;
   }
diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
index fc6ed5e1b26f0..f26dd001c1086 100644
--- a/taichi/backends/metal/kernel_manager.cpp
+++ b/taichi/backends/metal/kernel_manager.cpp
@@ -212,7 +212,7 @@ class CompiledTaichiKernel {
     const SNodeDescriptorsMap *snode_descriptors;
     MTLDevice *device;
     MemoryPool *mem_pool;
-    ProfilerBase *profiler;
+    KernelProfilerBase *profiler;
   };
 
   CompiledTaichiKernel(Params params) : ctx_attribs(*params.ctx_attribs) {
@@ -651,7 +651,7 @@ class KernelManager::Impl {
   CompileConfig *const config_;
   const CompiledStructs compiled_structs_;
   MemoryPool *const mem_pool_;
-  ProfilerBase *const profiler_;
+  KernelProfilerBase *const profiler_;
   nsobj_unique_ptr<MTLDevice> device_;
   nsobj_unique_ptr<MTLCommandQueue> command_queue_;
   nsobj_unique_ptr<MTLCommandBuffer> cur_command_buffer_;
diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h
index 58782606b22d0..46e6ecf2c41e8 100644
--- a/taichi/backends/metal/kernel_manager.h
+++ b/taichi/backends/metal/kernel_manager.h
@@ -7,7 +7,7 @@
 
 #include "taichi/backends/metal/kernel_util.h"
 #include "taichi/lang_util.h"
-#include "taichi/program/profiler.h"
+#include "taichi/program/kernel_profiler.h"
 #include "taichi/backends/metal/struct_metal.h"
 #include "taichi/system/memory_pool.h"
 
@@ -27,7 +27,7 @@ class KernelManager {
     CompiledStructs compiled_structs;
     CompileConfig *config;
     MemoryPool *mem_pool;
-    ProfilerBase *profiler;
+    KernelProfilerBase *profiler;
     int root_id;
   };
 
diff --git a/taichi/gui/gui.h b/taichi/gui/gui.h
index 3eb9c8fcd6d53..59687cce4bea2 100644
--- a/taichi/gui/gui.h
+++ b/taichi/gui/gui.h
@@ -2,7 +2,7 @@
 
 #include "taichi/math/math.h"
 #include "taichi/system/timer.h"
-#include "taichi/program/profiler.h"
+#include "taichi/program/kernel_profiler.h"
 
 #include <atomic>
 #include <ctime>
@@ -488,7 +488,6 @@ class GUI : public GUIBase {
   Vector2i cursor_pos;
   bool button_status[3];
   int widget_height;
-  lang::ProfilerBase *profiler;
 
   void set_mouse_pos(int x, int y) {
     cursor_pos = Vector2i(x, y);
@@ -886,10 +885,6 @@ class GUI : public GUIBase {
   }
 
   ~GUI();
-
-  void set_profiler(lang::ProfilerBase *profiler) {
-    this->profiler = profiler;
-  }
 };
 
 TI_NAMESPACE_END
diff --git a/taichi/jit/jit_module.h b/taichi/jit/jit_module.h
index 9f979efbb15f7..82794c94955a0 100644
--- a/taichi/jit/jit_module.h
+++ b/taichi/jit/jit_module.h
@@ -6,7 +6,7 @@
 #include "taichi/inc/constants.h"
 #include "taichi/llvm/llvm_fwd.h"
 #include "taichi/lang_util.h"
-#include "taichi/program/profiler.h"
+#include "taichi/program/kernel_profiler.h"
 
 TLANG_NAMESPACE_BEGIN
 
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index 3627bc420f7f4..68fd71360eb95 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -29,7 +29,7 @@ CompileConfig::CompileConfig() {
   default_fp = DataType::f32;
   default_ip = DataType::i32;
   verbose_kernel_launches = false;
-  enable_profiler = false;
+  kernel_profiler = false;
   default_cpu_block_dim = 0;  // 0 = adaptive
   default_gpu_block_dim = 64;
   verbose = true;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index e7a9f68e5bba4..4722ab4110633 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -32,7 +32,7 @@ struct CompileConfig {
   bool print_kernel_llvm_ir_optimized;
   bool print_kernel_nvptx;
   bool verbose_kernel_launches;
-  bool enable_profiler;
+  bool kernel_profiler;
   bool verbose;
   bool fast_math;
   bool use_unified_memory;
diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index 5b17e64a9ca34..6c438000d0d98 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -90,13 +90,17 @@ void Kernel::operator()() {
     }
     compiled(program.get_context());
     program.sync = (program.sync && arch_is_cpu(arch));
-    if (program.config.debug && arch_is_cpu(arch)) {
+    // Note that Kernel::arch may be different from program.config.arch
+    if (program.config.debug && arch_is_cpu(arch) &&
+        arch_is_cpu(program.config.arch)) {
       program.check_runtime_error();
     }
   } else {
     program.sync = false;
     program.async_engine->launch(this);
-    if (program.config.debug && arch_is_cpu(arch)) {
+    // Note that Kernel::arch may be different from program.config.arch
+    if (program.config.debug && arch_is_cpu(arch) &&
+        arch_is_cpu(program.config.arch)) {
       program.check_runtime_error();
     }
   }
diff --git a/taichi/program/profiler.cpp b/taichi/program/kernel_profiler.cpp
similarity index 77%
rename from taichi/program/profiler.cpp
rename to taichi/program/kernel_profiler.cpp
index bda1ad04e4a62..55df70d6f6967 100644
--- a/taichi/program/profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -1,11 +1,11 @@
-#include "profiler.h"
+#include "kernel_profiler.h"
 
 #include "taichi/system/timer.h"
 #include "taichi/backends/cuda/cuda_driver.h"
 
 TLANG_NAMESPACE_BEGIN
 
-void ProfileRecord::insert_sample(double t) {
+void KernelProfileRecord::insert_sample(double t) {
   if (counter == 0) {
     min = t;
     max = t;
@@ -16,21 +16,23 @@ void ProfileRecord::insert_sample(double t) {
   total += t;
 }
 
-void ProfilerBase::profiler_start(ProfilerBase *profiler,
-                                  const char *kernel_name) {
+void KernelProfilerBase::profiler_start(KernelProfilerBase *profiler,
+                                        const char *kernel_name) {
+  TI_ASSERT(profiler);  // guards the dereference on the next line
   profiler->start(std::string(kernel_name));
 }
 
-void ProfilerBase::profiler_stop(ProfilerBase *profiler) {
+void KernelProfilerBase::profiler_stop(KernelProfilerBase *profiler) {
+  TI_ASSERT(profiler);  // guards the dereference on the next line
   profiler->stop();
 }
 
-void ProfilerBase::print() {
+void KernelProfilerBase::print() {
   sync();
   printf("%s\n", title().c_str());
   for (auto &rec : records) {
     printf(
-        "[%6.2f%%] %-40s     min %7.3f ms   avg %7.3f ms    max %7.3f ms   "
+        "[%6.2f%%] %-40s    min %7.3f ms   avg %7.3f ms   max %7.3f ms   "
         "total %7.3f s [%7dx]\n",
         rec.total / total_time * 100.0f, rec.name.c_str(), rec.min,
         rec.total / rec.counter, rec.max, rec.total / 1000.0f, rec.counter);
@@ -39,7 +41,7 @@ void ProfilerBase::print() {
 
 namespace {
 // A simple profiler that uses Time::get_time()
-class DefaultProfiler : public ProfilerBase {
+class DefaultProfiler : public KernelProfilerBase {
  public:
   explicit DefaultProfiler(Arch arch)
       : title_(fmt::format("{} Profiler", arch_name(arch))) {
@@ -60,9 +62,9 @@ class DefaultProfiler : public ProfilerBase {
   void stop() override {
     auto t = Time::get_time() - start_t_;
     auto ms = t * 1000.0;
-    auto it =
-        std::find_if(records.begin(), records.end(),
-                     [&](ProfileRecord &r) { return r.name == event_name_; });
+    auto it = std::find_if(
+        records.begin(), records.end(),
+        [&](KernelProfileRecord &r) { return r.name == event_name_; });
     if (it == records.end()) {
       records.emplace_back(event_name_);
       it = std::prev(records.end());
@@ -78,7 +80,7 @@ class DefaultProfiler : public ProfilerBase {
 };
 
 // A CUDA kernel profiler that uses CUDA timing events
-class CUDAProfiler : public ProfilerBase {
+class KernelProfilerCUDA : public KernelProfilerBase {
  public:
 #if defined(TI_WITH_CUDA)
   void *current_stop;
@@ -123,7 +125,7 @@ class CUDAProfiler : public ProfilerBase {
         CUDADriver::get_instance().event_elapsed_time(&ms, start, stop);
         auto it = std::find_if(
             records.begin(), records.end(),
-            [&](ProfileRecord &r) { return r.name == map_elem.first; });
+            [&](KernelProfileRecord &r) { return r.name == map_elem.first; });
         if (it == records.end()) {
           records.emplace_back(map_elem.first);
           it = std::prev(records.end());
@@ -138,19 +140,19 @@ class CUDAProfiler : public ProfilerBase {
 #endif
   }
 
-  static CUDAProfiler &get_instance() {
-    static CUDAProfiler profiler;
+  static KernelProfilerCUDA &get_instance() {
+    static KernelProfilerCUDA profiler;
     return profiler;
   }
 };
 }  // namespace
 
-std::unique_ptr<ProfilerBase> make_profiler(Arch arch) {
+std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch) {
   if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal ||
       arch == Arch::opengl) {
     return std::make_unique<DefaultProfiler>(arch);
   } else if (arch == Arch::cuda) {
-    return std::make_unique<CUDAProfiler>();
+    return std::make_unique<KernelProfilerCUDA>();
   } else {
     TI_NOT_IMPLEMENTED;
   }
diff --git a/taichi/program/profiler.h b/taichi/program/kernel_profiler.h
similarity index 62%
rename from taichi/program/profiler.h
rename to taichi/program/kernel_profiler.h
index d02d7d59f2fb8..18c56669922e7 100644
--- a/taichi/program/profiler.h
+++ b/taichi/program/kernel_profiler.h
@@ -11,23 +11,23 @@
 
 TLANG_NAMESPACE_BEGIN
 
-struct ProfileRecord {
+struct KernelProfileRecord {  // timing stats for one named event (ms)
   std::string name;
   int counter;
   double min;
   double max;
   double total;
 
-  ProfileRecord(const std::string &name)
+  KernelProfileRecord(const std::string &name)
       : name(name), counter(0), min(0), max(0), total(0) {
   }
 
   void insert_sample(double t);
 };
 
-class ProfilerBase {
+class KernelProfilerBase {
  protected:
-  std::vector<ProfileRecord> records;
+  std::vector<KernelProfileRecord> records;
   double total_time;
 
  public:
@@ -42,18 +42,19 @@ class ProfilerBase {
 
   virtual void start(const std::string &kernel_name) = 0;
 
-  static void profiler_start(ProfilerBase *profiler, const char *kernel_name);
+  static void profiler_start(KernelProfilerBase *profiler,
+                             const char *kernel_name);
 
   virtual void stop() = 0;
 
-  static void profiler_stop(ProfilerBase *profiler);
+  static void profiler_stop(KernelProfilerBase *profiler);
 
   void print();
 
-  virtual ~ProfilerBase() {
+  virtual ~KernelProfilerBase() {
   }
 };
 
-std::unique_ptr<ProfilerBase> make_profiler(Arch arch);
+std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch);
 
 TLANG_NAMESPACE_END
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 28e7dd0b05cac..5269759ea969d 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -8,7 +8,7 @@
 #if defined(TI_WITH_CUDA)
 #include "taichi/backends/cuda/cuda_driver.h"
 #include "taichi/backends/cuda/codegen_cuda.h"
-#include "taichi/backends/cuda/cuda_driver.h"
+#include "taichi/backends/cuda/cuda_context.h"
 #endif
 #include "taichi/backends/metal/codegen_metal.h"
 #include "taichi/backends/opengl/codegen_opengl.h"
@@ -98,9 +98,18 @@ Program::Program(Arch desired_arch) {
 
   preallocated_device_buffer = nullptr;
 
-  if (config.enable_profiler && runtime) {
+  if (config.kernel_profiler && runtime) {
     runtime->set_profiler(profiler.get());
   }
+#if defined(TI_WITH_CUDA)
+  if (config.arch == Arch::cuda) {
+    if (config.kernel_profiler) {
+      CUDAContext::get_instance().set_profiler(profiler.get());
+    } else {
+      CUDAContext::get_instance().set_profiler(nullptr);
+    }
+  }
+#endif
 
   result_buffer = nullptr;
   current_kernel = nullptr;
@@ -255,14 +264,19 @@ void Program::initialize_runtime_system(StructCompiler *scomp) {
 
     runtime->call<void *, void *>("LLVMRuntime_set_assert_failed", llvm_runtime,
                                   (void *)assert_failed_host);
-    // Profiler functions can only be called on host kernels
+  }
+  if (arch_is_cpu(config.arch)) {
+    // Profiler functions can only be called on CPU kernels
     runtime->call<void *, void *>("LLVMRuntime_set_profiler", llvm_runtime,
                                   profiler.get());
     runtime->call<void *, void *>("LLVMRuntime_set_profiler_start",
                                   llvm_runtime,
-                                  (void *)&ProfilerBase::profiler_start);
+                                  (void *)&KernelProfilerBase::profiler_start);
     runtime->call<void *, void *>("LLVMRuntime_set_profiler_stop", llvm_runtime,
-                                  (void *)&ProfilerBase::profiler_stop);
+                                  (void *)&KernelProfilerBase::profiler_stop);
+  } else {
+    runtime->call<void *, void *>("LLVMRuntime_set_profiler", llvm_runtime,
+                                  nullptr);
   }
 }
 
diff --git a/taichi/program/program.h b/taichi/program/program.h
index 8774124022e06..68b103a27eec8 100644
--- a/taichi/program/program.h
+++ b/taichi/program/program.h
@@ -15,7 +15,7 @@
 #include "taichi/backends/opengl/opengl_kernel_launcher.h"
 #include "taichi/backends/opengl/opengl_kernel_util.h"
 #include "taichi/program/kernel.h"
-#include "taichi/program/profiler.h"
+#include "taichi/program/kernel_profiler.h"
 #include "taichi/runtime/llvm/context.h"
 #include "taichi/runtime/runtime.h"
 #include "taichi/backends/metal/struct_metal.h"
@@ -99,7 +99,7 @@ class Program {
 
   std::vector<std::unique_ptr<Kernel>> kernels;
 
-  std::unique_ptr<ProfilerBase> profiler;
+  std::unique_ptr<KernelProfilerBase> profiler;
 
   std::unordered_map<JITEvaluatorId, std::unique_ptr<Kernel>>
       jit_evaluator_cache;
@@ -110,11 +110,11 @@ class Program {
 
   Program(Arch arch);
 
-  void profiler_print() {
+  void kernel_profiler_print() {  // bound to Python in export_lang.cpp
     profiler->print();
   }
 
-  void profiler_clear() {
+  void kernel_profiler_clear() {  // bound to Python in export_lang.cpp
     profiler->clear();
   }
 
@@ -126,7 +126,7 @@ class Program {
     profiler->stop();
   }
 
-  ProfilerBase *get_profiler() {
+  KernelProfilerBase *get_profiler() {
     return profiler.get();
   }
 
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index b8ae1ca957d01..388f19796304b 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -101,7 +101,7 @@ void export_lang(py::module &m) {
       .def_readwrite("demote_dense_struct_fors",
                      &CompileConfig::demote_dense_struct_fors)
       .def_readwrite("use_unified_memory", &CompileConfig::use_unified_memory)
-      .def_readwrite("enable_profiler", &CompileConfig::enable_profiler)
+      .def_readwrite("kernel_profiler", &CompileConfig::kernel_profiler)
       .def_readwrite("default_fp", &CompileConfig::default_fp)
       .def_readwrite("default_ip", &CompileConfig::default_ip)
       .def_readwrite("device_memory_GB", &CompileConfig::device_memory_GB)
@@ -120,17 +120,8 @@ void export_lang(py::module &m) {
   py::class_<Program>(m, "Program")
       .def(py::init<>())
       .def_readonly("config", &Program::config)
-      .def("profiler_print", &Program::profiler_print)
-      .def("profiler_clear", &Program::profiler_clear)
-      .def("profiler_start", &Program::profiler_start)
-      .def("profiler_stop", &Program::profiler_stop)
-      .def("get_profiler",
-           [](Program *program) -> void * {
-             // We didn't expose the ProfilerBase interface, so the only purpose
-             // of this method is to expose the address of the profiler, so that
-             // other modules (e.g. GUI) can receive the profiler.
-             return (void *)(program->get_profiler());
-           })
+      .def("kernel_profiler_print", &Program::kernel_profiler_print)
+      .def("kernel_profiler_clear", &Program::kernel_profiler_clear)
       .def("finalize", &Program::finalize)
       .def("get_root",
            [&](Program *program) -> SNode * {
diff --git a/taichi/python/export_visual.cpp b/taichi/python/export_visual.cpp
index 3ad2d66a8534c..93128da96b254 100644
--- a/taichi/python/export_visual.cpp
+++ b/taichi/python/export_visual.cpp
@@ -32,10 +32,6 @@ void export_visual(py::module &m) {
       .def("pop_key_event_head", &GUI::pop_key_event_head)
       .def("get_cursor_pos", &GUI::get_cursor_pos)
       .def_readwrite("title", &GUI::window_name)
-      .def("set_profiler",
-           [](GUI *gui, void *profiler) -> void {
-             gui->set_profiler((lang::ProfilerBase *)profiler);
-           })
       .def("update", &GUI::update);
   py::class_<Canvas>(m, "Canvas")
       .def("clear", static_cast<void (Canvas::*)(uint32)>(&Canvas::clear))
diff --git a/taichi/runtime/runtime.h b/taichi/runtime/runtime.h
index 9329da2aab646..f2d5714af7f3d 100644
--- a/taichi/runtime/runtime.h
+++ b/taichi/runtime/runtime.h
@@ -2,7 +2,7 @@
 
 #include "taichi/common/core.h"
 #include "taichi/program/arch.h"
-#include "taichi/program/profiler.h"
+#include "taichi/program/kernel_profiler.h"
 
 #include <map>
 #include <memory>
@@ -12,7 +12,7 @@ TLANG_NAMESPACE_BEGIN
 
 class Runtime {
  protected:
-  ProfilerBase *profiler;
+  KernelProfilerBase *profiler;
 
  public:
   Runtime() : profiler(nullptr) {
@@ -21,7 +21,7 @@ class Runtime {
   // Does the machine really have the corresponding hardware?
   virtual bool detected() = 0;
 
-  void set_profiler(ProfilerBase *profiler) {
+  void set_profiler(KernelProfilerBase *profiler) {
     this->profiler = profiler;
   }