From 0c3358eae2b2305be991b529ba98b80651860f16 Mon Sep 17 00:00:00 2001
From: Yuanming Hu <yuanmhu@gmail.com>
Date: Thu, 27 Feb 2020 23:38:08 -0500
Subject: [PATCH] removed common.h/cpp; added compile_config.h/cpp; cleanups

---
 taichi/arch.cpp                          |  11 ++
 taichi/arch.h                            |   5 +-
 taichi/codegen/codegen_llvm_cuda.cpp     |   1 -
 taichi/codegen/codegen_metal.h           |   1 -
 taichi/codegen/codegen_opengl.h          |   5 +-
 taichi/common.cpp                        |   3 -
 taichi/common.h                          | 132 -----------------------
 taichi/compile_config.cpp                |  33 ++++++
 taichi/compile_config.h                  |  40 +++++++
 taichi/constants.h                       |   1 +
 taichi/context.h                         |   2 +-
 taichi/extension.h                       |   4 +-
 taichi/ir/ir.cpp                         |   2 +-
 taichi/ir/ir.h                           |   9 +-
 taichi/ir/snode.cpp                      |  29 -----
 taichi/ir/snode.h                        |  37 ++-----
 taichi/jit/jit_arch_cuda.cpp             |   2 +-
 taichi/lang_util.cpp                     |  45 +-------
 taichi/lang_util.h                       |  48 ++-------
 taichi/platform/metal/metal_api.h        |   5 +-
 taichi/platform/opengl/opengl_api.h      |   1 -
 taichi/platform/opengl/opengl_kernel.h   |   1 -
 taichi/profiler.h                        |   6 +-
 taichi/program.cpp                       |   2 +-
 taichi/python/export_lang.cpp            |   6 +-
 taichi/struct/struct.cpp                 |  10 +-
 taichi/struct/struct_llvm.cpp            |   4 +-
 taichi/system/memory_pool.h              |  15 +--
 taichi/system/unified_allocator.h        |   2 +-
 taichi/transforms/insert_scratch_pad.cpp |   2 +-
 taichi/transforms/lower_access.cpp       |   2 +-
 31 files changed, 150 insertions(+), 316 deletions(-)
 delete mode 100644 taichi/common.cpp
 delete mode 100644 taichi/common.h
 create mode 100644 taichi/compile_config.cpp
 create mode 100644 taichi/compile_config.h
diff --git a/taichi/arch.cpp b/taichi/arch.cpp
index 37354407a5439..0cd23c4fdf383 100644
--- a/taichi/arch.cpp
+++ b/taichi/arch.cpp
@@ -48,4 +48,15 @@ bool arch_use_host_memory(Arch arch) {
   return arch_is_cpu(arch);
 }
 
+// Default SIMD width for `arch`: 8 on x64, 32 on cuda.
+int default_simd_width(Arch arch) {
+  if (arch == Arch::x64)
+    return 8;
+  if (arch == Arch::cuda)
+    return 32;
+  // No default is defined for any other arch.
+  TI_NOT_IMPLEMENTED;
+  return -1;
+}
+
 TLANG_NAMESPACE_END
diff --git a/taichi/arch.h b/taichi/arch.h
index fc4200c4ec698..b89783850f278 100644
--- a/taichi/arch.h
+++ b/taichi/arch.h
@@ -1,8 +1,7 @@
 #pragma once
 
 #include <string>
-#include <taichi/common/util.h>
-#include <taichi/common.h>
+#include "taichi/lang_util.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -22,4 +21,6 @@ bool arch_is_gpu(Arch arch);
 
 bool arch_use_host_memory(Arch arch);
 
+int default_simd_width(Arch arch);
+
 TLANG_NAMESPACE_END
diff --git a/taichi/codegen/codegen_llvm_cuda.cpp b/taichi/codegen/codegen_llvm_cuda.cpp
index a949b4567e723..8df2579b73707 100644
--- a/taichi/codegen/codegen_llvm_cuda.cpp
+++ b/taichi/codegen/codegen_llvm_cuda.cpp
@@ -86,7 +86,6 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
     for (auto &task : offloaded_local) {
       task.cuda_func = cuda_module->lookup_function(task.name);
     }
-    auto prog = this->prog;
     return [offloaded_local, cuda_module](Context context) {
       for (auto task : offloaded_local) {
         TI_DEBUG("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
diff --git a/taichi/codegen/codegen_metal.h b/taichi/codegen/codegen_metal.h
index 6b212cabb6159..52f89e25d60b9 100644
--- a/taichi/codegen/codegen_metal.h
+++ b/taichi/codegen/codegen_metal.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <taichi/common.h>
 #include <taichi/constants.h>
 #include <taichi/platform/metal/metal_data_types.h>
 #include <taichi/platform/metal/metal_kernel_util.h>
diff --git a/taichi/codegen/codegen_opengl.h b/taichi/codegen/codegen_opengl.h
index 7a558825bfa7b..3c9c62db84dc1 100644
--- a/taichi/codegen/codegen_opengl.h
+++ b/taichi/codegen/codegen_opengl.h
@@ -1,8 +1,7 @@
 #pragma once
 
-#include <taichi/common.h>
-#include <taichi/constants.h>
-#include <taichi/lang_util.h>
+#include "taichi/constants.h"
+#include "taichi/lang_util.h"
 
 #include <string>
 #include <unordered_map>
diff --git a/taichi/common.cpp b/taichi/common.cpp
deleted file mode 100644
index ff3f606886f70..0000000000000
--- a/taichi/common.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "lang_util.h"
-#include <taichi/common.h>
-#include <taichi/system/unified_allocator.h>
diff --git a/taichi/common.h b/taichi/common.h
deleted file mode 100644
index f2bc3be219cdc..0000000000000
--- a/taichi/common.h
+++ /dev/null
@@ -1,132 +0,0 @@
-#pragma once
-#include "constants.h"
-
-#define FUNC_DECL
-
-#define TLANG_NAMESPACE_BEGIN \
-  namespace taichi {          \
-  namespace Tlang {
-#define TLANG_NAMESPACE_END \
-  }                         \
-  }
-
-#include <atomic>
-#include <numeric>
-#include <mutex>
-#include <unordered_map>
-#include <iostream>
-
-#if !defined(TI_INCLUDED)
-
-#ifdef _WIN64
-#define TI_FORCE_INLINE __forceinline
-#else
-#define TI_FORCE_INLINE inline __attribute__((always_inline))
-#endif
-#include <cstdio>
-#include <string>
-#include <cstdlib>
-#include <cstring>
-#include <cmath>
-#include <iostream>
-#include <array>
-#include <vector>
-
-using float32 = float;
-using float64 = double;
-using uint8 = std::uint8_t;
-using uint16 = std::uint16_t;
-using uint32 = std::uint32_t;
-using uint64 = std::uint64_t;
-using int8 = std::int8_t;
-using int16 = std::int16_t;
-using int32 = std::int32_t;
-using int64 = std::int64_t;
-
-namespace taichi {
-TI_FORCE_INLINE uint32 rand_int() noexcept {
-  static unsigned int x = 123456789, y = 362436069, z = 521288629, w = 88675123;
-  unsigned int t = x ^ (x << 11);
-  x = y;
-  y = z;
-  z = w;
-  return (w = (w ^ (w >> 19)) ^ (t ^ (t >> 8)));
-}
-
-TI_FORCE_INLINE uint64 rand_int64() noexcept {
-  return ((uint64)rand_int() << 32) + rand_int();
-}
-
-template <typename T>
-TI_FORCE_INLINE T rand() noexcept;
-
-template <>
-TI_FORCE_INLINE float rand<float>() noexcept {
-  return rand_int() * (1.0f / 4294967296.0f);
-}
-
-template <>
-TI_FORCE_INLINE double rand<double>() noexcept {
-  return rand_int() * (1.0 / 4294967296.0);
-}
-
-template <>
-TI_FORCE_INLINE int rand<int>() noexcept {
-  return rand_int();
-}
-
-template <typename T>
-TI_FORCE_INLINE T rand() noexcept;
-}  // namespace taichi
-
-#endif
-
-TLANG_NAMESPACE_BEGIN
-
-using size_t = std::size_t;
-
-constexpr int max_num_indices = taichi_max_num_indices;
-constexpr int max_num_args = taichi_max_num_args;
-constexpr int max_num_snodes = taichi_max_num_snodes;
-constexpr int max_gpu_block_dim = 1024;
-
-struct SNodeMeta {
-  int indices[max_num_indices];
-  int active;
-  int start_loop;
-  int end_loop;
-  int _;
-  void **snode_ptr;
-  void *ptr;
-};
-
-struct AllocatorStat {
-  int snode_id;
-  size_t pool_size;
-  size_t num_resident_blocks;
-  size_t num_recycled_blocks;
-  SNodeMeta *resident_metas;
-};
-
-template <typename T, typename G>
-T union_cast(G g) {
-  static_assert(sizeof(T) == sizeof(G), "");
-  union {
-    T t;
-    G g;
-  } u;
-  u.g = g;
-  return u.t;
-}
-
-template <typename T, typename G>
-T union_cast_different_size(G g) {
-  union {
-    T t;
-    G g;
-  } u;
-  u.g = g;
-  return u.t;
-}
-
-TLANG_NAMESPACE_END
diff --git a/taichi/compile_config.cpp b/taichi/compile_config.cpp
new file mode 100644
index 0000000000000..b240d16ae15c2
--- /dev/null
+++ b/taichi/compile_config.cpp
@@ -0,0 +1,40 @@
+#include "taichi/compile_config.h"
+
+TLANG_NAMESPACE_BEGIN
+
+// Assigns the default value of every option. `extra_flags` is left as the
+// default-constructed (empty) string.
+CompileConfig::CompileConfig() {
+  arch = Arch::x64;
+  simd_width = default_simd_width(arch);
+  external_optimization_level = 3;
+  print_ir = false;
+  print_accessor_ir = false;
+  use_llvm = true;
+  print_struct_llvm_ir = false;
+  print_kernel_llvm_ir = false;
+  print_kernel_llvm_ir_optimized = false;
+  demote_dense_struct_fors = true;
+  max_vector_width = 8;
+  debug = false;
+  lazy_compilation = true;
+  serial_schedule = false;
+  simplify_before_lower_access = true;
+  lower_access = true;
+  simplify_after_lower_access = true;
+  default_fp = DataType::f32;
+  default_ip = DataType::i32;
+  verbose_kernel_launches = false;
+  enable_profiler = false;
+  default_cpu_block_dim = 0;  // 0 = adaptive
+  default_gpu_block_dim = 64;
+  verbose = true;
+  fast_math = true;
+  // Previously never assigned, so automatic/heap instances held an
+  // indeterminate value. `false` is what the zero-initialized namespace-scope
+  // `default_compile_config` already observed before this constructor ran.
+  // NOTE(review): confirm `false` is the intended default.
+  use_unified_memory = false;
+}
+
+TLANG_NAMESPACE_END
diff --git a/taichi/compile_config.h b/taichi/compile_config.h
new file mode 100644
index 0000000000000..1fd5b1b2c6a37
--- /dev/null
+++ b/taichi/compile_config.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "taichi/lang_util.h"
+#include "taichi/arch.h"
+
+#include <string>
+
+TLANG_NAMESPACE_BEGIN
+
+// Compilation options. The default value of each field is assigned in
+// CompileConfig::CompileConfig() (see compile_config.cpp).
+struct CompileConfig {
+  Arch arch;
+  bool debug;
+  int simd_width;
+  bool lazy_compilation;
+  int external_optimization_level;
+  int max_vector_width;
+  bool print_ir;
+  bool print_accessor_ir;
+  bool serial_schedule;
+  bool simplify_before_lower_access;
+  bool lower_access;
+  bool simplify_after_lower_access;
+  bool demote_dense_struct_fors;
+  bool use_llvm;
+  bool print_struct_llvm_ir;
+  bool print_kernel_llvm_ir;
+  bool print_kernel_llvm_ir_optimized;
+  bool verbose_kernel_launches;
+  bool enable_profiler;
+  bool verbose;
+  bool fast_math;
+  bool use_unified_memory;
+  DataType default_fp;
+  DataType default_ip;
+  std::string extra_flags;
+  int default_cpu_block_dim;
+  int default_gpu_block_dim;
+
+  CompileConfig();
+};
+
+// Shared default configuration object; declared here, defined elsewhere.
+extern CompileConfig default_compile_config;
+
+TLANG_NAMESPACE_END
diff --git a/taichi/constants.h b/taichi/constants.h
index 954970ec8c774..ab454c632d67e 100644
--- a/taichi/constants.h
+++ b/taichi/constants.h
@@ -5,6 +5,7 @@
 constexpr int taichi_max_num_indices = 8;
 constexpr int taichi_max_num_args = 8;
 constexpr int taichi_max_num_snodes = 1024;
+constexpr int taichi_max_gpu_block_dim = 1024;
 constexpr std::size_t taichi_global_tmp_buffer_size = 1024 * 1024;
 constexpr int taichi_max_num_mem_requests = 1024 * 64;
 constexpr std::size_t taichi_page_size = 4096;
diff --git a/taichi/context.h b/taichi/context.h
index d6df0c9536d8f..71acfae1a4346 100644
--- a/taichi/context.h
+++ b/taichi/context.h
@@ -3,7 +3,7 @@
 #include "constants.h"
 
 #if defined(TI_RUNTIME_HOST)
-#include "common.h"
+#include "taichi/constants.h"
 
 namespace taichi::Tlang {
 using namespace taichi;
diff --git a/taichi/extension.h b/taichi/extension.h
index 97f7b653e4d2f..6548d48fdea18 100644
--- a/taichi/extension.h
+++ b/taichi/extension.h
@@ -1,8 +1,8 @@
 #pragma once
 
+#include "taichi/arch.h"
+
 #include <string>
-#include <taichi/common.h>
-#include <taichi/arch.h>
 
 TLANG_NAMESPACE_BEGIN
 
diff --git a/taichi/ir/ir.cpp b/taichi/ir/ir.cpp
index 410bb5d9f7d84..7a6c93127d54e 100644
--- a/taichi/ir/ir.cpp
+++ b/taichi/ir/ir.cpp
@@ -222,7 +222,7 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
   if (cfg.arch == Arch::cuda) {
     vectorize = 1;
     parallelize = 1;
-    TI_ASSERT(block_dim <= max_gpu_block_dim);
+    TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {
     // cpu
     if (block_dim == 0)
diff --git a/taichi/ir/ir.h b/taichi/ir/ir.h
index f990bee17a8fb..07619b3e0c220 100644
--- a/taichi/ir/ir.h
+++ b/taichi/ir/ir.h
@@ -4,11 +4,12 @@
 
 #include <atomic>
 #include <unordered_map>
-#include <taichi/common/util.h>
-#include <taichi/common/bit.h>
+#include "taichi/common/util.h"
+#include "taichi/common/bit.h"
 #include "taichi/lang_util.h"
-#include "snode.h"
-#include "expr.h"
+#include "taichi/ir/snode.h"
+#include "taichi/ir/expr.h"
+#include "taichi/compile_config.h"
 #include "taichi/llvm/llvm_fwd.h"
 
 TLANG_NAMESPACE_BEGIN
diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp
index 94cfcb45544a8..fc774cc8814ce 100644
--- a/taichi/ir/snode.cpp
+++ b/taichi/ir/snode.cpp
@@ -60,32 +60,6 @@ SNode &SNode::create_node(std::vector<Index> indices,
   return new_node;
 }
 
-void SNode::clear_data() {
-  if (clear_func == nullptr) {
-    if (clear_kernel == nullptr) {
-      clear_kernel = &kernel([&]() {
-        current_ast_builder().insert(Stmt::make<ClearAllStmt>(this, false));
-      });
-    }
-    (*(Kernel *)clear_kernel)();
-  } else {
-    clear_func(0);
-  }
-}
-
-void SNode::clear_data_and_deactivate() {
-  if (clear_func == nullptr) {
-    if (clear_and_deactivate_kernel == nullptr) {
-      clear_and_deactivate_kernel = &kernel([&]() {
-        current_ast_builder().insert(Stmt::make<ClearAllStmt>(this, true));
-      });
-    }
-    (*(Kernel *)clear_and_deactivate_kernel)();
-  } else {
-    clear_func(1);
-  }
-}
-
 void SNode::lazy_grad() {
   if (this->type == SNodeType::place)
     return;
@@ -223,8 +197,6 @@ SNode::SNode(int depth, SNodeType t) : depth(depth), type(t) {
   num_active_indices = 0;
   std::memset(taken_bits, 0, sizeof(taken_bits));
   std::memset(physical_index_position, -1, sizeof(physical_index_position));
-  access_func = nullptr;
-  stat_func = nullptr;
   parent = nullptr;
   _verbose = false;
   _multi_threaded = false;
@@ -234,7 +206,6 @@ SNode::SNode(int depth, SNodeType t) : depth(depth), type(t) {
   _morton = false;
   _bitmasked = false;
 
-  clear_func = nullptr;
   clear_kernel = nullptr;
   clear_and_deactivate_kernel = nullptr;
 
diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h
index aac4970ed85bd..a7e21af32e18e 100644
--- a/taichi/ir/snode.h
+++ b/taichi/ir/snode.h
@@ -1,8 +1,10 @@
 #pragma once
+
 #include "taichi/lang_util.h"
+#include "taichi/common/bit.h"
 #include "taichi/llvm/llvm_fwd.h"
-#include "expr.h"
-#include <taichi/common/bit.h>
+#include "taichi/ir/expr.h"
+#include "taichi/constants.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -41,9 +43,9 @@ class Index {
     value = 0;
   }
   Index(int value) : value(value) {
-    TI_ERROR_UNLESS(0 <= value && value < max_num_indices,
+    TI_ERROR_UNLESS(0 <= value && value < taichi_max_num_indices,
                     "Too many dimensions. The maximum dimensionality is {}",
-                    max_num_indices);
+                    taichi_max_num_indices);
   }
 };
 
@@ -53,10 +55,10 @@ class SNode {
   // Children
   std::vector<std::shared_ptr<SNode>> ch;
 
-  IndexExtractor extractors[max_num_indices];
-  int taken_bits[max_num_indices]{};  // counting from the tail
+  IndexExtractor extractors[taichi_max_num_indices];
+  int taken_bits[taichi_max_num_indices]{};  // counting from the tail
   int num_active_indices{};
-  int physical_index_position[max_num_indices]{};
+  int physical_index_position[taichi_max_num_indices]{};
   // physical indices are (ti.i, ti.j, ti.k, ti.l, ...)
   // physical_index_position[i] =
   // which physical index does the i-th virtual index (the one exposed to
@@ -86,12 +88,6 @@ class SNode {
     return Tlang::data_type_name(dt);
   }
 
-  using AccessorFunction = std::function<void *(void *, int, int, int, int)>;
-  using StatFunction = std::function<AllocatorStat()>;
-  using ClearFunction = std::function<void(int)>;
-  AccessorFunction access_func;
-  StatFunction stat_func;
-  ClearFunction clear_func;
   void *clear_kernel{}, *clear_and_deactivate_kernel{};
 
   std::string node_type_name;
@@ -223,12 +219,6 @@ class SNode {
     return *this;
   }
 
-  void *evaluate(void *ds, int i, int j, int k, int l) {
-    TI_ASSERT(access_func);
-    TI_ASSERT(max_num_indices == 4);
-    return access_func(ds, i, j, k, l);
-  }
-
   // for float and double
   void write_float(const std::vector<int> &I, float64);
   float64 read_float(const std::vector<int> &I);
@@ -238,11 +228,6 @@ class SNode {
   int64 read_int(const std::vector<int> &I);
   uint64 read_uint(const std::vector<int> &I);
 
-  TI_FORCE_INLINE AllocatorStat stat() {
-    TI_ASSERT(stat_func);
-    return stat_func();
-  }
-
   int child_id(SNode *c) {
     for (int i = 0; i < (int)ch.size(); i++) {
       if (ch[i].get() == c) {
@@ -252,10 +237,6 @@ class SNode {
     return -1;
   }
 
-  void clear_data();
-
-  void clear_data_and_deactivate();
-
   bool has_null() const {
     return type == SNodeType::pointer || type == SNodeType::hash;
   }
diff --git a/taichi/jit/jit_arch_cuda.cpp b/taichi/jit/jit_arch_cuda.cpp
index 9aa046dcbb700..158c83dce55b5 100644
--- a/taichi/jit/jit_arch_cuda.cpp
+++ b/taichi/jit/jit_arch_cuda.cpp
@@ -58,7 +58,7 @@ class JITModuleCUDA : public JITModule {
   virtual void launch(const std::string &name,
                       std::size_t grid_dim,
                       std::size_t block_dim,
-                      const std::vector<void *> &arg_pointers) {
+                      const std::vector<void *> &arg_pointers) override {
     auto func = lookup_function(name);
     cuda_context->launch(func, name, arg_pointers, grid_dim, block_dim);
   }
diff --git a/taichi/lang_util.cpp b/taichi/lang_util.cpp
index 9ddca263e721a..39107f8a38c95 100644
--- a/taichi/lang_util.cpp
+++ b/taichi/lang_util.cpp
@@ -1,8 +1,10 @@
 // Definitions of utility functions and enums
 
 #include "lang_util.h"
-#include <taichi/system/timer.h>
-#include <taichi/math/linalg.h>
+#include "taichi/system/timer.h"
+#include "taichi/math/linalg.h"
+#include "taichi/arch.h"
+#include "taichi/compile_config.h"
 
 TI_NAMESPACE_BEGIN
 
@@ -61,17 +63,6 @@ real measure_cpe(std::function<void()> target,
   return elasped_cycles / float64(total_batches * elements_per_call);
 }
 
-int default_simd_width(Arch arch) {
-  if (arch == Arch::x64) {
-    return 8;
-  } else if (arch == Arch::cuda) {
-    return 32;
-  } else {
-    TI_NOT_IMPLEMENTED;
-    return -1;
-  }
-}
-
 std::string data_type_name(DataType t) {
   static std::map<DataType, std::string> type_names;
   if (type_names.empty()) {
@@ -256,34 +247,6 @@ bool command_exist(const std::string &command) {
 #endif
 }
 
-CompileConfig::CompileConfig() {
-  arch = Arch::x64;
-  simd_width = default_simd_width(arch);
-  external_optimization_level = 3;
-  print_ir = false;
-  print_accessor_ir = false;
-  use_llvm = true;
-  print_struct_llvm_ir = false;
-  print_kernel_llvm_ir = false;
-  print_kernel_llvm_ir_optimized = false;
-  demote_dense_struct_fors = true;
-  max_vector_width = 8;
-  debug = false;
-  lazy_compilation = true;
-  serial_schedule = false;
-  simplify_before_lower_access = true;
-  lower_access = true;
-  simplify_after_lower_access = true;
-  default_fp = DataType::f32;
-  default_ip = DataType::i32;
-  verbose_kernel_launches = false;
-  enable_profiler = false;
-  default_cpu_block_dim = 0;  // 0 = adaptive
-  default_gpu_block_dim = 64;
-  verbose = true;
-  fast_math = true;
-}
-
 DataType promoted_type(DataType a, DataType b) {
   std::map<std::pair<DataType, DataType>, DataType> mapping;
   if (mapping.empty()) {
diff --git a/taichi/lang_util.h b/taichi/lang_util.h
index c8b387ce7e9db..a005bdeb5ea6c 100644
--- a/taichi/lang_util.h
+++ b/taichi/lang_util.h
@@ -1,15 +1,20 @@
 // Definitions of utility functions and enums
 
 #pragma once
-#include <taichi/arch.h>
-#include <taichi/common/util.h>
 #include <taichi/util/io.h>
-#include <taichi/common.h>
+#include <taichi/common/util.h>
 #include <taichi/system/profiler.h>
 
-TLANG_NAMESPACE_BEGIN
+// Open / close the nested namespaces taichi::Tlang; always use as a pair.
+#define TLANG_NAMESPACE_BEGIN \
+  namespace taichi {          \
+  namespace Tlang {
+#define TLANG_NAMESPACE_END \
+  }                         \
+  }
 
-int default_simd_width(Arch arch);
+
+TLANG_NAMESPACE_BEGIN
 
 real get_cpu_frequency();
 
@@ -293,39 +298,6 @@ std::string make_list(const std::vector<T> &data,
 int data_type_size(DataType t);
 DataType promoted_type(DataType a, DataType b);
 
-struct CompileConfig {
-  Arch arch;
-  bool debug;
-  int simd_width;
-  bool lazy_compilation;
-  int external_optimization_level;
-  int max_vector_width;
-  bool print_ir;
-  bool print_accessor_ir;
-  bool serial_schedule;
-  bool simplify_before_lower_access;
-  bool lower_access;
-  bool simplify_after_lower_access;
-  bool demote_dense_struct_fors;
-  bool use_llvm;
-  bool print_struct_llvm_ir;
-  bool print_kernel_llvm_ir;
-  bool print_kernel_llvm_ir_optimized;
-  bool verbose_kernel_launches;
-  bool enable_profiler;
-  bool verbose;
-  bool fast_math;
-  bool use_unified_memory;
-  DataType default_fp;
-  DataType default_ip;
-  std::string extra_flags;
-  int default_cpu_block_dim;
-  int default_gpu_block_dim;
-
-  CompileConfig();
-};
-
-extern CompileConfig default_compile_config;
 extern std::string compiled_lib_dir;
 extern std::string runtime_tmp_dir;
 
diff --git a/taichi/platform/metal/metal_api.h b/taichi/platform/metal/metal_api.h
index 56dd4b035d70c..ccd39d4db2c40 100644
--- a/taichi/platform/metal/metal_api.h
+++ b/taichi/platform/metal/metal_api.h
@@ -3,9 +3,8 @@
 // Reference implementation:
 // https://github.com/halide/Halide/blob/master/src/runtime/metal.cpp
 
-#include <taichi/common.h>
-#include <taichi/common/util.h>
-#include <taichi/platform/mac/objc_api.h>
+#include "taichi/lang_util.h"
+#include "taichi/platform/mac/objc_api.h"
 
 #include <string>
 
diff --git a/taichi/platform/opengl/opengl_api.h b/taichi/platform/opengl/opengl_api.h
index 0eb23347fc3bb..297ff56660ab3 100644
--- a/taichi/platform/opengl/opengl_api.h
+++ b/taichi/platform/opengl/opengl_api.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <taichi/common.h>
 #include <taichi/common/util.h>
 
 #include <string>
diff --git a/taichi/platform/opengl/opengl_kernel.h b/taichi/platform/opengl/opengl_kernel.h
index 3fc1a0241b361..b75edbbb0d605 100644
--- a/taichi/platform/opengl/opengl_kernel.h
+++ b/taichi/platform/opengl/opengl_kernel.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <taichi/common.h>
 #include <taichi/constants.h>
 #include <taichi/lang_util.h>
 
diff --git a/taichi/profiler.h b/taichi/profiler.h
index 476a82336da28..6fdadc02d90b2 100644
--- a/taichi/profiler.h
+++ b/taichi/profiler.h
@@ -1,11 +1,13 @@
 #pragma once
+
+#include "taichi/arch.h"
+#include "taichi/lang_util.h"
+
 #include <algorithm>
 #include <map>
 #include <string>
 #include <vector>
 #include <memory>
-#include "common.h"
-#include "lang_util.h"
 
 TLANG_NAMESPACE_BEGIN
 
diff --git a/taichi/program.cpp b/taichi/program.cpp
index e24f8e17d2702..cb06fecf8c5fb 100644
--- a/taichi/program.cpp
+++ b/taichi/program.cpp
@@ -291,7 +291,7 @@ void Program::visualize_layout(const std::string &fn) {
       }
 
       std::string indices;
-      for (int i = 0; i < max_num_indices; i++) {
+      for (int i = 0; i < taichi_max_num_indices; i++) {
         if (snode->extractors[i].active) {
           int nb = snode->extractors[i].num_bits;
           int start = snode->extractors[i].start + nb;
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index cd30c13d454d3..b68d82c75c2a3 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -138,8 +138,6 @@ void export_lang(py::module &m) {
   py::class_<Index>(m, "Index").def(py::init<int>());
   py::class_<SNode>(m, "SNode")
       .def(py::init<>())
-      .def("clear_data", &SNode::clear_data)
-      .def("clear_data_and_deactivate", &SNode::clear_data_and_deactivate)
       .def_readwrite("parent", &SNode::parent)
       .def_readonly("type", &SNode::type)
       .def("dense",
@@ -464,8 +462,8 @@ void export_lang(py::module &m) {
   m.def("test_printf", [] { printf("test_printf\n"); });
   m.def("test_logging", [] { TI_INFO("test_logging\n"); });
   m.def("trigger_crash", [] { *(int *)(1) = 0; });
-  m.def("get_max_num_indices", [] { return max_num_indices; });
-  m.def("get_max_num_args", [] { return max_num_args; });
+  m.def("get_max_num_indices", [] { return taichi_max_num_indices; });
+  m.def("get_max_num_args", [] { return taichi_max_num_args; });
   m.def("test_threading", test_threading);
   m.def("sifakis_svd_f32", sifakis_svd_export<float32, int32>);
   m.def("sifakis_svd_f64", sifakis_svd_export<float64, int64>);
diff --git a/taichi/struct/struct.cpp b/taichi/struct/struct.cpp
index cf075edd1f5bd..b49a943ca5ba7 100644
--- a/taichi/struct/struct.cpp
+++ b/taichi/struct/struct.cpp
@@ -23,10 +23,10 @@ void StructCompiler::infer_snode_properties(SNode &snode) {
   for (int ch_id = 0; ch_id < (int)snode.ch.size(); ch_id++) {
     auto &ch = snode.ch[ch_id];
     ch->parent = &snode;
-    for (int i = 0; i < max_num_indices; i++) {
+    for (int i = 0; i < taichi_max_num_indices; i++) {
       ch->extractors[i].num_elements *= snode.extractors[i].num_elements;
       bool found = false;
-      for (int k = 0; k < max_num_indices; k++) {
+      for (int k = 0; k < taichi_max_num_indices; k++) {
         if (snode.physical_index_position[k] == i) {
           found = true;
           break;
@@ -52,7 +52,7 @@ void StructCompiler::infer_snode_properties(SNode &snode) {
     }
     // infer extractors
     int acc_offsets = 0;
-    for (int i = max_num_indices - 1; i >= 0; i--) {
+    for (int i = taichi_max_num_indices - 1; i >= 0; i--) {
       int inferred = ch->extractors[i].start + ch->extractors[i].num_bits;
       if (ch_id == 0) {
         snode.extractors[i].start = inferred;
@@ -70,7 +70,7 @@ void StructCompiler::infer_snode_properties(SNode &snode) {
     }
     if (snode.type == SNodeType::dynamic) {
       int active_extractor_counder = 0;
-      for (int i = 0; i < max_num_indices; i++) {
+      for (int i = 0; i < taichi_max_num_indices; i++) {
         if (snode.extractors[i].num_bits != 0) {
           active_extractor_counder += 1;
           SNode *p = snode.parent;
@@ -91,7 +91,7 @@ void StructCompiler::infer_snode_properties(SNode &snode) {
     snode.expr->set_attribute("dim", std::to_string(snode.num_active_indices));
 
   snode.total_num_bits = 0;
-  for (int i = 0; i < max_num_indices; i++) {
+  for (int i = 0; i < taichi_max_num_indices; i++) {
     snode.total_num_bits += snode.extractors[i].num_bits;
   }
 
diff --git a/taichi/struct/struct_llvm.cpp b/taichi/struct/struct_llvm.cpp
index c664305bb0ba4..78d182f774d14 100644
--- a/taichi/struct/struct_llvm.cpp
+++ b/taichi/struct/struct_llvm.cpp
@@ -106,7 +106,7 @@ void StructCompilerLLVM::generate_refine_coordinates(SNode *snode) {
   auto outp_coords = args[1];
   auto l = args[2];
 
-  for (int i = 0; i < max_num_indices; i++) {
+  for (int i = 0; i < taichi_max_num_indices; i++) {
     auto addition = tlctx->get_constant(0);
     if (snode->extractors[i].num_bits) {
       auto mask = ((1 << snode->extractors[i].num_bits) - 1);
@@ -194,7 +194,7 @@ void StructCompilerLLVM::run(SNode &root, bool host) {
     module->print(errs(), nullptr);
   }
 
-  TI_ASSERT((int)snodes.size() <= max_num_snodes);
+  TI_ASSERT((int)snodes.size() <= taichi_max_num_snodes);
 
   root_size =
       tlctx->get_data_layout().getTypeAllocSize(snode_attr[root].llvm_type);
diff --git a/taichi/system/memory_pool.h b/taichi/system/memory_pool.h
index ff90000198ace..d577e44cc9671 100644
--- a/taichi/system/memory_pool.h
+++ b/taichi/system/memory_pool.h
@@ -1,11 +1,7 @@
 #pragma once
-#include <taichi/common/util.h>
-#include "taichi/common.h"
-#include <mutex>
-#include <vector>
-#include <memory>
-#include <thread>
-#include "unified_allocator.h"
+#include "taichi/common/util.h"
+#include "taichi/constants.h"
+#include "taichi/system/unified_allocator.h"
 #define TI_RUNTIME_HOST
 #include "taichi/context.h"
 #include "taichi/profiler.h"
@@ -13,6 +9,11 @@
 #include <cuda_runtime.h>
 #endif
 
+#include <mutex>
+#include <vector>
+#include <memory>
+#include <thread>
+
 TLANG_NAMESPACE_BEGIN
 
 class Program;
diff --git a/taichi/system/unified_allocator.h b/taichi/system/unified_allocator.h
index fcbf35acc9abd..7763f49795d89 100644
--- a/taichi/system/unified_allocator.h
+++ b/taichi/system/unified_allocator.h
@@ -1,6 +1,6 @@
 #pragma once
 #include "taichi/arch.h"
-#include "taichi/common.h"
+#include "taichi/constants.h"
 #include <mutex>
 #include <vector>
 #include <memory>
diff --git a/taichi/transforms/insert_scratch_pad.cpp b/taichi/transforms/insert_scratch_pad.cpp
index 306a7c8a6c4aa..44543a20d2273 100644
--- a/taichi/transforms/insert_scratch_pad.cpp
+++ b/taichi/transforms/insert_scratch_pad.cpp
@@ -31,7 +31,7 @@ class AccessAnalysis : public IRVisitor {
 
   void generate_block_indices(SNode *snode, std::vector<int> index, int s) {
     // NOTE: Assuming not vectorized
-    if (s == max_num_indices) {
+    if (s == taichi_max_num_indices) {
       block_indices.push_back(index);
       return;
     }
diff --git a/taichi/transforms/lower_access.cpp b/taichi/transforms/lower_access.cpp
index 6f378e0fd495e..ceea6c9237bc0 100644
--- a/taichi/transforms/lower_access.cpp
+++ b/taichi/transforms/lower_access.cpp
@@ -76,7 +76,7 @@ class LowerAccess : public IRVisitor {
       std::vector<int> strides;
       // extract bits
       for (int k_ = 0; k_ < (int)indices.size(); k_++) {
-        for (int k = 0; k < max_num_indices; k++) {
+        for (int k = 0; k < taichi_max_num_indices; k++) {
           if (snode->physical_index_position[k_] == k) {
             int begin = snode->extractors[k].start;
             int end = begin + snode->extractors[k].num_bits;