diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
index 3de97f000e316..04171fcf02de8 100644
--- a/taichi/ir/frontend_ir.cpp
+++ b/taichi/ir/frontend_ir.cpp
@@ -32,18 +32,18 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
     : global_var(global_var) {
   vectorize = dec.vectorize;
   bit_vectorize = dec.bit_vectorize;
-  parallelize = dec.parallelize;
+  num_cpu_threads = dec.num_cpu_threads;  // 0 if never set since dec.reset()
   strictly_serialized = dec.strictly_serialized;
   block_dim = dec.block_dim;
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
     vectorize = 1;
-    parallelize = 1;
+    num_cpu_threads = 1;  // forced to 1 on the CUDA arch
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {
     // cpu
-    if (parallelize == 0)
-      parallelize = std::thread::hardware_concurrency();
+    if (num_cpu_threads == 0)  // unset: use the hardware count (may be 0)
+      num_cpu_threads = std::thread::hardware_concurrency();
   }
   mem_access_opt = dec.mem_access_opt;
   dec.reset();
@@ -69,16 +69,16 @@ FrontendForStmt::FrontendForStmt(const Expr &loop_var,
     : begin(begin), end(end) {
   vectorize = dec.vectorize;
   bit_vectorize = dec.bit_vectorize;
-  parallelize = dec.parallelize;
+  num_cpu_threads = dec.num_cpu_threads;  // 0 if never set since dec.reset()
   strictly_serialized = dec.strictly_serialized;
   block_dim = dec.block_dim;
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
     vectorize = 1;
-    parallelize = 1;
+    num_cpu_threads = 1;  // forced to 1 on the CUDA arch
   } else {
-    if (parallelize == 0)
-      parallelize = std::thread::hardware_concurrency();
+    if (num_cpu_threads == 0)  // unset: use the hardware count (may be 0)
+      num_cpu_threads = std::thread::hardware_concurrency();
   }
   mem_access_opt = dec.mem_access_opt;
   dec.reset();
diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
index d532c8e02061e..2b410fda4c34a 100644
--- a/taichi/ir/frontend_ir.h
+++ b/taichi/ir/frontend_ir.h
@@ -123,7 +123,7 @@ class FrontendForStmt : public Stmt {
   std::vector<Identifier> loop_var_id;
   int vectorize;
   int bit_vectorize;
-  int parallelize;
+  int num_cpu_threads;  // from dec; 1 on CUDA, hardware count if 0 on CPU
   bool strictly_serialized;
   MemoryAccessOptions mem_access_opt;
   int block_dim;
diff --git a/taichi/ir/ir.cpp b/taichi/ir/ir.cpp
index 82e2d8f7c89ce..586865419ead9 100644
--- a/taichi/ir/ir.cpp
+++ b/taichi/ir/ir.cpp
@@ -24,7 +24,7 @@ std::string snode_access_flag_name(SNodeAccessFlag type) {
 void DecoratorRecorder::reset() {
   vectorize = -1;
   bit_vectorize = -1;
-  parallelize = 0;
+  num_cpu_threads = 0;  // 0 = unset; FrontendForStmt then picks a default
   uniform = false;
   mem_access_opt.clear();
   block_dim = 0;
diff --git a/taichi/ir/ir.h b/taichi/ir/ir.h
index 5d2c3eadfd7d5..4f75195df99d5 100644
--- a/taichi/ir/ir.h
+++ b/taichi/ir/ir.h
@@ -74,7 +74,7 @@ class DecoratorRecorder {
  public:
   int vectorize;
   int bit_vectorize;
-  int parallelize;
+  int num_cpu_threads;  // set by Parallelize(v); reset() restores 0
   bool strictly_serialized;
   MemoryAccessOptions mem_access_opt;
   int block_dim;
@@ -712,7 +712,7 @@ inline void BitVectorize(int v) {
 }
 
 inline void Parallelize(int v) {
-  dec.parallelize = v;
+  dec.num_cpu_threads = v;  // read by the next FrontendForStmt ctor
 }
 
 inline void StrictlySerialize() {
diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp
index 798972c5a4eb5..25ef6188555a7 100644
--- a/taichi/ir/ir_builder.cpp
+++ b/taichi/ir/ir_builder.cpp
@@ -48,22 +48,22 @@ RangeForStmt *IRBuilder::create_range_for(Stmt *begin,
                                           Stmt *end,
                                           int vectorize,
                                           int bit_vectorize,
-                                          int parallelize,
+                                          int num_cpu_threads,
                                           int block_dim,
                                           bool strictly_serialized) {
   return insert(Stmt::make_typed<RangeForStmt>(
       begin, end, std::make_unique<Block>(), vectorize, bit_vectorize,
-      parallelize, block_dim, strictly_serialized));
+      num_cpu_threads, block_dim, strictly_serialized));  // forwarded as-is
 }
 
 StructForStmt *IRBuilder::create_struct_for(SNode *snode,
                                             int vectorize,
                                             int bit_vectorize,
-                                            int parallelize,
+                                            int num_cpu_threads,
                                             int block_dim) {
   return insert(Stmt::make_typed<StructForStmt>(
-      snode, std::make_unique<Block>(), vectorize, bit_vectorize, parallelize,
-      block_dim));
+      snode, std::make_unique<Block>(), vectorize, bit_vectorize,
+      num_cpu_threads, block_dim));  // forwarded as-is
 }
 
 WhileStmt *IRBuilder::create_while_true() {
diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h
index c924eb8c3dd21..22ea0e50ca8ed 100644
--- a/taichi/ir/ir_builder.h
+++ b/taichi/ir/ir_builder.h
@@ -64,13 +64,13 @@ class IRBuilder {
                                  Stmt *end,
                                  int vectorize = -1,
                                  int bit_vectorize = -1,
-                                 int parallelize = 0,
+                                 int num_cpu_threads = 0,  // 0 = unset? TODO
                                  int block_dim = 0,
                                  bool strictly_serialized = false);
   StructForStmt *create_struct_for(SNode *snode,
                                    int vectorize = -1,
                                    int bit_vectorize = -1,
-                                   int parallelize = 0,
+                                   int num_cpu_threads = 0,  // 0 = unset? TODO
                                    int block_dim = 0);
   WhileStmt *create_while_true();
   IfStmt *create_if(Stmt *cond);
diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp
index 17911b864e259..7b8083c007d88 100644
--- a/taichi/ir/statements.cpp
+++ b/taichi/ir/statements.cpp
@@ -224,7 +224,7 @@ RangeForStmt::RangeForStmt(Stmt *begin,
                            std::unique_ptr<Block> &&body,
                            int vectorize,
                            int bit_vectorize,
-                           int parallelize,
+                           int num_cpu_threads,  // stored; copied by clone()
                            int block_dim,
                            bool strictly_serialized)
     : begin(begin),
@@ -232,7 +232,7 @@ RangeForStmt::RangeForStmt(Stmt *begin,
       body(std::move(body)),
       vectorize(vectorize),
       bit_vectorize(bit_vectorize),
-      parallelize(parallelize),
+      num_cpu_threads(num_cpu_threads),  // member <- ctor parameter
       block_dim(block_dim),
       strictly_serialized(strictly_serialized) {
   reversed = false;
@@ -242,7 +242,7 @@ RangeForStmt::RangeForStmt(Stmt *begin,
 
 std::unique_ptr<Stmt> RangeForStmt::clone() const {
   auto new_stmt = std::make_unique<RangeForStmt>(
-      begin, end, body->clone(), vectorize, bit_vectorize, parallelize,
+      begin, end, body->clone(), vectorize, bit_vectorize, num_cpu_threads,
       block_dim, strictly_serialized);
   new_stmt->reversed = reversed;
   return new_stmt;
@@ -252,21 +252,22 @@ StructForStmt::StructForStmt(SNode *snode,
                              std::unique_ptr<Block> &&body,
                              int vectorize,
                              int bit_vectorize,
-                             int parallelize,
+                             int num_cpu_threads,  // stored; copied by clone()
                              int block_dim)
     : snode(snode),
       body(std::move(body)),
       vectorize(vectorize),
       bit_vectorize(bit_vectorize),
-      parallelize(parallelize),
+      num_cpu_threads(num_cpu_threads),  // member <- ctor parameter
       block_dim(block_dim) {
   this->body->parent_stmt = this;
   TI_STMT_REG_FIELDS;
 }
 
 std::unique_ptr<Stmt> StructForStmt::clone() const {
-  auto new_stmt = std::make_unique<StructForStmt>(
-      snode, body->clone(), vectorize, bit_vectorize, parallelize, block_dim);
+  auto new_stmt = std::make_unique<StructForStmt>(snode, body->clone(),
+                                                  vectorize, bit_vectorize,
+                                                  num_cpu_threads, block_dim);
   new_stmt->mem_access_opt = mem_access_opt;
   return new_stmt;
 }
diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
index 09ea4e37107fc..cd5c24717354e 100644
--- a/taichi/ir/statements.h
+++ b/taichi/ir/statements.h
@@ -551,7 +551,7 @@ class RangeForStmt : public Stmt {
   bool reversed;
   int vectorize;
   int bit_vectorize;
-  int parallelize;
+  int num_cpu_threads;  // capped by cpu_max_num_threads in Offloader
   int block_dim;
   bool strictly_serialized;
 
@@ -560,7 +560,7 @@ class RangeForStmt : public Stmt {
                std::unique_ptr<Block> &&body,
                int vectorize,
                int bit_vectorize,
-               int parallelize,
+               int num_cpu_threads,  // initializes the member of the same name
                int block_dim,
                bool strictly_serialized);
 
@@ -579,7 +579,7 @@ class RangeForStmt : public Stmt {
                      reversed,
                      vectorize,
                      bit_vectorize,
-                     parallelize,
+                     num_cpu_threads,
                      block_dim,
                      strictly_serialized);
   TI_DEFINE_ACCEPT
@@ -595,7 +595,7 @@ class StructForStmt : public Stmt {
   std::vector<int> index_offsets;
   int vectorize;
   int bit_vectorize;
-  int parallelize;
+  int num_cpu_threads;  // capped by cpu_max_num_threads in Offloader
   int block_dim;
   MemoryAccessOptions mem_access_opt;
 
@@ -603,7 +603,7 @@ class StructForStmt : public Stmt {
                 std::unique_ptr<Block> &&body,
                 int vectorize,
                 int bit_vectorize,
-                int parallelize,
+                int num_cpu_threads,  // initializes the member of the same name
                 int block_dim);
 
   bool is_container_statement() const override {
@@ -616,7 +616,7 @@ class StructForStmt : public Stmt {
                      index_offsets,
                      vectorize,
                      bit_vectorize,
-                     parallelize,
+                     num_cpu_threads,
                      block_dim,
                      mem_access_opt);
   TI_DEFINE_ACCEPT
diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp
index c6f09ecffde09..42b0183bff76a 100644
--- a/taichi/transforms/lower_ast.cpp
+++ b/taichi/transforms/lower_ast.cpp
@@ -209,7 +209,7 @@ class LowerAST : public IRVisitor {
       if (is_good_range_for) {
         auto &&new_for = std::make_unique<RangeForStmt>(
             begin->stmt, end->stmt, std::move(stmt->body), stmt->vectorize,
-            stmt->bit_vectorize, stmt->parallelize, stmt->block_dim,
+            stmt->bit_vectorize, stmt->num_cpu_threads, stmt->block_dim,
             stmt->strictly_serialized);
         new_for->body->insert(std::make_unique<LoopIndexStmt>(new_for.get(), 0),
                               0);
@@ -293,7 +293,7 @@ class LowerAST : public IRVisitor {
 
       auto &&new_for = std::make_unique<StructForStmt>(
           snode, std::move(stmt->body), stmt->vectorize, stmt->bit_vectorize,
-          stmt->parallelize, stmt->block_dim);
+          stmt->num_cpu_threads, stmt->block_dim);  // taken from stmt as-is
       new_for->index_offsets = offsets;
       VecStatement new_statements;
       for (int i = 0; i < (int)stmt->loop_var_id.size(); i++) {
diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp
index a4b3fa710ea68..e38867fc9ed96 100644
--- a/taichi/transforms/offload.cpp
+++ b/taichi/transforms/offload.cpp
@@ -79,7 +79,7 @@ class Offloader {
               std::make_pair(offloaded.get(), s->end));
         }
         offloaded->num_cpu_threads =
-            std::min(s->parallelize,
+            std::min(s->num_cpu_threads,  // capped by the config maximum
                      root->get_kernel()->program.config.cpu_max_num_threads);
         replace_all_usages_with(s, s, offloaded.get());
         for (int j = 0; j < (int)s->body->statements.size(); j++) {
@@ -181,8 +181,8 @@ class Offloader {
     }
 
     offloaded_struct_for->snode = for_stmt->snode;
-    offloaded_struct_for->num_cpu_threads =
-        std::min(for_stmt->parallelize, program->config.cpu_max_num_threads);
+    offloaded_struct_for->num_cpu_threads = std::min(  // capped by config max
+        for_stmt->num_cpu_threads, program->config.cpu_max_num_threads);
     offloaded_struct_for->mem_access_opt = mem_access_opt;
 
     root_block->insert(std::move(offloaded_struct_for));