diff --git a/python/taichi/lang/snode.py b/python/taichi/lang/snode.py
index ff235c7124554..d4ceed47473f9 100644
--- a/python/taichi/lang/snode.py
+++ b/python/taichi/lang/snode.py
@@ -134,6 +134,12 @@ def num_dynamically_allocated(self):
         runtime.materialize()
         return runtime.prog.get_snode_num_dynamically_allocated(self.ptr)
 
+    @property
+    def cell_size_bytes(self):
+        """Size in bytes of one cell of this SNode (materializes first)."""
+        impl.get_runtime().materialize()
+        return self.ptr.cell_size_bytes
+
     def deactivate_all(self):
         ch = self.get_children()
         for c in ch:
diff --git a/python/taichi/tools/video.py b/python/taichi/tools/video.py
index a7cfc49bfe99a..6ef27ea231689 100644
--- a/python/taichi/tools/video.py
+++ b/python/taichi/tools/video.py
@@ -154,9 +154,9 @@ def interpolate_frames(frame_dir, mul=4):
         cv2.imwrite('interpolated/{:05d}.png'.format(i), img * 255.0)
 
 
-def ffmpeg_common_args(frame_rate, width, height, crf):
-    return f" -loglevel panic -framerate {frame_rate} -s:v {width}x{height} " + \
-                         f"-c:v libx264 -profile:v high -crf {crf} -pix_fmt yuv420p"
+def ffmpeg_common_args(frame_rate, input, width, height, crf, output_path):
+    """Build the full ffmpeg command line that encodes `input` via libx264."""
+    return f"{get_ffmpeg_path()} -y -loglevel panic -framerate {frame_rate} -i {input} -s:v {width}x{height} -c:v libx264 -profile:v high -crf {crf} -pix_fmt yuv420p {output_path}"
 
 
 def make_video(input_files,
@@ -180,7 +180,9 @@ def make_video(input_files,
             height -= 1
         for i, inp in enumerate(input_files):
             shutil.copy(inp, os.path.join(tmp_dir, '%06d.png' % i))
-        command = f"{get_ffmpeg_path()} -y -i {tmp_dir}/%06d.png {ffmpeg_common_args(frame_rate, width, height, crf)} {output_path} "
+        inputs = f'{tmp_dir}/%06d.png'
+        command = ffmpeg_common_args(frame_rate, inputs, width, height, crf,
+                                     output_path)
         ret = os.system(command)
         assert ret == 0, "ffmpeg failed to generate video file."
         for i in range(len(input_files)):
@@ -188,8 +190,9 @@ def make_video(input_files,
         os.rmdir(tmp_dir)
     elif isinstance(input_files, str):
         assert width != 0 and height != 0
-        command = f"{get_ffmpeg_path()} -i {input_files} {ffmpeg_common_args(frame_rate, width, height, crf)} {output_path}"
+        command = ffmpeg_common_args(frame_rate, input_files, width, height,
+                                     crf, output_path)
         ret = os.system(command)
         assert ret == 0, "ffmpeg failed to generate video file."
     else:
-        assert False, f'input_files should be list (of files) or str (of file template, like "%04d.png") instead of {type(input_files)}'
+        assert False, f'input_files should be list (of files) or str (of file template, e.g., "%04d.png") instead of {type(input_files)}'
diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp
index 81bb4e30f2ee1..82c89ba59ea2c 100644
--- a/taichi/codegen/codegen_llvm.cpp
+++ b/taichi/codegen/codegen_llvm.cpp
@@ -1055,6 +1055,7 @@ llvm::Value *CodeGenLLVM::atomic_add_custom_float(AtomicOpStmt *stmt,
   auto cit = cft->get_digits_type()->as<CustomIntType>();
   auto val_store = float_to_custom_int(cft, cit, llvm_val[stmt->val]);
   auto physical_type = cit->get_physical_type();
+  val_store = builder->CreateSExt(val_store, llvm_type(physical_type));
 
   return create_call(
       fmt::format("atomic_add_partial_bits_b{}", data_type_bits(physical_type)),
@@ -2039,13 +2040,14 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
 
   llvm::Function *body = nullptr;
   auto leaf_block = stmt->snode;
+
   // When looping over bit_arrays and bit_structs, we generate struct for on
   // their parent node (usually "dense") instead of itself for higher
   // performance. Also, note that the loop must be bit_vectorized for
-  // bit_arrays.
-  if ((leaf_block->type == SNodeType::bit_array ||
-       leaf_block->type == SNodeType::bit_struct) &&
-      leaf_block->parent) {
+  // bit_arrays, and their parent must be "dense".
+  if (leaf_block->type == SNodeType::bit_struct) {
+    leaf_block = leaf_block->parent;
+  } else if (leaf_block->type == SNodeType::bit_array) {
     if (leaf_block->parent->type == SNodeType::dense) {
       leaf_block = leaf_block->parent;
     } else {
diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h
index 2fab9f9dcd57a..586f868052513 100644
--- a/taichi/ir/snode.h
+++ b/taichi/ir/snode.h
@@ -76,6 +76,7 @@ class SNode {
   int64 n{};
   int total_num_bits{}, total_bit_start{};
   int chunk_size{};
+  std::size_t cell_size_bytes{};
   PrimitiveType *physical_type;  // for bit_struct and bit_array only
   DataType dt;
   bool has_ambient{};
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 49d60c2b293bf..39cdd931c6cda 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -327,9 +327,7 @@ void Program::initialize_runtime_system(StructCompiler *scomp) {
   for (int i = 0; i < (int)snodes.size(); i++) {
     if (is_gc_able(snodes[i]->type)) {
       std::size_t node_size;
-      auto element_size =
-          tlctx->get_type_size(StructCompilerLLVM::get_llvm_element_type(
-              tlctx->get_this_thread_struct_module(), snodes[i]));
+      auto element_size = snodes[i]->cell_size_bytes;
       if (snodes[i]->type == SNodeType::pointer) {
         // pointer. Allocators are for single elements
         node_size = element_size;
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 573b2dc5f49bc..c8a93f253ff52 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -266,6 +266,7 @@ void export_lang(py::module &m) {
            })
       .def("num_active_indices",
            [](SNode *snode) { return snode->num_active_indices; })
+      .def_readonly("cell_size_bytes", &SNode::cell_size_bytes)
       .def("begin_shared_exp_placement", &SNode::begin_shared_exp_placement)
       .def("end_shared_exp_placement", &SNode::end_shared_exp_placement);
 
diff --git a/taichi/struct/struct_llvm.cpp b/taichi/struct/struct_llvm.cpp
index a7cc5fe4a760c..81f81718d23c6 100644
--- a/taichi/struct/struct_llvm.cpp
+++ b/taichi/struct/struct_llvm.cpp
@@ -46,6 +46,8 @@ void StructCompilerLLVM::generate_types(SNode &snode) {
   auto ch_type =
       llvm::StructType::create(*ctx, ch_types, snode.node_type_name + "_ch");
 
+  snode.cell_size_bytes = tlctx->get_type_size(ch_type);
+
   llvm::Type *body_type = nullptr, *aux_type = nullptr;
   if (type == SNodeType::dense || type == SNodeType::bitmasked) {
     TI_ASSERT(snode._morton == false);
@@ -83,6 +85,10 @@ void StructCompilerLLVM::generate_types(SNode &snode) {
       }
       ch->bit_offset = total_offset;
       total_offset += component_cit->get_num_bits();
+      auto bit_struct_size = data_type_bits(snode.physical_type);
+      TI_ERROR_IF(total_offset > bit_struct_size,
+                  "Bit struct overflows: {} bits used out of {}.", total_offset,
+                  bit_struct_size);
     }
 
     snode.dt = TypeFactory::get_instance().get_bit_struct_type(
diff --git a/taichi/transforms/demote_atomics.cpp b/taichi/transforms/demote_atomics.cpp
index b11a0f23f4ca6..0697266b2d895 100644
--- a/taichi/transforms/demote_atomics.cpp
+++ b/taichi/transforms/demote_atomics.cpp
@@ -62,6 +62,19 @@ class DemoteAtomics : public BasicStmtVisitor {
       demote = true;
       is_local = true;
     }
+
+    if (auto dest_pointer_type = stmt->dest->ret_type->cast<PointerType>()) {
+      if (auto cft =
+              dest_pointer_type->get_pointee_type()->cast<CustomFloatType>()) {
+        if (cft->get_exponent_type()) {
+          TI_WARN(
+              "AtomicOp on CustomFloatType with exponent is not supported. "
+              "Demoting to non-atomic RMW.");
+          demote = true;
+        }
+      }
+    }
+
     if (demote) {
       // replace atomics with load, add, store
       auto bin_type = atomic_to_binary_op_type(stmt->op_type);
diff --git a/tests/python/test_cell_size_inspection.py b/tests/python/test_cell_size_inspection.py
new file mode 100644
index 0000000000000..2f66bf3385955
--- /dev/null
+++ b/tests/python/test_cell_size_inspection.py
@@ -0,0 +1,43 @@
+import taichi as ti
+
+
+@ti.test(arch=ti.cpu)
+def test_primitives():
+    """Check `cell_size_bytes` of dense nodes holding primitive fields."""
+    short_int = ti.field(dtype=ti.i16)
+    single_a = ti.field(dtype=ti.f32)
+    double_a = ti.field(dtype=ti.f64)
+    single_b = ti.field(dtype=ti.f32)
+    single_c = ti.field(dtype=ti.f32)
+    double_b = ti.field(dtype=ti.f64)
+    # Cell holds one i16.
+    node_i16 = ti.root.dense(ti.i, 32)
+    node_i16.place(short_int)
+    # Cell holds an f32 and an f64; any size from 12 to 16 bytes is accepted.
+    node_mixed = ti.root.dense(ti.i, 32)
+    node_mixed.place(single_a, double_a)
+    # Cell holds two f32 and one f64.
+    node_packed = ti.root.dense(ti.i, 1)
+    node_packed.place(single_b, single_c, double_b)
+
+    assert node_i16.cell_size_bytes == 2
+    assert 12 <= node_mixed.cell_size_bytes <= 16
+    assert node_packed.cell_size_bytes == 16
+
+
+@ti.test(arch=ti.cpu)
+def test_bit_struct():
+    """Check `cell_size_bytes` of dense nodes that contain bit_structs."""
+    digits_type = ti.type_factory.custom_int(16, False)
+    packed_int = ti.field(dtype=digits_type)
+    cft = ti.type_factory.custom_float(significand_type=digits_type)
+    packed_float = ti.field(dtype=cft)
+    plain_f32 = ti.field(dtype=ti.f32)
+    bits_only = ti.root.dense(ti.i, 32)
+    bits_only._bit_struct(num_bits=32).place(packed_int)
+    bits_and_f32 = ti.root.dense(ti.i, 4)
+    bits_and_f32._bit_struct(num_bits=32).place(packed_float)
+    bits_and_f32.place(plain_f32)
+
+    assert bits_only.cell_size_bytes == 4
+    assert bits_and_f32.cell_size_bytes == 8
diff --git a/tests/python/test_custom_float_exponents.py b/tests/python/test_custom_float_exponents.py
index 233604a5bcd20..b6269d12d2061 100644
--- a/tests/python/test_custom_float_exponents.py
+++ b/tests/python/test_custom_float_exponents.py
@@ -114,3 +114,25 @@ def test_custom_float_truncation(signed):
         assert x[None] == 1.5
     else:
         assert x[None] == 1.75
+
+
+@ti.test(require=ti.extension.quant)
+def test_custom_float_atomic_demotion():
+    """Two `x[None] += 1` kernel launches on an exponent custom float give 2."""
+    significand = ti.type_factory.custom_int(2, True)
+    exponent = ti.type_factory.custom_int(5, False)
+    cft = ti.type_factory.custom_float(
+        significand_type=significand, exponent_type=exponent, scale=1)
+    x = ti.field(dtype=cft)
+
+    ti.root._bit_struct(num_bits=32).place(x)
+
+    @ti.kernel
+    def add_one():
+        for i in range(1):
+            x[None] += 1
+
+    for _ in range(2):
+        add_one()
+
+    assert x[None] == 2