From c2ba55b13b8873b200708498abdbfbe70addec58 Mon Sep 17 00:00:00 2001
From: Ye Kuang <k-ye@users.noreply.github.com>
Date: Mon, 26 Apr 2021 17:11:15 +0800
Subject: [PATCH 01/29] [vulkan] Add Vulkan API (#2299)

* [vulkan] Add Vulkan API

* fix

* support osx

* [skip ci] enforce code format

Co-authored-by: Taichi Gardener <taichigardener@gmail.com>
---
 cmake/TaichiCore.cmake                 |  25 ++
 misc/prtags.json                       |   1 +
 taichi/backends/vulkan/vulkan_api.cpp  | 585 +++++++++++++++++++++++++
 taichi/backends/vulkan/vulkan_api.h    | 188 ++++++++
 taichi/backends/vulkan/vulkan_common.h |  25 ++
 5 files changed, 824 insertions(+)
 create mode 100644 taichi/backends/vulkan/vulkan_api.cpp
 create mode 100644 taichi/backends/vulkan/vulkan_api.h
 create mode 100644 taichi/backends/vulkan/vulkan_common.h
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index c81a213204811..c2d5db9e57502 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -2,6 +2,13 @@ option(USE_STDCPP "Use -stdlib=libc++" OFF)
 option(TI_WITH_CUDA "Build with the CUDA backend" ON)
 option(TI_WITH_OPENGL "Build with the OpenGL backend" ON)
 option(TI_WITH_CC "Build with the C backend" ON)
+option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF)
+
+if(UNIX AND NOT APPLE)
+    # Handy helper for Linux
+    # https://stackoverflow.com/a/32259072/12003165
+    set(LINUX TRUE)
+endif()
 
 if (APPLE)
     if (TI_WITH_CUDA)
@@ -41,6 +48,7 @@ file(GLOB TAICHI_CUDA_SOURCE "taichi/backends/cuda/*.cpp" "taichi/backends/cuda/
 file(GLOB TAICHI_METAL_SOURCE "taichi/backends/metal/*.h" "taichi/backends/metal/*.cpp" "taichi/backends/metal/shaders/*")
 file(GLOB TAICHI_OPENGL_SOURCE "taichi/backends/opengl/*.h" "taichi/backends/opengl/*.cpp" "taichi/backends/opengl/shaders/*")
 file(GLOB TAICHI_CC_SOURCE "taichi/backends/cc/*.h" "taichi/backends/cc/*.cpp")
+file(GLOB TAICHI_VULKAN_SOURCE "taichi/backends/vulkan/*.h" "taichi/backends/vulkan/*.cpp")
 
 list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_BACKEND_SOURCE})
 
@@ -72,6 +80,12 @@ if (TI_WITH_CC)
   list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CC_SOURCE})
 endif()
 
+
+if (TI_WITH_VULKAN)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_VULKAN")
+    list(APPEND TAICHI_CORE_SOURCE ${TAICHI_VULKAN_SOURCE})
+endif()
+
 # This compiles all the libraries with -fPIC, which is critical to link a static
 # library into a shared lib.
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -172,6 +186,17 @@ if (TI_WITH_CUDA)
     target_link_libraries(${LIBRARY_NAME} ${llvm_ptx_libs})
 endif()
 
+if (TI_WITH_VULKAN)
+    # Locate the Vulkan SDK/loader; REQUIRED aborts configuration if missing.
+    # https://cmake.org/cmake/help/latest/module/FindVulkan.html
+    # https://github.com/PacktPublishing/Learning-Vulkan/blob/master/Chapter%2003/HandShake/CMakeLists.txt
+    find_package(Vulkan REQUIRED)
+    message(STATUS "Vulkan_INCLUDE_DIR=${Vulkan_INCLUDE_DIR}")
+    message(STATUS "Vulkan_LIBRARY=${Vulkan_LIBRARY}")
+    include_directories(${Vulkan_INCLUDE_DIR})
+    target_link_libraries(${CORE_LIBRARY_NAME} Vulkan::Vulkan)
+endif()
+
 # Optional dependencies
 
 if (APPLE)
diff --git a/misc/prtags.json b/misc/prtags.json
index 77d0da5185e24..f7e1ec534074a 100644
--- a/misc/prtags.json
+++ b/misc/prtags.json
@@ -9,6 +9,7 @@
   "lang"            : "Language and syntax",
   "metal"           : "Metal backend",
   "opengl"          : "OpenGL backend",
+  "vulkan"          : "Vulkan backend",
   "misc"            : "Miscellaneous",
   "std"             : "Standard library",
   "opt"             : "IR optimization passes",
diff --git a/taichi/backends/vulkan/vulkan_api.cpp b/taichi/backends/vulkan/vulkan_api.cpp
new file mode 100644
index 0000000000000..2c1215b9ff802
--- /dev/null
+++ b/taichi/backends/vulkan/vulkan_api.cpp
@@ -0,0 +1,585 @@
+#include "taichi/backends/vulkan/vulkan_api.h"
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#ifndef IN_TAI_VULKAN
+
+#include "taichi/backends/vulkan/vulkan_common.h"
+#include "taichi/common/logging.h"
+
+#else
+
+#include "vulkan_common_stub.h"
+
+#endif
+
+namespace taichi {
+namespace lang {
+namespace vulkan {
+
+namespace {
+
+constexpr bool kEnableValidationLayers = true;
+const std::vector<const char *> kValidationLayers = {
+    "VK_LAYER_KHRONOS_validation",
+};
+
+bool check_validation_layer_support() {
+  uint32_t layer_count;
+  vkEnumerateInstanceLayerProperties(&layer_count, nullptr);
+
+  std::vector<VkLayerProperties> available_layers(layer_count);
+  vkEnumerateInstanceLayerProperties(&layer_count, available_layers.data());
+
+  std::unordered_set<std::string> available_layer_names;
+  for (const auto &layer_props : available_layers) {
+    available_layer_names.insert(layer_props.layerName);
+  }
+  for (const char *name : kValidationLayers) {
+    if (available_layer_names.count(std::string(name)) == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+VKAPI_ATTR VkBool32 VKAPI_CALL
+vk_debug_callback(VkDebugUtilsMessageSeverityFlagBitsEXT message_severity,
+                  VkDebugUtilsMessageTypeFlagsEXT message_type,
+                  const VkDebugUtilsMessengerCallbackDataEXT *p_callback_data,
+                  void *p_user_data) {
+  if (message_severity > VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) {
+    std::cerr << "validation layer: " << p_callback_data->pMessage << std::endl;
+  }
+  return VK_FALSE;
+}
+
+void populate_debug_messenger_create_info(
+    VkDebugUtilsMessengerCreateInfoEXT *create_info) {
+  *create_info = {};
+  create_info->sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
+  create_info->messageSeverity =
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
+  create_info->messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
+                             VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
+                             VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
+  create_info->pfnUserCallback = vk_debug_callback;
+  create_info->pUserData = nullptr;
+}
+
+VkResult create_debug_utils_messenger_ext(
+    VkInstance instance,
+    const VkDebugUtilsMessengerCreateInfoEXT *p_create_info,
+    const VkAllocationCallbacks *p_allocator,
+    VkDebugUtilsMessengerEXT *p_debug_messenger) {
+  auto func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
+      instance, "vkCreateDebugUtilsMessengerEXT");
+  if (func != nullptr) {
+    return func(instance, p_create_info, p_allocator, p_debug_messenger);
+  } else {
+    return VK_ERROR_EXTENSION_NOT_PRESENT;
+  }
+}
+
+void destroy_debug_utils_messenger_ext(
+    VkInstance instance,
+    VkDebugUtilsMessengerEXT debug_messenger,
+    const VkAllocationCallbacks *p_allocator) {
+  auto func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
+      instance, "vkDestroyDebugUtilsMessengerEXT");
+  if (func != nullptr) {
+    func(instance, debug_messenger, p_allocator);
+  }
+}
+
+std::vector<const char *> get_required_extensions() {
+  std::vector<const char *> extensions;
+  if constexpr (kEnableValidationLayers) {
+    extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
+  }
+  return extensions;
+}
+
+VulkanQueueFamilyIndices find_queue_families(VkPhysicalDevice device) {
+  VulkanQueueFamilyIndices indices;
+
+  uint32_t queue_family_count = 0;
+  vkGetPhysicalDeviceQueueFamilyProperties(device, &queue_family_count,
+                                           nullptr);
+  std::vector<VkQueueFamilyProperties> queue_families(queue_family_count);
+  vkGetPhysicalDeviceQueueFamilyProperties(device, &queue_family_count,
+                                           queue_families.data());
+  // Ignore TRANSFER/SPARSE_BINDING bits when classifying a queue family.
+  constexpr VkQueueFlags kFlagMask =
+      (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT));
+
+  // First, prefer a compute-capable family without the graphics bit.
+  for (int i = 0; i < (int)queue_family_count; ++i) {
+    const VkQueueFlags masked_flags = kFlagMask & queue_families[i].queueFlags;
+    if ((masked_flags & VK_QUEUE_COMPUTE_BIT) &&
+        !(masked_flags & VK_QUEUE_GRAPHICS_BIT)) {
+      indices.compute_family = i;
+    }
+    if (indices.is_complete()) {
+      return indices;
+    }
+  }
+
+  // Otherwise, fall back to the first family that supports compute.
+  for (int i = 0; i < (int)queue_family_count; ++i) {
+    const VkQueueFlags masked_flags = kFlagMask & queue_families[i].queueFlags;
+    if (masked_flags & VK_QUEUE_COMPUTE_BIT) {
+      indices.compute_family = i;
+    }
+    if (indices.is_complete()) {
+      return indices;
+    }
+  }
+  return indices;
+}
+
+bool is_device_suitable(VkPhysicalDevice device) {
+  return find_queue_families(device).is_complete();
+}
+
+VkShaderModule create_shader_module(VkDevice device,
+                                    const SpirvCodeView &code) {
+  VkShaderModuleCreateInfo create_info{};
+  create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+  create_info.codeSize = code.size;
+  create_info.pCode = code.data;
+
+  VkShaderModule shader_module;
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreateShaderModule(device, &create_info, kNoVkAllocCallbacks,
+                           &shader_module),
+      "failed to create shader module");
+  return shader_module;
+}
+
+}  // namespace
+
+VulkanDevice::VulkanDevice(const Params &params) {
+  create_instance(params);
+  setup_debug_messenger();
+  pick_physical_device();
+  create_logical_device();
+  create_command_pool();
+}
+
+VulkanDevice::~VulkanDevice() {
+  if constexpr (kEnableValidationLayers) {
+    destroy_debug_utils_messenger_ext(instance_, debug_messenger_,
+                                      kNoVkAllocCallbacks);
+  }
+  vkDestroyCommandPool(device_, command_pool_, kNoVkAllocCallbacks);
+  vkDestroyDevice(device_, kNoVkAllocCallbacks);
+  vkDestroyInstance(instance_, kNoVkAllocCallbacks);
+}
+
+void VulkanDevice::create_instance(const Params &params) {
+  VkApplicationInfo app_info{};
+  app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+  app_info.pApplicationName = "Taichi Vulkan Backend";
+  app_info.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
+  app_info.pEngineName = "No Engine";
+  app_info.engineVersion = VK_MAKE_VERSION(1, 0, 0);
+  app_info.apiVersion = params.api_version;  // important
+
+  VkInstanceCreateInfo create_info{};
+  create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+  create_info.pApplicationInfo = &app_info;
+
+  if constexpr (kEnableValidationLayers) {
+    TI_ASSERT_INFO(check_validation_layer_support(),
+                   "validation layers requested but not available");
+  }
+
+  VkDebugUtilsMessengerCreateInfoEXT debug_create_info{};
+
+  if constexpr (kEnableValidationLayers) {
+    create_info.enabledLayerCount = (uint32_t)kValidationLayers.size();
+    create_info.ppEnabledLayerNames = kValidationLayers.data();
+
+    populate_debug_messenger_create_info(&debug_create_info);
+    create_info.pNext = &debug_create_info;
+  } else {
+    create_info.enabledLayerCount = 0;
+    create_info.pNext = nullptr;
+  }
+  const auto extensions = get_required_extensions();
+  create_info.enabledExtensionCount = (uint32_t)extensions.size();
+  create_info.ppEnabledExtensionNames = extensions.data();
+
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreateInstance(&create_info, kNoVkAllocCallbacks, &instance_),
+      "failed to create instance");
+}
+
+void VulkanDevice::setup_debug_messenger() {
+  if constexpr (!kEnableValidationLayers) {
+    return;
+  }
+  VkDebugUtilsMessengerCreateInfoEXT create_info{};
+  populate_debug_messenger_create_info(&create_info);
+
+  BAIL_ON_VK_BAD_RESULT(
+      create_debug_utils_messenger_ext(instance_, &create_info,
+                                       kNoVkAllocCallbacks, &debug_messenger_),
+      "failed to set up debug messenger");
+}
+
+void VulkanDevice::pick_physical_device() {
+  uint32_t device_count = 0;
+  vkEnumeratePhysicalDevices(instance_, &device_count, nullptr);
+  TI_ASSERT_INFO(device_count > 0, "failed to find GPUs with Vulkan support");
+
+  std::vector<VkPhysicalDevice> devices(device_count);
+  vkEnumeratePhysicalDevices(instance_, &device_count, devices.data());
+  physical_device_ = VK_NULL_HANDLE;
+  for (const auto &device : devices) {
+    if (is_device_suitable(device)) {
+      physical_device_ = device;
+      break;
+    }
+  }
+  TI_ASSERT_INFO(physical_device_ != VK_NULL_HANDLE,
+                 "failed to find a suitable GPU");
+
+  queue_family_indices_ = find_queue_families(physical_device_);
+}
+
+void VulkanDevice::create_logical_device() {
+  VkDeviceQueueCreateInfo queue_create_info{};
+  queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+  queue_create_info.queueFamilyIndex =
+      queue_family_indices_.compute_family.value();
+  queue_create_info.queueCount = 1;
+  constexpr float kQueuePriority = 1.0f;
+  queue_create_info.pQueuePriorities = &kQueuePriority;
+
+  VkDeviceCreateInfo create_info{};
+  create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+  create_info.pQueueCreateInfos = &queue_create_info;
+  create_info.queueCreateInfoCount = 1;
+
+  VkPhysicalDeviceFeatures device_features{};
+  create_info.pEnabledFeatures = &device_features;
+  create_info.enabledExtensionCount = 0;
+
+  if constexpr (kEnableValidationLayers) {
+    create_info.enabledLayerCount = (uint32_t)kValidationLayers.size();
+    create_info.ppEnabledLayerNames = kValidationLayers.data();
+  } else {
+    create_info.enabledLayerCount = 0;
+  }
+  BAIL_ON_VK_BAD_RESULT(vkCreateDevice(physical_device_, &create_info,
+                                       kNoVkAllocCallbacks, &device_),
+                        "failed to create logical device");
+  vkGetDeviceQueue(device_, queue_family_indices_.compute_family.value(),
+                   /*queueIndex=*/0, &compute_queue_);
+}
+
+void VulkanDevice::create_command_pool() {
+  VkCommandPoolCreateInfo pool_info{};
+  pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+  pool_info.flags = 0;
+  pool_info.queueFamilyIndex = queue_family_indices_.compute_family.value();
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreateCommandPool(device_, &pool_info, kNoVkAllocCallbacks,
+                          &command_pool_),
+      "failed to create command pool");
+}
+
+VulkanPipeline::VulkanPipeline(const Params &params)
+    : device_(params.device->device()) {
+  create_descriptor_set_layout(params);
+  create_compute_pipeline(params);
+  create_descriptor_pool(params);
+  create_descriptor_sets(params);
+}
+
+VulkanPipeline::~VulkanPipeline() {
+  vkDestroyDescriptorPool(device_, descriptor_pool_, kNoVkAllocCallbacks);
+  vkDestroyPipeline(device_, pipeline_, kNoVkAllocCallbacks);
+  vkDestroyPipelineLayout(device_, pipeline_layout_, kNoVkAllocCallbacks);
+  vkDestroyDescriptorSetLayout(device_, descriptor_set_layout_,
+                               kNoVkAllocCallbacks);
+}
+
+void VulkanPipeline::create_descriptor_set_layout(const Params &params) {
+  const auto &buffer_binds = params.buffer_bindings;
+  std::vector<VkDescriptorSetLayoutBinding> layout_bindings;
+  layout_bindings.reserve(buffer_binds.size());
+  for (const auto &bb : buffer_binds) {
+    VkDescriptorSetLayoutBinding layout_binding{};
+    layout_binding.binding = bb.binding;
+    layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    layout_binding.descriptorCount = 1;
+    layout_binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+    layout_binding.pImmutableSamplers = nullptr;
+    layout_bindings.push_back(layout_binding);
+  }
+
+  VkDescriptorSetLayoutCreateInfo layout_create_info{};
+  layout_create_info.sType =
+      VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+  layout_create_info.bindingCount = layout_bindings.size();
+  layout_create_info.pBindings = layout_bindings.data();
+
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreateDescriptorSetLayout(device_, &layout_create_info,
+                                  kNoVkAllocCallbacks, &descriptor_set_layout_),
+      "failed to create descriptor set layout");
+}
+
+void VulkanPipeline::create_compute_pipeline(const Params &params) {
+  VkShaderModule shader_module = create_shader_module(device_, params.code);
+
+  VkPipelineShaderStageCreateInfo shader_stage_info{};
+  shader_stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+  shader_stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+  shader_stage_info.module = shader_module;
+#pragma message("Shader storage info: pName is hardcoded to \"main\"")
+  shader_stage_info.pName = "main";
+
+  VkPipelineLayoutCreateInfo pipeline_layout_info{};
+  pipeline_layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+  pipeline_layout_info.setLayoutCount = 1;
+  pipeline_layout_info.pSetLayouts = &descriptor_set_layout_;
+  pipeline_layout_info.pushConstantRangeCount = 0;
+  pipeline_layout_info.pPushConstantRanges = nullptr;
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreatePipelineLayout(device_, &pipeline_layout_info,
+                             kNoVkAllocCallbacks, &pipeline_layout_),
+      "failed to create pipeline layout");
+
+  VkComputePipelineCreateInfo pipeline_info{};
+  pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+  pipeline_info.stage = shader_stage_info;
+  pipeline_info.layout = pipeline_layout_;
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreateComputePipelines(device_, /*pipelineCache=*/VK_NULL_HANDLE,
+                               /*createInfoCount=*/1, &pipeline_info,
+                               kNoVkAllocCallbacks, &pipeline_),
+      "failed to create pipeline");
+
+  vkDestroyShaderModule(device_, shader_module, kNoVkAllocCallbacks);
+}
+
+void VulkanPipeline::create_descriptor_pool(const Params &params) {
+  VkDescriptorPoolSize pool_size{};
+  pool_size.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+  // This is the total number of descriptors we will allocate from this pool,
+  // across all the descriptor sets.
+  // https://stackoverflow.com/a/51716660/12003165
+  pool_size.descriptorCount = params.buffer_bindings.size();
+
+  VkDescriptorPoolCreateInfo pool_info{};
+  pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+  pool_info.maxSets = 1;
+  pool_info.poolSizeCount = 1;
+  pool_info.pPoolSizes = &pool_size;
+  BAIL_ON_VK_BAD_RESULT(
+      vkCreateDescriptorPool(device_, &pool_info, kNoVkAllocCallbacks,
+                             &descriptor_pool_),
+      "failed to create descriptor pool");
+}
+
+void VulkanPipeline::create_descriptor_sets(const Params &params) {
+  VkDescriptorSetAllocateInfo alloc_info{};
+  alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+  alloc_info.descriptorPool = descriptor_pool_;
+  alloc_info.descriptorSetCount = 1;
+  alloc_info.pSetLayouts = &descriptor_set_layout_;
+
+  BAIL_ON_VK_BAD_RESULT(
+      vkAllocateDescriptorSets(device_, &alloc_info, &descriptor_set_),
+      "failed to allocate descriptor set");
+
+  const auto &buffer_binds = params.buffer_bindings;
+  std::vector<VkDescriptorBufferInfo> descriptor_buffer_infos;
+  descriptor_buffer_infos.reserve(buffer_binds.size());
+  for (const auto &bb : buffer_binds) {
+    VkDescriptorBufferInfo buffer_info{};
+    buffer_info.buffer = bb.buffer;
+    // Note that this is the offset within the buffer itself, not the offset
+    // of this buffer within its backing memory!
+    buffer_info.offset = 0;
+    // https://github.com/apache/tvm/blob/d288bbc5df3660355adbf97f2f84ecd232e269ff/src/runtime/vulkan/vulkan.cc#L1073
+    buffer_info.range = VK_WHOLE_SIZE;
+    descriptor_buffer_infos.push_back(buffer_info);
+  }
+
+  std::vector<VkWriteDescriptorSet> descriptor_writes;
+  descriptor_writes.reserve(descriptor_buffer_infos.size());
+  for (size_t i = 0; i < buffer_binds.size(); ++i) {
+    const auto &bb = buffer_binds[i];
+
+    VkWriteDescriptorSet write{};
+    write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    write.dstSet = descriptor_set_;
+    write.dstBinding = bb.binding;
+    write.dstArrayElement = 0;
+    write.descriptorCount = 1;
+    write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    write.pBufferInfo = &descriptor_buffer_infos[i];
+    write.pImageInfo = nullptr;
+    write.pTexelBufferView = nullptr;
+    descriptor_writes.push_back(write);
+  }
+
+  vkUpdateDescriptorSets(device_,
+                         /*descriptorWriteCount=*/descriptor_writes.size(),
+                         descriptor_writes.data(), /*descriptorCopyCount=*/0,
+                         /*pDescriptorCopies=*/nullptr);
+}
+
+VulkanCommandBuilder::VulkanCommandBuilder(const VulkanDevice *device) {
+  VkCommandBufferAllocateInfo alloc_info{};
+  alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  alloc_info.commandPool = device->command_pool();
+  alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  alloc_info.commandBufferCount = 1;
+  BAIL_ON_VK_BAD_RESULT(
+      vkAllocateCommandBuffers(device->device(), &alloc_info, &command_buffer_),
+      "failed to allocate command buffer");
+
+  VkCommandBufferBeginInfo begin_info{};
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  // This flag allows us to submit the same command buffer to the queue
+  // multiple times, while they are still pending.
+  begin_info.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+  begin_info.pInheritanceInfo = nullptr;
+  BAIL_ON_VK_BAD_RESULT(vkBeginCommandBuffer(command_buffer_, &begin_info),
+                        "failed to begin recording command buffer");
+}
+
+VulkanCommandBuilder::~VulkanCommandBuilder() {
+  if (command_buffer_ != VK_NULL_HANDLE) {
+    (void)vkEndCommandBuffer(command_buffer_);  // best effort, never throws
+  }
+}
+
+void VulkanCommandBuilder::append(const VulkanPipeline &pipeline,
+                                  int group_count_x) {
+  vkCmdBindPipeline(command_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
+                    pipeline.pipeline());
+  vkCmdBindDescriptorSets(
+      command_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
+      pipeline.pipeline_layout(),
+      /*firstSet=*/0, /*descriptorSetCount=*/1, &(pipeline.descriptor_set()),
+      /*dynamicOffsetCount=*/0, /*pDynamicOffsets=*/nullptr);
+  vkCmdDispatch(command_buffer_, group_count_x,
+                /*groupCountY=*/1,
+                /*groupCountZ=*/1);
+  // Copied from TVM
+  // https://github.com/apache/tvm/blob/b2a3c481ebbb7cfbd5335fb11cd516ae5f348406/src/runtime/vulkan/vulkan.cc#L1134-L1142
+  VkMemoryBarrier barrier_info{};
+  barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+  barrier_info.pNext = nullptr;
+  barrier_info.srcAccessMask =
+      VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT;
+  barrier_info.dstAccessMask =
+      (VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT |
+       VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
+  vkCmdPipelineBarrier(command_buffer_,
+                       /*srcStageMask=*/VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                       /*dstStageMask=*/VK_PIPELINE_STAGE_TRANSFER_BIT |
+                           VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                       /*dependencyFlags=*/0, /*memoryBarrierCount=*/1,
+                       &barrier_info, /*bufferMemoryBarrierCount=*/0,
+                       /*pBufferMemoryBarriers=*/nullptr,
+                       /*imageMemoryBarrierCount=*/0,
+                       /*pImageMemoryBarriers=*/nullptr);
+}
+
+VkCommandBuffer VulkanCommandBuilder::build() {
+  BAIL_ON_VK_BAD_RESULT(vkEndCommandBuffer(command_buffer_),
+                        "failed to record command buffer");
+  VkCommandBuffer res = command_buffer_;
+  command_buffer_ = VK_NULL_HANDLE;
+  return res;
+}
+
+VkCommandBuffer record_copy_buffer_command(
+    const VulkanDevice *device,
+    VkBuffer src_buffer,
+    VkBuffer dst_buffer,
+    VkDeviceSize size,
+    VulkanCopyBufferDirection direction) {
+  VkCommandBuffer command{VK_NULL_HANDLE};
+  VkCommandBufferAllocateInfo alloc_info{};
+  alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  alloc_info.commandPool = device->command_pool();
+  alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  alloc_info.commandBufferCount = 1;
+  BAIL_ON_VK_BAD_RESULT(
+      vkAllocateCommandBuffers(device->device(), &alloc_info, &command),
+      "failed to allocate copy command buffer");
+
+  VkCommandBufferBeginInfo begin_info{};
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.flags = 0;
+  begin_info.pInheritanceInfo = nullptr;
+  BAIL_ON_VK_BAD_RESULT(vkBeginCommandBuffer(command, &begin_info),
+                        "failed to begin recording copy command buffer");
+
+  VkBufferCopy copy_region{};
+  copy_region.srcOffset = 0;
+  copy_region.dstOffset = 0;
+  copy_region.size = size;
+  vkCmdCopyBuffer(command, src_buffer, dst_buffer, /*regionCount=*/1,
+                  &copy_region);
+  if (direction == VulkanCopyBufferDirection::H2D) {
+    VkMemoryBarrier barrier_info;
+    barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+    barrier_info.pNext = nullptr;
+
+    barrier_info.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    barrier_info.dstAccessMask =
+        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT;
+    vkCmdPipelineBarrier(command,
+                         /*srcStageMask=*/VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         /*dstStageMask=*/VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
+                             VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         0, 1, &barrier_info, 0, nullptr, 0, nullptr);
+  }
+  BAIL_ON_VK_BAD_RESULT(vkEndCommandBuffer(command),
+                        "failed to record copy command buffer");
+  return command;
+}
+
+VulkanStream::VulkanStream(const VulkanDevice *device) : device_(device) {
+}
+
+void VulkanStream::launch(VkCommandBuffer command) {
+  VkSubmitInfo submit_info{};
+  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info.commandBufferCount = 1;
+  submit_info.pCommandBuffers = &command;
+
+  BAIL_ON_VK_BAD_RESULT(
+      vkQueueSubmit(device_->compute_queue(), /*submitCount=*/1, &submit_info,
+                    /*fence=*/VK_NULL_HANDLE),
+      "failed to submit command buffer");
+}
+
+void VulkanStream::synchronize() {
+  // While vkQueueWaitIdle is strongly discouraged, this is probably the most
+  // viable way for synchronization in Taichi. Unlike graphics pipeline, there
+  // is no clear boundary (i.e. frame) for us to use a VkFence. TVM accumulates
+  // all the commands into a single buffer, then submits it all at once upon
+  // synchronization. Not sure how efficient that model is.
+  vkQueueWaitIdle(device_->compute_queue());
+}
+
+}  // namespace vulkan
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/vulkan/vulkan_api.h b/taichi/backends/vulkan/vulkan_api.h
new file mode 100644
index 0000000000000..53e5232c30ca1
--- /dev/null
+++ b/taichi/backends/vulkan/vulkan_api.h
@@ -0,0 +1,188 @@
+#pragma once
+
+#include <vulkan/vulkan.h>
+#include <vulkan/vulkan_core.h>
+
+#include <optional>
+#include <vector>
+
+namespace taichi {
+namespace lang {
+namespace vulkan {
+
+struct SpirvCodeView {
+  const uint32_t *data = nullptr;
+  size_t size = 0;
+
+  SpirvCodeView() = default;
+
+  explicit SpirvCodeView(const std::vector<uint32_t> &code)
+      : data(code.data()), size(code.size() * sizeof(uint32_t)) {
+  }
+};
+
+struct VulkanQueueFamilyIndices {
+  std::optional<uint32_t> compute_family;
+  // TODO: While all COMPUTE/GRAPHICS queues also support TRANSFER by default,
+  // maybe there are some performance benefits in finding a
+  // TRANSFER-dedicated queue family.
+  // https://vulkan-tutorial.com/Vertex_buffers/Staging_buffer#page_Transfer-queue
+
+  bool is_complete() const {
+    return compute_family.has_value();
+  }
+};
+
+// Many classes here are inspired by TVM's runtime
+// https://github.com/apache/tvm/tree/main/src/runtime/vulkan
+
+// VulkanDevice maps to a (VkDevice, VkQueue) tuple. Right now we only use
+// a single queue from a single device, so it does not make a difference to
+// separate the queue from the device. This is similar to using a single CUDA
+// stream.
+class VulkanDevice {
+ public:
+  struct Params {
+    uint32_t api_version{VK_API_VERSION_1_0};
+  };
+  explicit VulkanDevice(const Params &params);
+  ~VulkanDevice();
+
+  VkPhysicalDevice physical_device() const {
+    return physical_device_;
+  }
+  VkDevice device() const {
+    return device_;
+  }
+  const VulkanQueueFamilyIndices &queue_family_indices() const {
+    return queue_family_indices_;
+  }
+  VkQueue compute_queue() const {
+    return compute_queue_;
+  }
+  VkCommandPool command_pool() const {
+    return command_pool_;
+  }
+
+ private:
+  void create_instance(const Params &params);
+  void setup_debug_messenger();
+  void pick_physical_device();
+  void create_logical_device();
+  void create_command_pool();
+
+  VkInstance instance_{VK_NULL_HANDLE};
+  VkDebugUtilsMessengerEXT debug_messenger_{VK_NULL_HANDLE};
+  VkPhysicalDevice physical_device_{VK_NULL_HANDLE};
+  VulkanQueueFamilyIndices queue_family_indices_;
+  VkDevice device_{VK_NULL_HANDLE};
+  // TODO: It's probably not right to put these per-queue things here. However,
+  // in Taichi we only use a single queue on a single device (i.e. a single CUDA
+  // stream), so it doesn't make a difference.
+  VkQueue compute_queue_{VK_NULL_HANDLE};
+  // TODO: Shall we have dedicated command pools for COMPUTE and TRANSFER
+  // commands, respectively?
+  VkCommandPool command_pool_{VK_NULL_HANDLE};
+};
+
+// VulkanPipeline maps to a VkPipeline, or a SPIR-V module (a GLSL compute
+// shader). Because Taichi's buffers are all pre-allocated upon startup, we
+// only need to set up the descriptor set (i.e., bind the buffers via
+// VkWriteDescriptorSet) once during the pipeline initialization.
+class VulkanPipeline {
+ public:
+  struct BufferBinding {
+    VkBuffer buffer{VK_NULL_HANDLE};
+    uint32_t binding{0};
+  };
+
+  struct Params {
+    const VulkanDevice *device{nullptr};
+    std::vector<BufferBinding> buffer_bindings;
+    SpirvCodeView code;
+  };
+
+  explicit VulkanPipeline(const Params &params);
+  ~VulkanPipeline();
+
+  VkPipelineLayout pipeline_layout() const {
+    return pipeline_layout_;
+  }
+  VkPipeline pipeline() const {
+    return pipeline_;
+  }
+  const VkDescriptorSet &descriptor_set() const {
+    return descriptor_set_;
+  }
+
+ private:
+  void create_descriptor_set_layout(const Params &params);
+  void create_compute_pipeline(const Params &params);
+  void create_descriptor_pool(const Params &params);
+  void create_descriptor_sets(const Params &params);
+
+  VkDevice device_{VK_NULL_HANDLE};  // not owned
+
+  // TODO: Commands using the same Taichi buffers should be able to share the
+  // same descriptor set layout?
+  VkDescriptorSetLayout descriptor_set_layout_{VK_NULL_HANDLE};
+  // TODO: Commands having the same |descriptor_set_layout_| should be able to
+  // share the same pipeline layout?
+  VkPipelineLayout pipeline_layout_{VK_NULL_HANDLE};
+  // This maps 1:1 to a shader, so it needs to be created per compute
+  // shader.
+  VkPipeline pipeline_{VK_NULL_HANDLE};
+  VkDescriptorPool descriptor_pool_{VK_NULL_HANDLE};
+  VkDescriptorSet descriptor_set_{VK_NULL_HANDLE};
+};
+
+// VulkanCommandBuilder builds a VkCommandBuffer by recording a given series of
+// VulkanPipelines. The workgroup count needs to be known at recording time.
+// TODO: Do we ever need to adjust the workgroup count at runtime?
+class VulkanCommandBuilder {
+ public:
+  explicit VulkanCommandBuilder(const VulkanDevice *device);
+
+  ~VulkanCommandBuilder();
+
+  void append(const VulkanPipeline &pipeline, int group_count_x);
+
+  VkCommandBuffer build();
+
+ private:
+  // VkCommandBuffers are destroyed when the underlying command pool is
+  // destroyed.
+  // https://vulkan-tutorial.com/Drawing_a_triangle/Drawing/Command_buffers#page_Command-buffer-allocation
+  VkCommandBuffer command_buffer_{VK_NULL_HANDLE};
+};
+
+enum class VulkanCopyBufferDirection {
+  H2D,
+  D2H,
+  // D2D does not have a use case yet
+};
+
+VkCommandBuffer record_copy_buffer_command(const VulkanDevice *device,
+                                           VkBuffer src_buffer,
+                                           VkBuffer dst_buffer,
+                                           VkDeviceSize size,
+                                           VulkanCopyBufferDirection direction);
+
+// A Vulkan stream models an asynchronous GPU execution queue.
+// Commands are submitted via launch() and executed asynchronously.
+// synchronize() blocks the host until all the launched commands have
+// completed execution.
+class VulkanStream {
+ public:
+  VulkanStream(const VulkanDevice *device);
+
+  void launch(VkCommandBuffer command);
+  void synchronize();
+
+ private:
+  const VulkanDevice *const device_;
+};
+
+}  // namespace vulkan
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/vulkan/vulkan_common.h b/taichi/backends/vulkan/vulkan_common.h
new file mode 100644
index 0000000000000..7d6b4e95558d3
--- /dev/null
+++ b/taichi/backends/vulkan/vulkan_common.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <vulkan/vulkan.h>
+#include <vulkan/vulkan_core.h>
+
+#include <stdexcept>
+
+namespace taichi {
+namespace lang {
+namespace vulkan {
+
+#pragma message("BAIL_ON_VK_BAD_RESULT uses exception")
+
+#define BAIL_ON_VK_BAD_RESULT(result, msg) \
+  do {                                     \
+    if ((result) != VK_SUCCESS) {          \
+      throw std::runtime_error((msg));     \
+    }                                      \
+  } while (0)
+
+inline constexpr VkAllocationCallbacks *kNoVkAllocCallbacks = nullptr;
+
+}  // namespace vulkan
+}  // namespace lang
+}  // namespace taichi

From caa42fb6d2e3b49d373246ecc22c937247655e36 Mon Sep 17 00:00:00 2001
From: Ye Kuang <k-ye@users.noreply.github.com>
Date: Thu, 6 May 2021 22:25:55 +0800
Subject: [PATCH 02/29] [vulkan] Add per kernel info structs (#2300)

* [vulkan] Add per kernel info structs

* trace
---
 taichi/backends/vulkan/data_type_utils.h |  20 +++
 taichi/backends/vulkan/kernel_utils.cpp  | 123 ++++++++++++++++
 taichi/backends/vulkan/kernel_utils.h    | 179 +++++++++++++++++++++++
 3 files changed, 322 insertions(+)
 create mode 100644 taichi/backends/vulkan/data_type_utils.h
 create mode 100644 taichi/backends/vulkan/kernel_utils.cpp
 create mode 100644 taichi/backends/vulkan/kernel_utils.h

diff --git a/taichi/backends/vulkan/data_type_utils.h b/taichi/backends/vulkan/data_type_utils.h
new file mode 100644
index 0000000000000..c837a9d6d39d3
--- /dev/null
+++ b/taichi/backends/vulkan/data_type_utils.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+
+#include "taichi/lang_util.h"
+
+namespace taichi {
+namespace lang {
+namespace vulkan {
+
+inline std::size_t vk_data_type_size(DataType dt) {
+  // Vulkan buffers require a minimum alignment of 4 bytes.
+  // https://vulkan-tutorial.com/Uniform_buffers/Descriptor_pool_and_sets#page_Alignment-requirements
+  return std::max(data_type_size(dt), 4);
+}
+
+}  // namespace vulkan
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/vulkan/kernel_utils.cpp b/taichi/backends/vulkan/kernel_utils.cpp
new file mode 100644
index 0000000000000..29f7c2ad6f763
--- /dev/null
+++ b/taichi/backends/vulkan/kernel_utils.cpp
@@ -0,0 +1,123 @@
+#include "taichi/backends/vulkan/kernel_utils.h"
+
+#include <unordered_map>
+
+#include "taichi/backends/vulkan/data_type_utils.h"
+#include "taichi/program/kernel.h"
+#define TI_RUNTIME_HOST
+#include "taichi/program/context.h"
+#undef TI_RUNTIME_HOST
+
+namespace taichi {
+namespace lang {
+namespace vulkan {
+
+// static. Throws std::out_of_range for an unregistered enumerator.
+std::string TaskAttributes::buffers_name(Buffers b) {
+#define REGISTER_NAME(x) \
+  { Buffers::x, #x }
+  const static std::unordered_map<Buffers, std::string> m = {
+      REGISTER_NAME(Root),
+      REGISTER_NAME(GlobalTmps),
+      REGISTER_NAME(Context),
+  };
+#undef REGISTER_NAME
+  return m.at(b);
+}
+
+std::string TaskAttributes::debug_string() const {
+  std::string result;
+  result += fmt::format(
+      "<TaskAttributes name={} advisory_total_num_threads={} "
+      "task_type={} buffers=[ ",
+      name, advisory_total_num_threads, offloaded_task_type_name(task_type));
+  for (auto b : buffer_binds) {
+    result += buffers_name(b.type) + " ";
+  }
+  result += "]";  // closes |buffers|
+  // TODO(k-ye): show range_for
+  result += ">";
+  return result;
+}
+
+std::string TaskAttributes::BufferBind::debug_string() const {
+  return fmt::format("<type={} binding={}>", TaskAttributes::buffers_name(type),
+                     binding);
+}
+
+KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
+    : args_bytes_(0),
+      rets_bytes_(0),
+      extra_args_bytes_(Context::extra_args_size) {
+  arg_attribs_vec_.reserve(kernel.args.size());
+  for (const auto &ka : kernel.args) {
+    ArgAttributes aa;
+    aa.dt = ka.dt;
+    const size_t dt_bytes = vk_data_type_size(aa.dt);
+    if (dt_bytes != 4) {
+      TI_ERROR("Vulkan kernel only supports 32-bit data, got {}",
+               data_type_name(aa.dt));
+    }
+    aa.is_array = ka.is_external_array;
+    // For array, |ka.size| is #elements * elements_size
+    aa.stride = aa.is_array ? ka.size : dt_bytes;
+    aa.index = arg_attribs_vec_.size();
+    arg_attribs_vec_.push_back(aa);
+  }
+  for (const auto &kr : kernel.rets) {
+    RetAttributes ra;
+    ra.dt = kr.dt;
+    const size_t dt_bytes = vk_data_type_size(ra.dt);
+    if (dt_bytes != 4) {
+      TI_ERROR("Vulkan kernel only supports 32-bit data, got {}",
+               data_type_name(ra.dt));
+    }
+    ra.is_array = false;  // TODO(#909): this is a temporary limitation
+    ra.stride = dt_bytes;
+    ra.index = ret_attribs_vec_.size();
+    ret_attribs_vec_.push_back(ra);
+  }
+
+  auto arrange_scalar_before_array = [](auto *vec, size_t offset) -> size_t {
+    std::vector<int> scalar_indices;
+    std::vector<int> array_indices;
+    for (int i = 0; i < static_cast<int>(vec->size()); ++i) {
+      if ((*vec)[i].is_array) {
+        array_indices.push_back(i);
+      } else {
+        scalar_indices.push_back(i);
+      }
+    }
+    size_t bytes = offset;
+    // Put scalar args in the memory first
+    for (int i : scalar_indices) {
+      auto &attribs = (*vec)[i];
+      attribs.offset_in_mem = bytes;
+      bytes += attribs.stride;
+      TI_TRACE("  at={} scalar offset_in_mem={} stride={}", i,
+               attribs.offset_in_mem, attribs.stride);
+    }
+    // Then the array args
+    for (int i : array_indices) {
+      auto &attribs = (*vec)[i];
+      attribs.offset_in_mem = bytes;
+      bytes += attribs.stride;
+      TI_TRACE("  at={} array offset_in_mem={} stride={}", i,
+               attribs.offset_in_mem, attribs.stride);
+    }
+    return bytes - offset;
+  };
+
+  TI_TRACE("args:");
+  args_bytes_ = arrange_scalar_before_array(&arg_attribs_vec_, 0);
+  TI_TRACE("rets:");
+  rets_bytes_ = arrange_scalar_before_array(&ret_attribs_vec_, args_bytes_);
+  TI_TRACE("sizes: args={} rets={} ctx={} total={}", args_bytes(), rets_bytes(),
+           ctx_bytes(), total_bytes());
+  TI_ASSERT(has_args() == (args_bytes_ > 0));
+  TI_ASSERT(has_rets() == (rets_bytes_ > 0));
+}
+
+}  // namespace vulkan
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/vulkan/kernel_utils.h b/taichi/backends/vulkan/kernel_utils.h
new file mode 100644
index 0000000000000..24e00a8f883fc
--- /dev/null
+++ b/taichi/backends/vulkan/kernel_utils.h
@@ -0,0 +1,179 @@
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "taichi/ir/offloaded_task_type.h"
+#include "taichi/ir/type.h"
+
+namespace taichi {
+namespace lang {
+
+class Kernel;
+class SNode;
+
+namespace vulkan {
+
+struct TaskAttributes {
+  enum class Buffers {
+    Root,
+    GlobalTmps,
+    Context,
+  };
+
+  struct BufferBind {
+    Buffers type;
+    int binding;
+
+    std::string debug_string() const;
+  };
+
+  std::string name;
+  // Total number of threads to launch (i.e. threads per grid). Note that this
+  // is only advisory, because eventually this number is also determined by the
+  // runtime config. This works because grid strided loop is supported.
+  int advisory_total_num_threads;
+  int advisory_num_threads_per_group;
+
+  OffloadedTaskType task_type;
+
+  struct RangeForAttributes {
+    // |begin| has different meanings depending on |const_begin|:
+    // * true : It is the left boundary of the loop known at compile time.
+    // * false: It is the offset of the begin in the global tmps buffer.
+    //
+    // Same applies to |end|.
+    size_t begin;
+    size_t end;
+    bool const_begin{true};
+    bool const_end{true};
+
+    inline bool const_range() const {
+      return (const_begin && const_end);
+    }
+  };
+  std::vector<BufferBind> buffer_binds;
+  // Only valid when |task_type| is range_for.
+  std::optional<RangeForAttributes> range_for_attribs;
+
+  static std::string buffers_name(Buffers b);
+  std::string debug_string() const;
+};
+
+// This class contains the attributes descriptors for both the input args and
+// the return values of a Taichi kernel.
+//
+// Note that all Vulkan tasks (shaders) belonging to the same Taichi kernel will
+// share the same kernel args (i.e. they use the same Vulkan buffer for input
+// args and return values). This is because kernel arguments is a Taichi-level
+// concept.
+class KernelContextAttributes {
+ private:
+  // Attributes that are shared by the input arg and the return value.
+  struct AttribsBase {
+    // For scalar arg, this is max(stride(dt), 4)
+    // For array arg, this is #elements * max(stride(dt), 4)
+    // Unit: byte
+    size_t stride = 0;
+    // Offset in the context buffer
+    size_t offset_in_mem = 0;
+    // Index of the input arg or the return value in the host `Context`
+    int index = -1;
+    DataType dt;
+    bool is_array = false;
+  };
+
+ public:
+  // This is mostly the same as Kernel::Arg, with Vulkan specific attributes.
+  struct ArgAttributes : public AttribsBase {};
+
+  // This is mostly the same as Kernel::Ret, with Vulkan specific attributes.
+  struct RetAttributes : public AttribsBase {};
+
+  KernelContextAttributes() = default;
+  explicit KernelContextAttributes(const Kernel &kernel);
+
+  inline bool has_args() const {
+    return !arg_attribs_vec_.empty();
+  }
+
+  inline const std::vector<ArgAttributes> &args() const {
+    return arg_attribs_vec_;
+  }
+
+  inline bool has_rets() const {
+    return !ret_attribs_vec_.empty();
+  }
+
+  inline const std::vector<RetAttributes> &rets() const {
+    return ret_attribs_vec_;
+  }
+
+  // Returns true if the kernel has neither input args nor return values.
+  inline bool empty() const {
+    return !(has_args() || has_rets());
+  }
+
+  inline size_t args_bytes() const {
+    return args_bytes_;
+  }
+
+  inline size_t rets_bytes() const {
+    return rets_bytes_;
+  }
+
+  inline size_t rets_mem_offset() const {
+    return args_bytes();
+  }
+
+  // Total size in bytes of the input args and return values,
+  // *excluding* the extra args bytes!
+  inline size_t ctx_bytes() const {
+    return args_bytes() + rets_bytes();
+  }
+
+  inline size_t extra_args_bytes() const {
+    return extra_args_bytes_;
+  }
+
+  inline size_t extra_args_mem_offset() const {
+    return ctx_bytes();
+  }
+
+  // Total bytes needed for allocating the Vulkan buffer
+  inline size_t total_bytes() const {
+    return ctx_bytes() + extra_args_bytes();
+  }
+
+ private:
+  // Memory layout
+  //
+  // /---- input args ----\/---- ret vals -----\/-- extra args --\
+  // +----------+---------+----------+---------+-----------------+
+  // |  scalar  |  array  |  scalar  |  array  |      scalar     |
+  // +----------+---------+----------+---------+-----------------+
+  //
+  std::vector<ArgAttributes> arg_attribs_vec_;
+  std::vector<RetAttributes> ret_attribs_vec_;
+
+  size_t args_bytes_ = 0;
+  size_t rets_bytes_ = 0;
+  size_t extra_args_bytes_ = 0;
+};
+
+// Groups all the Vulkan kernels generated from a single ti.kernel
+struct TaichiKernelAttributes {
+  // Taichi kernel name
+  std::string name;
+  // Is this kernel for evaluating the constant fold result?
+  bool is_jit_evaluator = false;
+  // Attributes of all the tasks produced from this single Taichi kernel.
+  std::vector<TaskAttributes> tasks_attribs;
+
+  KernelContextAttributes ctx_attribs;
+};
+
+}  // namespace vulkan
+}  // namespace lang
+}  // namespace taichi

From fb90959cbfa7c7259f7db24d44e21fd93862d7bf Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Mon, 14 Jun 2021 01:08:38 -0500
Subject: [PATCH 03/29] abort python2 import

---
 python/taichi/__init__.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 680a6f12b4e1f..ae81027a1c1f3 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -1,12 +1,16 @@
-from .core import *
-from .lang import *  # TODO(archibate): It's `taichi.lang.core` overriding `taichi.core`
-from .main import main
-from .misc import *
-from .testing import *
-from .tools import *
-from .torch_io import from_torch, to_torch
+import sys
+if sys.version_info[0] == 3:
+    # Abort importing Taichi in a Python2 environment. 
+    # This is to make Houdini load correctly under a Taichi dev install.
+    from .core import *
+    from .lang import *  # TODO(archibate): It's `taichi.lang.core` overriding `taichi.core`
+    from .main import main
+    from .misc import *
+    from .testing import *
+    from .tools import *
+    from .torch_io import from_torch, to_torch
 
-__all__ = ['core', 'misc', 'lang', 'tools', 'main', 'torch_io']
+    __all__ = ['core', 'misc', 'lang', 'tools', 'main', 'torch_io']
 
-__version__ = (core.get_version_major(), core.get_version_minor(),
-               core.get_version_patch())
+    __version__ = (core.get_version_major(), core.get_version_minor(),
+                core.get_version_patch())

From f5f5ab2060397c0a2859a1e199d151ff1a8a3b5b Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Mon, 14 Jun 2021 01:33:06 -0500
Subject: [PATCH 04/29] abort python2 import

---
 python/taichi/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 90d5d8e15624b..2534ecaf13c62 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -13,7 +13,7 @@
     # Issue#2223: Do not reorder, or we're busted with partially initialized module
     from taichi import aot  # isort:skip
 
-        __all__ = ['core', 'misc', 'lang', 'tools', 'main', 'torch_io']
+    __all__ = ['core', 'misc', 'lang', 'tools', 'main', 'torch_io']
 
-        __version__ = (core.get_version_major(), core.get_version_minor(),
-                    core.get_version_patch())
+    __version__ = (core.get_version_major(), core.get_version_minor(),
+                core.get_version_patch())

From 9f45db41ea90548a22c6ddb9989fb3b46012dbc1 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Sat, 31 Jul 2021 18:40:17 -0500
Subject: [PATCH 05/29] [skip ci] add initial custom struct test

---
 tests/python/test_struct.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index 14b870d415d39..e641d19b1615e 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -77,3 +77,24 @@ def test_2d_nested():
     for i in range(n * 2):
         for j in range(n):
             assert x[i, j] == i + j * 10
+
+
+@ti.all_archs
+def test_custom_struct():
+    n = 32
+
+    st = ti.type_factory.make_struct(a=ti.i32, b=ti.f32)
+    f = ti.field(dtype=st, shape=(n, ))
+
+    @ti.kernel
+    def init():
+        for i in f:
+            f[i].a = i
+    @ti.kernel
+    def run():
+        for i in f:
+            f[i].b = f[i].a
+    init()
+    run()
+    for i in range(n):
+        assert f[i].b == i

From d68f7fe511fed9a6c429b1c84330062450d3edfa Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Sun, 1 Aug 2021 21:46:07 -0500
Subject: [PATCH 06/29] [skip ci] add struct class

---
 python/taichi/__init__.py      |  28 ++--
 python/taichi/lang/__init__.py |   1 +
 python/taichi/lang/struct.py   | 268 +++++++++++++++++++++++++++++++++
 3 files changed, 281 insertions(+), 16 deletions(-)
 create mode 100644 python/taichi/lang/struct.py

diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 2534ecaf13c62..9f6ea3938b744 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -1,19 +1,15 @@
-import sys
-if sys.version_info[0] == 3:
-    # Abort importing Taichi in a Python2 environment. 
-    # This is to make Houdini load correctly under a Taichi dev install.
-    from taichi.core import *
-    from taichi.lang import *  # TODO(archibate): It's `taichi.lang.core` overriding `taichi.core`
-    from taichi.main import main
-    from taichi.misc import *
-    from taichi.testing import *
-    from taichi.tools import *
-    from taichi.torch_io import from_torch, to_torch
+from taichi.core import *
+from taichi.lang import *  # TODO(archibate): It's `taichi.lang.core` overriding `taichi.core`
+from taichi.main import main
+from taichi.misc import *
+from taichi.testing import *
+from taichi.tools import *
+from taichi.torch_io import from_torch, to_torch
 
-    # Issue#2223: Do not reorder, or we're busted with partially initialized module
-    from taichi import aot  # isort:skip
+# Issue#2223: Do not reorder, or we're busted with partially initialized module
+from taichi import aot  # isort:skip
 
-    __all__ = ['core', 'misc', 'lang', 'tools', 'main', 'torch_io']
+__all__ = ['core', 'misc', 'lang', 'tools', 'main', 'torch_io']
 
-    __version__ = (core.get_version_major(), core.get_version_minor(),
-                core.get_version_patch())
+__version__ = (core.get_version_major(), core.get_version_minor(),
+               core.get_version_patch())
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index cae74e0e4cf0a..3d0eed2075143 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -9,6 +9,7 @@
 from taichi.lang.kernel_impl import (KernelArgError, KernelDefError,
                                      data_oriented, func, kernel, pyfunc)
 from taichi.lang.matrix import Matrix, Vector
+from taichi.lang.struct import Struct
 from taichi.lang.ndrange import GroupedNDRange, ndrange
 from taichi.lang.ops import *
 from taichi.lang.quant_impl import quant
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
new file mode 100644
index 0000000000000..cb17b70e44249
--- /dev/null
+++ b/python/taichi/lang/struct.py
@@ -0,0 +1,268 @@
+
+import numbers
+
+from taichi.lang import expr, impl
+from taichi.lang.exception import TaichiSyntaxError
+from taichi.lang.util import (python_scope, taichi_scope)
+
+import taichi as ti
+
+class Struct:
+    """The Struct type class.
+    Args:
+        entries (Dict[str, Any]): the names and data types for struct members.
+    """
+    is_taichi_class = True
+
+    def __init__(self, entries):
+        self.entries = entries
+
+    def members(self):
+        return self.entries.keys()
+
+    def empty_copy(self):
+        return Struct.empty(self.members())
+
+
+    def is_global(self):
+        results = [False for _ in self.entries.values()]
+        for i, e in enumerate(self.entries.values()):
+            if isinstance(e, expr.Expr):
+                if e.is_global():
+                    results[i] = True
+            assert results[i] == results[0], \
+                "Structs with mixed global/local entries are not allowed"
+        return results[0]
+
+    def get_field_members(self):
+        """Get struct elements list.
+
+        Returns:
+            A list of struct elements.
+        """
+        return list(self.entries.values())
+
+    def register_member(self, member_name, dtype):
+        self.entries[member_name] = impl.field(dtype, name=self.name + '.' + member_name)
+        setattr(Struct, member_name, 
+            property(
+                Struct.make_getter(member_name),
+                Struct.make_setter(member_name),
+            )
+        )
+
+    def __call__(self, name):
+        _taichi_skip_traceback = 1
+        return self.entries[name]
+
+    @taichi_scope
+    def subscript(self, *indices):
+        _taichi_skip_traceback = 1
+        if self.is_global():
+            ret = self.empty_copy()
+            for k, e in self.entries.items():
+                ret.entries[k] = impl.subscript(e, *indices)
+            return ret
+        else:
+            raise TaichiSyntaxError("Custom struct members cannot be locally subscripted")
+
+    def make_grad(self):
+        ret = self.empty_copy()
+        for k in ret.members():
+            ret.entries[k] = self.entries[k].grad
+        return ret
+
+    @staticmethod
+    def make_getter(member_name):
+        def getter(self):
+            """Get an entry from custom struct by name."""
+            _taichi_skip_traceback = 1
+            return self.entries[member_name]
+        return getter
+
+    @staticmethod
+    def make_setter(member_name):
+        @python_scope
+        def setter(self, value):
+            _taichi_skip_traceback = 1
+            self.entries[member_name] = value
+        return setter
+
+    class Proxy:
+        def __init__(self, struct, index):
+            """Proxy when a tensor of Structs is accessed by host."""
+            self.struct = struct
+            self.index = index
+            for member_name in self.struct.members():
+                setattr(Struct.Proxy, member_name, 
+                    property(
+                        Struct.Proxy.make_getter(member_name),
+                        Struct.Proxy.make_setter(member_name),
+                    )
+                )
+
+        @python_scope
+        def _get(self, name):
+            return self.struct(name)[self.index]
+
+        @python_scope
+        def _set(self, name, value):
+            self.struct(name)[self.index] = value
+
+        @staticmethod
+        def make_getter(member_name):
+            @python_scope
+            def getter(self):
+                return self.struct(member_name)[self.index]
+            return getter
+
+        @staticmethod
+        def make_setter(member_name):
+            @python_scope
+            def setter(self, value):
+                self.struct(member_name)[self.index] = value
+            return setter
+        
+        @property
+        def value(self):
+            ret = self.struct.empty_copy()
+            for k in self.struct.members():
+                ret.entries[k] = self.struct(k)[self.index]
+            return ret
+    
+    # host access & python scope operation
+    @python_scope
+    def __getitem__(self, indices):
+        """Access to the element at the given indices in a struct array.
+
+        Args:
+            indices (Sequence[Expr]): the indices of the element.
+
+        Returns:
+            The value of the element at a specific position of a struct array.
+
+        """
+        if self.is_global():
+            return Struct.Proxy(self, indices)
+        else:
+            raise TaichiSyntaxError("Custom struct members cannot be locally subscripted")
+        
+    @python_scope
+    def __setitem__(self, indices, item):
+        raise NotImplementedError("Cannot assign the whole struct in Python scope")
+
+    def __len__(self):
+        """Get the number of entries in a custom struct"""
+        return len(self.entries)
+
+    def __iter__(self):
+        return iter(self.entries.values())
+
+    def loop_range(self):
+        return list(self.entries.values())[0]
+
+    @property
+    def shape(self):
+        return self.loop_range().shape
+
+    @property
+    def snode(self):
+        return self.loop_range().snode
+
+    def __str__(self):
+        """Python scope struct array print support."""
+        if impl.inside_kernel():
+            return f'<ti.Struct {", ".join([str(k) + "=" + str(v.dtype) for k, v in self.entries.items()])}>'
+        else:
+            return str(self.to_numpy())
+
+    def __repr__(self):
+        if self.is_global():
+            # make interactive shell happy, prevent materialization
+            return f'<ti.Struct {", ".join([str(k) + "=" + str(v.dtype) for k, v in self.entries.items()])}>'
+        else:
+            return str(self.to_numpy())
+
+    @python_scope
+    def from_numpy(self, array_dict):
+        for k in self.members():
+            self(k).from_numpy(array_dict[k])
+    
+    @python_scope
+    def from_torch(self, array_dict):
+        for k in self.members():
+            self(k).from_torch(array_dict[k])
+
+    @python_scope
+    def to_numpy(self):
+        return {k: v.to_numpy() for k, v in self.entries.items()}
+    
+    @python_scope
+    def to_torch(self):
+        return {k: v.to_torch() for k, v in self.entries.items()}
+
+    @classmethod
+    def empty(cls, members):
+        """Clear the struct and fill None.
+
+        Args:
+            members (Dict[str, DataType]): the names and data types for struct members.
+        Returns:
+            :class:`~taichi.lang.struct.Struct`: A :class:`~taichi.lang.struct.Struct` instance filled with None.
+
+        """
+        return cls({k: None for k in members})
+
+    @classmethod
+    @python_scope
+    def field(cls,
+              members,
+              shape=None,
+              struct_name="<Struct>",
+              offset=None,
+              needs_grad=False,
+              layout=None):
+
+        self = cls.empty(members.keys())
+        self.name = struct_name
+
+        if layout is not None:
+            assert shape is not None, 'layout is useless without shape'
+        if shape is None:
+            assert offset is None, "shape cannot be None when offset is being set"
+
+        for member_name, dtype in members.items():
+            self.register_member(member_name, dtype)
+        self.grad = self.make_grad()
+
+        if shape is not None:
+            if isinstance(shape, numbers.Number):
+                shape = (shape, )
+            if isinstance(offset, numbers.Number):
+                offset = (offset, )
+
+            if offset is not None:
+                assert len(shape) == len(
+                    offset
+                ), f'The dimensionality of shape and offset must be the same  ({len(shape)} != {len(offset)})'
+
+            if layout is None:
+                layout = ti.AOS
+
+            dim = len(shape)
+            if layout.soa:
+                for e in self.entries.values():
+                    ti.root.dense(impl.index_nd(dim),
+                                  shape).place(e, offset=offset)
+                if needs_grad:
+                    for e in self.entries.values():
+                        ti.root.dense(impl.index_nd(dim),
+                                      shape).place(e.grad, offset=offset)
+            else:
+                ti.root.dense(impl.index_nd(dim),
+                              shape).place(*tuple(self.entries.values()), offset=offset)
+                if needs_grad:
+                    ti.root.dense(impl.index_nd(dim),
+                                  shape).place(*tuple(self.entries.values()),
+                                               offset=offset)
+        return self
\ No newline at end of file

From e26889f705cc50a9a2d56536f330779856200281 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Sun, 1 Aug 2021 21:46:38 -0500
Subject: [PATCH 07/29] add custom struct tests

---
 tests/python/test_numpy_io.py | 31 +++++++++++++++++++++++++++++++
 tests/python/test_struct.py   | 31 ++++++++++++++++++++++---------
 tests/python/test_torch_io.py | 18 ++++++++++++++++++
 3 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/tests/python/test_numpy_io.py b/tests/python/test_numpy_io.py
index 68fbd48ab7d81..2fdf9da30f8ba 100644
--- a/tests/python/test_numpy_io.py
+++ b/tests/python/test_numpy_io.py
@@ -45,6 +45,37 @@ def test_from_numpy_2d():
         for j in range(m):
             assert val[i, j] == i + j * 3
 
+@ti.all_archs
+def test_to_numpy_struct():
+    n = 16
+    f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+
+    for i in range(n):
+        f[i].a = i
+        f[i].b = f[i].a * 2
+
+    arr_dict = f.to_numpy()
+
+    for i in range(n):
+        assert arr_dict["a"][i] == i
+        assert arr_dict["b"][i] == i * 2
+
+@ti.all_archs
+def test_from_numpy_struct():
+    n = 16
+    f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+
+    arr_dict = {
+        "a": np.arange(n),
+        "b": np.arange(n) * 2,
+    }
+
+    f.from_numpy(arr_dict)
+
+    for i in range(n):
+        assert f[i].a == i
+        assert f[i].b == i * 2
+
 
 @ti.require(ti.extension.data64)
 @ti.all_archs
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index e641d19b1615e..ef900c4991c92 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -83,18 +83,31 @@ def test_2d_nested():
 def test_custom_struct():
     n = 32
 
-    st = ti.type_factory.make_struct(a=ti.i32, b=ti.f32)
-    f = ti.field(dtype=st, shape=(n, ))
+    # May also want to suuport vector struct using compound types such as
+    # st = ti.types.struct(a=ti.types.vector(3, ti.f32), b=ti.f32)
+    # f = ti.field(dtype=st, shape=(n, ))
+    
+    x = ti.Struct.field({"a": ti.f32, "b": ti.f32}, shape=(n, ))
+    y = ti.Struct.field({"a": ti.f32, "b": ti.f32})
+
+    ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(y)
 
     @ti.kernel
     def init():
-        for i in f:
-            f[i].a = i
+        for i in x:
+            x[i].a = i
+            y[i].a = i
     @ti.kernel
-    def run():
-        for i in f:
-            f[i].b = f[i].a
+    def run_taichi_scope():
+        for i in x:
+            x[i].b = x[i].a
+    def run_python_scope():
+        for i in range(n):
+            y[i].b = y[i].a * 2 + 1
     init()
-    run()
+    run_taichi_scope()
+    for i in range(n):
+        assert x[i].b == i
+    run_python_scope()
     for i in range(n):
-        assert f[i].b == i
+        assert y[i].b == i * 2 + 1    
diff --git a/tests/python/test_torch_io.py b/tests/python/test_torch_io.py
index 1a06c2177deb8..503cc43c1745b 100644
--- a/tests/python/test_torch_io.py
+++ b/tests/python/test_torch_io.py
@@ -167,6 +167,24 @@ def test_io_zeros():
     zeros = mat.to_torch()
     assert zeros[1, 2] == 4
 
+@ti.torch_test
+def test_io_struct():
+    n = 16
+    x1 = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+    t1 = {
+        "a": torch.tensor(2 * np.ones(n, dtype=np.int32)),
+        "b": torch.tensor(3 * np.ones(n, dtype=np.float32))
+    }
+
+    x1.from_torch(t1)
+    for i in range(n):
+        assert x1[i].a == 2
+        assert x1[i].b == 3
+
+    t2 = x1.to_torch()
+    for k in t1:
+        assert (t1[k] == t2[k]).all()
+
 
 @ti.torch_test
 def test_fused_kernels():

From e7a8acece37f0e8b1c6f284ac427dbae2520814a Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Mon, 2 Aug 2021 03:09:15 +0000
Subject: [PATCH 08/29] Auto Format

---
 python/taichi/lang/__init__.py |  2 +-
 python/taichi/lang/struct.py   | 49 ++++++++++++++++++++--------------
 tests/python/test_numpy_io.py  |  2 ++
 tests/python/test_struct.py    |  7 +++--
 tests/python/test_torch_io.py  |  1 +
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 3d0eed2075143..b64bdae602082 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -9,11 +9,11 @@
 from taichi.lang.kernel_impl import (KernelArgError, KernelDefError,
                                      data_oriented, func, kernel, pyfunc)
 from taichi.lang.matrix import Matrix, Vector
-from taichi.lang.struct import Struct
 from taichi.lang.ndrange import GroupedNDRange, ndrange
 from taichi.lang.ops import *
 from taichi.lang.quant_impl import quant
 from taichi.lang.runtime_ops import async_flush, sync
+from taichi.lang.struct import Struct
 from taichi.lang.transformer import TaichiSyntaxError
 from taichi.lang.type_factory_impl import type_factory
 from taichi.lang.util import (has_pytorch, is_taichi_class, python_scope,
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index cb17b70e44249..2d48ed583bf7c 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -1,12 +1,12 @@
-
 import numbers
 
 from taichi.lang import expr, impl
 from taichi.lang.exception import TaichiSyntaxError
-from taichi.lang.util import (python_scope, taichi_scope)
+from taichi.lang.util import python_scope, taichi_scope
 
 import taichi as ti
 
+
 class Struct:
     """The Struct type class.
     Args:
@@ -23,7 +23,6 @@ def members(self):
     def empty_copy(self):
         return Struct.empty(self.members())
 
-
     def is_global(self):
         results = [False for _ in self.entries.values()]
         for i, e in enumerate(self.entries.values()):
@@ -43,13 +42,15 @@ def get_field_members(self):
         return list(self.entries.values())
 
     def register_member(self, member_name, dtype):
-        self.entries[member_name] = impl.field(dtype, name=self.name + '.' + member_name)
-        setattr(Struct, member_name, 
+        self.entries[member_name] = impl.field(dtype,
+                                               name=self.name + '.' +
+                                               member_name)
+        setattr(
+            Struct, member_name,
             property(
                 Struct.make_getter(member_name),
                 Struct.make_setter(member_name),
-            )
-        )
+            ))
 
     def __call__(self, name):
         _taichi_skip_traceback = 1
@@ -64,7 +65,8 @@ def subscript(self, *indices):
                 ret.entries[k] = impl.subscript(e, *indices)
             return ret
         else:
-            raise TaichiSyntaxError("Custom struct members cannot be locally subscripted")
+            raise TaichiSyntaxError(
+                "Custom struct members cannot be locally subscripted")
 
     def make_grad(self):
         ret = self.empty_copy()
@@ -78,6 +80,7 @@ def getter(self):
             """Get an entry from custom struct by name."""
             _taichi_skip_traceback = 1
             return self.entries[member_name]
+
         return getter
 
     @staticmethod
@@ -86,6 +89,7 @@ def make_setter(member_name):
         def setter(self, value):
             _taichi_skip_traceback = 1
             self.entries[member_name] = value
+
         return setter
 
     class Proxy:
@@ -94,12 +98,12 @@ def __init__(self, struct, index):
             self.struct = struct
             self.index = index
             for member_name in self.struct.members():
-                setattr(Struct.Proxy, member_name, 
+                setattr(
+                    Struct.Proxy, member_name,
                     property(
                         Struct.Proxy.make_getter(member_name),
                         Struct.Proxy.make_setter(member_name),
-                    )
-                )
+                    ))
 
         @python_scope
         def _get(self, name):
@@ -114,6 +118,7 @@ def make_getter(member_name):
             @python_scope
             def getter(self):
                 return self.struct(member_name)[self.index]
+
             return getter
 
         @staticmethod
@@ -121,15 +126,16 @@ def make_setter(member_name):
             @python_scope
             def setter(self, value):
                 self.struct(member_name)[self.index] = value
+
             return setter
-        
+
         @property
         def value(self):
             ret = self.struct.empty_copy()
             for k in self.struct.members():
                 ret.entries[k] = self.struct(k)[self.index]
             return ret
-    
+
     # host access & python scope operation
     @python_scope
     def __getitem__(self, indices):
@@ -145,11 +151,13 @@ def __getitem__(self, indices):
         if self.is_global():
             return Struct.Proxy(self, indices)
         else:
-            raise TaichiSyntaxError("Custom struct members cannot be locally subscripted")
-        
+            raise TaichiSyntaxError(
+                "Custom struct members cannot be locally subscripted")
+
     @python_scope
     def __setitem__(self, indices, item):
-        raise NotImplementedError("Cannot assign the whole struct in Python scope")
+        raise NotImplementedError(
+            "Cannot assign the whole struct in Python scope")
 
     def __len__(self):
         """Get the number of entries in a custom struct"""
@@ -187,7 +195,7 @@ def __repr__(self):
     def from_numpy(self, array_dict):
         for k in self.members():
             self(k).from_numpy(array_dict[k])
-    
+
     @python_scope
     def from_torch(self, array_dict):
         for k in self.members():
@@ -196,7 +204,7 @@ def from_torch(self, array_dict):
     @python_scope
     def to_numpy(self):
         return {k: v.to_numpy() for k, v in self.entries.items()}
-    
+
     @python_scope
     def to_torch(self):
         return {k: v.to_torch() for k, v in self.entries.items()}
@@ -260,9 +268,10 @@ def field(cls,
                                       shape).place(e.grad, offset=offset)
             else:
                 ti.root.dense(impl.index_nd(dim),
-                              shape).place(*tuple(self.entries.values()), offset=offset)
+                              shape).place(*tuple(self.entries.values()),
+                                           offset=offset)
                 if needs_grad:
                     ti.root.dense(impl.index_nd(dim),
                                   shape).place(*tuple(self.entries.values()),
                                                offset=offset)
-        return self
\ No newline at end of file
+        return self
diff --git a/tests/python/test_numpy_io.py b/tests/python/test_numpy_io.py
index 2fdf9da30f8ba..10c027bff4e25 100644
--- a/tests/python/test_numpy_io.py
+++ b/tests/python/test_numpy_io.py
@@ -45,6 +45,7 @@ def test_from_numpy_2d():
         for j in range(m):
             assert val[i, j] == i + j * 3
 
+
 @ti.all_archs
 def test_to_numpy_struct():
     n = 16
@@ -60,6 +61,7 @@ def test_to_numpy_struct():
         assert arr_dict["a"][i] == i
         assert arr_dict["b"][i] == i * 2
 
+
 @ti.all_archs
 def test_from_numpy_struct():
     n = 16
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index ef900c4991c92..ec7d15dc7b88c 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -86,7 +86,7 @@ def test_custom_struct():
     # May also want to suuport vector struct using compound types such as
     # st = ti.types.struct(a=ti.types.vector(3, ti.f32), b=ti.f32)
     # f = ti.field(dtype=st, shape=(n, ))
-    
+
     x = ti.Struct.field({"a": ti.f32, "b": ti.f32}, shape=(n, ))
     y = ti.Struct.field({"a": ti.f32, "b": ti.f32})
 
@@ -97,17 +97,20 @@ def init():
         for i in x:
             x[i].a = i
             y[i].a = i
+
     @ti.kernel
     def run_taichi_scope():
         for i in x:
             x[i].b = x[i].a
+
     def run_python_scope():
         for i in range(n):
             y[i].b = y[i].a * 2 + 1
+
     init()
     run_taichi_scope()
     for i in range(n):
         assert x[i].b == i
     run_python_scope()
     for i in range(n):
-        assert y[i].b == i * 2 + 1    
+        assert y[i].b == i * 2 + 1
diff --git a/tests/python/test_torch_io.py b/tests/python/test_torch_io.py
index 503cc43c1745b..ab007df85e09f 100644
--- a/tests/python/test_torch_io.py
+++ b/tests/python/test_torch_io.py
@@ -167,6 +167,7 @@ def test_io_zeros():
     zeros = mat.to_torch()
     assert zeros[1, 2] == 4
 
+
 @ti.torch_test
 def test_io_struct():
     n = 16

From 2ea8f8350028c67191fc9c18923aa70c342ce975 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Tue, 24 Aug 2021 18:45:43 -0500
Subject: [PATCH 09/29] add StructField and CompoundType

---
 python/taichi/lang/matrix.py |  62 ++++-
 python/taichi/lang/struct.py | 451 ++++++++++++++++++++++-------------
 python/taichi/lang/types.py  |  34 +++
 3 files changed, 376 insertions(+), 171 deletions(-)
 create mode 100644 python/taichi/lang/types.py

diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index c7e64de7a8c35..1e066d193c02c 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -10,6 +10,7 @@
 from taichi.lang.exception import TaichiSyntaxError
 from taichi.lang.ext_array import AnyArrayAccess
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
+from taichi.lang.types import CompoundType
 from taichi.lang.util import (in_python_scope, is_taichi_class, python_scope,
                               taichi_scope, to_numpy_type, to_pytorch_type)
 from taichi.misc.util import deprecated, warning
@@ -1221,7 +1222,7 @@ class MatrixField(Field):
     """Taichi matrix field with SNode implementation.
 
     Args:
-        vars (Expr): Field members.
+        vars (List[Expr]): Field members.
         n (Int): Number of rows.
         m (Int): Number of columns.
     """
@@ -1279,7 +1280,7 @@ def fill(self, val):
 
     @python_scope
     def to_numpy(self, keep_dims=False, as_vector=None, dtype=None):
-        """Converts `self` to a numpy array.
+        """Converts the field instance to a NumPy array.
 
         Args:
             keep_dims (bool, optional): Whether to keep the dimension after conversion.
@@ -1292,7 +1293,7 @@ def to_numpy(self, keep_dims=False, as_vector=None, dtype=None):
             dtype (DataType, optional): The desired data type of returned numpy array.
 
         Returns:
-            numpy.ndarray: The result numpy array.
+            numpy.ndarray: The result NumPy array.
         """
         if as_vector is not None:
             warning(
@@ -1312,7 +1313,7 @@ def to_numpy(self, keep_dims=False, as_vector=None, dtype=None):
         return arr
 
     def to_torch(self, device=None, keep_dims=False):
-        """Converts `self` to a torch tensor.
+        """Converts the field instance to a PyTorch tensor.
 
         Args:
             device (torch.device, optional): The desired device of returned tensor.
@@ -1369,3 +1370,56 @@ def __getitem__(self, key):
     def __repr__(self):
         # make interactive shell happy, prevent materialization
         return f'<{self.n}x{self.m} ti.Matrix.field>'
+
+
+class MatrixType(CompoundType):
+
+    def __init__(self, m, n, dtype):
+        self.m = m
+        self.n = n
+        self.dtype = dtype
+
+
+    def __call__(self, *args):
+        if len(args) == 0:
+            raise TaichiSyntaxError("Custom type instances need to be created with an initial value.")
+        elif len(args) == 1:
+            # fill a single scalar
+            if isinstance(args[0], numbers.Number):
+                return self.scalar_filled(args[0])
+            # fill a single vector or matrix
+            entries = args[0]
+        else:
+            # fill in a concatenation of scalars/vectors/matrices
+            entries = []
+            for x in args:
+                if isinstance(x, numbers.Number):
+                    entries.append(x)
+                elif isinstance(x, (list, tuple)):
+                    entries += x
+                elif isinstance(x, Matrix):
+                    entries += x.entries
+        # convert vector to nx1 matrix
+        if isinstance(entries[0], numbers.Number):
+            entries = [[e] for e in entries]
+        # type cast
+        mat = self.cast(Matrix(entries))
+        return mat
+
+    def cast(self, mat, in_place=False):
+        if not in_place:
+            mat = mat.copy()
+        # sanity check shape
+        if self.m != mat.m or self.n != mat.n:
+            raise TaichiSyntaxError("Incompatible arguments for the custom vector/matrix type!")
+        mat.entries = [cast(x, self.dtype) for x in mat.entries]
+        return mat
+        
+    def empty(self):
+        """
+        Create an empty instance of the given compound type.
+        """
+        return Matrix.empty(self.m, self.n)
+
+    def field(self, **kwargs):
+        return Matrix.field(self.m, self.n, **kwargs)
\ No newline at end of file
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 2d48ed583bf7c..3949226ab0a2f 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -1,163 +1,88 @@
+import copy
 import numbers
 
 from taichi.lang import expr, impl
 from taichi.lang.exception import TaichiSyntaxError
+from taichi.lang.field import Field, SNodeHostAccess
+from taichi.lang.matrix import Matrix
+from taichi.lang.types import CompoundType
 from taichi.lang.util import python_scope, taichi_scope
 
+
 import taichi as ti
 
 
 class Struct:
     """The Struct type class.
     Args:
-        entries (Dict[str, Any]): the names and data types for struct members.
+        entries (Dict[str, Union[Dict, Expr, Matrix, Struct]]): keys and values for struct members.
     """
     is_taichi_class = True
 
     def __init__(self, entries):
-        self.entries = entries
+        # converts lists to matrices and dicts to structs
+        self.entries = {}
+        for k, v in entries.items:
+            if isinstance(v, (list, tuple)):
+                v = Matrix(v)
+            if isinstance(v, dict):
+                v = Struct(v)
+            self.entries[k] = v
+        self.register_members()
+
+    @property
+    def keys(self):
+        return list(self.entries.keys())
 
+    @property
     def members(self):
-        return self.entries.keys()
+        return list(self.entries.values())
 
-    def empty_copy(self):
-        return Struct.empty(self.members())
-
-    def is_global(self):
-        results = [False for _ in self.entries.values()]
-        for i, e in enumerate(self.entries.values()):
-            if isinstance(e, expr.Expr):
-                if e.is_global():
-                    results[i] = True
-            assert results[i] == results[0], \
-                "Structs with mixed global/local entries are not allowed"
-        return results[0]
+    def items(self):
+        return self.entries.items()
 
-    def get_field_members(self):
-        """Get struct elements list.
+    def empty_copy(self):
+        return Struct.empty(self.members)
 
-        Returns:
-            A list of struct elements.
-        """
-        return list(self.entries.values())
+    def copy(self):
+        ret = self.empty_copy()
+        ret.entries = copy.copy(self.entries)
+        return ret
 
-    def register_member(self, member_name, dtype):
-        self.entries[member_name] = impl.field(dtype,
-                                               name=self.name + '.' +
-                                               member_name)
-        setattr(
-            Struct, member_name,
-            property(
-                Struct.make_getter(member_name),
-                Struct.make_setter(member_name),
-            ))
-
-    def __call__(self, name):
-        _taichi_skip_traceback = 1
-        return self.entries[name]
+    def register_members(self):
+        for k in self.keys:
+            setattr(
+                Struct, k,
+                property(
+                    Struct.make_getter(k),
+                    Struct.make_setter(k),
+                ))
 
-    @taichi_scope
-    def subscript(self, *indices):
+    def __call__(self, key, **kwargs):
         _taichi_skip_traceback = 1
-        if self.is_global():
-            ret = self.empty_copy()
-            for k, e in self.entries.items():
-                ret.entries[k] = impl.subscript(e, *indices)
-            return ret
-        else:
-            raise TaichiSyntaxError(
-                "Custom struct members cannot be locally subscripted")
-
-    def make_grad(self):
-        ret = self.empty_copy()
-        for k in ret.members():
-            ret.entries[k] = self.entries[k].grad
+        assert kwargs == {}
+        ret = self.entries[key]
+        if isinstance(ret, SNodeHostAccess):
+            ret = ret.accessor.getter(*ret.key)
         return ret
 
+
     @staticmethod
-    def make_getter(member_name):
+    def make_getter(key):
         def getter(self):
             """Get an entry from custom struct by name."""
             _taichi_skip_traceback = 1
-            return self.entries[member_name]
-
+            return self.entries[key]
         return getter
 
     @staticmethod
-    def make_setter(member_name):
+    def make_setter(key):
         @python_scope
         def setter(self, value):
             _taichi_skip_traceback = 1
-            self.entries[member_name] = value
-
+            self.entries[key] = value
         return setter
 
-    class Proxy:
-        def __init__(self, struct, index):
-            """Proxy when a tensor of Structs is accessed by host."""
-            self.struct = struct
-            self.index = index
-            for member_name in self.struct.members():
-                setattr(
-                    Struct.Proxy, member_name,
-                    property(
-                        Struct.Proxy.make_getter(member_name),
-                        Struct.Proxy.make_setter(member_name),
-                    ))
-
-        @python_scope
-        def _get(self, name):
-            return self.struct(name)[self.index]
-
-        @python_scope
-        def _set(self, name, value):
-            self.struct(name)[self.index] = value
-
-        @staticmethod
-        def make_getter(member_name):
-            @python_scope
-            def getter(self):
-                return self.struct(member_name)[self.index]
-
-            return getter
-
-        @staticmethod
-        def make_setter(member_name):
-            @python_scope
-            def setter(self, value):
-                self.struct(member_name)[self.index] = value
-
-            return setter
-
-        @property
-        def value(self):
-            ret = self.struct.empty_copy()
-            for k in self.struct.members():
-                ret.entries[k] = self.struct(k)[self.index]
-            return ret
-
-    # host access & python scope operation
-    @python_scope
-    def __getitem__(self, indices):
-        """Access to the element at the given indices in a struct array.
-
-        Args:
-            indices (Sequence[Expr]): the indices of the element.
-
-        Returns:
-            The value of the element at a specific position of a struct array.
-
-        """
-        if self.is_global():
-            return Struct.Proxy(self, indices)
-        else:
-            raise TaichiSyntaxError(
-                "Custom struct members cannot be locally subscripted")
-
-    @python_scope
-    def __setitem__(self, indices, item):
-        raise NotImplementedError(
-            "Cannot assign the whole struct in Python scope")
 
     def __len__(self):
         """Get the number of entries in a custom struct"""
@@ -166,51 +91,29 @@ def __len__(self):
     def __iter__(self):
         return self.entries.values()
 
-    def loop_range(self):
-        return list(self.entries.values())[0]
-
-    @property
-    def shape(self):
-        return self.loop_range().shape
-
-    @property
-    def snode(self):
-        return self.loop_range().snode
-
     def __str__(self):
         """Python scope struct array print support."""
         if impl.inside_kernel():
-            return f'<ti.Struct {", ".join([str(k) + "=" + str(v.dtype) for k, v in self.entries])}>'
+            return f'<ti.Struct {", ".join([str(k) + "=" + str(v) for k, v in self.entries])}>'
         else:
             return str(self.to_numpy())
 
     def __repr__(self):
-        if self.is_global():
-            # make interactive shell happy, prevent materialization
-            return f'<ti.Struct {", ".join([str(k) + "=" + str(v.dtype) for k, v in self.entries])}>'
-        else:
-            return str(self.to_numpy())
+        return str(self.to_numpy())
 
     @python_scope
-    def from_numpy(self, array_dict):
-        for k in self.members():
-            self(k).from_numpy(array_dict[k])
+    def to_dict(self):
+        """Converts the Struct to a dictionary.
 
-    @python_scope
-    def from_torch(self, array_dict):
-        for k in self.members():
-            self(k).from_torch(array_dict[k])
-
-    @python_scope
-    def to_numpy(self):
-        return {k: v.to_numpy() for k, v in self.entries.items()}
+        Args:
 
-    @python_scope
-    def to_torch(self):
-        return {k: v.to_torch() for k, v in self.entries.items()}
+        Returns:
+            Dict: The result dictionary.
+        """
+        return self.entries
 
     @classmethod
-    def empty(cls, members):
+    def empty(cls, entries):
         """Clear the struct and fill None.
 
         Args:
@@ -219,7 +122,7 @@ def empty(cls, members):
             :class:`~taichi.lang.struct.Struct`: A :class:`~taichi.lang.struct.Struct` instance filled with None.
 
         """
-        return cls({k: None for k in members})
+        return cls({k: None for k in entries})
 
     @classmethod
     @python_scope
@@ -229,19 +132,21 @@ def field(cls,
               struct_name="<Struct>",
               offset=None,
               needs_grad=False,
-              layout=None):
-
-        self = cls.empty(members.keys())
-        self.name = struct_name
+              layout=None):       
 
         if layout is not None:
             assert shape is not None, 'layout is useless without shape'
         if shape is None:
             assert offset is None, "shape cannot be None when offset is being set"
 
-        for member_name, dtype in members.items():
-            self.register_member(member_name, dtype)
-        self.grad = self.make_grad()
+        field_dict = {}
+
+        for key, dtype in members.items():
+            name = struct_name + '.' + key
+            if isinstance(dtype, CompoundType):
+                field_dict[key] = dtype.field(shape=None, name=name, offset=offset, needs_grad=needs_grad)
+            else:
+                field_dict[key] = impl.field(dtype, shape=None, name=name, offset=offset, needs_grad=needs_grad)
 
         if shape is not None:
             if isinstance(shape, numbers.Number):
@@ -259,19 +164,231 @@ def field(cls,
 
             dim = len(shape)
             if layout.soa:
-                for e in self.entries.values():
+                for e in field_dict.values():
                     ti.root.dense(impl.index_nd(dim),
                                   shape).place(e, offset=offset)
                 if needs_grad:
-                    for e in self.entries.values():
+                    for e in field_dict.values():
                         ti.root.dense(impl.index_nd(dim),
                                       shape).place(e.grad, offset=offset)
             else:
                 ti.root.dense(impl.index_nd(dim),
-                              shape).place(*tuple(self.entries.values()),
+                              shape).place(*tuple(field_dict.values()),
                                            offset=offset)
                 if needs_grad:
+                    grads = tuple(e.grad for e in field_dict.values())
                     ti.root.dense(impl.index_nd(dim),
-                                  shape).place(*tuple(self.entries.values()),
+                                  shape).place(*grads,
                                                offset=offset)
-        return self
+        return StructField(field_dict, name=struct_name)
+
+
+class StructField(Field):
+    """Taichi struct field with SNode implementation.
+       Instead of directly contraining Expr entries, the StructField object
+       directly hosts members as `Field` instances to support nested structs. 
+
+    Args:
+        field_dict (Dict[str, Field]): Struct field members.       
+        name (string, optional): The custom name of the field.
+    """
+    def __init__(self, field_dict, name=None):
+        # will not call Field initializer
+        self.field_dict = field_dict
+        self.name = name
+        self.register_fields()
+
+    @property
+    def keys(self):
+        return list(self.field_dict.keys())
+
+    @property
+    def members(self):
+        return list(self.field_dict.values())
+
+    @staticmethod
+    def make_getter(key):
+        def getter(self):
+            """Get an entry from custom struct by name."""
+            _taichi_skip_traceback = 1
+            return self.field_dict[key]
+        return getter
+
+    @staticmethod
+    def make_setter(key):
+        @python_scope
+        def setter(self, value):
+            _taichi_skip_traceback = 1
+            self.field_dict[key] = value
+        return setter
+
+    def register_fields(self):
+        for k in self.keys:
+            setattr(
+                StructField, k,
+                property(
+                    StructField.make_getter(k),
+                    StructField.make_setter(k),
+                ))
+
+    def get_field_members(self):
+        """Get A flattened list of all struct elements.
+
+        Returns:
+            A list of struct elements.
+        """
+        field_members = []
+        for m in self.members:
+            assert isinstance(m, Field)
+            field_members += m.get_field_members()
+        return field_members
+
+    @property
+    def snode(self):
+        """Gets representative SNode for info purposes.
+
+        Returns:
+            SNode: Representative SNode (SNode of first field member).
+        """
+        return self.members[0].snode
+
+    def loop_range(self):
+        """Gets representative field member for loop range info.
+
+        Returns:
+            taichi_core.Expr: Representative (first) field member.
+        """
+        return self.members[0].loop_range()
+
+    @python_scope
+    def copy_from(self, other):
+        """Copies all elements from another field.
+
+        The shape of the other field needs to be the same as `self`.
+
+        Args:
+            other (Field): The source field.
+        """
+        assert isinstance(other, Field)
+        assert set(self.keys) == set(other.keys)
+        for k in self.keys:
+            self[k].copy_from(other[k])
+
+    @python_scope
+    def fill(self, val):
+        """Fills `self` with a specific value.
+
+        Args:
+            val (Union[int, float]): Value to fill.
+        """
+        for v in self.members:
+            v.fill(val)
+
+    def initialize_host_accessors(self):
+        for v in self.members:
+            v.initialize_host_accessors()
+
+    def get_member_field(self, key):
+        """Creates a ScalarField using a specific field member. Only used for quant.
+
+        Args:
+            key (str): Specified key of the field member.
+
+        Returns:
+            ScalarField: The result ScalarField.
+        """
+        return self.field_dict[key]
+
+    @python_scope
+    def from_numpy(self, array_dict):
+        for k in self.keys:
+            self(k).from_numpy(array_dict[k])
+
+    @python_scope
+    def from_torch(self, array_dict):
+        for k in self.keys:
+            self(k).from_torch(array_dict[k])
+
+    @python_scope
+    def to_numpy(self):
+        """Converts the Struct field instance to a dictionary of NumPy arrays. The dictionary may be nested when converting
+           nested structs.
+
+        Args:
+        Returns:
+            Dict[str, Union[numpy.ndarray, Dict]]: The result NumPy array.
+        """
+        return {k: v.to_numpy() for k, v in self.field_dict.items()}
+
+    @python_scope
+    def to_torch(self, device):
+        """Converts the Struct field instance to a dictionary of PyTorch tensors. The dictionary may be nested when converting
+           nested structs.
+
+        Args:
+            device (torch.device, optional): The desired device of returned tensor.
+        Returns:
+            Dict[str, Union[torch.Tensor, Dict]]: The result PyTorch tensor.
+        """
+        return {k: v.to_torch(device=device) for k, v in self.field_dict.items()}
+
+    
+    @python_scope
+    def __setitem__(self, indices, element):
+        self.initialize_host_accessors()
+        for k, v in element.values():
+            self[indices][k] = v
+    
+    @python_scope
+    def __getitem__(self, indices):
+        self.initialize_host_accessors()
+        indices = self.pad_key(indices)
+        entries = {
+            k: v[indices] for k, v in self.field_dict.items()
+        }
+        return Struct(entries)
+    
+
+class StructType(CompoundType):
+    
+    def __init__(self, **kwargs):
+        self.members = kwargs
+
+    def __call__(self, *args, **kwargs):
+        if len(args) == 0:
+            if kwargs == {}:
+                raise TaichiSyntaxError("Custom type instances need to be created with an initial value.")
+            else:
+                # initialize struct members by keywords
+                entries = kwargs
+        elif len(args) == 1:
+            # fill a single scalar
+            if isinstance(args[0], numbers.Number):
+                return self.scalar_filled(args[0])
+            # fill a single vector or matrix
+            # initialize struct members by dictionary
+            entries = args[0]
+        struct = self.cast(Struct(entries))
+        return struct
+
+    def cast(self, struct, in_place=False):
+        if not in_place:
+            struct = struct.copy()
+        # sanity check members
+        if self.members.keys() != struct.entries.keys():
+            raise TaichiSyntaxError("Incompatible arguments for custom struct members!")
+        for k, dtype in self.members.items():
+            if isinstance(dtype, CompoundType):
+                struct.entries[k] = dtype.cast(struct.entries[k])
+            else:
+                struct.entries[k] = cast(struct.entries[k], dtype)
+        return struct
+        
+    def empty(self):
+        """
+        Create an empty instance of the given compound type.
+        """
+        return Struct.empty(self.members)
+    
+    def field(self, **kwargs):
+        return Struct.field(self.m, self.n, **kwargs)
diff --git a/python/taichi/lang/types.py b/python/taichi/lang/types.py
new file mode 100644
index 0000000000000..a98fb596756f1
--- /dev/null
+++ b/python/taichi/lang/types.py
@@ -0,0 +1,34 @@
+import numbers
+
+from taichi.lang.exception import TaichiSyntaxError
+from taichi.lang.ops import cast
+
+
+class CompoundType:
+    
+
+    def empty(self):
+        """
+        Create an empty instance of the given compound type.
+        """
+        raise NotImplementedError
+
+    def scalar_filled(self, value):
+        instance = self.empty()
+        instance.fill(value)
+        return value
+
+    def field(self, **kwargs):
+        raise NotImplementedError
+
+def matrix(m, n, dtype=None):
+    from taichi.lang.matrix import MatrixType
+    return MatrixType(m, n, dtype=dtype)
+
+def vector(m, dtype=None):
+    from taichi.lang.matrix import MatrixType
+    return MatrixType(m, 1, dtype=dtype)
+
+def struct(**kwargs):
+    from taichi.lang.struct import StructType
+    return StructType(**kwargs)

From ad87a0ff3623abf3ec9a8564d84a781c46396857 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Tue, 24 Aug 2021 22:26:27 -0500
Subject: [PATCH 10/29] [skip ci] add custom types; struct assignment and
 elementwise ops

---
 python/taichi/lang/__init__.py |   2 +-
 python/taichi/lang/impl.py     |   9 +-
 python/taichi/lang/matrix.py   |  11 +--
 python/taichi/lang/struct.py   | 152 ++++++++++++++++++++++++++++-----
 python/taichi/lang/types.py    |   4 +-
 5 files changed, 145 insertions(+), 33 deletions(-)

diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 711c6b78d95ce..873a73ee295c5 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -4,7 +4,7 @@
 
 from taichi.core.util import locale_encode
 from taichi.core.util import ti_core as _ti_core
-from taichi.lang import impl
+from taichi.lang import impl, types
 from taichi.lang.exception import InvalidOperationError
 from taichi.lang.impl import *
 from taichi.lang.kernel_arguments import any_arr, ext_arr, template
diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index a9085fb2b9ba9..ccd0610eda9a7 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -1,6 +1,6 @@
 import numbers
-import types
 import warnings
+from types import FunctionType, MethodType
 
 import numpy as np
 from taichi.core.util import ti_core as _ti_core
@@ -11,6 +11,7 @@
 from taichi.lang.matrix import MatrixField
 from taichi.lang.ndarray import ScalarNdarray
 from taichi.lang.snode import SNode
+from taichi.lang.struct import StructField
 from taichi.lang.tape import TapeImpl
 from taichi.lang.util import (cook_dtype, has_pytorch, is_taichi_class,
                               python_scope, taichi_scope, to_pytorch_type)
@@ -156,6 +157,10 @@ def subscript(value, *indices):
                 Expr(_ti_core.subscript(e.ptr, indices_expr_group))
                 for e in value.get_field_members()
             ])
+        elif isinstance(value, StructField):
+            return ti.Struct({
+                k: subscript(v, *indices) for k, v in value.items
+            })
         else:
             return Expr(_ti_core.subscript(var, indices_expr_group))
     elif isinstance(value, AnyArray):
@@ -833,7 +838,7 @@ def static(x, *xs):
         return x
     elif isinstance(x, Field):
         return x
-    elif isinstance(x, (types.FunctionType, types.MethodType)):
+    elif isinstance(x, (FunctionType, MethodType)):
         return x
     else:
         raise ValueError(
diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index 1e066d193c02c..ae79b6d43ac34 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -10,6 +10,7 @@
 from taichi.lang.exception import TaichiSyntaxError
 from taichi.lang.ext_array import AnyArrayAccess
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
+from taichi.lang.ops import cast
 from taichi.lang.types import CompoundType
 from taichi.lang.util import (in_python_scope, is_taichi_class, python_scope,
                               taichi_scope, to_numpy_type, to_pytorch_type)
@@ -1374,9 +1375,9 @@ def __repr__(self):
 
 class MatrixType(CompoundType):
 
-    def __init__(self, m, n, dtype):
-        self.m = m
+    def __init__(self, n, m, dtype):
         self.n = n
+        self.m = m
         self.dtype = dtype
 
 
@@ -1411,7 +1412,7 @@ def cast(self, mat, in_place=False):
             mat = mat.copy()
         # sanity check shape
         if self.m != mat.m or self.n != mat.n:
-            raise TaichiSyntaxError("Incompatible arguments for the custom vector/matrix type!")
+            raise TaichiSyntaxError(f"Incompatible arguments for the custom vector/matrix type: ({self.n}, {self.m}), ({mat.n}, {mat.m})")
         mat.entries = [cast(x, self.dtype) for x in mat.entries]
         return mat
         
@@ -1419,7 +1420,7 @@ def empty(self):
         """
         Create an empty instance of the given compound type.
         """
-        return Matrix.empty(self.m, self.n)
+        return Matrix.empty(self.n, self.m)
 
     def field(self, **kwargs):
-        return Matrix.field(self.m, self.n, **kwargs)
\ No newline at end of file
+        return Matrix.field(self.n, self.m, dtype=self.dtype, **kwargs)
\ No newline at end of file
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 3949226ab0a2f..f640e69d110a8 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -1,28 +1,39 @@
 import copy
 import numbers
 
-from taichi.lang import expr, impl
+
+from taichi.lang import impl
+from taichi.lang.common_ops import TaichiOperations
+from taichi.lang.expr import Expr
 from taichi.lang.exception import TaichiSyntaxError
 from taichi.lang.field import Field, SNodeHostAccess
 from taichi.lang.matrix import Matrix
+from taichi.lang.ops import cast
 from taichi.lang.types import CompoundType
-from taichi.lang.util import python_scope, taichi_scope
+from taichi.lang.util import is_taichi_class, python_scope, taichi_scope
 
 
 import taichi as ti
 
 
-class Struct:
+class Struct(TaichiOperations):
     """The Struct type class.
     Args:
         entries (Dict[str, Union[Dict, Expr, Matrix, Struct]]): keys and values for struct members.
     """
     is_taichi_class = True
 
-    def __init__(self, entries):
+    def __init__(self, *args, **kwargs):
         # converts lists to matrices and dicts to structs
-        self.entries = {}
-        for k, v in entries.items:
+        if len(args) == 1 and kwargs == {} and isinstance(args[0], dict):
+            self.entries = args[0]
+        elif len(args) == 0:
+            self.entries = kwargs
+        else:
+            raise TaichiSyntaxError(
+                "Custom structs need to be initialized using either dictionary or keyword arguments"
+            )
+        for k, v in self.entries.items():
             if isinstance(v, (list, tuple)):
                 v = Matrix(v)
             if isinstance(v, dict):
@@ -38,17 +49,10 @@ def keys(self):
     def members(self):
         return list(self.entries.values())
 
+    @property
     def items(self):
         return self.entries.items()
 
-    def empty_copy(self):
-        return Struct.empty(self.members)
-
-    def copy(self):
-        ret = self.empty_copy()
-        ret.entries = copy.copy(self.entries)
-        return ret
-
     def register_members(self):
         for k in self.keys:
             setattr(
@@ -83,6 +87,102 @@ def setter(self, value):
             self.entries[key] = value
         return setter
 
+    def element_wise_unary(self, foo):
+        _taichi_skip_traceback = 1
+        ret = self.empty_copy()
+        for k, v in self.items:
+            if isinstance(v, Expr):
+                ret.entries[k] = foo(v)
+            else:
+                ret.entries[k] = v.element_wise_unary(foo)
+        return ret
+
+    def element_wise_binary(self, foo, other):
+        _taichi_skip_traceback = 1
+        ret = self.empty_copy()
+        if isinstance(other, (dict)):
+            other = Struct(other)
+        if isinstance(other, Struct):
+            assert self.entries.keys() == other.entries.keys(), f"Member mismatch between structs {self.keys}, {other.keys}"
+            for k, v in self.items:
+                if isinstance(v, Expr):
+                    ret.entries[k] = foo(v, other.entries[k])
+                else:
+                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+        else:  # assumed to be scalar
+            for k, v in self.items:
+                if isinstance(v, Expr):
+                    ret.entries[k] = foo(v, other.entries[k])
+                else:
+                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+        return ret
+
+    def broadcast_copy(self, other):
+        if isinstance(other, dict):
+            other = Struct(other)
+        if not isinstance(other, Struct):
+            ret = self.empty_copy()
+            ret.entries = {k: other for k in ret.keys}
+            other = ret
+        assert self.entries.keys() == other.entries.keys(), f"Member mismatch between structs {self.keys}, {other.keys}"
+        return other
+
+    def element_wise_writeback_binary(self, foo, other):
+        ret = self.empty_copy()
+        if isinstance(other, (dict)):
+            other = Struct(other)
+        if is_taichi_class(other):
+            other = other.variable()
+        if foo.__name__ == 'assign' and not isinstance(other, Struct):
+            raise TaichiSyntaxError(
+                'cannot assign scalar expr to '
+                f'taichi class {type(self)}, maybe you want to use `a.fill(b)` instead?'
+            )
+        if isinstance(other, Struct):
+            assert self.entries.keys() == other.entries.keys(), f"Member mismatch between structs {self.keys}, {other.keys}"
+            for k, v in self.items:
+                if isinstance(v, Expr):
+                    ret.entries[k] = foo(v, other.entries[k])
+                else:
+                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+        else:  # assumed to be scalar
+            for k, v in self.items:
+                if isinstance(v, Expr):
+                    ret.entries[k] = foo(v, other.entries[k])
+                else:
+                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+        return ret
+
+    def element_wise_ternary(self, foo, other, extra):
+        ret = self.empty_copy()
+        other = self.broadcast_copy(other)
+        extra = self.broadcast_copy(extra)
+        for k, v in self.items:
+            if isinstance(v, Expr):
+                ret.entries[k] = foo(v, other.entries[k],
+                                    extra.entries[k])
+            else:
+                ret.entries[k] = v.element_wise_ternary(
+                    foo, other.entries[k], extra.entries[k]
+                )
+        return ret
+
+    def empty_copy(self):
+        return Struct.empty(self.keys)
+
+    def copy(self):
+        ret = self.empty_copy()
+        ret.entries = copy.copy(self.entries)
+        return ret
+
+    @taichi_scope
+    def variable(self):
+        ret = self.copy()
+        ret.entries = {
+            k : impl.expr_init(v) if isinstance(v, Expr) else v.variable()
+            for k, v in ret.items
+        }
+        return ret
 
     def __len__(self):
         """Get the number of entries in a custom struct"""
@@ -91,6 +191,7 @@ def __len__(self):
     def __iter__(self):
         return self.entries.values()
 
+
     def __str__(self):
         """Python scope struct array print support."""
         if impl.inside_kernel():
@@ -195,8 +296,12 @@ class StructField(Field):
     def __init__(self, field_dict, name=None):
         # will not call Field initializer
         self.field_dict = field_dict
-        self.name = name
+        self._name = name
         self.register_fields()
+        
+    @property
+    def name(self):
+        return self._name
 
     @property
     def keys(self):
@@ -206,6 +311,10 @@ def keys(self):
     def members(self):
         return list(self.field_dict.values())
 
+    @property
+    def items(self):
+        return self.field_dict.items()
+
     @staticmethod
     def make_getter(key):
         def getter(self):
@@ -318,10 +427,10 @@ def to_numpy(self):
         Returns:
             Dict[str, Union[numpy.ndarray, Dict]]: The result NumPy array.
         """
-        return {k: v.to_numpy() for k, v in self.field_dict.items()}
+        return {k: v.to_numpy() for k, v in self.items}
 
     @python_scope
-    def to_torch(self, device):
+    def to_torch(self, device=None):
         """Converts the Struct field instance to a dictionary of PyTorch tensors. The dictionary may be nested when converting
            nested structs.
 
@@ -330,7 +439,7 @@ def to_torch(self, device):
         Returns:
             Dict[str, Union[torch.Tensor, Dict]]: The result PyTorch tensor.
         """
-        return {k: v.to_torch(device=device) for k, v in self.field_dict.items()}
+        return {k: v.to_torch(device=device) for k, v in self.items}
 
     
     @python_scope
@@ -342,9 +451,8 @@ def __setitem__(self, indices, element):
     @python_scope
     def __getitem__(self, indices):
         self.initialize_host_accessors()
-        indices = self.pad_key(indices)
         entries = {
-            k: v[indices] for k, v in self.field_dict.items()
+            k: v[indices] for k, v in self.items
         }
         return Struct(entries)
     
@@ -388,7 +496,7 @@ def empty(self):
         """
         Create an empty instance of the given compound type.
         """
-        return Struct.empty(self.members)
+        return Struct.empty(self.members.keys())
     
     def field(self, **kwargs):
-        return Struct.field(self.m, self.n, **kwargs)
+        return Struct.field(self.members, **kwargs)
diff --git a/python/taichi/lang/types.py b/python/taichi/lang/types.py
index a98fb596756f1..0105bf3d93308 100644
--- a/python/taichi/lang/types.py
+++ b/python/taichi/lang/types.py
@@ -1,7 +1,6 @@
 import numbers
 
 from taichi.lang.exception import TaichiSyntaxError
-from taichi.lang.ops import cast
 
 
 class CompoundType:
@@ -15,8 +14,7 @@ def empty(self):
 
     def scalar_filled(self, value):
         instance = self.empty()
-        instance.fill(value)
-        return value
+        return instance.broadcast_copy(value)
 
     def field(self, **kwargs):
         raise NotImplementedError

From b281472fe911482d1162ed27f9a7ae3c6be4c638 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 01:45:23 -0500
Subject: [PATCH 11/29] expose integer and float primitive types

---
 python/taichi/core/primitive_types.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/taichi/core/primitive_types.py b/python/taichi/core/primitive_types.py
index ea5f5d16f1284..21d5b22bb9508 100644
--- a/python/taichi/core/primitive_types.py
+++ b/python/taichi/core/primitive_types.py
@@ -57,4 +57,6 @@
     'u32',
     'uint64',
     'u64',
+    'real_types',
+    'integer_types',
 ]

From 177aa937b10521e07783ab9ae9be79f763a6f685 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 01:45:48 -0500
Subject: [PATCH 12/29] fix character for formatter

---
 python/taichi/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/taichi/main.py b/python/taichi/main.py
index 08d85a27b9b5d..7776b46ff2af4 100644
--- a/python/taichi/main.py
+++ b/python/taichi/main.py
@@ -11,12 +11,12 @@
 from pathlib import Path
 
 from colorama import Back, Fore, Style
+
+import taichi as ti
 from taichi.core import settings
 from taichi.core import ti_core as _ti_core
 from taichi.tools import video
 
-import taichi as ti
-
 
 def timer(func):
     """Function decorator to benchmark a function runnign time."""
@@ -440,7 +440,7 @@ def video(self, arguments: list = sys.argv[2:]):
             args.inputs = sorted(
                 str(p.resolve()) for p in Path('.').glob('*.png'))
 
-        assert 1 <= args.crf <= 51, "The range of the CRF scale is 1–51, where 1 is almost lossless, 20 is the default, and 51 is worst quality possible."
+        assert 1 <= args.crf <= 51, "The range of the CRF scale is 1-51, where 1 is almost lossless, 20 is the default, and 51 is worst quality possible."
 
         ti.info(f'Making video using {len(args.inputs)} png files...')
         ti.info(f'frame_rate = {args.framerate}')

From c28f43c4a0c18937c5c73c4fc671e1ad11310e6b Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 01:47:29 -0500
Subject: [PATCH 13/29] make custom struct tests pass

---
 python/taichi/lang/field.py  |   6 +-
 python/taichi/lang/impl.py   |  10 +-
 python/taichi/lang/matrix.py |  69 ++++++++------
 python/taichi/lang/struct.py | 176 +++++++++++++++++++++++------------
 python/taichi/lang/types.py  |   5 +-
 5 files changed, 167 insertions(+), 99 deletions(-)

diff --git a/python/taichi/lang/field.py b/python/taichi/lang/field.py
index 40a4a50830832..f0de0defb3df5 100644
--- a/python/taichi/lang/field.py
+++ b/python/taichi/lang/field.py
@@ -1,9 +1,8 @@
+import taichi as ti
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl
 from taichi.lang.util import python_scope, to_numpy_type, to_pytorch_type
 
-import taichi as ti
-
 
 class Field:
     """Taichi field with SNode implementation.
@@ -206,6 +205,9 @@ def initialize_host_accessors(self):
             SNodeHostAccessor(e.ptr.snode()) for e in self.vars
         ]
 
+    def host_access(self, key):
+        return [SNodeHostAccess(e, key) for e in self.host_accessors]
+
 
 class ScalarField(Field):
     """Taichi scalar field with SNode implementation.
diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index ccd0610eda9a7..6f62f5cd5969c 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -3,6 +3,8 @@
 from types import FunctionType, MethodType
 
 import numpy as np
+
+import taichi as ti
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang.exception import InvalidOperationError, TaichiSyntaxError
 from taichi.lang.expr import Expr, make_expr_group
@@ -18,8 +20,6 @@
 from taichi.misc.util import deprecated, get_traceback, warning
 from taichi.snode.fields_builder import FieldsBuilder
 
-import taichi as ti
-
 
 @taichi_scope
 def expr_init_local_tensor(shape, element_type, elements):
@@ -158,9 +158,9 @@ def subscript(value, *indices):
                 for e in value.get_field_members()
             ])
         elif isinstance(value, StructField):
-            return ti.Struct({
-                k: subscript(v, *indices) for k, v in value.items
-            })
+            return ti.Struct(
+                {k: subscript(v, *indices)
+                 for k, v in value.items})
         else:
             return Expr(_ti_core.subscript(var, indices_expr_group))
     elif isinstance(value, AnyArray):
diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index ae79b6d43ac34..f2b90b5b419f6 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -3,21 +3,23 @@
 from collections.abc import Iterable
 
 import numpy as np
+
+import taichi as ti
 from taichi.lang import expr, impl
 from taichi.lang import kernel_impl as kern_mod
 from taichi.lang import ops as ops_mod
 from taichi.lang.common_ops import TaichiOperations
 from taichi.lang.exception import TaichiSyntaxError
+from taichi.lang.expr import Expr
 from taichi.lang.ext_array import AnyArrayAccess
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
 from taichi.lang.ops import cast
 from taichi.lang.types import CompoundType
-from taichi.lang.util import (in_python_scope, is_taichi_class, python_scope,
-                              taichi_scope, to_numpy_type, to_pytorch_type)
+from taichi.lang.util import (cook_dtype, in_python_scope, is_taichi_class,
+                              python_scope, taichi_scope, to_numpy_type,
+                              to_pytorch_type)
 from taichi.misc.util import deprecated, warning
 
-import taichi as ti
-
 
 class Matrix(TaichiOperations):
     """The matrix class.
@@ -468,6 +470,16 @@ def __iter__(self):
         else:
             return ([self(i, j) for j in range(self.m)] for i in range(self.n))
 
+    @python_scope
+    def set_entries(self, value):
+        if not isinstance(value, (list, tuple)):
+            value = list(value)
+        if not isinstance(value[0], (list, tuple)):
+            value = [[i] for i in value]
+        for i in range(self.n):
+            for j in range(self.m):
+                self[i, j] = value[i][j]
+
     def empty_copy(self):
         return Matrix.empty(self.n, self.m)
 
@@ -788,7 +800,7 @@ def to_numpy(self, keep_dims=False):
         """
         as_vector = self.m == 1 and not keep_dims
         shape_ext = (self.n, ) if as_vector else (self.n, self.m)
-        return np.array(self.entries).reshape(shape_ext)
+        return np.array(self.value).reshape(shape_ext)
 
     @taichi_scope
     def __ti_repr__(self):
@@ -1352,21 +1364,13 @@ def from_numpy(self, arr):
     @python_scope
     def __setitem__(self, key, value):
         self.initialize_host_accessors()
-        if not isinstance(value, (list, tuple)):
-            value = list(value)
-        if not isinstance(value[0], (list, tuple)):
-            value = [[i] for i in value]
-        for i in range(self.n):
-            for j in range(self.m):
-                self[key][i, j] = value[i][j]
+        self[key].set_entries(value)
 
     @python_scope
     def __getitem__(self, key):
         self.initialize_host_accessors()
         key = self.pad_key(key)
-        return Matrix.with_entries(
-            self.n, self.m,
-            [SNodeHostAccess(e, key) for e in self.host_accessors])
+        return Matrix.with_entries(self.n, self.m, self.host_access(key))
 
     def __repr__(self):
         # make interactive shell happy, prevent materialization
@@ -1374,19 +1378,19 @@ def __repr__(self):
 
 
 class MatrixType(CompoundType):
-
     def __init__(self, n, m, dtype):
         self.n = n
         self.m = m
-        self.dtype = dtype
-
+        self.dtype = cook_dtype(dtype)
 
     def __call__(self, *args):
         if len(args) == 0:
-            raise TaichiSyntaxError("Custom type instances need to be created with an initial value.")
+            raise TaichiSyntaxError(
+                "Custom type instances need to be created with an initial value."
+            )
         elif len(args) == 1:
             # fill a single scalar
-            if isinstance(args[0], numbers.Number):
+            if isinstance(args[0], (numbers.Number, Expr)):
                 return self.scalar_filled(args[0])
             # fill a single vector or matrix
             entries = args[0]
@@ -1394,17 +1398,17 @@ def __call__(self, *args):
             # fill in a concatenation of scalars/vectors/matrices
             entries = []
             for x in args:
-                if isinstance(x, numbers.Number):
-                    entries.append(x)
-                elif isinstance(x, (list, tuple)):
+                if isinstance(x, (list, tuple)):
                     entries += x
                 elif isinstance(x, Matrix):
                     entries += x.entries
+                else:
+                    entries.append(x)
         # convert vector to nx1 matrix
         if isinstance(entries[0], numbers.Number):
             entries = [[e] for e in entries]
         # type cast
-        mat = self.cast(Matrix(entries))
+        mat = self.cast(Matrix(entries, dt=self.dtype))
         return mat
 
     def cast(self, mat, in_place=False):
@@ -1412,10 +1416,19 @@ def cast(self, mat, in_place=False):
             mat = mat.copy()
         # sanity check shape
         if self.m != mat.m or self.n != mat.n:
-            raise TaichiSyntaxError(f"Incompatible arguments for the custom vector/matrix type: ({self.n}, {self.m}), ({mat.n}, {mat.m})")
-        mat.entries = [cast(x, self.dtype) for x in mat.entries]
+            raise TaichiSyntaxError(
+                f"Incompatible arguments for the custom vector/matrix type: ({self.n}, {self.m}), ({mat.n}, {mat.m})"
+            )
+        if in_python_scope():
+            mat.entries = [
+                int(x) if self.dtype in ti.integer_types else x
+                for x in mat.entries
+            ]
+        else:
+            # only performs casting in Taichi scope
+            mat.entries = [cast(x, self.dtype) for x in mat.entries]
         return mat
-        
+
     def empty(self):
         """
         Create an empty instance of the given compound type.
@@ -1423,4 +1436,4 @@ def empty(self):
         return Matrix.empty(self.n, self.m)
 
     def field(self, **kwargs):
-        return Matrix.field(self.n, self.m, dtype=self.dtype, **kwargs)
\ No newline at end of file
+        return Matrix.field(self.n, self.m, dtype=self.dtype, **kwargs)
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index f640e69d110a8..9c7df528fbc6b 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -1,19 +1,19 @@
 import copy
 import numbers
 
+from numpy.lib.arraysetops import isin
 
+import taichi as ti
 from taichi.lang import impl
 from taichi.lang.common_ops import TaichiOperations
-from taichi.lang.expr import Expr
 from taichi.lang.exception import TaichiSyntaxError
-from taichi.lang.field import Field, SNodeHostAccess
+from taichi.lang.expr import Expr
+from taichi.lang.field import Field, ScalarField, SNodeHostAccess
 from taichi.lang.matrix import Matrix
 from taichi.lang.ops import cast
 from taichi.lang.types import CompoundType
-from taichi.lang.util import is_taichi_class, python_scope, taichi_scope
-
-
-import taichi as ti
+from taichi.lang.util import (cook_dtype, in_python_scope, is_taichi_class,
+                              python_scope, taichi_scope)
 
 
 class Struct(TaichiOperations):
@@ -40,6 +40,8 @@ def __init__(self, *args, **kwargs):
                 v = Struct(v)
             self.entries[k] = v
         self.register_members()
+        self.local_tensor_proxy = None
+        self.any_array_access = None
 
     @property
     def keys(self):
@@ -55,28 +57,42 @@ def items(self):
 
     def register_members(self):
         for k in self.keys:
-            setattr(
-                Struct, k,
-                property(
-                    Struct.make_getter(k),
-                    Struct.make_setter(k),
-                ))
+            setattr(Struct, k,
+                    property(
+                        Struct.make_getter(k),
+                        Struct.make_setter(k),
+                    ))
 
-    def __call__(self, key, **kwargs):
+    def __getitem__(self, key):
         _taichi_skip_traceback = 1
-        assert kwargs == {}
         ret = self.entries[key]
         if isinstance(ret, SNodeHostAccess):
             ret = ret.accessor.getter(*ret.key)
         return ret
 
+    def __setitem__(self, key, value):
+        _taichi_skip_traceback = 1
+        if isinstance(self.entries[key], SNodeHostAccess):
+            self.entries[key].accessor.setter(value, *self.entries[key].key)
+        else:
+            if in_python_scope():
+                self.entries[key].set_entries(value)
+            else:
+                self.entries[key] = value
+
+    def set_entries(self, value):
+        if isinstance(value, dict):
+            value = Struct(value)
+        for k in self.keys:
+            self[k] = value[k]
 
     @staticmethod
     def make_getter(key):
         def getter(self):
             """Get an entry from custom struct by name."""
             _taichi_skip_traceback = 1
-            return self.entries[key]
+            return self[key]
+
         return getter
 
     @staticmethod
@@ -84,7 +100,8 @@ def make_setter(key):
         @python_scope
         def setter(self, value):
             _taichi_skip_traceback = 1
-            self.entries[key] = value
+            self[key] = value
+
         return setter
 
     def element_wise_unary(self, foo):
@@ -103,18 +120,20 @@ def element_wise_binary(self, foo, other):
         if isinstance(other, (dict)):
             other = Struct(other)
         if isinstance(other, Struct):
-            assert self.entries.keys() == other.entries.keys(), f"Member mismatch between structs {self.keys}, {other.keys}"
+            assert self.entries.keys() == other.entries.keys(
+            ), f"Member mismatch between structs {self.keys}, {other.keys}"
             for k, v in self.items:
                 if isinstance(v, Expr):
                     ret.entries[k] = foo(v, other.entries[k])
                 else:
-                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+                    ret.entries[k] = v.element_wise_binary(
+                        foo, other.entries[k])
         else:  # assumed to be scalar
             for k, v in self.items:
                 if isinstance(v, Expr):
-                    ret.entries[k] = foo(v, other.entries[k])
+                    ret.entries[k] = foo(v, other)
                 else:
-                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+                    ret.entries[k] = v.element_wise_binary(foo, other)
         return ret
 
     def broadcast_copy(self, other):
@@ -124,7 +143,8 @@ def broadcast_copy(self, other):
             ret = self.empty_copy()
             ret.entries = {k: other for k in ret.keys}
             other = ret
-        assert self.entries.keys() == other.entries.keys(), f"Member mismatch between structs {self.keys}, {other.keys}"
+        assert self.entries.keys() == other.entries.keys(
+        ), f"Member mismatch between structs {self.keys}, {other.keys}"
         return other
 
     def element_wise_writeback_binary(self, foo, other):
@@ -139,18 +159,20 @@ def element_wise_writeback_binary(self, foo, other):
                 f'taichi class {type(self)}, maybe you want to use `a.fill(b)` instead?'
             )
         if isinstance(other, Struct):
-            assert self.entries.keys() == other.entries.keys(), f"Member mismatch between structs {self.keys}, {other.keys}"
+            assert self.entries.keys() == other.entries.keys(
+            ), f"Member mismatch between structs {self.keys}, {other.keys}"
             for k, v in self.items:
                 if isinstance(v, Expr):
                     ret.entries[k] = foo(v, other.entries[k])
                 else:
-                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+                    ret.entries[k] = v.element_wise_binary(
+                        foo, other.entries[k])
         else:  # assumed to be scalar
             for k, v in self.items:
                 if isinstance(v, Expr):
-                    ret.entries[k] = foo(v, other.entries[k])
+                    ret.entries[k] = foo(v, other)
                 else:
-                    ret.entries[k] = v.element_wise_binary(foo, other.entries[k])
+                    ret.entries[k] = v.element_wise_binary(foo, other)
         return ret
 
     def element_wise_ternary(self, foo, other, extra):
@@ -159,14 +181,24 @@ def element_wise_ternary(self, foo, other, extra):
         extra = self.broadcast_copy(extra)
         for k, v in self.items:
             if isinstance(v, Expr):
-                ret.entries[k] = foo(v, other.entries[k],
-                                    extra.entries[k])
+                ret.entries[k] = foo(v, other.entries[k], extra.entries[k])
             else:
                 ret.entries[k] = v.element_wise_ternary(
-                    foo, other.entries[k], extra.entries[k]
-                )
+                    foo, other.entries[k], extra.entries[k])
         return ret
 
+    @taichi_scope
+    def fill(self, val):
+        """Fills the Struct with a specific value in Taichi scope.
+
+        Args:
+            val (Union[int, float]): Value to fill.
+        """
+        def assign_renamed(x, y):
+            return ti.assign(x, y)
+
+        return self.element_wise_writeback_binary(assign_renamed, val)
+
     def empty_copy(self):
         return Struct.empty(self.keys)
 
@@ -179,7 +211,7 @@ def copy(self):
     def variable(self):
         ret = self.copy()
         ret.entries = {
-            k : impl.expr_init(v) if isinstance(v, Expr) else v.variable()
+            k: impl.expr_init(v) if isinstance(v, Expr) else v.variable()
             for k, v in ret.items
         }
         return ret
@@ -191,16 +223,15 @@ def __len__(self):
     def __iter__(self):
         return self.entries.values()
 
-
     def __str__(self):
         """Python scope struct array print support."""
         if impl.inside_kernel():
             return f'<ti.Struct {", ".join([str(k) + "=" + str(v) for k, v in self.entries])}>'
         else:
-            return str(self.to_numpy())
+            return str(self.to_dict())
 
     def __repr__(self):
-        return str(self.to_numpy())
+        return str(self.to_dict())
 
     @python_scope
     def to_dict(self):
@@ -230,10 +261,10 @@ def empty(cls, entries):
     def field(cls,
               members,
               shape=None,
-              struct_name="<Struct>",
+              name="<Struct>",
               offset=None,
               needs_grad=False,
-              layout=None):       
+              layout=None):
 
         if layout is not None:
             assert shape is not None, 'layout is useless without shape'
@@ -243,11 +274,18 @@ def field(cls,
         field_dict = {}
 
         for key, dtype in members.items():
-            name = struct_name + '.' + key
+            field_name = name + '.' + key
             if isinstance(dtype, CompoundType):
-                field_dict[key] = dtype.field(shape=None, name=name, offset=offset, needs_grad=needs_grad)
+                field_dict[key] = dtype.field(shape=None,
+                                              name=field_name,
+                                              offset=offset,
+                                              needs_grad=needs_grad)
             else:
-                field_dict[key] = impl.field(dtype, shape=None, name=name, offset=offset, needs_grad=needs_grad)
+                field_dict[key] = impl.field(dtype,
+                                             shape=None,
+                                             name=field_name,
+                                             offset=offset,
+                                             needs_grad=needs_grad)
 
         if shape is not None:
             if isinstance(shape, numbers.Number):
@@ -279,18 +317,17 @@ def field(cls,
                 if needs_grad:
                     grads = tuple(e.grad for e in field_dict.values())
                     ti.root.dense(impl.index_nd(dim),
-                                  shape).place(*grads,
-                                               offset=offset)
-        return StructField(field_dict, name=struct_name)
+                                  shape).place(*grads, offset=offset)
+        return StructField(field_dict, name=name)
 
 
 class StructField(Field):
     """Taichi struct field with SNode implementation.
        Instead of directly contraining Expr entries, the StructField object
-       directly hosts members as `Field` instances to support nested structs. 
+       directly hosts members as `Field` instances to support nested structs.
 
     Args:
-        field_dict (Dict[str, Field]): Struct field members.       
+        field_dict (Dict[str, Field]): Struct field members.
         name (string, optional): The custom name of the field.
     """
     def __init__(self, field_dict, name=None):
@@ -298,7 +335,7 @@ def __init__(self, field_dict, name=None):
         self.field_dict = field_dict
         self._name = name
         self.register_fields()
-        
+
     @property
     def name(self):
         return self._name
@@ -321,6 +358,7 @@ def getter(self):
             """Get an entry from custom struct by name."""
             _taichi_skip_traceback = 1
             return self.field_dict[key]
+
         return getter
 
     @staticmethod
@@ -329,6 +367,7 @@ def make_setter(key):
         def setter(self, value):
             _taichi_skip_traceback = 1
             self.field_dict[key] = value
+
         return setter
 
     def register_fields(self):
@@ -381,7 +420,7 @@ def copy_from(self, other):
         assert isinstance(other, Field)
         assert set(self.keys) == set(other.keys)
         for k in self.keys:
-            self[k].copy_from(other[k])
+            self.field_dict[k].copy_from(other[k])
 
     @python_scope
     def fill(self, val):
@@ -410,13 +449,13 @@ def get_member_field(self, key):
 
     @python_scope
     def from_numpy(self, array_dict):
-        for k in self.keys:
-            self(k).from_numpy(array_dict[k])
+        for k, v in self.items:
+            v.from_numpy(array_dict[k])
 
     @python_scope
     def from_torch(self, array_dict):
-        for k in self.keys:
-            self(k).from_torch(array_dict[k])
+        for k, v in self.items:
+            v.from_torch(array_dict[k])
 
     @python_scope
     def to_numpy(self):
@@ -441,31 +480,38 @@ def to_torch(self, device=None):
         """
         return {k: v.to_torch(device=device) for k, v in self.items}
 
-    
     @python_scope
     def __setitem__(self, indices, element):
         self.initialize_host_accessors()
-        for k, v in element.values():
-            self[indices][k] = v
-    
+        self[indices].set_entries(element)
+
     @python_scope
     def __getitem__(self, indices):
         self.initialize_host_accessors()
+        # scalar fields does not instantiate SNodeHostAccess by default
         entries = {
-            k: v[indices] for k, v in self.items
+            k: v.host_access(self.pad_key(indices))[0] if isinstance(
+                v, ScalarField) else v[indices]
+            for k, v in self.items
         }
         return Struct(entries)
-    
+
 
 class StructType(CompoundType):
-    
     def __init__(self, **kwargs):
-        self.members = kwargs
+        self.members = {}
+        for k, dtype in kwargs.items():
+            if isinstance(dtype, CompoundType):
+                self.members[k] = dtype
+            else:
+                self.members[k] = cook_dtype(dtype)
 
     def __call__(self, *args, **kwargs):
         if len(args) == 0:
             if kwargs == {}:
-                raise TaichiSyntaxError("Custom type instances need to be created with an initial value.")
+                raise TaichiSyntaxError(
+                    "Custom type instances need to be created with an initial value."
+                )
             else:
                 # initialize struct members by keywords
                 entries = kwargs
@@ -484,19 +530,25 @@ def cast(self, struct, in_place=False):
             struct = struct.copy()
         # sanity check members
         if self.members.keys() != struct.entries.keys():
-            raise TaichiSyntaxError("Incompatible arguments for custom struct members!")
+            raise TaichiSyntaxError(
+                "Incompatible arguments for custom struct members!")
         for k, dtype in self.members.items():
             if isinstance(dtype, CompoundType):
                 struct.entries[k] = dtype.cast(struct.entries[k])
             else:
-                struct.entries[k] = cast(struct.entries[k], dtype)
+                if in_python_scope():
+                    v = struct.entries[k]
+                    struct.entries[k] = int(
+                        v) if self.dtype in ti.integer_types else v
+                else:
+                    struct.entries[k] = cast(struct.entries[k], dtype)
         return struct
-        
+
     def empty(self):
         """
         Create an empty instance of the given compound type.
         """
         return Struct.empty(self.members.keys())
-    
+
     def field(self, **kwargs):
         return Struct.field(self.members, **kwargs)
diff --git a/python/taichi/lang/types.py b/python/taichi/lang/types.py
index 0105bf3d93308..b5e23fb7802f1 100644
--- a/python/taichi/lang/types.py
+++ b/python/taichi/lang/types.py
@@ -4,8 +4,6 @@
 
 
 class CompoundType:
-    
-
     def empty(self):
         """
         Create an empty instance of the given compound type.
@@ -19,14 +17,17 @@ def scalar_filled(self, value):
     def field(self, **kwargs):
         raise NotImplementedError
 
+
 def matrix(m, n, dtype=None):
     from taichi.lang.matrix import MatrixType
     return MatrixType(m, n, dtype=dtype)
 
+
 def vector(m, dtype=None):
     from taichi.lang.matrix import MatrixType
     return MatrixType(m, 1, dtype=dtype)
 
+
 def struct(**kwargs):
     from taichi.lang.struct import StructType
     return StructType(**kwargs)

From c339281fce3ed0b7285a8e5f05781ef5f1f5633a Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 01:48:17 -0500
Subject: [PATCH 14/29] add custom struct tests

---
 tests/python/test_custom_struct.py | 213 +++++++++++++++++++++++++++++
 tests/python/test_struct.py        |  37 -----
 2 files changed, 213 insertions(+), 37 deletions(-)
 create mode 100644 tests/python/test_custom_struct.py

diff --git a/tests/python/test_custom_struct.py b/tests/python/test_custom_struct.py
new file mode 100644
index 0000000000000..5fb842bd7b119
--- /dev/null
+++ b/tests/python/test_custom_struct.py
@@ -0,0 +1,213 @@
+from pytest import approx
+import numpy as np
+
+import taichi as ti
+
+@ti.test()
+def test_struct_member_access():
+    n = 32
+
+    x = ti.Struct.field({"a": ti.f32, "b": ti.f32}, shape=(n, ))
+    y = ti.Struct.field({"a": ti.f32, "b": ti.f32})
+
+    ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(y)
+
+    @ti.kernel
+    def init():
+        for i in x:
+            x[i].a = i
+            y[i].a = i
+
+    @ti.kernel
+    def run_taichi_scope():
+        for i in x:
+            x[i].b = x[i].a
+
+    def run_python_scope():
+        for i in range(n):
+            y[i].b = y[i].a * 2 + 1
+
+    init()
+    run_taichi_scope()
+    for i in range(n):
+        assert x[i].b == i
+    run_python_scope()
+    for i in range(n):
+        assert y[i].b == i * 2 + 1
+
+
+@ti.test()
+def test_struct_whole_access():
+    n = 32
+    
+    # also tests implicit cast
+    x = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
+    y = ti.Struct.field({"a": ti.f32, "b": ti.i32})
+
+    ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(y)
+
+    @ti.kernel
+    def init():
+        for i in x:
+            x[i] = ti.Struct(a=2 * i, b=1.01 * i)
+
+    @ti.kernel
+    def run_taichi_scope():
+        for i in x:
+            # element-wise ops only work in Taichi scope
+            y[i] = x[i] * 2 + 1
+
+    def run_python_scope():
+        for i in range(n):
+            y[i] = ti.Struct(a=x[i].a, b=int(x[i].b))
+
+    init()
+    for i in range(n):
+        assert x[i].a == 2 * i
+        assert x[i].b == approx(1.01 * i, rel=1e-4)
+    run_taichi_scope()
+    for i in range(n):
+        assert y[i].a == 4 * i + 1
+        assert y[i].b == int((1.01 * i) * 2 + 1)
+    run_python_scope()
+    for i in range(n):
+        assert y[i].a == 2 * i
+        assert y[i].b == int(1.01 * i)
+    
+
+
+@ti.test()
+def test_struct_fill():
+    n = 32
+
+    # also tests implicit cast
+    x = ti.Struct.field({"a": ti.f32, "b": ti.types.vector(3, ti.i32)}, shape=(n,))
+
+    def fill_each():
+        x.a.fill(1.0)
+        x.b.fill(1.5)
+
+    def fill_all():
+        x.fill(2.5)
+
+    @ti.kernel
+    def fill_elements():
+        for i in x:
+            x[i].fill(i + 0.5)
+
+    fill_each()
+    for i in range(n):
+        assert x[i].a == 1.0
+        assert x[i].b[0] == 1 and x[i].b[1] == 1 and x[i].b[2] == 1
+    fill_all()
+    for i in range(n):
+        assert x[i].a == 2.5
+        assert x[i].b[0] == 2 and x[i].b[1] == 2 and x[i].b[2] == 2
+    fill_elements()
+    for i in range(n):
+        assert x[i].a == i + 0.5
+        assert np.allclose(x[i].b.to_numpy(), int(x[i].a))
+
+@ti.test()
+def test_matrix_type():
+    n = 32
+    vec2f = ti.types.vector(2, ti.f32)
+    vec3i = ti.types.vector(3, ti.i32)
+    x = vec3i.field()
+    ti.root.dense(ti.i, n).place(x)
+
+    @ti.kernel
+    def run_taichi_scope():
+        for i in x:
+            v = vec2f(i + 0.2)
+            # also tests implicit cast
+            x[i] = vec3i(v, i + 1.2)
+
+    def run_python_scope():
+        for i in range(n):
+            v = vec2f(i + 0.2)
+            x[i] = vec3i(i + 1.8, v)
+
+    run_taichi_scope()
+    for i in range(n):
+        assert np.allclose(x[i].to_numpy(), np.array([i, i, i + 1])) 
+    run_python_scope()
+    for i in range(n):
+        assert np.allclose(x[i].to_numpy(), np.array([i + 1, i, i])) 
+
+
+@ti.test()
+def test_struct_type():
+    n = 32
+    vec3f = ti.types.vector(3, float)
+    line3f = ti.types.struct(linedir=vec3f, length=float)
+    mystruct = ti.types.struct(line=line3f, idx=int)
+    x = mystruct.field(shape=(n,))
+
+    @ti.kernel
+    def run_taichi_scope():
+        for i in x:
+            v = vec3f(1)
+            line = line3f(linedir=v, length=i + 0.5)
+            x[i] = mystruct(line=line, idx=i)
+            
+    def run_python_scope():
+        for i in range(n):
+            v = vec3f(1)
+            x[i] = ti.Struct({"line": {"linedir": v, "length": i + 0.5}, "idx": i})
+
+    x.fill(5)
+    for i in range(n):
+        assert x[i].idx == 5
+        assert np.allclose(x[i].line.linedir.to_numpy(), 5.0)
+        assert x[i].line.length == 5.0
+    run_taichi_scope()
+    for i in range(n):
+        assert x[i].idx == i
+        assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+        assert x[i].line.length == i + 0.5
+    x.fill(5)
+    run_python_scope()
+    for i in range(n):
+        assert x[i].idx == i
+        assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+        assert x[i].line.length == i + 0.5
+
+
+@ti.test()
+def test_struct_assign():
+    n = 32
+    vec3f = ti.types.vector(3, float)
+    line3f = ti.types.struct(linedir=vec3f, length=float)
+    mystruct = ti.types.struct(line=line3f, idx=int)
+    x = mystruct.field(shape=(n,))
+    y = line3f.field(shape=(n,))
+
+    @ti.kernel
+    def init():
+        for i in y:
+            y[i] = line3f(linedir=vec3f(1), length=i + 0.5)
+
+    @ti.kernel
+    def run_taichi_scope():
+        for i in x:
+            x[i].idx = i
+            x[i].line = y[i]
+            
+    def run_python_scope():
+        for i in range(n):
+            x[i].idx = i
+            x[i].line = y[i]
+
+    init()
+    run_taichi_scope()
+    for i in range(n):
+        assert x[i].idx == i
+        assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+        assert x[i].line.length == i + 0.5
+    x.fill(5)
+    run_python_scope()
+    for i in range(n):
+        assert x[i].idx == i
+        assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+        assert x[i].line.length == i + 0.5
\ No newline at end of file
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index d014bcc802f87..6b1b2645ab685 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -77,40 +77,3 @@ def test_2d_nested():
     for i in range(n * 2):
         for j in range(n):
             assert x[i, j] == i + j * 10
-
-
-@ti.all_archs
-def test_custom_struct():
-    n = 32
-
-    # May also want to suuport vector struct using compound types such as
-    # st = ti.types.struct(a=ti.types.vector(3, ti.f32), b=ti.f32)
-    # f = ti.field(dtype=st, shape=(n, ))
-
-    x = ti.Struct.field({"a": ti.f32, "b": ti.f32}, shape=(n, ))
-    y = ti.Struct.field({"a": ti.f32, "b": ti.f32})
-
-    ti.root.dense(ti.i, n // 4).dense(ti.i, 4).place(y)
-
-    @ti.kernel
-    def init():
-        for i in x:
-            x[i].a = i
-            y[i].a = i
-
-    @ti.kernel
-    def run_taichi_scope():
-        for i in x:
-            x[i].b = x[i].a
-
-    def run_python_scope():
-        for i in range(n):
-            y[i].b = y[i].a * 2 + 1
-
-    init()
-    run_taichi_scope()
-    for i in range(n):
-        assert x[i].b == i
-    run_python_scope()
-    for i in range(n):
-        assert y[i].b == i * 2 + 1

From 9b2fac6f47ca01f05990628d39e23a8bd9e3eefb Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Wed, 25 Aug 2021 07:36:56 +0000
Subject: [PATCH 15/29] Auto Format

---
 python/taichi/lang/field.py        |  3 ++-
 python/taichi/lang/impl.py         |  4 ++--
 python/taichi/lang/matrix.py       |  4 ++--
 python/taichi/lang/struct.py       |  4 ++--
 python/taichi/main.py              |  4 ++--
 tests/python/test_custom_struct.py | 37 +++++++++++++++++++-----------
 6 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/python/taichi/lang/field.py b/python/taichi/lang/field.py
index f0de0defb3df5..29a2d11de19c3 100644
--- a/python/taichi/lang/field.py
+++ b/python/taichi/lang/field.py
@@ -1,8 +1,9 @@
-import taichi as ti
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl
 from taichi.lang.util import python_scope, to_numpy_type, to_pytorch_type
 
+import taichi as ti
+
 
 class Field:
     """Taichi field with SNode implementation.
diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index 5ac2a09587cf8..a1091d1bfa505 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -3,8 +3,6 @@
 from types import FunctionType, MethodType
 
 import numpy as np
-
-import taichi as ti
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang.exception import InvalidOperationError, TaichiSyntaxError
 from taichi.lang.expr import Expr, make_expr_group
@@ -20,6 +18,8 @@
 from taichi.misc.util import deprecated, get_traceback, warning
 from taichi.snode.fields_builder import FieldsBuilder
 
+import taichi as ti
+
 
 @taichi_scope
 def expr_init_local_tensor(shape, element_type, elements):
diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index d892c33d7764f..792e4e5c1c4fc 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -3,8 +3,6 @@
 from collections.abc import Iterable
 
 import numpy as np
-
-import taichi as ti
 from taichi.lang import expr, impl
 from taichi.lang import kernel_impl as kern_mod
 from taichi.lang import ops as ops_mod
@@ -21,6 +19,8 @@
                               to_pytorch_type)
 from taichi.misc.util import deprecated, warning
 
+import taichi as ti
+
 
 class Matrix(TaichiOperations):
     """The matrix class.
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 9c7df528fbc6b..44cd675d0a76c 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -2,8 +2,6 @@
 import numbers
 
 from numpy.lib.arraysetops import isin
-
-import taichi as ti
 from taichi.lang import impl
 from taichi.lang.common_ops import TaichiOperations
 from taichi.lang.exception import TaichiSyntaxError
@@ -15,6 +13,8 @@
 from taichi.lang.util import (cook_dtype, in_python_scope, is_taichi_class,
                               python_scope, taichi_scope)
 
+import taichi as ti
+
 
 class Struct(TaichiOperations):
     """The Struct type class.
diff --git a/python/taichi/main.py b/python/taichi/main.py
index 7776b46ff2af4..c26bb89867a21 100644
--- a/python/taichi/main.py
+++ b/python/taichi/main.py
@@ -11,12 +11,12 @@
 from pathlib import Path
 
 from colorama import Back, Fore, Style
-
-import taichi as ti
 from taichi.core import settings
 from taichi.core import ti_core as _ti_core
 from taichi.tools import video
 
+import taichi as ti
+
 
 def timer(func):
     """Function decorator to benchmark a function runnign time."""
diff --git a/tests/python/test_custom_struct.py b/tests/python/test_custom_struct.py
index 5fb842bd7b119..15a7a7d91c345 100644
--- a/tests/python/test_custom_struct.py
+++ b/tests/python/test_custom_struct.py
@@ -1,8 +1,9 @@
-from pytest import approx
 import numpy as np
+from pytest import approx
 
 import taichi as ti
 
+
 @ti.test()
 def test_struct_member_access():
     n = 32
@@ -39,7 +40,7 @@ def run_python_scope():
 @ti.test()
 def test_struct_whole_access():
     n = 32
-    
+
     # also tests implicit cast
     x = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
     y = ti.Struct.field({"a": ti.f32, "b": ti.i32})
@@ -73,7 +74,6 @@ def run_python_scope():
     for i in range(n):
         assert y[i].a == 2 * i
         assert y[i].b == int(1.01 * i)
-    
 
 
 @ti.test()
@@ -81,7 +81,11 @@ def test_struct_fill():
     n = 32
 
     # also tests implicit cast
-    x = ti.Struct.field({"a": ti.f32, "b": ti.types.vector(3, ti.i32)}, shape=(n,))
+    x = ti.Struct.field({
+        "a": ti.f32,
+        "b": ti.types.vector(3, ti.i32)
+    },
+                        shape=(n, ))
 
     def fill_each():
         x.a.fill(1.0)
@@ -108,6 +112,7 @@ def fill_elements():
         assert x[i].a == i + 0.5
         assert np.allclose(x[i].b.to_numpy(), int(x[i].a))
 
+
 @ti.test()
 def test_matrix_type():
     n = 32
@@ -130,10 +135,10 @@ def run_python_scope():
 
     run_taichi_scope()
     for i in range(n):
-        assert np.allclose(x[i].to_numpy(), np.array([i, i, i + 1])) 
+        assert np.allclose(x[i].to_numpy(), np.array([i, i, i + 1]))
     run_python_scope()
     for i in range(n):
-        assert np.allclose(x[i].to_numpy(), np.array([i + 1, i, i])) 
+        assert np.allclose(x[i].to_numpy(), np.array([i + 1, i, i]))
 
 
 @ti.test()
@@ -142,7 +147,7 @@ def test_struct_type():
     vec3f = ti.types.vector(3, float)
     line3f = ti.types.struct(linedir=vec3f, length=float)
     mystruct = ti.types.struct(line=line3f, idx=int)
-    x = mystruct.field(shape=(n,))
+    x = mystruct.field(shape=(n, ))
 
     @ti.kernel
     def run_taichi_scope():
@@ -150,11 +155,17 @@ def run_taichi_scope():
             v = vec3f(1)
             line = line3f(linedir=v, length=i + 0.5)
             x[i] = mystruct(line=line, idx=i)
-            
+
     def run_python_scope():
         for i in range(n):
             v = vec3f(1)
-            x[i] = ti.Struct({"line": {"linedir": v, "length": i + 0.5}, "idx": i})
+            x[i] = ti.Struct({
+                "line": {
+                    "linedir": v,
+                    "length": i + 0.5
+                },
+                "idx": i
+            })
 
     x.fill(5)
     for i in range(n):
@@ -180,8 +191,8 @@ def test_struct_assign():
     vec3f = ti.types.vector(3, float)
     line3f = ti.types.struct(linedir=vec3f, length=float)
     mystruct = ti.types.struct(line=line3f, idx=int)
-    x = mystruct.field(shape=(n,))
-    y = line3f.field(shape=(n,))
+    x = mystruct.field(shape=(n, ))
+    y = line3f.field(shape=(n, ))
 
     @ti.kernel
     def init():
@@ -193,7 +204,7 @@ def run_taichi_scope():
         for i in x:
             x[i].idx = i
             x[i].line = y[i]
-            
+
     def run_python_scope():
         for i in range(n):
             x[i].idx = i
@@ -210,4 +221,4 @@ def run_python_scope():
     for i in range(n):
         assert x[i].idx == i
         assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
-        assert x[i].line.length == i + 0.5
\ No newline at end of file
+        assert x[i].line.length == i + 0.5

From 2a8929b5be8fae4efd41f81593bc674e1a726ade Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 17:47:20 -0500
Subject: [PATCH 16/29] add struct and compound type documentation

---
 docs/lang/api/struct.md              | 109 +++++++++++++++++++++++++++
 docs/lang/articles/basic/external.md |  16 +++-
 docs/lang/articles/basic/field.md    |  36 ++++++++-
 docs/lang/articles/basic/type.md     |  44 ++++++++++-
 4 files changed, 199 insertions(+), 6 deletions(-)
 create mode 100644 docs/lang/api/struct.md

diff --git a/docs/lang/api/struct.md b/docs/lang/api/struct.md
new file mode 100644
index 0000000000000..76e91b3a2de52
--- /dev/null
+++ b/docs/lang/api/struct.md
@@ -0,0 +1,109 @@
+---
+sidebar_position: 3
+---
+
+# Structs
+
+Mixed-data-type records can be created in Taichi as custom structs. A struct in Taichi can have two forms. Similar to vectors and matrices, structs have both local variable and global field forms.
+
+## Declaration
+
+### As global struct fields
+
+::: {.function}
+ti.Struct.field(members, shape = None, offset = None)
+
+parameter members
+
+: (Dict[str, DataType]) name and data type for each struct member. The data type of a member can be either a primitive type (numbers) or a compound type (vectors, matrices, structs).
+
+parameter shape
+
+: (optional, scalar or tuple) shape of the struct field, see
+`tensor`{.interpreted-text role="ref"}
+
+parameter offset
+
+: (optional, scalar or tuple) see `offset`{.interpreted-text
+role="ref"}
+
+For example, this creates a Struct field of the two float members `a` and `b`: :
+
+    # Python-scope
+    x = ti.Struct.field({'a': ti.f32, 'b': ti.f32}, shape=(5, 4))
+
+A struct field with vector, matrix, or struct components can be created with compound types: :
+
+    # Python-scope
+    vec3 = ti.types.vector(3, float)
+    x = ti.Struct.field({'a': ti.f32, 'b': vec3}, shape=(5, 4))
+
+:::
+
+### As a temporary local variable
+
+A local Struct variable can be created with *either* a dictionary *or* keyword arguments.
+
+::: {.function}
+ti.Struct(members, **kwargs)
+
+parameter members
+
+: (Dict) The dictionary containing struct members.
+
+parameter **kwargs
+
+: The keyword arguments to specify struct members.
+
+Lists and nested dictionaries in the member dictionary or keyword arguments will be converted into local `ti.Matrix` and `ti.Struct`, respectively.
+
+For example, this creates a struct with a float member `a` and vector member `b`:
+
+    # Taichi-scope
+    x = ti.Struct({'a': 1.0, 'b': [1.0, 1.0, 1.0]})
+    # or 
+    x = ti.Struct(a=1.0, b=ti.Vector([1.0, 1.0, 1.0]))
+
+:::
+
+## Accessing components
+
+### As global struct fields
+
+Global struct field members are accessed as object attributes. For example, this extracts the member `a` of struct `x[6, 3]`: :
+
+    a = x[6, 3].a
+
+    # or
+    s = x[6, 3]
+    a = s.a
+
+In contrast to vector and matrix fields, struct members in a global struct field can be accessed in both attribute-first and index-first manners:
+
+    a = x[6, 3].a
+    # is equivalent to
+    a = x.a[6, 3]
+
+This allows for all field elements for a given struct field member to be extracted as a whole as a scalar, vector, or matrix field by accessing the member attributes of *the global struct field*:
+
+    # Python-scope
+    vec3 = ti.types.vector(3, float)
+    x = ti.Struct.field({'a': ti.f32, 'b': vec3}, shape=(5, 4))
+
+    x.a.fill(1.0) # x.a is equivalent to ti.field(ti.f32, shape=(5, 4))
+    x.b.fill(2.0) # x.b is equivalent to ti.Vector.field(3, ti.f32, shape=(5, 4))
+    a = x.a.to_numpy()
+    b = x.b.to_numpy()
+
+### As a temporary local variable
+
+Members of a local struct can be accessed using either attributes (object-like) or keys (dict-like):
+
+    x = ti.Struct(a=1.0, b=ti.Vector([1.0, 1.0, 1.0]))
+    a = x['a'] # a = 1.0
+    x.b = ti.Vector([1.0, 2.0, 3.0])
+
+## Element-wise operations (WIP)
+
+TODO: add element wise operations docs
+
diff --git a/docs/lang/articles/basic/external.md b/docs/lang/articles/basic/external.md
index fef58e6a38692..83d24bf19c585 100644
--- a/docs/lang/articles/basic/external.md
+++ b/docs/lang/articles/basic/external.md
@@ -83,7 +83,7 @@ field.from_numpy(array)  # the input array must be of shape (233, 666, 3)
 ```
 
 - For matrix fields, if the matrix is `n*m`, then **the shape of NumPy
-  array should be** `(*field_shape, matrix_n, matrix_m)`:
+array should be** `(*field_shape, matrix_n, matrix_m)`:
 
 ```python
 field = ti.Matrix.field(3, 4, ti.i32, shape=(233, 666))
@@ -97,6 +97,20 @@ array.shape  # (233, 666, 3, 4)
 field.from_numpy(array)  # the input array must be of shape (233, 666, 3, 4)
 ```
 
+- For struct fields, the external array will be exported as **a dictionary of arrays** with the keys being struct member names and values being struct member arrays. Nested structs will be exported as nested dictionaries:
+
+```python
+field = ti.Struct.field({'a': ti.i32, 'b': ti.types.vector(3, float)}, shape=(233, 666))
+field.shape # (233, 666)
+
+array_dict = field.to_numpy()
+array_dict.keys() # dict_keys(['a', 'b'])
+array_dict['a'].shape # (233, 666)
+array_dict['b'].shape # (233, 666, 3)
+
+field.from_numpy(array_dict) # the input dictionary must have the same keys as the field
+```
+
 ## Using external arrays as Taichi kernel arguments
 
 Use the type hint `ti.ext_arr()` for passing external arrays as kernel
diff --git a/docs/lang/articles/basic/field.md b/docs/lang/articles/basic/field.md
index 3a17e13728a42..bcf0a33766714 100644
--- a/docs/lang/articles/basic/field.md
+++ b/docs/lang/articles/basic/field.md
@@ -20,7 +20,7 @@ A simple example might help you understand scalar fields. Assume you have a rect
 heat field on the wok:
 
 ``` python
-heat_field = taichi.field(dtype=ti.f32, shape=(width_wok, height_wok))
+heat_field = ti.field(dtype=ti.f32, shape=(width_wok, height_wok))
 ```
 
 - Every global variable is an N-dimensional field.
@@ -40,7 +40,7 @@ heat_field = taichi.field(dtype=ti.f32, shape=(width_wok, height_wok))
 ## Vector fields
 We are all live in a gravitational field which is a vector field. At each position of the 3D space, there is a gravity force vector. The gravitational field could be represent with:
 ```python
-gravitational_field = taichi.Vector.field(n = 3,dtype=ti.f32,shape=(x,y,z))
+gravitational_field = ti.Vector.field(n = 3,dtype=ti.f32,shape=(x,y,z))
 ```
 `x,y,z` are the sizes of each dimension of the 3D space respectively.  `n` is the number of elements of the gravity force vector.
 
@@ -51,7 +51,7 @@ gravitational_field = taichi.Vector.field(n = 3,dtype=ti.f32,shape=(x,y,z))
 Field elements can also be matrices. In continuum mechanics, each
 infinitesimal point in a material exists a strain and a stress tensor. The strain and stress tensor is a 3 by 3 matrix in the 3D space. To represent this tensor field we could use:
 ```python
-strain_tensor_field = taichi.Matrix.field(n = 3,m = 3, dtype=ti.f32, shape=(x,y,z))
+strain_tensor_field = ti.Matrix.field(n = 3,m = 3, dtype=ti.f32, shape=(x,y,z))
 ```
 
 `x,y,z` are the sizes of each dimension of the 3D material respectively. `n, m` are the dimensions of the strain tensor.
@@ -87,3 +87,33 @@ declare a field of size `64`. E.g., instead of declaring
 `ti.Matrix.field(64, 32, dtype=ti.f32, shape=(3, 2))`, declare
 `ti.Matrix.field(3, 2, dtype=ti.f32, shape=(64, 32))`. Try to put large
 dimensions to fields instead of matrices.
+
+## Struct fields
+In addition to vectors and matrices, field elements can be user-defined structs. A struct variable may contain scalars, vectors/matrices, or other structs as its members. A struct field is created by providing a dictionary of name and data type of each member. For example, a 1D field of particles with position, velocity, acceleration, and mass for each particle can be represented as:
+```python
+particle_field = ti.Struct.field({
+    "pos": ti.types.vector(3, ti.f32),
+    "vel": ti.types.vector(3, ti.f32),
+    "acc": ti.types.vector(3, ti.f32),
+    "mass": ti.f32,
+  }, shape=(n,))
+```
+[Compound types](type.md#compound-types) (`ti.types.vector`, `ti.types.matrix`, and `ti.types.struct`) need to be used to create vectors, matrices, or structs as field members. Apart from using `ti.Struct.field`, the above particle field can be alternatively created using field creation from compound types as:
+```python
+vec3f = ti.types.vector(3, ti.f32)
+particle = ti.types.struct(
+  pos=vec3f, vel=vec3f, acc=vec3f, mass=ti.f32,
+)
+particle_field = particle.field(shape=(n,))
+```
+Members of a struct field can be accessed either locally (i.e., member of a struct field element) or globally (i.e., member field of a struct field):
+```python
+# set the position of the first particle to origin
+particle_field[0] # local ti.Struct
+particle_field[0].pos = ti.Vector([0.0, 0.0, 0.0])
+
+# make the mass of all particles be 1
+particle_field.mass # global ti.Vector.field
+particle_field.mass.fill(1.0)
+```
+- See [Structs](../../api/struct.md) for more on matrices.
\ No newline at end of file
diff --git a/docs/lang/articles/basic/type.md b/docs/lang/articles/basic/type.md
index c63232b48baf8..8ce7ab10e93ed 100644
--- a/docs/lang/articles/basic/type.md
+++ b/docs/lang/articles/basic/type.md
@@ -4,6 +4,10 @@ sidebar_position: 2
 
 # Type system
 
+Data types in Taichi consist of Primitive Types and Compound Types. Primitive Types are the numerical data types used by backends, while Compound Types are user-defined types of data records composed of multiple members.
+
+## Primitive types
+
 Taichi supports common numerical data types. Each type is denoted as a
 character indicating its _category_ and a number of _precision bits_,
 e.g., `i32` and `f64`.
@@ -29,9 +33,9 @@ For example, the two most commonly used types:
 - `i32` represents a 32-bit signed integer.
 - `f32` represents a 32-bit floating pointer number.
 
-## Supported types
+## Supported primitive types
 
-Currently, supported basic types in Taichi are
+Currently, supported primitive types in Taichi are
 
 - int8 `ti.i8`
 - int16 `ti.i16`
@@ -184,3 +188,39 @@ the same width as the the old type. For example, bit-casting `i32` to
 :::note
 For people from C++, `ti.bit_cast` is equivalent to `reinterpret_cast`.
 :::
+
+## Compound types
+
+User-defined compound types are created using the `ti.types` module. Supported compound types include vectors, matrices, and structs:
+
+```python
+vec2i = ti.types.vector(2, ti.i32)
+vec3f = ti.types.vector(3, float)
+mat2f = ti.types.matrix(2, 2, float)
+ray = ti.types.struct(ro=vec3f, rd=vec3f, l=ti.f32)
+```
+
+### Creating fields
+
+Fields of a given compound type can be created with the `.field()` method of a Compound Type:
+
+```python
+# ti.Vector.field(2, dtype=ti.i32, shape=(233, 666))
+x = vec2i.field(shape=(233, 666))
+
+# ti.Matrix.field(2, 2, dtype=float, shape=(233, 666))
+x = mat2f.field(shape=(233, 666))
+
+# ti.Struct.field({'ro': vec3f, 'rd': vec3f, 'l': ti.f32}, shape=(233, 666))
+x = ray.field(shape=(233, 666))
+```
+
+### Creating local variables
+Compound types can be directly called to create matrix or struct instances. Vectors and matrices can be created using GLSL-like broadcast syntax since the shape of the vector or matrix is already known:
+```python
+ray = ray3f(0.0) # ti.Struct(ro=[0.0, 0.0, 0.0], rd=[0.0, 0.0, 0.0], l=0.0)
+ro = vec3f(0.0) # ti.Vector([0.0, 0.0, 0.0])
+rd = vec3f(vec2i(0), 1) # ti.Vector([0.0, 0.0, 1.0]), will perform implicit cast
+ray2 = ray3f(ro=ro, rd=rd, l=1.0)
+```
+

From 4d41146c474ecb3b45702ac77e473bb0acf7f9c2 Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Wed, 25 Aug 2021 22:57:36 +0000
Subject: [PATCH 17/29] Auto Format

---
 docs/lang/api/struct.md           | 3 +--
 docs/lang/articles/basic/field.md | 2 +-
 docs/lang/articles/basic/type.md  | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/lang/api/struct.md b/docs/lang/api/struct.md
index 76e91b3a2de52..7db2042710610 100644
--- a/docs/lang/api/struct.md
+++ b/docs/lang/api/struct.md
@@ -61,7 +61,7 @@ For example, this creates a struct with a float member `a` and vector member `b`
 
     # Taichi-scope
     x = ti.Struct({'a': 1.0, 'b': [1.0, 1.0, 1.0]})
-    # or 
+    # or
     x = ti.Struct(a=1.0, b=ti.Vector([1.0, 1.0, 1.0]))
 
 :::
@@ -106,4 +106,3 @@ Members of a local struct can be accessed using both attributes (object-like) or
 ## Element-wise operations (WIP)
 
 TODO: add element wise operations docs
-
diff --git a/docs/lang/articles/basic/field.md b/docs/lang/articles/basic/field.md
index bcf0a33766714..cf03c39e7a0b7 100644
--- a/docs/lang/articles/basic/field.md
+++ b/docs/lang/articles/basic/field.md
@@ -116,4 +116,4 @@ particle_field[0].pos = ti.Vector([0.0, 0.0, 0.0])
 particle_field.mass # global ti.Vector.field
 particle_field.mass.fill(1.0)
 ```
-- See [Structs](../../api/struct.md) for more on matrices.
\ No newline at end of file
+- See [Structs](../../api/struct.md) for more on matrices.
diff --git a/docs/lang/articles/basic/type.md b/docs/lang/articles/basic/type.md
index 8ce7ab10e93ed..7c710fd0646e5 100644
--- a/docs/lang/articles/basic/type.md
+++ b/docs/lang/articles/basic/type.md
@@ -223,4 +223,3 @@ ro = vec3f(0.0) # ti.Vector([0.0, 0.0, 0.0])
 rd = vec3f(vec2i(0), 1) # ti.Vector([0.0, 0.0, 1.0]), will perform implicit cast
 ray2 = ray3f(ro=ro, rd=rd, l=1.0)
 ```
-

From 52751d067d569dfc8e9c3498dba5bcc613eca51d Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 20:50:43 -0500
Subject: [PATCH 18/29] make Matrix.to_numpy() work for both global field and
 local matrix

---
 python/taichi/lang/__init__.py | 218 ++-------------------------------
 python/taichi/lang/matrix.py   |  14 ++-
 2 files changed, 21 insertions(+), 211 deletions(-)

diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index a6e5ed11ee5dd..aef672825e086 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -2,13 +2,12 @@
 import os
 from copy import deepcopy as _deepcopy
 
-from taichi.core.util import locale_encode
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl, types
 from taichi.lang.enums import Layout
 from taichi.lang.exception import InvalidOperationError
 from taichi.lang.impl import *
-from taichi.lang.kernel_arguments import any_arr, ext_arr, template
+from taichi.lang.kernel_arguments import ext_arr, template
 from taichi.lang.kernel_impl import (KernelArgError, KernelDefError,
                                      data_oriented, func, kernel, pyfunc)
 from taichi.lang.matrix import Matrix, Vector
@@ -16,13 +15,11 @@
 from taichi.lang.ops import *
 from taichi.lang.quant_impl import quant
 from taichi.lang.runtime_ops import async_flush, sync
-from taichi.lang.struct import Struct
 from taichi.lang.transformer import TaichiSyntaxError
 from taichi.lang.type_factory_impl import type_factory
 from taichi.lang.util import (has_pytorch, is_taichi_class, python_scope,
                               taichi_scope, to_numpy_type, to_pytorch_type,
                               to_taichi_type)
-from taichi.misc.util import deprecated
 from taichi.snode.fields_builder import FieldsBuilder
 
 import taichi as ti
@@ -64,6 +61,12 @@
 vulkan = _ti_core.vulkan
 gpu = [cuda, metal, opengl, vulkan]
 cpu = _ti_core.host_arch()
+kernel_profiler_print = lambda: impl.get_runtime().prog.kernel_profiler_print()
+query_kernel_profiler = lambda name: impl.get_runtime(
+).prog.query_kernel_profiler(name)
+kernel_profiler_clear = lambda: impl.get_runtime().prog.kernel_profiler_clear()
+kernel_profiler_total_time = lambda: impl.get_runtime(
+).prog.kernel_profiler_total_time()
 timeline_clear = lambda: impl.get_runtime().prog.timeline_clear()
 timeline_save = lambda fn: impl.get_runtime().prog.timeline_save(fn)
 
@@ -71,110 +74,7 @@
 type_factory_ = _ti_core.get_type_factory_instance()
 
 
-@deprecated('kernel_profiler_print()', 'print_kernel_profile_info()')
-def kernel_profiler_print():
-    return print_kernel_profile_info()
-
-
-def print_kernel_profile_info():
-    """Print the elapsed time(min,max,avg) of Taichi kernels on devices.
-    To enable this profiler, set `kernel_profiler=True` in `ti.init`.
-
-    Example::
-
-        >>> import taichi as ti
-
-        >>> ti.init(ti.cpu, kernel_profiler=True)
-        >>> var = ti.field(ti.f32, shape=1)
-
-        >>> @ti.kernel
-        >>> def compute():
-        >>>     var[0] = 1.0
-
-        >>> compute()
-        >>> ti.print_kernel_profile_info() #[1]
-
-    Note:
-        [1] Currently the result of `KernelProfiler` could be incorrect on OpenGL
-        backend due to its lack of support for `ti.sync()`.
-    """
-    impl.get_runtime().prog.print_kernel_profile_info()
-
-
-def query_kernel_profile_info(name):
-    """Query kernel elapsed time(min,avg,max) on devices using the kernel name.
-    To enable this profiler, set `kernel_profiler=True` in `ti.init`.
-
-    Args:
-        name (str): kernel name.
-
-    Returns:
-        struct KernelProfilerQueryResult with member varaibles(counter, min, max, avg)
-
-    Example::
-
-        >>> import taichi as ti
-
-        >>> ti.init(ti.cpu, kernel_profiler=True)
-        >>> n = 1024*1024
-        >>> var = ti.field(ti.f32, shape=n)
-
-        >>> @ti.kernel
-        >>> def fill():
-        >>>     for i in range(n):
-        >>>         var[i] = 0.1
-
-        >>> fill()
-        >>> ti.clear_kernel_profile_info() #[1]
-        >>> for i in range(100):
-        >>>     fill()
-        >>> query_result = ti.query_kernel_profile_info(fill.__name__) #[2]
-        >>> print("kernel excuted times =",query_result.counter)
-        >>> print("kernel elapsed time(min_in_ms) =",query_result.min)
-        >>> print("kernel elapsed time(max_in_ms) =",query_result.max)
-        >>> print("kernel elapsed time(avg_in_ms) =",query_result.avg)
-
-    Note:
-        [1] To get the correct result, query_kernel_profile_info() must be used in conjunction with
-        clear_kernel_profile_info().
-
-        [2] Currently the result of `KernelProfiler` could be incorrect on OpenGL
-        backend due to its lack of support for `ti.sync()`.
-    """
-    return impl.get_runtime().prog.query_kernel_profile_info(name)
-
-
-@deprecated('kernel_profiler_clear()', 'clear_kernel_profile_info()')
-def kernel_profiler_clear():
-    return clear_kernel_profile_info()
-
-
-def clear_kernel_profile_info():
-    """
-    Clear all KernelProfiler records.
-    """
-    impl.get_runtime().prog.clear_kernel_profile_info()
-
-
-def kernel_profiler_total_time():
-    """
-    Get elapsed time of all kernels recorded in KernelProfiler.
-
-    Returns:
-        time (double): total time in second
-    """
-    return impl.get_runtime().prog.kernel_profiler_total_time()
-
-
-@deprecated('memory_profiler_print()', 'print_memory_profile_info()')
 def memory_profiler_print():
-    return print_memory_profile_info()
-
-
-def print_memory_profile_info():
-    """Memory profiling tool for LLVM backends with full sparse support.
-    This profiler is automatically on.
-    """
     impl.get_runtime().materialize()
     impl.get_runtime().prog.print_memory_profiler_info()
 
@@ -196,7 +96,6 @@ def is_extension_supported(arch, ext):
 
 
 def reset():
-    _ti_core.reset_snode_access_flag()
     impl.reset()
     global runtime
     runtime = impl.get_runtime()
@@ -250,21 +149,6 @@ def __init__(self):
         self.experimental_real_function = False
 
 
-def prepare_sandbox():
-    '''
-    Returns a temporary directory, which will be automatically deleted on exit.
-    It may contain the taichi_core shared object or some misc. files.
-    '''
-    import atexit
-    import shutil
-    from tempfile import mkdtemp
-    tmp_dir = mkdtemp(prefix='taichi-')
-    atexit.register(shutil.rmtree, tmp_dir)
-    print(f'[Taichi] preparing sandbox at {tmp_dir}')
-    os.mkdir(os.path.join(tmp_dir, 'runtime/'))
-    return tmp_dir
-
-
 def init(arch=None,
          default_fp=None,
          default_ip=None,
@@ -357,8 +241,6 @@ def init(arch=None,
         ti.info(f'Following TI_ARCH setting up for arch={env_arch}')
         arch = _ti_core.arch_from_name(env_arch)
     ti.cfg.arch = adaptive_arch_select(arch)
-    if ti.cfg.arch == cc:
-        _ti_core.set_tmp_dir(locale_encode(prepare_sandbox()))
     print(f'[Taichi] Starting on arch={_ti_core.arch_name(ti.cfg.arch)}')
 
     if _test_mode:
@@ -379,9 +261,6 @@ def no_activate(*args):
 
 
 def block_local(*args):
-    if ti.current_cfg().dynamic_index:
-        raise InvalidOperationError(
-            'dynamic_index is not allowed when block_local is turned on.')
     for a in args:
         for v in a.get_field_members():
             _ti_core.insert_snode_access_flag(
@@ -531,33 +410,6 @@ def randn(dt=None):
 
 
 def Tape(loss, clear_gradients=True):
-    """Return a context manager of :class:`~taichi.lang.tape.TapeImpl`. The
-    context manager would catching all of the callings of functions that
-    decorated by :func:`~taichi.lang.kernel_impl.kernel` or
-    :func:`~taichi.lang.complex_kernel` under `with` statement, and calculate
-    all the partial gradients of a given loss variable by calling all of the
-    gradient function of the callings caught in reverse order while `with`
-    statement ended.
-
-    See also :func:`~taichi.lang.kernel_impl.kernel` and
-    :func:`~taichi.lang.complex_kernel` for gradient functions.
-
-    Args:
-        loss(:class:`~taichi.lang.expr.Expr`): The loss field, which shape should be ().
-        clear_gradients(Bool): Before `with` body start, clear all gradients or not.
-
-    Returns:
-        :class:`~taichi.lang.tape.TapeImpl`: The context manager.
-
-    Example::
-
-        >>> @ti.kernel
-        >>> def sum(a: ti.float32):
-        >>>     for I in ti.grouped(x):
-        >>>         y[None] += x[I] ** a
-        >>>
-        >>> with ti.Tape(loss = y):
-        >>>     sum(2)"""
     impl.get_runtime().materialize()
     if len(loss.shape) != 0:
         raise RuntimeError(
@@ -576,7 +428,6 @@ def Tape(loss, clear_gradients=True):
 
 
 def clear_all_gradients():
-    """Set all fields' gradients to 0."""
     impl.get_runtime().materialize()
 
     def visit(node):
@@ -628,7 +479,7 @@ def run_benchmark():
         for i in range(3):
             func(*args)
             ti.sync()
-        ti.clear_kernel_profile_info()
+        ti.kernel_profiler_clear()
         t = time.time()
         for n in range(repeat):
             func(*args)
@@ -811,7 +662,7 @@ def is_arch_supported(arch):
         metal: _ti_core.with_metal,
         opengl: _ti_core.with_opengl,
         cc: _ti_core.with_cc,
-        vulkan: lambda: _ti_core.with_vulkan(),
+        vulkan: lambda: _ti_core.with_vulkan,
         wasm: lambda: True,
         cpu: lambda: True,
     }
@@ -833,7 +684,7 @@ def supported_archs():
     Returns:
         List[taichi_core.Arch]: All supported archs on the machine.
     """
-    archs = [cpu, cuda, metal, vulkan, opengl, cc]
+    archs = [cpu, cuda, metal, opengl, cc]
 
     wanted_archs = os.environ.get('TI_WANTED_ARCHS', '')
     want_exclude = wanted_archs.startswith('^')
@@ -996,15 +847,11 @@ def archs_support_sparse(test, **kwargs):
 def torch_test(func):
     if ti.has_pytorch():
         # OpenGL somehow crashes torch test without a reason, unforturnately
-        return ti.test(exclude=[opengl])(func)
+        return ti.archs_excluding(ti.opengl)(func)
     else:
         return lambda: None
 
 
-def get_host_arch_list():
-    return [_ti_core.host_arch()]
-
-
 # test with host arch only
 def host_arch_only(func):
     @functools.wraps(func)
@@ -1042,7 +889,7 @@ def decorator(func):
         def func__(*args, **kwargs):
             finishes = False
             try:
-                func(*args, **kwargs)
+                host_arch_only(func)(*args, **kwargs)
                 finishes = True
             except ex:
                 # throws. test passed
@@ -1060,38 +907,6 @@ def func__(*args, **kwargs):
 
 
 def complex_kernel(func):
-    """A decorator for python function that user can customize the gradient
-    function by the decorator generated by
-    :func:`~taichi.lang.complex_kernel_grad` for this function, and could be
-    caught automatically by ti.Tape(). This decorator would not automatically
-    converted the function to a taichi kernel. Users should call other taichi
-    kernels if in need to enable automatic parallel computing.
-
-    Args:
-        fn (Callable): The Python function which needs to be decorated.
-
-    Returns:
-        Callable: The decorated function.
-
-    Example::
-
-        >>> @ti.kernel
-        >>> def multiply(a: ti.float32):
-        >>>     for I in ti.grouped(x):
-        >>>         y[I] = x[I] * a
-        >>>
-        >>> @ti.kernel
-        >>> def multiply_grad(a: ti.float32):
-        >>>     for I in ti.grouped(x):
-        >>>         x.grad[I] = y.grad[I] / a
-        >>>
-        >>> @ti.complex_kernel
-        >>> def foo(a):
-        >>>     multiply(a)
-        >>>
-        >>> @ti.complex_kernel_grad(foo)
-        >>> def foo_grad(a):
-        >>>     multiply_grad(a)"""
     def decorated(*args, **kwargs):
         impl.get_runtime().inside_complex_kernel = True
         if impl.get_runtime().target_tape:
@@ -1106,15 +921,6 @@ def decorated(*args, **kwargs):
 
 
 def complex_kernel_grad(primal):
-    """Generate the gradient decorator for a given function decorated by
-    :func:`~taichi.lang.complex_kernel`. See :func:`~taichi.lang.complex_kernel`
-    to get further information and examples.
-
-    Args:
-        primal (Callable): The primal function for the decorator.
-
-    Returns:
-        Callable: The decorator."""
     def decorator(func):
         def decorated(*args, **kwargs):
             func(*args, **kwargs)
diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index 792e4e5c1c4fc..4062fd4cf3ccc 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -420,11 +420,15 @@ def w(self, value):
     @property
     @python_scope
     def value(self):
-        assert isinstance(self.entries[0], SNodeHostAccess)
-        ret = self.empty_copy()
-        for i in range(self.n):
-            for j in range(self.m):
-                ret.entries[i * self.m + j] = self(i, j)
+        if isinstance(self.entries[0], SNodeHostAccess):
+            # fetch values from SNodeHostAccessor
+            ret = self.empty_copy()
+            for i in range(self.n):
+                for j in range(self.m):
+                    ret.entries[i * self.m + j] = self(i, j)
+        else:
+            # is local python-scope matrix
+            ret = self.entries
         return ret
 
     # host access & python scope operation

From e47becea0c57e32b522f8b9c01a342ff6f6b8330 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 21:09:50 -0500
Subject: [PATCH 19/29] fix /lang/__init__.py

---
 python/taichi/lang/__init__.py | 218 +++++++++++++++++++++++++++++++--
 1 file changed, 206 insertions(+), 12 deletions(-)

diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index aef672825e086..a6e5ed11ee5dd 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -2,12 +2,13 @@
 import os
 from copy import deepcopy as _deepcopy
 
+from taichi.core.util import locale_encode
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl, types
 from taichi.lang.enums import Layout
 from taichi.lang.exception import InvalidOperationError
 from taichi.lang.impl import *
-from taichi.lang.kernel_arguments import ext_arr, template
+from taichi.lang.kernel_arguments import any_arr, ext_arr, template
 from taichi.lang.kernel_impl import (KernelArgError, KernelDefError,
                                      data_oriented, func, kernel, pyfunc)
 from taichi.lang.matrix import Matrix, Vector
@@ -15,11 +16,13 @@
 from taichi.lang.ops import *
 from taichi.lang.quant_impl import quant
 from taichi.lang.runtime_ops import async_flush, sync
+from taichi.lang.struct import Struct
 from taichi.lang.transformer import TaichiSyntaxError
 from taichi.lang.type_factory_impl import type_factory
 from taichi.lang.util import (has_pytorch, is_taichi_class, python_scope,
                               taichi_scope, to_numpy_type, to_pytorch_type,
                               to_taichi_type)
+from taichi.misc.util import deprecated
 from taichi.snode.fields_builder import FieldsBuilder
 
 import taichi as ti
@@ -61,12 +64,6 @@
 vulkan = _ti_core.vulkan
 gpu = [cuda, metal, opengl, vulkan]
 cpu = _ti_core.host_arch()
-kernel_profiler_print = lambda: impl.get_runtime().prog.kernel_profiler_print()
-query_kernel_profiler = lambda name: impl.get_runtime(
-).prog.query_kernel_profiler(name)
-kernel_profiler_clear = lambda: impl.get_runtime().prog.kernel_profiler_clear()
-kernel_profiler_total_time = lambda: impl.get_runtime(
-).prog.kernel_profiler_total_time()
 timeline_clear = lambda: impl.get_runtime().prog.timeline_clear()
 timeline_save = lambda fn: impl.get_runtime().prog.timeline_save(fn)
 
@@ -74,7 +71,110 @@
 type_factory_ = _ti_core.get_type_factory_instance()
 
 
+@deprecated('kernel_profiler_print()', 'print_kernel_profile_info()')
+def kernel_profiler_print():
+    return print_kernel_profile_info()
+
+
+def print_kernel_profile_info():
+    """Print the elapsed time(min,max,avg) of Taichi kernels on devices.
+    To enable this profiler, set `kernel_profiler=True` in `ti.init`.
+
+    Example::
+
+        >>> import taichi as ti
+
+        >>> ti.init(ti.cpu, kernel_profiler=True)
+        >>> var = ti.field(ti.f32, shape=1)
+
+        >>> @ti.kernel
+        >>> def compute():
+        >>>     var[0] = 1.0
+
+        >>> compute()
+        >>> ti.print_kernel_profile_info() #[1]
+
+    Note:
+        [1] Currently the result of `KernelProfiler` could be incorrect on OpenGL
+        backend due to its lack of support for `ti.sync()`.
+    """
+    impl.get_runtime().prog.print_kernel_profile_info()
+
+
+def query_kernel_profile_info(name):
+    """Query kernel elapsed time(min,avg,max) on devices using the kernel name.
+    To enable this profiler, set `kernel_profiler=True` in `ti.init`.
+
+    Args:
+        name (str): kernel name.
+
+    Returns:
+        struct KernelProfilerQueryResult with member varaibles(counter, min, max, avg)
+
+    Example::
+
+        >>> import taichi as ti
+
+        >>> ti.init(ti.cpu, kernel_profiler=True)
+        >>> n = 1024*1024
+        >>> var = ti.field(ti.f32, shape=n)
+
+        >>> @ti.kernel
+        >>> def fill():
+        >>>     for i in range(n):
+        >>>         var[i] = 0.1
+
+        >>> fill()
+        >>> ti.clear_kernel_profile_info() #[1]
+        >>> for i in range(100):
+        >>>     fill()
+        >>> query_result = ti.query_kernel_profile_info(fill.__name__) #[2]
+        >>> print("kernel excuted times =",query_result.counter)
+        >>> print("kernel elapsed time(min_in_ms) =",query_result.min)
+        >>> print("kernel elapsed time(max_in_ms) =",query_result.max)
+        >>> print("kernel elapsed time(avg_in_ms) =",query_result.avg)
+
+    Note:
+        [1] To get the correct result, query_kernel_profile_info() must be used in conjunction with
+        clear_kernel_profile_info().
+
+        [2] Currently the result of `KernelProfiler` could be incorrect on OpenGL
+        backend due to its lack of support for `ti.sync()`.
+    """
+    return impl.get_runtime().prog.query_kernel_profile_info(name)
+
+
+@deprecated('kernel_profiler_clear()', 'clear_kernel_profile_info()')
+def kernel_profiler_clear():
+    return clear_kernel_profile_info()
+
+
+def clear_kernel_profile_info():
+    """
+    Clear all KernelProfiler records.
+    """
+    impl.get_runtime().prog.clear_kernel_profile_info()
+
+
+def kernel_profiler_total_time():
+    """
+    Get elapsed time of all kernels recorded in KernelProfiler.
+
+    Returns:
+        time (double): total time in second
+    """
+    return impl.get_runtime().prog.kernel_profiler_total_time()
+
+
+@deprecated('memory_profiler_print()', 'print_memory_profile_info()')
 def memory_profiler_print():
+    return print_memory_profile_info()
+
+
+def print_memory_profile_info():
+    """Memory profiling tool for LLVM backends with full sparse support.
+    This profiler is automatically on.
+    """
     impl.get_runtime().materialize()
     impl.get_runtime().prog.print_memory_profiler_info()
 
@@ -96,6 +196,7 @@ def is_extension_supported(arch, ext):
 
 
 def reset():
+    _ti_core.reset_snode_access_flag()
     impl.reset()
     global runtime
     runtime = impl.get_runtime()
@@ -149,6 +250,21 @@ def __init__(self):
         self.experimental_real_function = False
 
 
+def prepare_sandbox():
+    '''
+    Returns a temporary directory, which will be automatically deleted on exit.
+    It may contain the taichi_core shared object or some misc. files.
+    '''
+    import atexit
+    import shutil
+    from tempfile import mkdtemp
+    tmp_dir = mkdtemp(prefix='taichi-')
+    atexit.register(shutil.rmtree, tmp_dir)
+    print(f'[Taichi] preparing sandbox at {tmp_dir}')
+    os.mkdir(os.path.join(tmp_dir, 'runtime/'))
+    return tmp_dir
+
+
 def init(arch=None,
          default_fp=None,
          default_ip=None,
@@ -241,6 +357,8 @@ def init(arch=None,
         ti.info(f'Following TI_ARCH setting up for arch={env_arch}')
         arch = _ti_core.arch_from_name(env_arch)
     ti.cfg.arch = adaptive_arch_select(arch)
+    if ti.cfg.arch == cc:
+        _ti_core.set_tmp_dir(locale_encode(prepare_sandbox()))
     print(f'[Taichi] Starting on arch={_ti_core.arch_name(ti.cfg.arch)}')
 
     if _test_mode:
@@ -261,6 +379,9 @@ def no_activate(*args):
 
 
 def block_local(*args):
+    if ti.current_cfg().dynamic_index:
+        raise InvalidOperationError(
+            'dynamic_index is not allowed when block_local is turned on.')
     for a in args:
         for v in a.get_field_members():
             _ti_core.insert_snode_access_flag(
@@ -410,6 +531,33 @@ def randn(dt=None):
 
 
 def Tape(loss, clear_gradients=True):
+    """Return a context manager of :class:`~taichi.lang.tape.TapeImpl`. The
+    context manager would catching all of the callings of functions that
+    decorated by :func:`~taichi.lang.kernel_impl.kernel` or
+    :func:`~taichi.lang.complex_kernel` under `with` statement, and calculate
+    all the partial gradients of a given loss variable by calling all of the
+    gradient function of the callings caught in reverse order while `with`
+    statement ended.
+
+    See also :func:`~taichi.lang.kernel_impl.kernel` and
+    :func:`~taichi.lang.complex_kernel` for gradient functions.
+
+    Args:
+        loss(:class:`~taichi.lang.expr.Expr`): The loss field, which shape should be ().
+        clear_gradients(Bool): Before `with` body start, clear all gradients or not.
+
+    Returns:
+        :class:`~taichi.lang.tape.TapeImpl`: The context manager.
+
+    Example::
+
+        >>> @ti.kernel
+        >>> def sum(a: ti.float32):
+        >>>     for I in ti.grouped(x):
+        >>>         y[None] += x[I] ** a
+        >>>
+        >>> with ti.Tape(loss = y):
+        >>>     sum(2)"""
     impl.get_runtime().materialize()
     if len(loss.shape) != 0:
         raise RuntimeError(
@@ -428,6 +576,7 @@ def Tape(loss, clear_gradients=True):
 
 
 def clear_all_gradients():
+    """Set all fields' gradients to 0."""
     impl.get_runtime().materialize()
 
     def visit(node):
@@ -479,7 +628,7 @@ def run_benchmark():
         for i in range(3):
             func(*args)
             ti.sync()
-        ti.kernel_profiler_clear()
+        ti.clear_kernel_profile_info()
         t = time.time()
         for n in range(repeat):
             func(*args)
@@ -662,7 +811,7 @@ def is_arch_supported(arch):
         metal: _ti_core.with_metal,
         opengl: _ti_core.with_opengl,
         cc: _ti_core.with_cc,
-        vulkan: lambda: _ti_core.with_vulkan,
+        vulkan: lambda: _ti_core.with_vulkan(),
         wasm: lambda: True,
         cpu: lambda: True,
     }
@@ -684,7 +833,7 @@ def supported_archs():
     Returns:
         List[taichi_core.Arch]: All supported archs on the machine.
     """
-    archs = [cpu, cuda, metal, opengl, cc]
+    archs = [cpu, cuda, metal, vulkan, opengl, cc]
 
     wanted_archs = os.environ.get('TI_WANTED_ARCHS', '')
     want_exclude = wanted_archs.startswith('^')
@@ -847,11 +996,15 @@ def archs_support_sparse(test, **kwargs):
 def torch_test(func):
     if ti.has_pytorch():
         # OpenGL somehow crashes torch test without a reason, unforturnately
-        return ti.archs_excluding(ti.opengl)(func)
+        return ti.test(exclude=[opengl])(func)
     else:
         return lambda: None
 
 
+def get_host_arch_list():
+    return [_ti_core.host_arch()]
+
+
 # test with host arch only
 def host_arch_only(func):
     @functools.wraps(func)
@@ -889,7 +1042,7 @@ def decorator(func):
         def func__(*args, **kwargs):
             finishes = False
             try:
-                host_arch_only(func)(*args, **kwargs)
+                func(*args, **kwargs)
                 finishes = True
             except ex:
                 # throws. test passed
@@ -907,6 +1060,38 @@ def func__(*args, **kwargs):
 
 
 def complex_kernel(func):
+    """A decorator for python function that user can customize the gradient
+    function by the decorator generated by
+    :func:`~taichi.lang.complex_kernel_grad` for this function, and could be
+    caught automatically by ti.Tape(). This decorator would not automatically
+    converted the function to a taichi kernel. Users should call other taichi
+    kernels if in need to enable automatic parallel computing.
+
+    Args:
+        fn (Callable): The Python function which needs to be decorated.
+
+    Returns:
+        Callable: The decorated function.
+
+    Example::
+
+        >>> @ti.kernel
+        >>> def multiply(a: ti.float32):
+        >>>     for I in ti.grouped(x):
+        >>>         y[I] = x[I] * a
+        >>>
+        >>> @ti.kernel
+        >>> def multiply_grad(a: ti.float32):
+        >>>     for I in ti.grouped(x):
+        >>>         x.grad[I] = y.grad[I] / a
+        >>>
+        >>> @ti.complex_kernel
+        >>> def foo(a):
+        >>>     multiply(a)
+        >>>
+        >>> @ti.complex_kernel_grad(foo)
+        >>> def foo_grad(a):
+        >>>     multiply_grad(a)"""
     def decorated(*args, **kwargs):
         impl.get_runtime().inside_complex_kernel = True
         if impl.get_runtime().target_tape:
@@ -921,6 +1106,15 @@ def decorated(*args, **kwargs):
 
 
 def complex_kernel_grad(primal):
+    """Generate the gradient decorator for a given function decorated by
+    :func:`~taichi.lang.complex_kernel`. See :func:`~taichi.lang.complex_kernel`
+    to get further information and examples.
+
+    Args:
+        primal (Callable): The primal function for the decorator.
+
+    Returns:
+        Callable: The decorator."""
     def decorator(func):
         def decorated(*args, **kwargs):
             func(*args, **kwargs)

From 01e9d84b5d67aa1619ec6e3b0505723e57e2c9f7 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 21:55:58 -0500
Subject: [PATCH 20/29] use enum Layout

---
 python/taichi/lang/matrix.py |  4 +---
 python/taichi/lang/struct.py | 11 +++--------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index 4062fd4cf3ccc..3b60a5392cbfc 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -9,8 +9,6 @@
 from taichi.lang.common_ops import TaichiOperations
 from taichi.lang.enums import Layout
 from taichi.lang.exception import TaichiSyntaxError
-from taichi.lang.expr import Expr
-from taichi.lang.ext_array import AnyArrayAccess
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
 from taichi.lang.ops import cast
 from taichi.lang.types import CompoundType
@@ -1390,7 +1388,7 @@ def __call__(self, *args):
             )
         elif len(args) == 1:
             # fill a single scalar
-            if isinstance(args[0], (numbers.Number, Expr)):
+            if isinstance(args[0], (numbers.Number, expr.Expr)):
                 return self.scalar_filled(args[0])
             # fill a single vector or matrix
             entries = args[0]
diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 44cd675d0a76c..d6dd241a75ec6 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -1,10 +1,10 @@
 import copy
 import numbers
 
-from numpy.lib.arraysetops import isin
 from taichi.lang import impl
 from taichi.lang.common_ops import TaichiOperations
 from taichi.lang.exception import TaichiSyntaxError
+from taichi.lang.enums import Layout
 from taichi.lang.expr import Expr
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
 from taichi.lang.matrix import Matrix
@@ -264,10 +264,8 @@ def field(cls,
               name="<Struct>",
               offset=None,
               needs_grad=False,
-              layout=None):
+              layout=Layout.AOS):
 
-        if layout is not None:
-            assert shape is not None, 'layout is useless without shape'
         if shape is None:
             assert offset is None, "shape cannot be None when offset is being set"
 
@@ -298,11 +296,8 @@ def field(cls,
                     offset
                 ), f'The dimensionality of shape and offset must be the same  ({len(shape)} != {len(offset)})'
 
-            if layout is None:
-                layout = ti.AOS
-
             dim = len(shape)
-            if layout.soa:
+            if layout == Layout.SOA:
                 for e in field_dict.values():
                     ti.root.dense(impl.index_nd(dim),
                                   shape).place(e, offset=offset)

From 6d40197b894dc905259b1ce11ac59b6a53274f52 Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Thu, 26 Aug 2021 02:58:38 +0000
Subject: [PATCH 21/29] Auto Format

---
 python/taichi/lang/struct.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index d6dd241a75ec6..361480562d460 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -3,8 +3,8 @@
 
 from taichi.lang import impl
 from taichi.lang.common_ops import TaichiOperations
-from taichi.lang.exception import TaichiSyntaxError
 from taichi.lang.enums import Layout
+from taichi.lang.exception import TaichiSyntaxError
 from taichi.lang.expr import Expr
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
 from taichi.lang.matrix import Matrix

From fa49f99f9fd48270cab7c0dbbd36e38f1be70565 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Wed, 25 Aug 2021 22:36:49 -0500
Subject: [PATCH 22/29] use np.int32 for test_from_numpy_struct

---
 tests/python/test_numpy_io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/test_numpy_io.py b/tests/python/test_numpy_io.py
index 192fcbe692eca..0270e8e061dda 100644
--- a/tests/python/test_numpy_io.py
+++ b/tests/python/test_numpy_io.py
@@ -68,8 +68,8 @@ def test_from_numpy_struct():
     f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
 
     arr_dict = {
-        "a": np.arange(n),
-        "b": np.arange(n) * 2,
+        "a": np.arange(n, dtype=np.int32),
+        "b": np.arange(n, dtype=np.int32) * 2,
     }
 
     f.from_numpy(arr_dict)

From 8178e0098c17ef9e2ade92138cf8aa4d7ed9ac74 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Fri, 27 Aug 2021 23:48:02 -0500
Subject: [PATCH 23/29] fix doc typo

---
 docs/lang/articles/basic/field.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/lang/articles/basic/field.md b/docs/lang/articles/basic/field.md
index cf03c39e7a0b7..fed025c0ffe4e 100644
--- a/docs/lang/articles/basic/field.md
+++ b/docs/lang/articles/basic/field.md
@@ -116,4 +116,4 @@ particle_field[0].pos = ti.Vector([0.0, 0.0, 0.0])
 particle_field.mass # global ti.Vector.field
 particle_field.mass.fill(1.0)
 ```
-- See [Structs](../../api/struct.md) for more on matrices.
+- See [Structs](../../api/struct.md) for more on structs.

From 26ed83939552b6c4234147d949ad63a478d6331b Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Fri, 27 Aug 2021 23:48:19 -0500
Subject: [PATCH 24/29] use ti.test()

---
 tests/python/test_numpy_io.py | 4 ++--
 tests/python/test_torch_io.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/python/test_numpy_io.py b/tests/python/test_numpy_io.py
index 0270e8e061dda..23adb0e561410 100644
--- a/tests/python/test_numpy_io.py
+++ b/tests/python/test_numpy_io.py
@@ -46,7 +46,7 @@ def test_from_numpy_2d():
             assert val[i, j] == i + j * 3
 
 
-@ti.all_archs
+@ti.test()
 def test_to_numpy_struct():
     n = 16
     f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
@@ -62,7 +62,7 @@ def test_to_numpy_struct():
         assert arr_dict["b"][i] == i * 2
 
 
-@ti.all_archs
+@ti.test()
 def test_from_numpy_struct():
     n = 16
     f = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
diff --git a/tests/python/test_torch_io.py b/tests/python/test_torch_io.py
index c679803e374c5..75d2bdb6a812b 100644
--- a/tests/python/test_torch_io.py
+++ b/tests/python/test_torch_io.py
@@ -175,7 +175,8 @@ def test_io_zeros():
     assert zeros[1, 2] == 4
 
 
-@ti.torch_test
+@pytest.mark.skipif(not ti.has_pytorch(), reason='Pytorch not installed.')
+@ti.test(exclude=ti.opengl)
 def test_io_struct():
     n = 16
     x1 = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))

From 8d9c5cd71e3e3f5d74405d773cb497c6e278c0f8 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Fri, 27 Aug 2021 23:49:02 -0500
Subject: [PATCH 25/29] address code review, make struct broadcast and empty
 copy recursive

---
 python/taichi/lang/struct.py | 61 +++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 361480562d460..241d62ff2b8ea 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -1,11 +1,12 @@
 import copy
 import numbers
 
-from taichi.lang import impl
+from numpy import broadcast
+
+from taichi.lang import expr, impl
 from taichi.lang.common_ops import TaichiOperations
 from taichi.lang.enums import Layout
 from taichi.lang.exception import TaichiSyntaxError
-from taichi.lang.expr import Expr
 from taichi.lang.field import Field, ScalarField, SNodeHostAccess
 from taichi.lang.matrix import Matrix
 from taichi.lang.ops import cast
@@ -108,7 +109,7 @@ def element_wise_unary(self, foo):
         _taichi_skip_traceback = 1
         ret = self.empty_copy()
         for k, v in self.items:
-            if isinstance(v, Expr):
+            if isinstance(v, expr.Expr):
                 ret.entries[k] = foo(v)
             else:
                 ret.entries[k] = v.element_wise_unary(foo)
@@ -123,14 +124,14 @@ def element_wise_binary(self, foo, other):
             assert self.entries.keys() == other.entries.keys(
             ), f"Member mismatch between structs {self.keys}, {other.keys}"
             for k, v in self.items:
-                if isinstance(v, Expr):
+                if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other.entries[k])
                 else:
                     ret.entries[k] = v.element_wise_binary(
                         foo, other.entries[k])
         else:  # assumed to be scalar
             for k, v in self.items:
-                if isinstance(v, Expr):
+                if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other)
                 else:
                     ret.entries[k] = v.element_wise_binary(foo, other)
@@ -141,7 +142,11 @@ def broadcast_copy(self, other):
             other = Struct(other)
         if not isinstance(other, Struct):
             ret = self.empty_copy()
-            ret.entries = {k: other for k in ret.keys}
+            for k, v in ret.items:
+                if isinstance(v, (Matrix, Struct)):
+                    ret.entries[k] = v.broadcast_copy(other)
+                else:
+                    ret.entries[k] = other
             other = ret
         assert self.entries.keys() == other.entries.keys(
         ), f"Member mismatch between structs {self.keys}, {other.keys}"
@@ -162,14 +167,14 @@ def element_wise_writeback_binary(self, foo, other):
             assert self.entries.keys() == other.entries.keys(
             ), f"Member mismatch between structs {self.keys}, {other.keys}"
             for k, v in self.items:
-                if isinstance(v, Expr):
+                if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other.entries[k])
                 else:
                     ret.entries[k] = v.element_wise_binary(
                         foo, other.entries[k])
         else:  # assumed to be scalar
             for k, v in self.items:
-                if isinstance(v, Expr):
+                if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other)
                 else:
                     ret.entries[k] = v.element_wise_binary(foo, other)
@@ -180,7 +185,7 @@ def element_wise_ternary(self, foo, other, extra):
         other = self.broadcast_copy(other)
         extra = self.broadcast_copy(extra)
         for k, v in self.items:
-            if isinstance(v, Expr):
+            if isinstance(v, expr.Expr):
                 ret.entries[k] = foo(v, other.entries[k], extra.entries[k])
             else:
                 ret.entries[k] = v.element_wise_ternary(
@@ -200,7 +205,14 @@ def assign_renamed(x, y):
         return self.element_wise_writeback_binary(assign_renamed, val)
 
     def empty_copy(self):
-        return Struct.empty(self.keys)
+        """
+        Nested structs and matrices need to be recursively handled.
+        """
+        struct = Struct.empty(self.keys)
+        for k, v in self.items:
+            if isinstance(v, (Struct, Matrix)):
+                struct.entries[k] = v.empty_copy()
+        return struct
 
     def copy(self):
         ret = self.empty_copy()
@@ -211,7 +223,7 @@ def copy(self):
     def variable(self):
         ret = self.copy()
         ret.entries = {
-            k: impl.expr_init(v) if isinstance(v, Expr) else v.variable()
+            k: impl.expr_init(v) if isinstance(v, (numbers.Number, expr.Expr)) else v.variable()
             for k, v in ret.items
         }
         return ret
@@ -226,7 +238,8 @@ def __iter__(self):
     def __str__(self):
         """Python scope struct array print support."""
         if impl.inside_kernel():
-            return f'<ti.Struct {", ".join([str(k) + "=" + str(v) for k, v in self.entries])}>'
+            item_str = ", ".join([str(k) + "=" + str(v) for k, v in self.items])
+            return f'<ti.Struct {item_str}>'
         else:
             return str(self.to_dict())
 
@@ -509,15 +522,16 @@ def __call__(self, *args, **kwargs):
                 )
             else:
                 # initialize struct members by keywords
-                entries = kwargs
+                entries = Struct(kwargs)
         elif len(args) == 1:
             # fill a single scalar
-            if isinstance(args[0], numbers.Number):
-                return self.scalar_filled(args[0])
-            # fill a single vector or matrix
-            # initialize struct members by dictionary
-            entries = args[0]
-        struct = self.cast(Struct(entries))
+            if isinstance(args[0], (numbers.Number, expr.Expr)):
+                entries = self.scalar_filled(args[0])
+            else:
+                # fill a single vector or matrix
+                # initialize struct members by dictionary
+                entries = Struct(args[0])
+        struct = self.cast(entries)
         return struct
 
     def cast(self, struct, in_place=False):
@@ -534,7 +548,7 @@ def cast(self, struct, in_place=False):
                 if in_python_scope():
                     v = struct.entries[k]
                     struct.entries[k] = int(
-                        v) if self.dtype in ti.integer_types else v
+                        v) if dtype in ti.integer_types else float(v)
                 else:
                     struct.entries[k] = cast(struct.entries[k], dtype)
         return struct
@@ -542,8 +556,13 @@ def cast(self, struct, in_place=False):
     def empty(self):
         """
         Create an empty instance of the given compound type.
+        Nested structs and matrices need to be recursively handled.
         """
-        return Struct.empty(self.members.keys())
+        struct = Struct.empty(self.members.keys())
+        for k, dtype in self.members.items():
+            if isinstance(dtype, CompoundType):
+                struct.entries[k] = dtype.empty()
+        return struct
 
     def field(self, **kwargs):
         return Struct.field(self.members, **kwargs)

From afd4098a67499ff41abdab62a3a6f5bef562b4fb Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Fri, 27 Aug 2021 23:49:30 -0500
Subject: [PATCH 26/29] add struct type initializer and type cast tests

---
 tests/python/test_custom_struct.py | 65 +++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/tests/python/test_custom_struct.py b/tests/python/test_custom_struct.py
index 15a7a7d91c345..f80a358e00fef 100644
--- a/tests/python/test_custom_struct.py
+++ b/tests/python/test_custom_struct.py
@@ -149,6 +149,15 @@ def test_struct_type():
     mystruct = ti.types.struct(line=line3f, idx=int)
     x = mystruct.field(shape=(n, ))
 
+    @ti.kernel
+    def init_taichi_scope():
+        for i in x:
+            x[i] = mystruct(1)
+
+    def init_python_scope():
+        for i in range(n):
+            x[i] = mystruct(3)
+
     @ti.kernel
     def run_taichi_scope():
         for i in x:
@@ -167,22 +176,31 @@ def run_python_scope():
                 "idx": i
             })
 
-    x.fill(5)
+    init_taichi_scope()
     for i in range(n):
-        assert x[i].idx == 5
-        assert np.allclose(x[i].line.linedir.to_numpy(), 5.0)
-        assert x[i].line.length == 5.0
+        assert x[i].idx == 1
+        assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
+        assert x[i].line.length == 1.0
     run_taichi_scope()
     for i in range(n):
         assert x[i].idx == i
         assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
         assert x[i].line.length == i + 0.5
-    x.fill(5)
+    init_python_scope()
+    for i in range(n):
+        assert x[i].idx == 3
+        assert np.allclose(x[i].line.linedir.to_numpy(), 3.0)
+        assert x[i].line.length == 3.0
     run_python_scope()
     for i in range(n):
         assert x[i].idx == i
         assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
         assert x[i].line.length == i + 0.5
+    x.fill(5)
+    for i in range(n):
+        assert x[i].idx == 5
+        assert np.allclose(x[i].line.linedir.to_numpy(), 5.0)
+        assert x[i].line.length == 5.0
 
 
 @ti.test()
@@ -222,3 +240,40 @@ def run_python_scope():
         assert x[i].idx == i
         assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
         assert x[i].line.length == i + 0.5
+
+@ti.test()
+def test_compound_type_implicit_cast():
+    vec2i = ti.types.vector(2, int)
+    vec2f = ti.types.vector(2, float)
+    structi = ti.types.struct(a=int, b=vec2i)
+    structf = ti.types.struct(a=float, b=vec2f)
+    
+
+    @ti.kernel
+    def f2i_taichi_scope() -> int:
+        s = structi(2.5)
+        return s.a + s.b[0] + s.b[1]
+
+    def f2i_python_scope():
+        s = structi(2.5)
+        return s.a + s.b[0] + s.b[1]
+
+    @ti.kernel
+    def i2f_taichi_scope() -> float:
+        s = structf(2)
+        return s.a + s.b[0] + s.b[1]
+
+    def i2f_python_scope():
+        s = structf(2)
+        return s.a + s.b[0] + s.b[1]
+    
+    int_value = f2i_taichi_scope()
+    assert type(int_value) == int and int_value == 6
+    int_value = f2i_python_scope()
+    assert type(int_value) == int and int_value == 6
+    float_value = i2f_taichi_scope()
+    assert type(float_value) == float and float_value == approx(6.0, rel=1e-4)
+    float_value = i2f_python_scope()
+    assert type(float_value) == float and float_value == approx(6.0, rel=1e-4)
+
+

From 074bc379e18f367e4265e2140b3c115c2e641bed Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Sat, 28 Aug 2021 04:51:44 +0000
Subject: [PATCH 27/29] Auto Format

---
 python/taichi/lang/struct.py       | 8 +++++---
 tests/python/test_custom_struct.py | 6 ++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 241d62ff2b8ea..04d6c1598d61e 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -2,7 +2,6 @@
 import numbers
 
 from numpy import broadcast
-
 from taichi.lang import expr, impl
 from taichi.lang.common_ops import TaichiOperations
 from taichi.lang.enums import Layout
@@ -223,7 +222,9 @@ def copy(self):
     def variable(self):
         ret = self.copy()
         ret.entries = {
-            k: impl.expr_init(v) if isinstance(v, (numbers.Number, expr.Expr)) else v.variable()
+            k: impl.expr_init(v) if isinstance(v,
+                                               (numbers.Number,
+                                                expr.Expr)) else v.variable()
             for k, v in ret.items
         }
         return ret
@@ -238,7 +239,8 @@ def __iter__(self):
     def __str__(self):
         """Python scope struct array print support."""
         if impl.inside_kernel():
-            item_str = ", ".join([str(k) + "=" + str(v) for k, v in self.items])
+            item_str = ", ".join(
+                [str(k) + "=" + str(v) for k, v in self.items])
             return f'<ti.Struct {item_str}>'
         else:
             return str(self.to_dict())
diff --git a/tests/python/test_custom_struct.py b/tests/python/test_custom_struct.py
index f80a358e00fef..f50421f9e5927 100644
--- a/tests/python/test_custom_struct.py
+++ b/tests/python/test_custom_struct.py
@@ -241,13 +241,13 @@ def run_python_scope():
         assert np.allclose(x[i].line.linedir.to_numpy(), 1.0)
         assert x[i].line.length == i + 0.5
 
+
 @ti.test()
 def test_compound_type_implicit_cast():
     vec2i = ti.types.vector(2, int)
     vec2f = ti.types.vector(2, float)
     structi = ti.types.struct(a=int, b=vec2i)
     structf = ti.types.struct(a=float, b=vec2f)
-    
 
     @ti.kernel
     def f2i_taichi_scope() -> int:
@@ -266,7 +266,7 @@ def i2f_taichi_scope() -> float:
     def i2f_python_scope():
         s = structf(2)
         return s.a + s.b[0] + s.b[1]
-    
+
     int_value = f2i_taichi_scope()
     assert type(int_value) == int and int_value == 6
     int_value = f2i_python_scope()
@@ -275,5 +275,3 @@ def i2f_python_scope():
     assert type(float_value) == float and float_value == approx(6.0, rel=1e-4)
     float_value = i2f_python_scope()
     assert type(float_value) == float and float_value == approx(6.0, rel=1e-4)
-
-

From b4b4b5d78631278c6c116e5eb0475a69aebb0a64 Mon Sep 17 00:00:00 2001
From: Andrew Sun <victoriacity74@gmail.com>
Date: Mon, 30 Aug 2021 00:42:59 -0500
Subject: [PATCH 28/29] replace assertions with raised errors

---
 python/taichi/lang/struct.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 04d6c1598d61e..1786a3cba26b6 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -120,8 +120,8 @@ def element_wise_binary(self, foo, other):
         if isinstance(other, (dict)):
             other = Struct(other)
         if isinstance(other, Struct):
-            assert self.entries.keys() == other.entries.keys(
-            ), f"Member mismatch between structs {self.keys}, {other.keys}"
+            if self.entries.keys() != other.entries.keys():
+                raise TypeError(f"Member mismatch between structs {self.keys}, {other.keys}")
             for k, v in self.items:
                 if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other.entries[k])
@@ -147,8 +147,8 @@ def broadcast_copy(self, other):
                 else:
                     ret.entries[k] = other
             other = ret
-        assert self.entries.keys() == other.entries.keys(
-        ), f"Member mismatch between structs {self.keys}, {other.keys}"
+        if self.entries.keys() != other.entries.keys():
+            raise TypeError(f"Member mismatch between structs {self.keys}, {other.keys}")
         return other
 
     def element_wise_writeback_binary(self, foo, other):
@@ -163,8 +163,8 @@ def element_wise_writeback_binary(self, foo, other):
                 f'taichi class {type(self)}, maybe you want to use `a.fill(b)` instead?'
             )
         if isinstance(other, Struct):
-            assert self.entries.keys() == other.entries.keys(
-            ), f"Member mismatch between structs {self.keys}, {other.keys}"
+            if self.entries.keys() != other.entries.keys():
+                raise TypeError(f"Member mismatch between structs {self.keys}, {other.keys}")
             for k, v in self.items:
                 if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other.entries[k])
@@ -281,8 +281,8 @@ def field(cls,
               needs_grad=False,
               layout=Layout.AOS):
 
-        if shape is None:
-            assert offset is None, "shape cannot be None when offset is being set"
+        if shape is None and offset is not None:
+            raise TaichiSyntaxError("shape cannot be None when offset is being set")
 
         field_dict = {}
 
@@ -306,11 +306,10 @@ def field(cls,
             if isinstance(offset, numbers.Number):
                 offset = (offset, )
 
-            if offset is not None:
-                assert len(shape) == len(
-                    offset
-                ), f'The dimensionality of shape and offset must be the same  ({len(shape)} != {len(offset)})'
-
+            if offset is not None and len(shape) != len(offset):
+                raise TaichiSyntaxError(
+                    f'The dimensionality of shape and offset must be the same ({len(shape)} != {len(offset)})'
+                )
             dim = len(shape)
             if layout == Layout.SOA:
                 for e in field_dict.values():

From 0a36e89f246083be6e4ccc120649895b7b71220c Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Mon, 30 Aug 2021 05:45:00 +0000
Subject: [PATCH 29/29] Auto Format

---
 python/taichi/lang/struct.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py
index 1786a3cba26b6..bab51cd0e6b96 100644
--- a/python/taichi/lang/struct.py
+++ b/python/taichi/lang/struct.py
@@ -121,7 +121,9 @@ def element_wise_binary(self, foo, other):
             other = Struct(other)
         if isinstance(other, Struct):
             if self.entries.keys() != other.entries.keys():
-                raise TypeError(f"Member mismatch between structs {self.keys}, {other.keys}")
+                raise TypeError(
+                    f"Member mismatch between structs {self.keys}, {other.keys}"
+                )
             for k, v in self.items:
                 if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other.entries[k])
@@ -148,7 +150,8 @@ def broadcast_copy(self, other):
                     ret.entries[k] = other
             other = ret
         if self.entries.keys() != other.entries.keys():
-            raise TypeError(f"Member mismatch between structs {self.keys}, {other.keys}")
+            raise TypeError(
+                f"Member mismatch between structs {self.keys}, {other.keys}")
         return other
 
     def element_wise_writeback_binary(self, foo, other):
@@ -164,7 +167,9 @@ def element_wise_writeback_binary(self, foo, other):
             )
         if isinstance(other, Struct):
             if self.entries.keys() != other.entries.keys():
-                raise TypeError(f"Member mismatch between structs {self.keys}, {other.keys}")
+                raise TypeError(
+                    f"Member mismatch between structs {self.keys}, {other.keys}"
+                )
             for k, v in self.items:
                 if isinstance(v, expr.Expr):
                     ret.entries[k] = foo(v, other.entries[k])
@@ -282,7 +287,8 @@ def field(cls,
               layout=Layout.AOS):
 
         if shape is None and offset is not None:
-            raise TaichiSyntaxError("shape cannot be None when offset is being set")
+            raise TaichiSyntaxError(
+                "shape cannot be None when offset is being set")
 
         field_dict = {}