Local execution e2e training #1472

Open

Wants to merge 40 commits into base: master

Commits (40)
6adb290  temporary weight adjust index (reyna-abhyankar, Aug 25, 2024)
61697c2  Loss function (reyna-abhyankar, Aug 27, 2024)
b56c046  Add cuda test for loss function (reyna-abhyankar, Aug 27, 2024)
f75a3d4  Format (reyna-abhyankar, Aug 27, 2024)
f74711f  Refactor and build optimizer kernels, op (reyna-abhyankar, Aug 27, 2024)
40c6252  Finish optimizer local backing (reyna-abhyankar, Aug 27, 2024)
ad9b9ea  Format (reyna-abhyankar, Aug 27, 2024)
1ddfade  E2E update test (reyna-abhyankar, Aug 27, 2024)
dde9496  Format (reyna-abhyankar, Aug 27, 2024)
59635d8  Small fixes (reyna-abhyankar, Sep 11, 2024)
103ef07  Format (reyna-abhyankar, Sep 11, 2024)
f48f9ff  Fix test and small issues (reyna-abhyankar, Sep 18, 2024)
189c9c8  Format (reyna-abhyankar, Sep 18, 2024)
d93f464  Merge branch 'repo-refactor' into local-e2e-training (reyna-abhyankar, Oct 1, 2024)
b5647c8  Pass tests after merge (reyna-abhyankar, Oct 1, 2024)
f5ff91e  Fix input/weight differentiation (reyna-abhyankar, Oct 1, 2024)
7470e71  Fix signature to use unified rep (reyna-abhyankar, Oct 1, 2024)
deece1b  Fix model training instance abstraction (reyna-abhyankar, Oct 1, 2024)
1d3cc94  Change subcase test name (reyna-abhyankar, Oct 1, 2024)
3cf5d08  Quick fixes (reyna-abhyankar, Oct 16, 2024)
79ef4c9  Refactor training backing and instance (reyna-abhyankar, Oct 22, 2024)
a73b1c3  Expose op folders publicly (reyna-abhyankar, Nov 13, 2024)
c6fed29  Add tensor type, operate over reduced tensor (reyna-abhyankar, Nov 13, 2024)
0cdfb1a  Fixes (reyna-abhyankar, Jan 7, 2025)
9d252b3  Remove tensor lower (reyna-abhyankar, Jan 15, 2025)
895c117  Add tensor and task lowering scheme (reyna-abhyankar, Jan 17, 2025)
411017d  Build local exec (reyna-abhyankar, Jan 22, 2025)
0128abb  Disaggregate local backend (reyna-abhyankar, Feb 1, 2025)
277f8c2  Update task binding interface and cost estimator (reyna-abhyankar, Feb 1, 2025)
377c6aa  Merge master into local execution (reyna-abhyankar, Feb 4, 2025)
8efaec7  Build (reyna-abhyankar, Feb 6, 2025)
1dc1398  Format (reyna-abhyankar, Feb 6, 2025)
17ad5c8  Split task spec files (reyna-abhyankar, Feb 6, 2025)
639c2c1  Delete outdated sim environment file (reyna-abhyankar, Feb 6, 2025)
a697044  Finish API (reyna-abhyankar, Feb 13, 2025)
187a8d5  Add tests for allocated and unallocated (reyna-abhyankar, Feb 13, 2025)
a0f8113  Fix nonnegative (reyna-abhyankar, Feb 13, 2025)
b1eab94  Format (reyna-abhyankar, Feb 13, 2025)
b532c50  Pass allocated-unallocated tests (reyna-abhyankar, Feb 13, 2025)
f28e5c2  Update task registry tests (reyna-abhyankar, Feb 13, 2025)
1 change: 1 addition & 0 deletions .proj.toml
@@ -12,6 +12,7 @@ build_targets = [
   "compiler",
   "substitution-generator",
   "local-execution",
+  "task-spec",
   "models",
   "export-model-arch",
   "substitution-to-dot",
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(runtime)
 add_subdirectory(op-attrs)
 add_subdirectory(kernels)
 add_subdirectory(local-execution)
+add_subdirectory(task-spec)
 add_subdirectory(utils)
 add_subdirectory(ffi)
 add_subdirectory(substitutions)
2 changes: 2 additions & 0 deletions lib/kernels/CMakeLists.txt
@@ -8,6 +8,8 @@ file(GLOB_RECURSE SRC
   LIST_DIRECTORIES False
   src/*.cc
   src/cuda/cuda_helper.cu
+  src/cuda/loss_function_kernels.cu
+  src/cuda/optimizer_kernels.cu
   src/cuda/ops/*.cu
 )
22 changes: 16 additions & 6 deletions lib/kernels/include/kernels/array_shape.h
@@ -15,9 +15,10 @@ namespace FlexFlow {
 struct ArrayShape {
 public:
   ArrayShape() = delete;
-  ArrayShape(nonnegative_int *dims, nonnegative_int num_dims);
-  ArrayShape(TensorShape const &shape);
-  ArrayShape(std::vector<nonnegative_int> const &);
+  explicit ArrayShape(nonnegative_int *dims, nonnegative_int num_dims);
+  explicit ArrayShape(TensorShape const &shape);
+  explicit ArrayShape(std::vector<nonnegative_int> const &);
+  explicit ArrayShape(LegionOrdered<nonnegative_int> const &);
 
   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
@@ -46,9 +47,11 @@ struct ArrayShape {
   std::optional<nonnegative_int> at_maybe(legion_dim_t) const;
   std::optional<nonnegative_int> at_maybe(ff_dim_t) const;
 
-  ArrayShape
-      sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-                std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
+  ArrayShape sub_shape(std::optional<ff_dim_t> start,
+                       std::optional<ff_dim_t> end) const;
+
+  ArrayShape sub_shape(std::optional<legion_dim_t> start,
+                       std::optional<legion_dim_t> end) const;
 
 public:
   LegionOrdered<nonnegative_int> dims;
@@ -66,4 +69,11 @@ std::ostream &operator<<(std::ostream &, ArrayShape const &);
 
 } // namespace FlexFlow
 
+namespace std {
+template <>
+struct hash<::FlexFlow::ArrayShape> {
+  size_t operator()(::FlexFlow::ArrayShape const &) const;
+};
+} // namespace std
+
 #endif
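The two typed sub_shape overloads replace the old variant-based signature, one per dimension-ordering convention. A minimal usage sketch (assumptions: the `_n` nonnegative_int literal and brace construction of `ff_dim_t`/`legion_dim_t`, as used elsewhere in this PR; the shape values are hypothetical):

```cpp
#include "kernels/array_shape.h"
#include <vector>

using namespace FlexFlow;

void sub_shape_example() {
  // Hypothetical 3-d shape; the vector constructor is now explicit.
  ArrayShape shape = ArrayShape{std::vector<nonnegative_int>{2_n, 3_n, 4_n}};

  // Bounds given in ff (outermost-first) coordinates:
  ArrayShape ff_slice = shape.sub_shape(ff_dim_t{0_n}, ff_dim_t{2_n});

  // Bounds given in legion coordinates; per the array_shape.cc diff below,
  // this overload maps its bounds through ff_dim_from_legion_dim and then
  // delegates to the ff-indexed overload:
  ArrayShape legion_slice =
      shape.sub_shape(legion_dim_t{0_n}, legion_dim_t{2_n});
}
```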
2 changes: 2 additions & 0 deletions lib/kernels/include/kernels/legion_dim.h
@@ -10,6 +10,8 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);
 
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);
 
+ff_dim_t ff_dim_from_legion_dim(legion_dim_t, nonnegative_int num_dimensions);
+
 template <typename T>
 using LegionOrdered = DimOrdered<legion_dim_t, T>;
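`ff_dim_from_legion_dim` is the inverse of the existing `legion_dim_from_ff_dim`. Assuming legion ordering is simply the reverse of ff ordering (index 0 innermost rather than outermost), the two should compose to the identity; a small sketch of that round trip:

```cpp
#include "kernels/legion_dim.h"
#include <cassert>

using namespace FlexFlow;

// Sketch under the stated reversal assumption; 4_n and 1_n use the
// nonnegative_int literal seen elsewhere in this PR.
void check_dim_round_trip() {
  nonnegative_int num_dims = 4_n;
  ff_dim_t d = ff_dim_t{1_n};

  // With reversed ordering, ff dim 1 of 4 would map to legion dim 2.
  legion_dim_t l = legion_dim_from_ff_dim(d, num_dims);

  // Mapping back must recover the original ff dim.
  assert(ff_dim_from_legion_dim(l, num_dims) == d);
}
```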
9 changes: 7 additions & 2 deletions lib/kernels/include/kernels/optimizer_kernels.h
@@ -1,7 +1,8 @@
 #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H
 #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H
 
-#include "device.h"
+#include "kernels/device.h"
+#include "kernels/ff_handle.h"
 
 namespace FlexFlow {
@@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t,
                               float lr,
                               float momentum,
                               bool nesterov,
-                              float weight_decay PerDeviceFFHandle const &,
+                              float weight_decay,
+                              PerDeviceFFHandle const &,
                               float const *weight_grad_ptr,
                               size_t size,
                               float *weight_ptr,
@@ -32,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t,
                              float beta2,
                              float weight_decay,
                              float epsilon,
+                             size_t size,
+                             int num_replicas,
                              float const *weight_grad_ptr,
                              float *adam_m_ptr,
                              float *adam_v_ptr,
@@ -43,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t,
                               float beta2,
                               float weight_decay,
                               float epsilon,
+                              size_t size,
                               PerDeviceFFHandle const &,
                               float const *weight_grad_ptr,
                               float *adam_m_ptr,
2 changes: 1 addition & 1 deletion lib/kernels/src/allocation.cc
@@ -15,7 +15,7 @@ GenericTensorAccessorW
     Allocator::allocate_tensor(TensorShape const &tensor_shape) {
   void *ptr =
       this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative());
-  return {tensor_shape.data_type, tensor_shape, ptr};
+  return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr};
 }
 
 } // namespace FlexFlow
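This call-site change is forced by the array_shape.h diff above: once `ArrayShape(TensorShape const &)` is explicit, copy-list-initialization in the braced return can no longer apply the conversion implicitly. A minimal illustration of the language rule, with hypothetical types rather than this PR's classes:

```cpp
struct From {};

struct To {
  explicit To(From const &) {}
};

To make_to(From const &f) {
  // return {f};    // ill-formed: copy-list-initialization cannot select an
  //                // explicit constructor
  return To{f};     // fine: the conversion is spelled out, as in the diff
}
```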
46 changes: 41 additions & 5 deletions lib/kernels/src/array_shape.cc
@@ -1,6 +1,8 @@
 #include "kernels/array_shape.h"
 #include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
+#include "utils/containers/reversed.h"
+#include "utils/containers/transform.h"
 #include "utils/containers/vector_of.h"
 #include "utils/nonnegative_int/num_elements.h"
@@ -20,6 +22,9 @@
 ArrayShape::ArrayShape(std::vector<nonnegative_int> const &input_dims)
     : dims(input_dims) {}
 
+ArrayShape::ArrayShape(LegionOrdered<nonnegative_int> const &legion_tensor_dims)
+    : dims(legion_tensor_dims) {}
+
 nonnegative_int ArrayShape::get_volume() const {
   return this->num_elements();
 }
@@ -51,6 +56,26 @@
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
+ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
+                                 std::optional<ff_dim_t> end) const {
+  return ArrayShape{legion_ordered_from_ff_ordered(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  std::optional<ff_dim_t> legion_start =
+      transform(start, [&](auto const &start_unwrapped) {
+        return ff_dim_from_legion_dim(start_unwrapped, num_dims());
+      });
+
+  std::optional<ff_dim_t> legion_end =
+      transform(end, [&](auto const &end_unwrapped) {
+        return ff_dim_from_legion_dim(end_unwrapped, num_dims());
+      });
+
+  return this->sub_shape(legion_start, legion_end);
+}
+
 bool ArrayShape::operator==(ArrayShape const &other) const {
   return this->tie() == other.tie();
 }
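Note on the legion-indexed overload above: it does not slice directly, it maps each optional bound into ff coordinates and delegates. The `transform` applied to the bounds is an over-`std::optional` map from the newly included `utils/containers/transform.h`; a standalone sketch of that idiom (hypothetical helper, not the PR's implementation):

```cpp
#include <optional>

// Applies f to the contained value, if any; otherwise propagates nullopt.
template <typename T, typename F>
auto transform_opt(std::optional<T> const &o, F &&f)
    -> std::optional<decltype(f(*o))> {
  if (o.has_value()) {
    return f(*o);
  }
  return std::nullopt;
}
```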
@@ -59,11 +84,11 @@
   return this->tie() != other.tie();
 }
 
-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
-  NOT_IMPLEMENTED();
-}
+// ArrayShape ArrayShape::sub_shape(
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+//   NOT_IMPLEMENTED();
+// }
 
 std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
@@ -103,3 +128,14 @@
 }
 
 } // namespace FlexFlow
+
+namespace std {
+size_t hash<FlexFlow::ArrayShape>::operator()(
+    ::FlexFlow::ArrayShape const &x) const {
+  size_t result = 0;
+  result ^=
+      std::hash<::FlexFlow::LegionOrdered<::FlexFlow::nonnegative_int>>{}(
+          x.dims) +
+      0x9e3779b9 + (result << 6) + (result >> 2);
+  return result;
+}
+} // namespace std

(Codecov notes that the added constructor, sub_shape overloads, and std::hash implementation are not covered by tests.)
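The mixing expression above is the boost-style hash_combine recipe; 0x9e3779b9 is the 32-bit golden-ratio constant that spreads bits across the word. With a single field and a zero seed the shift terms vanish, but the pattern generalizes to more members; the generic form it instantiates looks like this (a reference sketch, not part of this PR):

```cpp
#include <cstddef>
#include <functional>

// Mixes v's hash into seed; the shifts and the golden-ratio constant
// decorrelate neighboring values.
template <typename T>
void hash_combine(std::size_t &seed, T const &v) {
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
```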
4 changes: 2 additions & 2 deletions lib/kernels/src/cuda/cuda_helper.cu
@@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) {
 #error "Unknown device, please make sure if CUDA is enabled"
 #endif
 
-__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
+__global__ void scale_kernel(float *ptr, size_t size, float a, float b) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = (b - a) * ptr[i] + a;
   }
 }
 
-__global__ void ones_kernel(float *ptr, coord_t size) {
+__global__ void ones_kernel(float *ptr, size_t size) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = 1.0f;
   }
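`CUDA_KERNEL_LOOP` itself is not shown in this diff. In Caffe-derived codebases such as FlexFlow it is conventionally a grid-stride loop, so each thread strides through the index range by the total number of launched threads and any launch shape covers any `size`; an assumed definition along those lines:

```cuda
// Assumed, conventional definition (not taken from this diff): each thread
// starts at its global index and strides by the total thread count.
#define CUDA_KERNEL_LOOP(i, n)                                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                 \
       i += blockDim.x * gridDim.x)
```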
5 changes: 4 additions & 1 deletion lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -15,6 +15,8 @@
 
 #include "device.h"
 #include "kernels/concat_kernels.h"
+#include "kernels/legion_dim.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <cassert>
 
 namespace FlexFlow {
@@ -25,7 +27,8 @@ void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
-  blk_size = shape.sub_shape(legion_dim_t{0_n}, axis)
+  legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims());
+  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim)
                  .num_elements()
                  .unwrap_nonnegative();
   num_blocks =
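In legion ordering the lowest dims vary fastest, so slicing from legion dim 0 up to the axis's legion dim collects the contiguous extent of one concatenated block, and the remaining dims count how many such blocks there are. A plain-integer sketch of the computation (hypothetical standalone helper; assumes `num_blocks` is the product of the dims from the axis upward, matching the truncated code above):

```cpp
#include <cstddef>
#include <vector>

// dims are legion-ordered (innermost first); dims below `axis` form one
// contiguous block, the rest count the number of blocks.
void calc_blk_size_sketch(size_t &num_blocks,
                          size_t &blk_size,
                          std::vector<size_t> const &legion_dims,
                          size_t axis) {
  blk_size = 1;
  num_blocks = 1;
  for (size_t i = 0; i < legion_dims.size(); ++i) {
    (i < axis ? blk_size : num_blocks) *= legion_dims[i];
  }
}
```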