[XPUPS]add support for kunlun2 (#40985)
[XPUPS]add support for kunlun2

Co-authored-by: WorgenZhang <[email protected]>
zmxdream and WorgenZhang authored Apr 13, 2022
1 parent ca4aea2 commit c9c03e7
Showing 20 changed files with 1,938 additions and 391 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/fleet/heter_context.h
@@ -23,7 +23,7 @@ limitations under the License. */
#include <vector>

#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
#include "common/common_value.h" // NOLINT
#endif

#ifdef PADDLE_WITH_PSCORE
4 changes: 3 additions & 1 deletion paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -7,7 +7,9 @@ IF(WITH_GPU)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
endif()
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
if(WITH_PSCORE)
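The build refactor above splits the device code out of `heter_comm`: the CUDA sources `heter_comm_kernel.cu` and `hashtable_kernel.cu` become their own `nv_library` targets, and `heter_comm` now links against them instead of compiling the kernels inline. A plausible motivation, given the rest of the commit, is that a Kunlun2 build can then substitute XPU implementations of those two kernel targets without touching the host-side `heter_comm` logic.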
24 changes: 12 additions & 12 deletions paddle/fluid/framework/fleet/heter_ps/feature_value.h
@@ -52,18 +52,18 @@ struct FeaturePushValue {
float lr_g;
float mf_g[MF_DIM];

__device__ __forceinline__ FeaturePushValue
operator+(const FeaturePushValue& a) const {
FeaturePushValue out;
out.slot = a.slot;
out.show = a.show + show;
out.clk = a.clk + clk;
out.lr_g = a.lr_g + lr_g;
for (int i = 0; i < MF_DIM; ++i) {
out.mf_g[i] = a.mf_g[i] + mf_g[i];
}
return out;
}
// __device__ __forceinline__ FeaturePushValue
// operator+(const FeaturePushValue& a) const {
// FeaturePushValue out;
// out.slot = a.slot;
// out.show = a.show + show;
// out.clk = a.clk + clk;
// out.lr_g = a.lr_g + lr_g;
// for (int i = 0; i < MF_DIM; ++i) {
// out.mf_g[i] = a.mf_g[i] + mf_g[i];
// }
// return out;
// }
};

} // end namespace framework
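The `__device__ __forceinline__ operator+` on `FeaturePushValue` is commented out rather than deleted, presumably because the struct must now also compile under the XPU toolchain, where these CUDA qualifiers are unavailable. A minimal sketch of how the same element-wise merge could be kept as a CUDA-only free function — the name `MergePushValue` and its placement are assumptions for illustration, not part of this commit:

#if defined(PADDLE_WITH_CUDA)
// Hypothetical CUDA-only replacement for the commented-out operator+:
// accumulates show/click/lr gradients and the MF_DIM embedding gradients,
// keeping the slot id of the right-hand operand.
__device__ __forceinline__ FeaturePushValue MergePushValue(
    const FeaturePushValue& lhs, const FeaturePushValue& rhs) {
  FeaturePushValue out;
  out.slot = rhs.slot;
  out.show = lhs.show + rhs.show;
  out.clk = lhs.clk + rhs.clk;
  out.lr_g = lhs.lr_g + rhs.lr_g;
  for (int i = 0; i < MF_DIM; ++i) {
    out.mf_g[i] = lhs.mf_g[i] + rhs.mf_g[i];
  }
  return out;
}
#endif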
98 changes: 82 additions & 16 deletions paddle/fluid/framework/fleet/heter_ps/hashtable.h
100755 → 100644
@@ -13,28 +13,38 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#ifdef PADDLE_WITH_HETERPS
#include <glog/logging.h>
#include <limits>
#include <memory>
#include <vector>

#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
#endif
#ifdef PADDLE_WITH_PSCORE

#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/phi/core/utils/rw_lock.h"
#include "thrust/pair.h"
// #include "cudf/concurrent_unordered_map.cuh.h"

#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "thrust/pair.h"
#elif defined(__xpu__)
#include <xpu/runtime.h>
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/simd.h"
#endif

namespace paddle {
namespace framework {

#if defined(PADDLE_WITH_CUDA)
template <typename KeyType, typename ValType>
class TableContainer
: public concurrent_unordered_map<KeyType, ValType,
@@ -45,31 +45,84 @@ class TableContainer
std::numeric_limits<KeyType>::max()>(
capacity, ValType()) {}
};
#elif defined(PADDLE_WITH_XPU_KP)

template <typename KeyType, typename ValType>
class XPUCacheArray {
public:
explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) {
xpu_malloc(reinterpret_cast<void**>(&keys), capacity_ * sizeof(KeyType));
xpu_malloc(reinterpret_cast<void**>(&vals), capacity_ * sizeof(ValType));
}

virtual ~XPUCacheArray() {
xpu_free(keys);
xpu_free(vals);
}

void print() {}
// ValType* find(const KeyType& key) { return NULL; }
// bool insert(const KeyType& key, const ValType& val) { return true; }

int prefetch(const int dev_id, XPUStream stream = NULL) {}
size_t size() { return size_; }

private:
long long capacity_;
long long size_;
KeyType* keys;
ValType* vals;
};
#endif

template <typename KeyType, typename ValType>
class HashTable {
public:
HashTable(size_t capacity);
explicit HashTable(size_t capacity);
virtual ~HashTable();
HashTable(const HashTable&) = delete;
HashTable& operator=(const HashTable&) = delete;

template <typename StreamType>
void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
gpuStream_t stream);
StreamType stream);

template <typename StreamType>
void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index,
gpuStream_t stream);
StreamType stream);

template <typename StreamType>
void get(const KeyType* d_keys, ValType* d_vals, size_t len,
gpuStream_t stream);
void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream);
StreamType stream);

template <typename StreamType>
void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream);

void show();
void dump_to_cpu(int devid, cudaStream_t stream);

template <typename GradType, typename Sgd>
template <typename StreamType>
void dump_to_cpu(int devid, StreamType stream);

#if defined(PADDLE_WITH_CUDA)

template <typename GradType, typename Sgd, typename StreamType>
void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
Sgd sgd, gpuStream_t stream);
Sgd sgd, StreamType stream);

template <typename Sgd>
template <typename Sgd, typename StreamType>
void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd,
gpuStream_t stream);
StreamType stream);

#elif defined(PADDLE_WITH_XPU_KP)
template <typename GradType, typename StreamType>
void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
StreamType stream);

template <typename StreamType>
void update(const KeyType* d_keys, const char* d_grads, size_t len,
StreamType stream);

#endif

int size() { return container_->size(); }

Expand All @@ -84,7 +147,11 @@ class HashTable {
std::unique_ptr<phi::RWLock> rwlock_{nullptr};

private:
#if defined(PADDLE_WITH_CUDA)
TableContainer<KeyType, ValType>* container_;
#elif defined(PADDLE_WITH_XPU_KP)
XPUCacheArray<KeyType, ValType>* container_;
#endif
int BLOCK_SIZE_{256};
float LOAD_FACTOR{0.75f};
size_t capacity_;
Expand All @@ -94,5 +161,4 @@ class HashTable {
};
} // end namespace framework
} // end namespace paddle
#include "hashtable_inl.h"
#endif
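Two things change in `hashtable.h`. First, every method that used to take a `gpuStream_t` is now templated on `StreamType`, so the same interface compiles against either a CUDA stream or an XPU stream. Second, the backing container is chosen at compile time: the cuDF `concurrent_unordered_map` under `PADDLE_WITH_CUDA`, and the new `XPUCacheArray` under `PADDLE_WITH_XPU_KP`. Note that `XPUCacheArray` is still a skeleton here: `find` and `insert` are commented out, and `int prefetch(...)` has an empty body with no return value. A hedged usage sketch of the stream-templated API — the wrapper functions below are invented for illustration, not part of the commit:

// Hypothetical call sites: StreamType is deduced from the stream argument,
// so the same HashTable interface serves both device backends.
using paddle::framework::FeatureValue;
using paddle::framework::HashTable;

#if defined(PADDLE_WITH_CUDA)
void PullOnGPU(HashTable<unsigned long, FeatureValue>* table,
               const unsigned long* d_keys, FeatureValue* d_vals, size_t len,
               cudaStream_t stream) {
  table->get(d_keys, d_vals, len, stream);  // StreamType = cudaStream_t
}
#elif defined(PADDLE_WITH_XPU_KP)
void PullOnXPU(HashTable<unsigned long, FeatureValue>* table,
               const unsigned long* d_keys, FeatureValue* d_vals, size_t len,
               XPUStream stream) {
  table->get(d_keys, d_vals, len, stream);  // StreamType = XPUStream
}
#endif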
paddle/fluid/framework/fleet/heter_ps/{hashtable_inl.h → hashtable_kernel.cu}
@@ -1,4 +1,4 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_HETERPS
#include <thread>
#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"

namespace paddle {
namespace framework {

#if defined(PADDLE_WITH_CUDA)

template <typename value_type>
struct ReplaceOp {
__host__ __device__ value_type operator()(value_type new_value,
@@ -87,6 +92,7 @@ __global__ void dy_mf_search_kernel(Table* table,
}
}
}

template <typename Table, typename GradType, typename Sgd>
__global__ void update_kernel(Table* table,
const typename Table::key_type* const keys,
@@ -135,8 +141,9 @@ void HashTable<KeyType, ValType>::show() {
}

template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
size_t len, gpuStream_t stream) {
size_t len, StreamType stream) {
if (len == 0) {
return;
}
@@ -146,8 +153,9 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, char* d_vals,
}

template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, char* d_vals,
size_t len, gpuStream_t stream) {
size_t len, StreamType stream) {
if (len == 0) {
return;
}
@@ -157,9 +165,10 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, char* d_vals,
}

template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
const ValType* d_vals, size_t len,
gpuStream_t stream) {
StreamType stream) {
if (len == 0) {
return;
}
@@ -169,22 +178,24 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
}

template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
char* pool, size_t start_index,
gpuStream_t stream) {
StreamType stream) {
if (len == 0) {
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
if (pool == NULL) {
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
insert_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys, len,
pool, start_index);
}

template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
template <typename StreamType>
void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
container_->prefetch(cudaCpuDeviceId, stream);
std::vector<std::thread> threads;
size_t num = container_->size();
@@ -260,10 +271,10 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
}

template <typename KeyType, typename ValType>
template <typename GradType, typename Sgd>
template <typename GradType, typename Sgd, typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const GradType* d_grads, size_t len,
Sgd sgd, gpuStream_t stream) {
Sgd sgd, StreamType stream) {
if (len == 0) {
return;
}
@@ -273,19 +284,66 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
}

template <typename KeyType, typename ValType>
template <typename Sgd>
template <typename Sgd, typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const char* d_grads, size_t len,
Sgd sgd, gpuStream_t stream) {
Sgd sgd, StreamType stream) {
if (len == 0) {
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;

dy_mf_update_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
container_, d_keys, d_grads, len, sgd, push_grad_value_size_);
}

template class HashTable<unsigned long, paddle::framework::FeatureValue>;

template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
cudaStream_t>(const unsigned long* d_keys,
paddle::framework::FeatureValue* d_vals, size_t len,
cudaStream_t stream);

// template void
// HashTable<unsigned long, paddle::framework::FeatureValue>::get<cudaStream_t>(
// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
// stream);

template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
cudaStream_t>(const unsigned long* d_keys,
const paddle::framework::FeatureValue* d_vals, size_t len,
cudaStream_t stream);

// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::insert<
// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool,
// size_t start_index, cudaStream_t stream);

template void HashTable<unsigned long, paddle::framework::FeatureValue>::
dump_to_cpu<cudaStream_t>(int devid, cudaStream_t stream);

template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
paddle::framework::FeaturePushValue,
Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue>,
cudaStream_t>(const unsigned long* d_keys,
const paddle::framework::FeaturePushValue* d_grads,
size_t len, Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue>
sgd,
cudaStream_t stream);

// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update<
// Optimizer<paddle::framework::FeatureValue,
// paddle::framework::FeaturePushValue>,
// cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t
// len,
// Optimizer<paddle::framework::FeatureValue,
// paddle::framework::FeaturePushValue>
// sgd,
// cudaStream_t stream);

#endif
} // end namespace framework
} // end namespace paddle
#endif
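Because the member-template definitions now live in a separately compiled `.cu` file rather than in a header pulled in through `hashtable_inl.h`, the translation unit must explicitly instantiate every combination its callers use — that is what the `template class HashTable<...>` and `template void HashTable<...>::get<cudaStream_t>(...)` lines above do. A minimal standalone illustration of the pattern, with toy names rather than Paddle's:

// table.h -- declarations only; callers never see the definitions.
template <typename K, typename V>
class Table {
 public:
  template <typename StreamType>
  void get(const K* keys, V* vals, size_t len, StreamType stream);
};

// table.cu -- definitions compiled once, in this translation unit.
template <typename K, typename V>
template <typename StreamType>
void Table<K, V>::get(const K* keys, V* vals, size_t len, StreamType stream) {
  // ... launch a lookup kernel on `stream` ...
}

// Without explicit instantiations, code that only includes table.h fails at
// link time with undefined references.
template class Table<unsigned long, float>;
template void Table<unsigned long, float>::get<cudaStream_t>(
    const unsigned long*, float*, size_t, cudaStream_t);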
(The remaining changed files are not shown.)

1 comment on commit c9c03e7


@paddle-bot-old paddle-bot-old bot commented on c9c03e7 Apr 13, 2022


🕵️ CI failures summary

🔍 PR: #188 Commit ID: c9c03e7 contains failed CI.

🔹 Failed: Build-windows-cuda115-cudnn8-mkl-clone (Unknown Failed)
