add federated learning parameter server(fl-ps) mode #42682

Merged: 41 commits, Jun 2, 2022

Changes from all commits (41 commits):
d9bb853 back fl (ziyoujiyi, Mar 25, 2022)
6073452 delete ssl cert (ziyoujiyi, Mar 25, 2022)
66fa8c8 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 25, 2022)
4bb3d3f Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 25, 2022)
7a02e84 . (ziyoujiyi, Mar 25, 2022)
883b55a make warning (ziyoujiyi, Mar 26, 2022)
f917402 . (ziyoujiyi, Mar 26, 2022)
fa4ab2e unittest paral degree (ziyoujiyi, Mar 28, 2022)
a129afc solve unittest (ziyoujiyi, Mar 28, 2022)
a54e061 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 29, 2022)
ed7e38f heter & multi cloud commm ready (ziyoujiyi, Mar 29, 2022)
3e86455 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 29, 2022)
b5a34fc . (ziyoujiyi, Mar 29, 2022)
0e4b998 Merge branch 'develop' of https://github.com/ziyoujiyi/Paddle into de… (ziyoujiyi, Mar 29, 2022)
eeec283 . (ziyoujiyi, Mar 29, 2022)
d293d97 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 29, 2022)
c1759b5 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 30, 2022)
d9aa775 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Mar 31, 2022)
7105730 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Apr 2, 2022)
73ea318 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Apr 11, 2022)
7dc2091 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Apr 19, 2022)
2019a5f Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Apr 24, 2022)
f22bbcd Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, Apr 26, 2022)
5019c73 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, May 9, 2022)
9b92deb fl-ps v1.0 (ziyoujiyi, May 9, 2022)
31f330c merge dev (ziyoujiyi, May 9, 2022)
f2fa8ee . (ziyoujiyi, May 9, 2022)
6c76994 Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, May 11, 2022)
7aadb99 support N + N mode (ziyoujiyi, May 11, 2022)
001c11c Merge branch 'develop' of https://github.com/ziyoujiyi/Paddle into fl_ps (ziyoujiyi, May 11, 2022)
5f7b4fd . (ziyoujiyi, May 11, 2022)
a6f7f29 . (ziyoujiyi, May 11, 2022)
cbbd5e9 . (ziyoujiyi, May 12, 2022)
2873622 . (ziyoujiyi, May 13, 2022)
16ad3c1 delete print (ziyoujiyi, May 24, 2022)
9a89ba3 . (ziyoujiyi, May 25, 2022)
2469beb Merge branch 'PaddlePaddle:develop' into develop (ziyoujiyi, May 25, 2022)
acc3898 merge dev (ziyoujiyi, May 25, 2022)
3c5374d . (ziyoujiyi, May 30, 2022)
07bf8ab . (ziyoujiyi, May 30, 2022)
25f38c1 . (ziyoujiyi, May 30, 2022)
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -254,6 +254,7 @@ option(WITH_POCKETFFT "Compile with pocketfft support" ON)
option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF)
option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF)
option(WITH_FLPS "FL PS mode" OFF)

if(WITH_RECORD_BUILDTIME)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}")
4 changes: 4 additions & 0 deletions cmake/configure.cmake
@@ -78,6 +78,10 @@ if(WITH_ARM_BRPC)
add_definitions(-DPADDLE_WITH_ARM_BRPC)
endif()

if(WITH_FLPS)
add_definitions(-DPADDLE_WITH_FLPS)
endif()

if(WITH_GLOO)
add_definitions(-DPADDLE_WITH_GLOO)
endif()
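Taken together, the CMakeLists.txt and cmake/configure.cmake changes add an off-by-default WITH_FLPS option and map it to a PADDLE_WITH_FLPS preprocessor definition. A minimal sketch of the intended usage, assuming a standard CMake invocation such as `cmake -DWITH_FLPS=ON ..`, is shown below; the guarded function is purely illustrative.

```cpp
// Illustrative only: FL-PS-specific code paths added later in this PR can be
// guarded with the new define, so default builds stay unaffected.
#ifdef PADDLE_WITH_FLPS
// compiled only when the build is configured with -DWITH_FLPS=ON
void RunFlPsSpecificSetup() { /* federated-learning PS setup */ }
#else
void RunFlPsSpecificSetup() { /* no-op in regular parameter-server builds */ }
#endif
```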
paddle/fluid/distributed/ps/service/brpc_ps_server.cc: file mode changed 100644 → 100755 (no content changes)
3 changes: 2 additions & 1 deletion paddle/fluid/distributed/ps/service/heter_client.cc
@@ -139,8 +139,9 @@ void HeterClient::SendAndRecvAsync(
message_name, send_var_name_val, recv_var_name_val, *p_ctx, p_scope,
&request, &request_io_buffer);

int micro_id = GetMicroId(ctx, p_scope);
int micro_id = GetMicroId(ctx, p_scope); // global
auto minibatch_id = micro_id / 10;
VLOG(4) << "micro_id: " << micro_id;
// select channel according to micro id
if (mode == "forward") {
int num = minibatch_id % xpu_channels_.size();
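For context on the comment added above: the micro id received here is global, and the code recovers a mini-batch id from it by integer division before picking a channel. A standalone sketch of that arithmetic, assuming (as the code does) at most 10 micro-batches per mini-batch; the struct and function names are illustrative, not part of HeterClient.

```cpp
// Simplified sketch, not the real HeterClient: decode the global micro id and
// select a forward channel the same way the diff above does.
#include <cstddef>

struct Route {
  int minibatch_id;      // micro_id / 10, as in the diff
  int microbatch_index;  // micro_id % 10, the slot inside the mini-batch
  std::size_t channel;   // channel chosen for the "forward" message
};

Route RouteForward(int micro_id, std::size_t num_xpu_channels) {
  Route r;
  r.minibatch_id = micro_id / 10;
  r.microbatch_index = micro_id % 10;
  r.channel = static_cast<std::size_t>(r.minibatch_id) % num_xpu_channels;
  return r;
}
```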
8 changes: 4 additions & 4 deletions paddle/fluid/distributed/ps/service/heter_client.h
100644 → 100755
@@ -155,13 +155,13 @@ class HeterClient {

// HeterClient singleton
static std::shared_ptr<HeterClient> GetInstance(
const std::vector<std::string>& endpoint,
const std::vector<std::string>& previous_endpoint,
const std::vector<std::string>& endpoints,
const std::vector<std::string>& previous_endpoints,
const int& trainer_id) {
if (NULL == s_instance_) {
s_instance_.reset(new HeterClient());
s_instance_->SetXpuList(endpoint);
s_instance_->SetPreviousXpuList(previous_endpoint);
s_instance_->SetXpuList(endpoints);
s_instance_->SetPreviousXpuList(previous_endpoints);
s_instance_->SetTrainerID(trainer_id);
s_instance_->CreateClient2XpuConnection();
}
Expand Down
4 changes: 0 additions & 4 deletions paddle/fluid/distributed/ps/service/heter_server.cc
@@ -94,7 +94,6 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) {
VLOG(4) << "switch inter server server start success! listen on "
<< endpoint_inter_;
}

{
std::lock_guard<std::mutex> lock(this->mutex_ready_);
stoped_ = false;
@@ -115,9 +114,6 @@ void HeterServer::SetFanin(const int& fan_in) { service_.SetFanin(fan_in); }
void HeterServer::WaitServerReady() {
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
while (!this->ready_) {
sleep(1);
}
}

int SendAndRecvVariableHandler::SaveInSwitchWithShard(
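The removal above is safe because the predicated condition_variable::wait only returns once ready_ is already 1, so the old sleep loop could never spin. A self-contained sketch of the pattern that remains; member names mirror the diff, but the class itself is an illustrative stand-in rather than the real HeterServer.

```cpp
// Minimal sketch of a "wait until ready" gate using a predicated wait.
#include <condition_variable>
#include <mutex>

class ServerReadyGate {
 public:
  void WaitServerReady() {
    std::unique_lock<std::mutex> lock(mutex_ready_);
    // Blocks until ready_ == 1; no extra polling loop is needed afterwards.
    condition_ready_.wait(lock, [this] { return ready_ == 1; });
  }

  void SetReady() {
    {
      std::lock_guard<std::mutex> lock(mutex_ready_);
      ready_ = 1;
    }
    condition_ready_.notify_all();
  }

 private:
  std::mutex mutex_ready_;
  std::condition_variable condition_ready_;
  int ready_ = 0;
};
```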
13 changes: 9 additions & 4 deletions paddle/fluid/distributed/ps/service/heter_server.h
@@ -90,8 +90,10 @@ class ServiceHandlerBase {

using SharedMiniScope =
std::shared_ptr<std::unordered_map<int, ::paddle::framework::Scope*>>;

using SharedMicroScope = std::shared_ptr<std::unordered_map<
int, std::shared_ptr<std::vector<::paddle::framework::Scope*>>>>;

using SharedTaskQueue = std::shared_ptr<
std::unordered_map<int, std::shared_ptr<::paddle::framework::BlockingQueue<
std::pair<std::string, int>>>>>;
@@ -226,6 +228,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
auto* tensor = var->GetMutable<framework::LoDTensor>();
auto data = reinterpret_cast<const float*>(tensor->data());
auto micro_id = static_cast<int>(data[0]);
VLOG(4) << "micro_id in heter server: " << micro_id;
int minibatch_index = micro_id / 10;
int microbatch_index = micro_id % 10;

@@ -261,6 +264,9 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
distributed::DeserializeFromMultiVarMsgAndIOBuf(
*request, &request_io_buffer, *dev_ctx_, micro_scope);
// blocking queue handles multi thread
VLOG(4) << "Handle in HeterServer: " << message_name << ", "
<< microbatch_index;
VLOG(4) << "task_queue_ size: " << task_queue_->size();
(*task_queue_)[minibatch_index]->Push(
std::make_pair(message_name, microbatch_index));

@@ -274,6 +280,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
distributed::SerializeToMultiVarMsgAndIOBuf(
message_name, response_var_names, empty_var_names, *dev_ctx_,
&local_scope, response, &response_io_buffer);
VLOG(4) << "Handle over";
return 0;
}

@@ -612,11 +619,9 @@ class HeterServer {

// HeterWrapper singleton
static std::shared_ptr<HeterServer> GetInstance() {
std::unique_lock<std::mutex> lock(mtx_);
if (s_instance_ == nullptr) {
std::unique_lock<std::mutex> lock(mtx_);
if (NULL == s_instance_) {
s_instance_.reset(new HeterServer());
}
s_instance_.reset(new HeterServer());
}
return s_instance_;
}
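As the GetInstance() hunk reads, the mutex is now taken before testing s_instance_, which makes the previous inner NULL re-check unnecessary. A standalone sketch of the resulting shape, using a simplified stand-in class rather than the real HeterServer:

```cpp
// Lock first, then check-and-create: a single guarded test is enough.
#include <memory>
#include <mutex>

class HeterServerLike {
 public:
  static std::shared_ptr<HeterServerLike> GetInstance() {
    std::unique_lock<std::mutex> lock(mtx_);
    if (s_instance_ == nullptr) {
      s_instance_.reset(new HeterServerLike());
    }
    return s_instance_;
  }

 private:
  static std::shared_ptr<HeterServerLike> s_instance_;
  static std::mutex mtx_;
};

std::shared_ptr<HeterServerLike> HeterServerLike::s_instance_ = nullptr;
std::mutex HeterServerLike::mtx_;
```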
12 changes: 10 additions & 2 deletions paddle/fluid/framework/data_feed.cc
100644 → 100755
@@ -220,6 +220,7 @@ bool DataFeed::PickOneFile(std::string* filename) {
file_idx_, platform::errors::PreconditionNotMet(
"You should call SetFileListIndex before PickOneFile"));
std::unique_lock<std::mutex> lock(*mutex_for_pick_file_);
VLOG(4) << "filelist_ size: " << filelist_.size();
if (*file_idx_ == filelist_.size()) {
VLOG(3) << "DataFeed::PickOneFile no more file to pick";
return false;
@@ -284,6 +285,7 @@ void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {

template <typename T>
bool PrivateQueueDataFeed<T>::Start() {
VLOG(4) << "entering PrivateQueueDataFeed<T>::Start()";
CheckSetFileList();
read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
read_thread_.detach();
@@ -295,6 +297,7 @@ bool PrivateQueueDataFeed<T>::Start() {
template <typename T>
void PrivateQueueDataFeed<T>::ReadThread() {
#ifdef _LINUX
VLOG(4) << "entering PrivateQueueDataFeed<T>::ReadThread()";
std::string filename;
while (PickOneFile(&filename)) {
int err_no = 0;
@@ -356,6 +359,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
template <typename T>
bool InMemoryDataFeed<T>::Start() {
#ifdef _LINUX
VLOG(4) << "entering InMemoryDataFeed<T>::Start()";
this->CheckSetFileList();
if (output_channel_->Size() == 0 && input_channel_->Size() != 0) {
std::vector<T> data;
@@ -664,6 +668,7 @@ void MultiSlotDataFeed::Init(

void MultiSlotDataFeed::ReadThread() {
#ifdef _LINUX
VLOG(4) << "entering MultiSlotDataFeed::ReadThread()";
std::string filename;
while (PickOneFile(&filename)) {
int err_no = 0;
@@ -831,7 +836,6 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
} else {
int use_slots_num = use_slots_.size();
instance->resize(use_slots_num);

const char* str = reader.get();
std::string line = std::string(str);

@@ -971,10 +975,13 @@ void MultiSlotDataFeed::PutToFeedVec(
if (feed_vec_[i] == nullptr) {
continue;
}
VLOG(4) << "MultiSlotDataFeed::PutToFeedVec i: " << i;
const auto& type = ins_vec[i].GetType();
const auto& offset = ins_vec[i].GetOffset();
int total_instance = static_cast<int>(offset.back());

VLOG(4) << "total_instance: " << total_instance;
// platform::CPUPlace()
VLOG(4) << "this->place_: " << this->place_;
if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData();
float* tensor_ptr =
@@ -2573,6 +2580,7 @@ void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) {
}

bool SlotRecordInMemoryDataFeed::Start() {
VLOG(4) << "entering SlotRecordInMemoryDataFeed::Start";
#ifdef _LINUX
this->CheckSetFileList();
if (input_channel_->Size() != 0) {
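The data_feed.cc changes are diagnostic only: VLOG(4) trace lines at the entry of the various Start() and ReadThread() implementations. A minimal standalone sketch of the same pattern using raw glog is shown below; Paddle routes this through its own logging setup, so whether the output is enabled via GLOG_v=4 or a --v=4 flag depends on how the binary initializes glog/gflags.

```cpp
// Assumes glog is installed and linked (-lglog). VLOG(4) lines are compiled in
// but filtered out unless the verbosity level is raised to 4 or higher.
#include <glog/logging.h>

void Start() {
  VLOG(4) << "entering Start()";
  // ... actual work ...
}

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // print to stderr instead of log files
  Start();
  return 0;
}
```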
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
100644 → 100755
@@ -314,6 +314,7 @@ message DistributedStrategy {
optional bool adam_d2sum = 36 [ default = false ];
optional bool auto_search = 37 [ default = false ];
optional bool heter_ccl_mode = 38 [ default = false ];
optional bool is_fl_ps_mode = 39 [ default = false ];

optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
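The new is_fl_ps_mode field extends the DistributedStrategy message; like the neighbouring flags it defaults to false and is expected to be toggled by the fleet configuration that gets serialized into this proto. A self-contained sketch of how a runtime might branch on it; the StrategyView struct below is a stand-in for the generated proto class, which would expose an is_fl_ps_mode() accessor.

```cpp
// Stand-in for the generated DistributedStrategy class, kept minimal so the
// example compiles without the Paddle source tree.
struct StrategyView {
  bool fl_ps = false;
  bool is_fl_ps_mode() const { return fl_ps; }  // protobuf-style accessor
};

const char* ChooseRuntime(const StrategyView& strategy) {
  // In FL-PS mode both the worker party and the heter/server party run
  // training stages; otherwise the classic parameter-server runtime is used.
  return strategy.is_fl_ps_mode() ? "fl-ps runtime" : "ps runtime";
}
```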
61 changes: 43 additions & 18 deletions paddle/fluid/framework/heter_pipeline_trainer.cc
@@ -32,7 +32,9 @@ using TaskQueue =
std::pair<std::string, int>>>>;

void HeterPipelineTrainer::ResetDataset(Dataset* dataset) {
#ifndef PADDLE_WITH_FLPS
if (pipeline_stage_ == 0) {
#endif
SetDataset(dataset);
const std::vector<paddle::framework::DataFeed*> readers =
dataset->GetReaders();
@@ -51,40 +53,39 @@ void HeterPipelineTrainer::ResetDataset(Dataset* dataset) {
this_worker->SetDataFeed(readers[cnt]);
this_worker->SetReaderPlace(place_);
}
#ifndef PADDLE_WITH_FLPS
}
#endif
}

void HeterPipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
Dataset* dataset) {
trainer_desc_ = trainer_desc;
thread_num_ = trainer_desc.thread_num();
ParseDumpConfig(trainer_desc);
SetDebug(trainer_desc.debug());
const std::vector<paddle::framework::DataFeed*> readers =
dataset->GetReaders();
VLOG(3) << "readers num: " << readers.size();
// change thread num to readers num
thread_num_ = readers.size();
VLOG(3) << "worker thread num: " << thread_num_;
VLOG(3) << "worker(readers) thread num: " << thread_num_;
const auto& heter_section_params = trainer_desc.heter_section_param();
num_pipeline_stages_ = heter_section_params.num_pipeline_stages();
pipeline_stage_ = heter_section_params.pipeline_stage();
num_microbatches_ = heter_section_params.num_microbatches();
VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
trainer_desc_ = trainer_desc;
trainer_id_ = trainer_desc.trainer_id();
for (int i = 0; i < num_pipeline_stages_; ++i) {
auto trainer_num = trainer_desc.trainers(i);
trainers_.push_back(trainer_num);
}
int cpu_trainer_num = trainers_[0];
// int cur_stage_trainer_num = trainers_[pipeline_stage_];
// int global_thread_num = cpu_trainer_num * thread_num_;
// int previous_trainers = 0;
// for (int i = 0; i < pipeline_stage_; i++) previous_trainers +=
// trainers_[i];
// int stage_trainer_id =
// trainer_id_ - previous_trainers; // trainer id in current stage

VLOG(4) << "trainer_id_: " << trainer_id_;
VLOG(4) << "cpu_trainer_num: " << cpu_trainer_num
<< " xpu_trainer_num: " << trainers_[1];
#ifdef PADDLE_WITH_FLPS
thread_num_ = 1;
#endif
if (pipeline_stage_ == 0) { // for cpu trainer
int cnt = -1;
int real_thread_id = trainer_id_;
@@ -103,25 +104,33 @@ void HeterPipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
this_worker->InitRandomDumpConfig(trainer_desc);
this_worker->SetDeviceIndex(real_thread_id);
real_thread_id += cpu_trainer_num;
// if (pipeline_stage_ == 0) {
this_worker->SetDataFeed(readers[cnt]);
//}
this_worker->SetMicrobatchNum(num_microbatches_);
this_worker->SetPipelineStageNum(num_pipeline_stages_);
this_worker->SetPipelineStage(pipeline_stage_);
}
} else { // for heter_trainer
// heter trainer with thread_id == -1 is not for
// real training
} else {
// for heter_trainer
// heter trainer with thread_id == -1 is not for real training, just for run
// listen op
workers_[-1] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::HeterSectionWorker>(
workers_[-1]);
#ifdef PADDLE_WITH_FLPS
this_worker->SetDebug(debug_);
this_worker->SetNeedDumpField(need_dump_field_);
this_worker->SetNeedDumpParam(need_dump_param_);
this_worker->SetDumpFieldVector(dump_fields_);
this_worker->SetDumpParamVector(dump_param_);
this_worker->InitRandomDumpConfig(trainer_desc);
this_worker->SetDataFeed(readers[0]);
#endif
this_worker->SetDeviceIndex(-1);
this_worker->SetMicrobatchNum(num_microbatches_);
this_worker->SetPipelineStageNum(num_pipeline_stages_);
this_worker->SetPipelineStage(pipeline_stage_);
this_worker->SetDeviceIndex(-1);
}
}

@@ -159,14 +168,19 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
for (auto& worker_pair : workers_) {
auto worker_index = worker_pair.first;
auto device_worker = worker_pair.second;
VLOG(0) << "workers index in InitTrainerEnv: " << worker_index;
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::HeterSectionWorker>(
device_worker);
this_worker->SetPlace(place);
this_worker->Initialize(trainer_desc_);
#ifdef PADDLE_WITH_FLPS
this_worker->SetReaderPlace(place);
#else
if (pipeline_stage_ == 0) {
this_worker->SetReaderPlace(place);
}
#endif
this_worker->SetRootScope(root_scope_);
// generate mini_batch scope for every worker
auto* minibatch_scope = &root_scope_->NewScope();
@@ -175,13 +189,15 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
// after set micro num & mini batch scope
this_worker->CreateMicrobatchScopes();
(*micro_scopes_)[worker_index] = this_worker->GetMicrobatchScopes();
VLOG(4) << "worker_index: " << worker_index;
(*task_queue_)[worker_index] = this_worker->GetThreadQueue();
}
}

void HeterPipelineTrainer::Run() {
VLOG(3) << "Going to run HeterPipelineTrainer::Run()";
if (listen_ptr_ == nullptr) {
VLOG(3) << "listen_ptr_ is null";
for (auto& worker_pair : workers_) {
auto& device_worker = worker_pair.second;
auto worker_0 =
@@ -196,10 +212,14 @@ void HeterPipelineTrainer::Run() {
heter_server->WaitServerReady();
heter_server->SetMiniBatchScopes(mini_scopes_);
heter_server->SetMicroBatchScopes(micro_scopes_);
VLOG(4) << "heter_server SetTaskQueue";
heter_server->SetTaskQueue(task_queue_);

// main training logic
VLOG(3) << "pipeline_stage_ is " << pipeline_stage_;
if (pipeline_stage_ == 0) { // for cpu trainer
for (auto& worker_pair : workers_) {
VLOG(4) << "cpu worker index : " << worker_pair.first;
auto device_worker = worker_pair.second;
if (!debug_) {
threads_.push_back(
@@ -212,6 +232,7 @@ void HeterPipelineTrainer::Run() {
} else { // for heter worker
// start thread_worker with thread_id = -1
for (auto& worker_pair : workers_) {
VLOG(4) << "xpu worker index : " << worker_pair.first;
auto device_worker = worker_pair.second;
if (!debug_) {
threads_.push_back(
@@ -252,6 +273,10 @@ void HeterPipelineTrainer::Run() {
this_worker->SetPipelineStageNum(num_pipeline_stages_);
this_worker->SetPipelineStage(pipeline_stage_);
this_worker->SetPlace(place_);
#ifdef PADDLE_WITH_FLPS
this_worker->SetDataFeed(workers_[-1]->device_reader_);
this_worker->SetReaderPlace(place_);
#endif
this_worker->Initialize(trainer_desc_);
this_worker->SetRootScope(root_scope_);

@@ -308,5 +333,5 @@ Scope* HeterPipelineTrainer::GetWorkerScope(int thread_id) {
}

} // end namespace framework
} // end namespace paddle
} // namespace paddle
#endif

Review comment (Contributor): This should not be removed.

Reply (Contributor Author): done
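Stepping back from the heter_pipeline_trainer.cc diff above: under PADDLE_WITH_FLPS both pipeline stages are given a data reader (and thread_num_ is forced to 1), whereas the default build only attaches readers on stage 0. A condensed, self-contained sketch of that branching, with hypothetical helper names rather than the real trainer methods:

```cpp
#include <iostream>

// Hypothetical stand-in for this_worker->SetDataFeed(...) + SetReaderPlace(...).
void AttachReader(int pipeline_stage) {
  std::cout << "reader attached to stage " << pipeline_stage << "\n";
}

void SetupStageReaders(int pipeline_stage) {
#ifdef PADDLE_WITH_FLPS
  // FL-PS build: the CPU party (stage 0) and the heter party (stage 1)
  // each consume their own data feed.
  AttachReader(pipeline_stage);
#else
  // Default heter pipeline: only the CPU trainer stage reads input data.
  if (pipeline_stage == 0) {
    AttachReader(pipeline_stage);
  }
#endif
}

int main() {
  SetupStageReaders(0);
  SetupStageReaders(1);
  return 0;
}
```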