Feature/operator run place #6783

Merged
17 commits, merged on Dec 24, 2017
4 changes: 2 additions & 2 deletions doc/design/block.md
@@ -291,10 +291,10 @@ public:
}

void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override {
const platform::Place& place) const override {
PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
for (auto& op : runtime_table_.ops()) {
op->Run(scope, dev_ctx);
op->Run(scope, place);
}
}

4 changes: 2 additions & 2 deletions paddle/framework/CMakeLists.txt
@@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -59,5 +59,5 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
cc_test(init_test SRCS init_test.cc DEPS init)
11 changes: 2 additions & 9 deletions paddle/framework/executor.cc
@@ -33,13 +33,7 @@ namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";

DeviceContextPool* DeviceContextPool::pool = nullptr;

Executor::Executor(const std::vector<platform::Place>& places) {
DeviceContextPool& pool = DeviceContextPool::Get();
auto borrowed_contexts = pool.Borrow(places);
device_contexts_.swap(borrowed_contexts);
}
Executor::Executor(const platform::Place& place) : place_(place) {}

static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
if (var_type == proto::VarDesc::LOD_TENSOR) {
@@ -71,7 +65,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
// - will change to use multiple blocks for RNN op and Cond Op
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
auto& block = pdesc.Block(block_id);
auto& device = device_contexts_[0];

Scope* local_scope = scope;
if (create_vars) {
@@ -107,7 +100,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugString();
op->Run(*local_scope, *device);
op->Run(*local_scope, place_);
}
if (create_local_scope) {
scope->DeleteScope(local_scope);
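For readers skimming the diff: after this change the Executor holds a single platform::Place and passes it to every op it runs. Below is a minimal usage sketch, not part of the PR; the include paths and the paddle::framework namespace for InitDevices are assumptions based on the files touched here.

// Sketch only: run a program on CPU with the reworked single-place Executor.
#include "paddle/framework/executor.h"
#include "paddle/framework/init.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"

void RunOnCPU(const paddle::framework::ProgramDesc& program) {
  // InitDevices() fills the global DeviceContextPool that kernels borrow from.
  paddle::framework::InitDevices({"CPU"});
  paddle::framework::Scope scope;
  paddle::framework::Executor exec(paddle::platform::CPUPlace());
  exec.Run(program, &scope, /*block_id=*/0);  // ops now receive only the Place
}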
92 changes: 3 additions & 89 deletions paddle/framework/executor.h
@@ -14,9 +14,6 @@ limitations under the License. */

#pragma once

#include <map>
#include <unordered_map>

#include "paddle/framework/op_info.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
@@ -26,96 +23,13 @@ limitations under the License. */
namespace paddle {
namespace framework {

class DeviceContextPool {
public:
static DeviceContextPool& Get() {
PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
return *pool;
}

static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}

const platform::DeviceContext* Borrow(const platform::Place& place) {
auto range = device_contexts_.equal_range(place);
if (range.first == range.second) {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
return range.first->second;
}

std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
std::vector<const platform::DeviceContext*> borrowed_contexts;
for (auto& place : places) {
auto range = device_contexts_.equal_range(place);
if (range.first == range.second) {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
// TODO(dzhwinter) : assign the first found device. Will enhanced later.
// device load balancer maybe useful here.
borrowed_contexts.emplace_back(range.first->second);
}
return borrowed_contexts;
}

explicit DeviceContextPool(const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
device_contexts_.emplace(
places[i], new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(
places[i], new platform::CUDADeviceContext(
boost::get<platform::GPUPlace>(places[i])));
#else
PADDLE_THROW(
"'GPUPlace' is not supported, Please re-compile with WITH_GPU "
"option");
#endif
}
}
}

~DeviceContextPool() {}

private:
static DeviceContextPool* pool;
struct Hash {
std::hash<int> hash_;
size_t operator()(const platform::Place& place) const {
return hash_(place.which());
}
};
std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
Hash>
device_contexts_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};

class Executor {
public:
// TODO(dzhwinter) : Do not rely on this function, it will be removed
explicit Executor(const platform::DeviceContext& device)
: Executor(std::vector<platform::Place>({device.GetPlace()})) {}

explicit Executor(const platform::Place& place)
: Executor(std::vector<platform::Place>({place})) {}
: Executor(device.GetPlace()) {}

explicit Executor(const std::vector<platform::Place>& places);
explicit Executor(const platform::Place& place);

/* @Brief
* Runtime evaluation of the given ProgramDesc under certain Scope
@@ -128,7 +42,7 @@ class Executor {
bool create_vars = true);

private:
std::vector<const platform::DeviceContext*> device_contexts_;
const platform::Place place_;
};

} // namespace framework
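The DeviceContextPool deleted from executor.h above is not dropped; it now lives under paddle/platform (init.cc below calls platform::DeviceContextPool::Create, and operator.cc borrows from it). A minimal sketch of the access pattern after the move, assuming the pool has already been created once, e.g. by InitDevices():

#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"

// Sketch: look up the DeviceContext that matches a Place in the global pool.
const paddle::platform::DeviceContext* ContextFor(
    const paddle::platform::Place& place) {
  auto& pool = paddle::platform::DeviceContextPool::Get();
  return pool.Borrow(place);  // throws if no context was created for this place
}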
9 changes: 4 additions & 5 deletions paddle/framework/init.cc
@@ -14,8 +14,8 @@
#include <algorithm>
#include <string>

#include "paddle/framework/executor.h"
#include "paddle/framework/init.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#include "paddle/string/piece.h"

@@ -48,7 +48,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
std::vector<platform::Place> places;
for (auto &device : devices) {
auto p = string::Piece(device);
if (string::Find(p, ':', 0) == string::Piece::npos) {
if (string::HasPrefix(p, "CPU")) {
places.emplace_back(platform::CPUPlace());
} else if (string::HasPrefix(p, "GPU")) {
#ifdef PADDLE_WITH_CUDA
@@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) {
return platform::is_cpu_place(place);
}) == places.end()) {
places.emplace_back(platform::CPUPlace());
LOG(WARNING) << "Not specified any device, use CPU by Default.";
LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
}
DeviceContextPool::Create(places);
return true;
platform::DeviceContextPool::Create(places);
return true;
}

4 changes: 4 additions & 0 deletions paddle/framework/init_test.cc
@@ -23,5 +23,9 @@ TEST(Init, InitDevices) {
#ifdef PADDLE_WITH_CUDA
std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds2), true);

// test re-init
std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds3), true);
#endif
}
18 changes: 8 additions & 10 deletions paddle/framework/op_registry_test.cc
@@ -8,8 +8,7 @@ namespace framework {
class CosineOp : public OperatorBase {
public:
using OperatorBase::OperatorBase;
void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const override {}
void Run(const Scope& scope, const platform::Place& place) const override {}
};

class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
class MyTestOp : public OperatorBase {
public:
using OperatorBase::OperatorBase;
void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const override {}
void Run(const Scope& scope, const platform::Place& place) const override {}
};

class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) {

auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx);
paddle::platform::CPUPlace cpu_place;
op->Run(scope, cpu_place);
float scale_get = op->Attr<float>("scale");
ASSERT_EQ(scale_get, scale);
}
@@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) {

auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx);
paddle::platform::CPUPlace cpu_place;
op->Run(scope, cpu_place);
ASSERT_EQ(op->Attr<float>("scale"), 1.0);
}

@@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) {
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(4);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::platform::CPUDeviceContext dev_ctx;
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
op->Run(scope, dev_ctx);
op->Run(scope, cpu_place);
int test_attr = op->Attr<int>("test_attr");
ASSERT_EQ(test_attr, 4);
}
12 changes: 8 additions & 4 deletions paddle/framework/operator.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/operator.h"
#include <algorithm>
#include <atomic>

#include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"

@@ -388,11 +390,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
};

void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const {
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);

ExecutionContext ctx(*this, scope, dev_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
auto dev_ctx = pool.Borrow(place);

// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
@@ -404,6 +406,8 @@ void OperatorWithKernel::Run(const Scope& scope,

// check if op[type] have kernel for kernel_key
OpKernelMap& kernels = kernels_iter->second;

ExecutionContext ctx(*this, scope, *dev_ctx);
auto kernel_key = GetKernelType(ctx);
auto kernel_iter = kernels.find(kernel_key);

9 changes: 3 additions & 6 deletions paddle/framework/operator.h
@@ -83,8 +83,7 @@ class OperatorBase {
virtual std::string DebugString() const;

/// Net will call this function to Run an op.
virtual void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const = 0;
virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
Review comment (Member):
Currently, we can simply add another interface here:

virtual void Run(const Scope& scope,
                           const platform::DeviceContext& dev_ctx) const = 0;

void Run(const Scope& scope, const platform::Place& place) const {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
    auto &dev_ctx = *pool.Borrow(place);
    Run(scope, dev_ctx);
}

dzhwinter (Contributor, Author) commented on Dec 22, 2017:
We hold the same belief that OperatorBase should have one and only one runnable interface.
This is still the early stage, when we can fix the operator interface once; otherwise we will not be able to simply remove the second Run in the future.

Review comment (Contributor):
I share the same confusion as @QiJune: with this PR, too many operators will have to fetch the device_ctx and add the following code:

// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);

I learned from @dzhwinter that an Op without a kernel would fetch the dev_ctx from the DeviceContextPool, and I think @QiJune's suggestion may be a simpler way that avoids adding so much repeated code to the Ops.

dzhwinter (Contributor, Author) replied:
I hold a different view: @QiJune's suggestion is a transitional approach.

DeviceContextPool contains device-related resources, so I believe only OperatorWithKernel needs to touch the pool.

void OperatorWithKernel::Run(const Scope& scope,
                             const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
  auto dev_ctx = pool.Borrow(place);

If we can agree on the above, then we should not add a Run(DeviceContext) overload in the final solution.
In addition, most operators do not need the redundant snippet that fetches a device context from the global pool, so why does this PR contain so many Get() calls?
The reason is that our CopyFrom did not distinguish CPUDevice from GPUDevice; in fact, only the GPU device needs to get a DeviceContext from the pool to perform the copy. We will change this in the future.
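Editor's note: to make the trade-off in this thread concrete, here is a minimal sketch of the pattern being debated (the operator name is hypothetical, not from the PR). A kernel-less OperatorBase subclass that really needs a device context borrows it from the global pool inside its own Run(scope, place), which is exactly the snippet the reviewers would prefer not to repeat across many operators.

#include <memory>
#include "paddle/framework/operator.h"
#include "paddle/platform/device_context.h"

// Hypothetical kernel-less operator; only ops that truly need a DeviceContext
// (e.g. for a GPU tensor copy) would carry this borrow-from-pool snippet.
class MyCopyOp : public paddle::framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;
  void Run(const paddle::framework::Scope& scope,
           const paddle::platform::Place& place) const override {
    auto& pool = paddle::platform::DeviceContextPool::Get();
    auto& dev_ctx = *pool.Borrow(place);
    // ... device-dependent work (copies, stream synchronization) goes here ...
    (void)dev_ctx;  // unused in this sketch
  }
  std::unique_ptr<paddle::framework::OperatorBase> Clone() const override {
    return std::unique_ptr<paddle::framework::OperatorBase>(new MyCopyOp(*this));
  }
};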


virtual bool IsNetOp() const { return false; }

@@ -159,8 +158,7 @@ class OperatorBase {
class NOP : public OperatorBase {
public:
using OperatorBase::OperatorBase;
void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const override {}
void Run(const Scope& scope, const platform::Place& place) const override {}
std::unique_ptr<OperatorBase> Clone() const override {
return std::unique_ptr<OperatorBase>(new NOP(*this));
}
@@ -383,8 +381,7 @@ class OperatorWithKernel : public OperatorBase {
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}

void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const final;
void Run(const Scope& scope, const platform::Place& place) const final;

static std::unordered_map<std::string /* op_type */, OpKernelMap>&
AllOpKernels() {