Feature/use cudnn #7141

Merged (15 commits, Jan 5, 2018)
3 changes: 1 addition & 2 deletions paddle/framework/CMakeLists.txt
@@ -73,8 +73,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)


-cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
Member: init does rely on operator.

cc_test(init_test SRCS init_test.cc DEPS init)

cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
24 changes: 24 additions & 0 deletions paddle/framework/data_transform.cc
@@ -37,6 +37,28 @@ auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNCHW, LibraryType::kPlain);

// TODO(dzhwinter): Only for testing multiple op kernel.
// Dummy transform function for library_type
// should be removed.
auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
DataLayout::kAnyLayout, LibraryType::kPlain);

auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
DataLayout::kAnyLayout, LibraryType::kCUDNN);

void DummyTrans(const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
Member: Since all Tensors are actually LoDTensors, this should be in.IsType<LoDTensor>().

Contributor Author: OK. Will fix in the next PR.

PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_),
"TransDataType Only Support DataType transform on same place!");
auto src = in.Get<Tensor>();
auto* dst = out->GetMutable<Tensor>();
*dst = src;
}

void TransDataType(const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
@@ -121,6 +143,8 @@ std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
}

REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans);
REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans);
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
std::bind(f::TransDataLayout, NHWC2NCHW,
std::placeholders::_1,
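For orientation, REGISTER_DATA_TRANSFORM_FN above registers a transform function under a (from, to) pair of kernel types, and GetNullable looks it up at run time. Below is a minimal, self-contained sketch of that registry pattern; KernelKey, KernelPair, TransformFn, and TransformRegistry are simplified stand-ins, not Paddle's actual DataTransformFnMap API.

// Minimal sketch of a (from, to) kernel-pair -> transform-function registry.
// KernelKey and TransformFn are simplified stand-ins for OpKernelType and
// DataTransformFn; the real Paddle types carry place/layout/library info.
#include <functional>
#include <map>
#include <string>
#include <utility>

using KernelKey = std::string;
using KernelPair = std::pair<KernelKey, KernelKey>;
using TransformFn = std::function<void()>;

class TransformRegistry {
 public:
  static TransformRegistry& Instance() {
    static TransformRegistry instance;
    return instance;
  }
  void Register(const KernelPair& pair, TransformFn fn) {
    map_[pair] = std::move(fn);
  }
  // Mirrors GetNullable: returns nullptr when no transform is registered.
  const TransformFn* GetNullable(const KernelPair& pair) const {
    auto it = map_.find(pair);
    return it == map_.end() ? nullptr : &it->second;
  }

 private:
  std::map<KernelPair, TransformFn> map_;
};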
3 changes: 2 additions & 1 deletion paddle/framework/init.cc
@@ -15,6 +15,7 @@ limitations under the License. */
#include <string>

#include "paddle/framework/init.h"
#include "paddle/framework/operator.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#include "paddle/string/piece.h"
@@ -24,7 +25,6 @@ namespace framework {

std::once_flag gflags_init_flag;

-// TODO(qijun) move init gflags to init.cc
void InitGflags(std::vector<std::string> &argv) {
std::call_once(gflags_init_flag, [&]() {
int argc = argv.size();
@@ -72,6 +72,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
}
platform::DeviceContextPool::Init(places);
+framework::UseALL();
return true;
}

135 changes: 132 additions & 3 deletions paddle/framework/op_registry_test.cc
@@ -12,13 +12,16 @@
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/op_registry.h"
#include <glog/logging.h>
#include <gtest/gtest.h>

#include "paddle/framework/op_registry.h"

namespace pd = paddle::framework;

namespace paddle {
namespace framework {

class CosineOp : public OperatorBase {
public:
using OperatorBase::OperatorBase;
@@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) {
op->Run(scope, cpu_place);
}

-#ifdef PADDLE_WITH_CUDA
TEST(OperatorRegistrar, CUDA) {
paddle::framework::proto::OpDesc op_desc;
paddle::platform::CUDAPlace cuda_place(0);
@@ -263,4 +265,131 @@ TEST(OperatorRegistrar, CUDA) {

op->Run(scope, cuda_place);
}
-#endif

static int op_test_value = 0;

using paddle::platform::DeviceContext;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CUDADeviceContext;

namespace paddle {
namespace framework {

class OpWithMultiKernelTest : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;

protected:
void InferShape(InferShapeContext* ctx) const override {}

framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
}

framework::OpKernelType GetExpectedKernelType(
const framework::OpKernelType& kernel) const override {
return framework::OpKernelType(kernel.data_type_, platform::CUDAPlace(0),
kernel.data_layout_,
framework::LibraryType::kCUDNN);
}
};

template <typename DeviceContext, typename T>
class OpMultiKernelTest : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const;
};

template <typename T>
class OpMultiKernelTest<CPUDeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
++op_test_value;
}
};

template <typename T>
class OpMultiKernelTest<CUDADeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
--op_test_value;
}
};

template <typename DeviceContext, typename T>
class OpMultiKernelTest2 : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const;
};

template <typename T>
class OpMultiKernelTest2<CPUDeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
op_test_value += 10;
}
};

template <typename T>
class OpMultiKernelTest2<CUDADeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
op_test_value -= 10;
}
};

} // namespace framework
} // namespace paddle

REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel,
paddle::framework::OpWithMultiKernelTest,
paddle::framework::OpKernelTestMaker);
REGISTER_OP_KERNEL(
op_with_multi_kernel, CPU, paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest<CPUDeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest2<CPUDeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace,
paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace,
paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);

TEST(OperatorRegistrar, OpWithMultiKernel) {
paddle::framework::proto::OpDesc op_desc;
paddle::platform::CUDAPlace cuda_place(0);
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;

op_desc.set_type("op_with_multi_kernel");
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);

// use all available kernels
paddle::framework::UseALL();
op->Run(scope, cuda_place);
EXPECT_EQ(op_test_value, -10);

// remove cuda kernels
paddle::framework::UseCPU();
op->Run(scope, cpu_place);

EXPECT_EQ(op_test_value, -9);

// add cuda kernels
paddle::framework::UseCUDA();
op->Run(scope, cuda_place);

EXPECT_EQ(op_test_value, -10);

// use cudnn kernel
paddle::framework::UseCUDNN();
op->Run(scope, cuda_place);
EXPECT_EQ(op_test_value, -20);
}
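As a reading aid, each EXPECT_EQ above follows from which kernel the preceding UseXXX() call leaves at the top of the priority list. A distilled trace of the counter, simply re-deriving the assertions in the test:

// Distilled trace of op_test_value through the test above.
#include <cassert>

int main() {
  int v = 0;
  v -= 10;  // UseALL(): CUDNN kernel wins -> OpMultiKernelTest2<CUDA>: v -= 10
  assert(v == -10);
  ++v;      // UseCPU(): plain CPU kernel -> OpMultiKernelTest<CPU>: ++v
  assert(v == -9);
  --v;      // UseCUDA(): plain CUDA kernel -> OpMultiKernelTest<CUDA>: --v
  assert(v == -10);
  v -= 10;  // UseCUDNN(): CUDNN kernel again -> v -= 10
  assert(v == -20);
  return 0;
}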
95 changes: 86 additions & 9 deletions paddle/framework/operator.cc
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>

#include <algorithm>
#include <atomic>
@@ -25,6 +26,53 @@ limitations under the License. */
namespace paddle {
namespace framework {

std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;

void UseCPU() {
kKernelPriority.clear();
/*Plain CPU*/
auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
}

void UseMKLDNN() {
UseCPU();
#if PADDLE_WITH_MKLML
{
/*MKLDNN Kernel*/
auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
}
#endif
}

void UseCUDA() {
UseMKLDNN();
#if PADDLE_WITH_CUDA
/*Plain GPU*/
auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
#endif
}

void UseCUDNN() {
UseCUDA();
#if PADDLE_WITH_CUDA
if (platform::dynload::HasCUDNN()) {
/*CUDNN Kernel*/
auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
}
#endif
}

void UseALL() {
UseCPU();
UseMKLDNN();
UseCUDA();
UseCUDNN();
}

Member (@QiJune, Jan 5, 2018): Since UseCUDNN calls UseCUDA, UseCUDA calls UseMKLDNN, and so on, UseALL is not needed; we can call UseCUDNN directly.

Contributor Author: Actually, each UseXXX already calls the previous UseXXX recursively. But having UseALL call only UseCUDNN would read oddly, so this spells it out. The interface should be removed in the future anyway; users should ONLY be allowed to configure an op through its attributes.
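To make the resulting order concrete: each UseXXX() delegates to the previous one and then prepends its own entry, so after UseCUDNN() the list reads GPU/CUDNN, GPU/Plain, CPU/MKLDNN, CPU/Plain. A self-contained sketch follows; the Sketch* names and string entries are illustrative stand-ins for the real Place/LibraryType tuples, and the #ifdef guards are omitted.

// Sketch: each UseXXX() delegates to the previous one, then prepends its own
// entry, so the most specific library ends up at the front of the list.
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> priority;

void SketchUseCPU()    { priority.clear();  priority.insert(priority.begin(), "CPU/Plain"); }
void SketchUseMKLDNN() { SketchUseCPU();    priority.insert(priority.begin(), "CPU/MKLDNN"); }
void SketchUseCUDA()   { SketchUseMKLDNN(); priority.insert(priority.begin(), "GPU/Plain"); }
void SketchUseCUDNN()  { SketchUseCUDA();   priority.insert(priority.begin(), "GPU/CUDNN"); }

int main() {
  SketchUseCUDNN();
  // Prints: GPU/CUDNN GPU/Plain CPU/MKLDNN CPU/Plain
  for (const auto& entry : priority) std::cout << entry << " ";
  std::cout << "\n";
  return 0;
}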

std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -402,6 +450,12 @@ const platform::DeviceContext* GetDeviceContext(
}
}

const platform::DeviceContext* GetDeviceContext(
const framework::OpKernelType& kernel) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
return pool.Get(kernel.place_);
}

void OperatorWithKernel::Run(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
@@ -422,23 +476,33 @@ void OperatorWithKernel::Run(const Scope& scope,

ExecutionContext ctx(*this, scope, *dev_ctx);
auto actual_kernel_key = GetActualKernelType(ctx);
-auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);
-auto kernel_iter = kernels.find(expected_kernel_key);
-
-if (kernel_iter == kernels.end()) {
-  PADDLE_THROW("The operator %s does not support %s", type_,
-               expected_kernel_key);
-}
+auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);

if (actual_kernel_key == expected_kernel_key) {
PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
"Currently, model parallelism is only supported between "
"CPU and other devices. For example, multi-GPU model "
"parallelism will failed.");
} else {
// find the best key candidate
const DataTransformFnMap& trans_map = DataTransformFnMap::Instance();
for (auto& candidate : kKernelPriority) {
auto candidate_key =
OpKernelType(actual_kernel_key.data_type_, std::get<0>(candidate),
actual_kernel_key.data_layout_, std::get<1>(candidate));

auto candidate_pair = std::make_pair(actual_kernel_key, candidate_key);
if ((actual_kernel_key == candidate_key) ||
(kernels.count(candidate_key) &&
trans_map.GetNullable(candidate_pair))) {
expected_kernel_key = candidate_key;
break;
}
}

Member: The default priority will overwrite the user's configuration. We should strictly obey the user's configuration first; only if the user does not state a preference should we pick a kernel key under the guidance of the default priority.

Contributor Author: Yes, this does not obey the user-configuration-first rule. Will fix it in the next PR.

Contributor: The costs of the data transforms differ; from smallest to largest they are: DataType, Layout, CPU<->GPU. These costs should be taken into account when choosing candidate_key.

Contributor Author: I think we are over-complicating the problem. So far the only needs are CPU <-> GPU and MKLDNN layout <-> kPlain. In the next PR we can let the user choose through an op attribute; once we start weighing costs, this becomes TensorFlow's cost model.

Contributor Author: Premature optimization is the root of all evil.

auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
-const DataTransformFn* trans_fun =
-    DataTransformFnMap::Instance().GetNullable(kernel_pair);
+const DataTransformFn* trans_fun = trans_map.GetNullable(kernel_pair);
if (trans_fun) {
auto input_vars = this->InputVars();
// TODO(qijun) filter the input vars that do not need to be transformed
@@ -471,7 +535,20 @@ void OperatorWithKernel::Run(const Scope& scope,
}
}

-kernel_iter->second->Compute(ctx);
VLOG(10) << "Actual kernel: " << actual_kernel_key
<< "Expected kernel: " << expected_kernel_key;

auto kernel_iter = kernels.find(expected_kernel_key);

if (kernel_iter == kernels.end()) {
PADDLE_THROW("The operator %s does not support %s", type_,
expected_kernel_key);
}

auto* expected_dev_ctx = GetDeviceContext(expected_kernel_key);
ExecutionContext expected_ctx(*this, scope, *expected_dev_ctx);

kernel_iter->second->Compute(expected_ctx);
}

OpKernelType OperatorWithKernel::GetActualKernelType(
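Finally, the candidate search that Run now performs, shown in isolation: a sketch under simplified assumptions, where Key stands in for OpKernelType and HasKernel/HasTransform are hypothetical predicates standing in for kernels.count() and trans_map.GetNullable().

// Sketch of the kernel-selection walk over the priority list. The first
// viable candidate wins: either no transform is needed (candidate equals the
// actual key), or both a kernel and a data-transform function are registered.
#include <functional>
#include <string>
#include <vector>

struct Key {
  std::string place;
  std::string library;
  bool operator==(const Key& other) const {
    return place == other.place && library == other.library;
  }
};

Key ChooseKernel(const Key& actual, const std::vector<Key>& priority,
                 const std::function<bool(const Key&)>& HasKernel,
                 const std::function<bool(const Key&, const Key&)>& HasTransform) {
  for (const auto& candidate : priority) {
    if (actual == candidate ||
        (HasKernel(candidate) && HasTransform(actual, candidate))) {
      return candidate;
    }
  }
  // Simplified fallback; the PR instead keeps the earlier result of
  // GetExpectedKernelType when no candidate matches.
  return actual;
}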