Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PHI Decoupling] Remove memory header (Part 3) #51288

Merged
merged 28 commits into from
Mar 24, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9b3af99
decouple memory copy
YuanRisheng Feb 24, 2023
b239891
Merge branch 'develop' of github.com:YuanRisheng/Paddle into decouple…
YuanRisheng Feb 24, 2023
be631bd
fix ci bugs
YuanRisheng Feb 24, 2023
8ce3714
fix ci compile bugs
YuanRisheng Feb 27, 2023
d09d235
fix rocm compile
YuanRisheng Feb 27, 2023
648df79
fix ci bugs
YuanRisheng Feb 28, 2023
db69ddd
Merge branch 'develop' of github.com:YuanRisheng/Paddle into decouple…
YuanRisheng Mar 3, 2023
38b7711
Merge branch 'develop' of github.com:YuanRisheng/Paddle into decouple…
YuanRisheng Mar 6, 2023
7392493
decouple memory
YuanRisheng Mar 7, 2023
50c5241
Merge branch 'develop' of github.com:YuanRisheng/Paddle into decouple…
YuanRisheng Mar 8, 2023
50cf666
Merge branch 'develop' of github.com:YuanRisheng/Paddle into decouple…
YuanRisheng Mar 9, 2023
b7940b9
merge develop
YuanRisheng Mar 13, 2023
ffdc7b8
Merge branch 'develop' of github.com:YuanRisheng/Paddle into decouple…
YuanRisheng Mar 15, 2023
1164ead
deal with conflict
YuanRisheng Mar 15, 2023
dec62bd
deal with conflict
YuanRisheng Mar 16, 2023
6030860
fix ci conflict
YuanRisheng Mar 16, 2023
2d0f887
fix xpu compile bugs
YuanRisheng Mar 17, 2023
b3d9112
fix xpu bugs
YuanRisheng Mar 17, 2023
163f7c3
deal with xpu bugs
YuanRisheng Mar 17, 2023
7e2a0e3
fix cmake bugs
YuanRisheng Mar 20, 2023
b9b3ab4
deal with conflict
YuanRisheng Mar 20, 2023
8999c9c
fix windows bugs
YuanRisheng Mar 21, 2023
c657008
fix ci bugs
YuanRisheng Mar 21, 2023
a6dc453
fix ci bugs
YuanRisheng Mar 22, 2023
8c219d5
delete redundance code
YuanRisheng Mar 22, 2023
2289fb1
add code for pybind
YuanRisheng Mar 22, 2023
32a1dcb
fix py3 bugs
YuanRisheng Mar 22, 2023
45534d8
fix ci bugs
YuanRisheng Mar 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions paddle/fluid/distributed/collective/process_group_bkcl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,8 @@ void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place,
// must use XPUDeviceContext here to make sure XPUContext::Init() is called
auto comm_ctx = std::make_unique<XPUDeviceContext>(place);
// set allocator
comm_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
comm_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(place));

BKCLContext_t bkcl_comm;
BKCLCHECK(bkcl_init_rank(&bkcl_comm, GetRank(), GetSize(), &bkcl_id));
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/framework/data_type_transform_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
phi::GPUContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
gpu_place, context.stream()));
context.PartialInitWithAllocator();

auto kernel_fp16 = phi::KernelKey(
Expand Down
3 changes: 1 addition & 2 deletions paddle/fluid/framework/tensor_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,7 @@ void TensorCopyImpl(const TENSOR& src,
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
.GetAllocator(npu_pinned_place));
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
Expand Down
9 changes: 3 additions & 6 deletions paddle/fluid/framework/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,7 @@ void TensorFromArray(const T* src,
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
.GetAllocator(npu_pinned_place));
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
Expand Down Expand Up @@ -255,8 +254,7 @@ void TensorFromVector(const std::vector<T>& src,
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
.GetAllocator(npu_pinned_place));
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
Expand Down Expand Up @@ -347,8 +345,7 @@ inline void TensorFromVector(const std::vector<bool>& src,
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
.GetAllocator(npu_pinned_place));
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
Expand Down
30 changes: 15 additions & 15 deletions paddle/fluid/framework/tensor_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ TEST(TensorCopy, Tensor) {
// CPU phi::DenseTensor to GPU phi::DenseTensor
auto gpu_place = new platform::CUDAPlace(0);
phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
*gpu_place, gpu_ctx.stream()));
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);

Expand Down Expand Up @@ -170,9 +170,9 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(phi::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
*gpu_place, gpu_ctx.stream()));
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
Expand Down Expand Up @@ -238,9 +238,9 @@ TEST(TensorToVector, Tensor) {
phi::DenseTensor gpu_tensor;
paddle::platform::CUDAPlace place;
phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
place, gpu_ctx.stream()));
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);

Expand Down Expand Up @@ -285,9 +285,9 @@ TEST(TensorToVector, Tensor_bool) {
phi::DenseTensor gpu_tensor;
paddle::platform::CUDAPlace place;
phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
place, gpu_ctx.stream()));
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);

Expand Down Expand Up @@ -523,9 +523,9 @@ TEST(Tensor, FromAndToStream) {

auto gpu_place = new platform::CUDAPlace();
phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
*gpu_place, gpu_ctx.stream()));
gpu_ctx.PartialInitWithAllocator();

TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
Expand Down
16 changes: 7 additions & 9 deletions paddle/fluid/imperative/gloo_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,15 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<phi::CPUContext>(
new phi::CPUContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
device_->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CPUPlace()));
device_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CPUPlace()));
device_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetZeroAllocator(
platform::CPUPlace()));
}

void GLOOParallelContext::InitWithRingID(int ring_id) {
Expand Down
28 changes: 11 additions & 17 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -421,25 +421,20 @@ void AnalysisPredictor::InitDeviceContexts() {
ResourceManager::Instance().GetGPUResource(predictor_stream_);
auto *gpu_context = new InferGPUContext(place_);
gpu_context->SetAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, gpu_resource->GetStream())
.get());
memory::allocation::AllocatorFacade::Instance().GetAllocator(
place_, gpu_resource->GetStream()));
gpu_context->SetPinnedAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CUDAPinnedPlace()));
gpu_context->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CPUPlace()));
gpu_context->SetZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
memory::allocation::AllocatorFacade::Instance().GetZeroAllocator(
place_));
gpu_context->SetHostZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
memory::allocation::AllocatorFacade::Instance().GetZeroAllocator(
platform::CPUPlace()));
gpu_context->SetGenerator(
phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());
Expand Down Expand Up @@ -472,8 +467,7 @@ void AnalysisPredictor::InitDeviceContexts() {
<< ", allotor ptr is "
<< reinterpret_cast<void *>(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, gpu_resource->GetStream())
.get());
.GetAllocator(place_, gpu_resource->GetStream()));
return std::unique_ptr<phi::DeviceContext>(gpu_context);
}));
}
Expand Down
12 changes: 6 additions & 6 deletions paddle/fluid/inference/api/resource_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,9 @@ void GPUContextResource::InitGpuProperties() {
}

void GPUContextResource::InitGpuEigenDevice() {
auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_)
.get();
auto* allocator =
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
place_);
eigen_stream_.reset(new internal::EigenGpuStreamDevice());
eigen_stream_->Reinitialize(stream_, allocator, place_);
gpu_eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
Expand Down Expand Up @@ -435,9 +435,9 @@ void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
GPUPlace place) const {
if (eigen_stream_) {
auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_)
.get();
auto* allocator =
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
place_);
eigen_stream_->Reinitialize(stream, allocator, place);
}
}
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/inference/lite/test_engine_lite.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
phi::GPUContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
place, ctx.stream()));
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
Expand Down
95 changes: 40 additions & 55 deletions paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,18 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CUDAPlace(0), ctx_->stream()));
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CPUPlace()));
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
.GetZeroAllocator(platform::CUDAPlace(0)));
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CUDAPinnedPlace()));
ctx_->PartialInitWithAllocator();

std::map<std::string, std::vector<int>> min_input_shape = {
Expand Down Expand Up @@ -167,21 +164,18 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CUDAPlace(0), ctx_->stream()));
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CPUPlace()));
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
.GetZeroAllocator(platform::CUDAPlace(0)));
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CUDAPinnedPlace()));
ctx_->PartialInitWithAllocator();

std::map<std::string, std::vector<int>> min_input_shape = {
Expand Down Expand Up @@ -335,21 +329,18 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CUDAPlace(0), ctx_->stream()));
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CPUPlace()));
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
.GetZeroAllocator(platform::CUDAPlace(0)));
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CUDAPinnedPlace()));
ctx_->PartialInitWithAllocator();

std::map<std::string, std::vector<int>> min_input_shape = {
Expand Down Expand Up @@ -543,21 +534,18 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CUDAPlace(0), ctx_->stream()));
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CPUPlace()));
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
.GetZeroAllocator(platform::CUDAPlace(0)));
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CUDAPinnedPlace()));
ctx_->PartialInitWithAllocator();

std::map<std::string, std::vector<int>> min_input_shape = {
Expand Down Expand Up @@ -751,21 +739,18 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
platform::CUDAPlace(0), ctx_->stream()));
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CPUPlace()));
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
.GetZeroAllocator(platform::CUDAPlace(0)));
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
paddle::memory::allocation::AllocatorFacade::Instance().GetAllocator(
paddle::platform::CUDAPinnedPlace()));
ctx_->PartialInitWithAllocator();

std::map<std::string, std::vector<int>> min_input_shape = {
Expand Down
Loading