[Hackathon No.40] Add ASGD API to Paddle #58834
File: ops.yaml

@@ -191,6 +191,19 @@
  backward : as_strided_grad
  no_need_buffer : input

- op : asgd_
  args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor d, Tensor y, Tensor n, Tensor master_param, bool multi_precision=false)
  output : Tensor(param_out), Tensor(d_out), Tensor(y_out), Tensor(master_param_out)
  infer_meta :
    func : ASGDInferMeta
  kernel :
    func : asgd
    data_type : param
    data_transform :
      support_trans_dtype : learning_rate, n
  optional : master_param, master_param_out
  inplace : (param -> param_out), (d -> d_out), (y -> y_out), (master_param -> master_param_out)

- op : asin
  args : (Tensor x)
  output : Tensor(out)

Review thread on support_trans_dtype : learning_rate, n:

Reviewer: Why does this need to be specified explicitly here?

Author: This follows the Paddle SGD entry, but SGD is the only op in ops.yaml that has it. In my tests, removing it did not affect the results, so I suspect it is a historical leftover. Removed.

Author: That reply was not accurate: after removing it, cmake has to be rerun, which I had not done before. Sorry about that.

Author: The reason is given below.
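For context (not part of the diff): the element-wise update this op performs, as implemented by the CPU and GPU kernels below, is

  d_out = d - y + grad
  y_out = grad
  param_out = param - (learning_rate / n) * d_out

Reading from the kernels, d appears to hold a running sum of recent gradients, y supplies the old gradient that drops out of that sum while y_out records the current one, and n is the count the sum is averaged over; the Python-side optimizer presumably decides which history slot is fed in as y.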
File: paddle/phi/kernels/asgd_kernel.h

@@ -0,0 +1,37 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"

namespace phi {

template <typename T, typename Context>
void ASGDKernel(const Context& dev_ctx,
                const DenseTensor& param,
                const DenseTensor& grad,
                const DenseTensor& learning_rate,
                const DenseTensor& d,
                const DenseTensor& y,
                const DenseTensor& n,
                const paddle::optional<DenseTensor>& master_param,
                bool multi_precision,
                DenseTensor* param_out,
                DenseTensor* d_out,
                DenseTensor* y_out,
                DenseTensor* master_param_out);

}  // namespace phi
File: paddle/phi/kernels/cpu/asgd_kernel.cc

@@ -0,0 +1,73 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/asgd_kernel.h"

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/jit/kernels.h"

namespace phi {

template <typename T, typename Context>
void ASGDKernelCPUImpl(const Context& dev_ctx,
                       const DenseTensor& param,
                       const DenseTensor& grad,
                       const DenseTensor& learning_rate,
                       const DenseTensor& d,
                       const DenseTensor& y,
                       const DenseTensor& n,
                       DenseTensor* param_out,
                       DenseTensor* d_out,
                       DenseTensor* y_out) {
  auto param_eigen = EigenVector<T>::Flatten(param);
  auto grad_eigen = EigenVector<T>::Flatten(grad);
  auto d_eigen = EigenVector<T>::Flatten(d);
  auto y_eigen = EigenVector<T>::Flatten(y);
  auto param_out_eigen = EigenVector<T>::Flatten(*param_out);
  auto d_out_eigen = EigenVector<T>::Flatten(*d_out);
  auto y_out_eigen = EigenVector<T>::Flatten(*y_out);
  T learning_rate_T = learning_rate.data<T>()[0];
  T n_T = n.data<T>()[0];

  d_out_eigen = d_eigen - y_eigen + grad_eigen;
  y_out_eigen = grad_eigen;
  param_out_eigen = param_eigen - (learning_rate_T / n_T) * d_out_eigen;
}

template <typename T, typename Context>
void ASGDKernel(const Context& dev_ctx,
                const DenseTensor& param,
                const DenseTensor& grad,
                const DenseTensor& learning_rate,
                const DenseTensor& d,
                const DenseTensor& y,
                const DenseTensor& n,
                const paddle::optional<DenseTensor>& master_param UNUSED,
                bool multi_precision UNUSED,
                DenseTensor* param_out,
                DenseTensor* d_out,
                DenseTensor* y_out,
                DenseTensor* master_param_out UNUSED) {
  dev_ctx.template Alloc<T>(param_out);
  dev_ctx.template Alloc<T>(d_out);
  dev_ctx.template Alloc<T>(y_out);
  ASGDKernelCPUImpl<T, Context>(
      dev_ctx, param, grad, learning_rate, d, y, n, param_out, d_out, y_out);
}

}  // namespace phi

PD_REGISTER_KERNEL(asgd, CPU, ALL_LAYOUT, phi::ASGDKernel, float, double) {}
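As a cross-check (not part of the PR), the Eigen expressions above are equivalent to the plain loop below; the function name is hypothetical and this is an illustrative sketch, not Paddle code.

// Illustrative reference only: the same element-wise update as the
// Eigen expressions in ASGDKernelCPUImpl, written as an explicit loop.
#include <cstddef>

template <typename T>
void asgd_update_reference(const T* param, const T* grad, const T* d,
                           const T* y, T learning_rate, T n, std::size_t num,
                           T* param_out, T* d_out, T* y_out) {
  for (std::size_t i = 0; i < num; ++i) {
    d_out[i] = d[i] - y[i] + grad[i];  // swap the old gradient y for the new one
    y_out[i] = grad[i];                // record the current gradient
    param_out[i] = param[i] - (learning_rate / n) * d_out[i];
  }
}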
File: paddle/phi/kernels/gpu/asgd_kernel.cu

@@ -0,0 +1,106 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/asgd_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h"

namespace phi {

template <typename T, typename MT>
__global__ void ASGDKernelGPUImpl(const T* param,
                                  const T* grad,
                                  const T* learning_rate,
                                  const T* d,
                                  const T* y,
                                  const T* n,
                                  const MT* master_param,
                                  int num,
                                  T* param_out,
                                  T* d_out,
                                  T* y_out,
                                  MT* master_param_out) {
  MT learning_rate_MT = static_cast<MT>(learning_rate[0]);
  MT n_MT = static_cast<MT>(n[0]);
  CUDA_KERNEL_LOOP(i, num) {
    MT param_data = master_param ? master_param[i] : static_cast<MT>(param[i]);
    MT grad_data = static_cast<MT>(grad[i]);
    MT d_data = static_cast<MT>(d[i]);
    MT y_data = static_cast<MT>(y[i]);
    d_data = d_data - y_data + grad_data;
    y_data = grad_data;
    param_data = param_data - (learning_rate_MT / n_MT) * d_data;
    param_out[i] = static_cast<T>(param_data);
    d_out[i] = static_cast<T>(d_data);
    y_out[i] = static_cast<T>(y_data);
    if (master_param_out) {
      master_param_out[i] = param_data;
    }
  }
}

template <typename T, typename Context>
void ASGDKernel(const Context& dev_ctx,
                const DenseTensor& param,
                const DenseTensor& grad,
                const DenseTensor& learning_rate,
                const DenseTensor& d,
                const DenseTensor& y,
                const DenseTensor& n,
                const paddle::optional<DenseTensor>& master_param,
                bool multi_precision,
                DenseTensor* param_out,
                DenseTensor* d_out,
                DenseTensor* y_out,
                DenseTensor* master_param_out) {
  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
  const MPDType* master_in_data =
      multi_precision ? master_param->data<MPDType>() : nullptr;
  MPDType* master_out_data =
      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_out)
                      : nullptr;

  int block = 512;
  int grid = (param.numel() + block - 1) / block;

  ASGDKernelGPUImpl<T, MPDType><<<grid, block, 0, dev_ctx.stream()>>>(
      param.data<T>(),
      grad.data<T>(),
      learning_rate.data<T>(),
      d.data<T>(),
      y.data<T>(),
      n.data<T>(),
      master_in_data,
      param.numel(),
      dev_ctx.template Alloc<T>(param_out),
      dev_ctx.template Alloc<T>(d_out),
      dev_ctx.template Alloc<T>(y_out),
      master_out_data);
}

}  // namespace phi

PD_REGISTER_KERNEL(asgd,
                   GPU,
                   ALL_LAYOUT,
                   phi::ASGDKernel,
                   phi::dtype::float16,
                   phi::dtype::bfloat16,
                   float,
                   double) {}
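A note on the multi_precision path (an observation about the code above, not part of the diff): when T is float16 or bfloat16, MPTypeTrait<T>::Type is float, so the arithmetic runs in fp32 and an optional fp32 master copy of the parameters is read and written alongside the low-precision ones. A minimal standalone sketch of that pattern, with hypothetical names and float/double standing in for the low/high-precision pair:

// Illustrative sketch of the master-weight pattern (not Paddle code).
// float plays the role of the low-precision dtype, double the master copy.
#include <cstddef>

void update_with_master(const float* param, const double* master,
                        double step, const double* d_new, std::size_t num,
                        float* param_out, double* master_out) {
  for (std::size_t i = 0; i < num; ++i) {
    // Prefer the high-precision copy so rounding error does not accumulate.
    double p = master ? master[i] : static_cast<double>(param[i]);
    p -= step * d_new[i];
    param_out[i] = static_cast<float>(p);  // rounded copy kept for the model
    if (master_out) master_out[i] = p;     // full-precision copy for the optimizer
  }
}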
Review thread on the kernel signature:

Reviewer: Keep the argument order consistent with the other optimizers; put grad in the second position.

Author: Done