From a308637a7c5350878a6f21286886835014a8102d Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 14 Aug 2020 19:54:38 +0000 Subject: [PATCH 1/8] supporting dynamic sequence length for transformer kernel --- csrc/includes/custom_cuda_layers.h | 23 +--- csrc/includes/ds_transformer_cuda.h | 13 +- csrc/includes/gelu.h | 4 +- csrc/includes/normalize_layer.h | 89 ++++-------- csrc/includes/softmax.h | 10 +- csrc/includes/strided_batch_gemm.h | 3 + csrc/transformer/ds_transformer_cuda.cpp | 86 ++++++++---- csrc/transformer/gelu_kernels.cu | 21 ++- csrc/transformer/normalize_kernels.cu | 165 +++++++---------------- deepspeed/ops/transformer/transformer.py | 12 +- 10 files changed, 186 insertions(+), 240 deletions(-) diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index 2e72a35292c6..764687da4ff4 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -29,7 +29,6 @@ void launch_bias_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream); template @@ -37,7 +36,6 @@ void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream); template @@ -46,7 +44,6 @@ void launch_d_gelu(T* d_output, const T* bias, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream); // Custom fused bias add with layer normalization @@ -57,14 +54,12 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, - bool training = false, - T* vars = nullptr, - T* means = nullptr, - T* vals_hat = nullptr); + bool training, + T* vars, + T* means); template void launch_bias_residual_layer_norm(T* vals, @@ -73,14 +68,11 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, - bool training = false, - T* vars = nullptr, - T* vals_hat = nullptr, - bool save_vals = false); + bool training, + T* vars); template void launch_layerNorm_backward_fused_add(const T* out_grad1, @@ -93,7 +85,6 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2]); template @@ -106,7 +97,6 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2], bool invertible = false, @@ -122,7 +112,6 @@ void launch_layerNorm_backward(const T* out_grad, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2]); @@ -135,7 +124,6 @@ void launch_layerNorm_backward(const T* out_grad, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2], bool invertible = false, @@ -153,7 +141,6 @@ void launch_layerNorm_backward_nreversible(const T* out_grad, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2]); diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h index 896dce8c26db..968f8c648d3d 100755 --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -121,11 +121,22 @@ class BertTransformerLayer { void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr); + uint8_t* layer_output_dropout_mask_ptr, + T*, T*, T*, T*); inline int GetBatchSize() const { return _batch_size; } inline int GetNumHeads() const { return _heads; } inline int GetSeqLength() const { return _seq_length; } + int SetSeqLength(int seq_len) { + _softmax.SetSeqlen(seq_len); + _seq_length = seq_len; + _attn_scores.SetConfig(_seq_length, + _seq_length, + _hidden_size / _heads); + _attn_context.SetConfig(_hidden_size / _heads, + _seq_length, + _seq_length); + } inline int GetHiddenSize() const { return _hidden_size; } void SetTrainingMode(bool training); diff --git a/csrc/includes/gelu.h b/csrc/includes/gelu.h index 247bfb273de0..8ae2c82e3b56 100644 --- a/csrc/includes/gelu.h +++ b/csrc/includes/gelu.h @@ -29,13 +29,13 @@ class Gelu { cudaStream_t stream) { launch_bias_gelu( - input_buf, bias, output, _config.intermediate_size, bsz, _config.seq_length, stream); + input_buf, bias, output, _config.intermediate_size, bsz, stream); } void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream) { launch_d_gelu( - d_output, input_buf, bias, _config.intermediate_size, bsz, _config.seq_length, stream); + d_output, input_buf, bias, _config.intermediate_size, bsz, stream); } private: diff --git a/csrc/includes/normalize_layer.h b/csrc/includes/normalize_layer.h index 37ee752c88b5..1b74cf168cc7 100644 --- a/csrc/includes/normalize_layer.h +++ b/csrc/includes/normalize_layer.h @@ -16,57 +16,30 @@ class Normalize_Layer { uint32_t seqLength; uint32_t hiddenDim; float epsilon; - bool training, save_vals; - bool allocateGrad; + bool training; bool useMean; Config(uint32_t batch, uint32_t seq, uint32_t h, bool training, - bool save_vals = true, - bool allocateGrad = true, bool useMean = true) : batchSize(batch), seqLength(seq), hiddenDim(h), epsilon(1e-12), training(training), - save_vals(save_vals), - allocateGrad(allocateGrad), useMean(useMean) { } }; - Normalize_Layer(Config config) : config_(config), vars(nullptr), vals_hat(nullptr) - { - if (config_.training) { - cudaMalloc((void**)&vars, config_.batchSize * config_.seqLength * sizeof(T)); - - if (config_.useMean) - cudaMalloc((void**)&means, config_.batchSize * config_.seqLength * sizeof(T)); - - if (config_.save_vals) - cudaMalloc((void**)&vals_hat, - config_.batchSize * config_.seqLength * config_.hiddenDim * sizeof(T)); - - if (config_.allocateGrad) - cudaMalloc((void**)&inp_grad, - config_.batchSize * config_.seqLength * config_.hiddenDim * sizeof(T)); - } - } + Normalize_Layer(Config config) : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr) + {} ~Normalize_Layer() - { - if (config_.training) { - cudaFree(vars); - if (config_.useMean) cudaFree(means); - if (config_.save_vals) cudaFree(vals_hat); - if (config_.allocateGrad) cudaFree(inp_grad); - } - } + {} - void ForwardCheckpoint(int bsz, + void ForwardCheckpoint(int bsz, //batch * seq T* vals, const T* residual, const T* gamma, @@ -80,14 +53,12 @@ class Normalize_Layer { betta, config_.epsilon, bsz, - config_.seqLength, config_.hiddenDim, stream, preLayerNorm, config_.training, vars, - means, - vals_hat); + means); } void Forward(int bsz, @@ -104,14 +75,11 @@ class Normalize_Layer { betta, config_.epsilon, bsz, - config_.seqLength, config_.hiddenDim, stream, preLayerNorm, config_.training, - vars, - vals_hat, - config_.save_vals); + vars); } void Backward(int bsz, @@ -120,7 +88,7 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, + T* inp_grad_out, const T* norm_in = nullptr) { launch_layerNorm_backward(out_grad, @@ -130,9 +98,8 @@ class Normalize_Layer { gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream); } @@ -144,21 +111,20 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, - const T* norm_out = nullptr) + T* inp_grad_out, + const T* norm_out) { launch_layerNorm_backward(out_grad, - (config_.save_vals ? vals_hat : norm_out), + norm_out, vars, gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream, - config_.save_vals, + !config_.useMean, betta); } @@ -169,7 +135,7 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, + T* inp_grad_out, const T* norm_in = nullptr) { launch_layerNorm_backward_fused_add(out_grad1, @@ -180,9 +146,8 @@ class Normalize_Layer { gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream); } @@ -195,33 +160,37 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, - const T* norm_out = nullptr) + T* inp_grad_out, + const T* norm_out) { launch_layerNorm_backward_fused_add(out_grad1, out_grad2, - (config_.save_vals ? vals_hat : norm_out), + norm_out, vars, gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream, - config_.save_vals, + !config_.useMean, betta); } - inline T* GetInputGrad() const { return inp_grad; } - inline bool UseMean() const { return config_.useMean; } + inline void SetVar(T *variance) { + if (!variance) { throw std::runtime_error("Normalize variance is null."); } + vars = variance; } + + inline void SetMean(T *mean) { + if (!mean) { throw std::runtime_error("Normalize mean is null."); } + means = mean; } + private: Config config_; T* vars; T* means; T* vals_hat; - T* inp_grad; }; diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h index 2a18daee0b78..86e11ae7e54c 100644 --- a/csrc/includes/softmax.h +++ b/csrc/includes/softmax.h @@ -45,13 +45,15 @@ class Softmax { out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream); } - inline int GetProbDepth() const { return config_.prob_depth; } + inline size_t GetProbDepth() const { return config_.prob_depth; } - inline int GetBatchSize() const { return config_.batchSize; } + inline size_t GetBatchSize() const { return config_.batchSize; } - inline int GetNumHeads() const { return config_.heads; } + inline size_t GetNumHeads() const { return config_.heads; } - inline int GetSeqLength() const { return config_.seq_length; } + inline size_t GetSeqLength() const { return config_.seq_length; } + + inline void SetSeqlen(size_t seq_len) { config_.seq_length = seq_len; } private: Config config_; diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 8c43608e2ecf..29d08e04f7e7 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -38,6 +38,7 @@ class StridedBatchGemm { gemm_algos(algos) { } + void SetConfig(int m, int n, int k) { m = m; n = n; k = k; } }; StridedBatchGemm(const Config& config) : _config(config) {} @@ -163,6 +164,8 @@ class StridedBatchGemm { inline const T* GetBufferB() const { return q_buf; } + inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); } + private: Config _config; const T* q_buf; diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index e36c3786944f..3307c8d9bce0 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -82,15 +82,11 @@ BertTransformerLayer::BertTransformerLayer(int layer_id, seq_length, hidden_size, true, - false, - false, !normalize_invertible)), _norm_layer3(typename Normalize_Layer::Config(batch_size, seq_length, hidden_size, true, - false, - false, !normalize_invertible)), _ff1(typename FeedForward::Config(batch_size * seq_length, intermediate_size, @@ -196,17 +192,18 @@ void BertTransformerLayer::Forward(int bsz, if (_normalize_invertible) add_res_ptr = buf_1 + 3 * small_buf_size; if (_attn_dropout_checkpoint) ctx_bufB_ptr = buf_1 + 4 * small_buf_size; + int bsz_seq = bsz * _seq_length; if (_pre_or_postLayerNorm) { if (_norm_layer3.UseMean()) _norm_layer3.ForwardCheckpoint( - bsz, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else _norm_layer3.Forward( - bsz, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } - int bsz_seq = bsz * _seq_length; + if (_pre_or_postLayerNorm) _qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); @@ -249,17 +246,17 @@ void BertTransformerLayer::Forward(int bsz, if (_pre_or_postLayerNorm) { if (_norm_layer2.UseMean()) _norm_layer2.ForwardCheckpoint( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); else _norm_layer2.Forward( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); } else { if (_norm_layer2.UseMean()) _norm_layer2.ForwardCheckpoint( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); else _norm_layer2.Forward( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); } _ff1.Forward(bsz_seq, @@ -268,7 +265,7 @@ void BertTransformerLayer::Forward(int bsz, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), _cublasHandle); - _gelu.ForwardWithBiasAdd(bsz, + _gelu.ForwardWithBiasAdd(bsz_seq, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, (_gelu_checkpoint ? ctx_bufB_ptr : ff2_inp_ptr), @@ -291,9 +288,9 @@ void BertTransformerLayer::Forward(int bsz, if (!_pre_or_postLayerNorm) { if (_norm_layer3.UseMean()) _norm_layer3.ForwardCheckpoint( - bsz, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else - _norm_layer3.Forward(bsz, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + _norm_layer3.Forward(bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } } @@ -359,7 +356,7 @@ void BertTransformerLayer::Backward(int bsz, if (!_pre_or_postLayerNorm) { if (_norm_layer3.UseMean()) - _norm_layer3.Backward(bsz, + _norm_layer3.Backward(bsz_seq, grad_output_ptr, norm_w_ptr, grad_norm_w_ptr, @@ -369,7 +366,7 @@ void BertTransformerLayer::Backward(int bsz, inp_norm_ptr); else - _norm_layer3.Backward(bsz, + _norm_layer3.Backward(bsz_seq, grad_output_ptr, norm_w_ptr, norm_b_ptr, @@ -389,7 +386,7 @@ void BertTransformerLayer::Backward(int bsz, ? buf_0 : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - if (_gelu_checkpoint) _gelu.ForwardWithBiasAdd(bsz, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); + if (_gelu_checkpoint) _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); _ff2.Backward(bsz_seq, layer_dropout_buf, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), @@ -401,7 +398,7 @@ void BertTransformerLayer::Backward(int bsz, ff2_buf); _gelu.Backward( - bsz, ff2_buf, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); + bsz_seq, ff2_buf, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); _ff1.Backward(bsz_seq, ff2_buf, @@ -418,7 +415,7 @@ void BertTransformerLayer::Backward(int bsz, if (_pre_or_postLayerNorm) { if (_norm_layer2.UseMean()) - _norm_layer2.BackwardFusedAdd(bsz, + _norm_layer2.BackwardFusedAdd(bsz_seq, buf_3, grad_output_ptr, attn_nw_ptr, @@ -429,7 +426,7 @@ void BertTransformerLayer::Backward(int bsz, add_res_ptr); else - _norm_layer2.BackwardFusedAdd(bsz, + _norm_layer2.BackwardFusedAdd(bsz_seq, buf_3, grad_output_ptr, attn_nw_ptr, @@ -441,7 +438,7 @@ void BertTransformerLayer::Backward(int bsz, ff1_inp_ptr); } else { if (_norm_layer2.UseMean()) - _norm_layer2.Backward(bsz, + _norm_layer2.Backward(bsz_seq, buf_2, attn_nw_ptr, grad_attn_nw_ptr, @@ -451,7 +448,7 @@ void BertTransformerLayer::Backward(int bsz, add_res_ptr); else - _norm_layer2.Backward(bsz, + _norm_layer2.Backward(bsz_seq, buf_2, attn_nw_ptr, attn_nb_ptr, @@ -525,7 +522,7 @@ void BertTransformerLayer::Backward(int bsz, if (_pre_or_postLayerNorm) { if (_norm_layer3.UseMean()) - _norm_layer3.BackwardFusedAdd(bsz, + _norm_layer3.BackwardFusedAdd(bsz_seq, buf_2, buf_0, norm_w_ptr, @@ -536,7 +533,7 @@ void BertTransformerLayer::Backward(int bsz, input_ptr); else - _norm_layer3.BackwardFusedAdd(bsz, + _norm_layer3.BackwardFusedAdd(bsz_seq, buf_2, buf_0, norm_w_ptr, @@ -562,11 +559,18 @@ void BertTransformerLayer::SetTrainingMode(bool training) template void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr) + uint8_t* layer_output_dropout_mask_ptr, + T *norm2Var, T* norm2Mean, + T *norm3Var, T* norm3Mean) { _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); + + _norm_layer2.SetVar(norm2Var); + _norm_layer2.SetMean(norm2Mean); + _norm_layer3.SetVar(norm3Var); + _norm_layer3.SetMean(norm3Mean); } template @@ -687,6 +691,12 @@ std::vector ds_transformer_forward(int layer_id, std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); + if(input.size(1) != layer->GetSeqLength()) + { + printf("Info: changing sequence-length from %d to %d \n", layer->GetSeqLength(), input.size(1)); + layer->SetSeqLength(input.size(1)); + } + auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); auto add_res = (normalize_invertible ? inp_norm : torch::empty_like(input)); auto attn_o_inp = torch::empty_like(input); @@ -700,6 +710,11 @@ std::vector ds_transformer_forward(int layer_id, auto layer_output_dropout_mask = torch::empty({(bsz * layer->GetSeqLength()), layer->GetHiddenSize()}, uint8_options); + auto norm2Var = torch::empty({(bsz * layer->GetSeqLength())}, options); + auto norm2Mean = torch::empty({(bsz * layer->GetSeqLength())}, options); + auto norm3Var = torch::empty({(bsz * layer->GetSeqLength())}, options); + auto norm3Mean = torch::empty({(bsz * layer->GetSeqLength())}, options); + T* inp_norm_ptr = (T*)inp_norm.data_ptr(); T* add_res_ptr = (T*)add_res.data_ptr(); T* q_tf_ptr = (T*)qkv_tf.data_ptr(); @@ -734,7 +749,11 @@ std::vector ds_transformer_forward(int layer_id, layer->SetTrainingMode(training_mode); layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr()); + (uint8_t*)layer_output_dropout_mask.data_ptr(), + (T*)norm2Var.data_ptr(), + (T*)norm2Mean.data_ptr(), + (T*)norm3Var.data_ptr(), + (T*)norm3Mean.data_ptr()); layer->Forward(bsz, input_ptr, @@ -776,7 +795,9 @@ std::vector ds_transformer_forward(int layer_id, ff2_inp, attn_prob_dropout_mask, attn_output_dropout_mask, - layer_output_dropout_mask}; + layer_output_dropout_mask, + norm2Var, norm2Mean, + norm3Var, norm3Mean}; } template @@ -795,6 +816,10 @@ std::vector ds_transformer_backward(int layer_id, const torch::Tensor& attn_prob_dropout_mask, const torch::Tensor& attn_output_dropout_mask, const torch::Tensor& layer_output_dropout_mask, + const torch::Tensor& norm2Var, + const torch::Tensor& norm2Mean, + const torch::Tensor& norm3Var, + const torch::Tensor& norm3Mean, const torch::Tensor& input, const torch::Tensor& input_mask, const torch::Tensor& attn_qkvw, @@ -838,9 +863,10 @@ std::vector ds_transformer_backward(int layer_id, CHECK_INPUT(norm_b); int bsz = g_output.size(0); + std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); - + auto grad_input = torch::empty_like(input); auto grad_attn_qkvw = torch::empty_like(attn_qkvw); auto grad_attn_qkvb = torch::empty_like(attn_qkvb); @@ -900,7 +926,9 @@ std::vector ds_transformer_backward(int layer_id, layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr()); + (uint8_t*)layer_output_dropout_mask.data_ptr(), + (T*)norm2Var.data_ptr(), (T*)norm2Mean.data_ptr(), + (T*)norm3Var.data_ptr(), (T*)norm3Mean.data_ptr()); layer->Backward(bsz, grad_output_ptr, diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu index f0e65e3829b5..d5858ff4ebef 100755 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -279,13 +279,12 @@ void launch_bias_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = intermediate_size / iterations / 4; dim3 block_dims(threads); - dim3 grid_dims(sequence_length * batch_size); + dim3 grid_dims(batch_size); fused_bias_gelu<<>>(input, bias, output, intermediate_size); } @@ -295,24 +294,23 @@ void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = intermediate_size / iterations / 4; dim3 block_dims(threads); - dim3 grid_dims(sequence_length * batch_size); + dim3 grid_dims(batch_size); gelu_kernel<<>>(input, output, intermediate_size); } template void -launch_bias_gelu(const float*, const float*, float*, int, int, int, cudaStream_t); +launch_bias_gelu(const float*, const float*, float*, int, int, cudaStream_t); template void -launch_bias_gelu<__half>(const __half*, const __half*, __half*, int, int, int, cudaStream_t); +launch_bias_gelu<__half>(const __half*, const __half*, __half*, int, int, cudaStream_t); -template void launch_gelu(const float*, float*, int, int, int, cudaStream_t); -template void launch_gelu<__half>(const __half*, __half*, int, int, int, cudaStream_t); +template void launch_gelu(const float*, float*, int, int, cudaStream_t); +template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t); template void launch_d_gelu(T* d_output, @@ -320,17 +318,16 @@ void launch_d_gelu(T* d_output, const T* bias, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = intermediate_size / iterations / 4; dim3 block_dims(threads); - dim3 grid_dims(sequence_length * batch_size); + dim3 grid_dims(batch_size); d_gelu_func<<>>(d_output, input, bias, intermediate_size); } -template void launch_d_gelu(float*, const float*, const float*, int, int, int, cudaStream_t); +template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); template void -launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, int, cudaStream_t); +launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t); diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu index 7345175694bf..f47a66785130 100755 --- a/csrc/transformer/normalize_kernels.cu +++ b/csrc/transformer/normalize_kernels.cu @@ -27,10 +27,9 @@ __global__ void fused_bias_residual_layer_norm(float* vals, const float* beta, float epsilon, bool preLayerNorm, - bool training = false, - float* vars = nullptr, - float* means = nullptr, - float* vals_hat = nullptr) + bool training, + float* vars, + float* means) { constexpr int iteration_stride = row_stride / iterations; @@ -108,10 +107,9 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, const __half* beta, float epsilon, bool preLayerNorm, - bool training = false, - __half* vars = nullptr, - __half* means = nullptr, - __half* vals_hat = nullptr) + bool training, + __half* vars, + __half* means) { #if __CUDA_ARCH__ >= 700 constexpr int iteration_stride = row_stride / iterations; @@ -204,14 +202,12 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, T* vars, - T* means, - T* vals_hat); + T* means); template <> void launch_bias_residual_layer_norm(float* vals, @@ -220,40 +216,38 @@ void launch_bias_residual_layer_norm(float* vals, const float* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, float* vars, - float* means, - float* vals_hat) + float* means) { constexpr int threads = THREADS; - dim3 grid_dim(batch_size * sequence_length); + dim3 grid_dim(batch_size); dim3 block_dim(threads); // There are some limitations to call below functions, now just enumerate the situations. if (hidden_dim == 768) fused_bias_residual_layer_norm<768, 3><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 512) fused_bias_residual_layer_norm<512, 2><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 1024) fused_bias_residual_layer_norm<1024, 4><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 1536) fused_bias_residual_layer_norm<1536, 6><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 2048) fused_bias_residual_layer_norm<2048, 8><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 2560) fused_bias_residual_layer_norm<2560, 10><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else throw std::runtime_error("Unsupport hidden_dim."); } @@ -265,39 +259,37 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, const __half* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, __half* vars, - __half* means, - __half* vals_hat) + __half* means) { constexpr int threads = 128; - dim3 grid_dim(batch_size * sequence_length); + dim3 grid_dim(batch_size); dim3 block_dim(threads); // There are some limitations to call below functions, now just enumerate the situations. if (hidden_dim == 768) fused_bias_residual_layer_norm<384, 3><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 512) fused_bias_residual_layer_norm<256, 2><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 1024) fused_bias_residual_layer_norm<512, 4><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 1536) fused_bias_residual_layer_norm<768, 6><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 2048) fused_bias_residual_layer_norm<1024, 8><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else if (hidden_dim == 2560) fused_bias_residual_layer_norm<1280, 10><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means); else throw std::runtime_error("Unsupport hidden_dim."); } @@ -309,10 +301,8 @@ __global__ void fused_bias_residual_layer_norm(float* vals, const float* beta, float epsilon, bool preLayerNorm, - bool training = false, - float* vars = nullptr, - float* vals_hat = nullptr, - bool save_vals = false) + bool training, + float* vars) { constexpr int iteration_stride = row_stride / iterations; @@ -388,10 +378,8 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, const __half* beta, float epsilon, bool preLayerNorm, - bool training = false, - __half* vars = nullptr, - __half* vals_hat = nullptr, - bool save_vals = false) + bool training, + __half* vars) { #if __CUDA_ARCH__ >= 700 constexpr int iteration_stride = row_stride / iterations; @@ -481,14 +469,11 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, - T* vars, - T* vals_hat, - bool save_vals); + T* vars); /* To tune this launch the following restrictions must be met: @@ -512,18 +497,15 @@ void launch_bias_residual_layer_norm(float* vals, const float* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, - float* vars, - float* vals_hat, - bool save_vals) + float* vars) { constexpr int threads = THREADS; - dim3 grid_dim(batch_size * sequence_length); + dim3 grid_dim(batch_size); dim3 block_dim(threads); @@ -536,9 +518,7 @@ void launch_bias_residual_layer_norm(float* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 512) fused_bias_residual_layer_norm<512, 2><<>>(vals, residual, @@ -547,9 +527,7 @@ void launch_bias_residual_layer_norm(float* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 1024) fused_bias_residual_layer_norm<1024, 4><<>>(vals, residual, @@ -558,9 +536,7 @@ void launch_bias_residual_layer_norm(float* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 1536) fused_bias_residual_layer_norm<1536, 6><<>>(vals, residual, @@ -569,9 +545,7 @@ void launch_bias_residual_layer_norm(float* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 2048) fused_bias_residual_layer_norm<2048, 8><<>>(vals, residual, @@ -580,9 +554,7 @@ void launch_bias_residual_layer_norm(float* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 2560) fused_bias_residual_layer_norm<2560, 10><<>>(vals, residual, @@ -591,9 +563,7 @@ void launch_bias_residual_layer_norm(float* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else throw std::runtime_error("Unsupport hidden_dim."); } @@ -605,18 +575,15 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, const __half* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, - __half* vars, - __half* vals_hat, - bool save_vals) + __half* vars) { constexpr int threads = 128; - dim3 grid_dim(batch_size * sequence_length); + dim3 grid_dim(batch_size); dim3 block_dim(threads); // There are some limitations to call below functions, now just enumerate the situations. @@ -628,9 +595,7 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 512) fused_bias_residual_layer_norm<256, 2><<>>(vals, residual, @@ -639,9 +604,7 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 1024) fused_bias_residual_layer_norm<512, 4><<>>(vals, residual, @@ -650,9 +613,7 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 1536) fused_bias_residual_layer_norm<768, 6><<>>(vals, residual, @@ -661,9 +622,7 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 2048) fused_bias_residual_layer_norm<1024, 8><<>>(vals, residual, @@ -672,9 +631,7 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else if (hidden_dim == 2560) fused_bias_residual_layer_norm<1280, 10><<>>(vals, residual, @@ -683,9 +640,7 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, epsilon, preLayerNorm, training, - vars, - vals_hat, - save_vals); + vars); else throw std::runtime_error("Unsupport hidden_dim."); } @@ -1037,15 +992,13 @@ void launch_layerNorm_backward(const float* out_grad, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const float* betta) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1086,15 +1039,13 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const __half* betta) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1336,13 +1287,11 @@ void launch_layerNorm_backward(const float* out_grad, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1384,13 +1333,11 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1759,15 +1706,13 @@ void launch_layerNorm_backward_fused_add(const float* out_grad1, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const float* betta) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1808,15 +1753,13 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const __half* betta) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -2070,13 +2013,11 @@ void launch_layerNorm_backward_fused_add(const float* out_grad1, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -2119,13 +2060,11 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { constexpr int threads = THREADS; - int batch = batch_size * sequence_length; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 97a0beefc305..d676bef160fb 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -184,7 +184,9 @@ def forward(ctx, ff2_inp, attn_prob_dropout_mask, attn_output_dropout_mask, - layer_output_dropout_mask) = forward_func(config.layer_id, + layer_output_dropout_mask, + norm2_var, norm2_mean, + norm3_var, norm3_mean) = forward_func(config.layer_id, input, input_mask, attn_qkvw, @@ -288,6 +290,10 @@ def forward(ctx, ctx.attn_prob_dropout_mask = attn_prob_dropout_mask ctx.attn_output_dropout_mask = attn_output_dropout_mask ctx.layer_output_dropout_mask = layer_output_dropout_mask + ctx.norm2_var = norm2_var + ctx.norm2_mean = norm2_mean + ctx.norm3_var = norm3_var + ctx.norm3_mean = norm3_mean return output @@ -364,6 +370,10 @@ def backward(ctx, grad_output): ctx.attn_prob_dropout_mask, ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, + ctx.norm2_var, + ctx.norm2_mean, + ctx.norm3_var, + ctx.norm3_mean, (ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, From 0d3039be114228e46bb4a4cf6d6aa457c08cabf3 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 14 Aug 2020 23:00:45 +0000 Subject: [PATCH 2/8] set sequence in more layers --- csrc/includes/context.h | 2 +- csrc/includes/dropout.h | 3 ++ csrc/includes/ds_transformer_cuda.h | 12 +----- csrc/transformer/ds_transformer_cuda.cpp | 53 ++++++++++++++++-------- tests/unit/test_cuda_forward.py | 29 ++++++------- 5 files changed, 57 insertions(+), 42 deletions(-) diff --git a/csrc/includes/context.h b/csrc/includes/context.h index 1e4820177c5d..573ed00de38d 100644 --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -69,7 +69,7 @@ class Context { if (!_workspace) { assert(_workspace == nullptr); cudaMalloc(&_workspace, size); - } else if (_workSpaceSize != size) { + } else if (_workSpaceSize < size) { cudaFree(_workspace); cudaMalloc(&_workspace, size); } diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h index 090df3a0abf8..0d7d6ab28401 100644 --- a/csrc/includes/dropout.h +++ b/csrc/includes/dropout.h @@ -18,6 +18,7 @@ class Dropout { } float RATIO() const { return training ? ratio : 0.0; } + inline void SetDim(uint32_t d) { dim = d; } }; Dropout(const Config& config) : _config(config), _mask(nullptr) {} @@ -70,6 +71,8 @@ class Dropout { Config GetConfig() const { return _config; } + inline void SetDimension(uint32_t dim) { _config.SetDim(dim); } + private: uint8_t* _mask; Config _config; diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h index 968f8c648d3d..2f25e6d43da1 100755 --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -127,16 +127,8 @@ class BertTransformerLayer { inline int GetBatchSize() const { return _batch_size; } inline int GetNumHeads() const { return _heads; } inline int GetSeqLength() const { return _seq_length; } - int SetSeqLength(int seq_len) { - _softmax.SetSeqlen(seq_len); - _seq_length = seq_len; - _attn_scores.SetConfig(_seq_length, - _seq_length, - _hidden_size / _heads); - _attn_context.SetConfig(_hidden_size / _heads, - _seq_length, - _seq_length); - } + + void SetSeqLength(int seq_len); inline int GetHiddenSize() const { return _hidden_size; } void SetTrainingMode(bool training); diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index 3307c8d9bce0..cef94ba536f1 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -193,6 +193,7 @@ void BertTransformerLayer::Forward(int bsz, if (_attn_dropout_checkpoint) ctx_bufB_ptr = buf_1 + 4 * small_buf_size; int bsz_seq = bsz * _seq_length; + if (_pre_or_postLayerNorm) { if (_norm_layer3.UseMean()) _norm_layer3.ForwardCheckpoint( @@ -573,6 +574,21 @@ void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_ _norm_layer3.SetMean(norm3Mean); } +template +void BertTransformerLayer::SetSeqLength(int seq_len) +{ + _seq_length = seq_len; + + _softmax.SetSeqlen(_seq_length); + _attn_prob_dropout.SetDimension(_seq_length); + _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); + _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); + + Context::Instance().GenWorkSpace(get_workspace_size( + _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint)); +} + + template int create_transformer_layer(int layer_id, int batch_size, @@ -691,57 +707,60 @@ std::vector ds_transformer_forward(int layer_id, std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); - if(input.size(1) != layer->GetSeqLength()) + int seq_len = layer->GetSeqLength(); + if(input.size(1) != seq_len) { - printf("Info: changing sequence-length from %d to %d \n", layer->GetSeqLength(), input.size(1)); - layer->SetSeqLength(input.size(1)); + printf("Info: changing sequence-length from %d to %d \n", seq_len, input.size(1)); + + seq_len = input.size(1); + layer->SetSeqLength(seq_len); } auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); auto add_res = (normalize_invertible ? inp_norm : torch::empty_like(input)); auto attn_o_inp = torch::empty_like(input); - auto qkv_tf = torch::empty({(bsz * layer->GetSeqLength()), output_w.size(0) * 3}, options); + auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * layer->GetSeqLength()), layer->GetSeqLength()}, + torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); auto attn_output_dropout_mask = - torch::empty({(bsz * layer->GetSeqLength()), layer->GetHiddenSize()}, uint8_options); + torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); auto layer_output_dropout_mask = - torch::empty({(bsz * layer->GetSeqLength()), layer->GetHiddenSize()}, uint8_options); + torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - auto norm2Var = torch::empty({(bsz * layer->GetSeqLength())}, options); - auto norm2Mean = torch::empty({(bsz * layer->GetSeqLength())}, options); - auto norm3Var = torch::empty({(bsz * layer->GetSeqLength())}, options); - auto norm3Mean = torch::empty({(bsz * layer->GetSeqLength())}, options); + auto norm2Var = torch::empty({(bsz * seq_len)}, options); + auto norm2Mean = torch::empty({(bsz * seq_len)}, options); + auto norm3Var = torch::empty({(bsz * seq_len)}, options); + auto norm3Mean = torch::empty({(bsz * seq_len)}, options); T* inp_norm_ptr = (T*)inp_norm.data_ptr(); T* add_res_ptr = (T*)add_res.data_ptr(); T* q_tf_ptr = (T*)qkv_tf.data_ptr(); T* k_tf_ptr = - q_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(T*)k_tf.data_ptr(); + q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); T* v_tf_ptr = - k_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(T*)v_tf.data_ptr(); + k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); torch::Tensor ff2_inp = - torch::empty({(bsz * layer->GetSeqLength()), output_w.size(1)}, options); + torch::empty({(bsz * seq_len), output_w.size(1)}, options); torch::Tensor gelu_inp = (gelu_checkpoint ? ff2_inp - : torch::empty({(bsz * layer->GetSeqLength()), output_w.size(1)}, options)); + : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); auto ff1_inp = torch::empty_like(input); T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); torch::Tensor soft_out = torch::empty( - {(bsz * layer->GetNumHeads() * layer->GetSeqLength()), layer->GetSeqLength()}, options); + {(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); torch::Tensor ctx_bufB = (attn_dropout_checkpoint ? soft_out : torch::empty( - {(bsz * layer->GetNumHeads() * layer->GetSeqLength()), layer->GetSeqLength()}, + {(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); T* soft_out_ptr = (T*)soft_out.data_ptr(); T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index fc8b8cc7e210..52b79c5e1871 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -109,7 +109,7 @@ def create_models(ds_config): num_hidden_layers=ds_config.num_hidden_layers, num_attention_heads=ds_config.heads, batch_size=ds_config.batch_size, - intermediate_size=ds_config.intermediate_size, + intermediate_size=4 * ds_config.hidden_size, hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, @@ -130,12 +130,12 @@ def create_models(ds_config): weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, + nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, ds_config.hidden_size))) weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append( nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) + 4 * ds_config.hidden_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -145,7 +145,7 @@ def create_models(ds_config): for i in range(4): biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) + biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) biases[5].data.zero_() biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[6].data.zero_() @@ -174,7 +174,7 @@ def set_seed(seed): torch.manual_seed(seed) -def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): +def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) @@ -183,10 +183,12 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(bsz, - ds_config.max_seq_length, + seq_len, #ds_config.max_seq_length, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(bsz, 1, 1, ds_config.max_seq_length, **kwargs) + input_mask = torch.randn(bsz, 1, 1, + seq_len, #ds_config.max_seq_length, + **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -209,8 +211,10 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): [ (64,1024,128,16,3,True,False), (64,1024,128,16,3,True,True), + (64,1024,128,16,3,True,True), (8,1024,384,16,3,True,False), (8,1024,384,16,3,True,True), + (8,1024,384,16,3,True,True), (8,1024,512,16,3,True,False), (8,1024,512,16,3,True,True), (64,1024,128,16,3,False,False), @@ -242,8 +246,7 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size - ds_config.max_seq_length = seq_len + ds_config.max_seq_length = 128 #seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -252,7 +255,7 @@ def test_forward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_forward(ds_config, atol=2e-2) + run_forward(ds_config, seq_len, atol=2e-2) @pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', @@ -279,7 +282,6 @@ def test_forward_with_small_bsz(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -289,7 +291,7 @@ def test_forward_with_small_bsz(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_forward(ds_config, atol=2e-2, test_bsz=small_bsz) + run_forward(ds_config, seq_len, atol=2e-2, test_bsz=small_bsz) @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ @@ -314,7 +316,6 @@ def test_forward_stochastic(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.intermediate_size = hidden_size ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -325,4 +326,4 @@ def test_forward_stochastic(batch_size, ds_config.fp16 = use_fp16 ds_config.stochastic_mode = True - run_forward(ds_config, atol=7e-2) + run_forward(ds_config, seq_len, atol=7e-2) From c896b5c067b8d16ff91265c06ef55a44b039e9d0 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 14 Aug 2020 23:55:28 +0000 Subject: [PATCH 3/8] fixing the strided-gemm bug --- csrc/includes/context.h | 2 +- csrc/includes/dropout.h | 0 csrc/includes/softmax.h | 0 csrc/includes/strided_batch_gemm.h | 7 ++++++- csrc/transformer/ds_transformer_cuda.cpp | 3 +-- tests/unit/test_cuda_forward.py | 2 +- 6 files changed, 9 insertions(+), 5 deletions(-) mode change 100644 => 100755 csrc/includes/context.h mode change 100644 => 100755 csrc/includes/dropout.h mode change 100644 => 100755 csrc/includes/softmax.h mode change 100644 => 100755 csrc/includes/strided_batch_gemm.h diff --git a/csrc/includes/context.h b/csrc/includes/context.h old mode 100644 new mode 100755 index 573ed00de38d..1e4820177c5d --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -69,7 +69,7 @@ class Context { if (!_workspace) { assert(_workspace == nullptr); cudaMalloc(&_workspace, size); - } else if (_workSpaceSize < size) { + } else if (_workSpaceSize != size) { cudaFree(_workspace); cudaMalloc(&_workspace, size); } diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h old mode 100644 new mode 100755 diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h old mode 100644 new mode 100755 diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h old mode 100644 new mode 100755 index 29d08e04f7e7..4d0113e8208a --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -38,7 +38,12 @@ class StridedBatchGemm { gemm_algos(algos) { } - void SetConfig(int m, int n, int k) { m = m; n = n; k = k; } + void SetConfig(int mm, int nn, int kk) + { + m = mm; + n = nn; + k = kk; + } }; StridedBatchGemm(const Config& config) : _config(config) {} diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index cef94ba536f1..ddae855559dc 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -711,8 +711,7 @@ std::vector ds_transformer_forward(int layer_id, if(input.size(1) != seq_len) { printf("Info: changing sequence-length from %d to %d \n", seq_len, input.size(1)); - - seq_len = input.size(1); + seq_len = input.size(1); layer->SetSeqLength(seq_len); } diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 52b79c5e1871..028fb3b57fb5 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -246,7 +246,7 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.max_seq_length = 128 #seq_len + ds_config.max_seq_length = 128 #seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 From aad95c404c44b4f446c1dbb96d52c8422d87a2f3 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 15 Aug 2020 00:09:16 +0000 Subject: [PATCH 4/8] precommit --- csrc/includes/dropout.h | 2 +- csrc/includes/ds_transformer_cuda.h | 7 +- csrc/includes/gelu.h | 6 +- csrc/includes/normalize_layer.h | 37 +++---- csrc/includes/strided_batch_gemm.h | 10 +- csrc/transformer/ds_transformer_cuda.cpp | 92 +++++++++-------- csrc/transformer/gelu_kernels.cu | 14 +-- csrc/transformer/normalize_kernels.cu | 120 +++++------------------ deepspeed/ops/transformer/transformer.py | 44 +++++---- tests/unit/test_cuda_forward.py | 2 +- 10 files changed, 132 insertions(+), 202 deletions(-) mode change 100755 => 100644 csrc/includes/dropout.h mode change 100755 => 100644 csrc/includes/ds_transformer_cuda.h mode change 100755 => 100644 csrc/includes/strided_batch_gemm.h mode change 100755 => 100644 csrc/transformer/gelu_kernels.cu mode change 100755 => 100644 csrc/transformer/normalize_kernels.cu diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h old mode 100755 new mode 100644 index 0d7d6ab28401..ae72a6215c99 --- a/csrc/includes/dropout.h +++ b/csrc/includes/dropout.h @@ -18,7 +18,7 @@ class Dropout { } float RATIO() const { return training ? ratio : 0.0; } - inline void SetDim(uint32_t d) { dim = d; } + inline void SetDim(uint32_t d) { dim = d; } }; Dropout(const Config& config) : _config(config), _mask(nullptr) {} diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h old mode 100755 new mode 100644 index 2f25e6d43da1..644181a77feb --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -122,13 +122,16 @@ class BertTransformerLayer { void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, uint8_t* layer_output_dropout_mask_ptr, - T*, T*, T*, T*); + T*, + T*, + T*, + T*); inline int GetBatchSize() const { return _batch_size; } inline int GetNumHeads() const { return _heads; } inline int GetSeqLength() const { return _seq_length; } - void SetSeqLength(int seq_len); + void SetSeqLength(int seq_len); inline int GetHiddenSize() const { return _hidden_size; } void SetTrainingMode(bool training); diff --git a/csrc/includes/gelu.h b/csrc/includes/gelu.h index 8ae2c82e3b56..7dd3b91b5f17 100644 --- a/csrc/includes/gelu.h +++ b/csrc/includes/gelu.h @@ -28,14 +28,12 @@ class Gelu { T* output, cudaStream_t stream) { - launch_bias_gelu( - input_buf, bias, output, _config.intermediate_size, bsz, stream); + launch_bias_gelu(input_buf, bias, output, _config.intermediate_size, bsz, stream); } void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream) { - launch_d_gelu( - d_output, input_buf, bias, _config.intermediate_size, bsz, stream); + launch_d_gelu(d_output, input_buf, bias, _config.intermediate_size, bsz, stream); } private: diff --git a/csrc/includes/normalize_layer.h b/csrc/includes/normalize_layer.h index 1b74cf168cc7..bfe84636ddb9 100644 --- a/csrc/includes/normalize_layer.h +++ b/csrc/includes/normalize_layer.h @@ -18,11 +18,7 @@ class Normalize_Layer { float epsilon; bool training; bool useMean; - Config(uint32_t batch, - uint32_t seq, - uint32_t h, - bool training, - bool useMean = true) + Config(uint32_t batch, uint32_t seq, uint32_t h, bool training, bool useMean = true) : batchSize(batch), seqLength(seq), hiddenDim(h), @@ -33,13 +29,14 @@ class Normalize_Layer { } }; - Normalize_Layer(Config config) : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr) - {} + Normalize_Layer(Config config) + : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr) + { + } - ~Normalize_Layer() - {} + ~Normalize_Layer() {} - void ForwardCheckpoint(int bsz, //batch * seq + void ForwardCheckpoint(int bsz, // batch * seq T* vals, const T* residual, const T* gamma, @@ -124,7 +121,7 @@ class Normalize_Layer { bsz, config_.hiddenDim, stream, - !config_.useMean, + !config_.useMean, betta); } @@ -174,19 +171,23 @@ class Normalize_Layer { bsz, config_.hiddenDim, stream, - !config_.useMean, + !config_.useMean, betta); } inline bool UseMean() const { return config_.useMean; } - inline void SetVar(T *variance) { - if (!variance) { throw std::runtime_error("Normalize variance is null."); } - vars = variance; } + inline void SetVar(T* variance) + { + if (!variance) { throw std::runtime_error("Normalize variance is null."); } + vars = variance; + } - inline void SetMean(T *mean) { - if (!mean) { throw std::runtime_error("Normalize mean is null."); } - means = mean; } + inline void SetMean(T* mean) + { + if (!mean) { throw std::runtime_error("Normalize mean is null."); } + means = mean; + } private: Config config_; diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h old mode 100755 new mode 100644 index 4d0113e8208a..0c093f88417c --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -38,11 +38,11 @@ class StridedBatchGemm { gemm_algos(algos) { } - void SetConfig(int mm, int nn, int kk) - { - m = mm; - n = nn; - k = kk; + void SetConfig(int mm, int nn, int kk) + { + m = mm; + n = nn; + k = kk; } }; diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index ddae855559dc..7850f60be675 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -204,8 +204,6 @@ void BertTransformerLayer::Forward(int bsz, bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } - - if (_pre_or_postLayerNorm) _qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); else @@ -291,7 +289,8 @@ void BertTransformerLayer::Forward(int bsz, _norm_layer3.ForwardCheckpoint( bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else - _norm_layer3.Forward(bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + _norm_layer3.Forward( + bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } } @@ -387,7 +386,8 @@ void BertTransformerLayer::Backward(int bsz, ? buf_0 : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - if (_gelu_checkpoint) _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); + if (_gelu_checkpoint) + _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); _ff2.Backward(bsz_seq, layer_dropout_buf, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), @@ -561,8 +561,10 @@ template void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, uint8_t* layer_output_dropout_mask_ptr, - T *norm2Var, T* norm2Mean, - T *norm3Var, T* norm3Mean) + T* norm2Var, + T* norm2Mean, + T* norm3Var, + T* norm3Mean) { _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); @@ -577,18 +579,17 @@ void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_ template void BertTransformerLayer::SetSeqLength(int seq_len) { - _seq_length = seq_len; + _seq_length = seq_len; - _softmax.SetSeqlen(_seq_length); - _attn_prob_dropout.SetDimension(_seq_length); - _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); - _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); + _softmax.SetSeqlen(_seq_length); + _attn_prob_dropout.SetDimension(_seq_length); + _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); + _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); - Context::Instance().GenWorkSpace(get_workspace_size( - _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint)); + Context::Instance().GenWorkSpace(get_workspace_size( + _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint)); } - template int create_transformer_layer(int layer_id, int batch_size, @@ -708,11 +709,10 @@ std::vector ds_transformer_forward(int layer_id, std::static_pointer_cast>(s_transformer_layers[layer_id]); int seq_len = layer->GetSeqLength(); - if(input.size(1) != seq_len) - { - printf("Info: changing sequence-length from %d to %d \n", seq_len, input.size(1)); - seq_len = input.size(1); - layer->SetSeqLength(seq_len); + if (input.size(1) != seq_len) { + printf("Info: changing sequence-length from %d to %d \n", seq_len, input.size(1)); + seq_len = input.size(1); + layer->SetSeqLength(seq_len); } auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); @@ -721,8 +721,7 @@ std::vector ds_transformer_forward(int layer_id, auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, - uint8_options); + torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); auto attn_output_dropout_mask = torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); auto layer_output_dropout_mask = @@ -736,31 +735,24 @@ std::vector ds_transformer_forward(int layer_id, T* inp_norm_ptr = (T*)inp_norm.data_ptr(); T* add_res_ptr = (T*)add_res.data_ptr(); T* q_tf_ptr = (T*)qkv_tf.data_ptr(); - T* k_tf_ptr = - q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); - T* v_tf_ptr = - k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); + T* k_tf_ptr = q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); + T* v_tf_ptr = k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); - torch::Tensor ff2_inp = - torch::empty({(bsz * seq_len), output_w.size(1)}, options); + torch::Tensor ff2_inp = torch::empty({(bsz * seq_len), output_w.size(1)}, options); torch::Tensor gelu_inp = - (gelu_checkpoint - ? ff2_inp - : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); + (gelu_checkpoint ? ff2_inp : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); auto ff1_inp = torch::empty_like(input); T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); - torch::Tensor soft_out = torch::empty( - {(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); + torch::Tensor soft_out = + torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); torch::Tensor ctx_bufB = (attn_dropout_checkpoint ? soft_out - : torch::empty( - {(bsz * layer->GetNumHeads() * seq_len), seq_len}, - options)); + : torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); T* soft_out_ptr = (T*)soft_out.data_ptr(); T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); @@ -768,10 +760,10 @@ std::vector ds_transformer_forward(int layer_id, layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)norm2Var.data_ptr(), - (T*)norm2Mean.data_ptr(), - (T*)norm3Var.data_ptr(), - (T*)norm3Mean.data_ptr()); + (T*)norm2Var.data_ptr(), + (T*)norm2Mean.data_ptr(), + (T*)norm3Var.data_ptr(), + (T*)norm3Mean.data_ptr()); layer->Forward(bsz, input_ptr, @@ -814,8 +806,10 @@ std::vector ds_transformer_forward(int layer_id, attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, - norm2Var, norm2Mean, - norm3Var, norm3Mean}; + norm2Var, + norm2Mean, + norm3Var, + norm3Mean}; } template @@ -834,10 +828,10 @@ std::vector ds_transformer_backward(int layer_id, const torch::Tensor& attn_prob_dropout_mask, const torch::Tensor& attn_output_dropout_mask, const torch::Tensor& layer_output_dropout_mask, - const torch::Tensor& norm2Var, - const torch::Tensor& norm2Mean, - const torch::Tensor& norm3Var, - const torch::Tensor& norm3Mean, + const torch::Tensor& norm2Var, + const torch::Tensor& norm2Mean, + const torch::Tensor& norm3Var, + const torch::Tensor& norm3Mean, const torch::Tensor& input, const torch::Tensor& input_mask, const torch::Tensor& attn_qkvw, @@ -884,7 +878,7 @@ std::vector ds_transformer_backward(int layer_id, std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); - + auto grad_input = torch::empty_like(input); auto grad_attn_qkvw = torch::empty_like(attn_qkvw); auto grad_attn_qkvb = torch::empty_like(attn_qkvb); @@ -945,8 +939,10 @@ std::vector ds_transformer_backward(int layer_id, layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)norm2Var.data_ptr(), (T*)norm2Mean.data_ptr(), - (T*)norm3Var.data_ptr(), (T*)norm3Mean.data_ptr()); + (T*)norm2Var.data_ptr(), + (T*)norm2Mean.data_ptr(), + (T*)norm3Var.data_ptr(), + (T*)norm3Mean.data_ptr()); layer->Backward(bsz, grad_output_ptr, diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu old mode 100755 new mode 100644 index d5858ff4ebef..209b64a90902 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -304,10 +304,13 @@ void launch_gelu(const T* input, gelu_kernel<<>>(input, output, intermediate_size); } -template void -launch_bias_gelu(const float*, const float*, float*, int, int, cudaStream_t); -template void -launch_bias_gelu<__half>(const __half*, const __half*, __half*, int, int, cudaStream_t); +template void launch_bias_gelu(const float*, const float*, float*, int, int, cudaStream_t); +template void launch_bias_gelu<__half>(const __half*, + const __half*, + __half*, + int, + int, + cudaStream_t); template void launch_gelu(const float*, float*, int, int, cudaStream_t); template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t); @@ -329,5 +332,4 @@ void launch_d_gelu(T* d_output, } template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); -template void -launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t); +template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t); diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu old mode 100755 new mode 100644 index f47a66785130..4431aeb3d8e5 --- a/csrc/transformer/normalize_kernels.cu +++ b/csrc/transformer/normalize_kernels.cu @@ -511,59 +511,23 @@ void launch_bias_residual_layer_norm(float* vals, // There are some limitations to call below functions, now just enumerate the situations. if (hidden_dim == 768) - fused_bias_residual_layer_norm<768, 3><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<768, 3><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 512) - fused_bias_residual_layer_norm<512, 2><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<512, 2><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 1024) - fused_bias_residual_layer_norm<1024, 4><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<1024, 4><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 1536) - fused_bias_residual_layer_norm<1536, 6><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<1536, 6><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 2048) - fused_bias_residual_layer_norm<2048, 8><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<2048, 8><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 2560) - fused_bias_residual_layer_norm<2560, 10><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<2560, 10><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else throw std::runtime_error("Unsupport hidden_dim."); } @@ -588,59 +552,23 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, // There are some limitations to call below functions, now just enumerate the situations. if (hidden_dim == 768) - fused_bias_residual_layer_norm<384, 3><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<384, 3><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 512) - fused_bias_residual_layer_norm<256, 2><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<256, 2><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 1024) - fused_bias_residual_layer_norm<512, 4><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<512, 4><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 1536) - fused_bias_residual_layer_norm<768, 6><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<768, 6><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 2048) - fused_bias_residual_layer_norm<1024, 8><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<1024, 8><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else if (hidden_dim == 2560) - fused_bias_residual_layer_norm<1280, 10><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars); + fused_bias_residual_layer_norm<1280, 10><<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars); else throw std::runtime_error("Unsupport hidden_dim."); } diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index d676bef160fb..445f45282292 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -185,27 +185,29 @@ def forward(ctx, attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, - norm2_var, norm2_mean, - norm3_var, norm3_mean) = forward_func(config.layer_id, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config.training, - config.pre_layer_norm, - config.attn_dropout_checkpoint, - config.normalize_invertible, - config.gelu_checkpoint) + norm2_var, + norm2_mean, + norm3_var, + norm3_mean) = forward_func(config.layer_id, + input, + input_mask, + attn_qkvw, + attn_qkvb, + attn_ow, + attn_ob, + attn_nw, + attn_nb, + inter_w, + inter_b, + output_w, + output_b, + norm_w, + norm_b, + config.training, + config.pre_layer_norm, + config.attn_dropout_checkpoint, + config.normalize_invertible, + config.gelu_checkpoint) # For testing only. if grads is not None: diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 028fb3b57fb5..52b79c5e1871 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -246,7 +246,7 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.max_seq_length = 128 #seq_len + ds_config.max_seq_length = 128 #seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 From dc3e064331e90f9e7ad4cd98f40013b3f9029962 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 17 Aug 2020 17:58:11 +0000 Subject: [PATCH 5/8] fixing cublas error --- csrc/includes/context.h | 2 +- csrc/includes/ds_transformer_cuda.h | 2 +- csrc/includes/strided_batch_gemm.h | 1 + csrc/transformer/cublas_wrappers.cu | 28 ++++++++++++++++++++---- csrc/transformer/ds_transformer_cuda.cpp | 9 ++++---- 5 files changed, 31 insertions(+), 11 deletions(-) diff --git a/csrc/includes/context.h b/csrc/includes/context.h index 1e4820177c5d..573ed00de38d 100755 --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -69,7 +69,7 @@ class Context { if (!_workspace) { assert(_workspace == nullptr); cudaMalloc(&_workspace, size); - } else if (_workSpaceSize != size) { + } else if (_workSpaceSize < size) { cudaFree(_workspace); cudaMalloc(&_workspace, size); } diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h index 644181a77feb..907f27bad31f 100644 --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -131,7 +131,7 @@ class BertTransformerLayer { inline int GetNumHeads() const { return _heads; } inline int GetSeqLength() const { return _seq_length; } - void SetSeqLength(int seq_len); + void SetSeqLength(int seq_len, int bsz); inline int GetHiddenSize() const { return _hidden_size; } void SetTrainingMode(bool training); diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 0c093f88417c..44a1b313b986 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -3,6 +3,7 @@ #include #include #include +#include "context.h" template class StridedBatchGemm { diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 7b0016bcae5e..84530cfbf9e9 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -34,7 +34,12 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } return 0; @@ -74,7 +79,12 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } return 0; @@ -122,7 +132,12 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } return 0; @@ -170,7 +185,12 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index 7850f60be675..5e9467ad5f66 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -577,7 +577,7 @@ void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_ } template -void BertTransformerLayer::SetSeqLength(int seq_len) +void BertTransformerLayer::SetSeqLength(int seq_len, int bsz) { _seq_length = seq_len; @@ -586,8 +586,8 @@ void BertTransformerLayer::SetSeqLength(int seq_len) _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); - Context::Instance().GenWorkSpace(get_workspace_size( - _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint)); + Context::Instance().GenWorkSpace( + get_workspace_size(bsz, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint)); } template @@ -710,9 +710,8 @@ std::vector ds_transformer_forward(int layer_id, int seq_len = layer->GetSeqLength(); if (input.size(1) != seq_len) { - printf("Info: changing sequence-length from %d to %d \n", seq_len, input.size(1)); seq_len = input.size(1); - layer->SetSeqLength(seq_len); + layer->SetSeqLength(seq_len, bsz); } auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); From 5963997b172e8f62bfa4b36cee5e579dc4383131 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Wed, 19 Aug 2020 20:09:48 +0000 Subject: [PATCH 6/8] fixing parameters and naming --- csrc/includes/dropout.h | 7 +- csrc/includes/ds_transformer_cuda.h | 12 +- csrc/includes/gelu.h | 7 +- csrc/includes/softmax.h | 2 +- csrc/transformer/cublas_wrappers.cu | 8 +- csrc/transformer/ds_transformer_cuda.cpp | 268 +++++++++++------------ deepspeed/ops/transformer/transformer.py | 63 +++--- 7 files changed, 177 insertions(+), 190 deletions(-) diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h index ae72a6215c99..f6e32af5608d 100644 --- a/csrc/includes/dropout.h +++ b/csrc/includes/dropout.h @@ -9,13 +9,10 @@ class Dropout { public: struct Config { float ratio; - uint32_t batch, dim; + uint32_t dim; bool training; - Config(float r, uint32_t batch, uint32_t dim) - : ratio(r), batch(batch), dim(dim), training(true) - { - } + Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {} float RATIO() const { return training ? ratio : 0.0; } inline void SetDim(uint32_t d) { dim = d; } diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h index 907f27bad31f..3fac43e4c6a5 100644 --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -122,10 +122,10 @@ class BertTransformerLayer { void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, uint8_t* layer_output_dropout_mask_ptr, - T*, - T*, - T*, - T*); + T* layer_norm_var, + T* layer_norm_mean, + T* attn_layer_norm_var, + T* attn_layer_norm_mean); inline int GetBatchSize() const { return _batch_size; } inline int GetNumHeads() const { return _heads; } @@ -156,8 +156,8 @@ class BertTransformerLayer { // layers FeedForward _qkv_linear; FeedForward _attn_out_linear; - Normalize_Layer _norm_layer2; - Normalize_Layer _norm_layer3; + Normalize_Layer _attn_layer_norm; + Normalize_Layer _layer_norm; Normalize_Layer* _last_normalize; FeedForward _ff1, _ff2; Softmax _softmax; diff --git a/csrc/includes/gelu.h b/csrc/includes/gelu.h index 7dd3b91b5f17..41cf6f2a68a7 100644 --- a/csrc/includes/gelu.h +++ b/csrc/includes/gelu.h @@ -9,13 +9,8 @@ template class Gelu { public: struct Config { - uint32_t batch_size; - uint32_t seq_length; uint32_t intermediate_size; - Config(uint32_t batch, uint32_t seq, uint32_t inter_size) - : batch_size(batch), seq_length(seq), intermediate_size(inter_size) - { - } + Config(uint32_t inter_size) : intermediate_size(inter_size) {} }; Gelu(const Config& config) : _config(config) {} diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h index 86e11ae7e54c..2bc2f67059cf 100755 --- a/csrc/includes/softmax.h +++ b/csrc/includes/softmax.h @@ -53,7 +53,7 @@ class Softmax { inline size_t GetSeqLength() const { return config_.seq_length; } - inline void SetSeqlen(size_t seq_len) { config_.seq_length = seq_len; } + inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; } private: Config config_; diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 84530cfbf9e9..3128e6c0fd64 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -35,7 +35,7 @@ int cublas_gemm_ex(cublasHandle_t handle, if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, n, k, @@ -80,7 +80,7 @@ int cublas_gemm_ex(cublasHandle_t handle, if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, n, k, @@ -133,7 +133,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, n, k, @@ -186,7 +186,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error : %d) \n", + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, n, k, diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index 5e9467ad5f66..9d275916c68e 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -78,16 +78,16 @@ BertTransformerLayer::BertTransformerLayer(int layer_id, hidden_size, hidden_size, gemm_algos[0])), - _norm_layer2(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - true, - !normalize_invertible)), - _norm_layer3(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - true, - !normalize_invertible)), + _attn_layer_norm(typename Normalize_Layer::Config(batch_size, + seq_length, + hidden_size, + true, + !normalize_invertible)), + _layer_norm(typename Normalize_Layer::Config(batch_size, + seq_length, + hidden_size, + true, + !normalize_invertible)), _ff1(typename FeedForward::Config(batch_size * seq_length, intermediate_size, hidden_size, @@ -97,16 +97,10 @@ BertTransformerLayer::BertTransformerLayer(int layer_id, intermediate_size, gemm_algos[2])), _softmax(typename Softmax::Config(batch_size, num_heads, seq_length)), - _gelu(typename Gelu::Config(_batch_size, _seq_length, intermediate_size)), - _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, - _batch_size * _heads * _seq_length, - _seq_length)), - _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, - _batch_size * _seq_length, - _hidden_size)), - _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, - _batch_size * _seq_length, - _hidden_size)), + _gelu(typename Gelu::Config(_intermediate_size)), + _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, _seq_length)), + _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), + _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), _attn_scores(typename StridedBatchGemm::Config(_batch_size * _heads, _seq_length, _seq_length, @@ -195,12 +189,12 @@ void BertTransformerLayer::Forward(int bsz, int bsz_seq = bsz * _seq_length; if (_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.ForwardCheckpoint( + if (_layer_norm.UseMean()) + _layer_norm.ForwardCheckpoint( bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else - _norm_layer3.Forward( + _layer_norm.Forward( bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } @@ -243,18 +237,18 @@ void BertTransformerLayer::Forward(int bsz, bsz_seq, add_res_ptr, ff1_inp_ptr, input_ptr, attn_ob_ptr, _stream); if (_pre_or_postLayerNorm) { - if (_norm_layer2.UseMean()) - _norm_layer2.ForwardCheckpoint( + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.ForwardCheckpoint( bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); else - _norm_layer2.Forward( + _attn_layer_norm.Forward( bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); } else { - if (_norm_layer2.UseMean()) - _norm_layer2.ForwardCheckpoint( + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.ForwardCheckpoint( bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); else - _norm_layer2.Forward( + _attn_layer_norm.Forward( bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); } @@ -285,11 +279,11 @@ void BertTransformerLayer::Forward(int bsz, bsz_seq, inp_norm_ptr, out_ptr, ff1_inp_ptr, output_b_ptr, _stream); if (!_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.ForwardCheckpoint( + if (_layer_norm.UseMean()) + _layer_norm.ForwardCheckpoint( bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else - _norm_layer3.Forward( + _layer_norm.Forward( bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } } @@ -355,26 +349,26 @@ void BertTransformerLayer::Backward(int bsz, int bsz_heads = bsz * _heads; if (!_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - inp_norm_ptr); + if (_layer_norm.UseMean()) + _layer_norm.Backward(bsz_seq, + grad_output_ptr, + norm_w_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + buf_1, + inp_norm_ptr); else - _norm_layer3.Backward(bsz_seq, - grad_output_ptr, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - output_ptr); + _layer_norm.Backward(bsz_seq, + grad_output_ptr, + norm_w_ptr, + norm_b_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + buf_1, + output_ptr); } if (_pre_or_postLayerNorm) @@ -415,49 +409,49 @@ void BertTransformerLayer::Backward(int bsz, launch_fused_add2(buf_2, buf_3, buf_1, bsz, _seq_length, _hidden_size, _stream); if (_pre_or_postLayerNorm) { - if (_norm_layer2.UseMean()) - _norm_layer2.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.BackwardFusedAdd(bsz_seq, + buf_3, + grad_output_ptr, + attn_nw_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + add_res_ptr); else - _norm_layer2.BackwardFusedAdd(bsz_seq, - buf_3, - grad_output_ptr, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); + _attn_layer_norm.BackwardFusedAdd(bsz_seq, + buf_3, + grad_output_ptr, + attn_nw_ptr, + attn_nb_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + ff1_inp_ptr); } else { - if (_norm_layer2.UseMean()) - _norm_layer2.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.Backward(bsz_seq, + buf_2, + attn_nw_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + add_res_ptr); else - _norm_layer2.Backward(bsz_seq, - buf_2, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); + _attn_layer_norm.Backward(bsz_seq, + buf_2, + attn_nw_ptr, + attn_nb_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + ff1_inp_ptr); } _attn_output_dropout.Backward(bsz_seq, buf_2, buf_0, _stream); @@ -522,28 +516,28 @@ void BertTransformerLayer::Backward(int bsz, buf_2); if (_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - input_ptr); + if (_layer_norm.UseMean()) + _layer_norm.BackwardFusedAdd(bsz_seq, + buf_2, + buf_0, + norm_w_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + grad_input_ptr, + input_ptr); else - _norm_layer3.BackwardFusedAdd(bsz_seq, - buf_2, - buf_0, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - inp_norm_ptr); + _layer_norm.BackwardFusedAdd(bsz_seq, + buf_2, + buf_0, + norm_w_ptr, + norm_b_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + grad_input_ptr, + inp_norm_ptr); } else launch_fused_add2(grad_input_ptr, buf_2, buf_0, bsz, _seq_length, _hidden_size, _stream); } @@ -561,19 +555,19 @@ template void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, uint8_t* layer_output_dropout_mask_ptr, - T* norm2Var, - T* norm2Mean, - T* norm3Var, - T* norm3Mean) + T* attn_layer_norm_var, + T* attn_layer_norm_mean, + T* layer_norm_var, + T* layer_norm_mean) { _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); - _norm_layer2.SetVar(norm2Var); - _norm_layer2.SetMean(norm2Mean); - _norm_layer3.SetVar(norm3Var); - _norm_layer3.SetMean(norm3Mean); + _attn_layer_norm.SetVar(attn_layer_norm_var); + _attn_layer_norm.SetMean(attn_layer_norm_mean); + _layer_norm.SetVar(layer_norm_var); + _layer_norm.SetMean(layer_norm_mean); } template @@ -581,7 +575,7 @@ void BertTransformerLayer::SetSeqLength(int seq_len, int bsz) { _seq_length = seq_len; - _softmax.SetSeqlen(_seq_length); + _softmax.SetSeqLength(_seq_length); _attn_prob_dropout.SetDimension(_seq_length); _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); @@ -726,10 +720,10 @@ std::vector ds_transformer_forward(int layer_id, auto layer_output_dropout_mask = torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); - auto norm2Var = torch::empty({(bsz * seq_len)}, options); - auto norm2Mean = torch::empty({(bsz * seq_len)}, options); - auto norm3Var = torch::empty({(bsz * seq_len)}, options); - auto norm3Mean = torch::empty({(bsz * seq_len)}, options); + auto attn_layer_norm_var = torch::empty({(bsz * seq_len)}, options); + auto attn_layer_norm_mean = torch::empty({(bsz * seq_len)}, options); + auto layer_norm_var = torch::empty({(bsz * seq_len)}, options); + auto layer_norm_mean = torch::empty({(bsz * seq_len)}, options); T* inp_norm_ptr = (T*)inp_norm.data_ptr(); T* add_res_ptr = (T*)add_res.data_ptr(); @@ -759,10 +753,10 @@ std::vector ds_transformer_forward(int layer_id, layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)norm2Var.data_ptr(), - (T*)norm2Mean.data_ptr(), - (T*)norm3Var.data_ptr(), - (T*)norm3Mean.data_ptr()); + (T*)attn_layer_norm_var.data_ptr(), + (T*)attn_layer_norm_mean.data_ptr(), + (T*)layer_norm_var.data_ptr(), + (T*)layer_norm_mean.data_ptr()); layer->Forward(bsz, input_ptr, @@ -805,10 +799,10 @@ std::vector ds_transformer_forward(int layer_id, attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, - norm2Var, - norm2Mean, - norm3Var, - norm3Mean}; + attn_layer_norm_var, + attn_layer_norm_mean, + layer_norm_var, + layer_norm_mean}; } template @@ -827,10 +821,10 @@ std::vector ds_transformer_backward(int layer_id, const torch::Tensor& attn_prob_dropout_mask, const torch::Tensor& attn_output_dropout_mask, const torch::Tensor& layer_output_dropout_mask, - const torch::Tensor& norm2Var, - const torch::Tensor& norm2Mean, - const torch::Tensor& norm3Var, - const torch::Tensor& norm3Mean, + const torch::Tensor& attn_layer_norm_var, + const torch::Tensor& attn_layer_norm_mean, + const torch::Tensor& layer_norm_var, + const torch::Tensor& layer_norm_mean, const torch::Tensor& input, const torch::Tensor& input_mask, const torch::Tensor& attn_qkvw, @@ -938,10 +932,10 @@ std::vector ds_transformer_backward(int layer_id, layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), (uint8_t*)layer_output_dropout_mask.data_ptr(), - (T*)norm2Var.data_ptr(), - (T*)norm2Mean.data_ptr(), - (T*)norm3Var.data_ptr(), - (T*)norm3Mean.data_ptr()); + (T*)attn_layer_norm_var.data_ptr(), + (T*)attn_layer_norm_mean.data_ptr(), + (T*)layer_norm_var.data_ptr(), + (T*)layer_norm_mean.data_ptr()); layer->Backward(bsz, grad_output_ptr, diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 445f45282292..7dc66c562e51 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -185,29 +185,29 @@ def forward(ctx, attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, - norm2_var, - norm2_mean, - norm3_var, - norm3_mean) = forward_func(config.layer_id, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config.training, - config.pre_layer_norm, - config.attn_dropout_checkpoint, - config.normalize_invertible, - config.gelu_checkpoint) + attn_layer_norm_var, + attn_layer_norm_mean, + layer_norm_var, + layer_norm_mean) = forward_func(config.layer_id, + input, + input_mask, + attn_qkvw, + attn_qkvb, + attn_ow, + attn_ob, + attn_nw, + attn_nb, + inter_w, + inter_b, + output_w, + output_b, + norm_w, + norm_b, + config.training, + config.pre_layer_norm, + config.attn_dropout_checkpoint, + config.normalize_invertible, + config.gelu_checkpoint) # For testing only. if grads is not None: @@ -284,6 +284,9 @@ def forward(ctx, if not config.normalize_invertible: ctx.add_res = add_res + ctx.attn_layer_norm_mean = attn_layer_norm_mean + ctx.layer_norm_mean = layer_norm_mean + ctx.ff1_inp = ff1_inp if not config.gelu_checkpoint: ctx.gelu_inp = gelu_inp @@ -292,10 +295,8 @@ def forward(ctx, ctx.attn_prob_dropout_mask = attn_prob_dropout_mask ctx.attn_output_dropout_mask = attn_output_dropout_mask ctx.layer_output_dropout_mask = layer_output_dropout_mask - ctx.norm2_var = norm2_var - ctx.norm2_mean = norm2_mean - ctx.norm3_var = norm3_var - ctx.norm3_mean = norm3_mean + ctx.attn_layer_norm_var = attn_layer_norm_var + ctx.layer_norm_var = layer_norm_var return output @@ -372,10 +373,10 @@ def backward(ctx, grad_output): ctx.attn_prob_dropout_mask, ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, - ctx.norm2_var, - ctx.norm2_mean, - ctx.norm3_var, - ctx.norm3_mean, + ctx.attn_layer_norm_var, + ctx.attn_layer_norm_mean, + ctx.layer_norm_var, + ctx.layer_norm_mean, (ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, From 00439bd1d9fa7d4271e30cdb8587cfb4a06dd631 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 1 Sep 2020 23:45:13 +0000 Subject: [PATCH 7/8] fixing the dynamic sequence-length with any arbitrary 8-aligned seq-len --- csrc/includes/context.h | 2 +- csrc/transformer/general_kernels.cu | 26 ++++--- csrc/transformer/softmax_kernels.cu | 104 +++++++++++++--------------- tests/unit/test_cuda_backward.py | 8 +-- tests/unit/test_cuda_forward.py | 9 ++- 5 files changed, 75 insertions(+), 74 deletions(-) mode change 100644 => 100755 csrc/transformer/general_kernels.cu mode change 100644 => 100755 csrc/transformer/softmax_kernels.cu diff --git a/csrc/includes/context.h b/csrc/includes/context.h index 573ed00de38d..e05c41dc1d0a 100755 --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -29,7 +29,7 @@ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) #define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 4096 +#define DS_MAXIMUM_NUM_BLOCKS 262144 inline int DS_GET_BLOCKS(const int N) { diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu old mode 100644 new mode 100755 index 0ce280a702ab..a5d7e506013c --- a/csrc/transformer/general_kernels.cu +++ b/csrc/transformer/general_kernels.cu @@ -14,15 +14,19 @@ __global__ void column_sum_reduce(const T* __restrict__ inp, cg::thread_block_tile g = cg::tiled_partition(b); int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; + int y_stride = width * TILE_DIM; float localSum = 0; // Loop across matrix height - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - localSum += (float)inp[offset]; - offset += y_stride; + if(idx < width) + { + int offset = threadIdx.y * width + idx; + for (int r = threadIdx.y; r < rows; r += TILE_DIM) { + localSum += (float)inp[offset]; + offset += y_stride; + } } tile[threadIdx.x][threadIdx.y] = localSum; @@ -40,7 +44,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp, if (threadIdx.x == 0) { int pos = blockIdx.x * TILE_DIM + threadIdx.y; - out[pos] = sum; + if(pos < (rows * width))out[pos] = sum; } } @@ -58,10 +62,10 @@ void launch_fuse_transpose_bias_kernel(const float* inp, int cols, cudaStream_t stream) { - assert(rows % TILE_DIM == 0); - assert(cols % TILE_DIM == 0); + //assert(rows % TILE_DIM == 0); + //assert(cols % TILE_DIM == 0); - dim3 grid_dim(cols / TILE_DIM); + dim3 grid_dim((cols-1) / TILE_DIM + 1); dim3 block_dim(TILE_DIM, TILE_DIM); column_sum_reduce<<>>(inp, out, rows, cols); @@ -74,10 +78,10 @@ void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, int cols, cudaStream_t stream) { - assert(rows % TILE_DIM == 0); - assert(cols % TILE_DIM == 0); + //assert(rows % TILE_DIM == 0); + //assert(cols % TILE_DIM == 0); - dim3 grid_dim(cols / TILE_DIM); + dim3 grid_dim((cols-1) / TILE_DIM + 1); dim3 block_dim(TILE_DIM, TILE_DIM); column_sum_reduce<__half><<>>(inp, out, rows, cols); diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu old mode 100644 new mode 100755 index 8e2b86901609..d912af9b4a94 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -1,5 +1,6 @@ #include "custom_cuda_layers.h" #include "general_kernels.h" +#include namespace cg = cooperative_groups; @@ -282,7 +283,7 @@ __global__ void attn_softmax(__half* vals, } template -void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t, bool); +void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t); template <> void launch_attn_softmax(float* vals, @@ -294,11 +295,12 @@ void launch_attn_softmax(float* vals, { const int threads = 128; int seq_length4 = sequence_length / 4; - int seq2 = sequence_length * seq_length4; int block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? + (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : + 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -330,8 +332,10 @@ void launch_attn_softmax(float* vals, else { const int threads = 256; block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? + (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : + 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -362,11 +366,12 @@ void launch_attn_softmax<__half>(__half* vals, { const int threads = 128; int seq_length4 = sequence_length / 4; - int seq2 = sequence_length * seq_length4; int block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? + (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : + 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -399,8 +404,10 @@ void launch_attn_softmax<__half>(__half* vals, else { const int threads = 256; block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? + (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : + 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -531,55 +538,42 @@ void launch_attn_softmax_backward_v2(T* out_grad, int seq_length, cudaStream_t stream) { - if ((seq_length % WARP_SIZE) != 0 || seq_length > 2048) - throw std::runtime_error("Invalid sequence length found in softmax backward."); const int warps_per_block = 4; dim3 grid_dim(batch_size * heads * seq_length / warps_per_block); dim3 block_dim(WARP_SIZE, warps_per_block); - switch (seq_length) { - case 32: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 64: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 128: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 256: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 384: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 512: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 768: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 1024: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 2048: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - default: - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); - } + if(seq_length <= 32) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 64) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 128) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 256) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 384) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 512) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 768) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 1024) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if(seq_length <= 2048) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else + throw std::runtime_error( + std::string("Special sequence length found in softmax backward, seq_length: ") + + std::to_string(seq_length)); } template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 8e678334ab91..3d114fa0c8dc 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -252,10 +252,10 @@ def run_backward(ds_config, atol=1e-2, verbose=False): @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', [ - (3,1024,128,16,24,True,False, 0.05), - (3,1024,128,16,24,True,True, 0.05), - (3,1024,128,16,24,False,False, 0.1), - (3,1024,128,16,24,False,True, 0.2), + (3,1024,120,16,24,True,False, 0.05), + (3,1024,120,16,24,True,True, 0.05), + (3,1024,56,16,24,False,False, 0.1), + (3,1024,56,16,24,False,True, 0.2), ]) # yapf: disable def test_backward(batch_size, hidden_size, diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 52b79c5e1871..5c21a73dbe91 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -211,14 +211,17 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): [ (64,1024,128,16,3,True,False), (64,1024,128,16,3,True,True), - (64,1024,128,16,3,True,True), (8,1024,384,16,3,True,False), (8,1024,384,16,3,True,True), (8,1024,384,16,3,True,True), + (8,1024,120,16,3,True,False), + (8,1024,120,16,3,True,True), (8,1024,512,16,3,True,False), (8,1024,512,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), + (64,1024,56,16,3,False,False), + (64,1024,56,16,3,False,True), + (64,1024,24,16,3,False,False), + (64,1024,24,16,3,False,True), (8,1024,384,16,3,False,False), (8,1024,384,16,3,False,True), (8,1024,512,16,3,False,False), From 643c33b1c39152e8f81fecb2a8beda8fb3eeff6a Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 11 Sep 2020 17:02:09 +0000 Subject: [PATCH 8/8] running precommit --- csrc/transformer/general_kernels.cu | 19 +++++++------- csrc/transformer/softmax_kernels.cu | 39 ++++++++++++----------------- 2 files changed, 25 insertions(+), 33 deletions(-) mode change 100755 => 100644 csrc/transformer/general_kernels.cu mode change 100755 => 100644 csrc/transformer/softmax_kernels.cu diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu old mode 100755 new mode 100644 index a5d7e506013c..ad97a9a4793c --- a/csrc/transformer/general_kernels.cu +++ b/csrc/transformer/general_kernels.cu @@ -14,14 +14,13 @@ __global__ void column_sum_reduce(const T* __restrict__ inp, cg::thread_block_tile g = cg::tiled_partition(b); int idx = blockDim.x * blockIdx.x + threadIdx.x; - + int y_stride = width * TILE_DIM; float localSum = 0; // Loop across matrix height - if(idx < width) - { + if (idx < width) { int offset = threadIdx.y * width + idx; for (int r = threadIdx.y; r < rows; r += TILE_DIM) { localSum += (float)inp[offset]; @@ -44,7 +43,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp, if (threadIdx.x == 0) { int pos = blockIdx.x * TILE_DIM + threadIdx.y; - if(pos < (rows * width))out[pos] = sum; + if (pos < (rows * width)) out[pos] = sum; } } @@ -62,10 +61,10 @@ void launch_fuse_transpose_bias_kernel(const float* inp, int cols, cudaStream_t stream) { - //assert(rows % TILE_DIM == 0); - //assert(cols % TILE_DIM == 0); + // assert(rows % TILE_DIM == 0); + // assert(cols % TILE_DIM == 0); - dim3 grid_dim((cols-1) / TILE_DIM + 1); + dim3 grid_dim((cols - 1) / TILE_DIM + 1); dim3 block_dim(TILE_DIM, TILE_DIM); column_sum_reduce<<>>(inp, out, rows, cols); @@ -78,10 +77,10 @@ void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, int cols, cudaStream_t stream) { - //assert(rows % TILE_DIM == 0); - //assert(cols % TILE_DIM == 0); + // assert(rows % TILE_DIM == 0); + // assert(cols % TILE_DIM == 0); - dim3 grid_dim((cols-1) / TILE_DIM + 1); + dim3 grid_dim((cols - 1) / TILE_DIM + 1); dim3 block_dim(TILE_DIM, TILE_DIM); column_sum_reduce<__half><<>>(inp, out, rows, cols); diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu old mode 100755 new mode 100644 index d912af9b4a94..582da4829f47 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -1,6 +1,6 @@ +#include #include "custom_cuda_layers.h" #include "general_kernels.h" -#include namespace cg = cooperative_groups; @@ -297,9 +297,7 @@ void launch_attn_softmax(float* vals, int seq_length4 = sequence_length / 4; int block_compute_size = - (seq_length4 < threads ? - (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : - 1); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -332,9 +330,8 @@ void launch_attn_softmax(float* vals, else { const int threads = 256; block_compute_size = - (seq_length4 < threads ? - (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : - 1); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) + : 1); dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -368,9 +365,7 @@ void launch_attn_softmax<__half>(__half* vals, int seq_length4 = sequence_length / 4; int block_compute_size = - (seq_length4 < threads ? - (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : - 1); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -404,9 +399,8 @@ void launch_attn_softmax<__half>(__half* vals, else { const int threads = 256; block_compute_size = - (seq_length4 < threads ? - (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : - 1); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) + : 1); dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -538,36 +532,35 @@ void launch_attn_softmax_backward_v2(T* out_grad, int seq_length, cudaStream_t stream) { - const int warps_per_block = 4; dim3 grid_dim(batch_size * heads * seq_length / warps_per_block); dim3 block_dim(WARP_SIZE, warps_per_block); - if(seq_length <= 32) + if (seq_length <= 32) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 64) + else if (seq_length <= 64) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 128) + else if (seq_length <= 128) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 256) + else if (seq_length <= 256) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 384) + else if (seq_length <= 384) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 512) + else if (seq_length <= 512) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 768) + else if (seq_length <= 768) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 1024) + else if (seq_length <= 1024) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); - else if(seq_length <= 2048) + else if (seq_length <= 2048) softmax_backward_kernel_v2 <<>>(out_grad, soft_inp, seq_length); else