create branch for public release (#489)

* Merge chatgpt v2 to v3 - finalized (#484) * [squash] staging chatgpt v1 (#463) Co-authored-by: Reza Yazdani <[email protected]> Co-authored-by: yaozhewei <[email protected]> Co-authored-by: Tunji Ruwase <[email protected]> * [partial] formatting fixes * quantizer fixes * fix for bert tests * formatting fixes * re-enable _param_slice_mappings in z2 * Enable the QKV requires_grad when in training mode (#466) Co-authored-by: Jeff Rasley <[email protected]> * fixes for attention enable_training flag * commit to trigger CI * fix for distil-bert param * fixes for training context errors * remove reza's qkv-optimization (#469) Co-authored-by: Jeff Rasley <[email protected]> * Chatgpt - Fuse lora params at HybridEngine (#472) Co-authored-by: Jeff Rasley <[email protected]> * add option to enable non-pin mode (#473) * Chatgpt - fuse lora non pinned case (#474) * Fix fuse/unfuse lora for Z3 and non-pinned parameter * unfuse_lora_weight for non-pinned case * fix the multiple issue for lora parameters * formatting * fuse lora only when available --------- Co-authored-by: Jeff Rasley <[email protected]> * Chatgpt/release inference cache (#475) * Fix fuse/unfuse lora for Z3 and non-pinned parameter * unfuse_lora_weight for non-pinned case * release/retake the inference cache after/before generate * remove duplicated _fuse_lora function * fix formatting * fix hybrid-engine config issue * update formatting * Chatgpt - fuse qkv v2 (#478) Co-authored-by: Jeff Rasley <[email protected]> * ChatGPT: Refactor Hybrid Engine Config (#477) Co-authored-by: Lok Chand Koppaka <[email protected]> * Inference Workspace Tweaks (#481) * Safety checks around inference workspace allocation, extra flushing * Formatting fixes * Merge fix * Chatgpt/inference tp (#480) * Update the merged-QKV weights only if there is difference with the model parameter * remove the hard-coded size * always reset qkv params to updated ones after running step * Add the infernce-tp group and tensor sharding to run inference in model-parallel mode * optimize the gather/mp-sharding part * Add hybrid_engine changes * fix config issue * Formatting fixes. Reset_qkv duplicate removal. * fix bloom container. * fix format. --------- Co-authored-by: Ammar Ahmad Awan <[email protected]> Co-authored-by: Lok Chand Koppaka <[email protected]> * fix formatting * more clean-up --------- Co-authored-by: Jeff Rasley <[email protected]> Co-authored-by: yaozhewei <[email protected]> Co-authored-by: Tunji Ruwase <[email protected]> Co-authored-by: Masahiro Tanaka <[email protected]> Co-authored-by: Michael Wyatt <[email protected]> Co-authored-by: Lok Chand Koppaka <[email protected]> Co-authored-by: Connor Holmes <[email protected]> Co-authored-by: Ammar Ahmad Awan <[email protected]> * fix a bug on lora-fusion (#487) * Cholmes/v3 workspace bugfixes (#488) * Miscellaneous workspace fixes, new config param * Fix typo --------- Co-authored-by: Reza Yazdani <[email protected]> Co-authored-by: Jeff Rasley <[email protected]> Co-authored-by: yaozhewei <[email protected]> Co-authored-by: Tunji Ruwase <[email protected]> Co-authored-by: Masahiro Tanaka <[email protected]> Co-authored-by: Michael Wyatt <[email protected]> Co-authored-by: Lok Chand Koppaka <[email protected]> Co-authored-by: Connor Holmes <[email protected]>
microsoft · Apr 10, 2023 · 0ce390e · 0ce390e
1 parent 4d27225
commit 0ce390e
Show file tree

Hide file tree

Showing 48 changed files with 1,448 additions and 408 deletions.
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
@@ -26,7 +26,6 @@ def __init__(self):
         # put all valid class name <--> class type mapping into class_dict
         op_builder_dir = self.op_builder_dir()
         op_builder_module = importlib.import_module(op_builder_dir)
-
         for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
             # avoid self references
             if module_name != 'all_ops' and module_name != 'builder':

diff --git a/csrc/includes/context.h b/csrc/includes/context.h
@@ -44,9 +44,9 @@ inline int DS_GET_BLOCKS(const int N)
         1);
 }
 
-class Context {
+class TrainingContext {
 public:
-    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
+    TrainingContext() : _workspace(nullptr), _seed(42), _curr_offset(0)
     {
         curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
         curandSetPseudoRandomGeneratorSeed(_gen, 123);
@@ -57,15 +57,15 @@ class Context {
         }
     }
 
-    virtual ~Context()
+    virtual ~TrainingContext()
     {
         cublasDestroy(_cublasHandle);
         cudaFree(_workspace);
     }
 
-    static Context& Instance()
+    static TrainingContext& Instance()
     {
-        static Context _ctx;
+        static TrainingContext _ctx;
         return _ctx;
     }
 

diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h
@@ -39,8 +39,8 @@ class Adagrad_Optimizer {
         cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
         cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
 
-        _streams[0] = Context::Instance().GetCurrentStream();
-        _streams[1] = Context::Instance().GetNewStream();
+        _streams[0] = TrainingContext::Instance().GetCurrentStream();
+        _streams[1] = TrainingContext::Instance().GetNewStream();
         _buf_index = false;
 #endif
     }

diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h
@@ -54,8 +54,8 @@ class Adam_Optimizer {
         cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
         cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
 
-        _streams[0] = Context::Instance().GetCurrentStream();
-        _streams[1] = Context::Instance().GetNewStream();
+        _streams[0] = TrainingContext::Instance().GetCurrentStream();
+        _streams[1] = TrainingContext::Instance().GetNewStream();
         _buf_index = false;
 #endif
     }

diff --git a/csrc/quantization/fake_quantizer.cu b/csrc/quantization/fake_quantizer.cu
@@ -457,7 +457,7 @@ void launch_sr_fake_quantize_kernel(T* vals,
     dim3 grid_dim(group_num);
 
     uint64_t inc = total_count / grid_dim.x / block_dim.x;
-    std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
+    std::pair<uint64_t, uint64_t> seed = TrainingContext::Instance().IncrementOffset(inc);
 
     sr_fake_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
         vals, (total_count / group_num) / 4, group_num, num_bits, seed);
@@ -1011,7 +1011,7 @@ void launch_sr_fake_quantize_kernel_asym(T* vals,
     dim3 grid_dim(group_num);
 
     uint64_t inc = total_count / grid_dim.x / block_dim.x;
-    std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
+    std::pair<uint64_t, uint64_t> seed = TrainingContext::Instance().IncrementOffset(inc);
 
     sr_fake_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
         vals, (total_count / group_num) / 4, group_num, num_bits, seed);

diff --git a/csrc/transformer/dropout_kernels.cu b/csrc/transformer/dropout_kernels.cu
@@ -278,7 +278,7 @@ void launch_dropout(T* out,
         grid_dim.x <<= 1;
     }
     uint64_t inc = total_count / grid_dim.x / block_dim.x;
-    std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
+    std::pair<uint64_t, uint64_t> seed = TrainingContext::Instance().IncrementOffset(inc);
     if (bwd)
         dropout_kernel_bwd<<<grid_dim, block_dim, 0, stream>>>(
             total_count, ratio, vals, out, mask, seed);
@@ -625,7 +625,7 @@ void launch_dropout(T* out,
     dim3 block_dim = DS_CUDA_NUM_THREADS;
 
     uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x;
-    std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
+    std::pair<uint64_t, uint64_t> seed = TrainingContext::Instance().IncrementOffset(inc);
 
     dropout_kernel<<<grid_dim, block_dim, 0, stream>>>(
         total_count, dim, ratio, bias, out, mask, seed);
@@ -847,7 +847,7 @@ void launch_dropout(T* out,
     dim3 block_dim = DS_CUDA_NUM_THREADS;
 
     uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x;
-    std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
+    std::pair<uint64_t, uint64_t> seed = TrainingContext::Instance().IncrementOffset(inc);
 
     dropout_kernel<<<grid_dim, block_dim, 0, stream>>>(
         total_count, dim, ratio, input, residual, bias, out, mask, seed);

diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp
@@ -78,8 +78,8 @@ BertTransformerLayer<T>::BertTransformerLayer(unsigned layer_id,
       _normalize_invertible(normalize_invertible),
       _gelu_checkpoint(gelu_checkpoint),
       _stochastic_mode(stochastic_mode),
-      _stream(Context::Instance().GetCurrentStream()),
-      _cublasHandle(Context::Instance().GetCublasHandle()),
+      _stream(TrainingContext::Instance().GetCurrentStream()),
+      _cublasHandle(TrainingContext::Instance().GetCublasHandle()),
       _qkv_linear(typename FeedForward<T>::Config(batch_size * seq_length,
                                                   3 * hidden_size,
                                                   hidden_size,
@@ -183,7 +183,7 @@ void BertTransformerLayer<T>::Forward(unsigned bsz,
 
     if (!_stochastic_mode) cudaStreamSynchronize(_stream);
 
-    T* workspace = static_cast<T*>(Context::Instance().GetWorkSpace());
+    T* workspace = static_cast<T*>(TrainingContext::Instance().GetWorkSpace());
     size_t small_buf_size = bsz * _seq_length * _hidden_size;
     T* buf_0 = workspace;
     T* buf_1 = buf_0 + small_buf_size;
@@ -343,7 +343,7 @@ void BertTransformerLayer<T>::Backward(unsigned bsz,
 
     if (!_stochastic_mode) cudaStreamSynchronize(_stream);
 
-    T* workspace = static_cast<T*>(Context::Instance().GetWorkSpace());
+    T* workspace = static_cast<T*>(TrainingContext::Instance().GetWorkSpace());
     size_t small_buf_size = bsz * _seq_length * _hidden_size;
     T* buf_0 = workspace;
     T* buf_1 = buf_0 + small_buf_size;
@@ -609,25 +609,26 @@ int create_transformer_layer(unsigned layer_id,
                              bool gelu_checkpoint,
                              bool stochastic_mode)
 {
-    Context::Instance().SetSeed(seed);
-    Context::Instance().TestGemmFP16(
+    TrainingContext::Instance().SetSeed(seed);
+    TrainingContext::Instance().TestGemmFP16(
         test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads);
 
-    auto layer = std::make_shared<BertTransformerLayer<T>>(layer_id,
-                                                           batch_size,
-                                                           hidden_dim,
-                                                           num_heads,
-                                                           intermediate_size,
-                                                           init_seq_length,
-                                                           attn_dropout_ratio,
-                                                           hidden_dropout_ratio,
-                                                           layer_norm_eps,
-                                                           pre_or_postLayerNorm,
-                                                           Context::Instance().GetGemmAlgos(),
-                                                           attn_dropout_checkpoint,
-                                                           normalize_invertible,
-                                                           gelu_checkpoint,
-                                                           stochastic_mode);
+    auto layer =
+        std::make_shared<BertTransformerLayer<T>>(layer_id,
+                                                  batch_size,
+                                                  hidden_dim,
+                                                  num_heads,
+                                                  intermediate_size,
+                                                  init_seq_length,
+                                                  attn_dropout_ratio,
+                                                  hidden_dropout_ratio,
+                                                  layer_norm_eps,
+                                                  pre_or_postLayerNorm,
+                                                  TrainingContext::Instance().GetGemmAlgos(),
+                                                  attn_dropout_checkpoint,
+                                                  normalize_invertible,
+                                                  gelu_checkpoint,
+                                                  stochastic_mode);
 
     s_transformer_layers[layer_id] = layer;
 
@@ -725,7 +726,7 @@ std::vector<torch::Tensor> ds_transformer_forward(unsigned layer_id,
                                                          layer->IsTrainingMode(),
                                                          layer->GeluCheckpoint())},
                                   options);
-    Context::Instance().SetWorkSpace((T*)workspace.data_ptr());
+    TrainingContext::Instance().SetWorkSpace((T*)workspace.data_ptr());
 
     auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output);
     auto add_res = (normalize_invertible ? inp_norm : torch::empty_like(input));
@@ -909,7 +910,7 @@ std::vector<torch::Tensor> ds_transformer_backward(unsigned layer_id,
                                                          layer->IsTrainingMode(),
                                                          layer->GeluCheckpoint())},
                                   options);
-    Context::Instance().SetWorkSpace((T*)workspace.data_ptr());
+    TrainingContext::Instance().SetWorkSpace((T*)workspace.data_ptr());
 
     auto grad_input = torch::empty_like(input);
     auto grad_attn_qkvw = torch::empty_like(attn_qkvw);