Merge main and fix conflicts
Signed-off-by: MaximumEntropy <[email protected]>
MaximumEntropy committed Feb 15, 2022
2 parents 9d4c3aa + 277b088 commit fd8c672
Showing 15 changed files with 318 additions and 123 deletions.
17 changes: 11 additions & 6 deletions Jenkinsfile
@@ -718,9 +718,11 @@ pipeline {
parallel {
stage('SGD-GEN') {
steps {
sh 'cd examples/nlp/dialogue_state_tracking_generative && \
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue_state_tracking_generative && \
python sgd_gen.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\
model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\
model.dataset.dialogues_example_dir=sgd_gen_outputs \
model.dataset.task_name=debug_sample \
trainer.max_steps=1 \
@@ -732,16 +734,18 @@ pipeline {
trainer.val_check_interval=0.0 \
trainer.gpus=[0] \
model.dataset.use_cache=false \
model.tokenizer.special_tokens={pad_token:"endoftext"}\
model.language_model.pretrained_model_name=gpt2 \
model.tokenizer.special_tokens={pad_token:"endoftext"} \
model.tokenizer.tokenizer_name=gpt2 \
model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\
model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \
trainer.accelerator=ddp \
exp_manager=null && \
rm -rf sgd_gen_outputs'
}
}
stage('SGD-GEN Backward compatible with SGDQA') {
steps {
sh 'cd examples/nlp/dialogue_state_tracking_generative && \
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue_state_tracking_generative && \
python sgd_gen.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \
Expand All @@ -758,7 +762,7 @@ pipeline {
model.language_model.pretrained_model_name=bert-base-cased \
trainer.accelerator=ddp \
exp_manager=null && \
rm -rf sgd_gen_bert_outputs'
rm -rf sgd_gen_bert_outputs && TRANSFORMERS_OFFLINE=1'
}
}
}
@@ -1189,7 +1193,8 @@ pipeline {
sh 'data_dir=/home/TestData/nlp/token_classification_punctuation && \
usual_data=${data_dir}/wmt_wiki_10000 && \
tarred_data=${data_dir}/train_tarred && \
output=${data_dir}/output && \
TIME=`date +"%Y-%m-%d-%T"` \
output=${data_dir}/output_${TIME} && \
tokens_in_batch=2000 && \
max_seq_length=512 && \
lm_model=distilbert-base-uncased && \
@@ -29,10 +29,12 @@ trainer:
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it
checkpoint_callback: False # Provided by exp_manager
logger: False # Provided by exp_manager

logger: False # Provided by exp_manager
model:
tensor_model_parallel_size: 1
nemo_path: null # filename to save the model and associated artifacts to .nemo file
library: huggingface # huggingface or megatron
tokenizer:
tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
vocab_file: null # path to vocab file
15 changes: 13 additions & 2 deletions examples/nlp/dialogue_state_tracking_generative/sgd_gen.py
@@ -106,18 +106,29 @@

from nemo.collections.nlp.models.dialogue_state_tracking_generative.dialogue_gpt_model import DialogueGPTModel
from nemo.collections.nlp.models.dialogue_state_tracking_sgdqa.sgdqa_model import SGDQAModel
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.app_state import AppState
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="dialogue_config")
def main(cfg: DictConfig) -> None:
pl.seed_everything(42)
logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
trainer = pl.Trainer(**cfg.trainer)

plugin = NLPDDPPlugin()
trainer = pl.Trainer(**cfg.trainer, plugins=plugin)

exp_manager(trainer, cfg.get("exp_manager", None))

app_state = AppState()
if cfg.model.tensor_model_parallel_size > 1:
app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

if 'bert' in cfg.model.language_model.pretrained_model_name:
model_class = SGDQAModel
elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
@@ -155,7 +166,7 @@ def main(cfg: DictConfig) -> None:

if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
gpu = 1 if cfg.trainer.gpus != 0 else 0
trainer = pl.Trainer(gpus=gpu)
trainer = pl.Trainer(gpus=gpu, plugins=plugin, precision=16)
model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
if model.prepare_test(trainer):
trainer.test(model)
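For context, the new model-parallel branch above maps the trainer's local rank onto a tensor-model-parallel rank before the model is built. Below is a minimal sketch of the rank arithmetic such a helper is commonly assumed to perform; the function name is hypothetical and NeMo's compute_model_parallel_rank may differ in detail.

# Sketch only: assumed modular mapping from local rank to tensor-model-parallel rank.
def model_parallel_rank(local_rank: int, model_parallel_size: int) -> int:
    # GPUs on a node are grouped into sets of `model_parallel_size`; the position
    # inside a group is that process's model-parallel rank.
    return local_rank % model_parallel_size

assert model_parallel_rank(local_rank=5, model_parallel_size=2) == 1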
6 changes: 6 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -107,6 +107,12 @@ model:
masked_lm_prob: 0.15
short_seq_prob: 0.1
dataset_type: 't5'
max_ngram_size: 10
mean_ngram_size: null
geometric_dist: True
permutation: False
whole_word_masking: True
favor_longer_ngrams: False

optim:
name: fused_adam
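The new options above control T5-style span corruption: masked spans run up to max_ngram_size tokens, with lengths drawn from a geometric distribution when geometric_dist is True. A minimal, self-contained sketch of that sampling step follows; it is illustrative only, and the probability parameter and helper name are assumptions, not NeMo's data pipeline.

# Sketch only: sample a masked-span length from a capped geometric distribution.
import numpy as np

def sample_ngram_size(max_ngram_size: int = 10, p: float = 0.2, rng=None) -> int:
    rng = rng or np.random.default_rng()
    # Geometric sampling favours short spans; clip at the configured maximum.
    return int(min(rng.geometric(p), max_ngram_size))

print([sample_ngram_size() for _ in range(5)])  # e.g. [2, 1, 4, 1, 7]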
7 changes: 6 additions & 1 deletion nemo/collections/asr/models/label_models.py
@@ -228,12 +228,17 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
"embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()),
}

@typecheck()
def forward_for_export(self, processed_signal, processed_signal_len):
encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len)
logits, embs = self.decoder(encoder_output=encoded, length=length)
return logits, embs

@typecheck()
def forward(self, input_signal, input_signal_length):
processed_signal, processed_signal_len = self.preprocessor(
input_signal=input_signal, length=input_signal_length,
)

encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len)
logits, embs = self.decoder(encoder_output=encoded, length=length)
return logits, embs
7 changes: 2 additions & 5 deletions nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py
@@ -191,9 +191,6 @@ def rnnt_loss_gpu(
### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ###
acts, acts_shape = rnnt_helper.flatten_tensor(acts)

### REPRESENT THE CUDA ARRAY INTERFACE OF COSTS VECTOR ###
costs_repr = cuda.as_cuda_array(costs, sync=False) # NO COPY OF DATA, JUST CHANGE REPRESENTATION

wrapper = gpu_rnnt.GPURNNT(
minibatch=minibatch_size,
maxT=maxT,
@@ -210,7 +207,7 @@
if grads is None:
status = wrapper.score_forward(
acts=acts.data,
costs=costs_repr,
costs=costs.data,
pad_labels=labels.data,
label_lengths=label_lengths.data,
input_lengths=input_lengths.data,
@@ -226,7 +223,7 @@
status = wrapper.cost_and_grad(
acts=acts.data,
grads=grads.data,
costs=costs_repr,
costs=costs.data,
pad_labels=labels.data,
label_lengths=label_lengths.data,
input_lengths=input_lengths.data,
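The change above drops the explicit cuda.as_cuda_array() wrapper and passes the torch tensor itself (costs.data) to the Numba kernel, relying on the CUDA Array Interface. A minimal sketch of why both forms work follows; it is illustrative only, requires a CUDA-capable GPU, and the kernel is a stand-in rather than NeMo code.

# Sketch only: a torch CUDA tensor can be handed to a Numba kernel directly or via
# an explicit Numba device-array view; both share the same underlying memory.
import torch
from numba import cuda

@cuda.jit
def negate_inplace(x):
    i = cuda.grid(1)
    if i < x.shape[0]:
        x[i] = -x[i]

costs = torch.ones(8, device='cuda')
negate_inplace[1, 32](costs)                      # passed directly (no copy)
negate_inplace[1, 32](cuda.as_cuda_array(costs))  # or via an explicit view
print(costs)                                      # back to all ones after two negations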
@@ -27,13 +27,13 @@
# limitations under the License.

import multiprocessing
from typing import Optional
from typing import Optional, Tuple

import numba
import torch
from numba import cuda

from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants
from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants, rnnt_helper
from nemo.collections.asr.parts.numba.rnnt_loss.utils.cuda_utils import gpu_rnnt_kernel, reduce


@@ -83,6 +83,7 @@ def __init__(

if num_threads > 0:
numba.set_num_threads(min(multiprocessing.cpu_count(), num_threads))
self.num_threads_ = numba.get_num_threads()
else:
self.num_threads_ = numba.get_num_threads()

@@ -147,27 +148,12 @@ def compute_cost_and_score(
An enum that either represents a successful RNNT operation or failure.
"""
training = grads is not None
used_offset = 0

# // denom
denom = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_]
used_offset += self.maxT_ * self.maxU_ * self.minibatch_

# // alphas & betas
alphas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_]
used_offset += self.maxT_ * self.maxU_ * self.minibatch_
betas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_]
used_offset += self.maxT_ * self.maxU_ * self.minibatch_

# // logllh
llForward = self.gpu_workspace[used_offset : used_offset + self.minibatch_]
used_offset += self.minibatch_
llBackward = self.gpu_workspace[used_offset : used_offset + self.minibatch_]
used_offset += self.minibatch_

if training:
grads *= 0.0 # zero grads

used_offset, (denom, alphas, betas, llForward, llBackward) = self._prepare_workspace()

######## START EXECUTION ########
self.log_softmax(acts, denom)

@@ -226,16 +212,19 @@ def compute_cost_and_score(
self.clamp_,
)

# // cost
costs.copy_to_device(llForward, stream=self.stream_)
# // cost copy, negate (for log likelihood) and update with additional regularizers
# This needs to be done via CUDA, because we used temporary memory llForward
# passed to alpha, which was updated with log likelihoods.
# But copying this data into a pytorch pointer is more difficult (numba api is one way)
# Therefore launch a pointwise CUDA kernel to update the costs inplace from data of llForward
# Then negate to compute the loglikelihood.
threadsperblock = min(costs.shape[0], 32)
blockspergrid = (costs.shape[0] + (threadsperblock - 1)) // threadsperblock
rnnt_helper.compute_costs_data[blockspergrid, threadsperblock, self.stream_, 0](
llForward, costs, self.fastemit_lambda_
)
self.stream_.synchronize()

# compute negative log likelihood.
for mb in range(self.minibatch_):
# Scale llForward by FastEmit lambda
costs[mb] = -costs[mb]
costs[mb] = (1.0 + self.fastemit_lambda_) * costs[mb]

return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS

def cost_and_grad(
@@ -271,3 +260,31 @@ def score_forward(
return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE

return self.compute_cost_and_score(acts, None, costs, pad_labels, label_lengths, input_lengths)

def _prepare_workspace(self) -> (int, Tuple[torch.Tensor]):
"""
Helper method that uses the workspace and constructs slices of it that can be used.
Returns:
An int, representing the offset of the used workspace (practically, the slice of the workspace consumed)
A tuple of tensors representing the shared workspace.
"""
used_offset = 0

# // denom
denom = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_]
used_offset += self.maxT_ * self.maxU_ * self.minibatch_

# // alphas & betas
alphas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_]
used_offset += self.maxT_ * self.maxU_ * self.minibatch_
betas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_]
used_offset += self.maxT_ * self.maxU_ * self.minibatch_

# // logllh
llForward = self.gpu_workspace[used_offset : used_offset + self.minibatch_]
used_offset += self.minibatch_
llBackward = self.gpu_workspace[used_offset : used_offset + self.minibatch_]
used_offset += self.minibatch_

return used_offset, (denom, alphas, betas, llForward, llBackward)
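
To make the slicing in _prepare_workspace easier to follow, here is the same offset bookkeeping demonstrated on a plain CPU tensor. This is a sketch only: the sizes follow the slices above, and the real GPU workspace may reserve additional scratch space.

# Sketch only: partition one flat buffer into denom, alphas, betas and the two
# log-likelihood vectors, exactly as the offsets in _prepare_workspace advance.
import torch

maxT, maxU, minibatch = 4, 3, 2
per_mat = maxT * maxU * minibatch
workspace = torch.zeros(3 * per_mat + 2 * minibatch)

offset = 0
denom = workspace[offset: offset + per_mat]; offset += per_mat
alphas = workspace[offset: offset + per_mat]; offset += per_mat
betas = workspace[offset: offset + per_mat]; offset += per_mat
llForward = workspace[offset: offset + minibatch]; offset += minibatch
llBackward = workspace[offset: offset + minibatch]; offset += minibatch
assert offset == workspace.numel()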
18 changes: 18 additions & 0 deletions nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py
@@ -97,6 +97,24 @@ def log_plus(p1: float, p2: float):
return result


@cuda.jit(device=True, inline=True)
def copy_data_1d(source: torch.Tensor, dest: torch.Tensor, idx: int):
dest[idx] = source[idx]


@cuda.jit()
def compute_costs_data(source: torch.Tensor, dest: torch.Tensor, fastemit_lambda: float):
block = cuda.blockIdx.x
tid = cuda.threadIdx.x
idx = block * cuda.blockDim.x + tid
length = source.shape[0]

if idx < length:
copy_data_1d(source, dest, idx)
dest[idx] *= -1.0
dest[idx] *= 1.0 + fastemit_lambda


def get_workspace_size(
maxT: int, maxU: int, minibatch: int, gpu: bool
) -> (Optional[int], global_constants.RNNTStatus):
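The new compute_costs_data kernel copies the forward log-likelihoods into the costs vector, negates them, and applies the FastEmit scaling in a single pointwise pass. A minimal launch example, mirroring the grid/block sizing used in compute_cost_and_score, is shown below; it is a sketch only and requires a CUDA-capable GPU.

# Sketch only: launch compute_costs_data over a batch-sized vector of log-likelihoods.
import numpy as np
from numba import cuda
from nemo.collections.asr.parts.numba.rnnt_loss.utils.rnnt_helper import compute_costs_data

ll_forward = cuda.to_device(np.array([1.5, 2.0, 0.5], dtype=np.float32))
costs = cuda.device_array(3, dtype=np.float32)
fastemit_lambda = 0.0

threadsperblock = min(costs.shape[0], 32)
blockspergrid = (costs.shape[0] + threadsperblock - 1) // threadsperblock
compute_costs_data[blockspergrid, threadsperblock](ll_forward, costs, fastemit_lambda)
print(costs.copy_to_host())  # [-1.5, -2.0, -0.5] when fastemit_lambda == 0.0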