Commit ba20266

Merge branch 'main' into akoumparouli/nemo_ux_changes
akoumpa authored Jun 6, 2024
2 parents 00168d6 + cc24d38 commit ba20266
Showing 24 changed files with 1,039 additions and 1,251 deletions.
66 changes: 65 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -758,7 +758,7 @@ jobs:
RUNNER: self-hosted-azure
SCRIPT: |
cd tools/ctc_segmentation && \
$=`date +"%Y-%m-%d-%T"` && \
TIME=`date +"%Y-%m-%d-%T"` && \
/bin/bash run_segmentation.sh \
--MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \
--DATA_DIR=/home/TestData/ctc_segmentation/eng \
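Note: the one-line change above replaces `$=` (not a valid shell variable name, so the old line never stored anything) with `TIME=`date +"%Y-%m-%d-%T"``, so the timestamp from `date` is actually captured for the rest of the script.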
@@ -3006,6 +3006,68 @@ jobs:
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_RAG_Pipeline_Indexing:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/rag/rag_indexing.py \
trainer.num_nodes=1 \
trainer.devices=1 \
trainer.precision='bf16-mixed' \
indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \
indexing.embedder.embed_batch_size=128 \
indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \
indexing.data.chunk_size=256 \
indexing.data.chunk_overlap=10 \
indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index'
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_RAG_Pipeline_Generating:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/rag/rag_generating.py \
trainer.devices=1 \
trainer.precision='bf16-mixed' \
indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \
indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \
generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \
generating.inference.tokens_to_generate=50 \
generating.inference.greedy=False \
generating.inference.temperature=1.0 \
generating.query='Which art schools did I applied to?'
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_BioMegatron_Bert_NER_Task:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4923,6 +4985,8 @@ jobs:
- L2_Megatron_Core_Bert_Pretraining_and_Resume_Training
- L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training
- L2_Megatron_RETRO_Pretraining_and_Resume_Training
- L2_RAG_Pipeline_Indexing
- L2_RAG_Pipeline_Generating
- L2_BioMegatron_Bert_NER_Task
- L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
1 change: 1 addition & 0 deletions Dockerfile.ci
@@ -47,6 +47,7 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n
"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
"llama-index==0.10.43" \
-r tools/ctc_segmentation/requirements.txt \
".[all]"

@@ -60,6 +60,7 @@
from tqdm import tqdm

from nemo.collections.multimodal.data.neva.neva_dataset import make_supervised_data_module
from nemo.collections.multimodal.parts.utils import create_image_processor
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.utils import logging

@@ -254,8 +255,14 @@ def main():
nemo_config.model.data.conv_template = args.conv_template
nemo_config.model.data.image_aspect_ratio = args.image_aspect_ratio

tokenizer = get_nmt_tokenizer(library="sentencepiece", tokenizer_model=args.tokenizer_path,)
train_ds = make_supervised_data_module(tokenizer=tokenizer, model_cfg=nemo_config.model)["train_dataset"]
tokenizer = get_nmt_tokenizer(
library="sentencepiece",
tokenizer_model=args.tokenizer_path,
)
image_processor = create_image_processor(nemo_config.model.mm_cfg)
train_ds = make_supervised_data_module(
tokenizer=tokenizer, image_processor=image_processor, model_cfg=nemo_config.model
)["train_dataset"]
train_dl = DataLoader(train_ds, num_workers=32, collate_fn=None, shuffle=False)
# Example shape: {'tokens': torch.Size([1, 344]), 'labels': torch.Size([1, 344]), 'image': torch.Size([1, 1, 3, 224, 224])}

107 changes: 52 additions & 55 deletions nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -29,7 +29,7 @@
from omegaconf import DictConfig
from PIL import Image
from torch.utils.data import Dataset, default_collate
from transformers import CLIPImageProcessor
from transformers import CLIPImageProcessor, SiglipImageProcessor

import nemo.collections.multimodal.data.neva.conversation as conversation_lib
from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform
@@ -294,6 +294,42 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in
return sources


def process_image(processor, image, image_aspect_ratio="square"):
if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor):
# image processor from HF
if image_aspect_ratio == 'keep':
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 448, 224
shortest_edge = int(min(max_len / aspect_ratio, min_len))
image = processor.preprocess(
image, return_tensors='pt', do_center_crop=False, size={"shortest_edge": shortest_edge}
)['pixel_values'][0]
elif image_aspect_ratio == 'pad':

def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result

image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
else:
image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
else:
assert image_aspect_ratio == 'square', 'The NeMo image transform requires `image_aspect_ratio` to be set to `square`.'
image = processor(image)
return image
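A hedged usage sketch of the new helper; the checkpoint name and image path are illustrative, not taken from this diff:

from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
image = Image.open("sample.jpg").convert("RGB")  # hypothetical input image

# 'pad' letterboxes the image to a square filled with the processor's mean
# color, then resizes and normalizes it to the model's input resolution.
pixels = process_image(processor, image, image_aspect_ratio="pad")
print(pixels.shape)  # e.g. torch.Size([3, 224, 224]) for this checkpoint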


def preprocess_llama_3(
sources: dict,
tokenizer,
@@ -456,9 +492,11 @@ def preprocess_llama_2(
parts[0] += sep

round_len = len(tokenizer.text_to_ids(rou + conv.sep2))
instruction_len = len(tokenizer.text_to_ids(parts[0])) - 2
if i > 0:
round_len -= 1 # Remove extra token added by sp tokenizer
instruction_len = len(tokenizer.text_to_ids(parts[0])) - 2
else:
instruction_len += 1
target[cur_len : cur_len + instruction_len] = IGNORE_INDEX

cur_len += round_len
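The hunk above appears to fix an off-by-one in the loss mask: `instruction_len` is now initialized for every round, with the first round extended by one token (presumably for the extra token a SentencePiece tokenizer adds only to the first segment), while later rounds compensate by shortening `round_len` instead.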
@@ -723,7 +761,11 @@ def preprocess_nv_dpo(

if i % 2 == 1:
turn['from'] = conv.roles[1]
conv.append_message(turn['from'], turn['value'])
if "label" in turn:
value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value']
else:
value = turn["value"]
conv.append_message(turn['from'], value)
if not turn["value"]:
strip_end_for_inference = (
True # in inference, the current turn is empty, so end tokens need to be stripped.
@@ -765,7 +807,11 @@ def preprocess_nv_dpo(
if len(parts) != 2:
break

instruction_len = len(tokenizer.text_to_ids(parts[0] + sep))
# handle label if exists
labels_match = re.search(rf"{re.escape(DEFAULT_LABELS_TOKEN)}.*?\n", parts[1])
instruction_len = len(
tokenizer.text_to_ids(parts[0] + sep + (parts[1][: labels_match.end()] if labels_match else ""))
)
round_len = len(tokenizer.text_to_ids(rou + conv.sep))
target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
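A small sketch of the label-header masking introduced above; the token value is an assumption for illustration (the real constant lives in the neva conversation module):

import re

DEFAULT_LABELS_TOKEN = "<extra_id_2>"  # assumed value, for illustration only
part = "<extra_id_2>quality:4,helpfulness:3\nSure, here is the answer."

match = re.search(rf"{re.escape(DEFAULT_LABELS_TOKEN)}.*?\n", part)
# The optional label header is folded into the instruction span, so it gets
# masked with IGNORE_INDEX and never contributes to the training loss.
print(repr(part[: match.end()]))  # '<extra_id_2>quality:4,helpfulness:3\n'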

@@ -886,40 +932,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
image = self.image_loader.open_image(image_file)
if image is None:
logging.warning(f"Image {image_file} could not be found!")
if isinstance(self.processor, CLIPImageProcessor):
# image processor from HF
if self.multimodal_cfg['image_aspect_ratio'] == 'keep':
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 448, 224
shortest_edge = int(min(max_len / aspect_ratio, min_len))
image = self.processor.preprocess(
image, return_tensors='pt', do_center_crop=False, size={"shortest_edge": shortest_edge}
)['pixel_values'][0]
elif self.multimodal_cfg['image_aspect_ratio'] == 'pad':

def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result

image = expand2square(image, tuple(int(x * 255) for x in self.processor.image_mean))
image = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
else:
image = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
else:
assert (
self.multimodal_cfg['image_aspect_ratio'] == 'square'
), 'NeMo image transform with setting `image_aspect_ratio` to `square`.'
image = self.processor(image)
image = process_image(self.processor, image, self.multimodal_cfg['image_aspect_ratio'])
images.append(image)
media_tensors = torch.tensor([])
if images:
@@ -1205,30 +1218,14 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
return batch


def make_supervised_data_module(tokenizer, model_cfg) -> Dict:
def make_supervised_data_module(tokenizer, image_processor, model_cfg) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""
data_cfg = model_cfg.data
mm_cfg = model_cfg.mm_cfg
add_extra_token = 1
if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False):
add_extra_token = 0
crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224))
if mm_cfg.vision_encoder.from_hf:
image_processor = CLIPImageProcessor.from_pretrained(
mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16
)
assert crop_size == (
image_processor.crop_size['height'],
image_processor.crop_size['width'],
), f"Crop size {crop_size} does not match the HuggingFace CLIP model's crop size {(image_processor.crop_size['height'], image_processor.crop_size['width'])}"
else:
# TODO(yuya): Fix this hard-code for our own CLIP
image_processor = image_transform(
crop_size,
is_train=False,
mean=None,
std=None,
)

train_dataset = NevaDataset(
tokenizer=tokenizer,
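With this change, image-processor construction moves out of `make_supervised_data_module`: callers now build the processor once (as the conversion script above does via `create_image_processor(nemo_config.model.mm_cfg)`) and pass it in, instead of the data module instantiating a `CLIPImageProcessor` itself.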