Merge branch 'main' into gaod/moe/add_expert_tensor_parallesim_support

NVIDIA · Jan 17, 2025 · 712de8f · 712de8f
2 parents 18d9b0e + 0cd990d
commit 712de8f
Show file tree

Hide file tree

Showing 92 changed files with 1,279 additions and 399 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -47,7 +47,7 @@ jobs:
     steps:
         - name: Docker system cleanup
           run: |
-            docker system prune -a --filter "until=48h" --force || true
+            docker system prune -af --filter "until=24h" --force || true
 
         - name: Docker pull image
           run: |

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -2937,7 +2937,7 @@ jobs:
     with:
       RUNNER: self-hosted-azure-gpus-2-h100
       SCRIPT: |
-        CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
+        CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.max_epochs=9999 \
@@ -2965,6 +2965,7 @@ jobs:
         +model.tp_comm_overlap_ag=False \
         +model.tp_comm_overlap_rs=False \
         +model.tp_comm_overlap_disable_qkv=True \
+        +model.attention_backend="unfused" \
         model.peft.peft_scheme="lora" \
         model.peft.lora_tuning.adapter_dim=16 \
         model.peft.lora_tuning.alpha=32 \
@@ -4329,11 +4330,24 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python tests/collections/vlm/neva_train.py \
+        python tests/collections/vlm/test_neva_train.py \
         --devices=1 \
         --max-steps=5 \
         --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }}
 
+  # Runs tests/collections/vlm/test_neva_train.py on one device for 5 steps with
+  # --use_packed_sequence. Selected when the job name is listed in test_to_run
+  # or when the container-setup output `all` is 'true'.
+  L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python tests/collections/vlm/test_neva_train.py \
+        --devices=1 \
+        --max-steps=5 \
+        --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} \
+        --use_packed_sequence
+
   L2_NeMo_2_MLLAMA_MOCK_TRAINING:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
@@ -4342,7 +4356,7 @@ jobs:
       RUNNER: self-hosted-azure
       SCRIPT: |
         TRANSFORMERS_OFFLINE=1 \
-        python tests/collections/vlm/mllama_train.py \
+        python tests/collections/vlm/test_mllama_train.py \
         --devices=1 \
         --max-steps=5 \
         --experiment-dir=/tmp/nemo2_mllama_results/${{ github.run_id }}
@@ -4354,7 +4368,7 @@ jobs:
       with:
         RUNNER: self-hosted-azure
         SCRIPT: |
-          NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
+          python3 tests/collections/llm/megatron_mixtral_pretraining.py \
           --experiment-dir=/tmp/mixtral_pretrain_results \
           --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
 
@@ -4915,6 +4929,36 @@ jobs:
       AFTER_SCRIPT: |
         rm -rf /tmp/nemo2_llava_next_results
 
+  # vLLM export test. Steps, in order:
+  #   1. create_hf_model.py writes a model to /tmp/llama_head64 from the CI Llama
+  #      checkpoint with the config overrides given in --config_updates.
+  #   2. test_hf_import.py imports that model to /tmp/nemo2_ckpt.
+  #   3. nemo_export.py runs with --use_vllm using the /opt/venv interpreter.
+  # AFTER_SCRIPT removes all three temporary directories.
+  L2_NeMo_2_VLLM_EXPORT:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python tests/setup/models/create_hf_model.py \
+          --model_name_or_path /home/TestData/nlp/megatron_llama/llama-ci-hf \
+          --output_dir /tmp/llama_head64 \
+          --config_updates "{\"hidden_size\": 512, \"num_attention_heads\": 4, \"num_key_value_heads\": 4, \"intermediate_size\": 1024, \"head_dim\": 128, \"num_hidden_layers\": 2, \"torch_dtype\": \"float16\" }"
+
+        python tests/collections/llm/test_hf_import.py --hf_model /tmp/llama_head64 --output_path /tmp/nemo2_ckpt
+
+        /opt/venv/bin/python tests/export/nemo_export.py \
+          --min_tps 1 \
+          --max_tps 1 \
+          --use_vllm True \
+          --model_type llama \
+          --max_output_len 128 \
+          --test_deployment True \
+          --model_name nemo2_ckpt \
+          --model_dir /tmp/vllm_from_nemo2 \
+          --checkpoint_dir /tmp/nemo2_ckpt
+
+      AFTER_SCRIPT: |
+        rm -rf /tmp/llama_head64
+        rm -rf /tmp/nemo2_ckpt
+        rm -rf /tmp/vllm_from_nemo2
+
   Nemo_CICD_Test:
     needs:
       - pre-flight
@@ -5030,6 +5074,7 @@ jobs:
       - Speech_Checkpoints_tests
       - L2_Stable_Diffusion_Training
       - L2_NeMo_2_NEVA_MOCK_TRAINING
+      - L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING
       - L2_NeMo_2_MLLAMA_MOCK_TRAINING
       - L2_NeMo_2_GPT_Pretraining_no_transformer_engine
       - L2_NeMo_2_GPT_DDP_Param_Parity_check
@@ -5102,6 +5147,7 @@ jobs:
       - L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
       - L2_HF_Transformer_SFT_FSDP2_2gpu
       - L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
+      - L2_NeMo_2_VLLM_EXPORT
     if: always()
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml
@@ -1,73 +1,52 @@
 name: CI-Import-Check
 
 on:
-  push:
   pull_request:
     paths:
       - "**"
 
 # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
 jobs:
-
-  test-asr-imports:
-    runs-on: ubuntu-latest
-    container:
-      image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+  test-imports:
+    name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        collection: 
+          - asr
+          # - nlp # Currently broken
+          - tts
+        python: ['3.10', '3.11', '3.12']
     steps:
     - name: Checkout repo
       uses: actions/checkout@v2
-    - name: Update base dependencies
-      run: |
-        apt-get update && apt-get install -y build-essential
-        apt-get install -y libsndfile1 make
-    - name: Install nemo dependencies
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '${{ matrix.python }}' 
+    - name: Build wheel
       id: nemo-wheel
       run:  |
-        pip install Cython
-        # install test requirements
-        pip install -r requirements/requirements_test.txt
         # Build nemo as a wheel
         pip install build
-        python -m build --no-isolation --wheel
+        python -m build --wheel
+        
         # Preserve wheel location
         DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-        echo "::set-output name=DIST_FILE::${DIST_FILE}"
-    - name: Test ASR Domain Imports
-      run: |
-        # Install NeMo Domain
-        pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
-        # Run import checks
-        python tests/core_ptl/check_imports.py --domain "asr"
-        # Uninstall NeMo
-        pip uninstall -y nemo_toolkit
-  test-tts-imports:
-    runs-on: ubuntu-latest
-    container:
-      image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
-    steps:
-    - name: Checkout repo
-      uses: actions/checkout@v2
-    - name: Update base dependencies
+        echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT"
+    
+    - name: Install NeMo + test dependencies
       run: |
-        apt-get update && apt-get install -y build-essential
-        apt-get install -y libsndfile1 make
-    - name: Install nemo dependencies
-      id: nemo-wheel
-      run:  |
-        pip install Cython
         # install test requirements
         pip install -r requirements/requirements_test.txt
-        # Build nemo as a wheel
-        pip install build
-        python -m build --no-isolation --wheel
-        # Preserve wheel location
-        DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-        echo "::set-output name=DIST_FILE::${DIST_FILE}"
-    - name: Test TTS Domain Imports
-      run: |
+        
         # Install NeMo Domain
-        pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
+        pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]"
+    
+    - name: Run ${{ matrix.collection }} checks
+      run: |
         # Run import checks
-        python tests/core_ptl/check_imports.py --domain "tts"
-        # Uninstall NeMo
-        pip uninstall -y nemo_toolkit
+        python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}"
+  
+
diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -34,17 +34,12 @@ EOF
 WORKDIR /workspace
 
 # Install Mamba Dependency
-ARG CAUSAL_CONV_TAG=v1.2.2.post1
+ARG CAUSAL_CONV_TAG=v1.2.2.post1 
+ARG MAMBA_TAG=v2.2.0
 
 RUN <<"EOF" bash -ex
 # Mamba dependency installation
-
-git clone --depth 1 --branch ${CAUSAL_CONV_TAG} https://github.com/Dao-AILab/causal-conv1d && \
-  cd causal-conv1d && \
-  python setup.py install && \
-  cd .. && \
-  rm -rf causal-conv1d
-
+MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG}
 EOF
 
 RUN pip install hatchling   # needed to install nemo-run
@@ -54,8 +49,6 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.21.0
-ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa
-
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
   --mount=type=bind,source=requirements,target=requirements \
@@ -65,23 +58,22 @@ RUN \
   --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
 pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
 "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
-"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
 "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
 "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
 "unstructured==0.14.9" \
 "llama-index==0.10.43" \
 "onnxscript @ git+https://github.com/microsoft/onnxscript" \
 -r tools/ctc_segmentation/requirements.txt \
 ".[all]"
+EOF
 
-# Megatron Core installation
-git clone https://github.com/NVIDIA/Megatron-LM.git && \
-pushd Megatron-LM && \
-git checkout ${MCORE_TAG} && \
-  pushd megatron/core/datasets && \
-  make && \
-  popd && \
-popd
+ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031
+RUN <<"EOF" bash -ex
+# Megatron-LM installation
+git clone https://github.com/NVIDIA/Megatron-LM.git
+pushd Megatron-LM
+git checkout ${MCORE_TAG} 
+pip install -e .
 export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
 
 # Install nvidia-resiliency-ext
@@ -98,4 +90,11 @@ pip install --no-cache-dir --no-build-isolation ".[all]"
 chmod 777 -R /workspace
 EOF
 
+# Install vLLM in virtualenv
+RUN pip install --no-cache-dir --no-build-isolation virtualenv && \
+  virtualenv /opt/venv && \
+  /opt/venv/bin/pip install --no-cache-dir --no-build-isolation \
+      -r /workspace/requirements/requirements_vllm.txt \
+      -r /workspace/requirements/requirements_infer.txt
+
 ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
diff --git a/README.md b/README.md
@@ -15,12 +15,39 @@
 <details open>
   <summary><b>NeMo 2.0</b></summary>
       We've released NeMo 2.0, an update on the NeMo Framework which prioritizes modularity and ease-of-use. Please refer to the <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html>NeMo Framework User Guide</a> to get started.
+</details>
+<details open>
+  <summary><b>New Cosmos World Foundation Models Support</b></summary>
+    <details> 
+      <summary> <a href="https://developer.nvidia.com/blog/advancing-physical-ai-with-nvidia-cosmos-world-foundation-model-platform">Advancing Physical AI with NVIDIA Cosmos World Foundation Model Platform </a> (2025-01-09) 
+      </summary> 
+        The end-to-end NVIDIA Cosmos platform accelerates world model development for physical AI systems. Built on CUDA, Cosmos combines state-of-the-art world foundation models, video tokenizers, and AI-accelerated data processing pipelines. Developers can accelerate world model development by fine-tuning Cosmos world foundation models or building new ones from the ground up. These models create realistic synthetic videos of environments and interactions, providing a scalable foundation for training complex systems, from simulating humanoid robots performing advanced actions to developing end-to-end autonomous driving models. 
+        <br><br>
     </details>
-  </details>
-
+    <details>
+      <summary>
+        <a href="https://developer.nvidia.com/blog/accelerate-custom-video-foundation-model-pipelines-with-new-nvidia-nemo-framework-capabilities/">
+          Accelerate Custom Video Foundation Model Pipelines with New NVIDIA NeMo Framework Capabilities
+        </a> (2025-01-07)
+      </summary>
+        The NeMo Framework now supports training and customizing the <a href="https://github.com/NVIDIA/Cosmos">NVIDIA Cosmos</a> collection of world foundation models. Cosmos leverages advanced text-to-world generation techniques to create fluid, coherent video content from natural language prompts.
+        <br><br>
+        You can also now accelerate your video processing step using the <a href="https://developer.nvidia.com/nemo-curator-video-processing-early-access">NeMo Curator</a> library, which provides optimized video processing and captioning features that can deliver up to 89x faster video processing when compared to an unoptimized CPU pipeline.
+      <br><br>
+    </details>
+</details>
 <details open>
   <summary><b>Large Language Models and Multimodal Models</b></summary>
-      <details>
+    <details>
+      <summary>
+        <a href="https://developer.nvidia.com/blog/state-of-the-art-multimodal-generative-ai-model-development-with-nvidia-nemo/">
+          State-of-the-Art Multimodal Generative AI Model Development with NVIDIA NeMo
+        </a> (2024-11-06)
+      </summary>
+        NVIDIA recently announced significant enhancements to the NeMo platform, focusing on multimodal generative AI models. The update includes NeMo Curator and the Cosmos tokenizer, which streamline the data curation process and enhance the quality of visual data. These tools are designed to handle large-scale data efficiently, making it easier to develop high-quality AI models for various applications, including robotics and autonomous driving. The Cosmos tokenizers, in particular, efficiently map visual data into compact, semantic tokens, which is crucial for training large-scale generative models. The tokenizer is available now on the <a href="https://github.com/NVIDIA/cosmos-tokenizer">NVIDIA/cosmos-tokenizer</a> GitHub repo and on <a href="https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x8x8">Hugging Face</a>.
+      <br><br>
+    </details>
+    <details>
       <summary>
         <a href="https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/llama/index.html#new-llama-3-1-support">
         New Llama 3.1 Support
@@ -81,7 +108,6 @@
         <br><br>
       </details>
 </details>
-
 <details open>
   <summary><b>Speech Recognition</b></summary>
   <details>
@@ -163,6 +189,10 @@ Overall, these enhancements make NeMo 2.0 a powerful, scalable, and user-friendl
 - For an in-depth exploration of the main features of NeMo 2.0, see the [Feature Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/index.html#feature-guide).
 - To transition from NeMo 1.0 to 2.0, see the [Migration Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/migration/index.html#migration-guide) for step-by-step instructions.
 
+### Get Started with Cosmos
+
+NeMo Curator and NeMo Framework support video curation and post-training of the Cosmos World Foundation Models, which are open and available on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cosmos/collections/cosmos) and [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6). For more information on video datasets, refer to [NeMo Curator](https://developer.nvidia.com/nemo-curator). To post-train World Foundation Models using the NeMo Framework for your custom physical AI tasks, see the [Cosmos Diffusion models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/nemo/post_training/README.md) and the [Cosmos Autoregressive models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/autoregressive/nemo/post_training/README.md).
+
 ## LLMs and MMs Training, Alignment, and Customization
 
 All NeMo models are trained with

diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst
@@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
     VALIDATION_DATASET_PATH= # Path to validation dataset 
     SAVE_DIR= # where the checkpoint and logs are saved
     mkdir -p $SAVE_DIR
-    export NVTE_FLASH_ATTN=0
     export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-    export NVTE_FUSED_ATTN=0
     
     python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \
     --config-path=${CONFIG_PATH} \
@@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
     model.post_process=False \
     model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size
     model.micro_batch_size=8 \
+    model.attention_backend="unfused" \
     model.optim.lr=0.000005 \
     model.optim.sched.min_lr=0.00000001 \
     model.optim.sched.warmup_steps=100 \