Bump version to 0.6.0 #1023

Merged: 5 commits, Mar 12, 2024
8 changes: 2 additions & 6 deletions README.md
@@ -132,9 +132,7 @@ We *strongly* recommend working with LLM Foundry inside a Docker container (see
```bash
git clone https://github.com/mosaicml/llm-foundry.git
cd llm-foundry
pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
```

### Without Docker (not recommended)
@@ -152,9 +150,7 @@ source llmfoundry-venv/bin/activate

pip install cmake packaging torch # setup.py requires these be installed

pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
```

### TransformerEngine and amp_fp8 support
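
Note: after this README change, both GPU extras resolve to Flash Attention v2. A minimal sketch (not part of the PR) for checking what a fresh environment actually picked up after `pip install -e ".[gpu]"`; it only assumes the `flash-attn` distribution is queryable through standard package metadata:

```python
# Hedged sketch: report which flash-attn (if any) is installed.
import importlib.metadata

try:
    flash_version = importlib.metadata.version("flash-attn")
except importlib.metadata.PackageNotFoundError:
    print("flash-attn not installed (CPU-only install)")
else:
    major = int(flash_version.split(".")[0])
    print(f"flash-attn {flash_version} (v{major}) installed")
    # llm-foundry 0.6.0 drops the v1 code paths, so v2 is what we expect here.
    assert major >= 2, "expected Flash Attention v2 with the updated extras"
```
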
13 changes: 2 additions & 11 deletions llmfoundry/__init__.py
@@ -19,11 +19,6 @@

hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-import transformers

from llmfoundry import optim, utils
from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator,
NoConcatDataset, Seq2SeqFinetuningCollator,
Expand All @@ -33,18 +28,14 @@
ComposerHFT5)
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
-    flash_attn_fn, is_flash_v1_installed,
-    scaled_multihead_dot_product_attention, triton_flash_attn_fn)
+    flash_attn_fn, scaled_multihead_dot_product_attention, triton_flash_attn_fn)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
MPTForCausalLM, MPTModel, MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper

-if is_flash_v1_installed():
-    transformers.utils.is_flash_attn_available = lambda: False

__all__ = [
'build_text_denoising_dataloader',
'build_finetuning_dataloader',
@@ -77,4 +68,4 @@
'TiktokenTokenizerWrapper',
]

-__version__ = '0.5.0'
+__version__ = '0.6.0'
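
Note: with the `is_flash_v1_installed()` shim removed, importing `llmfoundry` no longer overrides transformers' flash-attention detection. A small hedged check (not from the PR; assumes a transformers release that exposes `is_flash_attn_2_available`):

```python
import llmfoundry
import transformers

print(llmfoundry.__version__)  # '0.6.0' after this bump
# Previously, importing llmfoundry could set
# transformers.utils.is_flash_attn_available to a lambda returning False when
# flash-attn v1 was present. That override is gone, so the value below
# reflects the real environment rather than an llmfoundry patch.
print(transformers.utils.is_flash_attn_2_available())
```
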
7 changes: 0 additions & 7 deletions llmfoundry/models/layers/attention.py
@@ -44,13 +44,6 @@ def check_alibi_support(attention_impl: str) -> bool:
v2_version='v2.4.2')


-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-    import transformers
-    transformers.utils.is_flash_attn_available = lambda: False

from transformers.models.llama.modeling_llama import apply_rotary_pos_emb


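
Note: the file now relies only on the flash-attn v2 version gate (`is_flash_v2_installed(v2_version='v2.4.2')` above) before touching version-sensitive imports. A rough re-sketch of that gating pattern under stated assumptions; the helper name `_flash_v2_at_least` is illustrative, not llm-foundry's API:

```python
from packaging import version


def _flash_v2_at_least(required: str = "2.4.2") -> bool:
    """Return True only if flash-attn is installed and new enough."""
    try:
        import flash_attn
    except ImportError:
        return False
    return version.parse(flash_attn.__version__) >= version.parse(required)


if _flash_v2_at_least():
    # Version-sensitive imports stay behind the gate so CPU-only installs
    # never trigger a hard import error.
    from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
```
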
8 changes: 0 additions & 8 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -9,7 +9,6 @@
from transformers import PretrainedConfig

from llmfoundry.models.layers.attention import (check_alibi_support,
-is_flash_v1_installed,
is_flash_v2_installed)
from llmfoundry.models.layers.blocks import attn_config_defaults

@@ -230,13 +229,6 @@ def _validate_config(self) -> None:
raise NotImplementedError(
'prefix_lm only implemented with torch and triton attention.')

-        if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
-            warnings.warn(
-                VersionedDeprecationWarning(
-                    'Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.',
-                    remove_version='0.6.0',
-                ))

if self.attn_config[
'attn_impl'] == 'triton' and not self.attn_config['prefix_lm']:
warnings.warn(
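
Note: with the v1 deprecation warning deleted, the remaining question for a config is simply whether Flash Attention v2 is available when `attn_impl: flash` is requested. An illustrative, standalone validator sketching that behavior (an assumption about intent, not a quote of `_validate_config`):

```python
def validate_attn_impl(attn_config: dict, flash_v2_available: bool) -> None:
    """Reject 'flash' attention when no usable Flash Attention v2 is present."""
    impl = attn_config.get('attn_impl', 'torch')
    if impl == 'flash' and not flash_v2_available:
        raise ValueError(
            "attn_impl='flash' requires Flash Attention v2; install it with "
            'pip install -e ".[gpu]" from the llm-foundry root.')


# Example: torch attention is always fine; 'flash' would need v2 present.
validate_attn_impl({'attn_impl': 'torch'}, flash_v2_available=False)
```
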
9 changes: 1 addition & 8 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -27,8 +27,7 @@
from composer.utils import dist

from llmfoundry.metrics import TokenAccuracy
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-                                                is_flash_v2_installed)
+from llmfoundry.models.layers.attention import is_flash_v2_installed
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY

if is_flash_v2_installed():
@@ -39,12 +38,6 @@
except Exception as e:
raise e

-if is_flash_v1_installed():
-    try:  # This try...except is needed because transformers requires it despite the 'if' statement above
-        from flash_attn import bert_padding
-    except Exception as e:
-        raise e

from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from transformers import PreTrainedModel, PreTrainedTokenizerBase
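
Note: only the v2-guarded import block remains in `modeling_mpt.py`. A condensed sketch of that pattern (assuming, as flash-attn 2.x currently does, that `bert_padding` still ships with the package):

```python
from llmfoundry.models.layers.attention import is_flash_v2_installed

if is_flash_v2_installed():
    # Padding/unpadding helpers for variable-length sequences; only imported
    # when flash-attn v2 is actually available in the environment.
    from flash_attn import bert_padding
```
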
4 changes: 2 additions & 2 deletions mcli/mcli-llama2-finetune.yaml
@@ -38,7 +38,7 @@ parameters:
pretrained: true
# Note: you must have set the HUGGING_FACE_HUB_TOKEN environment variable and have access to the llama2 models
use_auth_token: true
-attention_patch_type: triton
+use_flash_attention_2: true

# Tokenizer
tokenizer:
@@ -62,7 +62,7 @@ parameters:
# # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
-# packing_ratio:
+# packing_ratio: auto
drop_last: true
num_workers: 8
pin_memory: false
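
Note: the finetune example now opts into Hugging Face's native Flash Attention 2 path instead of the old triton attention patch, and the commented-out `packing_ratio` hint now shows `auto`. A hedged way to sanity-check the updated keys with omegaconf (already used throughout the repo); the `parameters.model` path is an assumption about this YAML's layout:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load('mcli/mcli-llama2-finetune.yaml')
model_cfg = cfg.parameters.model

# The triton patch key is gone and the native flash-attention-2 flag is set.
assert model_cfg.get('use_flash_attention_2', False) is True
assert 'attention_patch_type' not in model_cfg
```
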
10 changes: 4 additions & 6 deletions setup.py
@@ -3,6 +3,7 @@

"""MosaicML LLM Foundry package setup."""

+import copy
import os
import re

@@ -98,16 +99,13 @@
'mosaicml[tensorboard]>=0.20.1,<0.21',
]

-extra_deps['gpu'] = [
-    'flash-attn==1.0.9',
-    # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
-    'xentropy-cuda-lib@git+https://github.com/HazyResearch/[email protected]#subdirectory=csrc/xentropy',
-]

+# Flash 2 group kept for backwards compatibility
extra_deps['gpu-flash2'] = [
'flash-attn==2.5.0',
]

+extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])

extra_deps['peft'] = [
'mosaicml[peft]>=0.20.1,<0.21',
]
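
Note: the net effect in `setup.py` is that the flash-attn v1 pin and the xentropy dependency are gone, and `gpu` becomes an alias of `gpu-flash2`. A standalone restatement of the resulting extras (pin taken from the diff; the bare dict is just for illustration):

```python
import copy

extra_deps = {}

# Flash 2 group kept for backwards compatibility.
extra_deps['gpu-flash2'] = [
    'flash-attn==2.5.0',
]

# Both `pip install -e ".[gpu]"` and `pip install -e ".[gpu-flash2]"` now
# resolve to the same Flash Attention v2 pin.
extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])

print(extra_deps)
```
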
15 changes: 3 additions & 12 deletions tests/models/layers/test_huggingface_flash.py
@@ -12,23 +12,14 @@
from composer.utils import reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
+from transformers.models.llama.modeling_llama import LlamaAttention

from llmfoundry import COMPOSER_MODEL_REGISTRY
from llmfoundry.models.hf.hf_fsdp import rgetattr
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-                                                is_flash_v2_installed)
-from llmfoundry.utils.builders import build_tokenizer

-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-    transformers.utils.is_flash_attn_available = lambda: False

-from transformers.models.llama.modeling_llama import LlamaAttention

+from llmfoundry.models.layers.attention import is_flash_v2_installed
from llmfoundry.models.layers.llama_attention_monkeypatch import (
llama_attention_patch_torch, llama_attention_patch_triton)
+from llmfoundry.utils.builders import build_tokenizer


@pytest.mark.parametrize('patch_fn_name', ['torch', 'triton'])
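
Note: with the module-level monkeypatch gone, the test imports are flat and any flash-dependent test can simply skip when v2 is absent. A hedged sketch of that pattern (the test body here is illustrative, not one of the repository's tests):

```python
import pytest

from llmfoundry.models.layers.attention import is_flash_v2_installed


@pytest.mark.skipif(not is_flash_v2_installed(),
                    reason='requires Flash Attention v2')
def test_flash_v2_import():
    # Import succeeds only when flash-attn 2.x is installed.
    from flash_attn import flash_attn_func
    assert callable(flash_attn_func)
```
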