From dabd456f973cf190785710c3ec305703b51bfa41 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 18 Jul 2024 08:06:02 +0200 Subject: [PATCH 1/2] fix merging --- src/transformers/models/chameleon/modeling_chameleon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 2cf7d5679138..4ddf496ff131 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1279,7 +1279,8 @@ def forward( if pixel_values is not None: image_tokens = self.get_image_tokens(pixel_values) special_image_mask = input_ids == self.vocabulary_mapping.image_token_id - input_ids[special_image_mask] = image_tokens.flatten().to(input_ids.device, input_ids.dtype) + image_tokens = image_tokens.to(input_ids.device, input_ids.dtype) + input_ids = input_ids.masked_scatter(special_image_mask, image_tokens) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) From 118ff0a3b9c34fad4210970db8f6b90e40e28d18 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 18 Jul 2024 13:31:26 +0200 Subject: [PATCH 2/2] make chameleon conditional --- docs/source/en/model_doc/chameleon.md | 20 ++++++------- src/transformers/__init__.py | 4 +-- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/chameleon/__init__.py | 4 +-- .../models/chameleon/modeling_chameleon.py | 6 ++-- src/transformers/utils/dummy_pt_objects.py | 2 +- .../chameleon/test_modeling_chameleon.py | 28 +++++++++++-------- 7 files changed, 36 insertions(+), 30 deletions(-) diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index e2a0012ba97f..fb524b324794 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -64,13 +64,13 @@ The original code can be found [here](https://github.com/facebookresearch/chamel Here's how to load the model and perform inference in half-precision (`torch.float16`): ```python -from transformers import ChameleonProcessor, ChameleonForCausalLM +from transformers import ChameleonProcessor, ChameleonForConditionalGeneration import torch from PIL import Image import requests processor = ChameleonProcessor.from_pretrained("meta-chameleon") -model = ChameleonForCausalLM.from_pretrained("meta-chameleon", torch_dtype=torch.float16, device_map="auto") +model = ChameleonForConditionalGeneration.from_pretrained("meta-chameleon", torch_dtype=torch.float16, device_map="auto") # prepare image and text prompt url = "https://bjiujitsu.com/wp-content/uploads/2021/01/jiu_jitsu_belt_white_1.jpg" @@ -89,13 +89,13 @@ print(processor.decode(output[0], skip_special_tokens=True)) Chameleon can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). 
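The multi-image usage shown below goes through the same token-merge step that the first patch in this series rewrites: when `pixel_values` are provided, the model quantizes them into discrete image tokens and writes those tokens into `input_ids` at the positions of the `<image>` placeholder. The patch replaces an in-place boolean-mask assignment with `masked_scatter`, which returns a new tensor instead of mutating the caller's `input_ids`. A minimal standalone sketch of that merge, with a made-up image-token id and made-up codes (the real values come from the model's vocabulary mapping and its VQ-VAE quantizer):

```python
import torch

IMAGE_TOKEN_ID = 9  # hypothetical placeholder id, for illustration only

input_ids = torch.tensor([
    [1, 9, 9, 4],  # prompt 1: two <image> placeholder positions
    [5, 9, 9, 8],  # prompt 2: two <image> placeholder positions
])
image_tokens = torch.tensor([
    [100, 101],  # made-up discrete codes for image 1
    [200, 201],  # made-up discrete codes for image 2
])

special_image_mask = input_ids == IMAGE_TOKEN_ID

# Old behaviour: in-place assignment through a boolean mask (mutates the tensor).
merged_inplace = input_ids.clone()
merged_inplace[special_image_mask] = image_tokens.flatten().to(merged_inplace.dtype)

# New behaviour: masked_scatter returns a new tensor and leaves input_ids untouched.
merged = input_ids.masked_scatter(special_image_mask, image_tokens.to(input_ids.dtype))

assert torch.equal(merged, merged_inplace)
print(merged)
# tensor([[  1, 100, 101,   4],
#         [  5, 200, 201,   8]])
```

Both forms fill the placeholder positions in row-major order with the same values; the `masked_scatter` form simply avoids modifying the tensor the caller passed in.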
Here is how you can do it:

```python
-from transformers import ChameleonProcessor, ChameleonForCausalLM
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
import torch
from PIL import Image
import requests

processor = ChameleonProcessor.from_pretrained("meta-chameleon")
-model = ChameleonForCausalLM.from_pretrained("meta-chameleon", torch_dtype=torch.float16, device_map="auto")
+model = ChameleonForConditionalGeneration.from_pretrained("meta-chameleon", torch_dtype=torch.float16, device_map="auto")

# Get three different images
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
@@ -129,7 +129,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza

The model can be loaded in 8-bit or 4-bit precision, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes (`pip install bitsandbytes`) and that you have access to a CUDA-compatible GPU device. Then simply change the snippet above to:

```python
-from transformers import ChameleonForCausalLM, BitsAndBytesConfig
+from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig

# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

-model = ChameleonForCausalLM.from_pretrained("meta-chameleon", quantization_config=quantization_config, device_map="auto")
+model = ChameleonForConditionalGeneration.from_pretrained("meta-chameleon", quantization_config=quantization_config, device_map="auto")
```

### Use Flash-Attention 2 and SDPA to further speed-up generation

The model supports both Flash-Attention 2 and PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html), which can be enabled for optimization. SDPA is the default option when you load the model. If you want to switch to Flash Attention 2, first make sure to install flash-attn; refer to the [original repository](https://github.com/Dao-AILab/flash-attention) for instructions on installing that package.
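Before the Flash Attention 2 snippet that follows, here is a small sketch of pinning the default SDPA backend explicitly; `facebook/chameleon-7b` is just a stand-in checkpoint name, and `attn_implementation` is the generic `from_pretrained` switch rather than anything specific to this patch:

```python
import torch
from transformers import ChameleonForConditionalGeneration

# Request the (default) SDPA backend explicitly; "eager" or "flash_attention_2"
# select the other backends, assuming the required packages are installed.
model = ChameleonForConditionalGeneration.from_pretrained(
    "facebook/chameleon-7b",  # stand-in checkpoint
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
    device_map="auto",
)
print(model.config._attn_implementation)  # -> "sdpa"
```

Switching to Flash Attention 2 is then just a matter of changing `attn_implementation`, as the next snippet shows.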
Simply change the snippet above with: ```python -from transformers import ChameleonForCausalLM +from transformers import ChameleonForConditionalGeneration -model = ChameleonForCausalLM.from_pretrained( +model = ChameleonForConditionalGeneration.from_pretrained( model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, @@ -183,7 +183,7 @@ model = ChameleonForCausalLM.from_pretrained( [[autodoc]] ChameleonModel - forward -## ChameleonForCausalLM +## ChameleonForConditionalGeneration -[[autodoc]] ChameleonForCausalLM +[[autodoc]] ChameleonForConditionalGeneration - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6d52b30e1ca5..3b7a3a59a7a8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1616,7 +1616,7 @@ ) _import_structure["models.chameleon"].extend( [ - "ChameleonForCausalLM", + "ChameleonForConditionalGeneration", "ChameleonModel", "ChameleonPreTrainedModel", "ChameleonProcessor", @@ -6276,7 +6276,7 @@ load_tf_weights_in_canine, ) from .models.chameleon import ( - ChameleonForCausalLM, + ChameleonForConditionalGeneration, ChameleonModel, ChameleonPreTrainedModel, ChameleonProcessor, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b99cbe19bbd6..d096abf43426 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -446,7 +446,6 @@ ("blenderbot-small", "BlenderbotSmallForCausalLM"), ("bloom", "BloomForCausalLM"), ("camembert", "CamembertForCausalLM"), - ("chameleon", "ChameleonForCausalLM"), ("code_llama", "LlamaForCausalLM"), ("codegen", "CodeGenForCausalLM"), ("cohere", "CohereForCausalLM"), @@ -703,6 +702,7 @@ [ ("blip", "BlipForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"), + ("chameleon", "ChameleonForConditionalGeneration"), ("git", "GitForCausalLM"), ("idefics2", "Idefics2ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), diff --git a/src/transformers/models/chameleon/__init__.py b/src/transformers/models/chameleon/__init__.py index 71e40a5da4af..e8e38630d252 100644 --- a/src/transformers/models/chameleon/__init__.py +++ b/src/transformers/models/chameleon/__init__.py @@ -36,7 +36,7 @@ pass else: _import_structure["modeling_chameleon"] = [ - "ChameleonForCausalLM", + "ChameleonForConditionalGeneration", "ChameleonModel", "ChameleonPreTrainedModel", "ChameleonVQVAE", @@ -62,7 +62,7 @@ pass else: from .modeling_chameleon import ( - ChameleonForCausalLM, + ChameleonForConditionalGeneration, ChameleonModel, ChameleonPreTrainedModel, ChameleonVQVAE, diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 4ddf496ff131..346479c771bf 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1446,7 +1446,7 @@ def _update_causal_mask( "Chameleon Model with a head on top used for outputting logits for next token prediction.", CHAMELEON_START_DOCSTRING, ) -class ChameleonForCausalLM(ChameleonPreTrainedModel): +class ChameleonForConditionalGeneration(ChameleonPreTrainedModel): _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1505,12 +1505,12 @@ def forward( Example: ```python - >>> from transformers import ChameleonProcessor, ChameleonForCausalLM + >>> from transformers import ChameleonProcessor, ChameleonForConditionalGeneration >>> import torch >>> import requests >>> from 
PIL import Image - >>> model = ChameleonForCausalLM.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16) + >>> model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16) >>> processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") >>> prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 725d35b0096f..81d4c2105586 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1835,7 +1835,7 @@ def load_tf_weights_in_canine(*args, **kwargs): requires_backends(load_tf_weights_in_canine, ["torch"]) -class ChameleonForCausalLM(metaclass=DummyObject): +class ChameleonForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index 7e3b688c93d2..4e685411a041 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -44,7 +44,7 @@ import torch from transformers import ( - ChameleonForCausalLM, + ChameleonForConditionalGeneration, ChameleonModel, ChameleonProcessor, ) @@ -191,7 +191,7 @@ def create_and_check_for_causal_lm( encoder_hidden_states, encoder_attention_mask, ): - model = ChameleonForCausalLM(config=config) + model = ChameleonForConditionalGeneration(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, labels=token_labels) @@ -209,7 +209,7 @@ def create_and_check_decoder_model_past_large_inputs( encoder_attention_mask, ): config.is_decoder = True - model = ChameleonForCausalLM(config=config) + model = ChameleonForConditionalGeneration(config=config) model.to(torch_device) model.eval() @@ -273,12 +273,12 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (ChameleonModel, ChameleonForCausalLM) if is_torch_available() else () - all_generative_model_classes = (ChameleonForCausalLM,) if is_torch_available() else () + all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (ChameleonForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": ChameleonModel, - "text-generation": ChameleonForCausalLM, + "text-generation": ChameleonForConditionalGeneration, } if is_torch_available() else {} @@ -339,7 +339,7 @@ def test_flash_attn_2_generate_padding_right(self): """ Overwritting the common test as the test is flaky on tiny models """ - model = ChameleonForCausalLM.from_pretrained( + model = ChameleonForConditionalGeneration.from_pretrained( "facebook/chameleon-7b", load_in_4bit=True, device_map={"": 0}, @@ -355,7 +355,7 @@ def test_flash_attn_2_generate_padding_right(self): output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_native = processor.tokenizer.batch_decode(output_native) - model = ChameleonForCausalLM.from_pretrained( + model = 
ChameleonForConditionalGeneration.from_pretrained( "facebook/chameleon-7b", load_in_4bit=True, attn_implementation="flash_attention_2", @@ -377,7 +377,9 @@ class ChameleonIntegrationTest(unittest.TestCase): @require_bitsandbytes @require_read_token def test_model_7b(self): - model = ChameleonForCausalLM.from_pretrained("facebook/chameleon-7b", load_in_4bit=True, device_map="auto") + model = ChameleonForConditionalGeneration.from_pretrained( + "facebook/chameleon-7b", load_in_4bit=True, device_map="auto" + ) processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") image = Image.open( @@ -397,7 +399,9 @@ def test_model_7b(self): @require_bitsandbytes @require_read_token def test_model_7b_batched(self): - model = ChameleonForCausalLM.from_pretrained("facebook/chameleon-7b", load_in_4bit=True, device_map="auto") + model = ChameleonForConditionalGeneration.from_pretrained( + "facebook/chameleon-7b", load_in_4bit=True, device_map="auto" + ) processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") image = Image.open( @@ -428,7 +432,9 @@ def test_model_7b_batched(self): @require_bitsandbytes @require_read_token def test_model_7b_multi_image(self): - model = ChameleonForCausalLM.from_pretrained("facebook/chameleon-7b", load_in_4bit=True, device_map="auto") + model = ChameleonForConditionalGeneration.from_pretrained( + "facebook/chameleon-7b", load_in_4bit=True, device_map="auto" + ) processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") image = Image.open(