Data engine #37

Open
wants to merge 22 commits into base: main
Changes from 19 commits
Empty file added builder/__init__.py
Empty file.
91 changes: 91 additions & 0 deletions builder/builder.py
@@ -0,0 +1,91 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import BitsAndBytesConfig
import torch


class ModelBuilder:
@classmethod
def judge_able_to_build(cls, model_name: str) -> bool:
"""
        Judge whether this builder can build the given model.
Args:
model_name: The name of the model.

Returns:
bool: True if the model can be built by this builder.
"""
raise NotImplementedError

@classmethod
def build(cls, model_path, model_base, model_name, **kwargs):
"""
Build the model.
Returns:
tokenizer: The tokenizer of the model.
            model: The model. This must not be None; otherwise load_pretrained_model raises an error.
image_processor: The image processor.
"""
raise NotImplementedError


def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto",
device="cuda", use_flash_attn=False, **kwargs):
kwargs = {"device_map": device_map, **kwargs}

if device != "cuda":
kwargs['device_map'] = {"": device}

if load_8bit:
kwargs['load_in_8bit'] = True
elif load_4bit:
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4'
)
else:
kwargs['torch_dtype'] = torch.float16

if use_flash_attn:
kwargs['attn_implementation'] = 'flash_attention_2'

# import here to avoid circular import
from .llava_builder import LLaVABuilder
from .omnillm_builder import OmniLLMBuilder
from .language_model_builder import LanguageModelBuilder

    # Note: please keep LanguageModelBuilder at the end of the list if you add your own builder
model_builder_list = [LLaVABuilder, OmniLLMBuilder, LanguageModelBuilder]

tokenizer, model, image_processor = None, None, None
for builder in model_builder_list:
if builder.judge_able_to_build(model_name):
tokenizer, model, image_processor = builder.build(model_path, model_base, model_name, **kwargs)
break

if model is None:
raise ValueError(f"Cannot find a suitable builder for model {model_name}\n Please check whether the model name\
is correct. If the model you use is not supported by default, please implement a new builder and add to the \
model_builder_list in the file RLAIF-V/builder/builder.py")

if hasattr(model.config, "max_sequence_length"):
context_len = model.config.max_sequence_length
else:
context_len = 2048

return tokenizer, model, image_processor, context_len
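
The note above marks the extension point: a new builder only needs to implement judge_able_to_build and build, and be registered ahead of the fallback. Below is a minimal sketch of a hypothetical custom builder; the file name, class name, and the 'myvlm' substring are illustrative and not part of this PR:

from builder.builder import ModelBuilder

from transformers import AutoTokenizer, AutoModelForCausalLM


class MyVLMBuilder(ModelBuilder):
    @classmethod
    def judge_able_to_build(cls, model_name: str) -> bool:
        # Claim only model names that clearly belong to this family.
        return 'myvlm' in model_name.lower()

    @classmethod
    def build(cls, model_path, model_base, model_name, **kwargs):
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
        model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
        # Return None as the image processor for a text-only model.
        return tokenizer, model, None

It would then be inserted into model_builder_list before LanguageModelBuilder, which has to stay last because it accepts every model name:

model_builder_list = [LLaVABuilder, OmniLLMBuilder, MyVLMBuilder, LanguageModelBuilder]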
43 changes: 43 additions & 0 deletions builder/language_model_builder.py
@@ -0,0 +1,43 @@
from builder.builder import ModelBuilder
import warnings

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


class LanguageModelBuilder(ModelBuilder):
"""
**Note**: Please put this class at the end of the model builder list.
This model builder is a fallback builder for all language models.
It returns no image processor.
"""

@classmethod
def judge_able_to_build(cls, model_name: str) -> bool:
return True

@classmethod
def build(cls, model_path, model_base, model_name, **kwargs):
warnings.warn(
"Warning: LanguageModel is the fall back model. Please make sure you are loading the correct model.")
if model_base is not None:
# PEFT model
from peft import PeftModel
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
print(f"Loading LoRA weights from {model_path}")
model = PeftModel.from_pretrained(model, model_path)
print(f"Merging weights")
model = model.merge_and_unload()
print('Convert to FP16...')
model.to(torch.float16)
else:
if 'mpt' in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True,
**kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

return tokenizer, model, None
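
For reference, a minimal caller-side sketch of how this fallback is reached through the new entry point; the checkpoint path and model name are placeholders, and the name deliberately matches none of the vision builders:

from builder.builder import load_pretrained_model

# Hypothetical path and name, for illustration only.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path="checkpoints/my-plain-llm",
    model_base=None,
    model_name="my-plain-llm",   # no 'llava' / 'omnilmm' / 'rlaif' substring
)
# image_processor is None here; context_len falls back to 2048 unless the
# model config defines max_sequence_length.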
114 changes: 35 additions & 79 deletions llava/model/builder.py → builder/llava_builder.py
@@ -1,54 +1,27 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
from builder.builder import ModelBuilder
import warnings
import os
import shutil

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from transformers import AutoTokenizer, AutoConfig
import torch
from llava.model import *
from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN


def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
kwargs = {"device_map": device_map, **kwargs}

if device != "cuda":
kwargs['device_map'] = {"": device}
class LLaVABuilder(ModelBuilder):
@classmethod
def judge_able_to_build(cls, model_name: str) -> bool:
lower_name = model_name.lower()
return 'llava' in lower_name or ('rlaif' in lower_name and '7b' in lower_name)

if load_8bit:
kwargs['load_in_8bit'] = True
elif load_4bit:
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4'
)
else:
kwargs['torch_dtype'] = torch.float16

if use_flash_attn:
kwargs['attn_implementation'] = 'flash_attention_2'

if 'llava' in model_name.lower():
# Load LLaVA model
@classmethod
def build(cls, model_path, model_base, model_name, **kwargs):
if 'lora' in model_name.lower() and model_base is None:
warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
warnings.warn(
'There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, \
please provide the `model_base` argument. Detailed instruction: \
https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
if 'lora' in model_name.lower() and model_base is not None:
from llava.model.language_model.llava_llama import LlavaConfig
lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
@@ -57,12 +30,15 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
if model.lm_head.weight.shape[0] != token_num:
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
model.lm_head.weight = torch.nn.Parameter(
torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
model.model.embed_tokens.weight = torch.nn.Parameter(
torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))

print('Loading additional LLaVA weights...')
if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
non_lora_trainables = (
torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu'))
else:
# this is probably from HF Hub
from huggingface_hub import hf_hub_download
@@ -72,10 +48,15 @@ def load_from_hf(repo_id, filename, subfolder=None):
filename=filename,
subfolder=subfolder)
return torch.load(cache_file, map_location='cpu')

non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
non_lora_trainables = {
(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()
}
if any(k.startswith('model.model.') for k in non_lora_trainables):
non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
non_lora_trainables = {
(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()
}
model.load_state_dict(non_lora_trainables, strict=False)

from peft import PeftModel
@@ -89,14 +70,17 @@ def load_from_hf(repo_id, filename, subfolder=None):
print('Loading LLaVA from base model...')
if 'mpt' in model_name.lower():
if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py'))
shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'),
os.path.join(model_path, 'configuration_mpt.py'))
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained,
**kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
cfg_pretrained = AutoConfig.from_pretrained(model_path)
model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained,
**kwargs)

mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
@@ -119,31 +103,7 @@ def load_from_hf(repo_id, filename, subfolder=None):
low_cpu_mem_usage=True,
**kwargs
)
else:
# Load language model
if model_base is not None:
# PEFT model
from peft import PeftModel
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
print(f"Loading LoRA weights from {model_path}")
model = PeftModel.from_pretrained(model, model_path)
print(f"Merging weights")
model = model.merge_and_unload()
print('Convert to FP16...')
model.to(torch.float16)
else:
use_fast = False
if 'mpt' in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

image_processor = None

if 'llava' in model_name.lower():
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
if mm_use_im_patch_token:
@@ -153,15 +113,11 @@ def load_from_hf(repo_id, filename, subfolder=None):
model.resize_token_embeddings(len(tokenizer))

vision_tower = model.get_vision_tower()
device_map = kwargs.get('device_map', 'auto')
if not vision_tower.is_loaded:
vision_tower.load_model(device_map=device_map)
if device_map != 'auto':
vision_tower.to(device=device_map[''], dtype=torch.float16)
image_processor = vision_tower.image_processor

if hasattr(model.config, "max_sequence_length"):
context_len = model.config.max_sequence_length
else:
context_len = 2048

return tokenizer, model, image_processor, context_len
return tokenizer, model, image_processor
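
Assuming the LoRA branch of this builder, a hedged sketch of loading an unmerged LoRA checkpoint through the new entry point; the paths below are placeholders and are not taken from this PR:

from builder.builder import load_pretrained_model

# The model name must contain both 'llava' and 'lora', and model_base must
# point to the base checkpoint the LoRA adapter was trained on.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path="checkpoints/llava-v1.5-7b-lora",
    model_base="checkpoints/vicuna-7b-v1.5",
    model_name="llava-v1.5-7b-lora",
)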
47 changes: 47 additions & 0 deletions builder/omnillm_builder.py
@@ -0,0 +1,47 @@
from builder.builder import ModelBuilder

from transformers import AutoTokenizer
import torch
from omnilmm.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from omnilmm.model.omnilmm import OmniLMMForCausalLM
from omnilmm.model.utils import build_transform


class OmniLLMBuilder(ModelBuilder):
@classmethod
def judge_able_to_build(cls, model_name: str) -> bool:
lower_name = model_name.lower()
return 'omnilmm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name)

@classmethod
def build(cls, model_path, _, model_name, **kwargs):
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)

if False:
            # shard the model across multiple devices for GPUs with limited memory (e.g., two Nvidia 3090 24 GB cards)
with init_empty_weights():
model = OmniLMMForCausalLM.from_pretrained(model_name, tune_clip=True, torch_dtype=torch.bfloat16)
model = load_checkpoint_and_dispatch(model, model_name, dtype=torch.bfloat16,
device_map="auto",
no_split_module_classes=['Eva', 'MistralDecoderLayer', 'ModuleList',
'Resampler']
)
else:
model = OmniLMMForCausalLM.from_pretrained(
model_path, tune_clip=True, torch_dtype=torch.bfloat16
).to(device='cuda', dtype=torch.bfloat16)

img_processor = build_transform(
is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP')
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)

tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_config = model.model.vision_config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
[DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids(
[DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])

return tokenizer, model, img_processor
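
The disabled `if False:` branch above sketches sharded loading across two ~24 GB GPUs but never imports its helpers. If it were enabled, it would need roughly the following, using accelerate; this is a hedged sketch under that assumption, not something the PR turns on:

# Sketch only: what the currently disabled multi-GPU branch would require.
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import torch

from omnilmm.model.omnilmm import OmniLMMForCausalLM


def load_omnilmm_sharded(model_path):
    # Build the model skeleton without allocating weights, then stream the
    # checkpoint onto the available GPUs, keeping the listed modules unsplit.
    with init_empty_weights():
        model = OmniLMMForCausalLM.from_pretrained(
            model_path, tune_clip=True, torch_dtype=torch.bfloat16)
    model = load_checkpoint_and_dispatch(
        model, model_path, dtype=torch.bfloat16, device_map="auto",
        no_split_module_classes=['Eva', 'MistralDecoderLayer', 'ModuleList', 'Resampler'],
    )
    return model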