From 8dcd127be2830d03d5d0a7c6ccc9ea8a698c8298 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Fri, 22 Nov 2024 15:51:35 +0800 Subject: [PATCH 01/18] [upgrade] stage 1 code clean --- builder/__init__.py | 0 builder/builder.py | 91 +++++++ builder/language_model_builder.py | 43 ++++ .../builder.py => builder/llava_builder.py | 113 +++------ builder/omnillm_builder.py | 47 ++++ chat.py | 78 ++---- data_engine/__init__.py | 0 data_engine/dataset.py | 57 +++++ data_engine/logps_calculator.py | 231 ++++++++++++++++++ muffin/constants.py | 7 + muffin/eval/llava15_chair.py | 7 +- muffin/eval/muffin_inference_logp.py | 1 + muffin/eval/muffin_vqa.py | 7 +- muffin/llava15_gen_data.py | 55 +++-- muffin/train/train_utils.py | 9 +- omnilmm/constants.py | 8 +- omnilmm/model/omnilmm.py | 220 +++++++++-------- omnilmm/train/train_utils.py | 6 - 18 files changed, 704 insertions(+), 276 deletions(-) create mode 100644 builder/__init__.py create mode 100644 builder/builder.py create mode 100644 builder/language_model_builder.py rename llava/model/builder.py => builder/llava_builder.py (54%) create mode 100644 builder/omnillm_builder.py create mode 100644 data_engine/__init__.py create mode 100644 data_engine/dataset.py create mode 100644 data_engine/logps_calculator.py diff --git a/builder/__init__.py b/builder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/builder/builder.py b/builder/builder.py new file mode 100644 index 0000000..5a7a130 --- /dev/null +++ b/builder/builder.py @@ -0,0 +1,91 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import BitsAndBytesConfig +import torch + + +class ModelBuilder: + @classmethod + def judge_able_to_build(cls, model_name: str) -> bool: + """ + Judge if the model can be built by this builder. + Args: + model_name: The name of the model. + + Returns: + bool: True if the model can be built by this builder. + """ + raise NotImplementedError + + @classmethod + def build(cls, model_path, model_base, model_name, **kwargs): + """ + Build the model. + Returns: + tokenizer: The tokenizer of the model. + model: The model. This one must be returned. Otherwise, an error will be raised. + image_processor: The image processor. 
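+            May be None: the fallback LanguageModelBuilder added in this patch returns no image processor.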
+        """
+        raise NotImplementedError
+
+
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto",
+                          device="cuda", use_flash_attn=False, **kwargs):
+    kwargs = {"device_map": device_map, **kwargs}
+
+    if device != "cuda":
+        kwargs['device_map'] = {"": device}
+
+    if load_8bit:
+        kwargs['load_in_8bit'] = True
+    elif load_4bit:
+        kwargs['load_in_4bit'] = True
+        kwargs['quantization_config'] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4'
+        )
+    else:
+        kwargs['torch_dtype'] = torch.float16
+
+    if use_flash_attn:
+        kwargs['attn_implementation'] = 'flash_attention_2'
+
+    # import here to avoid circular import
+    from llava_builder import LLaVABuilder
+    from omnillm_builder import OmniLLMBuilder
+    from language_model_builder import LanguageModelBuilder
+
+    # Note: please put LanguageModelBuilder at the end of the list if you want to add your own builder
+    model_builder_list = [LLaVABuilder, OmniLLMBuilder, LanguageModelBuilder]
+
+    tokenizer, model, image_processor = None, None, None
+    for builder in model_builder_list:
+        if builder.judge_able_to_build(model_name):
+            tokenizer, model, image_processor = builder.build(model_path, model_base, model_name, **kwargs)
+            break
+
+    if model is None:
+        raise ValueError(f"Cannot find a suitable builder for model {model_name}.\n"
+                         "Please check whether the model name is correct. If the model you use is not "
+                         "supported by default, please implement a new builder and add it to the "
+                         "model_builder_list in RLAIF-V/builder/builder.py")
+
+    if hasattr(model.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+
+    return tokenizer, model, image_processor, context_len
diff --git a/builder/language_model_builder.py b/builder/language_model_builder.py
new file mode 100644
index 0000000..1b48f0c
--- /dev/null
+++ b/builder/language_model_builder.py
@@ -0,0 +1,43 @@
+from builder.builder import ModelBuilder
+import warnings
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+
+class LanguageModelBuilder(ModelBuilder):
+    """
+    **Note**: Please put this class at the end of the model builder list.
+    This model builder is a fallback builder for all language models.
+    It returns no image processor.
+    """
+
+    @classmethod
+    def judge_able_to_build(cls, model_name: str) -> bool:
+        return True
+
+    @classmethod
+    def build(cls, model_path, model_base, model_name, **kwargs):
+        warnings.warn(
+            "Warning: LanguageModelBuilder is the fallback builder. 
Please make sure you are loading the correct model.") + if model_base is not None: + # PEFT model + from peft import PeftModel + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs) + print(f"Loading LoRA weights from {model_path}") + model = PeftModel.from_pretrained(model, model_path) + print(f"Merging weights") + model = model.merge_and_unload() + print('Convert to FP16...') + model.to(torch.float16) + else: + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, + **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + + return tokenizer, model, None diff --git a/llava/model/builder.py b/builder/llava_builder.py similarity index 54% rename from llava/model/builder.py rename to builder/llava_builder.py index aa1c020..d59a303 100644 --- a/llava/model/builder.py +++ b/builder/llava_builder.py @@ -1,54 +1,26 @@ -# Copyright 2023 Haotian Liu -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os +from builder.builder import ModelBuilder import warnings +import os import shutil -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig +from transformers import AutoTokenizer, AutoConfig import torch from llava.model import * from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs): - kwargs = {"device_map": device_map, **kwargs} - - if device != "cuda": - kwargs['device_map'] = {"": device} +class LLaVABuilder(ModelBuilder): + @classmethod + def judge_able_to_build(cls, model_name: str) -> bool: + return 'llava' in model_name.lower() - if load_8bit: - kwargs['load_in_8bit'] = True - elif load_4bit: - kwargs['load_in_4bit'] = True - kwargs['quantization_config'] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4' - ) - else: - kwargs['torch_dtype'] = torch.float16 - - if use_flash_attn: - kwargs['attn_implementation'] = 'flash_attention_2' - - if 'llava' in model_name.lower(): - # Load LLaVA model + @classmethod + def build(cls, model_path, model_base, model_name, **kwargs): if 'lora' in model_name.lower() and model_base is None: - warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. 
Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') + warnings.warn( + 'There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, \ + please provide the `model_base` argument. Detailed instruction: \ + https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') if 'lora' in model_name.lower() and model_base is not None: from llava.model.language_model.llava_llama import LlavaConfig lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) @@ -57,12 +29,15 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features if model.lm_head.weight.shape[0] != token_num: - model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) - model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.lm_head.weight = torch.nn.Parameter( + torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter( + torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) print('Loading additional LLaVA weights...') if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): - non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') + non_lora_trainables = ( + torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')) else: # this is probably from HF Hub from huggingface_hub import hf_hub_download @@ -72,10 +47,15 @@ def load_from_hf(repo_id, filename, subfolder=None): filename=filename, subfolder=subfolder) return torch.load(cache_file, map_location='cpu') + non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') - non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} + non_lora_trainables = { + (k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items() + } if any(k.startswith('model.model.') for k in non_lora_trainables): - non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} + non_lora_trainables = { + (k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items() + } model.load_state_dict(non_lora_trainables, strict=False) from peft import PeftModel @@ -89,14 +69,17 @@ def load_from_hf(repo_id, filename, subfolder=None): print('Loading LLaVA from base model...') if 'mpt' in model_name.lower(): if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): - shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) + shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), + os.path.join(model_path, 'configuration_mpt.py')) tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, + **kwargs) else: 
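                # non-MPT branch: the tokenizer comes from the Llama base model, the LLaVA config from model_path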
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) cfg_pretrained = AutoConfig.from_pretrained(model_path) - model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, + **kwargs) mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} @@ -119,31 +102,7 @@ def load_from_hf(repo_id, filename, subfolder=None): low_cpu_mem_usage=True, **kwargs ) - else: - # Load language model - if model_base is not None: - # PEFT model - from peft import PeftModel - tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) - model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs) - print(f"Loading LoRA weights from {model_path}") - model = PeftModel.from_pretrained(model, model_path) - print(f"Merging weights") - model = model.merge_and_unload() - print('Convert to FP16...') - model.to(torch.float16) - else: - use_fast = False - if 'mpt' in model_name.lower(): - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) - model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) - else: - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) - model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) - image_processor = None - - if 'llava' in model_name.lower(): mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) if mm_use_im_patch_token: @@ -153,15 +112,11 @@ def load_from_hf(repo_id, filename, subfolder=None): model.resize_token_embeddings(len(tokenizer)) vision_tower = model.get_vision_tower() + device_map = kwargs.get('device_map', 'auto') if not vision_tower.is_loaded: vision_tower.load_model(device_map=device_map) if device_map != 'auto': vision_tower.to(device=device_map[''], dtype=torch.float16) image_processor = vision_tower.image_processor - if hasattr(model.config, "max_sequence_length"): - context_len = model.config.max_sequence_length - else: - context_len = 2048 - - return tokenizer, model, image_processor, context_len + return tokenizer, model, image_processor diff --git a/builder/omnillm_builder.py b/builder/omnillm_builder.py new file mode 100644 index 0000000..c055755 --- /dev/null +++ b/builder/omnillm_builder.py @@ -0,0 +1,47 @@ +from builder.builder import ModelBuilder + +from transformers import AutoTokenizer +import torch +from omnilmm.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from omnilmm.model.omnilmm import OmniLMMForCausalLM +from omnilmm.model.utils import build_transform + + +class OmniLLMBuilder(ModelBuilder): + @classmethod + def judge_able_to_build(cls, model_name: str) -> bool: + lower_name = model_name.lower() + return 'omnillm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name) + + @classmethod + def build(cls, model_path, _, model_name, **kwargs): + tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048) + + if False: + # model on multiple devices for small size gpu memory (Nvidia 3090 24G x2) + with init_empty_weights(): + model = OmniLMMForCausalLM.from_pretrained(model_name, tune_clip=True, torch_dtype=torch.bfloat16) + 
model = load_checkpoint_and_dispatch(model, model_name, dtype=torch.bfloat16, + device_map="auto", + no_split_module_classes=['Eva', 'MistralDecoderLayer', 'ModuleList', + 'Resampler'] + ) + else: + model = OmniLMMForCausalLM.from_pretrained( + model_path, tune_clip=True, torch_dtype=torch.bfloat16 + ).to(device='cuda', dtype=torch.bfloat16) + + img_processor = build_transform( + is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP') + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN], special_tokens=True) + vision_config = model.model.vision_config + vision_config.im_patch_token = tokenizer.convert_tokens_to_ids( + [DEFAULT_IMAGE_PATCH_TOKEN])[0] + vision_config.use_im_start_end = mm_use_im_start_end + vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids( + [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN]) + + return tokenizer, model, img_processor diff --git a/chat.py b/chat.py index eb2a125..f427588 100644 --- a/chat.py +++ b/chat.py @@ -1,23 +1,17 @@ import json import torch -from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -from llava.conversation import conv_templates, SeparatorStyle -from llava.model.builder import load_pretrained_model +from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, \ + DEFAULT_IMAGE_PATCH_TOKEN +from llava.conversation import conv_templates +from builder.builder import load_pretrained_model from llava.utils import disable_torch_init -from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path +from llava.mm_utils import tokenizer_image_token, process_images from PIL import Image import base64 import io import os -from omnilmm.model.omnilmm import OmniLMMForCausalLM -from omnilmm.model.utils import build_transform from omnilmm.train.train_utils import omni_preprocess -from transformers import AutoTokenizer, AutoModel -DEFAULT_IMAGE_TOKEN = "" -DEFAULT_IMAGE_PATCH_TOKEN = "" -DEFAULT_IM_START_TOKEN = "" -DEFAULT_IM_END_TOKEN = "" def init_omni_lmm(model_path): @@ -25,50 +19,22 @@ def init_omni_lmm(model_path): disable_torch_init() model_name = os.path.expanduser(model_path) print(f'Load RLAIF-V-12B model and tokenizer from {model_name}') - tokenizer = AutoTokenizer.from_pretrained( - model_name, model_max_length=2048) - - if False: - # model on multiple devices for small size gpu memory (Nvidia 3090 24G x2) - with init_empty_weights(): - model = OmniLMMForCausalLM.from_pretrained(model_name, tune_clip=True, torch_dtype=torch.bfloat16) - model = load_checkpoint_and_dispatch(model, model_name, dtype=torch.bfloat16, - device_map="auto", no_split_module_classes=['Eva','MistralDecoderLayer', 'ModuleList', 'Resampler'] - ) - else: - model = OmniLMMForCausalLM.from_pretrained( - model_name, tune_clip=True, torch_dtype=torch.bfloat16 - ).to(device='cuda', dtype=torch.bfloat16) - - image_processor = build_transform( - is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP') - - mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) - assert mm_use_im_start_end - - tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, - DEFAULT_IM_END_TOKEN], special_tokens=True) - - - vision_config = model.model.vision_config - vision_config.im_patch_token = 
tokenizer.convert_tokens_to_ids(
-        [DEFAULT_IMAGE_PATCH_TOKEN])[0]
-    vision_config.use_im_start_end = mm_use_im_start_end
-    vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids(
-        [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
+    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, model_name)
 
     image_token_len = model.model.config.num_query
 
     return model, image_processor, image_token_len, tokenizer
 
+
 def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token):
     if '<image>' in question_text[0]['content']:
         question_text[0]['content'] = question_text[0]['content'].replace(
             '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token)
     else:
         question_text[0]['content'] = im_st_token + im_patch_token * \
-            image_token_len + im_ed_token + '\n' + question_text[0]['content']
+                                      image_token_len + im_ed_token + '\n' + question_text[0]['content']
 
     return question_text
 
+
 def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
     if isinstance(question, str):
         question = [{"role": "user", "content": question}]
 
@@ -78,8 +44,8 @@
     conversation = question
 
     data_dict = omni_preprocess(sources=[conversation],
-                                  tokenizer=tokenizer,
-                                  generation=True)
+                                tokenizer=tokenizer,
+                                generation=True)
 
     data_dict = dict(input_ids=data_dict["input_ids"][0],
                      labels=data_dict["labels"][0])
@@ -118,7 +84,7 @@ def decode(self, image, input_ids):
 
     def chat(self, input):
         im_64 = img2base64(input['image'])
-        msgs=json.dumps([{"role": "user", "content": input['question']}])
+        msgs = json.dumps([{"role": "user", "content": input['question']}])
 
         try:
             image = Image.open(io.BytesIO(base64.b64decode(im_64))).convert('RGB')
@@ -135,21 +101,23 @@
         return out
 
+
 def img2base64(file_name):
     with open(file_name, 'rb') as f:
         encoded_string = base64.b64encode(f.read())
         return encoded_string
 
+
 class RLAIFV7B:
     def __init__(self, model_path) -> None:
         disable_torch_init()
-        model_name='llava-v1.5-7b'
+        model_name = 'llava-v1.5-7b'
         tokenizer, model, image_processor, context_len = load_pretrained_model(
-            model_path, model_base=None,model_name=model_name, device_map={"": 'cuda'})
-        self.tokenizer=tokenizer
-        self.model=model
-        self.image_processor=image_processor
-        self.context_len=context_len
+            model_path, model_base=None, model_name=model_name, device_map={"": 'cuda'})
+        self.tokenizer = tokenizer
+        self.model = model
+        self.image_processor = image_processor
+        self.context_len = context_len
 
     def chat(self, input):
         msgs = input['question']
@@ -164,7 +132,8 @@ def chat(self, input):
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
 
-        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
+            0).cuda()
         image_tensor = process_images([image], self.image_processor, self.model.config)[0]
         with torch.inference_mode():
             output_ids = self.model.generate(
@@ -192,9 +161,8 @@ def chat(self, input):
 
 if __name__ == '__main__':
-
     chat_model = RLAIFVChat('RLAIF-V/RLAIF-V-7B')  # or 'HaoyeZhang/RLAIF-V-12B'
-    image_path="./examples/test.jpeg"
+    image_path = "./examples/test.jpeg"
     msgs = "Why did the car in the picture stop?"
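    # chat() takes a dict with an image file path and a question string and returns the answer text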
    inputs = {"image": image_path, "question": msgs}
     answer = chat_model.chat(inputs)
diff --git a/data_engine/__init__.py b/data_engine/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/data_engine/dataset.py b/data_engine/dataset.py
new file mode 100644
index 0000000..e00f82b
--- /dev/null
+++ b/data_engine/dataset.py
@@ -0,0 +1,57 @@
+from functools import partial
+
+import torch.utils.data as torch_data
+import json
+
+from muffin.data.datasets import bytes_to_PIL_image
+from muffin.train.train_utils import encode_multimodal_preference_sample, preprocess_v1
+
+
+class PreferenceInferenceDataset(torch_data.Dataset):
+    def __init__(self,
+                 data,
+                 tokenizer,
+                 image_token_len,
+                 img_processor,
+                 use_im_start_end=True):
+        self.data = data
+
+        self.mm_cfg = {
+            'image_processor': img_processor,
+            'is_multimodal': True,
+            'image_token_len': image_token_len,
+            'use_im_start_end': use_im_start_end,
+            'keep_image_tag': True
+        }
+        self.tokenizer = tokenizer
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        metainfo = {
+            "origin_dataset": sample['origin_dataset'],
+            "origin_split": json.loads(sample['origin_split']),
+            "origin_idx": sample['idx'],
+            "image_id": sample['image_path'],
+        }
+        question = {'from': 'human', 'value': f"<image>\n{sample['question']}"}
+        chosen = {'from': 'gpt', 'value': sample['chosen']}
+        rejected = {'from': 'gpt', 'value': sample['rejected']}
+
+        # image = bytes_to_PIL_image(sample['image']['bytes'])
+        image = bytes_to_PIL_image(sample['image_bytes'])
+
+        formatted_sample = {
+            'image': image,
+            "question": question,
+            "chosen": chosen,
+            "rejected": rejected,
+            "idx": sample['idx'],
+            "metainfo": metainfo
+        }
+        preprocess_func = partial(preprocess_v1, has_image=True)
+        rej_data_dict, win_data_dict = encode_multimodal_preference_sample(
+            formatted_sample, self.tokenizer, self.mm_cfg, preprocess_func=preprocess_func)
+        return rej_data_dict, win_data_dict
+
+    def __len__(self):
+        return len(self.data)
diff --git a/data_engine/logps_calculator.py b/data_engine/logps_calculator.py
new file mode 100644
index 0000000..0ef45d1
--- /dev/null
+++ b/data_engine/logps_calculator.py
@@ -0,0 +1,231 @@
+import os
+import json
+import tqdm
+import copy
+import itertools
+import argparse
+import pandas as pd
+import torch.utils.data as torch_data
+from functools import partial
+from muffin.train.train_utils import SFT_collator_fn
+import numpy as np
+import datasets as hf_datasets
+from transformers.image_processing_utils import BatchFeature
+
+from builder.builder import load_pretrained_model
+from muffin.eval.muffin_inference_logp import (get_batch_logps, InferenceSampler, concate_pad)
+from dataset import PreferenceInferenceDataset
+
+import torch
+import torch.distributed as dist
+
+
+def preference_collator_fn(instances, pad_token_id, use_12b_model=False):
+    rej_instances, win_instances = list(zip(*instances))
+    rej_batch = SFT_collator_fn(rej_instances, pad_token_id)
+    win_batch = SFT_collator_fn(win_instances, pad_token_id)
+
+    concatenated_input_ids = concate_pad(win_batch['input_ids'], rej_batch['input_ids'], pad_token_id)
+    concatenated_labels = concate_pad(win_batch['labels'], rej_batch['labels'], -100)
+    concatenated_attention_mask = concatenated_input_ids.ne(pad_token_id)
+
+    if not use_12b_model:
+        if isinstance(win_batch['images'][0], BatchFeature):
+            win_images = torch.stack([torch.tensor(img.pixel_values[0]) for img in win_batch['images']])
+        elif isinstance(win_batch['images'][0], np.ndarray):
+            win_images = torch.stack([torch.tensor(img) for 
img in win_batch['images']])
+        else:
+            win_images = win_batch['images']
+
+    batch = dict(
+        concatenated_input_ids=concatenated_input_ids,
+        concatenated_labels=concatenated_labels,
+        concatenated_attention_mask=concatenated_attention_mask,
+        win_input_ids=win_batch['input_ids'],
+        rej_input_ids=rej_batch['input_ids'],
+        win_labels=win_batch['labels'],
+        rej_labels=rej_batch['labels'],
+        win_attention_mask=win_batch['attention_mask'],
+        rej_attention_mask=rej_batch['attention_mask'],
+        images=win_batch['images'] if use_12b_model else win_images,
+    )
+    return batch
+
+
+def get_multimodal_sample_logps(model, dataloader, tokenizer, is_llava15=False):
+    win_logp_list = []
+    rej_logp_list = []
+
+    win_avg_logp_list = []
+    rej_avg_logp_list = []
+
+    win_per_token_logp_list = []
+    rej_per_token_logp_list = []
+
+    with torch.inference_mode():
+        idx = 0
+        for batch in tqdm.tqdm(dataloader):
+            for key in ['win', 'rej']:
+                input_ids = batch[f'{key}_input_ids'].cuda()
+                # tokens = tokenizer.batch_decode(copy.deepcopy(input_ids))
+                # print(tokens)
+                labels = batch[f'{key}_labels'].cuda()
+                attention_mask = batch[f'{key}_attention_mask'].cuda()
+
+                if is_llava15:
+                    # print("is llava15")
+                    (
+                        _,
+                        _,
+                        _,
+                        _,
+                        inputs_embeds,
+                        labels
+                    ) = model.prepare_inputs_labels_for_multimodal(
+                        input_ids=input_ids,
+                        position_ids=None,
+                        attention_mask=None,
+                        past_key_values=None,
+                        labels=labels,
+                        images=batch['images'].to(dtype=torch.bfloat16, device='cuda'),
+                    )
+                    output = model.forward(
+                        inputs_embeds=inputs_embeds,
+                        labels=None,
+                    )
+                else:
+                    output = model(
+                        input_ids=input_ids,
+                        labels=labels,
+                        attention_mask=attention_mask,
+                        images=batch['images'].to(dtype=torch.bfloat16, device='cuda'),
+                    )
+                per_token_logp, log_prob, average_log_prob = get_batch_logps(output.logits, labels, return_all=True)
+
+                # print(per_token_logp.shape, input_ids.shape, labels.shape, flush=True)
+                assert per_token_logp.size(1) >= input_ids.size(1) - 1
+                per_token_logp = per_token_logp.tolist()
+                # per_token_logp = [x[:input_ids[i].ne(tokenizer.pad_token_id).sum().item()] for i, x in enumerate(per_token_logp)]
+                log_prob = log_prob.tolist()
+                average_log_prob = average_log_prob.tolist()
+
+                if key == 'win':
+                    win_logp_list += log_prob
+                    win_avg_logp_list += average_log_prob
+                    win_per_token_logp_list += per_token_logp
+                else:
+                    rej_logp_list += log_prob
+                    rej_avg_logp_list += average_log_prob
+                    rej_per_token_logp_list += per_token_logp
+                # print(f'{key} logits in {output.logits.shape}, logp in {log_prob.shape} avg_logp in {average_log_prob.shape}', flush=True)
+
+    return win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list
+
+
+def write_logp_to_preference_parquet(origin_data, cache_file, logps, overwrite_logps=True):
+    out_data = []
+
+    for index in range(len(logps)):
+        line = origin_data[index]
+        logp_data = {}
+        logp_data['logps'] = logps[index]
+
+        new_line = copy.deepcopy(line)
+
+        if 'logps' in new_line.keys():
+            assert overwrite_logps, 'Found existing logp data, pass overwrite_logps=True to force overwriting'
+            new_line['logps'] = json.dumps(logp_data)
+
+        else:
+            assert (('question' in list(new_line.keys()))
+                    and ('chosen' in list(new_line.keys()))
+                    and ('rejected' in list(new_line.keys()))), \
+                f'Undefined data structure, expecting [Q, Win, Rej] in keys, got {new_line.keys()}'
+            new_line['logps'] = json.dumps(logp_data)
+
+        out_data.append(new_line)
+
+    df = None  # defined on every rank, so the `return df` below never hits an unbound name
+    if torch.distributed.get_rank() == 0:
+        step = 5000
+        for idx, start in 
enumerate(range(0, len(out_data), step)): + temp_data = out_data[start: min(start + step, len(out_data))] + df = pd.DataFrame(temp_data) + df.to_parquet(os.path.join(cache_file, f'RLAIF-V-Dataset-withlogp_{idx:03}-{len(temp_data)}.parquet')) + + torch.distributed.barrier() + return df + + +def inference_logp(model_path, dataset_path, output_file, use_12b_model=False): + dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + + if not use_12b_model: + model_name = 'llava-v1.5-7b' + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, + device_map={"": 'cuda'}) + else: + model_name = 'OmniLMM-12B' + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, + device_map={"": 'cuda'}) + image_token_len = model.model.config.num_query + + model = model.to(dtype=torch.bfloat16, device='cuda') + hf_data = hf_datasets.load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column("image", + hf_datasets.Image( + decode=False)) + dataset = PreferenceInferenceDataset(tokenizer=tokenizer, + data=hf_data, + image_token_len=0 if not use_12b_model else image_token_len, + img_processor=image_processor, + use_im_start_end=False) + collate_fn = partial(preference_collator_fn, pad_token_id=tokenizer.pad_token_id, use_12b_model=use_12b_model) + dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn, + num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset))) + + outputs = get_multimodal_sample_logps(model, dataloader, tokenizer, + is_llava15=True) # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list + + world_size = torch.distributed.get_world_size() + merged_outputs = [[None for _ in range(world_size)] for i in range(len(outputs))] + for i in range(len(outputs)): + torch.distributed.all_gather_object(merged_outputs[i], outputs[i]) + merged_outputs[i] = [_ for _ in itertools.chain.from_iterable(merged_outputs[i])] + + win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list \ + = merged_outputs + + logps = list(zip(win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, + rej_per_token_logp_list)) + + df = write_logp_to_preference_parquet(dataset.data, output_file, logps, overwrite_logps=True) + + torch.distributed.barrier() + + del model + return df + + +def main(reward_model_path: str, instruct_model_path: str, dataset_path: str, reward_model_output_file: str, + instruct_model_output_file: str, use_12b_model=False) -> None: + reward_model_output_df = inference_logp(reward_model_path, dataset_path, reward_model_output_file, use_12b_model=use_12b_model) + instruct_model_output_df = inference_logp(instruct_model_path, dataset_path, instruct_model_output_file, use_12b_model=use_12b_model) + + return reward_model_output_df, instruct_model_output_df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="inference and save the results") + parser.add_argument('--reward_model_path', type=str, default="/home/qmli/models/llava_1.5_7b") + parser.add_argument('--instruct_model_path', type=str, default="/home/qmli/models/llava_1.5_7b") + parser.add_argument('--dataset_path', type=str, default='/home/qmli/RLAIF-V/Refo_test/result/parquet') + 
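+    # both output dirs receive sharded parquet files named RLAIF-V-Dataset-withlogp_*.parquet
+    # (written by write_logp_to_preference_parquet); assumed launch is via torchrun, which sets
+    # the WORLD_SIZE/RANK/LOCAL_RANK environment variables read at the top of inference_logp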
parser.add_argument('--reward_model_output_file', type=str, default='/home/qmli/RLAIF-V/script_test')
+    parser.add_argument('--instruct_model_output_file', type=str, default='/home/qmli/RLAIF-V/script_test')
+    parser.add_argument('--use_12b_model', action='store_true')
+    parser.add_argument('--local-rank', type=int, default=0)
+    args = parser.parse_args()
+
+    main(args.reward_model_path, args.instruct_model_path, args.dataset_path, args.reward_model_output_file,
+         args.instruct_model_output_file, use_12b_model=args.use_12b_model)
diff --git a/muffin/constants.py b/muffin/constants.py
index a1ac41d..a402228 100644
--- a/muffin/constants.py
+++ b/muffin/constants.py
@@ -1,4 +1,11 @@
 CONTROLLER_HEART_BEAT_EXPIRATION = 30
 WORKER_HEART_BEAT_INTERVAL = 15
 
+IMAGE_TOKEN_INDEX = -200  # from llava 1.5, used to determine image in forward function
+IGNORE_INDEX = -100
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+
 LOGDIR = "."
diff --git a/muffin/eval/llava15_chair.py b/muffin/eval/llava15_chair.py
index 2b8c366..cee3807 100644
--- a/muffin/eval/llava15_chair.py
+++ b/muffin/eval/llava15_chair.py
@@ -6,12 +6,11 @@
 import shortuuid
 import base64
 import io
-import sys
 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-from llava.conversation import conv_templates, SeparatorStyle
-from llava.model.builder import load_pretrained_model
+from llava.conversation import conv_templates
+from builder.builder import load_pretrained_model
 from llava.utils import disable_torch_init
-from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+from llava.mm_utils import tokenizer_image_token, process_images
 from PIL import Image
 import math
 
diff --git a/muffin/eval/muffin_inference_logp.py b/muffin/eval/muffin_inference_logp.py
index 9eb3ffe..4cdcda1 100644
--- a/muffin/eval/muffin_inference_logp.py
+++ b/muffin/eval/muffin_inference_logp.py
@@ -55,6 +55,7 @@ def get_batch_logps_minicpm(logits: torch.FloatTensor, labels: torch.LongTensor,
 
 class InferenceSampler(torch.utils.data.sampler.Sampler):
     def __init__(self, size):
+        super().__init__()
         self._size = int(size)
         assert size > 0
         self._rank = torch.distributed.get_rank()
diff --git a/muffin/eval/muffin_vqa.py b/muffin/eval/muffin_vqa.py
index 2b8c366..cee3807 100644
--- a/muffin/eval/muffin_vqa.py
+++ b/muffin/eval/muffin_vqa.py
@@ -6,12 +6,11 @@
 import shortuuid
 import base64
 import io
-import sys
 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-from llava.conversation import conv_templates, SeparatorStyle
-from llava.model.builder import load_pretrained_model
+from llava.conversation import conv_templates
+from builder.builder import load_pretrained_model
 from llava.utils import disable_torch_init
-from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+from llava.mm_utils import tokenizer_image_token, process_images
 from PIL import Image
 import math
 
diff --git a/muffin/llava15_gen_data.py b/muffin/llava15_gen_data.py
index 4a7de37..4b92792 100644
--- a/muffin/llava15_gen_data.py
+++ b/muffin/llava15_gen_data.py
@@ -14,12 +14,11 @@
 
 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from llava.conversation import conv_templates
-from llava.model.builder import load_pretrained_model
+from builder.builder import load_pretrained_model
 from 
llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path def torch_pad_sequence(sequence, padding_value, batch_first=True, padding_side='right'): - if padding_side == 'right': sequence = torch.nn.utils.rnn.pad_sequence( sequence, @@ -35,9 +34,11 @@ def torch_pad_sequence(sequence, padding_value, batch_first=True, padding_side=' raise NotImplementedError(f'padding_size={padding_side}') return sequence + class InferenceSampler(torch.utils.data.sampler.Sampler): def __init__(self, size): + super().__init__() self._size = int(size) assert size > 0 self._rank = torch.distributed.get_rank() @@ -61,6 +62,7 @@ def __iter__(self): def __len__(self): return len(self._local_indices) + class GenDataset(torch_data.Dataset): def __init__(self, qa_file, question_process, max_size, start=0, end=-1, repeat_time=1): ''' @@ -74,7 +76,7 @@ def __init__(self, qa_file, question_process, max_size, start=0, end=-1, repeat_ try: self.qa_data = [json.loads(line) for line in open(self.qa_file)] if isinstance(self.qa_data[0], list): - self.qa_data = self.qa_data[0] # unwrap one-line json question file + self.qa_data = self.qa_data[0] # unwrap one-line json question file except: try: with open(self.qa_file, "r") as f: @@ -130,7 +132,7 @@ def __getitem__(self, index): # print("in metainfos") image = Image.open(item['metainfos']['image_path']).convert('RGB') - metainfo = {key:value for key,value in item.items() if key not in ["image_id", "question", "image"]} + metainfo = {key: value for key, value in item.items() if key not in ["image_id", "question", "image"]} raw_question = item['question'] @@ -138,7 +140,7 @@ def __getitem__(self, index): # print("question_input_ids:", question_input_ids) return { - 'question_id': item['question_id'] if 'question_id' in item else self.start_idx+index, + 'question_id': item['question_id'] if 'question_id' in item else self.start_idx + index, 'image': image, 'question_input_ids': question_input_ids, 'raw_question': raw_question, @@ -149,6 +151,7 @@ def __getitem__(self, index): def __len__(self): return len(self.qa_data) + def wrap_question_for_llava15(question, tokenizer, mm_use_im_start_end, conv_mode): qs = question if DEFAULT_IMAGE_TOKEN in qs: @@ -168,6 +171,7 @@ def wrap_question_for_llava15(question, tokenizer, mm_use_im_start_end, conv_mod return input_ids + def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): input_ids = [torch.as_tensor(x['question_input_ids']) for x in data_list] @@ -203,6 +207,7 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): return data + if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -240,14 +245,18 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): print(f'Init Rank-{torch.distributed.get_rank()}') model_path = os.path.expanduser(args.checkpoint) model_name = get_model_name_from_path(model_path) - tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, device_map={"": 'cuda'}) # device_map={"": 'cuda'} + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, + device_map={ + "": 'cuda'}) # device_map={"": 'cuda'} random.seed(args.seed) question_process_func = partial( - wrap_question_for_llava15, tokenizer=tokenizer, mm_use_im_start_end=model.config.mm_use_im_start_end, conv_mode=args.conv_mode) + wrap_question_for_llava15, tokenizer=tokenizer, mm_use_im_start_end=model.config.mm_use_im_start_end, + 
conv_mode=args.conv_mode) - dataset = GenDataset(args.ds_name, question_process_func, max_size=args.max_sample, start=args.start_pos, end=args.end_pos, repeat_time=args.repeat) + dataset = GenDataset(args.ds_name, question_process_func, max_size=args.max_sample, start=args.start_pos, + end=args.end_pos, repeat_time=args.repeat) print(f'Dataset size is {len(dataset)}') collate_fn = partial(llava15_qa_colloator_fn, tokenizer=tokenizer, @@ -276,7 +285,7 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): # print(f'input_ids: {batch["input_ids"]}') # print(f'Input: {tokenizer.batch_decode(batch["input_ids"])}' # f'input_ids: {batch["input_ids"]}') - # f'attn_mask: {batch["attention_mask"]}') + # f'attn_mask: {batch["attention_mask"]}') if args.is_yesno: output = model.generate( inputs=batch['input_ids'].cuda(), @@ -293,10 +302,15 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): # print("output_scores len:", len(output.scores)) output_scores_all = torch.stack(output.scores, dim=0) # print(output_scores_all.shape) - output_scores_reshape = (batch['input_ids'].shape[0], len(output.scores), args.num_beam, output.scores[0].shape[-1]) + output_scores_reshape = ( + batch['input_ids'].shape[0], len(output.scores), args.num_beam, output.scores[0].shape[-1]) new_output_scores = output_scores_all.view(output_scores_reshape) - for question, output_ids, output_scores, question_id, metainfos in zip(batch['raw_questions'], output.sequences, new_output_scores, batch['question_id'], batch['metainfos']): + for question, output_ids, output_scores, question_id, metainfos in zip(batch['raw_questions'], + output.sequences, + new_output_scores, + batch['question_id'], + batch['metainfos']): response = tokenizer.decode( output_ids, skip_special_tokens=True) @@ -324,13 +338,13 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): }) else: outputs.append({ - 'question_id': question_id, - 'raw_question': question, - 'answer': response, - 'scores': item_scores, - 'metainfos': metainfos, - 'model_path': args.checkpoint - }) + 'question_id': question_id, + 'raw_question': question, + 'answer': response, + 'scores': item_scores, + 'metainfos': metainfos, + 'model_path': args.checkpoint + }) else: if args.num_beam >= 1: @@ -357,9 +371,10 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): return_dict_in_generate=True) # print(output.scores, flush=True) - for question, output_ids, question_id, metainfos in zip(batch['raw_questions'], output.sequences, batch['question_id'], batch['metainfos']): + for question, output_ids, question_id, metainfos in zip(batch['raw_questions'], output.sequences, + batch['question_id'], batch['metainfos']): response = tokenizer.decode( - output_ids, skip_special_tokens=True) + output_ids, skip_special_tokens=True) response = response.strip() if 'ds_question_id' in metainfos: diff --git a/muffin/train/train_utils.py b/muffin/train/train_utils.py index 3c84031..f7d5d9f 100644 --- a/muffin/train/train_utils.py +++ b/muffin/train/train_utils.py @@ -14,16 +14,9 @@ from typing import Dict, Optional, Sequence from muffin import conversation as conversation_lib from packaging import version - +from muffin.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_TOKEN IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') -IMAGE_TOKEN_INDEX = -200 # from llava 1.5, used to 
determin image in forward function
-IGNORE_INDEX = -100
-DEFAULT_IMAGE_TOKEN = "<image>"
-DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
-DEFAULT_IM_START_TOKEN = "<im_start>"
-DEFAULT_IM_END_TOKEN = "<im_end>"
-
 
 def _tokenize_fn(strings: Sequence[str],
                  tokenizer: transformers.PreTrainedTokenizer) -> Dict:
diff --git a/omnilmm/constants.py b/omnilmm/constants.py
index 16bfb27..3494f5c 100644
--- a/omnilmm/constants.py
+++ b/omnilmm/constants.py
@@ -1,4 +1,10 @@
 CONTROLLER_HEART_BEAT_EXPIRATION = 30
 WORKER_HEART_BEAT_INTERVAL = 15
 
-LOGDIR = "."
\ No newline at end of file
+IGNORE_INDEX = -100
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+
+LOGDIR = "."
diff --git a/omnilmm/model/omnilmm.py b/omnilmm/model/omnilmm.py
index 052d9cc..2281434 100644
--- a/omnilmm/model/omnilmm.py
+++ b/omnilmm/model/omnilmm.py
@@ -13,10 +13,7 @@
 from omnilmm.model.utils import build_transform
 from omnilmm.model.resampler import Resampler
 
-
-DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
-DEFAULT_IM_START_TOKEN = "<im_start>"
-DEFAULT_IM_END_TOKEN = "<im_end>"
+from omnilmm.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 
 
 class OmniLMMConfig(MistralConfig):
@@ -115,7 +112,7 @@ def get_vision_embedding(self, pixel_values):
             pixel_values.type(dtype))
         if hasattr(vision_tower, 'num_prefix_tokens') and vision_tower.num_prefix_tokens > 0:
             vision_embedding = vision_embedding[:,
-                               vision_tower.num_prefix_tokens:]
+                                                vision_tower.num_prefix_tokens:]
 
         res = self.resampler(vision_embedding)
         return res
@@ -132,16 +129,21 @@ def get_vllm_embedding(self, data):
         else:
             vision_hidden_states = data['vision_hidden_states']
 
-        #vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb
+        # vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb
         inputs_embeds = self.embed_tokens(data['input_ids'])
         vision_hidden_states = [i.type(inputs_embeds.dtype)
-                                if isinstance(i, torch.Tensor) else i for i in vision_hidden_states
-                                ]
-
+                                if isinstance(i, torch.Tensor) else i for i in vision_hidden_states
+                                ]
 
         # HACK: replace back original embeddings for LLaVA pretraining
         orig_embeds_params = getattr(self, 'orig_embeds_params', None)
 
+        dummy_image_features = torch.zeros(
+            self.config.num_query,
+            self.config.hidden_size,
+            device=inputs_embeds.device,
+            dtype=inputs_embeds.dtype)
+
         new_input_embeds = []
         cur_image_idx = 0
         for cur_input_ids, cur_input_embeds in zip(data['input_ids'], inputs_embeds):
@@ -154,7 +156,8 @@ def get_vllm_embedding(self, data):
             if self.vision_config.use_im_start_end:
                 cur_image_features = vision_hidden_states[cur_image_idx]
                 num_patches = cur_image_features.shape[0]
-                if (cur_input_ids == self.vision_config.im_start_token).sum() != (cur_input_ids == self.vision_config.im_end_token).sum():
+                if (cur_input_ids == self.vision_config.im_start_token).sum() != (
+                        cur_input_ids == self.vision_config.im_end_token).sum():
                     raise ValueError(
                         "The number of image start tokens and image end tokens should be the same.")
                 image_start_tokens = torch.where(
@@ -167,11 +170,18 @@ def get_vllm_embedding(self, data):
                     raise ValueError(
                         "The image end token should follow the image start token.")
                 if orig_embeds_params is not None:
-                    cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features,
-                                                      cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + 
num_patches + 2:].detach()), dim=0) + cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), + cur_input_embeds[ + image_start_token_pos:image_start_token_pos + 1], + cur_image_features, + cur_input_embeds[ + image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], + cur_input_embeds[ + image_start_token_pos + num_patches + 2:].detach()), dim=0) else: cur_new_input_embeds = torch.cat( - (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0) + (cur_input_embeds[:image_start_token_pos + 1], cur_image_features, + cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0) cur_image_idx += 1 new_input_embeds.append(cur_new_input_embeds) else: @@ -181,80 +191,91 @@ def get_vllm_embedding(self, data): return inputs_embeds, vision_hidden_states def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - images: Optional[torch.FloatTensor] = None, - return_dict: Optional[bool] = None, - **kwargs + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + **kwargs ) -> Union[Tuple, BaseModelOutputWithPast]: # HACK: replace back original embeddings for LLaVA pretraining orig_embeds_params = getattr(self, 'orig_embeds_params', None) if inputs_embeds is None and past_key_values is None: - inputs_embeds = self.embed_tokens(input_ids) + inputs_embeds = self.embed_tokens(input_ids) - vision_tower = getattr(self, 'vision_tower', None) - if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None: + vision_tower = getattr(self, 'vision_tower', None) + if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None: - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.get_vision_embedding(image.unsqueeze(0))[ - 0] - image_features.append(image_forward_out) - else: - image_features = self.get_vision_embedding(images) - - dummy_image_features = torch.zeros( - self.config.num_query, - self.config.hidden_size, - device=inputs_embeds.device, - dtype=inputs_embeds.dtype) - - new_input_embeds = [] - cur_image_idx = 0 - for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds): - if (cur_input_ids == self.vision_config.im_patch_token).sum() == 0: - # multimodal LLM, but the current sample is not multimodal - cur_input_embeds = cur_input_embeds + \ - (0. 
* dummy_image_features).sum() - new_input_embeds.append(cur_input_embeds) - continue - - if self.vision_config.use_im_start_end: - cur_image_features = image_features[cur_image_idx] - num_patches = cur_image_features.shape[0] - if (cur_input_ids == self.vision_config.im_start_token).sum() != (cur_input_ids == self.vision_config.im_end_token).sum(): - raise ValueError( - "The number of image start tokens and image end tokens should be the same.") - image_start_tokens = torch.where( - cur_input_ids == self.vision_config.im_start_token)[0] - for image_start_token_pos in image_start_tokens: - cur_image_features = image_features[cur_image_idx].to( - device=cur_input_embeds.device) + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.get_vision_embedding(image.unsqueeze(0))[ + 0] + image_features.append(image_forward_out) + else: + image_features = self.get_vision_embedding(images) + + dummy_image_features = torch.zeros( + self.config.num_query, + self.config.hidden_size, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype) + + new_input_embeds = [] + cur_image_idx = 0 + for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds): + if (cur_input_ids == self.vision_config.im_patch_token).sum() == 0: + # multimodal LLM, but the current sample is not multimodal + cur_input_embeds = cur_input_embeds + \ + (0. * dummy_image_features).sum() + new_input_embeds.append(cur_input_embeds) + continue + + if self.vision_config.use_im_start_end: + cur_image_features = image_features[cur_image_idx] num_patches = cur_image_features.shape[0] - if cur_input_ids[image_start_token_pos + num_patches + 1] != self.vision_config.im_end_token: + if (cur_input_ids == self.vision_config.im_start_token).sum() != ( + cur_input_ids == self.vision_config.im_end_token).sum(): raise ValueError( - "The image end token should follow the image start token.") - if orig_embeds_params is not None: - cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, - cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0) - else: - cur_new_input_embeds = torch.cat( - (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0) - cur_image_idx += 1 - new_input_embeds.append(cur_new_input_embeds) - else: - raise NotImplementedError - inputs_embeds = torch.stack(new_input_embeds, dim=0) - input_ids = None + "The number of image start tokens and image end tokens should be the same.") + image_start_tokens = torch.where( + cur_input_ids == self.vision_config.im_start_token)[0] + for image_start_token_pos in image_start_tokens: + cur_image_features = image_features[cur_image_idx].to( + device=cur_input_embeds.device) + num_patches = cur_image_features.shape[0] + if cur_input_ids[ + image_start_token_pos + num_patches + 1] != self.vision_config.im_end_token: + raise ValueError( + "The image end token should follow the image start token.") + if orig_embeds_params is not None: + cur_new_input_embeds = ( + torch.cat((cur_input_embeds[:image_start_token_pos].detach(), + cur_input_embeds[ + image_start_token_pos:image_start_token_pos + 1], + cur_image_features, + cur_input_embeds[ + image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], + cur_input_embeds[ + 
image_start_token_pos + num_patches + 2:].detach()), + dim=0)) + else: + cur_new_input_embeds = torch.cat( + (cur_input_embeds[:image_start_token_pos + 1], cur_image_features, + cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0) + cur_image_idx += 1 + new_input_embeds.append(cur_new_input_embeds) + else: + raise NotImplementedError + inputs_embeds = torch.stack(new_input_embeds, dim=0) + input_ids = None return super(OmniLMMModel, self).forward( input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, @@ -280,18 +301,18 @@ def __init__(self, config, mm_vision_tower=None, tune_clip=True): self.post_init() def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - images: Optional[torch.FloatTensor] = None, - return_dict: Optional[bool] = None, - **kwargs + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + **kwargs ) -> Union[Tuple, CausalLMOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -347,7 +368,7 @@ def forward( # TODO could be removed for generate_vllm() def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): if past_key_values: input_ids = input_ids[:, -1:] @@ -369,12 +390,12 @@ def prepare_inputs_for_generation( return model_inputs def generate_vllm( - self, - input_ids: torch.LongTensor = None, - images: Optional[torch.FloatTensor] = None, - vision_hidden_states=None, - return_vision_hidden_states=False, - **kwargs + self, + input_ids: torch.LongTensor = None, + images: Optional[torch.FloatTensor] = None, + vision_hidden_states=None, + return_vision_hidden_states=False, + **kwargs ): model_inputs = {'input_ids': input_ids} if vision_hidden_states is None: @@ -395,7 +416,6 @@ def generate_vllm( return result - def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device, tune_mm_mlp_adapter=False): self.model.vision_config.use_im_start_end = mm_use_im_start_end @@ -448,9 +468,11 @@ def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device, self.model.vision_config.im_patch_token = tokenizer.convert_tokens_to_ids( [DEFAULT_IMAGE_PATCH_TOKEN])[0] - print(f'Tokenizer: {tokenizer}\n patch_token_id: {self.model.vision_config.im_patch_token}, visoin_config: {self.model.vision_config}', flush=True) + print( + f'Tokenizer: {tokenizer}\n patch_token_id: {self.model.vision_config.im_patch_token}, visoin_config: {self.model.vision_config}', + flush=True) # exit() AutoConfig.register("omnilmm", OmniLMMConfig) -AutoModelForCausalLM.register(OmniLMMConfig, OmniLMMForCausalLM) \ No newline at end of file 
+AutoModelForCausalLM.register(OmniLMMConfig, OmniLMMForCausalLM) diff --git a/omnilmm/train/train_utils.py b/omnilmm/train/train_utils.py index 83ec74e..e5c48c6 100644 --- a/omnilmm/train/train_utils.py +++ b/omnilmm/train/train_utils.py @@ -12,12 +12,6 @@ from typing import Dict, Optional, Sequence from omnilmm import conversation as conversation_lib -IGNORE_INDEX = -100 -DEFAULT_IMAGE_TOKEN = "" -DEFAULT_IMAGE_PATCH_TOKEN = "" -DEFAULT_IM_START_TOKEN = "" -DEFAULT_IM_END_TOKEN = "" - def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict: From 627684d96eb3c3a29c53d64812d8937c584f70ea Mon Sep 17 00:00:00 2001 From: MagicYao Date: Sat, 23 Nov 2024 17:47:47 +0800 Subject: [PATCH 02/18] [upgrade] stage 1 passed --- builder/builder.py | 6 +- builder/llava_builder.py | 3 +- builder/omnillm_builder.py | 2 +- data_engine/dataset.py | 15 +++-- data_engine/logps_calculator.py | 83 ++++++++++++++++++---------- data_engine/logps_gen.sh | 24 ++++++++ muffin/eval/muffin_inference_logp.py | 1 - muffin/llava15_gen_data.py | 1 - muffin/train/train_utils.py | 18 +++--- pyproject.toml | 10 ++-- 10 files changed, 110 insertions(+), 53 deletions(-) create mode 100644 data_engine/logps_gen.sh diff --git a/builder/builder.py b/builder/builder.py index 5a7a130..06d7a9f 100644 --- a/builder/builder.py +++ b/builder/builder.py @@ -65,9 +65,9 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l kwargs['attn_implementation'] = 'flash_attention_2' # import here to avoid circular import - from llava_builder import LLaVABuilder - from omnillm_builder import OmniLLMBuilder - from language_model_builder import LanguageModelBuilder + from .llava_builder import LLaVABuilder + from .omnillm_builder import OmniLLMBuilder + from .language_model_builder import LanguageModelBuilder # Note: please put LanguageModelBuilder at the end of the list if you want you add your own builder model_builder_list = [LLaVABuilder, OmniLLMBuilder, LanguageModelBuilder] diff --git a/builder/llava_builder.py b/builder/llava_builder.py index d59a303..6feac82 100644 --- a/builder/llava_builder.py +++ b/builder/llava_builder.py @@ -12,7 +12,8 @@ class LLaVABuilder(ModelBuilder): @classmethod def judge_able_to_build(cls, model_name: str) -> bool: - return 'llava' in model_name.lower() + lower_name = model_name.lower() + return 'llava' in lower_name or ('rlaif' in lower_name and '7b' in lower_name) @classmethod def build(cls, model_path, model_base, model_name, **kwargs): diff --git a/builder/omnillm_builder.py b/builder/omnillm_builder.py index c055755..74e88b0 100644 --- a/builder/omnillm_builder.py +++ b/builder/omnillm_builder.py @@ -11,7 +11,7 @@ class OmniLLMBuilder(ModelBuilder): @classmethod def judge_able_to_build(cls, model_name: str) -> bool: lower_name = model_name.lower() - return 'omnillm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name) + return 'omnilmm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name) @classmethod def build(cls, model_path, _, model_name, **kwargs): diff --git a/data_engine/dataset.py b/data_engine/dataset.py index e00f82b..3952fce 100644 --- a/data_engine/dataset.py +++ b/data_engine/dataset.py @@ -5,10 +5,12 @@ from muffin.data.datasets import bytes_to_PIL_image from muffin.train.train_utils import encode_multimodal_preference_sample, preprocess_v1 +from omnilmm.train.train_utils import omni_preprocess class PreferenceInferenceDataset(torch_data.Dataset): def __init__(self, + model_name, data, tokenizer, 
image_token_len,
@@ -25,6 +27,12 @@ def __init__(self,
         }
         self.tokenizer = tokenizer
 
+        lower_name = model_name.lower()
+        if "omni" in lower_name or ('rlaif' in lower_name and '12b' in lower_name):
+            self.preprocess_func = omni_preprocess
+        else:
+            self.preprocess_func = partial(preprocess_v1, has_image=True)
+
     def __getitem__(self, index):
         sample = self.data[index]
         metainfo = {
@@ -37,8 +45,8 @@ def __getitem__(self, index):
         chosen = {'from': 'gpt', 'value': sample['chosen']}
         rejected = {'from': 'gpt', 'value': sample['rejected']}
 
-        # image = bytes_to_PIL_image(sample['image']['bytes'])
-        image = bytes_to_PIL_image(sample['image_bytes'])
+        image = bytes_to_PIL_image(sample['image']['bytes'])
+        # image = bytes_to_PIL_image(sample['image_bytes'])
 
         formated_sample = {
             'image': image,
@@ -48,9 +56,8 @@ def __getitem__(self, index):
             "idx": sample['idx'],
             "metainfo": metainfo
         }
-        preprocess_func = partial(preprocess_v1, has_image=True)
         rej_data_dict, win_data_dict = encode_multimodal_preference_sample(
-            formated_sample, self.tokenizer, self.mm_cfg, preprocess_func=preprocess_func)
+            formated_sample, self.tokenizer, self.mm_cfg, preprocess_func=self.preprocess_func)
         return rej_data_dict, win_data_dict
 
     def __len__(self):
diff --git a/data_engine/logps_calculator.py b/data_engine/logps_calculator.py
index 0ef45d1..ee9acdb 100644
--- a/data_engine/logps_calculator.py
+++ b/data_engine/logps_calculator.py
@@ -20,7 +20,7 @@
 import torch.distributed as dist
 
 
-def preference_collator_fn(instances, pad_token_id, use_12b_model=False):
+def preference_collator_fn(instances, pad_token_id, is_omni=False):
     rej_instances, win_instances = list(zip(*instances))
     rej_batch = SFT_collator_fn(rej_instances, pad_token_id)
     win_batch = SFT_collator_fn(win_instances, pad_token_id)
@@ -29,7 +29,7 @@ def preference_collator_fn(instances, pad_token_id, use_12b_model=False):
     concatenated_labels = concate_pad(win_batch['labels'], rej_batch['labels'], -100)
     concatenated_attention_mask = concatenated_input_ids.ne(pad_token_id)
 
-    if not use_12b_model:
+    if not is_omni:
         if isinstance(win_batch['images'][0], BatchFeature):
             win_images = torch.stack([torch.tensor(img.pixel_values[0]) for img in win_batch['images']])
         elif isinstance(win_batch['images'][0], np.ndarray):
@@ -47,7 +47,7 @@ def preference_collator_fn(instances, pad_token_id, use_12b_model=False):
         rej_labels=rej_batch['labels'],
         win_attention_mask=win_batch['attention_mask'],
         rej_attention_mask=rej_batch['attention_mask'],
-        images=win_batch['images'] if use_12b_model else win_images,
+        images=win_batch['images'] if is_omni else win_images,
     )
     return batch
 
@@ -157,36 +157,54 @@ def write_logp_to_preference_parquet(origin_data, cache_file, logps, overwrite_l
     return df
 
 
-def inference_logp(model_path, dataset_path, output_file, use_12b_model=False):
+def inference_logp(
+        model_name,
+        model_path,
+        dataset_path,
+        output_file):
+    """
+    Args:
+        model_name: e.g. 
llava-v1.5-7b, OmniLMM-12B, RLAIF-V-12B
+        model_path: path to your model
+        dataset_path: path to the dataset (should follow the RLAIF-V-Dataset format)
+        output_file: path to the output file (logps)
+
+    Returns:
+
+    """
     dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')),
                             rank=int(os.getenv('RANK', '0')), )
     torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
 
-    if not use_12b_model:
-        model_name = 'llava-v1.5-7b'
-        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name,
-                                                                               device_map={"": 'cuda'})
-    else:
-        model_name = 'OmniLMM-12B'
-        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name,
-                                                                               device_map={"": 'cuda'})
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name,
+                                                                           device_map={"": 'cuda'})
 
+    image_token_len = 0
+    if hasattr(model, "model") and hasattr(model.model, "config") and hasattr(model.model.config, "num_query"):
         image_token_len = model.model.config.num_query
 
     model = model.to(dtype=torch.bfloat16, device='cuda')
     hf_data = hf_datasets.load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column("image",
                                                                                                hf_datasets.Image(
                                                                                                    decode=False))
-    dataset = PreferenceInferenceDataset(tokenizer=tokenizer,
+    dataset = PreferenceInferenceDataset(model_name=model_name,
+                                         tokenizer=tokenizer,
                                          data=hf_data,
-                                         image_token_len=0 if not use_12b_model else image_token_len,
+                                         image_token_len=image_token_len,
                                          img_processor=image_processor,
                                          use_im_start_end=False)
-    collate_fn = partial(preference_collator_fn, pad_token_id=tokenizer.pad_token_id, use_12b_model=use_12b_model)
+    collate_fn = partial(
+        preference_collator_fn,
+        pad_token_id=tokenizer.pad_token_id,
+        is_omni=("omni" in model_name.lower()))  # judge whether the model follows the omni structure
 
     dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn,
                                        num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset)))
 
-    outputs = get_multimodal_sample_logps(model, dataloader, tokenizer,
-                                          is_llava15=True)  # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list
+    outputs = get_multimodal_sample_logps(
+        # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list
+        model,
+        dataloader,
+        tokenizer,
+        is_llava15=("llava" in model_name.lower() or ("rlaif" in model_name.lower() and "7b" in model_path.lower())))  # judge whether the model follows the llava structure
 
     world_size = torch.distributed.get_world_size()
     merged_outputs = [[None for _ in range(world_size)] for i in range(len(outputs))]
@@ -208,24 +226,31 @@ def inference_logp(model_path, dataset_path, output_file, use_12b_model=False):
     return df
 
 
-def main(reward_model_path: str, instruct_model_path: str, dataset_path: str, reward_model_output_file: str,
-         instruct_model_output_file: str, use_12b_model=False) -> None:
-    reward_model_output_df = inference_logp(reward_model_path, dataset_path, reward_model_output_file, use_12b_model=use_12b_model)
-    instruct_model_output_df = inference_logp(instruct_model_path, dataset_path, instruct_model_output_file, use_12b_model=use_12b_model)
+def main(
+        reward_model_name: str,
+        reward_model_path: str,
+        instruct_model_name: str,
+        instruct_model_path: str,
+        dataset_path: str,
+        reward_model_output_file: str,
+        instruct_model_output_file: str) -> None:
+    reward_model_output_df = inference_logp(reward_model_name, reward_model_path, dataset_path, 
reward_model_output_file) + instruct_model_output_df = inference_logp(instruct_model_name, instruct_model_path, dataset_path, instruct_model_output_file) return reward_model_output_df, instruct_model_output_df if __name__ == "__main__": parser = argparse.ArgumentParser(description="inference and save the results") - parser.add_argument('--reward_model_path', type=str, default="/home/qmli/models/llava_1.5_7b") - parser.add_argument('--instruct_model_path', type=str, default="/home/qmli/models/llava_1.5_7b") - parser.add_argument('--dataset_path', type=str, default='/home/qmli/RLAIF-V/Refo_test/result/parquet') - parser.add_argument('--reward_model_output_file', type=str, default='/home/qmli/RLAIF-V/script_test') - parser.add_argument('--instruct_model_output_file', type=str, default='/home/qmli/RLAIF-V/script_test') - parser.add_argument('--use_12b_model', action='store_true') + parser.add_argument('--reward_model_name', type=str, default="RLAIF-V-7B") + parser.add_argument('--reward_model_path', type=str, default="/data/yaoshu/models/RLAIF-V-7B") + parser.add_argument('--instruct_model_name', type=str, default="RLAIF-V-12B") + parser.add_argument('--instruct_model_path', type=str, default="/data/yaoshu/models/RLAIF-V-12B") + parser.add_argument('--dataset_path', type=str, default='/data/yaoshu/dataset/RLAIF-V-Dataset') + parser.add_argument('--reward_model_output_file', type=str, default='/data/RLAIF-V-CC/results') + parser.add_argument('--instruct_model_output_file', type=str, default='/data/RLAIF-V-CC/results') parser.add_argument('--local-rank', type=int, default=0) args = parser.parse_args() - main(args.reward_model_path, args.instruct_model_path, args.dataset_path, args.reward_model_output_file, - args.instruct_model_output_file, use_12b_model=args.use_12b_model) + main(args.reward_model_name, args.reward_model_path, args.instruct_model_name, args.instruct_model_path, args.dataset_path, args.reward_model_output_file, + args.instruct_model_output_file) diff --git a/data_engine/logps_gen.sh b/data_engine/logps_gen.sh new file mode 100644 index 0000000..abc8aa5 --- /dev/null +++ b/data_engine/logps_gen.sh @@ -0,0 +1,24 @@ +#!/bin/bash +export PYTHONPATH=$PYTHONPATH:`realpath .` +export CUDA_VISIBLE_DEVICES=1 + +#GPUS_PER_NODE=1 +#NNODES=1 +#NODE_RANK=0 +export MASTER_ADDR=localhost +#MASTER_ADDR=13.13.19.1 +export MASTER_PORT=6001 +# +#DISTRIBUTED_ARGS=" +# --nproc_per_node $GPUS_PER_NODE \ +# --nnodes $NNODES \ +# --node_rank $NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT +#" + +#torchrun $DISTRIBUTED_ARGS data_engine/logps_calculator.py \ +# --use_12b_model_for_reward \ +# --use_12b_model_for_instruct + +python data_engine/logps_calculator.py \ No newline at end of file diff --git a/muffin/eval/muffin_inference_logp.py b/muffin/eval/muffin_inference_logp.py index 4cdcda1..9eb3ffe 100644 --- a/muffin/eval/muffin_inference_logp.py +++ b/muffin/eval/muffin_inference_logp.py @@ -55,7 +55,6 @@ def get_batch_logps_minicpm(logits: torch.FloatTensor, labels: torch.LongTensor, class InferenceSampler(torch.utils.data.sampler.Sampler): def __init__(self, size): - super().__init__() self._size = int(size) assert size > 0 self._rank = torch.distributed.get_rank() diff --git a/muffin/llava15_gen_data.py b/muffin/llava15_gen_data.py index 4b92792..22d2a00 100644 --- a/muffin/llava15_gen_data.py +++ b/muffin/llava15_gen_data.py @@ -38,7 +38,6 @@ def torch_pad_sequence(sequence, padding_value, batch_first=True, padding_side=' class 
InferenceSampler(torch.utils.data.sampler.Sampler): def __init__(self, size): - super().__init__() self._size = int(size) assert size > 0 self._rank = torch.distributed.get_rank() diff --git a/muffin/train/train_utils.py b/muffin/train/train_utils.py index f7d5d9f..205751b 100644 --- a/muffin/train/train_utils.py +++ b/muffin/train/train_utils.py @@ -65,13 +65,16 @@ def SFT_collator_fn(instances, pad_token_id): for instance in instances if 'image' in instance] if len(images) > 0: # possibly multi-image for each sample - if len(images[0].shape) == 4: - batch['images'] = images - elif all(x is not None and x.shape == images[0].shape for x in images): - import numpy - if isinstance(images[0], numpy.ndarray): - images = [torch.from_numpy(x) for x in images] - batch['images'] = torch.stack(images) + if hasattr(images[0], 'shape'): + if len(images[0].shape) == 4: + batch['images'] = images + elif all(x is not None and x.shape == images[0].shape for x in images): + import numpy + if isinstance(images[0], numpy.ndarray): + images = [torch.from_numpy(x) for x in images] + batch['images'] = torch.stack(images) + else: + batch['images'] = images else: batch['images'] = images else: @@ -340,4 +343,3 @@ def preprocess_v1( input_ids=input_ids, labels=targets, ) - diff --git a/pyproject.toml b/pyproject.toml index 5eeee30..561240f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,13 +14,13 @@ classifiers = [ ] dependencies = [ - "torch==2.0.1","deepspeed==0.11.1", "huggingface_hub==0.17.0", "jsonlines==4.0.0", + "torch==2.0.1","deepspeed==0.11.1", "huggingface-hub==0.23.1", "jsonlines==4.0.0", "matplotlib==3.8.0", "nltk==3.8.1", "numpy==1.25.2", "openai==1.30.1", "packaging==24.0", "pandas==2.2.2", "peft==0.10.0","sentencepiece==0.1.99", - "Pillow==10.3.0", "Requests==2.31.0", "shortuuid==1.0.13", - "spacy==3.7.2", "timm==0.9.10", "tokenizers==0.14.1", - "tqdm==4.66.1", "transformers==4.35.0","wandb==0.15.11", - "spacy==3.7.2","opencv-python==4.9.0.80", + "Pillow==10.3.0", "Requests==2.32.2", "shortuuid==1.0.13", + "spacy==3.7.2", "timm==0.9.10", "tokenizers==0.15.1", + "tqdm==4.67.0", "transformers==4.37.0","wandb==0.15.11", + "spacy==3.7.2","opencv-python==4.9.0.80", "datasets==2.21.0", "cffi==1.17.1" ] [project.optional-dependencies] From 644aecd3cbada601734793d67616186d4adc24bc Mon Sep 17 00:00:00 2001 From: MagicYao Date: Sun, 24 Nov 2024 17:49:52 +0800 Subject: [PATCH 03/18] [upgrade] stage 2 passed --- data_engine/data_engine.py | 126 ++++++++++++++++ data_engine/data_pair_builder.py | 149 +++++++++++++++++++ data_engine/dpo_data_filter/__init__.py | 0 data_engine/dpo_data_filter/filter.py | 69 +++++++++ data_engine/dpo_data_filter/length_filter.py | 40 +++++ data_engine/dpo_data_filter/num_filter.py | 59 ++++++++ data_engine/dpo_data_filter/same_filter.py | 24 +++ data_engine/logps_calculator.py | 47 +++--- data_engine/reward_computer.py | 112 ++++++++++++++ data_engine/run_engine.sh | 15 ++ 10 files changed, 621 insertions(+), 20 deletions(-) create mode 100644 data_engine/data_engine.py create mode 100644 data_engine/data_pair_builder.py create mode 100644 data_engine/dpo_data_filter/__init__.py create mode 100644 data_engine/dpo_data_filter/filter.py create mode 100644 data_engine/dpo_data_filter/length_filter.py create mode 100644 data_engine/dpo_data_filter/num_filter.py create mode 100644 data_engine/dpo_data_filter/same_filter.py create mode 100644 data_engine/reward_computer.py create mode 100644 data_engine/run_engine.sh diff --git a/data_engine/data_engine.py 
b/data_engine/data_engine.py new file mode 100644 index 0000000..e60bfe0 --- /dev/null +++ b/data_engine/data_engine.py @@ -0,0 +1,126 @@ +import os.path + +import pandas as pd + +import logps_calculator +import reward_computer +import data_pair_builder +from dpo_data_filter import filter +import argparse + + +def print_stage(idx, desc="", finish=False): + print("=" * 80) + if not finish: + print(f"Processing Stage {idx}: {desc}") + else: + print(f"Finish Stage {idx}") + print("=" * 80) + + +def dir_prepare(dir_to_check, clean=True): + if not os.path.exists(dir_to_check): + os.makedirs(dir_to_check) + elif clean: + if not os.path.isdir(dir_to_check): + for file in os.listdir(dir_to_check): + os.remove(os.path.join(dir_to_check, file)) + else: + os.remove(dir_to_check) + os.mkdir(dir_to_check) + + +def run( + reward_model_name, + reward_model_path, + instruct_model_name, + instruct_model_path, + dataset_path, + work_dir, + continue_from_stage=1, + sample_k=10, + rank=10, + distance=5 +): + reward_logps_output_dir = os.path.join(work_dir, "reward_logps") + instruct_logps_output_dir = os.path.join(work_dir, "instruct_logps") + if continue_from_stage <= 1: + print_stage(1, "Calculate logps") + dir_prepare(reward_logps_output_dir) + dir_prepare(instruct_logps_output_dir) + _ = logps_calculator.main( + reward_model_name, + reward_model_path, + instruct_model_name, + instruct_model_path, + dataset_path, + reward_logps_output_dir, + instruct_logps_output_dir) + print_stage(1, finish=True) + + if continue_from_stage <= 2: + print_stage(2, "DPO dataset construction") + + print_stage(2.1, "Calculate reward") + rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir) + print_stage(2.1, finish=True) + + print_stage(2.2, "Build DPO pairs") + dpo_pair = data_pair_builder.main(rewards, sample_k, rank, distance) + print_stage(2.2, finish=True) + + print_stage(2.3, "Filter DPO pairs") + data = filter.main(dpo_pair) + print_stage(2.3, finish=True) + + print_stage(2.4, "Save file to dataset format") + needed_keys = [ + "question", + "chosen", + "rejected", + "origin_dataset", + "origin_split", + "idx", + "image_path", + "ds_name", + "image"] + for item in data: + for key in list(item.keys()): + if key not in needed_keys: + del item[key] + df = pd.DataFrame(data) + output_file = os.path.join(work_dir, "dpo_dataset.parquet") + df.to_parquet(output_file) + print_stage(2.4, finish=True) + + print_stage(2, finish=True) + + print("Finish all stages, output file is saved to ", output_file) + print("Have a nice day!") + +if __name__ == "__main__": + args = argparse.ArgumentParser() + args.add_argument("--reward_model_name", type=str, help="The name of the reward model.") + args.add_argument("--reward_model_path", type=str, help="The path of the reward model.") + args.add_argument("--instruct_model_name", type=str, help="The name of the instruct model.") + args.add_argument("--instruct_model_path", type=str, help="The path of the instruct model.") + args.add_argument("--dataset_path", type=str, help="The path of the dataset.") + args.add_argument("--work_dir", type=str, help="The working directory.") + args.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.") + args.add_argument("--sample_k", type=int, default=10, help="The sample number k.") + args.add_argument("--rank", type=int, default=10, help="The rank number.") + args.add_argument("--distance", type=int, default=5, help="The distance.") + + args = args.parse_args() + run( + 
args.reward_model_name,
+        args.reward_model_path,
+        args.instruct_model_name,
+        args.instruct_model_path,
+        args.dataset_path,
+        args.work_dir,
+        args.continue_from_stage,
+        args.sample_k,
+        args.rank,
+        args.distance
+    )
diff --git a/data_engine/data_pair_builder.py b/data_engine/data_pair_builder.py
new file mode 100644
index 0000000..1095f8b
--- /dev/null
+++ b/data_engine/data_pair_builder.py
@@ -0,0 +1,149 @@
+from nltk import word_tokenize
+from tqdm import tqdm
+
+data_pairs = []
+
+
+def get_ranking_reward_data(sample_k, rewards):
+    sum_output = []
+    avg_output = []
+    # open the data file for reading and the output file for writing
+
+    data = list(rewards)
+    data_pairs = [data[i:i + sample_k] for i in range(0, len(data), sample_k)]
+
+    # print(len(data_pairs))
+    # print("*****")
+
+    # sort each group of data and write it out row by row
+    for data in tqdm(data_pairs):
+        # sort in descending order by sum and by avg
+        sum_sorted_data = sorted(data, key=lambda x: x['sum'], reverse=True)
+        avg_sorted_data = sorted(data, key=lambda x: x['avg'], reverse=True)
+
+        # print(sum_sorted_data[0]['idx'])
+
+        # write out the sum-sorted data row by row
+        for data in sum_sorted_data:
+            rank = sum_sorted_data.index(data) + 1
+            text = data['chosen']
+            word_count = len(word_tokenize(text))
+            sum_reward = data['sum']
+
+            sum_data_dict = {
+                "idx": data['idx'],
+                "rank": rank,
+                "word_count": word_count,
+                "sum_reward": sum_reward,
+                "question": data['question'],
+                "image": data['image'],
+                "text": text,
+            }
+            sum_output.append(sum_data_dict)  # append each dict as soon as it is built
+
+        # write out the avg-sorted data row by row
+        for data in avg_sorted_data:
+            rank = avg_sorted_data.index(data) + 1
+            text = data['chosen']
+            word_count = len(word_tokenize(text))
+            avg_reward = data['avg']
+
+            avg_data_dict = {
+                "idx": data['idx'],
+                "rank": rank,
+                "word_count": word_count,
+                "avg_reward": avg_reward,
+                "question": data['question'],
+                "image": data['image'],
+                "text": text,
+            }
+            avg_output.append(avg_data_dict)  # append each dict as soon as it is built
+
+    return sum_output, avg_output
+
+
+def pair_union(sum_reward, avg_reward, sample_k=10, rank=10, distance=5):
+    total_pairs = 0
+    total_used_pic = 0
+    flag = 0
+    dpo_pair = []
+
+    sum_reward_whole_data = list(sum_reward)
+    avg_reward_whole_data = list(avg_reward)
+    assert len(sum_reward_whole_data) == len(avg_reward_whole_data)
+
+    # print(len(sum_reward_whole_data))
+
+    for i in tqdm(range(0, len(sum_reward_whole_data), sample_k)):
+        idx = sum_reward_whole_data[i]['idx']
+        sum_reward_data = sum_reward_whole_data[i:i + sample_k]
+        avg_reward_data = avg_reward_whole_data[i:i + sample_k]
+        # top10 -> top rank
+        sum_top_rank = sum_reward_data[:rank]
+        sum_last_rank = sum_reward_data[-rank:]
+        avg_top_rank = avg_reward_data[:rank]
+        avg_last_rank = avg_reward_data[-rank:]
+
+        avg_top_rank_text = [data['text'] for data in avg_top_rank]
+        avg_last_rank_text = [data['text'] for data in avg_last_rank]
+
+        # check the union
+        chosen_answer = []
+        rejected_answer = []
+        question = ""
+        for data in sum_top_rank:
+            question = data["question"]
+            if data['text'] in avg_top_rank_text:
+                # print(f"chosen data: {data['text']}")
+                # print(f"chosen word count: {data['word_count']}")
+                chosen_answer.append((data['text'], data['word_count']))
+
+        # print("*****")
+
+        for data in sum_last_rank:
+            if data['text'] in avg_last_rank_text:
+                # print(f"rejected data: {data['text']}")
+                # print(f"rejected word count: {data['word_count']}")
+                rejected_answer.append((data['text'], data['word_count']))
+
+        sign = 0
+        # construct dpo pair if abs(dif(word_count)) < distance
+        for chosen_data in chosen_answer:
+            for rejected_data in rejected_answer:
+                if abs(chosen_data[1] - 
rejected_data[1]) < distance:
+                    sign = 1
+                    dpo_pair.append({
+                        "idx": idx,
+                        "question": question,
+                        "chosen": chosen_data[0],
+                        "rejected": rejected_data[0],
+                        "image": sum_reward_whole_data[i]['image']
+                    })
+                    total_pairs += 1
+                    if chosen_data[1] >= rejected_data[1]:
+                        flag += 1
+        if sign == 1:
+            total_used_pic += 1
+
+    return dpo_pair
+
+
+def main(rewards, sample_k=10, rank=10, distance=5):
+    sum_output, avg_output = get_ranking_reward_data(sample_k, rewards)
+    dpo_pair = pair_union(sum_output, avg_output, sample_k, rank, distance)
+    return dpo_pair
+
+
+if __name__ == "__main__":
+    pass
+    # args = argparse.ArgumentParser()
+    # args.add_argument("--reward_file", type=str, default="", help="The file path of the reward data.")
+    # args.add_argument("--dpo_pair_file", type=str, default="", help="The output file path of the dpo pair data.")
+    # args.add_argument("--sample_k", type=int, default=10, help="The sample number k.")
+    # args.add_argument("--rank", type=int, default=10, help="The rank number.")
+    # args.add_argument("--distance", type=int, default=5, help="The distance.")
+    # args = args.parse_args()
+    #
+    # dpo_pair = main(args.reward_file, args.sample_k, args.rank, args.distance)
+    # with jsonlines.open(args.output_file, 'w') as writer:
+    #     writer.write_all(dpo_pair)
diff --git a/data_engine/dpo_data_filter/__init__.py b/data_engine/dpo_data_filter/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py
new file mode 100644
index 0000000..05595d4
--- /dev/null
+++ b/data_engine/dpo_data_filter/filter.py
@@ -0,0 +1,69 @@
+import json
+import os.path
+
+import jsonlines
+
+
+class Filter:
+    @classmethod
+    def do_filter(cls, data: list) -> list:
+        """
+        Filter the given data and return what remains.
+        Args:
+            data (list): data that needs to be filtered
+
+        Returns: (list): filtered data
+        """
+        raise NotImplementedError
+
+
+def jsonl_to_data(file_path):
+    data = []
+    with jsonlines.open(file_path, 'r') as reader:
+        for item in reader:
+            data.append(item)
+    return data
+
+
+def json_to_data(file_path):
+    with open(file_path, 'r') as f:
+        return json.load(f)
+
+
+def load_data(file_path):
+    _, ext = os.path.splitext(file_path)
+
+    if ext == '.jsonl':
+        return jsonl_to_data(file_path)
+    elif ext == '.json':
+        return json_to_data(file_path)
+    else:
+        raise ValueError(f"Unsupported file type: {ext}")
+
+
+def main(data):
+    # import filters here to avoid circular imports
+    from .length_filter import LengthFilter
+    from .num_filter import NumFilter
+    from .same_filter import DeleteSameFilter
+
+    # you can add your own filters here or delete the filters
+    # that are determined to be unnecessary
+    filters = [LengthFilter, NumFilter, DeleteSameFilter]
+
+    for filter_to_run in filters:
+        filter_name = filter_to_run.__class__.__name__
+        filter_doc = filter_to_run.__doc__ if filter_to_run.__doc__ else "No documentation available"
+        print("=" * 80)
+        print(f"Processing Filter: {filter_name}")
+        print("=" * 80)
+        print(f"Documentation:\n{filter_doc}\n")
+
+        data = filter_to_run.do_filter(data)
+
+        print("=" * 80)
+        print(f"Filter {filter_name} Finished")
+        print(f"After filtering, we get {len(data)} data items")
+        print("=" * 80 + "\n")
+
+    return data
diff --git a/data_engine/dpo_data_filter/length_filter.py b/data_engine/dpo_data_filter/length_filter.py
new file mode 100644
index 0000000..23b4692
--- /dev/null
+++ b/data_engine/dpo_data_filter/length_filter.py
@@ -0,0 +1,40 @@
+import jsonlines
+from nltk.tokenize import word_tokenize
+
+from .filter 
import Filter
+
+
+class LengthFilter(Filter):
+    """
+    Adjust the average length of chosen and rejected to make them similar
+    """
+
+    @classmethod
+    def count_words(cls, sentence):
+        words = word_tokenize(sentence)
+        return len(words)
+
+    @classmethod
+    def calculate_mean_difference(cls, data):
+        total_difference = sum(item['chosen_diff'] for item in data)
+        return total_difference / len(data)
+
+    @classmethod
+    def do_filter(cls, data):
+        for item in data:
+            chosen_count = cls.count_words(item['chosen'])
+            reject_count = cls.count_words(item['rejected'])
+            item['chosen_diff'] = chosen_count - reject_count
+
+        data.sort(key=lambda x: x['chosen_diff'])
+
+        print("finish sorting")
+        print("mean difference: ", cls.calculate_mean_difference(data))
+
+        while cls.calculate_mean_difference(data) > 0.5:
+            print("pop data to reduce mean difference")
+            data.pop()
+        for item in data:
+            del item['chosen_diff']
+
+        return data
diff --git a/data_engine/dpo_data_filter/num_filter.py b/data_engine/dpo_data_filter/num_filter.py
new file mode 100644
index 0000000..62ead4a
--- /dev/null
+++ b/data_engine/dpo_data_filter/num_filter.py
@@ -0,0 +1,59 @@
+from .filter import Filter
+from nltk.tokenize import word_tokenize
+import random
+
+
+class NumFilter(Filter):
+    """
+    Control the amount of data corresponding to an image
+    """
+
+    @classmethod
+    def count_words(cls, sentence):
+        words = word_tokenize(sentence)
+        return len(words)
+
+    @classmethod
+    def calculate_mean_difference(cls, data):
+        total_difference = sum(item['chosen_diff'] for item in data)
+        return total_difference / len(data)
+
+    @classmethod
+    def do_filter(cls, data):
+        count = {}
+        sign = 0  # number of caption-style samples
+
+        num_filter_out = []
+        sum_chosen = 0
+        sum_rejected = 0
+        total_samples = 0
+        # shuffle data
+        list_data = data
+        random.shuffle(list_data)
+        for data in list_data:
+            if data["chosen"] == data["rejected"]:
+                continue
+            idx = data["idx"]
+            if idx in count:
+                if count[idx] >= 3:
+                    continue
+            else:
+                count[idx] = 0
+            count[idx] += 1
+
+            chosen_words = cls.count_words(data["chosen"])
+            rejected_words = cls.count_words(data["rejected"])
+
+            if chosen_words > 100:
+                sign += 1
+            # elif chosen_words < 50:
+            #     if(random.random() < 0.35):
+            #         continue
+
+            sum_chosen += chosen_words
+            sum_rejected += rejected_words
+            total_samples += 1
+
+            num_filter_out.append(data)
+
+        return num_filter_out
diff --git a/data_engine/dpo_data_filter/same_filter.py b/data_engine/dpo_data_filter/same_filter.py
new file mode 100644
index 0000000..cbd22ab
--- /dev/null
+++ b/data_engine/dpo_data_filter/same_filter.py
@@ -0,0 +1,24 @@
+import json
+
+from .filter import Filter
+
+
+class DeleteSameFilter(Filter):
+    """
+    For QA data, there may be some redundant data, which needs to be filtered
+    """
+
+    @classmethod
+    def do_filter(cls, data):
+        unique_data = set()
+        delete_same_output = []
+        for obj in data:
+            # serialize each record to a string so a set can be used for deduplication
+            data_str = json.dumps(obj, sort_keys=True)
+            if data_str not in unique_data:
+                unique_data.add(data_str)
+                delete_same_output.append(obj)
+
+        # print(f"Deduplication finished, {len(unique_data)} records written to {output_file}")
+
+        return delete_same_output
diff --git a/data_engine/logps_calculator.py b/data_engine/logps_calculator.py
index ee9acdb..a013757 100644
--- a/data_engine/logps_calculator.py
+++ b/data_engine/logps_calculator.py
@@ -122,7 +122,7 @@ def get_multimodal_sample_logps(model, dataloader, tokenizer, is_llava15=False):
     return win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list
 
 
-def 
write_logp_to_preference_parquet(origin_data, cache_file, logps, overwrite_logps=True):
+def write_logp_to_preference_parquet(origin_data, cache_dir, logps, overwrite_logps=True):
     out_data = []
 
@@ -151,7 +151,7 @@ def write_logp_to_preference_parquet(origin_data, cache_file, logps, overwrite_l
         for idx, start in enumerate(range(0, len(out_data), step)):
             temp_data = out_data[start: min(start + step, len(out_data))]
             df = pd.DataFrame(temp_data)
-            df.to_parquet(os.path.join(cache_file, f'RLAIF-V-Dataset-withlogp_{idx:03}-{len(temp_data)}.parquet'))
+            df.to_parquet(os.path.join(cache_dir, f'RLAIF-V-Dataset-withlogp_{idx:03}-{len(temp_data)}.parquet'))
 
     torch.distributed.barrier()
     return df
@@ -161,20 +161,17 @@ def inference_logp(
         model_name,
         model_path,
         dataset_path,
-        output_file):
+        output_dir):
     """
     Args:
         model_name: e.g. llava-v1.5-7b, OmniLMM-12B, RLAIF-V-12B
         model_path: path to your model
         dataset_path: path to the dataset (should follow the RLAIF-V-Dataset format)
-        output_file: path to the output file (logps)
+        output_dir: path to the output directory (logps)
 
     Returns:
 
     """
-    dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')),
-                            rank=int(os.getenv('RANK', '0')), )
-    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
 
     tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name,
                                                                            device_map={"": 'cuda'})
@@ -195,7 +192,8 @@ def inference_logp(
     collate_fn = partial(
         preference_collator_fn,
         pad_token_id=tokenizer.pad_token_id,
-        is_omni=("omni" in model_name.lower()))  # judge whether the model follows the omni structure
+        is_omni=("omni" in model_name.lower()) or (
+            "rlaif" in model_name.lower() and "12b" in model_path.lower()))  # judge whether the model follows the omni structure
 
     dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn,
                                        num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset)))
@@ -204,7 +202,8 @@ def inference_logp(
         model,
         dataloader,
         tokenizer,
-        is_llava15=("llava" in model_name.lower() or ("rlaif" in model_name.lower() and "7b" in model_path.lower())))  # judge whether the model follows the llava structure
+        is_llava15=("llava" in model_name.lower() or (
+            "rlaif" in model_name.lower() and "7b" in model_path.lower())))  # judge whether the model follows the llava structure
 
     world_size = torch.distributed.get_world_size()
     merged_outputs = [[None for _ in range(world_size)] for i in range(len(outputs))]
@@ -218,7 +217,7 @@ def inference_logp(
     logps = list(zip(win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list,
                      rej_avg_logp_list, rej_per_token_logp_list))
 
-    df = write_logp_to_preference_parquet(dataset.data, output_file, logps, overwrite_logps=True)
+    df = write_logp_to_preference_parquet(dataset.data, output_dir, logps, overwrite_logps=True)
 
     torch.distributed.barrier()
 
@@ -232,25 +231,33 @@ def main(
         instruct_model_name: str,
         instruct_model_path: str,
         dataset_path: str,
-        reward_model_output_file: str,
-        instruct_model_output_file: str) -> None:
-    reward_model_output_df = inference_logp(reward_model_name, reward_model_path, dataset_path, reward_model_output_file)
-    instruct_model_output_df = inference_logp(instruct_model_name, instruct_model_path, dataset_path, instruct_model_output_file)
+        reward_output_dir: str,
+        instruct_output_dir: str):
+    dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')),
+                            rank=int(os.getenv('RANK', '0')), )
+    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+    _ = inference_logp(instruct_model_name, 
instruct_model_path, dataset_path, instruct_output_dir)
+    _ = inference_logp(reward_model_name, reward_model_path, dataset_path, reward_output_dir)
 
-    return reward_model_output_df, instruct_model_output_df
+    return {
+        "reward_output_dir": reward_output_dir,
+        "instruct_output_dir": instruct_output_dir
+    }
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="inference and save the results")
+    parser = argparse.ArgumentParser(description="calculate logps for reward and instruct model")
     parser.add_argument('--reward_model_name', type=str, default="RLAIF-V-7B")
     parser.add_argument('--reward_model_path', type=str, default="/data/yaoshu/models/RLAIF-V-7B")
     parser.add_argument('--instruct_model_name', type=str, default="RLAIF-V-12B")
     parser.add_argument('--instruct_model_path', type=str, default="/data/yaoshu/models/RLAIF-V-12B")
     parser.add_argument('--dataset_path', type=str, default='/data/yaoshu/dataset/RLAIF-V-Dataset')
-    parser.add_argument('--reward_model_output_file', type=str, default='/data/RLAIF-V-CC/results')
-    parser.add_argument('--instruct_model_output_file', type=str, default='/data/RLAIF-V-CC/results')
+    parser.add_argument('--reward_model_output_dir', type=str, default='/data/RLAIF-V-CC/results/reward')
+    parser.add_argument('--instruct_model_output_dir', type=str, default='/data/RLAIF-V-CC/results/instruct')
     parser.add_argument('--local-rank', type=int, default=0)
     args = parser.parse_args()
-    main(args.reward_model_name, args.reward_model_path, args.instruct_model_name, args.instruct_model_path, args.dataset_path, args.reward_model_output_file,
-         args.instruct_model_output_file)
+    files = main(args.reward_model_name, args.reward_model_path, args.instruct_model_name, args.instruct_model_path,
+                 args.dataset_path, args.reward_model_output_dir,
+                 args.instruct_model_output_dir)
+    print(files)
diff --git a/data_engine/reward_computer.py b/data_engine/reward_computer.py
new file mode 100644
index 0000000..543b2df
--- /dev/null
+++ b/data_engine/reward_computer.py
@@ -0,0 +1,112 @@
+import jsonlines
+import pandas as pd
+import pyarrow.parquet as pq
+import json
+import os
+from tqdm import tqdm
+from transformers import AutoTokenizer
+import argparse
+
+
+# def convert_parquet_to_jsonl(parquet_file, jsonl_file):
+#     table = pq.read_table(parquet_file)
+#     df = table.to_pandas()
+#
+#     def filter_bytes(value):
+#         if isinstance(value, bytes):
+#             return None  # return None to skip this value
+#         return value
+#
+#     df = df.applymap(filter_bytes)
+#
+#     with open(jsonl_file, 'w', encoding='utf-8') as f:
+#         for record in df.to_dict(orient='records'):
+#             filtered_record = {k: v for k, v in record.items() if v is not None}
+#             f.write(json.dumps(filtered_record, ensure_ascii=False) + '\n')
+def parquet_to_json(parquet_file, jsonl_file):
+    df = pd.read_parquet(parquet_file, engine='pyarrow')
+
+    df = df.astype(str, errors='ignore')
+    df.to_json(jsonl_file, orient='records', lines=True, force_ascii=False)
+
+def compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir):
+    rewards = []
+    reward_files = [f for f in os.listdir(reward_logps_dir) if f.endswith('.parquet')]
+
+    for reward_file in tqdm(reward_files, desc='Processing files'):
+        suffix = reward_file.split('_')[-1].split('.')[0]
+        reward_file_path = os.path.join(reward_logps_dir, reward_file)
+        instruct_file_path = os.path.join(instruct_logps_dir, f'RLAIF-V-Dataset-withlogp_{suffix}.parquet')
+
+        reward_jsonl_file = os.path.join(reward_logps_dir, reward_file.replace('.parquet', '.jsonl'))
+        instruct_jsonl_file = 
instruct_file_path.replace('.parquet', '.jsonl')
+
+        if not os.path.exists(reward_jsonl_file):
+            print(f"Converting {reward_file_path} to {reward_jsonl_file}. For each file, it will only perform once, please wait...")
+            # convert_parquet_to_jsonl(reward_file_path, reward_jsonl_file)
+            parquet_to_json(reward_file_path, reward_jsonl_file)
+            print(f'Successfully converted {reward_file_path} to {reward_jsonl_file}')
+        if not os.path.exists(instruct_jsonl_file):
+            print(f"Converting {instruct_file_path} to {instruct_jsonl_file}. For each file, it will only perform once, please wait...")
+            # convert_parquet_to_jsonl(instruct_file_path, instruct_jsonl_file)
+            parquet_to_json(instruct_file_path, instruct_jsonl_file)
+            print(f'Successfully converted {instruct_file_path} to {instruct_jsonl_file}')
+
+        with jsonlines.open(reward_jsonl_file) as reward_reader, jsonlines.open(
+                instruct_jsonl_file) as instruct_reader:
+            for obj in reward_reader:
+                idx = obj["idx"]
+                tokens = tokenizer.encode(obj["chosen"])
+                logps = obj["logps"].split("[")[-1].split("]")[0]
+                reward_logps = list(map(float, logps.split(",")))
+                reward_logps_for_reward = reward_logps[-len(tokens):]
+
+                for instruct_obj in instruct_reader:
+                    if instruct_obj["idx"] == idx:
+                        instruct_logps = instruct_obj["logps"].split("[")[-1].split("]")[0]
+                        instruct_logps = list(map(float, instruct_logps.split(",")))
+                        instruct_logps_for_reward = instruct_logps[-len(tokens):]
+                        break
+
+                differences = [instruct_logp - reward_logp for instruct_logp, reward_logp in
+                               zip(instruct_logps_for_reward, reward_logps_for_reward)]
+                min_reward = min(differences) * 0.1
+                sum_reward = sum(differences) * 0.1
+                last_reward = differences[-1] * 0.1
+                avg_reward = sum_reward / len(tokens) * 0.1
+
+                reward_data = {
+                    "idx": idx,
+                    "ds_name": obj["ds_name"],
+                    "question": obj["question"],
+                    "chosen": obj["chosen"],
+                    "image": obj["image"],
+                    "image_path": obj["image_path"],
+                    "origin_split": obj["origin_split"],
+                    "origin_dataset": obj["origin_dataset"],
+                    "min": min_reward,
+                    "sum": sum_reward,
+                    "ORM": last_reward,
+                    "avg": avg_reward
+                }
+
+                rewards.append(reward_data)
+    return rewards
+
+
+def main(model_path: str, reward_logps_dir: str, instruct_logps_dir: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    rewards = compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir)
+    return rewards
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="turn logps into rewards")
+    parser.add_argument("--model_path", type=str, help="model path")
+    parser.add_argument("--reward_logps_dir", type=str, help="reward logps dir")
+    parser.add_argument("--instruct_logps_dir", type=str, help="instruct logps dir")
+    parser.add_argument("--output_file", type=str, help="output file")
+    args = parser.parse_args()
+    rewards = main(args.model_path, args.reward_logps_dir, args.instruct_logps_dir)
+    with jsonlines.open(args.output_file, 'w') as writer:
+        writer.write_all(rewards)
diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh
new file mode 100644
index 0000000..2e6bb5f
--- /dev/null
+++ b/data_engine/run_engine.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+export PYTHONPATH=$(realpath .):$PYTHONPATH
+export CUDA_VISIBLE_DEVICES=1
+
+export MASTER_ADDR=localhost
+export MASTER_PORT=6001
+
+python data_engine/data_engine.py \
+    --reward_model_name llava-v1.5-7b \
+    --reward_model_path /data/yaoshu/models/llava-v1.5-7b \
+    --instruct_model_name RLAIF-V-7B \
+    --instruct_model_path /data/yaoshu/models/RLAIF-V-7B \
+    
--dataset_path /data/yaoshu/dataset/RLAIF-V-Dataset \ + --work_dir /data/RLAIF-V-CC/results/test \ + --continue_from_stage 2 \ No newline at end of file From b510d4b12fa5f565da16afc64125b98a7230c801 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Mon, 25 Nov 2024 18:15:24 +0800 Subject: [PATCH 04/18] [upgrade] stage 3 passed --- chat.py | 89 +++++++---- data_engine/answer_sampler.py | 169 ++++++++++++++++++++ data_engine/data_engine.py | 133 +++++++++------- data_engine/dataset.py | 8 +- data_engine/dpo_data_filter/filter.py | 4 +- data_engine/dpo_data_filter/same_filter.py | 32 +++- data_engine/logps_calculator.py | 119 ++------------ data_engine/logps_gen.sh | 27 +++- data_engine/reward_computer.py | 173 +++++++++++++-------- data_engine/run_engine.sh | 30 ++-- data_engine/util.py | 71 +++++++++ muffin/eval/muffin_inference_logp.py | 3 +- 12 files changed, 563 insertions(+), 295 deletions(-) create mode 100644 data_engine/answer_sampler.py create mode 100644 data_engine/util.py diff --git a/chat.py b/chat.py index f427588..ac18251 100644 --- a/chat.py +++ b/chat.py @@ -61,33 +61,47 @@ def __init__(self, model_path) -> None: self.tokenizer = tokenizer self.model.eval() - def decode(self, image, input_ids): + def decode(self, image, input_ids, param=None): with torch.inference_mode(): - output = self.model.generate_vllm( - input_ids=input_ids.unsqueeze(0).cuda(), - images=image.unsqueeze(0).half().cuda(), - temperature=0.6, - max_new_tokens=1024, - num_beams=3, - do_sample=True, - output_scores=True, - return_dict_in_generate=True, - repetition_penalty=1.1, - top_k=30, - top_p=0.9, - ) + if param is None: + output = self.model.generate_vllm( + input_ids=input_ids.unsqueeze(0).cuda(), + images=image.unsqueeze(0).half().cuda(), + temperature=0.6, + max_new_tokens=1024, + num_beams=3, + do_sample=True, + output_scores=True, + return_dict_in_generate=True, + repetition_penalty=1.1, + top_k=30, + top_p=0.9, + ) + else: + output = self.model.generate_vllm( + input_ids=input_ids.unsqueeze(0).cuda(), + images=image.unsqueeze(0).half().cuda(), + max_new_tokens=1024, + output_scores=True, + return_dict_in_generate=True, + **param + ) response = self.tokenizer.decode( output.sequences[0], skip_special_tokens=True) response = response.strip() return response - def chat(self, input): - im_64 = img2base64(input['image']) + def chat(self, input, param=None): + if isinstance(input['image'], str): + im_64 = img2base64(input['image']) msgs = json.dumps([{"role": "user", "content": input['question']}]) try: - image = Image.open(io.BytesIO(base64.b64decode(im_64))).convert('RGB') + if isinstance(input['image'], str): + image = Image.open(io.BytesIO(base64.b64decode(im_64))).convert('RGB') + else: + image = input['image'] except Exception as e: return "Image decode error" @@ -97,7 +111,7 @@ def chat(self, input): input_ids = torch.as_tensor(input_ids) image = self.image_transform(image) - out = self.decode(image, input_ids) + out = self.decode(image, input_ids, param=param) return out @@ -119,14 +133,17 @@ def __init__(self, model_path) -> None: self.image_processor = image_processor self.context_len = context_len - def chat(self, input): + def chat(self, input, param=None): msgs = input['question'] if self.model.config.mm_use_im_start_end: msgs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + msgs else: msgs = DEFAULT_IMAGE_TOKEN + '\n' + msgs - image = Image.open(input['image']).convert('RGB') + if isinstance(input['image'], str): + image = 
Image.open(input['image']).convert('RGB') + else: + image = input['image'] conv = conv_templates["llava_v1"].copy() conv.append_message(conv.roles[0], msgs) conv.append_message(conv.roles[1], None) @@ -136,15 +153,25 @@ def chat(self, input): 0).cuda() image_tensor = process_images([image], self.image_processor, self.model.config)[0] with torch.inference_mode(): - output_ids = self.model.generate( - input_ids, - images=image_tensor.unsqueeze(0).half().cuda(), - image_sizes=[image.size], - do_sample=False, - temperature=0, - num_beams=3, - max_new_tokens=1024, - use_cache=True) + if param is None: + output_ids = self.model.generate( + input_ids, + images=image_tensor.unsqueeze(0).half().cuda(), + image_sizes=[image.size], + do_sample=False, + temperature=0, + num_beams=3, + max_new_tokens=1024, + use_cache=True) + else: + output_ids = self.model.generate( + input_ids, + images=image_tensor.unsqueeze(0).half().cuda(), + image_sizes=[image.size], + max_new_tokens=1024, + use_cache=True, + **param + ) outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() return outputs @@ -156,8 +183,8 @@ def __init__(self, model_path) -> None: else: self.model = RLAIFV7B(model_path) - def chat(self, input): - return self.model.chat(input) + def chat(self, input, param=None): + return self.model.chat(input, param=param) if __name__ == '__main__': diff --git a/data_engine/answer_sampler.py b/data_engine/answer_sampler.py new file mode 100644 index 0000000..68e3dc3 --- /dev/null +++ b/data_engine/answer_sampler.py @@ -0,0 +1,169 @@ +import os +import tqdm +import copy +from chat import RLAIFVChat +from datasets import load_dataset +import torch +import pandas as pd +from muffin.data.datasets import bytes_to_PIL_image +from util import * +from collections import defaultdict +import torch.distributed as dist + + +def sample_answer(model_path, dataset_path, output_path, image_column, sample=10): + # here we need to keep different samples of the same question adjacent to each other in the final file + # otherwise, the data_pair_builder will output data with no sense. 
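+    # (data_pair_builder later regroups this flat output into consecutive chunks of `sample` answers,
+    # so reordering here would silently pair answers that belong to different questions)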
+    # so in this function, there is some code used to keep the order
+    # if you want to change it, you may also need to change code in data_pair_builder
+
+    try:
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        world_size = int(os.environ.get("WORLD_SIZE", 1))
+
+        model = RLAIFVChat(model_path)
+        grouped_output_data = defaultdict(list)
+
+        with torch.inference_mode():
+            generation_config = {
+                "top_p": 0.8,
+                "top_k": 100,
+                "temperature": 0.7,
+                "do_sample": True,
+                "repetition_penalty": 1.05
+            }
+
+            dataset = load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column(
+                image_column,
+                hf_datasets.Image(decode=False)
+            )
+
+            total_size = len(dataset)
+            base_size = total_size // world_size
+            remainder = total_size % world_size
+
+            start_idx = local_rank * base_size + min(local_rank, remainder)
+            end_idx = start_idx + base_size + (1 if local_rank < remainder else 0)
+
+            device_dataset = dataset.select(range(start_idx, end_idx))
+            processed_indices = set()
+
+            iterator = tqdm.tqdm(
+                device_dataset,
+                desc=f"GPU {local_rank}",
+                position=local_rank
+            )
+
+            for idx, data in enumerate(iterator):
+                try:
+                    data_id = start_idx + idx
+                    current_samples = []
+                    for i in range(sample):
+                        try:
+                            data_cp = copy.deepcopy(data)
+                            # your dataset should keep the image in ['image']['bytes'] or ['image_bytes']['bytes']
+                            # or you can change the following code to read the data in your format
+                            if 'image' in data_cp:
+                                data_cp['image'] = bytes_to_PIL_image(data_cp['image']['bytes'])
+                                output = model.chat(data_cp, param=generation_config)
+                                data_cp['chosen'] = output
+                                data_cp['rejected'] = output
+                                data_cp['image'] = data['image']
+                                data_cp['global_index'] = data_id  # record the global index
+                                data_cp['sample_index'] = i  # record the sample index
+                            elif 'image_bytes' in data_cp:
+                                data_cp['image'] = bytes_to_PIL_image(data_cp['image_bytes']['bytes'])
+                                output = model.chat(data_cp, param=generation_config)
+                                data_cp['chosen'] = output
+                                data_cp['rejected'] = output
+                                data_cp.pop('image')
+                                data_cp['image'] = data['image_bytes']
+                                data_cp['global_index'] = data_id
+                                data_cp['sample_index'] = i
+                            else:
+                                raise ValueError("image attribute not found")
+                            current_samples.append(data_cp)
+                        except Exception as e:
+                            print(f"Error processing sample {i} for data_id {data_id}: {str(e)}")
+                            continue
+
+                    if current_samples:  # only keep entries that successfully produced samples
+                        grouped_output_data[data_id] = current_samples
+                        processed_indices.add(data_id)
+                except Exception as e:
+                    print(f"Error processing data_id {data_id}: {str(e)}")
+                    continue
+
+        torch.distributed.barrier()
+
+        if world_size > 1:
+            all_data = [None] * world_size
+            dist.all_gather_object(all_data, grouped_output_data)
+
+            if local_rank == 0:
+                merged_data = defaultdict(list)
+                all_data_ids = set()
+                for rank_data in all_data:
+                    all_data_ids.update(rank_data.keys())
+
+                for data_id in sorted(all_data_ids):
+                    for rank_data in all_data:
+                        if data_id in rank_data:
+                            merged_data[data_id].extend(rank_data[data_id])
+                grouped_output_data = merged_data
+
+        if local_rank == 0:
+            step = 5000
+            flat_output_data = []
+
+            for data_id in sorted(grouped_output_data.keys()):
+                samples = sorted(grouped_output_data[data_id], key=lambda x: x['sample_index'])
+                flat_output_data.extend(samples)
+
+            # keep the order when saving the data in batches
+            for idx, start in enumerate(range(0, len(flat_output_data), step)):
+                try:
+                    temp_data = flat_output_data[start: min(start + step, len(flat_output_data))]
+                    df = pd.DataFrame(temp_data)
+
+                    df = df.sort_values(['global_index', 'sample_index'])
+                    df = df.drop(columns=['global_index', 'sample_index'])
+
+                    output_file = 
os.path.join(
+                        output_path,
+                        f'RLAIF-V-Dataset-sampled_{idx:03}-{len(temp_data)}.parquet'
+                    )
+
+                    temp_file = output_file + '.tmp'
+                    df.to_parquet(temp_file)
+                    os.rename(temp_file, output_file)
+
+                except Exception as e:
+                    print(f"Error saving batch {idx}: {str(e)}")
+                    continue
+
+    except Exception as e:
+        print(f"Critical error in sample_answer: {str(e)}")
+        raise
+    finally:
+        if 'model' in locals():
+            del model
+
+
+def main():
+    dist.init_process_group(backend='nccl')
+
+    model_path = "your_model_path"
+    dataset_path = "your_dataset_path"
+    output_path = "your_output_path"
+    sample = 10
+
+    try:
+        sample_answer(model_path, dataset_path, output_path, "image", sample)
+    finally:
+        # clean up the distributed environment
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py
index e60bfe0..9edce85 100644
--- a/data_engine/data_engine.py
+++ b/data_engine/data_engine.py
@@ -6,28 +6,33 @@
 import reward_computer
 import data_pair_builder
 from dpo_data_filter import filter
+import answer_sampler
 import argparse
+import torch
+import torch.distributed as dist
 
 
 def print_stage(idx, desc="", finish=False):
-    print("=" * 80)
-    if not finish:
-        print(f"Processing Stage {idx}: {desc}")
-    else:
-        print(f"Finish Stage {idx}")
-    print("=" * 80)
+    if torch.distributed.get_rank() == 0:
+        print("=" * 80)
+        if not finish:
+            print(f"Processing Stage {idx}: {desc}")
+        else:
+            print(f"Finish Stage {idx}")
+        print("=" * 80)
 
 
 def dir_prepare(dir_to_check, clean=True):
-    if not os.path.exists(dir_to_check):
-        os.makedirs(dir_to_check)
-    elif clean:
-        if not os.path.isdir(dir_to_check):
-            for file in os.listdir(dir_to_check):
-                os.remove(os.path.join(dir_to_check, file))
-        else:
-            os.remove(dir_to_check)
-            os.mkdir(dir_to_check)
+    if torch.distributed.get_rank() == 0:
+        if not os.path.exists(dir_to_check):
+            os.makedirs(dir_to_check)
+        elif clean:
+            if os.path.isdir(dir_to_check):
+                for file in os.listdir(dir_to_check):
+                    os.remove(os.path.join(dir_to_check, file))
+            else:
+                os.remove(dir_to_check)
+                os.mkdir(dir_to_check)
 
 
 def run(
@@ -37,11 +42,23 @@ def run(
         instruct_model_path,
         dataset_path,
         work_dir,
+        image_column="image",
         continue_from_stage=1,
         sample_k=10,
         rank=10,
         distance=5
 ):
+    dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')),
+                            rank=int(os.getenv('RANK', '0')), )
+    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+
+    sampled_answer_path = os.path.join(work_dir, "sampled_answer")
+    if continue_from_stage <= 0:
+        print_stage(0, "Sample answers")
+        dir_prepare(sampled_answer_path)
+        answer_sampler.sample_answer(instruct_model_path, dataset_path, sampled_answer_path, image_column, sample_k)
+        print_stage(0, finish=True)
+
     reward_logps_output_dir = os.path.join(work_dir, "reward_logps")
     instruct_logps_output_dir = os.path.join(work_dir, "instruct_logps")
     if continue_from_stage <= 1:
@@ -53,50 +70,54 @@ def run(
         reward_model_path,
         instruct_model_name,
         instruct_model_path,
-        dataset_path,
+        sampled_answer_path,
         reward_logps_output_dir,
         instruct_logps_output_dir)
     print_stage(1, finish=True)
 
-    if continue_from_stage <= 2:
-        print_stage(2, "DPO dataset construction")
-
-        print_stage(2.1, "Calculate reward")
-        rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir)
-        print_stage(2.1, finish=True)
-
-        print_stage(2.2, "Build DPO pairs")
-        dpo_pair = data_pair_builder.main(rewards, sample_k, rank, distance)
-        print_stage(2.2, finish=True)
-
-        print_stage(2.3, "Filter DPO pairs")
-        
data = filter.main(dpo_pair)
-        print_stage(2.3, finish=True)
-
-        print_stage(2.4, "Save file to dataset format")
-        needed_keys = [
-            "question",
-            "chosen",
-            "rejected",
-            "origin_dataset",
-            "origin_split",
-            "idx",
-            "image_path",
-            "ds_name",
-            "image"]
-        for item in data:
-            for key in list(item.keys()):
-                if key not in needed_keys:
-                    del item[key]
-        df = pd.DataFrame(data)
-        output_file = os.path.join(work_dir, "dpo_dataset.parquet")
-        df.to_parquet(output_file)
-        print_stage(2.4, finish=True)
-
-        print_stage(2, finish=True)
-
-    print("Finish all stages, output file is saved to ", output_file)
-    print("Have a nice day!")
+    # the following code does not need multiple GPUs
+    if torch.distributed.get_rank() == 0:
+        if continue_from_stage <= 2:
+            print_stage(2, "DPO dataset construction")
+
+            print_stage(2.1, "Calculate reward")
+            rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir)
+            print_stage(2.1, finish=True)
+
+            print_stage(2.2, "Build DPO pairs")
+            dpo_pair = data_pair_builder.main(rewards, sample_k, rank, distance)
+            print_stage(2.2, finish=True)
+
+            print_stage(2.3, "Filter DPO pairs")
+            data = filter.main(dpo_pair)
+            print_stage(2.3, finish=True)
+
+            print_stage(2.4, "Save file to dataset format")
+            output_file = os.path.join(work_dir, "dpo_dataset.parquet")
+            if os.path.exists(output_file):
+                os.remove(output_file)
+            needed_keys = [
+                "question",
+                "chosen",
+                "rejected",
+                "origin_dataset",
+                "origin_split",
+                "idx",
+                "image_path",
+                "ds_name",
+                "image"]
+            for item in data:
+                for key in list(item.keys()):
+                    if key not in needed_keys:
+                        del item[key]
+            df = pd.DataFrame(data)
+            df.to_parquet(output_file)
+            print_stage(2.4, finish=True)
+
+            print_stage(2, finish=True)
+
+            print("Finish all stages, output file is saved to ", output_file)
+            print("Have a nice day!")
 
 if __name__ == "__main__":
     args = argparse.ArgumentParser()
@@ -106,6 +127,7 @@ def run(
     args.add_argument("--instruct_model_path", type=str, help="The path of the instruct model.")
     args.add_argument("--dataset_path", type=str, help="The path of the dataset.")
     args.add_argument("--work_dir", type=str, help="The working directory.")
+    args.add_argument("--image_column", type=str, default="image", help="The column that keeps the image in your dataset")
     args.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.")
     args.add_argument("--sample_k", type=int, default=10, help="The sample number k.")
     args.add_argument("--rank", type=int, default=10, help="The rank number.")
@@ -119,6 +141,7 @@ def run(
         args.instruct_model_path,
         args.dataset_path,
         args.work_dir,
+        args.image_column,
         args.continue_from_stage,
         args.sample_k,
         args.rank,
diff --git a/data_engine/dataset.py b/data_engine/dataset.py
index 3952fce..c8612be 100644
--- a/data_engine/dataset.py
+++ b/data_engine/dataset.py
@@ -37,16 +37,20 @@ def __getitem__(self, index):
         sample = self.data[index]
         metainfo = {
             "origin_dataset": sample['origin_dataset'],
-            "origin_split": json.loads(sample['origin_split']),
             "origin_idx": sample['idx'],
             "image_id": sample['image_path'],
         }
+        if sample['origin_split'] is not None and sample['origin_split'] != "":
+            metainfo["origin_split"] = json.loads(sample['origin_split'])
+        else:
+            metainfo["origin_split"] = ""
+
         question = {'from': 'human', 'value': f"\n{sample['question']}"}
         chosen = {'from': 'gpt', 'value': sample['chosen']}
         rejected = {'from': 'gpt', 'value': sample['rejected']}
 
         image = bytes_to_PIL_image(sample['image']['bytes'])
-        # image = 
bytes_to_PIL_image(sample['image_bytes'])
+        # image = bytes_to_PIL_image(sample['image_bytes']['bytes'])
 
         formated_sample = {
             'image': image,
diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py
index 05595d4..982c2d1 100644
--- a/data_engine/dpo_data_filter/filter.py
+++ b/data_engine/dpo_data_filter/filter.py
@@ -42,6 +42,7 @@ def load_data(file_path):
 
 
 def main(data):
+    print(f"Before filtering, we have {len(data)} data items")
     # import filters here to avoid circular imports
     from .length_filter import LengthFilter
     from .num_filter import NumFilter
@@ -52,7 +53,7 @@ def main(data):
     filters = [LengthFilter, NumFilter, DeleteSameFilter]
 
     for filter_to_run in filters:
-        filter_name = filter_to_run.__class__.__name__
+        filter_name = filter_to_run.__name__
         filter_doc = filter_to_run.__doc__ if filter_to_run.__doc__ else "No documentation available"
         print("=" * 80)
         print(f"Processing Filter: {filter_name}")
@@ -65,5 +66,6 @@ def main(data):
         print(f"Filter {filter_name} Finished")
         print(f"After filtering, we get {len(data)} data items")
         print("=" * 80 + "\n")
+    print(f"After filtering, we have {len(data)} data items")
 
     return data
diff --git a/data_engine/dpo_data_filter/same_filter.py b/data_engine/dpo_data_filter/same_filter.py
index cbd22ab..fcdecc8 100644
--- a/data_engine/dpo_data_filter/same_filter.py
+++ b/data_engine/dpo_data_filter/same_filter.py
@@ -1,24 +1,40 @@
 import json
-
+from copy import deepcopy
 from .filter import Filter
 
 
 class DeleteSameFilter(Filter):
     """
-    For QA data, there may be some redundant data, which need to be filtered
+    For QA data, there may be some redundant data, which needs to be filtered.
+    This version temporarily stores image data during comparison and restores it afterwards.
     """
 
     @classmethod
     def do_filter(cls, data):
         unique_data = set()
         delete_same_output = []
-        for obj in data:
-            # serialize each record to a string so a set can deduplicate them
-            data_str = json.dumps(obj, sort_keys=True)
+        temp_image_store = {}  # temporary store for image data
+
+        for idx, obj in enumerate(data):
+            temp_key = None  # reset the stash key for each record
+            obj_copy = deepcopy(obj)  # deep copy so the original record is untouched
+
+            # if an image field exists, stash it before serialization
+            if 'image' in obj_copy:
+                # the image is stored under an index-based temporary key
+                image_data = obj_copy.pop('image')
+                temp_key = f"temp_key_{idx}"  # the index makes the temporary key unique
+                temp_image_store[temp_key] = (image_data, obj_copy)
+
+            # serialize the image-free record to a string for set-based deduplication
+            data_str = json.dumps(obj_copy, sort_keys=True)
+
             if data_str not in unique_data:
                 unique_data.add(data_str)
-                delete_same_output.append(obj)
-
-        # print(f"Deduplication finished, wrote {len(unique_data)} records to {output_file}")
+                # restore the image if one was stashed; checking temp_key guards
+                # against reusing a stale key from a previous record
+                if temp_key is not None and temp_key in temp_image_store:
+                    stored_image, _ = temp_image_store[temp_key]
+                    obj_copy['image'] = stored_image
+                delete_same_output.append(obj_copy)
 
         return delete_same_output
diff --git a/data_engine/logps_calculator.py b/data_engine/logps_calculator.py
index a013757..5c8b68d 100644
--- a/data_engine/logps_calculator.py
+++ b/data_engine/logps_calculator.py
@@ -1,58 +1,15 @@
 import os
-import json
 import tqdm
-import copy
 import itertools
 import argparse
-import pandas as pd
-import torch.utils.data as torch_data
-from functools import partial
-from muffin.train.train_utils import SFT_collator_fn
-import numpy as np
-import datasets as hf_datasets
-from transformers.image_processing_utils import BatchFeature
-
-from builder.builder import load_pretrained_model
-from muffin.eval.muffin_inference_logp import (get_batch_logps, InferenceSampler, concate_pad)
-from dataset import PreferenceInferenceDataset
+from muffin.eval.muffin_inference_logp import (get_batch_logps, write_logp_to_preference_parquet)
+from util 
import * import torch import torch.distributed as dist -def preference_collator_fn(instances, pad_token_id, is_omni=False): - rej_instances, win_instances = list(zip(*instances)) - rej_batch = SFT_collator_fn(rej_instances, pad_token_id) - win_batch = SFT_collator_fn(win_instances, pad_token_id) - - concatenated_input_ids = concate_pad(win_batch['input_ids'], rej_batch['input_ids'], pad_token_id) - concatenated_labels = concate_pad(win_batch['labels'], rej_batch['labels'], -100) - concatenated_attention_mask = concatenated_input_ids.ne(pad_token_id) - - if not is_omni: - if isinstance(win_batch['images'][0], BatchFeature): - win_images = torch.stack([torch.tensor(img.pixel_values[0]) for img in win_batch['images']]) - elif isinstance(win_batch['images'][0], np.ndarray): - win_images = torch.stack([torch.tensor(img) for img in win_batch['images']]) - else: - win_images = win_batch['images'] - - batch = dict( - concatenated_input_ids=concatenated_input_ids, - concatenated_labels=concatenated_labels, - concatenated_attention_mask=concatenated_attention_mask, - win_input_ids=win_batch['input_ids'], - rej_input_ids=rej_batch['input_ids'], - win_labels=win_batch['labels'], - rej_labels=rej_batch['labels'], - win_attention_mask=win_batch['attention_mask'], - rej_attention_mask=rej_batch['attention_mask'], - images=win_batch['images'] if is_omni else win_images, - ) - return batch - - -def get_multimodal_sample_logps(model, dataloader, tokenizer, is_llava15=False): +def get_multimodal_sample_logps(model, dataloader, is_llava15=False): win_logp_list = [] rej_logp_list = [] @@ -122,41 +79,6 @@ def get_multimodal_sample_logps(model, dataloader, tokenizer, is_llava15=False): return win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list -def write_logp_to_preference_parquet(origin_data, cache_dir, logps, overwrite_logps=True): - out_data = [] - - for index in range(len(logps)): - line = origin_data[index] - logp_data = {} - logp_data['logps'] = logps[index] - - new_line = copy.deepcopy(line) - - if 'logps' in new_line.keys(): - assert overwrite_logps, 'Found existing logp data, pass overwrite_logps=True to force overwritting' - new_line['logps'] = json.dumps(logp_data) - - else: - assert (('question' in list(new_line.keys())) - and ('chosen' in list(new_line.keys())) - and ('rejected' in list(new_line.keys()))), \ - f'Undefined data structure, expecting [Q, Win, Rej] in keys, got {new_line.keys()}' - new_line['logps'] = json.dumps(logp_data) - - out_data.append(new_line) - - # df = none - if torch.distributed.get_rank() == 0: - step = 5000 - for idx, start in enumerate(range(0, len(out_data), step)): - temp_data = out_data[start: min(start + step, len(out_data))] - df = pd.DataFrame(temp_data) - df.to_parquet(os.path.join(cache_dir, f'RLAIF-V-Dataset-withlogp_{idx:03}-{len(temp_data)}.parquet')) - - torch.distributed.barrier() - return df - - def inference_logp( model_name, model_path, @@ -173,37 +95,14 @@ def inference_logp( """ - tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, - device_map={"": 'cuda'}) - image_token_len = 0 - if hasattr(model, "model") and hasattr(model.model, "config") and hasattr(model.model.config, "num_query"): - image_token_len = model.model.config.num_query - - model = model.to(dtype=torch.bfloat16, device='cuda') - hf_data = hf_datasets.load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column("image", - hf_datasets.Image( - decode=False)) - dataset = 
PreferenceInferenceDataset(model_name=model_name, - tokenizer=tokenizer, - data=hf_data, - image_token_len=image_token_len, - img_processor=image_processor, - use_im_start_end=False) - collate_fn = partial( - preference_collator_fn, - pad_token_id=tokenizer.pad_token_id, - is_omni=("omni" in model_name.lower()) or ( - "rlaif" in model_name.lower() and "12b" in model_path.lower())) # judge if the model follow omni structure - dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn, - num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset))) + model, dataset, dataloader = load_model_and_dataloader(model_path, model_name, dataset_path) outputs = get_multimodal_sample_logps( # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list model, dataloader, - tokenizer, is_llava15=("llava" in model_name.lower() or ( - "rlaif" in model_name.lower() and "7b" in model_path.lower()))) # judge if the model follow llava structure + "rlaif" in model_name.lower() and "7b" in model_path.lower()))) # judge if the model follow llava structure world_size = torch.distributed.get_world_size() merged_outputs = [[None for _ in range(world_size)] for i in range(len(outputs))] @@ -233,9 +132,6 @@ def main( dataset_path: str, reward_output_dir: str, instruct_output_dir: str): - dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), - rank=int(os.getenv('RANK', '0')), ) - torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) _ = inference_logp(instruct_model_name, instruct_model_path, dataset_path, instruct_output_dir) _ = inference_logp(reward_model_name, reward_model_path, dataset_path, reward_output_dir) @@ -254,9 +150,12 @@ def main( parser.add_argument('--dataset_path', type=str, default='/data/yaoshu/dataset/RLAIF-V-Dataset') parser.add_argument('--reward_model_output_dir', type=str, default='/data/RLAIF-V-CC/results/reward') parser.add_argument('--instruct_model_output_dir', type=str, default='/data/RLAIF-V-CC/results/instruct') - parser.add_argument('--local-rank', type=int, default=0) args = parser.parse_args() + dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + files = main(args.reward_model_name, args.reward_model_path, args.instruct_model_name, args.instruct_model_path, args.dataset_path, args.reward_model_output_dir, args.instruct_model_output_dir) diff --git a/data_engine/logps_gen.sh b/data_engine/logps_gen.sh index abc8aa5..09e8f17 100644 --- a/data_engine/logps_gen.sh +++ b/data_engine/logps_gen.sh @@ -1,14 +1,13 @@ #!/bin/bash export PYTHONPATH=$PYTHONPATH:`realpath .` -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES=2 -#GPUS_PER_NODE=1 +#GPUS_PER_NODE=4 #NNODES=1 #NODE_RANK=0 export MASTER_ADDR=localhost -#MASTER_ADDR=13.13.19.1 export MASTER_PORT=6001 -# +## #DISTRIBUTED_ARGS=" # --nproc_per_node $GPUS_PER_NODE \ # --nnodes $NNODES \ @@ -17,8 +16,20 @@ export MASTER_PORT=6001 # --master_port $MASTER_PORT #" -#torchrun $DISTRIBUTED_ARGS data_engine/logps_calculator.py \ -# --use_12b_model_for_reward \ -# --use_12b_model_for_instruct -python data_engine/logps_calculator.py \ No newline at end of file +#torchrun $DISTRIBUTED_ARGS data_engine/logps_calculator.py \ +# --reward_model_name llava-v1.5-7b \ +# --reward_model_path /data/yaoshu/models/llava-v1.5-7b \ +# --instruct_model_name RLAIF-V-7B \ +# --instruct_model_path 
/data/yaoshu/models/RLAIF-V-7B \ +# --reward_model_output_dir /data/RLAIF-V-CC/results/7b/reward \ +# --instruct_model_output_dir /data/RLAIF-V-CC/results/7b/instruct + + +python data_engine/logps_calculator.py \ + --reward_model_name llava-v1.5-7b \ + --reward_model_path /data/yaoshu/models/llava-v1.5-7b \ + --instruct_model_name RLAIF-V-7B \ + --instruct_model_path /data/yaoshu/models/RLAIF-V-7B \ + --reward_model_output_dir /data/RLAIF-V-CC/results/7b/reward \ + --instruct_model_output_dir /data/RLAIF-V-CC/results/7b/instruct diff --git a/data_engine/reward_computer.py b/data_engine/reward_computer.py index 543b2df..fde90ce 100644 --- a/data_engine/reward_computer.py +++ b/data_engine/reward_computer.py @@ -8,26 +8,75 @@ import argparse -# def convert_parquet_to_jsonl(parquet_file, jsonl_file): -# table = pq.read_table(parquet_file) -# df = table.to_pandas() +# def parquet_to_json(parquet_file, jsonl_file): +# df = pd.read_parquet(parquet_file, engine='pyarrow') # -# def filter_bytes(value): -# if isinstance(value, bytes): -# return None # 返回 None 表示跳过该值 -# return value +# df = df.astype(str, errors='ignore') +# df.to_json(jsonl_file, orient='records', lines=True, force_ascii=False) # -# df = df.applymap(filter_bytes) +# def compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir): +# rewards = [] +# reward_files = [f for f in os.listdir(reward_logps_dir) if f.endswith('.parquet')] # -# with open(jsonl_file, 'w', encoding='utf-8') as f: -# for record in df.to_dict(orient='records'): -# filtered_record = {k: v for k, v in record.items() if v is not None} -# f.write(json.dumps(filtered_record, ensure_ascii=False) + '\n') -def parquet_to_json(parquet_file, jsonl_file): - df = pd.read_parquet(parquet_file, engine='pyarrow') - - df = df.astype(str, errors='ignore') - df.to_json(jsonl_file, orient='records', lines=True, force_ascii=False) +# for reward_file in tqdm(reward_files, desc='Processing files'): +# suffix = reward_file.split('_')[-1].split('.')[0] +# reward_file_path = os.path.join(reward_logps_dir, reward_file) +# instruct_file_path = os.path.join(instruct_logps_dir, f'RLAIF-V-Dataset-withlogp_{suffix}.parquet') +# +# reward_jsonl_file = os.path.join(reward_logps_dir, reward_file.replace('.parquet', '.jsonl')) +# instruct_jsonl_file = os.path.join(instruct_logps_dir, instruct_file_path.replace('.parquet', '.jsonl')) +# +# if not os.path.exists(reward_jsonl_file): +# print(f"Converting {reward_file_path} to {reward_jsonl_file}. For each file, it will only perform once, please wait...") +# # convert_parquet_to_jsonl(reward_file_path, reward_jsonl_file) +# parquet_to_json(reward_file_path, reward_jsonl_file) +# print(f'Successfully converted {reward_file_path} to {reward_jsonl_file}') +# if not os.path.exists(instruct_jsonl_file): +# print(f"Converting {instruct_file_path} to {instruct_jsonl_file}. 
F or each file, it will only perform once, please wait...") +# # convert_parquet_to_jsonl(instruct_file_path, instruct_jsonl_file) +# parquet_to_json(instruct_file_path, instruct_jsonl_file) +# print(f'Successfully converted {instruct_file_path} to {instruct_jsonl_file}') +# +# with jsonlines.open(reward_jsonl_file) as reward_reader, jsonlines.open( +# instruct_jsonl_file) as instruct_reader: +# for obj in reward_reader: +# idx = obj["idx"] +# tokens = tokenizer.encode(obj["chosen"]) +# logps = obj["logps"].split("[")[-1].split("]")[0] +# reward_logps = list(map(float, logps.split(","))) +# reward_logps_for_reward = reward_logps[-len(tokens):] +# +# for instruct_obj in instruct_reader: +# if instruct_obj["idx"] == idx: +# instruct_logps = instruct_obj["logps"].split("[")[-1].split("]")[0] +# instruct_logps = list(map(float, instruct_logps.split(","))) +# instruct_logps_for_reward = instruct_logps[-len(tokens):] +# break +# +# differences = [instruct_logp - reward_logp for instruct_logp, reward_logp in +# zip(instruct_logps_for_reward, reward_logps_for_reward)] +# min_reward = min(differences) * 0.1 +# sum_reward = sum(differences) * 0.1 +# last_reward = differences[-1] * 0.1 +# avg_reward = sum_reward / len(tokens) * 0.1 +# +# reward_data = { +# "idx": idx, +# "ds_name": obj["ds_name"], +# "question": obj["question"], +# "chosen": obj["chosen"], +# "image": obj["image"], +# "image_path": obj["image_path"], +# "origin_split": obj["origin_split"], +# "origin_dataset": obj["origin_dataset"], +# "min": min_reward, +# "sum": sum_reward, +# "ORM": last_reward, +# "avg": avg_reward +# } +# +# rewards.append(reward_data) +# return rewards def compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir): rewards = [] @@ -38,59 +87,45 @@ def compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir): reward_file_path = os.path.join(reward_logps_dir, reward_file) instruct_file_path = os.path.join(instruct_logps_dir, f'RLAIF-V-Dataset-withlogp_{suffix}.parquet') - reward_jsonl_file = os.path.join(reward_logps_dir, reward_file.replace('.parquet', '.jsonl')) - instruct_jsonl_file = os.path.join(instruct_logps_dir, instruct_file_path.replace('.parquet', '.jsonl')) - - if not os.path.exists(reward_jsonl_file): - print(f"Converting {reward_file_path} to {reward_jsonl_file}. For each file, it will only perform once, please wait...") - # convert_parquet_to_jsonl(reward_file_path, reward_jsonl_file) - parquet_to_json(reward_file_path, reward_jsonl_file) - print(f'Successfully converted {reward_file_path} to {reward_jsonl_file}') - if not os.path.exists(instruct_jsonl_file): - print(f"Converting {instruct_file_path} to {instruct_jsonl_file}. 
F or each file, it will only perform once, please wait...") - # convert_parquet_to_jsonl(instruct_file_path, instruct_jsonl_file) - parquet_to_json(instruct_file_path, instruct_jsonl_file) - print(f'Successfully converted {instruct_file_path} to {instruct_jsonl_file}') - - with jsonlines.open(reward_jsonl_file) as reward_reader, jsonlines.open( - instruct_jsonl_file) as instruct_reader: - for obj in reward_reader: - idx = obj["idx"] - tokens = tokenizer.encode(obj["chosen"]) - logps = obj["logps"].split("[")[-1].split("]")[0] - reward_logps = list(map(float, logps.split(","))) - reward_logps_for_reward = reward_logps[-len(tokens):] - - for instruct_obj in instruct_reader: - if instruct_obj["idx"] == idx: - instruct_logps = instruct_obj["logps"].split("[")[-1].split("]")[0] - instruct_logps = list(map(float, instruct_logps.split(","))) - instruct_logps_for_reward = instruct_logps[-len(tokens):] - break - - differences = [instruct_logp - reward_logp for instruct_logp, reward_logp in - zip(instruct_logps_for_reward, reward_logps_for_reward)] - min_reward = min(differences) * 0.1 - sum_reward = sum(differences) * 0.1 - last_reward = differences[-1] * 0.1 - avg_reward = sum_reward / len(tokens) * 0.1 - - reward_data = { - "idx": idx, - "ds_name": obj["ds_name"], - "question": obj["question"], - "chosen": obj["chosen"], - "image": obj["image"], - "image_path": obj["image_path"], - "origin_split": obj["origin_split"], - "origin_dataset": obj["origin_dataset"], - "min": min_reward, - "sum": sum_reward, - "ORM": last_reward, - "avg": avg_reward - } + reward_df = pd.read_parquet(reward_file_path) + instruct_df = pd.read_parquet(instruct_file_path) + + for _, reward_row in reward_df.iterrows(): + idx = reward_row["idx"] + tokens = tokenizer.encode(reward_row["chosen"]) + logps = reward_row["logps"].split("[")[-1].split("]")[0] + reward_logps = list(map(float, logps.split(","))) + reward_logps_for_reward = reward_logps[-len(tokens):] + + instruct_row = instruct_df[instruct_df["idx"] == idx].iloc[0] + instruct_logps = instruct_row["logps"].split("[")[-1].split("]")[0] + instruct_logps = list(map(float, instruct_logps.split(","))) + instruct_logps_for_reward = instruct_logps[-len(tokens):] + + differences = [instruct_logp - reward_logp for instruct_logp, reward_logp in + zip(instruct_logps_for_reward, reward_logps_for_reward)] + min_reward = min(differences) * 0.1 + sum_reward = sum(differences) * 0.1 + last_reward = differences[-1] * 0.1 + avg_reward = sum_reward / len(tokens) * 0.1 + + reward_data = { + "idx": idx, + "ds_name": reward_row["ds_name"], + "question": reward_row["question"], + "chosen": reward_row["chosen"], + "image": reward_row["image"], + "image_path": reward_row["image_path"], + "origin_split": reward_row["origin_split"], + "origin_dataset": reward_row["origin_dataset"], + "min": min_reward, + "sum": sum_reward, + "ORM": last_reward, + "avg": avg_reward + } + + rewards.append(reward_data) - rewards.append(reward_data) return rewards diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh index 2e6bb5f..51ce2c0 100644 --- a/data_engine/run_engine.sh +++ b/data_engine/run_engine.sh @@ -1,15 +1,25 @@ #!/bin/bash export PYTHONPATH=$(realpath .):$PYTHONPATH -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES=0,1,2,3 +GPUS_PER_NODE=4 +NNODES=1 +NODE_RANK=0 export MASTER_ADDR=localhost export MASTER_PORT=6001 - -python data_engine/data_engine.py \ - --reward_model_name llava-v1.5-7b \ - --reward_model_path /data/yaoshu/models/llava-v1.5-7b \ - --instruct_model_name 
RLAIF-V-7B \ - --instruct_model_path /data/yaoshu/models/RLAIF-V-7B \ - --dataset_path /data/yaoshu/dataset/RLAIF-V-Dataset \ - --work_dir /data/RLAIF-V-CC/results/test \ - --continue_from_stage 2 \ No newline at end of file +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" +torchrun $DISTRIBUTED_ARGS data_engine/data_engine.py \ + --reward_model_name RLAIF-V-7B \ + --reward_model_path /data/yaoshu/models/RLAIF-V-7B \ + --instruct_model_name llava-v1.5-7b \ + --instruct_model_path /data/yaoshu/models/llava-v1.5-7b \ + --dataset_path /data/yaoshu/dataset/origin_dataset \ + --work_dir /data/RLAIF-V-CC/results/test1 \ + --image_column image_bytes \ + --continue_from_stage 0 \ No newline at end of file diff --git a/data_engine/util.py b/data_engine/util.py new file mode 100644 index 0000000..d73b4a5 --- /dev/null +++ b/data_engine/util.py @@ -0,0 +1,71 @@ +import torch.utils.data as torch_data +from functools import partial +from muffin.train.train_utils import SFT_collator_fn +import numpy as np +import datasets as hf_datasets +from transformers.image_processing_utils import BatchFeature + +from builder.builder import load_pretrained_model +from muffin.eval.muffin_inference_logp import (InferenceSampler, concate_pad) +from dataset import PreferenceInferenceDataset + +import torch + + +def preference_collator_fn(instances, pad_token_id, is_omni=False): + rej_instances, win_instances = list(zip(*instances)) + rej_batch = SFT_collator_fn(rej_instances, pad_token_id) + win_batch = SFT_collator_fn(win_instances, pad_token_id) + + concatenated_input_ids = concate_pad(win_batch['input_ids'], rej_batch['input_ids'], pad_token_id) + concatenated_labels = concate_pad(win_batch['labels'], rej_batch['labels'], -100) + concatenated_attention_mask = concatenated_input_ids.ne(pad_token_id) + + if not is_omni: + if isinstance(win_batch['images'][0], BatchFeature): + win_images = torch.stack([torch.tensor(img.pixel_values[0]) for img in win_batch['images']]) + elif isinstance(win_batch['images'][0], np.ndarray): + win_images = torch.stack([torch.tensor(img) for img in win_batch['images']]) + else: + win_images = win_batch['images'] + + batch = dict( + concatenated_input_ids=concatenated_input_ids, + concatenated_labels=concatenated_labels, + concatenated_attention_mask=concatenated_attention_mask, + win_input_ids=win_batch['input_ids'], + rej_input_ids=rej_batch['input_ids'], + win_labels=win_batch['labels'], + rej_labels=rej_batch['labels'], + win_attention_mask=win_batch['attention_mask'], + rej_attention_mask=rej_batch['attention_mask'], + images=win_batch['images'] if is_omni else win_images, + ) + return batch + + +def load_model_and_dataloader(model_path, model_name, dataset_path): + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, + device_map={"": 'cuda'}) + image_token_len = 0 + if hasattr(model, "model") and hasattr(model.model, "config") and hasattr(model.model.config, "num_query"): + image_token_len = model.model.config.num_query + + model = model.to(dtype=torch.bfloat16, device='cuda') + hf_data = hf_datasets.load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column("image", + hf_datasets.Image( + decode=False)) + dataset = PreferenceInferenceDataset(model_name=model_name, + tokenizer=tokenizer, + data=hf_data, + image_token_len=image_token_len, + img_processor=image_processor, + use_im_start_end=False) + collate_fn = 
partial( + preference_collator_fn, + pad_token_id=tokenizer.pad_token_id, + is_omni=("omni" in model_name.lower()) or ( + "rlaif" in model_name.lower() and "12b" in model_path.lower())) # judge if the model follow omni structure + dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn, + num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset))) + return model, dataset, dataloader diff --git a/muffin/eval/muffin_inference_logp.py b/muffin/eval/muffin_inference_logp.py index 9eb3ffe..1f2d835 100644 --- a/muffin/eval/muffin_inference_logp.py +++ b/muffin/eval/muffin_inference_logp.py @@ -312,7 +312,8 @@ def write_logp_to_preference_parquet(origin_data, cache_file, logps, overwrite_l torch.distributed.barrier() - return df + if torch.distributed.get_rank() == 0: + return df def inference_logp(model, tokenizer, hf_data, cache_file, image_token_len, img_processor, use_im_start_end, is_llava15=False): model = model.to(dtype=torch.bfloat16, device='cuda') From 3e9c239629916507cef30dca73218dd3259a30b0 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Mon, 25 Nov 2024 22:37:12 +0800 Subject: [PATCH 05/18] [upgrade] able to train --- data_engine/data_engine.py | 3 +- data_engine/dataset.py | 5 ++- muffin/data/datasets.py | 56 +++++++++++++++++++--------- muffin/eval/muffin_inference_logp.py | 10 +++-- muffin/utils.py | 7 ++++ script/train/llava15_train.sh | 5 ++- 6 files changed, 60 insertions(+), 26 deletions(-) diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py index 9edce85..b553dd2 100644 --- a/data_engine/data_engine.py +++ b/data_engine/data_engine.py @@ -93,7 +93,7 @@ def run( print_stage(2.3, finish=True) print_stage(2.4, "Save file to dataset format") - output_file = os.path.join(work_dir, "dpo_dataset.parquet") + output_file = os.path.join(work_dir, "dataset", "dpo_dataset.parquet") if os.path.exists(output_file): os.remove(output_file) needed_keys = [ @@ -117,6 +117,7 @@ def run( print_stage(2, finish=True) print("Finish all stages, output file is saved to ", output_file) + print("You can directly copy this path to the training script to replace --data_dir value") print("Have a nice day!") if __name__ == "__main__": diff --git a/data_engine/dataset.py b/data_engine/dataset.py index c8612be..28f89d9 100644 --- a/data_engine/dataset.py +++ b/data_engine/dataset.py @@ -5,6 +5,7 @@ from muffin.data.datasets import bytes_to_PIL_image from muffin.train.train_utils import encode_multimodal_preference_sample, preprocess_v1 +from muffin.utils import load_attr_or_empty_str from omnilmm.train.train_utils import omni_preprocess @@ -36,9 +37,9 @@ def __init__(self, def __getitem__(self, index): sample = self.data[index] metainfo = { - "origin_dataset": sample['origin_dataset'], + "origin_dataset": load_attr_or_empty_str(sample, 'origin_dataset'), "origin_idx": sample['idx'], - "image_id": sample['image_path'], + "image_id": load_attr_or_empty_str(sample, 'image_path'), } if sample['origin_split'] is not None and sample['origin_split'] != "": metainfo["origin_split"] = json.loads(sample['origin_split']) diff --git a/muffin/data/datasets.py b/muffin/data/datasets.py index b514ed4..870375a 100644 --- a/muffin/data/datasets.py +++ b/muffin/data/datasets.py @@ -18,12 +18,16 @@ from muffin.eval.muffin_inference_logp import inference_logp import datasets as hf_datasets +from muffin.utils import load_attr_or_empty_str + + def bytes_to_PIL_image(img_buffer): img_io = io.BytesIO(img_buffer) img_io.seek(0) image = Image.open(img_io).convert('RGB') return image + 
class RLAIFVDataset(torch_data.Dataset): def __init__(self, data_dir: str, reference_model=None, tokenizer=None, image_token_len=None, img_processor=None, use_im_start_end=True, is_llava15=False): @@ -31,25 +35,43 @@ def __init__(self, data_dir: str, reference_model=None, if not op.exists(data_dir): os.makedirs(data_dir, exist_ok=True) - - data_path = [file for file in os.listdir(data_dir) if file.endswith('.parquet') and 'logp' in file] + if not op.exists(op.join(data_dir, "logps")): + os.makedirs(op.join(data_dir, "logps"), exist_ok=True) + + logps_sub_dir = False + logps_data_path = [file for file in os.listdir(data_dir) if file.endswith('.parquet') and 'logp' in file] + if len(logps_data_path) == 0 and op.exists(op.join(data_dir, "logps")): + logps_data_path = [file for file in os.listdir(op.join(data_dir, "logps")) if + file.endswith('.parquet') and 'logp' in file] + logps_sub_dir = True self.data_path = data_dir - if len(data_path) == 0: + if len(logps_data_path) == 0: assert reference_model is not None, "`reference_model` is mandatory when logps do not exist." + origin_data_path = [file for file in os.listdir(data_dir) if + file.endswith('.parquet') and 'logp' not in file] if not op.exists('./RLAIF-V-Dataset'): os.mkdir('./RLAIF-V-Dataset') - hf_data = hf_datasets.load_dataset('openbmb/RLAIF-V-Dataset', cache_dir='./RLAIF-V-Dataset')['train'].cast_column("image", hf_datasets.Image(decode=False)) + if len(origin_data_path) == 0: + hf_data = hf_datasets.load_dataset('openbmb/RLAIF-V-Dataset', cache_dir='./RLAIF-V-Dataset')[ + 'train'].cast_column("image", hf_datasets.Image(decode=False)) + else: + hf_data = hf_datasets.load_dataset(data_dir, + cache_dir='./RLAIF-V-Dataset')['train'].cast_column("image", + hf_datasets.Image( + decode=False)) - inference_logp(reference_model, tokenizer, hf_data, self.data_path, - image_token_len, img_processor, use_im_start_end, is_llava15=is_llava15) + inference_logp(reference_model, tokenizer, hf_data, op.join(data_dir, "logps"), + image_token_len, img_processor, use_im_start_end, is_llava15=is_llava15) torch.distributed.barrier() - self.data = hf_datasets.load_dataset(data_dir)['train'].cast_column("image", hf_datasets.Image(decode=False)) + self.data = hf_datasets.load_dataset(op.join(data_dir, "logps"))['train'].cast_column("image", + hf_datasets.Image(decode=False)) else: - self.data = hf_datasets.load_dataset(data_dir)['train'].cast_column("image", hf_datasets.Image(decode=False)) + self.data = hf_datasets.load_dataset(op.join(data_dir, "logps") if logps_sub_dir else data_dir)['train'].cast_column("image", + hf_datasets.Image(decode=False)) self.line_idx = list(range(len(self.data))) random.shuffle(self.line_idx) @@ -66,10 +88,10 @@ def __getitem__(self, index): image = bytes_to_PIL_image(sample['image']['bytes']) metainfo = { - "origin_dataset": sample['origin_dataset'], - "origin_split": sample['origin_split'], - "origin_idx": sample['idx'], - "image_id": sample['image_path'], + "origin_dataset": load_attr_or_empty_str(sample, 'origin_dataset'), + "origin_split": load_attr_or_empty_str(sample, 'origin_split'), + "origin_idx": load_attr_or_empty_str(sample, 'idx'), + "image_id": load_attr_or_empty_str(sample, 'image_path'), } data_dict = { @@ -77,17 +99,18 @@ def __getitem__(self, index): "question": question, "chosen": chosen, "rejected": rejected, - "idx": sample['idx'], + "idx": load_attr_or_empty_str(sample, 'idx'), "metainfo": metainfo } - logps=json.loads(sample['logps']) + logps = json.loads(sample['logps']) if type(logps) == type([]): 
(data_dict['ref_win_logp'], data_dict['ref_win_avg_logp'], data_dict['ref_win_per_token_logp'],
-            data_dict['ref_rej_logp'], data_dict['ref_rej_avg_logp'], data_dict['ref_rej_per_token_logp']) = logps
+             data_dict['ref_rej_logp'], data_dict['ref_rej_avg_logp'], data_dict['ref_rej_per_token_logp']) = logps
         else:
             (data_dict['ref_win_logp'], data_dict['ref_win_avg_logp'], data_dict['ref_win_per_token_logp'],
-            data_dict['ref_rej_logp'], data_dict['ref_rej_avg_logp'], data_dict['ref_rej_per_token_logp']) = logps['logps']
+             data_dict['ref_rej_logp'], data_dict['ref_rej_avg_logp'], data_dict['ref_rej_per_token_logp']) = logps[
+                'logps']
 
         return data_dict
 
@@ -285,4 +308,3 @@ def __getitem__(self, index):
 
     def __len__(self):
         return self.size
-
diff --git a/muffin/eval/muffin_inference_logp.py b/muffin/eval/muffin_inference_logp.py
index 1f2d835..3694a2e 100644
--- a/muffin/eval/muffin_inference_logp.py
+++ b/muffin/eval/muffin_inference_logp.py
@@ -10,6 +10,7 @@
 import PIL.Image as PIL_image
 from functools import partial
 from muffin.train.train_utils import encode_multimodal_preference_sample, SFT_collator_fn, preprocess_v1
+from muffin.utils import load_attr_or_empty_str
 
 
 def bytes_to_PIL_image(img_buffer):
@@ -135,11 +136,12 @@ def __init__(self,
 
     def __getitem__(self, index):
         sample = self.data[index]
+        origin_split = load_attr_or_empty_str(sample, 'origin_split')
         metainfo = {
-            "origin_dataset": sample['origin_dataset'],
-            "origin_split": json.loads(sample['origin_split']),
-            "origin_idx": sample['idx'],
-            "image_id": sample['image_path'],
+            "origin_dataset": load_attr_or_empty_str(sample, 'origin_dataset'),
+            "origin_split": json.loads(origin_split) if origin_split != "" else origin_split,
+            "origin_idx": load_attr_or_empty_str(sample, 'idx'),
+            "image_id": load_attr_or_empty_str(sample, 'image_path'),
         }
         question = {'from': 'human', 'value': f"<image>\n{sample['question']}"}
         chosen = {'from': 'gpt', 'value': sample['chosen']}
diff --git a/muffin/utils.py b/muffin/utils.py
index d94af97..56538db 100644
--- a/muffin/utils.py
+++ b/muffin/utils.py
@@ -125,3 +125,10 @@ def pretty_print_semaphore(semaphore):
     if semaphore is None:
         return "None"
     return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
+
+
+def load_attr_or_empty_str(obj: dict, attr: str):
+    if attr in obj:
+        return obj[attr]
+    else:
+        return ""
diff --git a/script/train/llava15_train.sh b/script/train/llava15_train.sh
index b3c62a7..88fd4d6 100644
--- a/script/train/llava15_train.sh
+++ b/script/train/llava15_train.sh
@@ -1,12 +1,13 @@
 export PYTHONPATH=$PYTHONPATH:`realpath .`
+export CUDA_VISIBLE_DEVICES=0,1,2,3
 
 task_name=llava15_7b_DPO
 exp_name=llava15_rlaifv
 
 deepspeed ./muffin/train/train_llava15.py \
     --deepspeed ./script/zero2.json \
-    --model_name_or_path liuhaotian/llava-v1.5-7b \
-    --data_dir ./RLAIF-V-Dataset_logps/ \
+    --model_name_or_path /data/yaoshu/models/llava-v1.5-7b \
+    --data_dir /data/RLAIF-V-CC/results/test/dataset/ \
     --image_folder not_used \
     --vision_tower openai/clip-vit-large-patch14-336 \
     --mm_use_im_start_end False \

From 9e5e1ae3229bc6851e7dc193bf74fdfaa0566d28 Mon Sep 17 00:00:00 2001
From: MagicYao
Date: Tue, 26 Nov 2024 16:02:27 +0800
Subject: [PATCH 06/18] [upgrade] fix and readme

---
 data_engine/README.md           | 38 +++++++++++++++++++++++++++++
 data_engine/README_zh.md        | 38 +++++++++++++++++++++++++++++
 data_engine/answer_sampler.py   |  2 --
 data_engine/data_engine.py      |  8 +++----
 data_engine/dataset.py          |  5 +++--
 data_engine/logps_calculator.py |  5 ++---
 data_engine/run_engine.sh       | 14 ++++++------
 
data_engine/util.py             |  8 +++++++
 eval/eval_gpt_obj_halbench.py   | 13 ++++++++++-
 9 files changed, 112 insertions(+), 19 deletions(-)
 create mode 100644 data_engine/README.md
 create mode 100644 data_engine/README_zh.md

diff --git a/data_engine/README.md b/data_engine/README.md
new file mode 100644
index 0000000..af7bd0a
--- /dev/null
+++ b/data_engine/README.md
@@ -0,0 +1,38 @@
+# Data Engine
+
+## Welcome
+Thank you for using Data Engine.
+This part of the code is used to build the DPO dataset, which you can use directly for training.
+You only need to input the reward model, instruction model, and your dataset, and we will generate the DPO dataset for you. All you need to do is run the `run_engine.sh` script.
+
+## Usage
+Please refer to the `run_engine.sh` script.
+
+You will need to provide the path and name for both the reward model and the instruction model. Currently, we support the following models: llava-1.5-7b, RLAIF-V-7B, OmniLMM-12B, and RLAIF-V-12B. We are considering adding more models in the future. If the model you wish to use is not listed, you may need to implement the corresponding code yourself (for model loading, add code to `RLAIF-V/builder`; for data formatting, change `RLAIF-V/muffin/train/train_utils.py` and `RLAIF-V/data_engine/util.py`; for log probability calculation, change `RLAIF-V/data_engine/logps_calculator.py` and `RLAIF-V/muffin/eval/muffin_inference_logp.py`).
+
+Additionally, **please double-check that the model name you provide is correct**, as we will not know which code to execute otherwise.
+
+Next, your dataset should contain the following fields:
+1. `idx`: A unique index for each data entry (this can be a string).
+2. `question`: The question related to the image.
+3. `image`: You can determine the column name by passing it via the `--image_column` parameter. The column should follow this structure:
+   - `{'bytes': ..., 'path':...}`
+   - `bytes` should be in binary format.
+   - `path` is not strictly required, but to avoid errors, it's better to keep this field (you can set it as an empty string).
+4. `image_path`: This field is optional; we will retain it in your final DPO dataset.
+5. `ds_name`: This field is also optional; it will be retained in the final DPO dataset.
+6. `origin_split`: This field is optional, but **if you pass it, please ensure it is in JSON format**. It will be retained in the final DPO dataset. (A minimal sketch of assembling a file with these fields follows this README.)
+
+You can specify a `--work_dir` to store intermediate files and the final output under this directory (the outputs will actually be written to subdirectories within it).
+
+If you encounter errors during generation, you can resume by passing the stage after the last completed one to the `--continue_from_stage` parameter (0, 1, or 2). When the value is 0, everything runs from scratch. (For example, if you've completed stages 0 and 1 but encounter an error during stage 2, you can fix the issue and set `--continue_from_stage 2` to continue from that point.) You can check the `data_engine.py` file for details on what each stage does.
+
+Run:
+```shell
+sh data_engine/run_engine.sh
+```
+
+## Conclusion
+If you run into any issues, feel free to contact us by submitting an Issue.
+
+Thank you for choosing RLAIF-V. Best wishes for your project!
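For illustration, here is a minimal sketch of assembling an input parquet with the fields listed in the README above. Only the column layout comes from the README; the pandas/Pillow usage, the file names, and the example question are assumptions made for the sketch, not code from this patch:

```python
import io
import json
import os

import pandas as pd
from PIL import Image

# Hypothetical source data: replace with your own images and questions.
examples = [("images/cat.jpg", "What is the animal doing?")]

records = []
for i, (img_file, question) in enumerate(examples):
    buf = io.BytesIO()
    with Image.open(img_file) as img:
        img.convert("RGB").save(buf, format="JPEG")
    records.append({
        "idx": str(i),                                    # unique index; a string is fine
        "question": question,                             # the question about the image
        "image": {"bytes": buf.getvalue(), "path": ""},   # binary bytes; an empty path avoids errors
        "image_path": img_file,                           # optional, carried into the DPO dataset
        "ds_name": "my_dataset",                          # optional, carried into the DPO dataset
        "origin_split": json.dumps({"split": "train"}),   # optional, must be JSON if present
    })

# The engine loads the dataset directory with `datasets.load_dataset`,
# so write the parquet into the directory you pass as --dataset_path.
os.makedirs("my_dataset_dir", exist_ok=True)
pd.DataFrame(records).to_parquet("my_dataset_dir/input.parquet")
```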
diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md
new file mode 100644
index 0000000..3af0fd9
--- /dev/null
+++ b/data_engine/README_zh.md
@@ -0,0 +1,38 @@
+# Data Engine
+
+## Welcome
+感谢您使用 Data Engine。
+此部分代码用于为您构建 DPO 数据集,您可以直接用它来进行训练。
+您只需输入奖励模型(reward model)、指令模型(instruct model)和数据集,我们将为您构建 DPO 数据集。您只需运行 `run_engine.sh` 脚本即可。
+
+## Usage
+请查看 `run_engine.sh` 脚本。
+
+您需要输入奖励模型和指令模型的路径及名称。目前我们支持以下模型:llava-1.5-7b、RLAIF-V-7B、OmniLMM-12B 和 RLAIF-V-12B。我们也在考虑添加更多模型。如果您选择的模型不在模型列表中,您可能需要自行实现相关代码:(`RLAIF-V/builder` 用于模型加载;`RLAIF-V/muffin/train/train_utils.py` 和 `RLAIF-V/data_engine/util.py` 用于数据格式化;`RLAIF-V/data_engine/logps_calculator.py` 和 `RLAIF-V/muffin/eval/muffin_inference_logp.py` 用于计算 logps)。
+
+另外,**请务必确认您提供的模型名称正确,否则我们无法确定该运行哪段代码**。
+
+接下来是您的数据集,它应该包含以下字段:
+1. `idx`:每条数据的唯一索引(可以是字符串)。
+2. `question`:图像对应的问题。
+3. `image`:您可以自定义列名,请通过 `--image_column` 参数传递该列名。该列应遵循以下结构:
+   - {'bytes': ..., 'path':...}
+   - `bytes` 应为二进制格式。
+   - `path` 字段不是必须的,但为了避免错误,建议您保留此字段(可以设置为空字符串)。
+4. `image_path`:此字段不是必需的,我们将在最终的 DPO 数据集中保留它。
+5. `ds_name`:此字段不是必需的,我们将在最终的 DPO 数据集中保留它。
+6. `origin_split`:此字段不是必需的,但**如果传递此字段,请确保它是 JSON 格式**,我们将保留它在最终的 DPO 数据集中。
+
+您可以选择设置 `--work_dir`,我们将在该目录下保存中间文件和最终输出(实际上是该目录下的子目录)。
+
+如果在生成过程中遇到错误,您可以使用 `--continue_from_stage` 参数指定已完成阶段的下一个阶段(0、1、2)。如果值为 0,则从头开始。(例如,您完成了阶段 0 和阶段 1,在阶段 2 遇到错误,修复问题后设置 `--continue_from_stage 2` 以继续执行)。您可以查看文件 `data_engine.py` 了解每个阶段的具体内容。
+
+运行:
+```shell
+sh data_engine/run_engine.sh
+```
+
+## Conclusion
+如果您遇到任何问题,请随时通过提交 Issues 联系我们。
+
+感谢您选择 RLAIF-V,祝您使用愉快!
diff --git a/data_engine/answer_sampler.py b/data_engine/answer_sampler.py
index 68e3dc3..e17375b 100644
--- a/data_engine/answer_sampler.py
+++ b/data_engine/answer_sampler.py
@@ -120,7 +120,6 @@ def sample_answer(model_path, dataset_path, output_path, image_column, sample=10
         samples = sorted(grouped_output_data[data_id], key=lambda x: x['sample_index'])
         flat_output_data.extend(samples)
 
-    # keep ordering when saving data in batches
     for idx, start in enumerate(range(0, len(flat_output_data), step)):
         try:
             temp_data = flat_output_data[start: min(start + step, len(flat_output_data))]
@@ -161,7 +160,6 @@ def main():
     try:
         sample_answer(model_path, dataset_path, output_path, sample)
     finally:
-        # clean up the distributed environment
         dist.destroy_process_group()
 
diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py
index b553dd2..dc0b0c4 100644
--- a/data_engine/data_engine.py
+++ b/data_engine/data_engine.py
@@ -93,9 +93,9 @@ def run(
             print_stage(2.3, finish=True)
 
             print_stage(2.4, "Save file to dataset format")
-            output_file = os.path.join(work_dir, "dataset", "dpo_dataset.parquet")
-            if os.path.exists(output_file):
-                os.remove(output_file)
+            output_path = os.path.join(work_dir, "dataset")
+            output_file = os.path.join(output_path, "dpo_dataset.parquet")
+            dir_prepare(output_path)
             needed_keys = [
                 "question",
                 "chosen",
@@ -116,7 +116,7 @@ def run(
 
             print_stage(2, finish=True)
 
-            print("Finish all stages, output file is saved to ", output_file)
+            print("Finish all stages, output file is saved to ", output_path)
             print("You can directly copy this path to the training script to replace --data_dir value")
             print("Have a nice day!")
 
diff --git a/data_engine/dataset.py b/data_engine/dataset.py
index 28f89d9..d80a795 100644
--- a/data_engine/dataset.py
+++ b/data_engine/dataset.py
@@ -8,6 +8,8 @@
 from muffin.utils import load_attr_or_empty_str
 from omnilmm.train.train_utils import omni_preprocess
 
+import util
+
 
 class PreferenceInferenceDataset(torch_data.Dataset):
     def __init__(self,
@@ -28,8 +30,7 @@ def __init__(self,
         }
         
self.tokenizer = tokenizer
 
-        lower_name = model_name.lower()
-        if "onmi" in lower_name or ('rlaif' in lower_name and '12b' in lower_name):
+        if util.judge_is_omnilmm(model_name):
             self.preprocess_func = omni_preprocess
         else:
             self.preprocess_func = partial(preprocess_v1, has_image=True)
diff --git a/data_engine/logps_calculator.py b/data_engine/logps_calculator.py
index 5c8b68d..c4c5c1b 100644
--- a/data_engine/logps_calculator.py
+++ b/data_engine/logps_calculator.py
@@ -86,7 +86,7 @@ def inference_logp(
         output_dir):
     """
     Args:
-        model_name: e.g. llava-v1.5-7, OmniLMM-12B, RLAIF-V-12B
+        model_name: e.g. llava-v1.5-7B, OmniLMM-12B, RLAIF-V-12B
         model_path: path to your model
         dataset_path: path to dataset (should follow RLAIF-V-Dataset format)
         output_dir: path to output file (logps)
@@ -101,8 +101,7 @@ def inference_logp(
         # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list
         model,
         dataloader,
-        is_llava15=("llava" in model_name.lower() or (
-                "rlaif" in model_name.lower() and "7b" in model_path.lower())))  # judge if the model follow llava structure
+        is_llava15=judge_is_llava(model_name))  # judge whether the model follows the llava structure
 
     world_size = torch.distributed.get_world_size()
     merged_outputs = [[None for _ in range(world_size)] for i in range(len(outputs))]
diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh
index 51ce2c0..0b278ec 100644
--- a/data_engine/run_engine.sh
+++ b/data_engine/run_engine.sh
@@ -15,11 +15,11 @@ DISTRIBUTED_ARGS="
     --master_port $MASTER_PORT
 "
 torchrun $DISTRIBUTED_ARGS data_engine/data_engine.py \
-    --reward_model_name RLAIF-V-7B \
-    --reward_model_path /data/yaoshu/models/RLAIF-V-7B \
-    --instruct_model_name llava-v1.5-7b \
-    --instruct_model_path /data/yaoshu/models/llava-v1.5-7b \
-    --dataset_path /data/yaoshu/dataset/origin_dataset \
-    --work_dir /data/RLAIF-V-CC/results/test1 \
-    --image_column image_bytes \
+    --reward_model_name reward model name \
+    --reward_model_path /path/to/your/reward/model \
+    --instruct_model_name instruct model name \
+    --instruct_model_path /path/to/your/instruct/model \
+    --dataset_path /path/to/your/dataset \
+    --work_dir /path/to/your/work/dir \
+    --image_column image \
     --continue_from_stage 0
\ No newline at end of file
diff --git a/data_engine/util.py b/data_engine/util.py
index d73b4a5..ed19ca8 100644
--- a/data_engine/util.py
+++ b/data_engine/util.py
@@ -69,3 +69,11 @@ def load_model_and_dataloader(model_path, model_name, dataset_path):
     dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn,
                                        num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset)))
     return model, dataset, dataloader
+
+def judge_is_llava(model_name: str) -> bool:
+    lower_name = model_name.lower()
+    return 'llava' in lower_name or ('rlaif' in lower_name and '7b' in lower_name)
+
+def judge_is_omnilmm(model_name: str) -> bool:
+    lower_name = model_name.lower()
+    return 'omnilmm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name)
\ No newline at end of file
diff --git a/eval/eval_gpt_obj_halbench.py b/eval/eval_gpt_obj_halbench.py
index 53b5892..3a217fb 100644
--- a/eval/eval_gpt_obj_halbench.py
+++ b/eval/eval_gpt_obj_halbench.py
@@ -1,4 +1,5 @@
 import os
+import re
 import sys
 import ssl
 import json
@@ -32,9 +33,19 @@
 lemma = nltk.wordnet.WordNetLemmatizer()
 
 
+def extract_json_content(text):
+    match = re.search(r'```json\n(.*?)\n```', text, re.DOTALL)
+    if match:
+        json_content = match.group(1)
+        return json.loads(json_content)
+    else:
+        
raise ValueError + def parse_object_list(content): try: - content = json.loads(content) + # content = json.loads(content) + content = extract_json_content(content) + except: if '["' in content: try: From cbd51f8499213bb1a43fe1aec285a029bcacf062 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Thu, 28 Nov 2024 23:48:10 +0800 Subject: [PATCH 07/18] [upgrade] --- data_engine/answer_sampler.py | 8 +++-- data_engine/data_engine.py | 43 +++++++++++++++++++++++---- data_engine/data_pair_builder.py | 13 ++++---- data_engine/dataset.py | 2 +- data_engine/dpo_data_filter/filter.py | 2 +- data_engine/reward_computer.py | 10 ++++--- data_engine/run_engine.sh | 3 +- 7 files changed, 60 insertions(+), 21 deletions(-) diff --git a/data_engine/answer_sampler.py b/data_engine/answer_sampler.py index e17375b..c8e6263 100644 --- a/data_engine/answer_sampler.py +++ b/data_engine/answer_sampler.py @@ -26,17 +26,19 @@ def sample_answer(model_path, dataset_path, output_path, image_column, sample=10 with torch.inference_mode(): generation_config = { - "top_p": 0.8, - "top_k": 100, + # "top_p": 0.8, + # "top_k": 100, "temperature": 0.7, "do_sample": True, - "repetition_penalty": 1.05 + # "repetition_penalty": 1.05 } + print("Loading sample answer dataset.") dataset = load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column( image_column, hf_datasets.Image(decode=False) ) + print("Finish loading") total_size = len(dataset) base_size = total_size // world_size diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py index dc0b0c4..a6ac9c9 100644 --- a/data_engine/data_engine.py +++ b/data_engine/data_engine.py @@ -1,4 +1,7 @@ +import json import os.path +import random +from copy import deepcopy import pandas as pd @@ -12,6 +15,18 @@ import torch.distributed as dist +def store_data_with_no_image(data, path): + if torch.distributed.get_rank() == 0: + data_to_store = [] + for item in data: + item = deepcopy(item) + item.pop('image', None) + data_to_store.append(item) + + with open(path, 'w') as f: + json.dump(data_to_store, f, ensure_ascii=False, indent=4) + + def print_stage(idx, desc="", finish=False): if torch.distributed.get_rank() == 0: print("=" * 80) @@ -45,8 +60,9 @@ def run( image_column="image", continue_from_stage=1, sample_k=10, - rank=10, - distance=5 + rank=3, + distance=25, + debug=False ): dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), rank=int(os.getenv('RANK', '0')), ) @@ -77,24 +93,36 @@ def run( # following code doesn't need multi CUDA if torch.distributed.get_rank() == 0: + debug_root_dir = os.path.join(work_dir, 'debug') + if debug: + print( + "You set debug=True, it will generate fine-grained process data under subdir 'debug'. 
You can check that dir for debug details.")
+            dir_prepare(debug_root_dir)
         if continue_from_stage <= 2:
             print_stage(2, "DPO dataset construction")
 
             print_stage(2.1, "Calculate reward")
             rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir)
+            if debug:
+                store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'rewards.json'))
             print_stage(2.1, finish=True)
 
             print_stage(2.2, "Build DPO pairs")
             dpo_pair = data_pair_builder.main(rewards, sample_k, rank, distance)
+            if debug:
+                store_data_with_no_image(dpo_pair, os.path.join(debug_root_dir, 'dpo_pair.json'))
             print_stage(2.2, finish=True)
 
             print_stage(2.3, "Filter DPO pairs")
             data = filter.main(dpo_pair)
+            if debug:
+                store_data_with_no_image(data, os.path.join(debug_root_dir, 'filtered.json'))
             print_stage(2.3, finish=True)
 
             print_stage(2.4, "Save file to dataset format")
             output_path = os.path.join(work_dir, "dataset")
             output_file = os.path.join(output_path, "dpo_dataset.parquet")
+            random.shuffle(data)
             dir_prepare(output_path)
             needed_keys = [
                 "question",
@@ -111,15 +139,18 @@ def run(
                 if key not in needed_keys:
                     del item[key]
             df = pd.DataFrame(data)
+            df = df.sample(frac=1).reset_index(drop=True)
             df.to_parquet(output_file)
             print_stage(2.4, finish=True)
 
             print_stage(2, finish=True)
 
+            print(f"We get {len(data)} data items in total, you may need this number to set max_steps for training")
             print("Finish all stages, output file is saved to ", output_path)
             print("You can directly copy this path to the training script to replace --data_dir value")
             print("Have a nice day!")
 
+
 if __name__ == "__main__":
     args = argparse.ArgumentParser()
     args.add_argument("--reward_model_name", type=str, help="The name of the reward model.")
@@ -131,8 +162,9 @@ def run(
     args.add_argument("--image_column", type=str, help="The column that keeps the image in your dataset.")
     args.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.")
     args.add_argument("--sample_k", type=int, default=10, help="The sample number k.")
-    args.add_argument("--rank", type=int, default=10, help="The rank number.")
-    args.add_argument("--distance", type=int, default=5, help="The distance.")
+    args.add_argument("--rank", type=int, default=3, help="The rank number.")
+    args.add_argument("--distance", type=int, default=25, help="The distance.")
+    args.add_argument("--debug", type=lambda s: s.lower() in ("true", "1"), default=False, help="Preserve fine-grained process data (parsed from string; a bare type=bool would treat 'False' as True)")
 
     args = args.parse_args()
     run(
@@ -146,5 +178,6 @@ def run(
         args.continue_from_stage,
         args.sample_k,
         args.rank,
-        args.distance
+        args.distance,
+        args.debug
     )
diff --git a/data_engine/data_pair_builder.py b/data_engine/data_pair_builder.py
index 1095f8b..c48be47 100644
--- a/data_engine/data_pair_builder.py
+++ b/data_engine/data_pair_builder.py
@@ -12,14 +12,14 @@ def get_ranking_reward_data(sample_k, rewards):
     data = list(rewards)
 
     data_pairs = [data[i:i + sample_k] for i in range(0, len(data), sample_k)]
-    # print(len(data_pairs))
-    # print("*****")
 
-    # sort each group of data pairs and write them out row by row
     for data in tqdm(data_pairs):
         # sort by sum and avg in descending order
         sum_sorted_data = sorted(data, key=lambda x: x['sum'], reverse=True)
         avg_sorted_data = sorted(data, key=lambda x: x['avg'], reverse=True)
+        # TODO
+        # sum_sorted_data = sorted(data, key=lambda x: x['sum'])
+        # avg_sorted_data = sorted(data, key=lambda x: x['avg'])
 
         # print(sum_sorted_data[0]['idx'])
 
@@ -62,7 +62,8 @@ def get_ranking_reward_data(sample_k, rewards):
     return sum_output, avg_output
 
 
-def pair_union(sum_reward, avg_reward, sample_k=10, rank=10, distance=5):
+def 
pair_union(sum_reward, avg_reward, sample_k=10, rank=3, distance=25): + print(f"sampling number k: {sample_k} \nrank number: {rank} \ndistance: {distance}") total_pairs = 0 total_used_pic = 0 flag = 0 @@ -124,11 +125,11 @@ def pair_union(sum_reward, avg_reward, sample_k=10, rank=10, distance=5): flag += 1 if sign == 1: total_used_pic += 1 - + print(f"total_used_pic: {total_used_pic}") return dpo_pair -def main(rewards, sample_k=10, rank=10, distance=5): +def main(rewards, sample_k=10, rank=3, distance=25): sum_output, avg_output = get_ranking_reward_data(sample_k, rewards) dpo_pair = pair_union(sum_output, avg_output, sample_k, rank, distance) return dpo_pair diff --git a/data_engine/dataset.py b/data_engine/dataset.py index d80a795..d41b266 100644 --- a/data_engine/dataset.py +++ b/data_engine/dataset.py @@ -42,7 +42,7 @@ def __getitem__(self, index): "origin_idx": sample['idx'], "image_id": load_attr_or_empty_str(sample, 'image_path'), } - if sample['origin_split'] is not None and sample['origin_split'] != "": + if 'origin_split' in sample and sample['origin_split'] != "": metainfo["origin_split"] = json.loads(sample['origin_split']) else: metainfo["origin_split"] = "" diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py index 982c2d1..42d29e1 100644 --- a/data_engine/dpo_data_filter/filter.py +++ b/data_engine/dpo_data_filter/filter.py @@ -50,7 +50,7 @@ def main(data): # you can add your own filters here or delete the filters # that are determined to be unnecessary - filters = [LengthFilter, NumFilter, DeleteSameFilter] + filters = [NumFilter, LengthFilter, DeleteSameFilter] for filter_to_run in filters: filter_name = filter_to_run.__name__ diff --git a/data_engine/reward_computer.py b/data_engine/reward_computer.py index fde90ce..1220503 100644 --- a/data_engine/reward_computer.py +++ b/data_engine/reward_computer.py @@ -7,6 +7,8 @@ from transformers import AutoTokenizer import argparse +from muffin.utils import load_attr_or_empty_str + # def parquet_to_json(parquet_file, jsonl_file): # df = pd.read_parquet(parquet_file, engine='pyarrow') @@ -111,13 +113,13 @@ def compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir): reward_data = { "idx": idx, - "ds_name": reward_row["ds_name"], + "ds_name": load_attr_or_empty_str(reward_row, "ds_name"), "question": reward_row["question"], "chosen": reward_row["chosen"], "image": reward_row["image"], - "image_path": reward_row["image_path"], - "origin_split": reward_row["origin_split"], - "origin_dataset": reward_row["origin_dataset"], + "image_path": load_attr_or_empty_str(reward_row, "image_path"), + "origin_split": load_attr_or_empty_str(reward_row, "origin_split"), + "origin_dataset": load_attr_or_empty_str(reward_row, "origin_dataset"), "min": min_reward, "sum": sum_reward, "ORM": last_reward, diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh index 0b278ec..f13a75b 100644 --- a/data_engine/run_engine.sh +++ b/data_engine/run_engine.sh @@ -22,4 +22,5 @@ torchrun $DISTRIBUTED_ARGS data_engine/data_engine.py \ --dataset_path /path/to/your/dataset \ --work_dir /path/to/your/work/dir \ --image_column image \ - --continue_from_stage 0 \ No newline at end of file + --continue_from_stage 0 \ + --debug True \ No newline at end of file From a2be5929912ac2ea9380a8e58d37d502b8ceec28 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Mon, 2 Dec 2024 00:03:12 +0800 Subject: [PATCH 08/18] [upgrade] --- chat.py | 1 - data_engine/README.md | 5 +- data_engine/README_zh.md | 5 +- 
data_engine/answer_sampler.py                | 174 +------------------
 data_engine/data_engine.py                   |  16 +-
 data_engine/data_pair_builder.py             |   7 +-
 data_engine/dpo_data_filter/filter.py        |   3 +-
 data_engine/dpo_data_filter/length_filter.py |   2 +-
 data_engine/dpo_data_filter/num_filter.py    |   5 -
 data_engine/logps_calculator.py              | 141 ++++++++------
 data_engine/reward_computer.py               |   2 +-
 data_engine/run_engine.sh                    |   1 -
 data_engine/util.py                          |  75 +-------
 llava/llava15_sample_data.py                 |  93 ++++++++++
 muffin/eval/muffin_inference_logp.py         |  35 +---
 muffin/gen_data_util.py                      |  46 +++++
 muffin/llava15_gen_data.py                   |  45 +----
 muffin/sample_data_util.py                   | 154 ++++++++++++++++
 omnilmm/omnilmm_gen_data.py                  |  45 +----
 omnilmm/omnilmm_sample_data.py               |  83 +++++++++
 20 files changed, 485 insertions(+), 453 deletions(-)
 create mode 100644 llava/llava15_sample_data.py
 create mode 100644 muffin/gen_data_util.py
 create mode 100644 muffin/sample_data_util.py
 create mode 100644 omnilmm/omnilmm_sample_data.py

diff --git a/chat.py b/chat.py
index ac18251..8db10a7 100644
--- a/chat.py
+++ b/chat.py
@@ -168,7 +168,6 @@ def chat(self, input, param=None):
                 input_ids,
                 images=image_tensor.unsqueeze(0).half().cuda(),
                 image_sizes=[image.size],
-                max_new_tokens=1024,
                 use_cache=True,
                 **param
             )
diff --git a/data_engine/README.md b/data_engine/README.md
index af7bd0a..3b73c04 100644
--- a/data_engine/README.md
+++ b/data_engine/README.md
@@ -8,14 +8,15 @@ You only need to input the reward model, instruction model, and your dataset, an
 ## Usage
 Please refer to the `run_engine.sh` script.
 
-You will need to provide the path and name for both the reward model and the instruction model. Currently, we support the following models: llava-1.5-7b, RLAIF-V-7B, OmniLMM-12B, and RLAIF-V-12B. We are considering adding more models in the future. If the model you wish to use is not listed, you may need to implement the corresponding code yourself (for model loading, add code to `RLAIF-V/builder`; for data formatting, change `RLAIF-V/muffin/train/train_utils.py` and `RLAIF-V/data_engine/util.py`; for log probability calculation, change `RLAIF-V/data_engine/logps_calculator.py` and `RLAIF-V/muffin/eval/muffin_inference_logp.py`).
+You will need to provide the path and name for both the reward model and the instruction model. Currently, we support the following models: llava-1.5-7b, RLAIF-V-7B, OmniLMM-12B, and RLAIF-V-12B. We are considering adding more models in the future. \
+If the model you wish to use is not listed, you may need to implement the corresponding code yourself (for model loading, add code to `RLAIF-V/builder`; for answer sampling, refer to `RLAIF-V/llava/llava15_sample_data.py` to see how data is formatted (don't forget to pass `raw_images`) and add a call to it in `RLAIF-V/data_engine/answer_sampler.py`; for log probability calculation, change the data-formatting part in `RLAIF-V/data_engine/logps_calculator.py` and the `get_multimodal_sample_logps` function in `RLAIF-V/muffin/eval/muffin_inference_logp.py`).
 
 Additionally, **please double-check that the model name you provide is correct**, as we will not know which code to execute otherwise.
 
 Next, your dataset should contain the following fields:
 1. `idx`: A unique index for each data entry (this can be a string).
 2. `question`: The question related to the image.
-3. `image`: You can determine the column name by passing it via the `--image_column` parameter. The column should follow this structure:
+3. 
`image`: The column should follow this structure: - `{'bytes': ..., 'path':...}` - `bytes` should be in binary format. - `path` is not strictly required, but to avoid errors, it's better to keep this field (you can set it as an empty string). diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md index 3af0fd9..b07f0c6 100644 --- a/data_engine/README_zh.md +++ b/data_engine/README_zh.md @@ -8,14 +8,15 @@ ## Usage 请查看 `run_engine.sh` 脚本。 -您需要输入奖励模型和指令模型的路径及名称。目前我们支持以下模型:llava-1.5-7b、RLAIF-V-7B、OmniLMM-12B 和 RLAIF-V-12B。我们也在考虑添加更多模型。如果您选择的模型不在模型列表中,您可能需要自行实现相关代码:(`RLAIF-V/builder` 用于模型加载;`RLAIF-V/muffin/train/train_utils.py` 和 `RLAIF-V/data_engine/util.py` 用于数据格式化;`RLAIF-V/data_engine/logps_calculator.py` 和 `RLAIF-V/muffin/eval/muffin_inference_logp.py` 用于计算 logps)。 +您需要输入奖励模型和指令模型的路径及名称。目前我们支持以下模型:llava-1.5-7b、RLAIF-V-7B、OmniLMM-12B 和 RLAIF-V-12B。我们也在考虑添加更多模型。\ +如果您选择的模型不在模型列表中,您可能需要自行实现相关代码:(`RLAIF-V/builder` 用于模型加载;对于初始回答抽样,请参考`RLAIF-V/llava/llava15_sample_data.py`是如何对数据进行格式化的(请不要忘记传递`raw_images`)同时将您的调用代码添加到`RLAIF-V/data_engine/answer_sampler.py`中; 对于logps计算,请更改`RLAIF-V/data_engine/logps_calculator.py`中用于格式化数据的部分,和`RLAIF-V/muffin/eval/muffin_inference_logp.py`的`get_multimodal_sample_logps`函数)。 另外,**请务必确认您提供的模型名称正确,否则我们无法确定该运行哪段代码**。 接下来是您的数据集,它应该包含以下字段: 1. `idx`:每条数据的唯一索引(可以是字符串)。 2. `question`:图像对应的问题。 -3. `image`:您可以自定义列名,请通过 `--image_column` 参数传递该列名。该列应遵循以下结构: +3. `image`:该列应遵循以下结构: - {'bytes': ..., 'path':...} - `bytes` 应为二进制格式。 - `path` 字段不是必须的,但为了避免错误,建议您保留此字段(可以设置为空字符串)。 diff --git a/data_engine/answer_sampler.py b/data_engine/answer_sampler.py index c8e6263..37f8190 100644 --- a/data_engine/answer_sampler.py +++ b/data_engine/answer_sampler.py @@ -1,169 +1,13 @@ -import os -import tqdm -import copy -from chat import RLAIFVChat -from datasets import load_dataset -import torch -import pandas as pd -from muffin.data.datasets import bytes_to_PIL_image +import llava.llava15_sample_data +import omnilmm.omnilmm_sample_data + from util import * -from collections import defaultdict import torch.distributed as dist -def sample_answer(model_path, dataset_path, output_path, image_column, sample=10): - # here we need to keep different samples of the same question adjacent to each other in the final file - # otherwise, the data_pair_builder will output data with no sense. 
- # so in this function, there are some code used to keep the order - # if you want to change them, you may also need to change code in data_pair_builder - - try: - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - world_size = int(os.environ.get("WORLD_SIZE", 1)) - - model = RLAIFVChat(model_path) - grouped_output_data = defaultdict(list) - - with torch.inference_mode(): - generation_config = { - # "top_p": 0.8, - # "top_k": 100, - "temperature": 0.7, - "do_sample": True, - # "repetition_penalty": 1.05 - } - - print("Loading sample answer dataset.") - dataset = load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column( - image_column, - hf_datasets.Image(decode=False) - ) - print("Finish loading") - - total_size = len(dataset) - base_size = total_size // world_size - remainder = total_size % world_size - - start_idx = local_rank * base_size + min(local_rank, remainder) - end_idx = start_idx + base_size + (1 if local_rank < remainder else 0) - - device_dataset = dataset.select(range(start_idx, end_idx)) - processed_indices = set() - - iterator = tqdm.tqdm( - device_dataset, - desc=f"GPU {local_rank}", - position=local_rank - ) - - for idx, data in enumerate(iterator): - try: - data_id = start_idx + idx - current_samples = [] - for i in range(sample): - try: - data_cp = copy.deepcopy(data) - # your dataset should keep image in ['image']['bytes'] or ['image_bytes']['bytes'] - # or you can change the following code to read the data in your format - if 'image' in data_cp: - data_cp['image'] = bytes_to_PIL_image(data_cp['image']['bytes']) - output = model.chat(data_cp, param=generation_config) - data_cp['chosen'] = output - data_cp['rejected'] = output - data_cp['image'] = data['image'] - data_cp['global_index'] = data_id # 添加全局索引 - data_cp['sample_index'] = i # 添加样本索引 - elif 'image_bytes' in data_cp: - data_cp['image'] = bytes_to_PIL_image(data_cp['image_bytes']['bytes']) - output = model.chat(data_cp, param=generation_config) - data_cp['chosen'] = output - data_cp['rejected'] = output - data_cp.pop('image') - data_cp['image'] = data['image_bytes'] - data_cp['global_index'] = data_id - data_cp['sample_index'] = i - else: - raise ValueError("image attribute not found") - current_samples.append(data_cp) - except Exception as e: - print(f"Error processing sample {i} for data_id {data_id}: {str(e)}") - continue - - if current_samples: # 只有在成功生成样本时才添加 - grouped_output_data[data_id] = current_samples - processed_indices.add(data_id) - except Exception as e: - print(f"Error processing data_id {data_id}: {str(e)}") - continue - - torch.distributed.barrier() - - if world_size > 1: - all_data = [None] * world_size - dist.all_gather_object(all_data, grouped_output_data) - - if local_rank == 0: - merged_data = defaultdict(list) - all_data_ids = set() - for rank_data in all_data: - all_data_ids.update(rank_data.keys()) - - for data_id in sorted(all_data_ids): - for rank_data in all_data: - if data_id in rank_data: - merged_data[data_id].extend(rank_data[data_id]) - grouped_output_data = merged_data - - if local_rank == 0: - step = 5000 - flat_output_data = [] - - for data_id in sorted(grouped_output_data.keys()): - samples = sorted(grouped_output_data[data_id], key=lambda x: x['sample_index']) - flat_output_data.extend(samples) - - for idx, start in enumerate(range(0, len(flat_output_data), step)): - try: - temp_data = flat_output_data[start: min(start + step, len(flat_output_data))] - df = pd.DataFrame(temp_data) - - df = df.sort_values(['global_index', 'sample_index']) - df = 
df.drop(columns=['global_index', 'sample_index']) - - output_file = os.path.join( - output_path, - f'RLAIF-V-Dataset-sampled_{idx:03}-{len(temp_data)}.parquet' - ) - - temp_file = output_file + '.tmp' - df.to_parquet(temp_file) - os.rename(temp_file, output_file) - - except Exception as e: - print(f"Error saving batch {idx}: {str(e)}") - continue - - except Exception as e: - print(f"Critical error in sample_answer: {str(e)}") - raise - finally: - if 'model' in locals(): - del model - - -def main(): - dist.init_process_group(backend='nccl') - - model_path = "your_model_path" - dataset_path = "your_dataset_path" - output_path = "your_output_path" - sample = 10 - - try: - sample_answer(model_path, dataset_path, output_path, sample) - finally: - dist.destroy_process_group() - - -if __name__ == "__main__": - main() +def sample_answer(model_name, model_path, dataset_path, output_path, sample_k=10): + if judge_is_llava(model_name=model_name): + llava.llava15_sample_data.main(model_name, model_path, None, dataset_path, output_path, sample=sample_k, + batch_size=sample_k) + if judge_is_omnilmm(model_name=model_name): + omnilmm.omnilmm_sample_data.main(model_path, dataset_path, output_path, sample=sample_k, batch_size=sample_k) diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py index a6ac9c9..5c83f82 100644 --- a/data_engine/data_engine.py +++ b/data_engine/data_engine.py @@ -57,24 +57,26 @@ def run( instruct_model_path, dataset_path, work_dir, - image_column="image", continue_from_stage=1, sample_k=10, rank=3, distance=25, debug=False ): + # -1: multi cuda env init dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), rank=int(os.getenv('RANK', '0')), ) torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + # 0: sample answer sampled_answer_path = os.path.join(work_dir, "sampled_answer") if continue_from_stage <= 0: print_stage(0, "Sample answers") dir_prepare(sampled_answer_path) - answer_sampler.sample_answer(instruct_model_path, dataset_path, sampled_answer_path, image_column, sample_k) + answer_sampler.sample_answer(instruct_model_name, instruct_model_path, dataset_path, sampled_answer_path, sample_k) print_stage(0, finish=True) + # 1: calculate logps reward_logps_output_dir = os.path.join(work_dir, "reward_logps") instruct_logps_output_dir = os.path.join(work_dir, "instruct_logps") if continue_from_stage <= 1: @@ -101,24 +103,30 @@ def run( if continue_from_stage <= 2: print_stage(2, "DPO dataset construction") + # 2.1: calculate reward print_stage(2.1, "Calculate reward") rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir) if debug: store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'rewards.json')) print_stage(2.1, finish=True) + # 2.2: build DPO pair print_stage(2.2, "Build DPO pairs") - dpo_pair = data_pair_builder.main(rewards, sample_k, rank, distance) + dpo_pair, sum_output, avg_output = data_pair_builder.main(rewards, sample_k, rank, distance) if debug: store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'dpo_pair.json')) + store_data_with_no_image(sum_output, os.path.join(debug_root_dir, 'sum_output.json')) + store_data_with_no_image(avg_output, os.path.join(debug_root_dir, 'avg_output.json')) print_stage(2.2, finish=True) + # 2.3: filter DPO pairs print_stage(2.3, "Filter DPO pairs") data = filter.main(dpo_pair) if debug: store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'filtered.json')) print_stage(2.3, finish=True) + # 2.4: save files 
print_stage(2.4, "Save file to dataset format") output_path = os.path.join(work_dir, "dataset") output_file = os.path.join(output_path, "dpo_dataset.parquet") @@ -159,7 +167,6 @@ def run( args.add_argument("--instruct_model_path", type=str, help="The path of the instruct model.") args.add_argument("--dataset_path", type=str, help="The path of the dataset.") args.add_argument("--work_dir", type=str, help="The working directory.") - args.add_argument("--image_column", type=str, help="The column that keep image in your dataset") args.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.") args.add_argument("--sample_k", type=int, default=10, help="The sample number k.") args.add_argument("--rank", type=int, default=3, help="The rank number.") @@ -174,7 +181,6 @@ def run( args.instruct_model_path, args.dataset_path, args.work_dir, - args.image_column, args.continue_from_stage, args.sample_k, args.rank, diff --git a/data_engine/data_pair_builder.py b/data_engine/data_pair_builder.py index c48be47..f9b3ef4 100644 --- a/data_engine/data_pair_builder.py +++ b/data_engine/data_pair_builder.py @@ -1,3 +1,5 @@ +import os.path + from nltk import word_tokenize from tqdm import tqdm @@ -17,9 +19,6 @@ def get_ranking_reward_data(sample_k, rewards): # 按照 sum 和 avg 降序排列 sum_sorted_data = sorted(data, key=lambda x: x['sum'], reverse=True) avg_sorted_data = sorted(data, key=lambda x: x['avg'], reverse=True) - # TODO - # sum_sorted_data = sorted(data, key=lambda x: x['sum']) - # avg_sorted_data = sorted(data, key=lambda x: x['avg']) # print(sum_sorted_data[0]['idx']) @@ -132,7 +131,7 @@ def pair_union(sum_reward, avg_reward, sample_k=10, rank=3, distance=25): def main(rewards, sample_k=10, rank=3, distance=25): sum_output, avg_output = get_ranking_reward_data(sample_k, rewards) dpo_pair = pair_union(sum_output, avg_output, sample_k, rank, distance) - return dpo_pair + return dpo_pair, sum_output, avg_output if __name__ == "__main__": diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py index 42d29e1..9828cec 100644 --- a/data_engine/dpo_data_filter/filter.py +++ b/data_engine/dpo_data_filter/filter.py @@ -47,10 +47,11 @@ def main(data): from .length_filter import LengthFilter from .num_filter import NumFilter from .same_filter import DeleteSameFilter + from .ratio_filter import RatioFilter # you can add your own filters here or delete the filters # that are determined to be unnecessary - filters = [NumFilter, LengthFilter, DeleteSameFilter] + filters = [DeleteSameFilter, NumFilter, LengthFilter] for filter_to_run in filters: filter_name = filter_to_run.__name__ diff --git a/data_engine/dpo_data_filter/length_filter.py b/data_engine/dpo_data_filter/length_filter.py index 23b4692..54ebf5c 100644 --- a/data_engine/dpo_data_filter/length_filter.py +++ b/data_engine/dpo_data_filter/length_filter.py @@ -31,8 +31,8 @@ def do_filter(cls, data): print("finish sorting") print("mean difference: ", cls.calculate_mean_difference(data)) + print("popping data to reduce mean difference...") while cls.calculate_mean_difference(data) > 0.5: - print("pop data to reduce mean difference") data.pop() for item in data: del item['chosen_diff'] diff --git a/data_engine/dpo_data_filter/num_filter.py b/data_engine/dpo_data_filter/num_filter.py index 62ead4a..8cc7cdd 100644 --- a/data_engine/dpo_data_filter/num_filter.py +++ b/data_engine/dpo_data_filter/num_filter.py @@ -13,11 +13,6 @@ def count_words(cls, sentence): words = word_tokenize(sentence) return 
len(words) - @classmethod - def calculate_mean_difference(cls, data): - total_difference = sum(item['chosen_diff'] for item in data) - return total_difference / len(data) - @classmethod def do_filter(cls, data): count = {} diff --git a/data_engine/logps_calculator.py b/data_engine/logps_calculator.py index c4c5c1b..b846ce9 100644 --- a/data_engine/logps_calculator.py +++ b/data_engine/logps_calculator.py @@ -1,82 +1,55 @@ import os -import tqdm import itertools import argparse -from muffin.eval.muffin_inference_logp import (get_batch_logps, write_logp_to_preference_parquet) +from functools import partial + +import datasets +import numpy as np +from transformers import BatchFeature + +from builder.builder import load_pretrained_model +from muffin.eval.muffin_inference_logp import (write_logp_to_preference_parquet, get_multimodal_sample_logps, + concate_pad) +from muffin.gen_data_util import InferenceSampler +from muffin.train.train_utils import SFT_collator_fn from util import * +from dataset import PreferenceInferenceDataset import torch import torch.distributed as dist - - -def get_multimodal_sample_logps(model, dataloader, is_llava15=False): - win_logp_list = [] - rej_logp_list = [] - - win_avg_logp_list = [] - rej_avg_logp_list = [] - - win_per_token_logp_list = [] - rej_per_token_logp_list = [] - - with torch.inference_mode(): - idx = 0 - for batch in tqdm.tqdm(dataloader): - for key in ['win', 'rej']: - input_ids = batch[f'{key}_input_ids'].cuda() - # tokens = tokenizer.batch_decode(copy.deepcopy(input_ids)) - # print(tokens) - labels = batch[f'{key}_labels'].cuda() - attention_mask = batch[f'{key}_attention_mask'].cuda() - - if is_llava15: - # print("is llava15") - ( - _, - _, - _, - _, - inputs_embeds, - labels - ) = model.prepare_inputs_labels_for_multimodal( - input_ids=input_ids, - position_ids=None, - attention_mask=None, - past_key_values=None, - labels=labels, - images=batch['images'].to(dtype=torch.bfloat16, device='cuda'), - ) - output = model.forward( - inputs_embeds=inputs_embeds, - labels=None, - ) - else: - output = model( - input_ids=input_ids, - labels=labels, - attention_mask=attention_mask, - images=batch['images'].to(dtype=torch.bfloat16, device='cuda'), - ) - per_token_logp, log_prob, average_log_prob = get_batch_logps(output.logits, labels, return_all=True) - - # print(per_token_logp.shape, input_ids.shape, labels.shape, flush=True) - assert per_token_logp.size(1) >= input_ids.size(1) - 1 - per_token_logp = per_token_logp.tolist() - # per_token_logp = [x[:input_ids[i].ne(tokenizer.pad_token_id).sum().item()] for i, x in enumerate(per_token_logp)] - log_prob = log_prob.tolist() - average_log_prob = average_log_prob.tolist() - - if key == 'win': - win_logp_list += log_prob - win_avg_logp_list += average_log_prob - win_per_token_logp_list += per_token_logp - else: - rej_logp_list += log_prob - rej_avg_logp_list += average_log_prob - rej_per_token_logp_list += per_token_logp - # print(f'{key} logits in {output.logits.shape}, logp in {log_prob.shape} avg_logp in {average_log_prob.shape}', flush=True) - - return win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list +import torch.utils.data as torch_data + + +def preference_collator_fn(instances, pad_token_id, is_omni=False): + rej_instances, win_instances = list(zip(*instances)) + rej_batch = SFT_collator_fn(rej_instances, pad_token_id) + win_batch = SFT_collator_fn(win_instances, pad_token_id) + + concatenated_input_ids = concate_pad(win_batch['input_ids'], 
rej_batch['input_ids'], pad_token_id) + concatenated_labels = concate_pad(win_batch['labels'], rej_batch['labels'], -100) + concatenated_attention_mask = concatenated_input_ids.ne(pad_token_id) + + if not is_omni: + if isinstance(win_batch['images'][0], BatchFeature): + win_images = torch.stack([torch.tensor(img.pixel_values[0]) for img in win_batch['images']]) + elif isinstance(win_batch['images'][0], np.ndarray): + win_images = torch.stack([torch.tensor(img) for img in win_batch['images']]) + else: + win_images = win_batch['images'] + + batch = dict( + concatenated_input_ids=concatenated_input_ids, + concatenated_labels=concatenated_labels, + concatenated_attention_mask=concatenated_attention_mask, + win_input_ids=win_batch['input_ids'], + rej_input_ids=rej_batch['input_ids'], + win_labels=win_batch['labels'], + rej_labels=rej_batch['labels'], + win_attention_mask=win_batch['attention_mask'], + rej_attention_mask=rej_batch['attention_mask'], + images=win_batch['images'] if is_omni else win_images, + ) + return batch def inference_logp( @@ -95,7 +68,29 @@ def inference_logp( """ - model, dataset, dataloader = load_model_and_dataloader(model_path, model_name, dataset_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, + device_map={"": 'cuda'}) + image_token_len = 0 + if hasattr(model, "model") and hasattr(model.model, "config") and hasattr(model.model.config, "num_query"): + image_token_len = model.model.config.num_query + + model = model.to(dtype=torch.bfloat16, device='cuda') + hf_data = datasets.load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column("image", + datasets.Image( + decode=False)) + dataset = PreferenceInferenceDataset(model_name=model_name, + tokenizer=tokenizer, + data=hf_data, + image_token_len=image_token_len, + img_processor=image_processor, + use_im_start_end=False) + collate_fn = partial( + preference_collator_fn, + pad_token_id=tokenizer.pad_token_id, + is_omni=("omni" in model_name.lower()) or ( + "rlaif" in model_name.lower() and "12b" in model_path.lower())) # judge if the model follow omni structure + dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn, + num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset))) outputs = get_multimodal_sample_logps( # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list diff --git a/data_engine/reward_computer.py b/data_engine/reward_computer.py index 1220503..6fd0289 100644 --- a/data_engine/reward_computer.py +++ b/data_engine/reward_computer.py @@ -104,7 +104,7 @@ def compute_reward(tokenizer, reward_logps_dir, instruct_logps_dir): instruct_logps = list(map(float, instruct_logps.split(","))) instruct_logps_for_reward = instruct_logps[-len(tokens):] - differences = [instruct_logp - reward_logp for instruct_logp, reward_logp in + differences = [reward_logp - instruct_logp for instruct_logp, reward_logp in zip(instruct_logps_for_reward, reward_logps_for_reward)] min_reward = min(differences) * 0.1 sum_reward = sum(differences) * 0.1 diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh index f13a75b..4d3e82a 100644 --- a/data_engine/run_engine.sh +++ b/data_engine/run_engine.sh @@ -21,6 +21,5 @@ torchrun $DISTRIBUTED_ARGS data_engine/data_engine.py \ --instruct_model_path /path/to/yout/instruct/model \ --dataset_path /path/to/your/dataset \ --work_dir /path/to/your/work/dir \ - --image_column image \ --continue_from_stage 0 \ --debug True \ No 
newline at end of file diff --git a/data_engine/util.py b/data_engine/util.py index ed19ca8..ed0bcb5 100644 --- a/data_engine/util.py +++ b/data_engine/util.py @@ -1,79 +1,8 @@ -import torch.utils.data as torch_data -from functools import partial -from muffin.train.train_utils import SFT_collator_fn -import numpy as np -import datasets as hf_datasets -from transformers.image_processing_utils import BatchFeature - -from builder.builder import load_pretrained_model -from muffin.eval.muffin_inference_logp import (InferenceSampler, concate_pad) -from dataset import PreferenceInferenceDataset - -import torch - - -def preference_collator_fn(instances, pad_token_id, is_omni=False): - rej_instances, win_instances = list(zip(*instances)) - rej_batch = SFT_collator_fn(rej_instances, pad_token_id) - win_batch = SFT_collator_fn(win_instances, pad_token_id) - - concatenated_input_ids = concate_pad(win_batch['input_ids'], rej_batch['input_ids'], pad_token_id) - concatenated_labels = concate_pad(win_batch['labels'], rej_batch['labels'], -100) - concatenated_attention_mask = concatenated_input_ids.ne(pad_token_id) - - if not is_omni: - if isinstance(win_batch['images'][0], BatchFeature): - win_images = torch.stack([torch.tensor(img.pixel_values[0]) for img in win_batch['images']]) - elif isinstance(win_batch['images'][0], np.ndarray): - win_images = torch.stack([torch.tensor(img) for img in win_batch['images']]) - else: - win_images = win_batch['images'] - - batch = dict( - concatenated_input_ids=concatenated_input_ids, - concatenated_labels=concatenated_labels, - concatenated_attention_mask=concatenated_attention_mask, - win_input_ids=win_batch['input_ids'], - rej_input_ids=rej_batch['input_ids'], - win_labels=win_batch['labels'], - rej_labels=rej_batch['labels'], - win_attention_mask=win_batch['attention_mask'], - rej_attention_mask=rej_batch['attention_mask'], - images=win_batch['images'] if is_omni else win_images, - ) - return batch - - -def load_model_and_dataloader(model_path, model_name, dataset_path): - tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, - device_map={"": 'cuda'}) - image_token_len = 0 - if hasattr(model, "model") and hasattr(model.model, "config") and hasattr(model.model.config, "num_query"): - image_token_len = model.model.config.num_query - - model = model.to(dtype=torch.bfloat16, device='cuda') - hf_data = hf_datasets.load_dataset(dataset_path, cache_dir='./cache')['train'].cast_column("image", - hf_datasets.Image( - decode=False)) - dataset = PreferenceInferenceDataset(model_name=model_name, - tokenizer=tokenizer, - data=hf_data, - image_token_len=image_token_len, - img_processor=image_processor, - use_im_start_end=False) - collate_fn = partial( - preference_collator_fn, - pad_token_id=tokenizer.pad_token_id, - is_omni=("omni" in model_name.lower()) or ( - "rlaif" in model_name.lower() and "12b" in model_path.lower())) # judge if the model follow omni structure - dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn, - num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset))) - return model, dataset, dataloader - def judge_is_llava(model_name: str) -> bool: lower_name = model_name.lower() return 'llava' in lower_name or ('rlaif' in lower_name and '7b' in lower_name) + def judge_is_omnilmm(model_name: str) -> bool: lower_name = model_name.lower() - return 'omnilmm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name) \ No newline at end of file + return 'omnilmm' in lower_name 
or ('rlaif' in lower_name and '12b' in lower_name) diff --git a/llava/llava15_sample_data.py b/llava/llava15_sample_data.py new file mode 100644 index 0000000..1d69d73 --- /dev/null +++ b/llava/llava15_sample_data.py @@ -0,0 +1,93 @@ +import os +import random +from functools import partial + +import torch +import torch.utils.data as torch_data + +from builder.builder import load_pretrained_model +from llava.mm_utils import process_images, get_model_name_from_path +from muffin.gen_data_util import InferenceSampler, torch_pad_sequence +from muffin.sample_data_util import SampleDataset, sample_and_record +from muffin.llava15_gen_data import wrap_question_for_llava15 + + +def llava15_colloator_fn(data_list, tokenizer, image_processor, config): + input_ids = [torch.as_tensor(x['question_input_ids']) for x in data_list] + + input_ids = torch_pad_sequence( + input_ids, tokenizer.pad_token_id, padding_side='left') + + # NOTE: here we need to pass `raw_images` + images = [process_images([x['image']], image_processor, config)[0] for x in data_list] + images = torch.stack(images) + raw_images = [x['raw_image'] for x in data_list] + + image_sizes = [x['image'].size for x in data_list] + + raw_questions = [x['raw_question'] for x in data_list] + data = { + 'images': images, + 'image_sizes': image_sizes, + 'input_ids': input_ids, + 'raw_questions': raw_questions, + 'raw_images': raw_images, + } + + if 'question_id' in data_list[0]: + data['question_id'] = [x['question_id'] for x in data_list] + if 'origin_dataset' in data_list[0]: + data['origin_dataset'] = [x['origin_dataset'] for x in data_list] + if 'answer' in data_list[0]: + data['gt_answers'] = [x['answer'] for x in data_list] + if 'image_id' in data_list[0]: + data['image_id'] = [x['image_id'] for x in data_list] + if 'metainfo' in data_list[0]: + data['metainfo'] = [x['metainfo'] for x in data_list] + if 'metainfos' in data_list[0]: + data['metainfos'] = [x['metainfos'] for x in data_list] + + return data + + +def main(model_name, model_path, model_base, ds_path, answer_dir, sample=10, seed=0, batch_size=10, + num_workers=16, conv_mode='llava_v1', max_tokens=512, temperature=0.7): + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + backend='nccl', + world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), + ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + print(f'Init Rank-{torch.distributed.get_rank()}') + + model_path = os.path.expanduser(model_path) + if model_name is None: + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, + device_map={ + "": 'cuda'}) # device_map={"": 'cuda'} + + random.seed(seed) + + question_process_func = partial( + wrap_question_for_llava15, tokenizer=tokenizer, mm_use_im_start_end=model.config.mm_use_im_start_end, + conv_mode=conv_mode) + + dataset = SampleDataset(ds_path, question_process_func, repeat_time=sample) + print(f'Dataset size is {len(dataset)}') + + collate_fn = partial(llava15_colloator_fn, tokenizer=tokenizer, + image_processor=image_processor, config=model.config) + dataloader = torch_data.DataLoader( + dataset=dataset, + sampler=InferenceSampler(len(dataset)), + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + drop_last=False, + collate_fn=collate_fn, + ) + print(f'Dataloader size is {len(dataloader)}') + + sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temperature, max_tokens) 
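The `sampler=InferenceSampler(len(dataset))` wiring above (and the class itself, re-homed into `muffin/gen_data_util.py` in the next hunk) splits the dataset into contiguous, disjoint per-rank index ranges; because the shards are contiguous and the later `all_gather_object` merge preserves rank order, the `repeat_time` copies of a question stay adjacent in the merged output, which the pair builder depends on. A minimal sketch of that sharding arithmetic, with the `torch.distributed` rank and world size replaced by plain integers purely for illustration:

```python
# Standalone sketch mirroring InferenceSampler._get_local_indices (see the
# new muffin/gen_data_util.py below); rank/world_size are plain ints here
# rather than torch.distributed state.
def get_local_indices(total_size: int, world_size: int, rank: int) -> range:
    shard_size = total_size // world_size
    left = total_size % world_size
    # The first `left` ranks each absorb one extra sample.
    shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
    begin = sum(shard_sizes[:rank])
    end = min(sum(shard_sizes[:rank + 1]), total_size)
    return range(begin, end)

# 10 samples over 3 ranks -> indices 0-3, 4-6, 7-9: contiguous, disjoint,
# and covering the dataset exactly once.
for rank in range(3):
    print(rank, list(get_local_indices(total_size=10, world_size=3, rank=rank)))
```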
diff --git a/muffin/eval/muffin_inference_logp.py b/muffin/eval/muffin_inference_logp.py index 3694a2e..b73dc2f 100644 --- a/muffin/eval/muffin_inference_logp.py +++ b/muffin/eval/muffin_inference_logp.py @@ -9,6 +9,8 @@ import torch.utils.data as torch_data import PIL.Image as PIL_image from functools import partial + +from muffin.gen_data_util import InferenceSampler from muffin.train.train_utils import encode_multimodal_preference_sample, SFT_collator_fn, preprocess_v1 from muffin.utils import load_attr_or_empty_str @@ -53,33 +55,6 @@ def get_batch_logps_minicpm(logits: torch.FloatTensor, labels: torch.LongTensor, return log_prob, average_log_prob -class InferenceSampler(torch.utils.data.sampler.Sampler): - - def __init__(self, size): - self._size = int(size) - assert size > 0 - self._rank = torch.distributed.get_rank() - self._world_size = torch.distributed.get_world_size() - self._local_indices = self._get_local_indices(size, self._world_size, - self._rank) - - @staticmethod - def _get_local_indices(total_size, world_size, rank): - shard_size = total_size // world_size - left = total_size % world_size - shard_sizes = [shard_size + int(r < left) for r in range(world_size)] - - begin = sum(shard_sizes[:rank]) - end = min(sum(shard_sizes[:rank + 1]), total_size) - return range(begin, end) - - def __iter__(self): - yield from self._local_indices - - def __len__(self): - return len(self._local_indices) - - def get_batch_logps(logits: torch.FloatTensor, labels: torch.LongTensor, return_per_token_logp=False, return_all=False, tokenizer=None) -> torch.FloatTensor: """Compute the log probabilities of the given labels under the given logits. @@ -210,9 +185,7 @@ def preference_collator_fn(instances, pad_token_id): return batch - - -def get_multimodal_sample_logps(model, dataloader, tokenizer, is_llava15=False): +def get_multimodal_sample_logps(model, dataloader, is_llava15=False): win_logp_list = [] rej_logp_list = [] @@ -328,7 +301,7 @@ def inference_logp(model, tokenizer, hf_data, cache_file, image_token_len, img_p dataloader = torch_data.DataLoader(dataset, batch_size=1, collate_fn=collate_fn, num_workers=5, shuffle=False, sampler=InferenceSampler(len(dataset))) - outputs = get_multimodal_sample_logps(model, dataloader, tokenizer, is_llava15=is_llava15) # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list + outputs = get_multimodal_sample_logps(model, dataloader, is_llava15=is_llava15) # win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list world_size = torch.distributed.get_world_size() merged_outputs = [[None for _ in range(world_size)] for i in range(len(outputs))] diff --git a/muffin/gen_data_util.py b/muffin/gen_data_util.py new file mode 100644 index 0000000..cfe9eb5 --- /dev/null +++ b/muffin/gen_data_util.py @@ -0,0 +1,46 @@ +import torch +import torch.utils.data as torch_data + + +def torch_pad_sequence(sequence, padding_value, batch_first=True, padding_side='right'): + if padding_side == 'right': + sequence = torch.nn.utils.rnn.pad_sequence( + sequence, + batch_first=batch_first, + padding_value=padding_value) + elif padding_side == 'left': + sequence = torch.nn.utils.rnn.pad_sequence( + [v.flip(-1) for v in sequence], + batch_first=batch_first, + padding_value=padding_value) + sequence = sequence.flip(-1) + else: + raise NotImplementedError(f'padding_size={padding_side}') + return sequence + + +class InferenceSampler(torch.utils.data.sampler.Sampler): + + 
def __init__(self, size): + self._size = int(size) + assert size > 0 + self._rank = torch.distributed.get_rank() + self._world_size = torch.distributed.get_world_size() + self._local_indices = self._get_local_indices(size, self._world_size, + self._rank) + + @staticmethod + def _get_local_indices(total_size, world_size, rank): + shard_size = total_size // world_size + left = total_size % world_size + shard_sizes = [shard_size + int(r < left) for r in range(world_size)] + + begin = sum(shard_sizes[:rank]) + end = min(sum(shard_sizes[:rank + 1]), total_size) + return range(begin, end) + + def __iter__(self): + yield from self._local_indices + + def __len__(self): + return len(self._local_indices) diff --git a/muffin/llava15_gen_data.py b/muffin/llava15_gen_data.py index 22d2a00..746c342 100644 --- a/muffin/llava15_gen_data.py +++ b/muffin/llava15_gen_data.py @@ -16,50 +16,7 @@ from llava.conversation import conv_templates from builder.builder import load_pretrained_model from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path - - -def torch_pad_sequence(sequence, padding_value, batch_first=True, padding_side='right'): - if padding_side == 'right': - sequence = torch.nn.utils.rnn.pad_sequence( - sequence, - batch_first=batch_first, - padding_value=padding_value) - elif padding_side == 'left': - sequence = torch.nn.utils.rnn.pad_sequence( - [v.flip(-1) for v in sequence], - batch_first=batch_first, - padding_value=padding_value) - sequence = sequence.flip(-1) - else: - raise NotImplementedError(f'padding_size={padding_side}') - return sequence - - -class InferenceSampler(torch.utils.data.sampler.Sampler): - - def __init__(self, size): - self._size = int(size) - assert size > 0 - self._rank = torch.distributed.get_rank() - self._world_size = torch.distributed.get_world_size() - self._local_indices = self._get_local_indices(size, self._world_size, - self._rank) - - @staticmethod - def _get_local_indices(total_size, world_size, rank): - shard_size = total_size // world_size - left = total_size % world_size - shard_sizes = [shard_size + int(r < left) for r in range(world_size)] - - begin = sum(shard_sizes[:rank]) - end = min(sum(shard_sizes[:rank + 1]), total_size) - return range(begin, end) - - def __iter__(self): - yield from self._local_indices - - def __len__(self): - return len(self._local_indices) +from muffin.gen_data_util import InferenceSampler, torch_pad_sequence class GenDataset(torch_data.Dataset): diff --git a/muffin/sample_data_util.py b/muffin/sample_data_util.py new file mode 100644 index 0000000..17304f3 --- /dev/null +++ b/muffin/sample_data_util.py @@ -0,0 +1,154 @@ +import io +import os + +import datasets +import pandas as pd +import torch +import torch.utils.data as torch_data +from PIL import Image +import tqdm +import itertools + + +class SampleDataset(torch_data.Dataset): + def __init__(self, file, question_process, repeat_time=10): + ''' + file: file that each line is a dict like { + 'image': { + 'bytes': Binary Data, + }, + 'question': question_text + } + ''' + super().__init__() + self.file = file + self.data = datasets.load_dataset(self.file, cache_dir='./cache')['train'].cast_column( + 'image', + datasets.Image(decode=False) + ) + + # print("org data len:", len(self.data), f"\nstart={start} end={end}") + # if end != -1 or start != 0: + # if end == -1: + # end = len(self.data) + # self.data = self.data.select(range(start, end)) + + new_data = [] + for i in range(len(self.data)): + new_data += [self.data[i]] * repeat_time + + self.data 
= new_data + self.question_process = question_process + self.start_idx = 0 + + def __getitem__(self, index): + item = self.data[index] + # print(item.keys()) + if "image" in item.keys(): + img = item['image']['bytes'] + raw_img = img + image = Image.open(io.BytesIO(img)).convert('RGB') + elif "image_path" in item.keys(): + # print("in") + image = Image.open(item['image_path']).convert('RGB') + raw_img = image + elif "image_path" in item['metainfos'].keys(): + # print("in metainfos") + image = Image.open(item['metainfos']['image_path']).convert('RGB') + raw_img = image + + metainfo = {key: value for key, value in item.items() if key not in ["image_id", "question", "image"]} + raw_question = item['question'] + question_input_ids = self.question_process(raw_question) + + return { + 'question_id': item['question_id'] if 'question_id' in item else self.start_idx + index, + 'image': image, + 'raw_image': raw_img, + 'question_input_ids': question_input_ids, + 'raw_question': raw_question, + 'metainfos': metainfo, + 'origin_dataset': self.file + } + + def __len__(self): + return len(self.data) + + +def sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temperature=0.7, max_tokens=512): + outputs = [] + cnt = 0 + with torch.inference_mode(): + for batch in tqdm.tqdm(dataloader, f'Generating answers'): + output = model.generate( + inputs=batch['input_ids'].cuda(), + images=batch['images'].half().cuda(), + image_sizes=batch['image_sizes'], + do_sample=True, + temperature=temperature, + max_new_tokens=max_tokens, + use_cache=True, + return_dict_in_generate=True) + + for question, output_ids, question_id, metainfos, raw_image in zip(batch['raw_questions'], + output.sequences, + batch['question_id'], + batch['metainfos'], + batch['raw_images']): + response = tokenizer.decode( + output_ids, skip_special_tokens=True) + response = response.strip() + + if 'ds_question_id' in metainfos: + outputs.append({ + 'idx': question_id, + 'question_id': question_id, + 'ds_question_id': metainfos['ds_question_id'], + 'question': question, + 'chosen': response, + 'rejected': response, + 'image': raw_image, + 'metainfos': metainfos, + 'model_path': model_path + }) + else: + outputs.append({ + 'idx': question_id, + 'question_id': question_id, + 'question': question, + 'chosen': response, + 'rejected': response, + 'image': raw_image, + 'metainfos': metainfos, + 'model_path': model_path + }) + + cnt += 1 + if cnt == 10: + torch.distributed.barrier() + cnt = 0 + + torch.distributed.barrier() + + world_size = torch.distributed.get_world_size() + merged_outputs = [None for _ in range(world_size)] + + torch.distributed.all_gather_object(merged_outputs, outputs) + + merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] + print(f'Merged outputs: {len(merged_outputs)}') + + if torch.distributed.get_rank() == 0: + step = 5000 + for idx, start in enumerate(range(0, len(merged_outputs), step)): + temp_data = merged_outputs[start: min(start + step, len(merged_outputs))] + df = pd.DataFrame(temp_data) + output_file = os.path.join( + answer_dir, + f'RLAIF-V-Dataset-sampled_{idx:03}-{len(temp_data)}.parquet' + ) + temp_file = output_file + '.tmp' + df.to_parquet(temp_file) + os.rename(temp_file, output_file) + + torch.distributed.barrier() diff --git a/omnilmm/omnilmm_gen_data.py b/omnilmm/omnilmm_gen_data.py index 6304147..0c59b57 100644 --- a/omnilmm/omnilmm_gen_data.py +++ b/omnilmm/omnilmm_gen_data.py @@ -12,50 +12,7 @@ import torch.utils.data as torch_data import tqdm from chat import 
init_omni_lmm, wrap_question_for_omni_lmm - - -def torch_pad_sequence(sequence, padding_value, batch_first=True, padding_side='right'): - - if padding_side == 'right': - sequence = torch.nn.utils.rnn.pad_sequence( - sequence, - batch_first=batch_first, - padding_value=padding_value) - elif padding_side == 'left': - sequence = torch.nn.utils.rnn.pad_sequence( - [v.flip(-1) for v in sequence], - batch_first=batch_first, - padding_value=padding_value) - sequence = sequence.flip(-1) - else: - raise NotImplementedError(f'padding_size={padding_side}') - return sequence - -class InferenceSampler(torch.utils.data.sampler.Sampler): - - def __init__(self, size): - self._size = int(size) - assert size > 0 - self._rank = torch.distributed.get_rank() - self._world_size = torch.distributed.get_world_size() - self._local_indices = self._get_local_indices(size, self._world_size, - self._rank) - - @staticmethod - def _get_local_indices(total_size, world_size, rank): - shard_size = total_size // world_size - left = total_size % world_size - shard_sizes = [shard_size + int(r < left) for r in range(world_size)] - - begin = sum(shard_sizes[:rank]) - end = min(sum(shard_sizes[:rank + 1]), total_size) - return range(begin, end) - - def __iter__(self): - yield from self._local_indices - - def __len__(self): - return len(self._local_indices) +from muffin.gen_data_util import InferenceSampler, torch_pad_sequence class GenDataset(torch_data.Dataset): diff --git a/omnilmm/omnilmm_sample_data.py b/omnilmm/omnilmm_sample_data.py new file mode 100644 index 0000000..fcbe22f --- /dev/null +++ b/omnilmm/omnilmm_sample_data.py @@ -0,0 +1,83 @@ +import os +import random +from functools import partial + +import torch +import torch.utils.data as torch_data +from chat import init_omni_lmm, wrap_question_for_omni_lmm +from muffin.gen_data_util import InferenceSampler, torch_pad_sequence +from muffin.sample_data_util import SampleDataset, sample_and_record + + +def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): + input_ids = [torch.as_tensor(x['question_input_ids']) for x in data_list] + attn_mask = [torch.as_tensor([1] * len(x)) for x in input_ids] + + input_ids = torch_pad_sequence( + input_ids, tokenizer.pad_token_id, padding_side='left') + attn_mask = torch_pad_sequence(attn_mask, 0, padding_side='left') + + images = [img_transform(x['image']) for x in data_list] + images = torch.stack(images) + raw_images = [x['raw_image'] for x in data_list] + + raw_questions = [x['raw_question'] for x in data_list] + data = { + 'images': images, + 'input_ids': input_ids, + 'attention_mask': attn_mask, + 'raw_questions': raw_questions, + 'raw_images': raw_images, + } + + if 'question_id' in data_list[0]: + data['question_id'] = [x['question_id'] for x in data_list] + if 'origin_dataset' in data_list[0]: + data['origin_dataset'] = [x['origin_dataset'] for x in data_list] + if 'answer' in data_list[0]: + data['gt_answers'] = [x['answer'] for x in data_list] + if 'image_id' in data_list[0]: + data['image_id'] = [x['image_id'] for x in data_list] + if 'metainfo' in data_list[0]: + data['metainfo'] = [x['metainfo'] for x in data_list] + if 'metainfos' in data_list[0]: + data['metainfos'] = [x['metainfos'] for x in data_list] + + return data + + +def main(model_path, ds_path, answer_dir, sample=10, seed=0, batch_size=10, + num_workers=16, max_tokens=512, temperature=0.7): + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + backend='nccl', + world_size=int(os.getenv('WORLD_SIZE', '1')), + 
rank=int(os.getenv('RANK', '0')), + ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + + print(f'Init Rank-{torch.distributed.get_rank()}') + model, image_processor, image_token_len, tokenizer = init_omni_lmm( + model_path) + random.seed(seed) + + question_process_func = partial( + wrap_question_for_omni_lmm, image_token_len=image_token_len, tokenizer=tokenizer) + + dataset = SampleDataset(ds_path, question_process_func, repeat_time=sample) + print(f'Dataset size is {len(dataset)}') + + collate_fn = partial(zephyr_qa_colloator_fn, tokenizer=tokenizer, + img_transform=image_processor) + dataloader = torch_data.DataLoader( + dataset=dataset, + sampler=InferenceSampler(len(dataset)), + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + drop_last=False, + collate_fn=collate_fn, + ) + print(f'Dataloader size is {len(dataloader)}') + + sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temperature, max_tokens) From 1271f4e2d8bd13a0863a635486479b8d1f0b0f00 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Mon, 2 Dec 2024 00:29:21 +0800 Subject: [PATCH 09/18] [upgrade] --- chat.py | 39 +++++++++++++--------------------- llava/llava15_sample_data.py | 1 + omnilmm/omnilmm_sample_data.py | 1 + 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/chat.py b/chat.py index 9ab4447..b6b730b 100644 --- a/chat.py +++ b/chat.py @@ -133,7 +133,7 @@ def __init__(self, model_path) -> None: self.image_processor = image_processor self.context_len = context_len - def chat(self, input, param=None): + def chat(self, input): msgs = input['question'] if self.model.config.mm_use_im_start_end: msgs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + msgs @@ -153,24 +153,15 @@ def chat(self, input, param=None): 0).cuda() image_tensor = process_images([image], self.image_processor, self.model.config)[0] with torch.inference_mode(): - if param is None: - output_ids = self.model.generate( - input_ids, - images=image_tensor.unsqueeze(0).half().cuda(), - image_sizes=[image.size], - do_sample=False, - temperature=0, - num_beams=3, - max_new_tokens=1024, - use_cache=True) - else: - output_ids = self.model.generate( - input_ids, - images=image_tensor.unsqueeze(0).half().cuda(), - image_sizes=[image.size], - use_cache=True, - **param - ) + output_ids = self.model.generate( + input_ids, + images=image_tensor.unsqueeze(0).half().cuda(), + image_sizes=[image.size], + do_sample=False, + temperature=0, + num_beams=3, + max_new_tokens=1024, + use_cache=True) outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() return outputs @@ -186,7 +177,7 @@ def __init__(self, model_path, model_base) -> None: def chat(self, input): msgs = input['question'] - if model.config.mm_use_im_start_end: + if self.model.config.mm_use_im_start_end: msgs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + msgs else: msgs = DEFAULT_IMAGE_TOKEN + '\n' + msgs @@ -219,11 +210,11 @@ def __init__(self, model_path) -> None: self.model = RLAIFV12B(model_path) elif '7B' in model_path: self.model = RLAIFV7B(model_path) - if 'lora_checkpoint' in model_path: - self.model = RLAIFV7BLoRA(model_path, model_base='liuhaotian/llava-v1.5-7b') + elif 'lora_checkpoint' in model_path: + self.model = RLAIFV7BLoRA(model_path, model_base='liuhaotian/llava-v1.5-7b') - def chat(self, input, param=None): - return self.model.chat(input, param=param) + def chat(self, input): + return self.model.chat(input) if __name__ == '__main__': diff --git 
a/llava/llava15_sample_data.py b/llava/llava15_sample_data.py index 1d69d73..edc8ca6 100644 --- a/llava/llava15_sample_data.py +++ b/llava/llava15_sample_data.py @@ -91,3 +91,4 @@ def main(model_name, model_path, model_base, ds_path, answer_dir, sample=10, see print(f'Dataloader size is {len(dataloader)}') sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temperature, max_tokens) + del model diff --git a/omnilmm/omnilmm_sample_data.py b/omnilmm/omnilmm_sample_data.py index fcbe22f..ccc4ea5 100644 --- a/omnilmm/omnilmm_sample_data.py +++ b/omnilmm/omnilmm_sample_data.py @@ -81,3 +81,4 @@ def main(model_path, ds_path, answer_dir, sample=10, seed=0, batch_size=10, print(f'Dataloader size is {len(dataloader)}') sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temperature, max_tokens) + del model From d221e205bc351a5156be83fefd959f9988eebafe Mon Sep 17 00:00:00 2001 From: MagicYao Date: Tue, 3 Dec 2024 00:25:33 +0800 Subject: [PATCH 10/18] [upgrade] --- data_engine/README.md | 3 --- data_engine/README_zh.md | 3 --- data_engine/data_engine.py | 3 ++- data_engine/dpo_data_filter/filter.py | 1 - data_engine/dpo_data_filter/same_filter.py | 14 ++++---------- llava/llava15_sample_data.py | 2 ++ muffin/sample_data_util.py | 19 +++++++++++-------- omnilmm/omnilmm_sample_data.py | 2 ++ 8 files changed, 21 insertions(+), 26 deletions(-) diff --git a/data_engine/README.md b/data_engine/README.md index 3b73c04..1059f50 100644 --- a/data_engine/README.md +++ b/data_engine/README.md @@ -20,9 +20,6 @@ Next, your dataset should contain the following fields: - `{'bytes': ..., 'path':...}` - `bytes` should be in binary format. - `path` is not strictly required, but to avoid errors, it's better to keep this field (you can set it as an empty string). -4. `image_path`: This field is optional; we will retain it in your final DPO dataset. -5. `ds_name`: This field is also optional; it will be retained in the final DPO dataset. -6. `origin_split`: This field is optional, but **if you pass it, please ensure it is in JSON format**. It will be retained in the final DPO dataset. You can specify a `--work_dir` to store intermediate files and the final output under this directory (which will actually be a subdirectory within it). diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md index b07f0c6..275f78f 100644 --- a/data_engine/README_zh.md +++ b/data_engine/README_zh.md @@ -20,9 +20,6 @@ - {'bytes': ..., 'path':...} - `bytes` 应为二进制格式。 - `path` 字段不是必须的,但为了避免错误,建议您保留此字段(可以设置为空字符串)。 -4. `image_path`:此字段不是必需的,我们将在最终的 DPO 数据集中保留它。 -5. `ds_name`:此字段不是必需的,我们将在最终的 DPO 数据集中保留它。 -6. 
`origin_split`:此字段不是必需的,但**如果传递此字段,请确保它是 JSON 格式**,我们将保留它在最终的 DPO 数据集中。 您可以选择设置 `--work_dir`,我们将在该目录下保存中间文件和最终输出(实际上是该目录下的子目录)。 diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py index 5c83f82..a5fef98 100644 --- a/data_engine/data_engine.py +++ b/data_engine/data_engine.py @@ -73,7 +73,8 @@ def run( if continue_from_stage <= 0: print_stage(0, "Sample answers") dir_prepare(sampled_answer_path) - answer_sampler.sample_answer(instruct_model_name, instruct_model_path, dataset_path, sampled_answer_path, sample_k) + answer_sampler.sample_answer(instruct_model_name, instruct_model_path, dataset_path, sampled_answer_path, + sample_k) print_stage(0, finish=True) # 1: calculate logps diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py index 9828cec..0c65c4a 100644 --- a/data_engine/dpo_data_filter/filter.py +++ b/data_engine/dpo_data_filter/filter.py @@ -47,7 +47,6 @@ def main(data): from .length_filter import LengthFilter from .num_filter import NumFilter from .same_filter import DeleteSameFilter - from .ratio_filter import RatioFilter # you can add your own filters here or delete the filters # that are determined to be unnecessary diff --git a/data_engine/dpo_data_filter/same_filter.py b/data_engine/dpo_data_filter/same_filter.py index fcdecc8..cf18d99 100644 --- a/data_engine/dpo_data_filter/same_filter.py +++ b/data_engine/dpo_data_filter/same_filter.py @@ -13,25 +13,19 @@ class DeleteSameFilter(Filter): def do_filter(cls, data): unique_data = set() delete_same_output = [] - temp_image_store = {} # 用于暂存 image 数据 + temp_image_store = {} for idx, obj in enumerate(data): - # 创建对象的深拷贝 obj_copy = deepcopy(obj) - # 如果存在 image 字段,暂存它 - if 'image' in obj_copy: - # 使用对象的其他字段作为键来存储 image - image_data = obj_copy.pop('image') - temp_key = f"temp_key_{idx}" # 使用索引创建唯一的临时键 - temp_image_store[temp_key] = (image_data, obj_copy) + image_data = obj_copy.pop('image') + temp_key = f"temp_key_{idx}" + temp_image_store[temp_key] = (image_data, obj_copy) - # 将处理后的数据序列化为字符串 data_str = json.dumps(obj_copy, sort_keys=True) if data_str not in unique_data: unique_data.add(data_str) - # 如果有 image,从临时存储中恢复它 if temp_key in temp_image_store: stored_image, _ = temp_image_store[temp_key] obj_copy['image'] = stored_image diff --git a/llava/llava15_sample_data.py b/llava/llava15_sample_data.py index edc8ca6..82b6799 100644 --- a/llava/llava15_sample_data.py +++ b/llava/llava15_sample_data.py @@ -36,6 +36,8 @@ def llava15_colloator_fn(data_list, tokenizer, image_processor, config): if 'question_id' in data_list[0]: data['question_id'] = [x['question_id'] for x in data_list] + if 'idx' in data_list[0]: + data['idx'] = [x['idx'] for x in data_list] if 'origin_dataset' in data_list[0]: data['origin_dataset'] = [x['origin_dataset'] for x in data_list] if 'answer' in data_list[0]: diff --git a/muffin/sample_data_util.py b/muffin/sample_data_util.py index 17304f3..d3562bf 100644 --- a/muffin/sample_data_util.py +++ b/muffin/sample_data_util.py @@ -61,7 +61,7 @@ def __getitem__(self, index): raw_question = item['question'] question_input_ids = self.question_process(raw_question) - return { + res = { 'question_id': item['question_id'] if 'question_id' in item else self.start_idx + index, 'image': image, 'raw_image': raw_img, @@ -70,6 +70,8 @@ def __getitem__(self, index): 'metainfos': metainfo, 'origin_dataset': self.file } + res['idx'] = item['idx'] if 'idx' in item else res['question_id'] + return res def __len__(self): return len(self.data) @@ -90,18 +92,19 @@ def 
sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temp use_cache=True, return_dict_in_generate=True) - for question, output_ids, question_id, metainfos, raw_image in zip(batch['raw_questions'], - output.sequences, - batch['question_id'], - batch['metainfos'], - batch['raw_images']): + for question, output_ids, idx, question_id, metainfos, raw_image in zip(batch['raw_questions'], + output.sequences, + batch['idx'], + batch['question_id'], + batch['metainfos'], + batch['raw_images']): response = tokenizer.decode( output_ids, skip_special_tokens=True) response = response.strip() if 'ds_question_id' in metainfos: outputs.append({ - 'idx': question_id, + 'idx': idx, 'question_id': question_id, 'ds_question_id': metainfos['ds_question_id'], 'question': question, @@ -113,7 +116,7 @@ def sample_and_record(dataloader, model_path, model, tokenizer, answer_dir, temp }) else: outputs.append({ - 'idx': question_id, + 'idx': idx, 'question_id': question_id, 'question': question, 'chosen': response, diff --git a/omnilmm/omnilmm_sample_data.py b/omnilmm/omnilmm_sample_data.py index ccc4ea5..d90df1e 100644 --- a/omnilmm/omnilmm_sample_data.py +++ b/omnilmm/omnilmm_sample_data.py @@ -32,6 +32,8 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): if 'question_id' in data_list[0]: data['question_id'] = [x['question_id'] for x in data_list] + if 'idx' in data_list[0]: + data['idx'] = [x['idx'] for x in data_list] if 'origin_dataset' in data_list[0]: data['origin_dataset'] = [x['origin_dataset'] for x in data_list] if 'answer' in data_list[0]: From 21fcd8ba7d135b7aca1fc0a57f1b6e2d158eb8c8 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Tue, 3 Dec 2024 00:36:20 +0800 Subject: [PATCH 11/18] [upgrade] README add some explanation --- data_engine/README.md | 2 +- data_engine/README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data_engine/README.md b/data_engine/README.md index 1059f50..90a7102 100644 --- a/data_engine/README.md +++ b/data_engine/README.md @@ -3,7 +3,7 @@ ## Welcome Thank you for using Data Engine. This part of the code is used to build the DPO dataset, which you can use for direct training. -You only need to input the reward model, instruction model, and your dataset, and we will generate the DPO dataset for you. All you need to do is run the `run_engine.sh` script. +You only need to input the reward model (the model trained with DPO which is used for guidance), instruction model (the model you want to train), and your dataset, and we will generate the DPO dataset for you. All you need to do is run the `run_engine.sh` script. ## Usage Please refer to the `run_engine.sh` script. 
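The reward/instruct split this README change spells out is computed per token in `data_engine/reward_computer.py`; note the sign fix earlier in this series, `differences = [reward_logp - instruct_logp ...]`, scaled by 0.1. A minimal numeric sketch with hypothetical per-token log probabilities; only the `min` and `sum` variants visible in these diffs are reproduced here:

```python
# Hypothetical per-token log probabilities for one sampled answer.
reward_logps = [-1.2, -0.4, -2.0]    # DPO-trained reward model
instruct_logps = [-1.5, -0.9, -1.0]  # instruct model being improved

# Per-token margin, as in compute_reward after the patch-08 sign fix.
# (The real code first truncates instruct_logps to the answer tokens.)
differences = [r - i for r, i in zip(reward_logps, instruct_logps)]

min_reward = min(differences) * 0.1  # worst single token: -1.0 * 0.1
sum_reward = sum(differences) * 0.1  # whole-answer margin: -0.2 * 0.1
print(differences)                   # [0.3, 0.5, -1.0], up to float rounding
print(min_reward, sum_reward)        # ~ -0.1, ~ -0.02
```

A positive margin on a token means the reward model assigns it higher likelihood than the instruct model does; ranking the k samples of a question by these `sum`/`avg` scores is exactly what `data_pair_builder.py` consumes.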
diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md
index 275f78f..d335ab3 100644
--- a/data_engine/README_zh.md
+++ b/data_engine/README_zh.md
@@ -3,7 +3,7 @@
 ## Welcome
 感谢您使用 Data Engine。
 此部分代码用于为您构建 DPO 数据集,您可以直接用它来进行训练。
-您只需输入奖励模型(reward model)、指令模型(instruct model)和数据集,我们将为您构建 DPO 数据集。您只需运行 `run_engine.sh` 脚本即可。
+您只需输入奖励模型(reward model,也即一个经过DPO训练的模型,用来为您要训练的模型做指导)、指令模型(instruct model,您要训练的模型)和数据集,我们将为您构建 DPO 数据集。您只需运行 `run_engine.sh` 脚本即可。
 
 ## Usage
 请查看 `run_engine.sh` 脚本。

From 0dbb20fa6e1ee009bc74fd6719ad57cb058f2864 Mon Sep 17 00:00:00 2001
From: MagicYao
Date: Tue, 3 Dec 2024 21:58:20 +0800
Subject: [PATCH 12/18] [upgrade] some simple change

---
 data_engine/README.md         | 18 ++++++++----------
 data_engine/README_zh.md      | 15 ++++++---------
 script/train/llava15_train.sh |  4 ++--
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/data_engine/README.md b/data_engine/README.md
index 90a7102..704f460 100644
--- a/data_engine/README.md
+++ b/data_engine/README.md
@@ -1,15 +1,18 @@
 # Data Engine
 
-## Welcome
-Thank you for using Data Engine.
+## Overview
 This part of the code is used to build the DPO dataset, which you can use for direct training.
-You only need to input the reward model (a DPO-trained model used for guidance), the instruction model (the model you want to train), and your dataset, and we will generate the DPO dataset for you. All you need to do is run the `run_engine.sh` script.
+You only need to input the reward model, instruction model, and your dataset, and we will generate the DPO dataset for you. All you need to do is run the `run_engine.sh` script. \
+Instruct model: we use the instruct model to generate raw answers to the given questions in the dataset. \
+Reward model: the model used to evaluate the answers generated by the instruct model. We score each answer with the help of the reward model and use these rewards to rank the answers. After that, we can build the DPO dataset.
 
 ## Usage
 Please refer to the `run_engine.sh` script.
-You will need to provide the path and name for both the reward model and the instruction model. Currently, we support the following models: llava-1.5-7b, RLAIF-V-7B, OmniLMM-12B, and RLAIF-V-12B. We are considering adding more models in the future. \
-If the model you wish to use is not listed, you may need to implement the corresponding code yourself (for model loading, add code to `RLAIF-V/builder`; for answer sampling, refer to `RLAIF-V/llava/llava15_sample_data.py` to see how data is formatted (don't forget to pass `raw_images`) and add call it in `RLAIF-V/data_engine/answer_sampler.py`; for log probability calculation, change data formatting part in `RLAIF-V/data_engine/logps_calculator.py` and `get_multimodal_sample_logps` function in `RLAIF-V/muffin/eval/muffin_inference_logp.py`).
+You can specify the reward model and instruction model you want to use for generating the preference training dataset. The currently supported reward and instruction models are listed below: \
+llava-1.5-7b, RLAIF-V-7B, OmniLMM-12B, and RLAIF-V-12B. We are considering adding more models in the future. \
+If the model you wish to use is not listed, you may need to implement the corresponding code yourself: \
+(for model loading, add code to `RLAIF-V/builder`; for answer sampling, refer to `RLAIF-V/llava/llava15_sample_data.py` to see how data is formatted (don't forget to pass `raw_images`) and add a call to it in `RLAIF-V/data_engine/answer_sampler.py`; for log probability calculation, change the data-formatting part in `RLAIF-V/data_engine/logps_calculator.py` and the `get_multimodal_sample_logps` function in `RLAIF-V/muffin/eval/muffin_inference_logp.py`).
 
 Additionally, **please double-check that the model name you provide is correct**, as we will not know which code to execute otherwise.
@@ -29,8 +32,3 @@
 Run:
 ```shell
 sh data_engine/run_data_engine.sh
 ```
-
-## Conclusion
-If you run into any issues, feel free to contact us by submitting an Issue.
-
-Thank you for choosing RLAIF-V. Best wishes for your project!
diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md
index d335ab3..75cd4ea 100644
--- a/data_engine/README_zh.md
+++ b/data_engine/README_zh.md
@@ -1,14 +1,16 @@
 # Data Engine
 
-## Welcome
-感谢您使用 Data Engine。
+## Overview
 此部分代码用于为您构建 DPO 数据集,您可以直接用它来进行训练。
-您只需输入奖励模型(reward model,也即一个经过DPO训练的模型,用来为您要训练的模型做指导)、指令模型(instruct model,您要训练的模型)和数据集,我们将为您构建 DPO 数据集。您只需运行 `run_engine.sh` 脚本即可。
+您只需输入奖励模型(reward model)、指令模型(instruct model)和数据集,我们将为您构建 DPO 数据集。您只需运行 `run_engine.sh` 脚本即可。\
+指令模型:我们使用指令模型来生成数据集中给定问题的原始答案。 \
+奖励模型:我们用来评估指令模型生成的答案的模型。我们借助奖励模型获得答案的奖励,并使用此奖励对答案进行排名。之后,我们可以构建 DPO 数据集。
 
 ## Usage
 请查看 `run_engine.sh` 脚本。
-您需要输入奖励模型和指令模型的路径及名称。目前我们支持以下模型:llava-1.5-7b、RLAIF-V-7B、OmniLMM-12B 和 RLAIF-V-12B。我们也在考虑添加更多模型。\
+您可以指定要用于生成偏好训练数据集的奖励模型和指令模型。当前支持的奖励模型和指令模型列表如下:\
+llava-1.5-7b、RLAIF-V-7B、OmniLMM-12B 和 RLAIF-V-12B。我们也在考虑添加更多模型。\
 如果您选择的模型不在模型列表中,您可能需要自行实现相关代码:(`RLAIF-V/builder` 用于模型加载;对于初始回答抽样,请参考`RLAIF-V/llava/llava15_sample_data.py`是如何对数据进行格式化的(请不要忘记传递`raw_images`)同时将您的调用代码添加到`RLAIF-V/data_engine/answer_sampler.py`中; 对于logps计算,请更改`RLAIF-V/data_engine/logps_calculator.py`中用于格式化数据的部分,和`RLAIF-V/muffin/eval/muffin_inference_logp.py`的`get_multimodal_sample_logps`函数)。
 
 另外,**请务必确认您提供的模型名称正确,否则我们无法确定该运行哪段代码**。
@@ -29,8 +31,3 @@
 ```shell
 sh data_engine/run_data_engine.sh
 ```
-
-## Conclusion
-如果您遇到任何问题,请随时通过提交 Issues 联系我们。
-
-感谢您选择 RLAIF-V,祝您使用愉快!
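[Editor's note: the README above tells users of unlisted models to add their own loader under `RLAIF-V/builder`. The sketch below illustrates that step, assuming the `ModelBuilder` interface this repo uses (`judge_able_to_build` plus a `build` that returns tokenizer, model, and image processor). The class name `MyVLMBuilder` and the model name `my-vlm` are hypothetical; answer sampling and log-probability calculation still need the separate hooks the README lists.]

```python
# Hedged sketch of a custom builder for an unlisted model. Register the class in
# model_builder_list in RLAIF-V/builder/builder.py, before the fallback
# LanguageModelBuilder, so it is tried first.
from transformers import AutoImageProcessor, AutoModelForCausalLM, AutoTokenizer

from builder.builder import ModelBuilder


class MyVLMBuilder(ModelBuilder):  # hypothetical builder
    @classmethod
    def judge_able_to_build(cls, model_name: str) -> bool:
        # Claim only the model names this builder knows how to load.
        return "my-vlm" in model_name.lower()  # hypothetical model name

    @classmethod
    def build(cls, model_path, model_base, model_name, **kwargs):
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
        image_processor = AutoImageProcessor.from_pretrained(model_path)
        return tokenizer, model, image_processor
```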
diff --git a/script/train/llava15_train.sh b/script/train/llava15_train.sh index 88fd4d6..35920c2 100644 --- a/script/train/llava15_train.sh +++ b/script/train/llava15_train.sh @@ -6,8 +6,8 @@ exp_name=llava15_rlaifv deepspeed ./muffin/train/train_llava15.py \ --deepspeed ./script/zero2.json \ - --model_name_or_path /data/yaoshu/models/llava-v1.5-7b \ - --data_dir /data/RLAIF-V-CC/results/test/dataset/ \ + --model_name_or_path liuhaotian/llava-v1.5-7b \ + --data_dir ./RLAIF-V-Dataset_logps/ \ --image_folder not_used \ --vision_tower openai/clip-vit-large-patch14-336 \ --mm_use_im_start_end False \ From be73e251d3295e1f358502f51b8b05631105ea5e Mon Sep 17 00:00:00 2001 From: MagicYao Date: Wed, 4 Dec 2024 00:54:21 +0800 Subject: [PATCH 13/18] [upgrade] refactor code --- data_engine/data_engine.py | 137 ++++++------------ data_engine/dpo_data_filter/filter.py | 27 ++-- data_engine/pipeline/__init__.py | 0 .../pipeline/dpo_reward_pipeline/__init__.py | 0 .../dpo_reward_pipeline}/answer_sampler.py | 0 .../dpo_reward_pipeline}/data_pair_builder.py | 0 .../dpo_reward_pipeline}/dataset.py | 2 +- .../dpo_reward_pipeline.py | 84 +++++++++++ .../dpo_reward_pipeline}/logps_calculator.py | 4 +- .../dpo_reward_pipeline}/reward_computer.py | 0 data_engine/pipeline/pipeline.py | 37 +++++ data_engine/run_engine.sh | 1 + data_engine/util.py | 42 ++++++ 13 files changed, 230 insertions(+), 104 deletions(-) create mode 100644 data_engine/pipeline/__init__.py create mode 100644 data_engine/pipeline/dpo_reward_pipeline/__init__.py rename data_engine/{ => pipeline/dpo_reward_pipeline}/answer_sampler.py (100%) rename data_engine/{ => pipeline/dpo_reward_pipeline}/data_pair_builder.py (100%) rename data_engine/{ => pipeline/dpo_reward_pipeline}/dataset.py (98%) create mode 100644 data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py rename data_engine/{ => pipeline/dpo_reward_pipeline}/logps_calculator.py (98%) rename data_engine/{ => pipeline/dpo_reward_pipeline}/reward_computer.py (100%) create mode 100644 data_engine/pipeline/pipeline.py diff --git a/data_engine/data_engine.py b/data_engine/data_engine.py index a5fef98..cb1fd6a 100644 --- a/data_engine/data_engine.py +++ b/data_engine/data_engine.py @@ -1,53 +1,15 @@ -import json import os.path import random -from copy import deepcopy import pandas as pd -import logps_calculator -import reward_computer -import data_pair_builder -from dpo_data_filter import filter -import answer_sampler +from data_engine.pipeline.dpo_reward_pipeline.dpo_reward_pipeline import DPORewardPipeline +from data_engine.util import * import argparse import torch import torch.distributed as dist - -def store_data_with_no_image(data, path): - if torch.distributed.get_rank() == 0: - data_to_store = [] - for item in data: - item = deepcopy(item) - item.pop('image', None) - data_to_store.append(item) - - with open(path, 'w') as f: - json.dump(data_to_store, f, ensure_ascii=False, indent=4) - - -def print_stage(idx, desc="", finish=False): - if torch.distributed.get_rank() == 0: - print("=" * 80) - if not finish: - print(f"Processing Stage {idx}: {desc}") - else: - print(f"Finish Stage {idx}") - print("=" * 80) - - -def dir_prepare(dir_to_check, clean=True): - if torch.distributed.get_rank() == 0: - if not os.path.exists(dir_to_check): - os.makedirs(dir_to_check) - elif clean: - if os.path.isdir(dir_to_check): - for file in os.listdir(dir_to_check): - os.remove(os.path.join(dir_to_check, file)) - else: - os.remove(dir_to_check) - os.mkdir(dir_to_check) +pipelines = 
[DPORewardPipeline] def run( @@ -57,13 +19,26 @@ def run( instruct_model_path, dataset_path, work_dir, + pipeline_name, continue_from_stage=1, sample_k=10, rank=3, distance=25, debug=False ): - # -1: multi cuda env init + pipline = None + for pipeline_to_judge in pipelines: + if pipeline_to_judge.judge_able_to_process(pipeline_name): + pipline = pipeline_to_judge + break + if pipline is None: + raise ValueError("Unsupported pipeline") + + intermediate_step_dir = os.path.join(work_dir, "intermediate_step") + if debug: + print( + "You set debug=True, it will generate fine-grained process data under subdir 'debug'. You can check that dir for debug details.") + dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), rank=int(os.getenv('RANK', '0')), ) torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) @@ -72,66 +47,52 @@ def run( sampled_answer_path = os.path.join(work_dir, "sampled_answer") if continue_from_stage <= 0: print_stage(0, "Sample answers") - dir_prepare(sampled_answer_path) - answer_sampler.sample_answer(instruct_model_name, instruct_model_path, dataset_path, sampled_answer_path, - sample_k) + pipline.sample_rollout( + instruct_model_name, + instruct_model_path, + dataset_path, + sampled_answer_path, + sample_k, + os.path.join(intermediate_step_dir, "sample_answers"), + debug + ) print_stage(0, finish=True) # 1: calculate logps - reward_logps_output_dir = os.path.join(work_dir, "reward_logps") - instruct_logps_output_dir = os.path.join(work_dir, "instruct_logps") + reward_output_dir = os.path.join(work_dir, "reward") if continue_from_stage <= 1: - print_stage(1, "Calculate logps") - dir_prepare(reward_logps_output_dir) - dir_prepare(instruct_logps_output_dir) - _ = logps_calculator.main( + print_stage(1, "Calculate rewards") + pipline.reward_calculate( reward_model_name, reward_model_path, instruct_model_name, instruct_model_path, sampled_answer_path, - reward_logps_output_dir, - instruct_logps_output_dir) + reward_output_dir, + os.path.join(intermediate_step_dir, "calculate_rewards"), + debug + ) print_stage(1, finish=True) # following code doesn't need multi CUDA if torch.distributed.get_rank() == 0: - debug_root_dir = os.path.join(work_dir, 'debug') - if debug: - print( - "You set debug=True, it will generate fine-grained process data under subdir 'debug'. 
You can check that dir for debug details.")
-            dir_prepare(debug_root_dir)
         if continue_from_stage <= 2:
-            print_stage(2, "DPO dataset construction")
-
-            # 2.1: calculate reward
-            print_stage(2.1, "Calculate reward")
-            rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir)
-            if debug:
-                store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'rewards.json'))
-            print_stage(2.1, finish=True)
-
-            # 2.2: build DPO pair
-            print_stage(2.2, "Build DPO pairs")
-            dpo_pair, sum_output, avg_output = data_pair_builder.main(rewards, sample_k, rank, distance)
-            if debug:
-                store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'dpo_pair.json'))
-                store_data_with_no_image(sum_output, os.path.join(debug_root_dir, 'sum_output.json'))
-                store_data_with_no_image(avg_output, os.path.join(debug_root_dir, 'avg_output.json'))
-            print_stage(2.2, finish=True)
-
-            # 2.3: filter DPO pairs
-            print_stage(2.3, "Filter DPO pairs")
-            data = filter.main(dpo_pair)
-            if debug:
-                store_data_with_no_image(rewards, os.path.join(debug_root_dir, 'filtered.json'))
-            print_stage(2.3, finish=True)
-
-            # 2.4: save files
-            print_stage(2.4, "Save file to dataset format")
+            print_stage(2, "Pair build and filter")
+
+            data = pipline.pair_build_with_filter(
+                reward_output_dir,
+                os.path.join(intermediate_step_dir, "pair_build_and_filter"),
+                sample_k,
+                rank,
+                distance,
+                debug
+            )
+            print_stage(2, finish=True)
+
+            # -1: save files
+            print_stage(-1, "Save file to dataset format")
             output_path = os.path.join(work_dir, "dataset")
             output_file = os.path.join(output_path, "dpo_dataset.parquet")
-            random.shuffle(data)
             dir_prepare(output_path)
             needed_keys = [
                 "question",
@@ -150,9 +111,7 @@ def run(
             df = pd.DataFrame(data)
             df = df.sample(frac=1).reset_index(drop=True)
             df.to_parquet(output_file)
-            print_stage(2.4, finish=True)
-
-            print_stage(2, finish=True)
+            print_stage(-1)
 
             print(f"We get {len(data)} data items in total, you may need that to set max_steps for training")
             print("Finish all stages, output file is saved to ", output_path)
diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py
index 0c65c4a..cb4331d 100644
--- a/data_engine/dpo_data_filter/filter.py
+++ b/data_engine/dpo_data_filter/filter.py
@@ -41,17 +41,7 @@ def load_data(file_path):
     raise ValueError(f"Unsupported file type: {ext}")
 
 
-def main(data):
-    print(f"Before filtering, we have {len(data)} data")
-    # import filters here to avoid circulate important
-    from .length_filter import LengthFilter
-    from .num_filter import NumFilter
-    from .same_filter import DeleteSameFilter
-
-    # you can add your own filters here or delete the filters
-    # that are determined to be unnecessary
-    filters = [DeleteSameFilter, NumFilter, LengthFilter]
-
+def filter_with_filter_list(filters: list[Filter], data):
     for filter_to_run in filters:
         filter_name = filter_to_run.__name__
         filter_doc = filter_to_run.__doc__ if filter_to_run.__doc__ else "No documentation available"
@@ -67,5 +57,18 @@ def main(data):
     print(f"After filtering, we get {len(data)} data items")
     print("=" * 80 + "\n")
     print(f"After filtering, we have {len(data)} data")
-    return data
+
+
+def main(data):
+    print(f"Before filtering, we have {len(data)} data")
+    # import filters here to avoid circular imports
+    from .length_filter import LengthFilter
+    from .num_filter import NumFilter
+    from .same_filter import DeleteSameFilter
+
+    # you can add your own filters here or delete the filters
+    # that are determined to be unnecessary
+
filters = [DeleteSameFilter, NumFilter, LengthFilter] + + return filter_with_filter_list(filters, data) diff --git a/data_engine/pipeline/__init__.py b/data_engine/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_engine/pipeline/dpo_reward_pipeline/__init__.py b/data_engine/pipeline/dpo_reward_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_engine/answer_sampler.py b/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py similarity index 100% rename from data_engine/answer_sampler.py rename to data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py diff --git a/data_engine/data_pair_builder.py b/data_engine/pipeline/dpo_reward_pipeline/data_pair_builder.py similarity index 100% rename from data_engine/data_pair_builder.py rename to data_engine/pipeline/dpo_reward_pipeline/data_pair_builder.py diff --git a/data_engine/dataset.py b/data_engine/pipeline/dpo_reward_pipeline/dataset.py similarity index 98% rename from data_engine/dataset.py rename to data_engine/pipeline/dpo_reward_pipeline/dataset.py index d41b266..c754d71 100644 --- a/data_engine/dataset.py +++ b/data_engine/pipeline/dpo_reward_pipeline/dataset.py @@ -8,7 +8,7 @@ from muffin.utils import load_attr_or_empty_str from omnilmm.train.train_utils import omni_preprocess -import util +import data_engine.util as util class PreferenceInferenceDataset(torch_data.Dataset): diff --git a/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py b/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py new file mode 100644 index 0000000..26f2740 --- /dev/null +++ b/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py @@ -0,0 +1,84 @@ +import os + +import pandas as pd +import torch + +from data_engine.util import dir_prepare, store_data_with_no_image +from data_engine.pipeline.dpo_reward_pipeline import answer_sampler, logps_calculator, reward_computer, \ + data_pair_builder +from data_engine.dpo_data_filter import filter +from data_engine.pipeline.pipeline import Pipeline + + +class DPORewardPipeline(Pipeline): + @classmethod + def judge_able_to_process(cls, pipeline_name): + return pipeline_name.lower() == "dpo_reward" + + @classmethod + def sample_rollout(cls, + instruct_model_name: str, + instruct_model_path: str, + dataset_path: str, + sampled_answer_path: str, + sample_k: int, + work_dir: str, + debug: bool): + answer_sampler.sample_answer(instruct_model_name, instruct_model_path, dataset_path, sampled_answer_path, sample_k) + + @classmethod + def reward_calculate(cls, + reward_model_name: str, + reward_model_path: str, + instruct_model_name: str, + instruct_model_path: str, + sampled_answer_path: str, + reward_path: str, + work_dir: str, + debug: bool): + reward_logps_output_dir = os.path.join(work_dir, "reward_logps") + instruct_logps_output_dir = os.path.join(work_dir, "instruct_logps") + dir_prepare(reward_logps_output_dir) + dir_prepare(instruct_logps_output_dir) + logps_calculator.main( + reward_model_name, + reward_model_path, + instruct_model_name, + instruct_model_path, + sampled_answer_path, + reward_logps_output_dir, + instruct_logps_output_dir) + if torch.distributed.get_rank() == 0: + rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir) + step = 5000 + for idx, start in enumerate(range(0, len(rewards), step)): + temp_data = rewards[start: min(start + step, len(rewards))] + df = pd.DataFrame(temp_data) + df.to_parquet(os.path.join(reward_path, 
f'RLAIF-V-Dataset-reward_{idx:03}-{len(temp_data)}.parquet'))
+
+    @classmethod
+    def pair_build_with_filter(cls,
+                               reward_path: str,
+                               work_dir: str,
+                               sample_k: int,
+                               rank: int,
+                               distance: int,
+                               debug: bool):
+        rewards = []
+        reward_files = [f for f in os.listdir(reward_path) if f.endswith('.parquet')]
+
+        for reward_file in reward_files:
+            reward_file_path = os.path.join(reward_path, reward_file)
+            reward_df = pd.read_parquet(reward_file_path)
+            rewards.append(reward_df)
+        rewards = pd.concat(rewards, ignore_index=True).to_dict(orient='records')
+        dpo_pair, sum_output, avg_output = data_pair_builder.main(rewards, sample_k, rank, distance)
+        if debug:
+            store_data_with_no_image(rewards, os.path.join(work_dir, 'debug', 'dpo_pair.json'))
+            store_data_with_no_image(sum_output, os.path.join(work_dir, 'debug', 'sum_output.json'))
+            store_data_with_no_image(avg_output, os.path.join(work_dir, 'debug', 'avg_output.json'))
+        data = filter.main(dpo_pair)
+        if debug:
+            store_data_with_no_image(rewards, os.path.join(work_dir, 'debug', 'filtered.json'))
+
+        return data
diff --git a/data_engine/logps_calculator.py b/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py
similarity index 98%
rename from data_engine/logps_calculator.py
rename to data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py
index b846ce9..8d9683a 100644
--- a/data_engine/logps_calculator.py
+++ b/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py
@@ -12,8 +12,8 @@
     concate_pad)
 from muffin.gen_data_util import InferenceSampler
 from muffin.train.train_utils import SFT_collator_fn
-from util import *
-from dataset import PreferenceInferenceDataset
+from data_engine.util import *
+from data_engine.pipeline.dpo_reward_pipeline.dataset import PreferenceInferenceDataset
 
 import torch
 import torch.distributed as dist
diff --git a/data_engine/reward_computer.py b/data_engine/pipeline/dpo_reward_pipeline/reward_computer.py
similarity index 100%
rename from data_engine/reward_computer.py
rename to data_engine/pipeline/dpo_reward_pipeline/reward_computer.py
diff --git a/data_engine/pipeline/pipeline.py b/data_engine/pipeline/pipeline.py
new file mode 100644
index 0000000..77cd8bc
--- /dev/null
+++ b/data_engine/pipeline/pipeline.py
@@ -0,0 +1,37 @@
+class Pipeline:
+    @classmethod
+    def judge_able_to_process(cls, pipeline_name) -> bool:
+        raise NotImplementedError
+
+    @classmethod
+    def sample_rollout(cls,
+                       instruct_model_name: str,
+                       instruct_model_path: str,
+                       dataset_path: str,
+                       sampled_answer_path: str,
+                       sample_k: int,
+                       work_dir: str,
+                       debug: bool) -> None:
+        raise NotImplementedError
+
+    @classmethod
+    def reward_calculate(cls,
+                         reward_model_name: str,
+                         reward_model_path: str,
+                         instruct_model_name: str,
+                         instruct_model_path: str,
+                         sampled_answer_path: str,
+                         reward_path: str,
+                         work_dir: str,
+                         debug: bool) -> None:
+        raise NotImplementedError
+
+    @classmethod
+    def pair_build_with_filter(cls,
+                               reward_path: str,
+                               work_dir: str,
+                               sample_k: int,
+                               rank: int,
+                               distance: int,
+                               debug: bool) -> list:
+        raise NotImplementedError
diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh
index 4d3e82a..2348818 100644
--- a/data_engine/run_engine.sh
+++ b/data_engine/run_engine.sh
@@ -21,5 +21,6 @@ torchrun $DISTRIBUTED_ARGS data_engine/data_engine.py \
     --instruct_model_path /path/to/your/instruct/model \
     --dataset_path /path/to/your/dataset \
     --work_dir /path/to/your/work/dir \
+    --pipeline_name dpo_reward \
     --continue_from_stage 0 \
     --debug True
\ No newline at end of file
diff --git 
a/data_engine/util.py b/data_engine/util.py index ed0bcb5..4356736 100644 --- a/data_engine/util.py +++ b/data_engine/util.py @@ -1,3 +1,11 @@ +import json +import os +import shutil +from copy import deepcopy + +import torch + + def judge_is_llava(model_name: str) -> bool: lower_name = model_name.lower() return 'llava' in lower_name or ('rlaif' in lower_name and '7b' in lower_name) @@ -6,3 +14,37 @@ def judge_is_llava(model_name: str) -> bool: def judge_is_omnilmm(model_name: str) -> bool: lower_name = model_name.lower() return 'omnilmm' in lower_name or ('rlaif' in lower_name and '12b' in lower_name) + + +def store_data_with_no_image(data, path): + if torch.distributed.get_rank() == 0: + data_to_store = [] + for item in data: + item = deepcopy(item) + item.pop('image', None) + data_to_store.append(item) + + with open(path, 'w') as f: + json.dump(data_to_store, f, ensure_ascii=False, indent=4) + + +def print_stage(idx, desc="", finish=False): + if torch.distributed.get_rank() == 0: + print("=" * 80) + if not finish: + print(f"Processing Stage {idx}: {desc}") + else: + print(f"Finish Stage {idx}") + print("=" * 80) + + +def dir_prepare(dir_to_check, clean=True): + if torch.distributed.get_rank() == 0: + if not os.path.exists(dir_to_check): + os.makedirs(dir_to_check) + elif clean: + if os.path.isdir(dir_to_check): + shutil.rmtree(dir_to_check) + else: + os.remove(dir_to_check) + os.makedirs(dir_to_check) From 65755eba17317c75faa1cb2ea7b37665fb9701fe Mon Sep 17 00:00:00 2001 From: MagicYao Date: Wed, 4 Dec 2024 22:45:06 +0800 Subject: [PATCH 14/18] [upgrade] --- data_engine/{data_engine.py => engine.py} | 92 +++++++++++-------- .../pipeline/divide_and_conquer/__init__.py | 0 .../divide_and_conquer_pipeline.py | 92 +++++++++++++++++++ .../dpo_reward_pipeline/answer_sampler.py | 3 +- .../dpo_reward_pipeline.py | 13 +-- .../dpo_reward_pipeline/logps_calculator.py | 1 - data_engine/pipeline/pipeline.py | 6 +- muffin/llava15_gen_data.py | 17 ++-- 8 files changed, 168 insertions(+), 56 deletions(-) rename data_engine/{data_engine.py => engine.py} (60%) create mode 100644 data_engine/pipeline/divide_and_conquer/__init__.py create mode 100644 data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py diff --git a/data_engine/data_engine.py b/data_engine/engine.py similarity index 60% rename from data_engine/data_engine.py rename to data_engine/engine.py index cb1fd6a..78b1017 100644 --- a/data_engine/data_engine.py +++ b/data_engine/engine.py @@ -1,15 +1,16 @@ import os.path -import random +import sys +sys.path.append("./") import pandas as pd - +from data_engine.pipeline.divide_and_conquer.divide_and_conquer_pipeline import DivideAndConquerPipeline from data_engine.pipeline.dpo_reward_pipeline.dpo_reward_pipeline import DPORewardPipeline from data_engine.util import * import argparse import torch import torch.distributed as dist -pipelines = [DPORewardPipeline] +pipelines = [DPORewardPipeline, DivideAndConquerPipeline] def run( @@ -35,17 +36,22 @@ def run( raise ValueError("Unsupported pipeline") intermediate_step_dir = os.path.join(work_dir, "intermediate_step") - if debug: - print( - "You set debug=True, it will generate fine-grained process data under subdir 'debug'. 
You can check that dir for debug details.") dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), rank=int(os.getenv('RANK', '0')), ) torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + if debug: + if torch.distributed.get_rank() == 0: + print( + "You set debug=True, it will generate fine-grained process data under subdir 'debug'. You can check that dir for debug details.") + # 0: sample answer sampled_answer_path = os.path.join(work_dir, "sampled_answer") if continue_from_stage <= 0: + dir_prepare(sampled_answer_path) + sub_work_dir = os.path.join(intermediate_step_dir, "sample_answers") + dir_prepare(sub_work_dir) print_stage(0, "Sample answers") pipline.sample_rollout( instruct_model_name, @@ -53,7 +59,7 @@ def run( dataset_path, sampled_answer_path, sample_k, - os.path.join(intermediate_step_dir, "sample_answers"), + sub_work_dir, debug ) print_stage(0, finish=True) @@ -61,6 +67,9 @@ def run( # 1: calculate logps reward_output_dir = os.path.join(work_dir, "reward") if continue_from_stage <= 1: + dir_prepare(reward_output_dir) + sub_work_dir = os.path.join(intermediate_step_dir, "calculate_rewards") + dir_prepare(sub_work_dir) print_stage(1, "Calculate rewards") pipline.reward_calculate( reward_model_name, @@ -69,7 +78,7 @@ def run( instruct_model_path, sampled_answer_path, reward_output_dir, - os.path.join(intermediate_step_dir, "calculate_rewards"), + sub_work_dir, debug ) print_stage(1, finish=True) @@ -78,10 +87,12 @@ def run( if torch.distributed.get_rank() == 0: if continue_from_stage <= 2: print_stage(2, "Pair build and filter") - + sub_work_dir = os.path.join(intermediate_step_dir, "pair_build_and_filter") + dir_prepare(sub_work_dir) data = pipline.pair_build_with_filter( + sampled_answer_path, reward_output_dir, - os.path.join(intermediate_step_dir, "pair_build_and_filter"), + sub_work_dir, sample_k, rank, distance, @@ -89,34 +100,37 @@ def run( ) print_stage(2, finish=True) - # -1: save files - print_stage(-1, "Save file to dataset format") - output_path = os.path.join(work_dir, "dataset") - output_file = os.path.join(output_path, "dpo_dataset.parquet") - dir_prepare(output_path) - needed_keys = [ - "question", - "chosen", - "rejected", - "origin_dataset", - "origin_split", - "idx", - "image_path", - "ds_name", - "image"] - for item in data: - for key in list(item.keys()): - if key not in needed_keys: - del item[key] - df = pd.DataFrame(data) - df = df.sample(frac=1).reset_index(drop=True) - df.to_parquet(output_file) - print_stage(-1) - - print(f"We get {len(data)} data items in total, you may need that to set max_steps for training") - print("Finish all stages, output file is saved to ", output_path) - print("You can directly copy this path to the training script to replace --data_dir value") - print("Have a nice day!") + if isinstance(data, str): + print(f"Dataset stored to {data}") + else: + # -1: save files + print_stage(-1, "Save file to dataset format") + output_path = os.path.join(work_dir, "dataset") + output_file = os.path.join(output_path, "dpo_dataset.parquet") + dir_prepare(output_path) + needed_keys = [ + "question", + "chosen", + "rejected", + "origin_dataset", + "origin_split", + "idx", + "image_path", + "ds_name", + "image"] + for item in data: + for key in list(item.keys()): + if key not in needed_keys: + del item[key] + df = pd.DataFrame(data) + df = df.sample(frac=1).reset_index(drop=True) + df.to_parquet(output_file) + print_stage(-1) + + print(f"We get {len(data)} data items in total, you may need that to set max_steps 
for training") + print("Finish all stages, output file is saved to ", output_path) + print("You can directly copy this path to the training script to replace --data_dir value") + print("Have a nice day!") if __name__ == "__main__": @@ -127,6 +141,7 @@ def run( args.add_argument("--instruct_model_path", type=str, help="The path of the instruct model.") args.add_argument("--dataset_path", type=str, help="The path of the dataset.") args.add_argument("--work_dir", type=str, help="The working directory.") + args.add_argument("--pipeline_name", type=str, help="The pipeline you choose to run.") args.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.") args.add_argument("--sample_k", type=int, default=10, help="The sample number k.") args.add_argument("--rank", type=int, default=3, help="The rank number.") @@ -141,6 +156,7 @@ def run( args.instruct_model_path, args.dataset_path, args.work_dir, + args.pipeline_name, args.continue_from_stage, args.sample_k, args.rank, diff --git a/data_engine/pipeline/divide_and_conquer/__init__.py b/data_engine/pipeline/divide_and_conquer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py b/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py new file mode 100644 index 0000000..dbf36c7 --- /dev/null +++ b/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py @@ -0,0 +1,92 @@ +import os +import subprocess + +import torch + +from data_engine.pipeline.pipeline import Pipeline +from data_engine.util import dir_prepare + + +def run_bash_script(script_path, *args): + """Helper function to run bash scripts with arguments.""" + command = ['bash', script_path] + list(args) + subprocess.run(command, check=True) + + +def get_jsonl_file(path) -> list: + jsonl_files = [f for f in os.listdir(path) if f.endswith('.jsonl')] + return jsonl_files + + +class DivideAndConquerPipeline(Pipeline): + @classmethod + def judge_able_to_process(cls, pipeline_name) -> bool: + return "divide_and_conquer" in pipeline_name.lower() + + @classmethod + def sample_rollout(cls, + instruct_model_name: str, + instruct_model_path: str, + dataset_path: str, + sampled_answer_path: str, + sample_k: int, + work_dir: str, + debug: bool) -> None: + if torch.distributed.get_rank() == 0: + script_path = './script/data_gen/llava15/llava15_diverse_gen.sh' + run_bash_script(script_path, instruct_model_path, sampled_answer_path, dataset_path, + get_jsonl_file(dataset_path)[0], str(0), str(-1), + str(torch.cuda.device_count())) + + @classmethod + def reward_calculate(cls, + reward_model_name: str, + reward_model_path: str, + instruct_model_name: str, + instruct_model_path: str, + sampled_answer_path: str, + reward_path: str, + work_dir: str, + debug: bool) -> None: + if torch.distributed.get_rank() == 0: + script_path = './script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh' + answer_file = os.path.join(sampled_answer_path, os.path.basename(get_jsonl_file(sampled_answer_path)[0])[0]) + run_bash_script(script_path, answer_file, '0', '-1', str(torch.cuda.device_count()), + str(torch.cuda.device_count())) + script_path = './script/data_gen/omnilmm/omnilmm_autocheck.sh' + check_ques_file = "" + for file in get_jsonl_file(sampled_answer_path): + if "llama3-8b_divide.gq.qas.jsonl" in file: + check_ques_file = file + break + run_bash_script(script_path, reward_model_path, reward_path, sampled_answer_path, check_ques_file, '0', '-1', + 
str(torch.cuda.device_count())) + + @classmethod + def pair_build_with_filter(cls, + sampled_answer_path: str, + reward_path: str, + work_dir: str, + sample_k: int, + rank: int, + distance: int, + debug: bool) -> str: + if torch.distributed.get_rank() == 0: + gq_file = "" + for file in get_jsonl_file(sampled_answer_path): + if "llama3-8b_divide.gq.jsonl" in file: + gq_file = file + break + feedback_file = get_jsonl_file(reward_path)[0] + script_path = './script/data_gen/construct_pairs.sh' + run_bash_script(script_path, os.path.join(reward_path, feedback_file), os.path.join(sampled_answer_path, gq_file), str(2)) + + script_path = './utils/get_pairs_filter_shorten.py' + result_dir = os.path.join(work_dir, "dataset") + dir_prepare(result_dir) + subprocess.run([ + 'python', script_path, + '--path', os.path.join(reward_path, feedback_file), + '--save_path', os.path.join(result_dir, "result.jsonl") + ], check=True) + return os.path.join(result_dir, "result.jsonl") diff --git a/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py b/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py index 37f8190..cb5d512 100644 --- a/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py +++ b/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py @@ -1,8 +1,7 @@ import llava.llava15_sample_data import omnilmm.omnilmm_sample_data -from util import * -import torch.distributed as dist +from data_engine.util import * def sample_answer(model_name, model_path, dataset_path, output_path, sample_k=10): diff --git a/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py b/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py index 26f2740..dd8a91a 100644 --- a/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py +++ b/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py @@ -13,7 +13,7 @@ class DPORewardPipeline(Pipeline): @classmethod def judge_able_to_process(cls, pipeline_name): - return pipeline_name.lower() == "dpo_reward" + return "dpo_reward" in pipeline_name.lower() @classmethod def sample_rollout(cls, @@ -58,6 +58,7 @@ def reward_calculate(cls, @classmethod def pair_build_with_filter(cls, + sampled_answer_path: str, reward_path: str, work_dir: str, sample_k: int, @@ -74,11 +75,11 @@ def pair_build_with_filter(cls, rewards = pd.concat(rewards, ignore_index=True).to_dict(orient='records') dpo_pair, sum_output, avg_output = data_pair_builder.main(rewards, sample_k, rank, distance) if debug: - store_data_with_no_image(rewards, os.path.join(work_dir, 'debug', 'dpo_pair.json')) - store_data_with_no_image(sum_output, os.path.join(work_dir, 'debug', 'sum_output.json')) - store_data_with_no_image(avg_output, os.path.join(work_dir, 'debug', 'avg_output.json')) + debug_dir = os.path.join(work_dir, 'debug') + dir_prepare(debug_dir) + store_data_with_no_image(rewards, os.path.join(debug_dir, 'dpo_pair.json')) + store_data_with_no_image(sum_output, os.path.join(debug_dir, 'sum_output.json')) + store_data_with_no_image(avg_output, os.path.join(debug_dir, 'avg_output.json')) data = filter.main(dpo_pair) - if debug: - store_data_with_no_image(rewards, os.path.join(work_dir, 'debug', 'filtered.json')) return data diff --git a/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py b/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py index 8d9683a..6898467 100644 --- a/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py +++ b/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py @@ -1,4 +1,3 @@ -import os import itertools import argparse 
from functools import partial diff --git a/data_engine/pipeline/pipeline.py b/data_engine/pipeline/pipeline.py index 77cd8bc..2f66612 100644 --- a/data_engine/pipeline/pipeline.py +++ b/data_engine/pipeline/pipeline.py @@ -1,3 +1,6 @@ +from typing import Union + + class Pipeline: @classmethod def judge_able_to_process(cls, pipeline_name) -> bool: @@ -28,10 +31,11 @@ def reward_calculate(cls, @classmethod def pair_build_with_filter(cls, + sampled_answer_path: str, reward_path: str, work_dir: str, sample_k: int, rank: int, distance: int, - debug: bool) -> list: + debug: bool) -> Union[list, str]: raise NotImplementedError diff --git a/muffin/llava15_gen_data.py b/muffin/llava15_gen_data.py index 746c342..3a05f43 100644 --- a/muffin/llava15_gen_data.py +++ b/muffin/llava15_gen_data.py @@ -191,12 +191,13 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): args = parser.parse_args() - torch.distributed.init_process_group( - backend='nccl', - world_size=int(os.getenv('WORLD_SIZE', '1')), - rank=int(os.getenv('RANK', '0')), - ) - torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + backend='nccl', + world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), + ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) print(f'Init Rank-{torch.distributed.get_rank()}') model_path = os.path.expanduser(args.checkpoint) @@ -304,7 +305,7 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): else: if args.num_beam >= 1: - print("use beamsearch:", args.num_beam) + # print("use beamsearch:", args.num_beam) output = model.generate( inputs=batch['input_ids'].cuda(), images=batch['images'].half().cuda(), @@ -315,7 +316,7 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): use_cache=True, return_dict_in_generate=True) else: - print("use sampling:", args.temperature) + # print("use sampling:", args.temperature) output = model.generate( inputs=batch['input_ids'].cuda(), images=batch['images'].half().cuda(), From 0863ec22ffbfeef16fc9ee6db1a3cddf8cb3b12f Mon Sep 17 00:00:00 2001 From: MagicYao Date: Sat, 7 Dec 2024 23:49:27 +0800 Subject: [PATCH 15/18] [upgrade] --- data_engine/dpo_data_filter/filter.py | 24 ++- data_engine/dpo_data_filter/similar_filter.py | 55 ++++++ data_engine/engine.py | 167 ++++++++-------- .../divide_and_conquer_pipeline.py | 186 +++++++++++++----- .../dpo_reward_pipeline/data_pair_builder.py | 64 ++++-- .../dpo_reward_pipeline.py | 130 ++++++++---- .../dpo_reward_pipeline/logps_calculator.py | 9 +- data_engine/pipeline/pipeline.py | 38 +--- minicpm-llama3-v-25/minicpmv_autocheck.py | 75 +++---- minicpm-llama3-v-25/minicpmv_diverse_gen.py | 9 +- muffin/llava15_gen_data.py | 37 ++-- omnilmm/omnilmm_gen_data.py | 55 +++--- pyproject.toml | 2 +- .../llama3_8b_divide_and_conquer.sh | 16 +- utils/get_pairs_filter_shorten.py | 2 +- utils/get_preference_pairs.py | 9 +- utils/llama3_8b_inference.py | 23 ++- 17 files changed, 556 insertions(+), 345 deletions(-) create mode 100644 data_engine/dpo_data_filter/similar_filter.py diff --git a/data_engine/dpo_data_filter/filter.py b/data_engine/dpo_data_filter/filter.py index cb4331d..e23ddcf 100644 --- a/data_engine/dpo_data_filter/filter.py +++ b/data_engine/dpo_data_filter/filter.py @@ -41,22 +41,25 @@ def load_data(file_path): raise ValueError(f"Unsupported file type: {ext}") -def filter_with_filter_list(filters: list[Filter], data): +def 
filter_with_filter_list(filters: list[Filter], data, log=True): for filter_to_run in filters: filter_name = filter_to_run.__name__ filter_doc = filter_to_run.__doc__ if filter_to_run.__doc__ else "No documentation available" - print("=" * 80) - print(f"Processing Filter: {filter_name}") - print("=" * 80) - print(f"Documentation:\n{filter_doc}\n") + if log: + print("=" * 80) + print(f"Processing Filter: {filter_name}") + print("=" * 80) + print(f"Documentation:\n{filter_doc}\n") data = filter_to_run.do_filter(data) - print("=" * 80) - print(f"Filter {filter_name} Finished") - print(f"After filtering, we get {len(data)} data items") - print("=" * 80 + "\n") - print(f"After filtering, we have {len(data)} data") + if log: + print("=" * 80) + print(f"Filter {filter_name} Finished") + print(f"After filtering, we get {len(data)} data items") + print("=" * 80 + "\n") + if log: + print(f"After filtering, we have {len(data)} data") return data @@ -66,6 +69,7 @@ def main(data): from .length_filter import LengthFilter from .num_filter import NumFilter from .same_filter import DeleteSameFilter + from .similar_filter import SimilarFilter # you can add your own filters here or delete the filters # that are determined to be unnecessary diff --git a/data_engine/dpo_data_filter/similar_filter.py b/data_engine/dpo_data_filter/similar_filter.py new file mode 100644 index 0000000..0763a66 --- /dev/null +++ b/data_engine/dpo_data_filter/similar_filter.py @@ -0,0 +1,55 @@ +import tqdm + +from .filter import Filter +import jieba + + +def get_ngrams(text, n=10): + words = list(jieba.cut(text)) + return set([' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]) + + +def jaccard_similarity(set1, set2): + intersection = len(set1 & set2) + union = len(set1 | set2) + return intersection / union if union != 0 else 0 + + +def deduplicate_data(data_list, threshold=0.9): + if 'chosen' in data_list[0] and 'rejected' in data_list[0]: + chosen_seen = [] + rejected_seen = [] + unique_data = [] + for data in tqdm.tqdm(data_list): + chosen_ngrams = get_ngrams(data['chosen']) + rejected_ngrams = get_ngrams(data['rejected']) + + chosen_duplicate = any(jaccard_similarity(chosen_ngrams, seen) > threshold for seen in chosen_seen) + rejected_duplicate = any(jaccard_similarity(rejected_ngrams, seen) > threshold for seen in rejected_seen) + + if not chosen_duplicate and not rejected_duplicate: + unique_data.append(data) + chosen_seen.append(chosen_ngrams) + rejected_seen.append(rejected_ngrams) + + return unique_data + else: + text_seen = [] + unique_data = [] + for data in data_list: + text_ngrams = get_ngrams(data['text']) + + text_duplicate = any(jaccard_similarity(text_ngrams, seen) > threshold for seen in text_seen) + + if not text_duplicate: + unique_data.append(data) + text_seen.append(text_ngrams) + + # print(f"before {len(data_list)}, after {len(unique_data)}") + return unique_data + + +class SimilarFilter(Filter): + @classmethod + def do_filter(cls, data: list) -> list: + return deduplicate_data(data) \ No newline at end of file diff --git a/data_engine/engine.py b/data_engine/engine.py index 78b1017..cd87f35 100644 --- a/data_engine/engine.py +++ b/data_engine/engine.py @@ -1,103 +1,93 @@ -import os.path +import os import sys - -sys.path.append("./") +import argparse import pandas as pd +import torch +import torch.distributed as dist + from data_engine.pipeline.divide_and_conquer.divide_and_conquer_pipeline import DivideAndConquerPipeline from data_engine.pipeline.dpo_reward_pipeline.dpo_reward_pipeline import 
DPORewardPipeline from data_engine.util import * -import argparse -import torch -import torch.distributed as dist pipelines = [DPORewardPipeline, DivideAndConquerPipeline] -def run( - reward_model_name, - reward_model_path, - instruct_model_name, - instruct_model_path, - dataset_path, - work_dir, - pipeline_name, - continue_from_stage=1, - sample_k=10, - rank=3, - distance=25, - debug=False -): +def run(**kwargs): pipline = None for pipeline_to_judge in pipelines: - if pipeline_to_judge.judge_able_to_process(pipeline_name): + if pipeline_to_judge.judge_able_to_process(kwargs.get("pipeline_name", "")): pipline = pipeline_to_judge break if pipline is None: raise ValueError("Unsupported pipeline") - intermediate_step_dir = os.path.join(work_dir, "intermediate_step") + intermediate_step_dir = os.path.join(kwargs["work_dir"], "intermediate_step") - dist.init_process_group(backend='nccl', world_size=int(os.getenv('WORLD_SIZE', '1')), - rank=int(os.getenv('RANK', '0')), ) + dist.init_process_group( + backend='nccl', + world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), + ) torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) - if debug: + if kwargs.get("debug", False): if torch.distributed.get_rank() == 0: print( - "You set debug=True, it will generate fine-grained process data under subdir 'debug'. You can check that dir for debug details.") + "You set debug=True, it will generate fine-grained process data under subdir 'debug'. You can check that dir for debug details." + ) # 0: sample answer - sampled_answer_path = os.path.join(work_dir, "sampled_answer") - if continue_from_stage <= 0: + sampled_answer_path = os.path.join(kwargs["work_dir"], "sampled_answer") + if kwargs.get("continue_from_stage", 1) <= 0: dir_prepare(sampled_answer_path) sub_work_dir = os.path.join(intermediate_step_dir, "sample_answers") dir_prepare(sub_work_dir) print_stage(0, "Sample answers") pipline.sample_rollout( - instruct_model_name, - instruct_model_path, - dataset_path, - sampled_answer_path, - sample_k, - sub_work_dir, - debug + instruct_model_name=kwargs["instruct_model_name"], + instruct_model_path=kwargs["instruct_model_path"], + dataset_path=kwargs["dataset_path"], + sampled_answer_path=sampled_answer_path, + sample_k=kwargs["sample_k"], + work_dir=sub_work_dir, + debug=kwargs["debug"], + # 对于 DPORewardPipeline,可能需要额外参数,如 strict_follow_rank + strict_follow_rank=kwargs.get("strict_follow_rank", False) ) print_stage(0, finish=True) - # 1: calculate logps - reward_output_dir = os.path.join(work_dir, "reward") - if continue_from_stage <= 1: + # 1: calculate rewards + reward_output_dir = os.path.join(kwargs["work_dir"], "reward") + if kwargs.get("continue_from_stage", 1) <= 1: dir_prepare(reward_output_dir) sub_work_dir = os.path.join(intermediate_step_dir, "calculate_rewards") dir_prepare(sub_work_dir) print_stage(1, "Calculate rewards") pipline.reward_calculate( - reward_model_name, - reward_model_path, - instruct_model_name, - instruct_model_path, - sampled_answer_path, - reward_output_dir, - sub_work_dir, - debug + reward_model_name=kwargs["reward_model_name"], + reward_model_path=kwargs["reward_model_path"], + instruct_model_name=kwargs["instruct_model_name"], + instruct_model_path=kwargs["instruct_model_path"], + sampled_answer_path=sampled_answer_path, + reward_path=reward_output_dir, + work_dir=sub_work_dir, + python_path=kwargs["reward_model_python_path"], + debug=kwargs["debug"], ) print_stage(1, finish=True) # following code doesn't need multi CUDA if 
torch.distributed.get_rank() == 0: - if continue_from_stage <= 2: + if kwargs.get("continue_from_stage", 1) <= 2: print_stage(2, "Pair build and filter") sub_work_dir = os.path.join(intermediate_step_dir, "pair_build_and_filter") dir_prepare(sub_work_dir) - data = pipline.pair_build_with_filter( - sampled_answer_path, - reward_output_dir, - sub_work_dir, - sample_k, - rank, - distance, - debug - ) + pair_build_kwargs = {"sampled_answer_path": sampled_answer_path, "reward_path": reward_output_dir, + "work_dir": sub_work_dir, "sample_k": kwargs["sample_k"], "rank": kwargs["rank"], + "distance": kwargs["distance"], "debug": kwargs["debug"], + "strict_follow_rank": kwargs.get("strict_follow_rank", False)} + + data = pipline.pair_build_with_filter(**pair_build_kwargs) print_stage(2, finish=True) if isinstance(data, str): @@ -105,7 +95,7 @@ def run( else: # -1: save files print_stage(-1, "Save file to dataset format") - output_path = os.path.join(work_dir, "dataset") + output_path = os.path.join(kwargs["work_dir"], "dataset") output_file = os.path.join(output_path, "dpo_dataset.parquet") dir_prepare(output_path) needed_keys = [ @@ -117,7 +107,8 @@ def run( "idx", "image_path", "ds_name", - "image"] + "image" + ] for item in data: for key in list(item.keys()): if key not in needed_keys: @@ -134,32 +125,38 @@ def run( if __name__ == "__main__": - args = argparse.ArgumentParser() - args.add_argument("--reward_model_name", type=str, help="The name of the reward model.") - args.add_argument("--reward_model_path", type=str, help="The path of the reward model.") - args.add_argument("--instruct_model_name", type=str, help="The name of the instruct model.") - args.add_argument("--instruct_model_path", type=str, help="The path of the instruct model.") - args.add_argument("--dataset_path", type=str, help="The path of the dataset.") - args.add_argument("--work_dir", type=str, help="The working directory.") - args.add_argument("--pipeline_name", type=str, help="The pipeline you choose to run.") - args.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.") - args.add_argument("--sample_k", type=int, default=10, help="The sample number k.") - args.add_argument("--rank", type=int, default=3, help="The rank number.") - args.add_argument("--distance", type=int, default=25, help="The distance.") - args.add_argument("--debug", type=bool, default=False, help="Preserve fine-grained process data") - - args = args.parse_args() + parser = argparse.ArgumentParser(description="Run Data Pipeline") + parser.add_argument("--reward_model_name", type=str, required=True, help="The name of the reward model.") + parser.add_argument("--reward_model_path", type=str, required=True, help="The path of the reward model.") + parser.add_argument("--instruct_model_name", type=str, required=True, help="The name of the instruct model.") + parser.add_argument("--instruct_model_path", type=str, required=True, help="The path of the instruct model.") + parser.add_argument("--dataset_path", type=str, required=True, help="The path of the dataset.") + parser.add_argument("--work_dir", type=str, required=True, help="The working directory.") + parser.add_argument("--pipeline_name", type=str, required=True, help="The pipeline you choose to run.") + parser.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.") + parser.add_argument("--sample_k", type=int, default=10, help="The sample number k.") + parser.add_argument("--rank", type=int, default=3, help="The rank number. 
(specific to DPORewardPipeline)") + parser.add_argument("--distance", type=int, default=25, help="The distance. (specific to DPORewardPipeline)") + parser.add_argument('--reward_model_python_path', type=str, help="Python path to reward model. Not required for all pipelines.") + parser.add_argument("--debug", action='store_true', help="Preserve fine-grained process data") + parser.add_argument("--strict_follow_rank", action='store_true', + help="Strictly follow rank (specific to DPORewardPipeline)") + + args = parser.parse_args() + run( - args.reward_model_name, - args.reward_model_path, - args.instruct_model_name, - args.instruct_model_path, - args.dataset_path, - args.work_dir, - args.pipeline_name, - args.continue_from_stage, - args.sample_k, - args.rank, - args.distance, - args.debug + reward_model_name=args.reward_model_name, + reward_model_path=args.reward_model_path, + instruct_model_name=args.instruct_model_name, + instruct_model_path=args.instruct_model_path, + dataset_path=args.dataset_path, + work_dir=args.work_dir, + pipeline_name=args.pipeline_name, + continue_from_stage=args.continue_from_stage, + sample_k=args.sample_k, + rank=args.rank, + distance=args.distance, + debug=args.debug, + strict_follow_rank=args.strict_follow_rank, + reward_model_python_path=args.reward_model_python_path, ) diff --git a/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py b/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py index dbf36c7..2bf4627 100644 --- a/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py +++ b/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py @@ -1,6 +1,5 @@ import os import subprocess - import torch from data_engine.pipeline.pipeline import Pipeline @@ -13,80 +12,165 @@ def run_bash_script(script_path, *args): subprocess.run(command, check=True) -def get_jsonl_file(path) -> list: +def get_jsonl_file(path: str) -> list: jsonl_files = [f for f in os.listdir(path) if f.endswith('.jsonl')] return jsonl_files class DivideAndConquerPipeline(Pipeline): @classmethod - def judge_able_to_process(cls, pipeline_name) -> bool: + def judge_able_to_process(cls, pipeline_name: str) -> bool: return "divide_and_conquer" in pipeline_name.lower() @classmethod - def sample_rollout(cls, - instruct_model_name: str, - instruct_model_path: str, - dataset_path: str, - sampled_answer_path: str, - sample_k: int, - work_dir: str, - debug: bool) -> None: + def sample_rollout(cls, **kwargs) -> None: + required_params = [ + "instruct_model_path", + "sampled_answer_path", + "dataset_path", + "work_dir" + ] + for param in required_params: + if param not in kwargs: + raise ValueError(f"Missing parameter '{param}' for sample_rollout in DivideAndConquerPipeline.") + if torch.distributed.get_rank() == 0: script_path = './script/data_gen/llava15/llava15_diverse_gen.sh' - run_bash_script(script_path, instruct_model_path, sampled_answer_path, dataset_path, - get_jsonl_file(dataset_path)[0], str(0), str(-1), - str(torch.cuda.device_count())) + run_bash_script( + script_path, + kwargs["instruct_model_path"], + kwargs["sampled_answer_path"], + kwargs["dataset_path"], + get_jsonl_file(kwargs["dataset_path"])[0], + str(0), + str(-1), + str(torch.cuda.device_count()) + ) @classmethod - def reward_calculate(cls, - reward_model_name: str, - reward_model_path: str, - instruct_model_name: str, - instruct_model_path: str, - sampled_answer_path: str, - reward_path: str, - work_dir: str, - debug: bool) -> None: + def reward_calculate(cls, **kwargs) 
-> None: + required_params = [ + "reward_model_name", + "reward_model_path", + "sampled_answer_path", + "work_dir", + "python_path" + ] + for param in required_params: + if param not in kwargs: + raise ValueError(f"Missing parameter '{param}' for reward_calculate in DivideAndConquerPipeline.") + if torch.distributed.get_rank() == 0: + # omnilmm, changeq, split + reward_model_path = kwargs["reward_model_path"].split(',') + + changeq = reward_model_path[1].strip() + split = reward_model_path[2].strip() script_path = './script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh' - answer_file = os.path.join(sampled_answer_path, os.path.basename(get_jsonl_file(sampled_answer_path)[0])[0]) - run_bash_script(script_path, answer_file, '0', '-1', str(torch.cuda.device_count()), - str(torch.cuda.device_count())) - script_path = './script/data_gen/omnilmm/omnilmm_autocheck.sh' - check_ques_file = "" - for file in get_jsonl_file(sampled_answer_path): - if "llama3-8b_divide.gq.qas.jsonl" in file: - check_ques_file = file - break - run_bash_script(script_path, reward_model_path, reward_path, sampled_answer_path, check_ques_file, '0', '-1', - str(torch.cuda.device_count())) + file_dict = {} + min_len = 9999 + for file in get_jsonl_file(kwargs["sampled_answer_path"]): + if 'diverse_gen' not in file: + continue + file_dict[len(file)] = file + min_len = min(min_len, len(file)) + file_name = os.path.basename(file_dict[min_len]) + answer_file = os.path.join(kwargs["sampled_answer_path"], file_name[:file_name.rfind('.')]) + run_bash_script( + script_path, + answer_file, + '0', + '-1', + str(torch.cuda.device_count()), + str(torch.cuda.device_count()), + changeq, + split + ) + + auto_check_model = reward_model_path[0].strip() + file_dict = {} + min_len = 9999 + for file in get_jsonl_file(kwargs["sampled_answer_path"]): + if "llama3-8b_divide.gq.qas.jsonl" in file and 'diverse_gen' in file: + file_dict[len(file)] = file + min_len = min(min_len, len(file)) + check_ques_file = file_dict[min_len] + if 'omni' in auto_check_model.lower() or 'omni' in kwargs["reward_model_name"].lower(): + print("OmniLMM as auto check model") + script_path = './script/data_gen/omnilmm/omnilmm_autocheck.sh' + run_bash_script( + script_path, + auto_check_model, + kwargs["reward_path"], + kwargs["sampled_answer_path"], + check_ques_file, + '0', + '-1', + str(torch.cuda.device_count()) + ) + else: + print("MiniCPM-llama3-v as auto check model") + script_path = './script/data_gen/minicpm_llama3_v/minicpm_llama3_v_autocheck.sh' + run_bash_script( + script_path, + auto_check_model, + kwargs["reward_path"], + kwargs["sampled_answer_path"], + check_ques_file, + '0', + '-1', + kwargs['python_path'], + str(torch.cuda.device_count()) + ) @classmethod - def pair_build_with_filter(cls, - sampled_answer_path: str, - reward_path: str, - work_dir: str, - sample_k: int, - rank: int, - distance: int, - debug: bool) -> str: + def pair_build_with_filter(cls, **kwargs) -> str: + required_params = [ + "sampled_answer_path", + "reward_path", + "work_dir", + "distance" + ] + for param in required_params: + if param not in kwargs: + raise ValueError(f"Missing parameter '{param}' for pair_build_with_filter in DivideAndConquerPipeline.") + if torch.distributed.get_rank() == 0: - gq_file = "" - for file in get_jsonl_file(sampled_answer_path): - if "llama3-8b_divide.gq.jsonl" in file: - gq_file = file - break - feedback_file = get_jsonl_file(reward_path)[0] + file_dict = {} + min_len = 999 + for file in get_jsonl_file(kwargs["sampled_answer_path"]): + if 
'llama3-8b_divide.gq.jsonl' not in file: + continue + file_dict[len(file)] = file + min_len = min(min_len, len(file)) + gq_file = file_dict[min_len] + file_dict = {} + min_len = 999 + for file in get_jsonl_file(kwargs["reward_path"]): + file_dict[len(file)] = file + min_len = min(min_len, len(file)) + feedback_file = file_dict[min_len] script_path = './script/data_gen/construct_pairs.sh' - run_bash_script(script_path, os.path.join(reward_path, feedback_file), os.path.join(sampled_answer_path, gq_file), str(2)) + run_bash_script( + script_path, + os.path.join(kwargs["reward_path"], feedback_file), + os.path.join(kwargs["sampled_answer_path"], gq_file), + str(2) + ) script_path = './utils/get_pairs_filter_shorten.py' - result_dir = os.path.join(work_dir, "dataset") + file_dict = {} + min_len = 999 + for file in get_jsonl_file(kwargs["reward_path"]): + if 'llama3-8b_divide.gq.qas_pair_diff1_samp2.jsonl' not in file: + continue + file_dict[len(file)] = file + min_len = min(min_len, len(file)) + result_dir = os.path.join(kwargs["work_dir"], "dataset") dir_prepare(result_dir) subprocess.run([ 'python', script_path, - '--path', os.path.join(reward_path, feedback_file), + '--path', os.path.join(kwargs["reward_path"], file_dict[min_len]), '--save_path', os.path.join(result_dir, "result.jsonl") ], check=True) return os.path.join(result_dir, "result.jsonl") diff --git a/data_engine/pipeline/dpo_reward_pipeline/data_pair_builder.py b/data_engine/pipeline/dpo_reward_pipeline/data_pair_builder.py index f9b3ef4..6f7746e 100644 --- a/data_engine/pipeline/dpo_reward_pipeline/data_pair_builder.py +++ b/data_engine/pipeline/dpo_reward_pipeline/data_pair_builder.py @@ -1,8 +1,12 @@ import os.path +import random from nltk import word_tokenize from tqdm import tqdm +from data_engine.dpo_data_filter import filter +from data_engine.dpo_data_filter.similar_filter import SimilarFilter + data_pairs = [] @@ -61,7 +65,7 @@ def get_ranking_reward_data(sample_k, rewards): return sum_output, avg_output -def pair_union(sum_reward, avg_reward, sample_k=10, rank=3, distance=25): +def pair_union(sum_reward, avg_reward, sample_k=10, rank=3, strict_follow_rank=True, distance=25): print(f"sampling number k: {sample_k} \nrank number: {rank} \ndistance: {distance}") total_pairs = 0 total_used_pic = 0 @@ -78,11 +82,13 @@ def pair_union(sum_reward, avg_reward, sample_k=10, rank=3, distance=25): idx = sum_reward_whole_data[i]['idx'] sum_reward_data = sum_reward_whole_data[i:i + sample_k] avg_reward_data = avg_reward_whole_data[i:i + sample_k] + sum_reward_data = filter.filter_with_filter_list([SimilarFilter], sum_reward_data, log=False) + avg_reward_data = filter.filter_with_filter_list([SimilarFilter], avg_reward_data, log=False) # top10 -> top rank - sum_top_rank = sum_reward_data[:rank] - sum_last_rank = sum_reward_data[-rank:] - avg_top_rank = avg_reward_data[:rank] - avg_last_rank = avg_reward_data[-rank:] + sum_top_rank = sum_reward_data[:min(rank, len(sum_reward_data))] + sum_last_rank = sum_reward_data[-min(rank, len(sum_reward_data)):] + avg_top_rank = avg_reward_data[:min(rank, len(avg_reward_data))] + avg_last_rank = avg_reward_data[-min(rank, len(avg_reward_data)):] avg_top_rank_text = [data['text'] for data in avg_top_rank] avg_last_rank_text = [data['text'] for data in avg_last_rank] @@ -108,29 +114,45 @@ def pair_union(sum_reward, avg_reward, sample_k=10, rank=3, distance=25): sign = 0 # construct dpo pair if abs(dif(word_count)) < distance - for chosen_data in chosen_answer: - for rejected_data in rejected_answer: - 
if abs(chosen_data[1] - rejected_data[1]) < distance: - sign = 1 - dpo_pair.append({ - "idx": idx, - "question": question, - "chosen": chosen_data[0], - "rejected": rejected_data[0], - "image": sum_reward_whole_data[i]['image'] - }) - total_pairs += 1 - if chosen_data[1] >= rejected_data[1]: - flag += 1 + if strict_follow_rank: + for chosen_data, rejected_data in zip(chosen_answer, rejected_answer): + sign = 1 + dpo_pair.append({ + "idx": idx, + "question": question, + "chosen": chosen_data[0], + "rejected": rejected_data[0], + "image": sum_reward_whole_data[i]['image'] + }) + total_pairs += 1 + if chosen_data[1] >= rejected_data[1]: + flag += 1 + else: + random.shuffle(chosen_answer) + random.shuffle(rejected_answer) + for chosen_data in chosen_answer: + for rejected_data in rejected_answer: + if abs(chosen_data[1] - rejected_data[1]) < distance: + sign = 1 + dpo_pair.append({ + "idx": idx, + "question": question, + "chosen": chosen_data[0], + "rejected": rejected_data[0], + "image": sum_reward_whole_data[i]['image'] + }) + total_pairs += 1 + if chosen_data[1] >= rejected_data[1]: + flag += 1 if sign == 1: total_used_pic += 1 print(f"total_used_pic: {total_used_pic}") return dpo_pair -def main(rewards, sample_k=10, rank=3, distance=25): +def main(rewards, sample_k=10, rank=3, strict_follow_rank=True, distance=25): sum_output, avg_output = get_ranking_reward_data(sample_k, rewards) - dpo_pair = pair_union(sum_output, avg_output, sample_k, rank, distance) + dpo_pair = pair_union(sum_output, avg_output, sample_k, rank, strict_follow_rank, distance) return dpo_pair, sum_output, avg_output diff --git a/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py b/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py index dd8a91a..a399017 100644 --- a/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py +++ b/data_engine/pipeline/dpo_reward_pipeline/dpo_reward_pipeline.py @@ -1,81 +1,125 @@ +# dpo_reward_pipeline.py import os +from typing import Union import pandas as pd import torch from data_engine.util import dir_prepare, store_data_with_no_image -from data_engine.pipeline.dpo_reward_pipeline import answer_sampler, logps_calculator, reward_computer, \ +from data_engine.pipeline.dpo_reward_pipeline import ( + answer_sampler, + logps_calculator, + reward_computer, data_pair_builder +) from data_engine.dpo_data_filter import filter from data_engine.pipeline.pipeline import Pipeline class DPORewardPipeline(Pipeline): @classmethod - def judge_able_to_process(cls, pipeline_name): + def judge_able_to_process(cls, pipeline_name: str) -> bool: return "dpo_reward" in pipeline_name.lower() @classmethod - def sample_rollout(cls, - instruct_model_name: str, - instruct_model_path: str, - dataset_path: str, - sampled_answer_path: str, - sample_k: int, - work_dir: str, - debug: bool): - answer_sampler.sample_answer(instruct_model_name, instruct_model_path, dataset_path, sampled_answer_path, sample_k) + def sample_rollout(cls, **kwargs) -> None: + required_params = [ + "instruct_model_name", + "instruct_model_path", + "dataset_path", + "sampled_answer_path", + "sample_k", + "work_dir", + "debug" + ] + for param in required_params: + if param not in kwargs: + raise ValueError(f"Missing parameter '{param}' for sample_rollout in DPORewardPipeline.") + + answer_sampler.sample_answer( + kwargs["instruct_model_name"], + kwargs["instruct_model_path"], + kwargs["dataset_path"], + kwargs["sampled_answer_path"], + kwargs["sample_k"] + ) @classmethod - def reward_calculate(cls, - 
reward_model_name: str, - reward_model_path: str, - instruct_model_name: str, - instruct_model_path: str, - sampled_answer_path: str, - reward_path: str, - work_dir: str, - debug: bool): - reward_logps_output_dir = os.path.join(work_dir, "reward_logps") - instruct_logps_output_dir = os.path.join(work_dir, "instruct_logps") + def reward_calculate(cls, **kwargs) -> None: + required_params = [ + "reward_model_name", + "reward_model_path", + "instruct_model_name", + "instruct_model_path", + "sampled_answer_path", + "reward_path", + "work_dir", + "debug" + ] + for param in required_params: + if param not in kwargs: + raise ValueError(f"Missing parameter '{param}' for reward_calculate in DPORewardPipeline.") + + reward_logps_output_dir = os.path.join(kwargs["work_dir"], "reward_logps") + instruct_logps_output_dir = os.path.join(kwargs["work_dir"], "instruct_logps") dir_prepare(reward_logps_output_dir) dir_prepare(instruct_logps_output_dir) logps_calculator.main( - reward_model_name, - reward_model_path, - instruct_model_name, - instruct_model_path, - sampled_answer_path, + kwargs["reward_model_name"], + kwargs["reward_model_path"], + kwargs["instruct_model_name"], + kwargs["instruct_model_path"], + kwargs["sampled_answer_path"], reward_logps_output_dir, - instruct_logps_output_dir) + instruct_logps_output_dir + ) if torch.distributed.get_rank() == 0: - rewards = reward_computer.main(instruct_model_path, reward_logps_output_dir, instruct_logps_output_dir) + rewards = reward_computer.main( + kwargs["instruct_model_path"], + reward_logps_output_dir, + instruct_logps_output_dir + ) step = 5000 for idx, start in enumerate(range(0, len(rewards), step)): temp_data = rewards[start: min(start + step, len(rewards))] df = pd.DataFrame(temp_data) - df.to_parquet(os.path.join(reward_path, f'RLAIF-V-Dataset-reward_{idx:03}-{len(temp_data)}.parquet')) + df.to_parquet(os.path.join( + kwargs["reward_path"], + f'RLAIF-V-Dataset-reward_{idx:03}-{len(temp_data)}.parquet' + )) @classmethod - def pair_build_with_filter(cls, - sampled_answer_path: str, - reward_path: str, - work_dir: str, - sample_k: int, - rank: int, - distance: int, - debug: bool): + def pair_build_with_filter(cls, **kwargs) -> Union[list, str]: + required_params = [ + "sampled_answer_path", + "reward_path", + "work_dir", + "sample_k", + "rank", + "distance", + "debug", + "strict_follow_rank" + ] + for param in required_params: + if param not in kwargs: + raise ValueError(f"Missing parameter '{param}' for pair_build_with_filter in DPORewardPipeline.") + rewards = [] - reward_files = [f for f in os.listdir(reward_path) if f.endswith('.parquet')] + reward_files = [f for f in os.listdir(kwargs["reward_path"]) if f.endswith('.parquet')] for reward_file in reward_files: - reward_file_path = os.path.join(reward_path, reward_file) + reward_file_path = os.path.join(kwargs["reward_path"], reward_file) reward_df = pd.read_parquet(reward_file_path) rewards.append(reward_df) rewards = pd.concat(rewards, ignore_index=True).to_dict(orient='records') - dpo_pair, sum_output, avg_output = data_pair_builder.main(rewards, sample_k, rank, distance) - if debug: - debug_dir = os.path.join(work_dir, 'debug') + dpo_pair, sum_output, avg_output = data_pair_builder.main( + rewards, + kwargs["sample_k"], + kwargs["rank"], + kwargs["distance"] + ) + if kwargs["debug"]: + debug_dir = os.path.join(kwargs["work_dir"], 'debug') dir_prepare(debug_dir) store_data_with_no_image(rewards, os.path.join(debug_dir, 'dpo_pair.json')) store_data_with_no_image(sum_output, 
os.path.join(debug_dir, 'sum_output.json')) diff --git a/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py b/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py index 6898467..4496c64 100644 --- a/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py +++ b/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py @@ -109,13 +109,10 @@ def inference_logp( logps = list(zip(win_logp_list, win_avg_logp_list, win_per_token_logp_list, rej_logp_list, rej_avg_logp_list, rej_per_token_logp_list)) - df = write_logp_to_preference_parquet(dataset.data, output_dir, logps, overwrite_logps=True) + _ = write_logp_to_preference_parquet(dataset.data, output_dir, logps, overwrite_logps=True) torch.distributed.barrier() - del model - return df - def main( reward_model_name: str, @@ -125,8 +122,8 @@ def main( dataset_path: str, reward_output_dir: str, instruct_output_dir: str): - _ = inference_logp(instruct_model_name, instruct_model_path, dataset_path, instruct_output_dir) - _ = inference_logp(reward_model_name, reward_model_path, dataset_path, reward_output_dir) + inference_logp(instruct_model_name, instruct_model_path, dataset_path, instruct_output_dir) + inference_logp(reward_model_name, reward_model_path, dataset_path, reward_output_dir) return { "reward_output_dir": reward_output_dir, diff --git a/data_engine/pipeline/pipeline.py b/data_engine/pipeline/pipeline.py index 2f66612..f39b12b 100644 --- a/data_engine/pipeline/pipeline.py +++ b/data_engine/pipeline/pipeline.py @@ -3,39 +3,17 @@ class Pipeline: @classmethod - def judge_able_to_process(cls, pipeline_name) -> bool: - raise NotImplementedError + def judge_able_to_process(cls, pipeline_name: str) -> bool: + raise NotImplementedError("Subclasses must implement this method.") @classmethod - def sample_rollout(cls, - instruct_model_name: str, - instruct_model_path: str, - dataset_path: str, - sampled_answer_path: str, - sample_k: int, - work_dir: str, - debug: bool) -> None: - raise NotImplementedError + def sample_rollout(cls, **kwargs) -> None: + raise NotImplementedError("Subclasses must implement this method.") @classmethod - def reward_calculate(cls, - reward_model_name: str, - reward_model_path: str, - instruct_model_name: str, - instruct_model_path: str, - sampled_answer_path: str, - reward_path: str, - work_dir: str, - debug: bool) -> None: - raise NotImplementedError + def reward_calculate(cls, **kwargs) -> None: + raise NotImplementedError("Subclasses must implement this method.") @classmethod - def pair_build_with_filter(cls, - sampled_answer_path: str, - reward_path: str, - work_dir: str, - sample_k: int, - rank: int, - distance: int, - debug: bool) -> Union[list, str]: - raise NotImplementedError + def pair_build_with_filter(cls, **kwargs) -> Union[list, str]: + raise NotImplementedError("Subclasses must implement this method.") diff --git a/minicpm-llama3-v-25/minicpmv_autocheck.py b/minicpm-llama3-v-25/minicpmv_autocheck.py index 1c67b71..3facf29 100644 --- a/minicpm-llama3-v-25/minicpmv_autocheck.py +++ b/minicpm-llama3-v-25/minicpmv_autocheck.py @@ -9,7 +9,7 @@ import tqdm import numpy as np from typing import List, Optional -from transformers import AutoTokenizer, AutoModel +from transformers import AutoTokenizer, AutoModel, AutoImageProcessor, AutoProcessor import torch.utils.data as torch_data from minicpmv_diverse_gen import MiniCPMVQADataset @@ -19,28 +19,28 @@ class MiniCPM_Llama3_V_RM: def __init__(self, model_path) -> None: self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True, 
torch_dtype=torch.float16) self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True) + self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) self.model.eval().cuda() self.config = self.model.config def raw_generate( self, - input_id_list=None, - img_list=None, - tgt_sizes=None, + prompt=None, + images=None, tokenizer=None, max_inp_length: Optional[int] = None, vision_hidden_states=None, return_vision_hidden_states=False, **kwargs ): - assert input_id_list is not None - bs = len(input_id_list) - if img_list == None: + model_inputs = self.processor(prompt, images, max_length=max_inp_length) + bs = len(model_inputs["input_ids"]) + img_list = model_inputs["pixel_values"] + tgt_sizes = model_inputs["tgt_sizes"] + if img_list is None: img_list = [[] for i in range(bs)] assert bs == len(img_list) - - model_inputs = self.model._process_list(tokenizer, input_id_list, max_inp_length) - if vision_hidden_states is None: pixel_values = [] for i in range(bs): @@ -60,7 +60,7 @@ def raw_generate( ( model_inputs["inputs_embeds"], vision_hidden_states, - ) = self.model.get_vllm_embedding(model_inputs) + ) = self.model.get_vllm_embedding(model_inputs.to(self.model.device)) result = self._raw_decode(model_inputs["inputs_embeds"], tokenizer, **kwargs) @@ -100,6 +100,7 @@ def chat_with_scores( if image is not None and isinstance(copy_msgs[0]['content'], str): copy_msgs[0]['content'] = [image, copy_msgs[0]['content']] + images = [] for i, msg in enumerate(copy_msgs): role = msg["role"] content = msg["content"] @@ -108,39 +109,16 @@ def chat_with_scores( assert role == "user", "The role of first msg should be user" if isinstance(content, str): content = [content] - - images = [] - tgt_sizes = [] cur_msgs = [] for c in content: if isinstance(c, Image.Image): - image = c - if self.config.slice_mode: - slice_images, image_placeholder = self.model.get_slice_image_placeholder( - image, self.tokenizer - ) - cur_msgs.append(image_placeholder) - for slice_image in slice_images: - slice_image = self.model.transform(slice_image) - H, W = slice_image.shape[1:] - images.append(self.model.reshape_by_patch(slice_image)) - tgt_sizes.append(torch.Tensor([H // self.config.patch_size, W // self.config.patch_size]).type(torch.int32)) - else: - images.append(self.model.transform(image)) - cur_msgs.append( - self.tokenizer.im_start - + self.tokenizer.unk_token * self.config.query_num - + self.tokenizer.im_end - ) + images.append(c) + cur_msgs.append("(./)") elif isinstance(c, str): cur_msgs.append(c) + msg["content"] = "\n".join(cur_msgs) - if tgt_sizes: - tgt_sizes = torch.vstack(tgt_sizes) - - msg['content'] = '\n'.join(cur_msgs) - - input_ids = self.tokenizer.apply_chat_template(copy_msgs, tokenize=True, add_generation_prompt=False) + prompt = self.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=False) generation_config = { "num_beams": 1, @@ -155,10 +133,9 @@ def chat_with_scores( with torch.inference_mode(): res = self.raw_generate( - input_id_list=[input_ids], + prompt=prompt, max_inp_length=max_inp_length, - img_list=[images], - tgt_sizes=[tgt_sizes], + images=images, tokenizer=self.tokenizer, max_new_tokens=max_new_tokens, vision_hidden_states=vision_hidden_states, @@ -171,8 +148,8 @@ def chat_with_scores( no_id = self.tokenizer.encode(f'{self.tokenizer.bos_token}no')[-1] No_id = self.tokenizer.encode(f'{self.tokenizer.bos_token}No')[-1] - 
print("output_ids:", res.sequences[0]) - print("response:", self.tokenizer.decode(res.sequences[0])) + # print("output_ids:", res.sequences[0]) + # print("response:", self.tokenizer.decode(res.sequences[0])) response = self.model._decode_text(res.sequences, self.tokenizer) # response = self.tokenizer.decode( @@ -181,9 +158,9 @@ def chat_with_scores( output_scores = res.scores[0][0] scores = torch.softmax(output_scores, dim=0) - print(scores.shape) - max_value, max_index = torch.max(scores, dim=0) - print(f'scores: {max_index}') + # print(scores.shape) + # max_value, max_index = torch.max(scores, dim=0) + # print(f'scores: {max_index}') item_scores = { 'yes': scores[yes_id].cpu().item(), @@ -219,7 +196,8 @@ def eval_autocheck(args): 'answer': response, 'scores': score, 'metainfos': batch['metainfo'], - 'model_path': args.model_name + 'model_path': args.model_name, + 'image': batch['raw_image'] }) + "\n") else: ans_file.write(json.dumps({ @@ -227,7 +205,8 @@ def eval_autocheck(args): 'raw_question': batch['raw_question'], 'answer': response, 'metainfos': batch['metainfo'], - 'model_path': args.model_name + 'model_path': args.model_name, + 'image': batch['raw_image'] }) + "\n") ans_file.flush() diff --git a/minicpm-llama3-v-25/minicpmv_diverse_gen.py b/minicpm-llama3-v-25/minicpmv_diverse_gen.py index a1090ef..5d2c23e 100644 --- a/minicpm-llama3-v-25/minicpmv_diverse_gen.py +++ b/minicpm-llama3-v-25/minicpmv_diverse_gen.py @@ -82,8 +82,10 @@ def __getitem__(self, index): else: imgid = '' + raw_image = None if "image" in item.keys(): img_b64 = item['image'] + raw_image = img_b64 if len(img_b64) > 100: image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert('RGB') @@ -110,6 +112,7 @@ def __getitem__(self, index): 'metainfo': metainfo, 'question_id': item['question_id'] if 'question_id' in item else index, 'origin_dataset': self.qa_file, + 'raw_image': raw_image, } def __len__(self): @@ -161,7 +164,8 @@ def eval_model(args): 'raw_question': batch['raw_question'], 'answer': response, 'metainfos': batch['metainfo'], - 'model_path': args.model_name + 'model_path': args.model_name, + 'image': batch['raw_image'] }) + "\n") else: ans_file.write(json.dumps({ @@ -169,7 +173,8 @@ def eval_model(args): 'raw_question': batch['raw_question'], 'answer': response, 'metainfos': batch['metainfo'], - 'model_path': args.model_name + 'model_path': args.model_name, + 'image': batch['raw_image'] }) + "\n") ans_file.flush() diff --git a/muffin/llava15_gen_data.py b/muffin/llava15_gen_data.py index 3a05f43..b7aafb7 100644 --- a/muffin/llava15_gen_data.py +++ b/muffin/llava15_gen_data.py @@ -74,8 +74,10 @@ def __getitem__(self, index): imgid = item["image_id"] # print(item.keys()) + origin_image = None if "image" in item.keys(): img_b64 = item['image'] + origin_image = img_b64 if len(img_b64) > 100: image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert('RGB') @@ -101,7 +103,8 @@ def __getitem__(self, index): 'question_input_ids': question_input_ids, 'raw_question': raw_question, 'metainfos': metainfo, - 'origin_dataset': self.qa_file + 'origin_dataset': self.qa_file, + 'origin_image': origin_image } def __len__(self): @@ -160,6 +163,8 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): data['metainfo'] = [x['metainfo'] for x in data_list] if 'metainfos' in data_list[0]: data['metainfos'] = [x['metainfos'] for x in data_list] + if 'origin_image' in data_list[0]: + data['origin_image'] = [x['origin_image'] for x in data_list] return data @@ -263,11 +268,12 @@ def 
llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): batch['input_ids'].shape[0], len(output.scores), args.num_beam, output.scores[0].shape[-1]) new_output_scores = output_scores_all.view(output_scores_reshape) - for question, output_ids, output_scores, question_id, metainfos in zip(batch['raw_questions'], - output.sequences, - new_output_scores, - batch['question_id'], - batch['metainfos']): + for question, output_ids, output_scores, question_id, metainfos, origin_image in zip( + batch['raw_questions'], + output.sequences, + new_output_scores, + batch['question_id'], + batch['metainfos'], batch['origin_image']): response = tokenizer.decode( output_ids, skip_special_tokens=True) @@ -291,7 +297,8 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): 'answer': response, 'scores': item_scores, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) else: outputs.append({ @@ -300,7 +307,8 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): 'answer': response, 'scores': item_scores, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) else: @@ -328,8 +336,11 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): return_dict_in_generate=True) # print(output.scores, flush=True) - for question, output_ids, question_id, metainfos in zip(batch['raw_questions'], output.sequences, - batch['question_id'], batch['metainfos']): + for question, output_ids, question_id, metainfos, origin_image in zip(batch['raw_questions'], + output.sequences, + batch['question_id'], + batch['metainfos'], + batch['origin_image']): response = tokenizer.decode( output_ids, skip_special_tokens=True) response = response.strip() @@ -341,7 +352,8 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): 'raw_question': question, 'answer': response, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) else: outputs.append({ @@ -349,7 +361,8 @@ def llava15_qa_colloator_fn(data_list, tokenizer, image_processor, config): 'raw_question': question, 'answer': response, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) cnt += 1 diff --git a/omnilmm/omnilmm_gen_data.py b/omnilmm/omnilmm_gen_data.py index 0c59b57..d9a394a 100644 --- a/omnilmm/omnilmm_gen_data.py +++ b/omnilmm/omnilmm_gen_data.py @@ -67,9 +67,10 @@ def __getitem__(self, index): if "image_id" in item.keys(): imgid = item["image_id"] - print(item.keys()) + origin_image = None if "image" in item.keys(): img_b64 = item['image'] + origin_image = img_b64 if len(img_b64) > 100: image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert('RGB') @@ -94,7 +95,8 @@ def __getitem__(self, index): 'question_input_ids': question_input_ids, 'raw_question': raw_question, 'metainfos': metainfo, - 'origin_dataset': self.qa_file + 'origin_dataset': self.qa_file, + 'origin_image': origin_image } def __len__(self): @@ -132,6 +134,8 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): data['metainfo'] = [x['metainfo'] for x in data_list] if 'metainfos' in data_list[0]: data['metainfos'] = [x['metainfos'] for x in data_list] + if 'origin_image' in data_list[0]: + data['origin_image'] = [x['origin_image'] for x in data_list] return data @@ -159,12 +163,13 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, 
img_transform): ) args = parser.parse_args() - torch.distributed.init_process_group( - backend='nccl', - world_size=int(os.getenv('WORLD_SIZE', '1')), - rank=int(os.getenv('RANK', '0')), - ) - torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + backend='nccl', + world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), + ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) print(f'Init Rank-{torch.distributed.get_rank()}') model, image_processor, image_token_len, tokenizer = init_omni_lmm( @@ -219,7 +224,9 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): output_scores_reshape = (batch['input_ids'].shape[0], len(output.scores), args.num_beam, output.scores[0].shape[-1]) new_output_scores = output_scores_all.view(output_scores_reshape) - for question, output_ids, output_scores, question_id, metainfos in zip(batch['raw_questions'], output.sequences, new_output_scores, batch['question_id'], batch['metainfos']): + for question, output_ids, output_scores, question_id, metainfos, origin_image in zip( + batch['raw_questions'], output.sequences, new_output_scores, batch['question_id'], + batch['metainfos'], batch['origin_image']): # print(args.max_tokens, output_ids[input_size:].shape, output_scores.shape, output_scores.squeeze().shape) response = tokenizer.decode( @@ -227,9 +234,9 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): response = response.strip() scores = torch.softmax(output_scores.squeeze(), dim=0) - print(scores.shape) + # print(scores.shape) max_value, max_index = torch.max(scores, dim=0) - print(f'scores: {max_index}') + # print(f'scores: {max_index}') item_scores = { 'yes': scores[yes_id].cpu().item(), @@ -247,17 +254,19 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): 'answer': response, 'scores': item_scores, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) else: outputs.append({ - 'question_id': question_id, - 'raw_question': question, - 'answer': response, - 'scores': item_scores, - 'metainfos': metainfos, - 'model_path': args.checkpoint - }) + 'question_id': question_id, + 'raw_question': question, + 'answer': response, + 'scores': item_scores, + 'metainfos': metainfos, + 'model_path': args.checkpoint, + 'image': origin_image + }) else: if args.num_beam >= 1: @@ -280,7 +289,7 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): repetition_penalty=1.1) # print(output.scores, flush=True) - for question, output_ids, question_id, metainfos in zip(batch['raw_questions'], output.sequences, batch['question_id'], batch['metainfos']): + for question, output_ids, question_id, metainfos, origin_image in zip(batch['raw_questions'], output.sequences, batch['question_id'], batch['metainfos'], batch['origin_image']): response = tokenizer.decode( output_ids, skip_special_tokens=True) response = response.strip() @@ -294,7 +303,8 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): 'raw_question': question, 'answer': response, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) else: outputs.append({ @@ -302,7 +312,8 @@ def zephyr_qa_colloator_fn(data_list, tokenizer, img_transform): 'raw_question': question, 'answer': response, 'metainfos': metainfos, - 'model_path': args.checkpoint + 'model_path': args.checkpoint, + 'image': origin_image }) 
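+            # NOTE: each output record now also carries the original base64 image
+            # string under 'image', so the later pair-construction steps can keep
+            # the image together with the answer without re-reading files from disk.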
torch.distributed.barrier() diff --git a/pyproject.toml b/pyproject.toml index c442238..d30bc8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "Pillow==10.3.0", "Requests==2.32.2", "shortuuid==1.0.13", "spacy==3.7.2", "timm==0.9.10", "tokenizers==0.15.1", "tqdm==4.67.0", "transformers==4.37.0","wandb==0.15.11", - "spacy==3.7.2","opencv-python==4.9.0.80", "datasets==2.21.0", "cffi==1.17.1" + "spacy==3.7.2","opencv-python==4.9.0.80", "datasets==2.21.0", "cffi==1.17.1", "jieba" ] [project.optional-dependencies] diff --git a/script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh b/script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh index 509fbf4..5049b65 100644 --- a/script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh +++ b/script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh @@ -15,12 +15,17 @@ echo "chunk_num="$chunk_num echo "start="$start" end="$end echo "batch_size="$bs +model_path_changeq=$6 +model_path_split=$7 + for i in $(seq 0 $((chunk_num-1))); do echo $i CUDA_VISIBLE_DEVICES=$i python ./utils/llama3_8b_inference.py \ --path ${data_path}.jsonl \ --chunk-num $chunk_num \ + --model_path_changeq $model_path_changeq \ + --model_path_split $model_path_split \ --chunk-idx $i \ --bs $bs \ --start ${start} \ @@ -30,21 +35,26 @@ wait # Merge divided files output_file=${data_path}.s${start}-e${end}.llama3-8b_divide.jsonl -> "$output_file" +#> "$output_file" for IDX in $(seq 0 $((chunk_num-1))); do +# echo $output_file cat ${data_path}.s${start}-e${end}.chunk${chunk_num}-${IDX}.llama3-8b_divide.jsonl >> "$output_file" done # Merge generated questions files output_file=${data_path}.s${start}-e${end}.llama3-8b_divide.gq.jsonl -> "$output_file" +#> "$output_file" for IDX in $(seq 0 $((chunk_num-1))); do + # shellcheck disable=SC2086 +# echo $output_file cat ${data_path}.s${start}-e${end}.chunk${chunk_num}-${IDX}.llama3-8b_divide.gq.jsonl >> "$output_file" done # Merge generated questions with 'yes or no' suffix files output_file=${data_path}.s${start}-e${end}.llama3-8b_divide.gq.qas.jsonl -> "$output_file" +#> "$output_file" for IDX in $(seq 0 $((chunk_num-1))); do + # shellcheck disable=SC2086 +# echo $output_file cat ${data_path}.s${start}-e${end}.chunk${chunk_num}-${IDX}.llama3-8b_divide.gq.qas.jsonl >> "$output_file" done diff --git a/utils/get_pairs_filter_shorten.py b/utils/get_pairs_filter_shorten.py index 61de7e8..ff7c6f5 100644 --- a/utils/get_pairs_filter_shorten.py +++ b/utils/get_pairs_filter_shorten.py @@ -13,7 +13,7 @@ def filter_pair_by_len(all_pairs_dicts, diff_len): continue if pair['chosen'].strip() == pair['rejected'].strip(): - print(diff_len, "chosen==rejected") + # print(diff_len, "chosen==rejected") continue remain_pair.append(pair) diff --git a/utils/get_preference_pairs.py b/utils/get_preference_pairs.py index 10878da..d48c4f4 100644 --- a/utils/get_preference_pairs.py +++ b/utils/get_preference_pairs.py @@ -33,6 +33,7 @@ def filter_same_instruct(org_data, autocheck_data): return new_data, remain_autocheck + def save_pred_quesid_to_judge(pred_quesid_to_judge, origin_divide_data, save_path): new_data = [] for item in origin_divide_data: @@ -106,6 +107,8 @@ def get_pair_data(path_autocheck, path_ans_divide, save_path, diff=1): rej_len += len(rej_answer.split()) image_path = chosen['metainfos']['image_path'] + image = chosen.get('image', rejected.get('image', '')) + assert chosen['metainfos']['image_path'] == rejected['metainfos']['image_path'] if len(chosen_judge) != len([fact for 
fact in chosen['facts'] if fact != '']): @@ -129,6 +132,7 @@ def get_pair_data(path_autocheck, path_ans_divide, save_path, diff=1): new_item = { "image_id": image_path.split('/')[-1], "image_path": image_path, + "image": image, "ds_question_id": ds_question_id, "question": question, "chosen": ch_answer, @@ -151,6 +155,7 @@ def get_pair_data(path_autocheck, path_ans_divide, save_path, diff=1): return pair_data + def sample_pair_data(pair_data, sample_num, save_path): dsid_2_pairs = defaultdict(list) for item in pair_data: @@ -167,6 +172,7 @@ def sample_pair_data(pair_data, sample_num, save_path): print(f"sample {sample_num} pair data:", len(sampled_pairs)) write_jsonlines(save_path, sampled_pairs) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--autocheck_path', type=str) @@ -178,11 +184,10 @@ def sample_pair_data(pair_data, sample_num, save_path): path = args.autocheck_path path_gpt_divide = args.gpt_divide_gq_path - save_path = path.replace('.jsonl', '.pair_diff1.jsonl') pair_data = get_pair_data(path, path_gpt_divide, save_path, diff=1) sampled_num = args.sample_num sample_save_path = path.replace('.jsonl', f'_pair_diff1_samp{sampled_num}.jsonl') pair_data = read_jsonlines(save_path) - sample_pair_data(pair_data, sampled_num, sample_save_path) \ No newline at end of file + sample_pair_data(pair_data, sampled_num, sample_save_path) diff --git a/utils/llama3_8b_inference.py b/utils/llama3_8b_inference.py index ccf81d6..5e8fadc 100644 --- a/utils/llama3_8b_inference.py +++ b/utils/llama3_8b_inference.py @@ -81,8 +81,7 @@ def get_facts(result): # print(fact_list) return fact_list -def init_divide_pipline(): - model_id = "/data/apps/RLAIF-V/rlaif-v-main/models/llama3-split" +def init_divide_pipline(model_id): tokenizer = (model_id, {'padding_side': 'left'}) pipeline = transformers.pipeline( "text-generation", @@ -94,8 +93,7 @@ def init_divide_pipline(): return tokenizer, pipeline -def init_changeq_pipline(): - model_id = "/data/apps/RLAIF-V/rlaif-v-main/models/llama3-changeq" +def init_changeq_pipline(model_id): tokenizer = (model_id, {'padding_side': 'left'}) pipeline = transformers.pipeline( "text-generation", @@ -246,18 +244,23 @@ def data_collator(data, pipeline, wrap_func, batch_size=8): return batch_inputs def construct_question_yesno(path, save_path): - print("construct_question_yesno") + # print("construct_question_yesno") data = read_jsonlines(path) new_qas = [] for i,item in enumerate(data): + image = None + if 'image' in item: + image = item['image'] try: image_path = item['image_path'] except: try: image_path = item['metainfos']['image_path'] except: - raise ValueError("Do not have 'image_path' in the data!") + if image is None: + raise ValueError("Do not have 'image_path' in the data!") + image_path = "" if type(item['facts']) == type(''): continue @@ -280,6 +283,8 @@ def construct_question_yesno(path, save_path): 'question': question, 'metainfos': metainfos } + if image is not None: + new_item['image'] = image new_qas.append(new_item) @@ -289,6 +294,8 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('--path', type=str) parser.add_argument('--divide_suffix', type=str, default='llama3-8b_divide') + parser.add_argument('--model_path_changeq', type=str, default='/data/apps/RLAIF-V/rlaif-v-main/models/llama3-changeq') + parser.add_argument('--model_path_split', type=str, default='/data/apps/RLAIF-V/rlaif-v-main/models/llama3-split') parser.add_argument('--chunk-num', type=int, default=1) parser.add_argument('--chunk-idx', 
type=int, default=0) parser.add_argument('--bs', type=int, default=4) @@ -298,7 +305,7 @@ def main(): args = parser.parse_args() print(f"chunk_num={args.chunk_num}, chunk_idx={args.chunk_idx}") - tokenizer, pipeline = init_divide_pipline() + tokenizer, pipeline = init_divide_pipline(args.model_path_split) path = args.path @@ -310,7 +317,7 @@ def main(): del tokenizer del pipeline - tokenizer, pipeline = init_changeq_pipline() + tokenizer, pipeline = init_changeq_pipline(args.model_path_changeq) save_general_questions_path=save_divide_path.replace(".jsonl", ".gq.jsonl") print(f"\n==> Do change question... \n save path = {save_general_questions_path}") all_outputs = batch_inference(save_divide_path, save_general_questions_path, tokenizer, pipeline, From 8cdab04a8bad376732ff7126bf3f0dfe28bedc74 Mon Sep 17 00:00:00 2001 From: MagicYao Date: Sun, 8 Dec 2024 14:24:19 +0800 Subject: [PATCH 16/18] [upgrade] --- data_engine/README.md | 141 +++++++++++++++--- data_engine/README_zh.md | 141 +++++++++++++++--- data_engine/engine.py | 10 +- .../divide_and_conquer_pipeline.py | 61 +++----- data_engine/run_engine.sh | 56 +++++-- 5 files changed, 313 insertions(+), 96 deletions(-) diff --git a/data_engine/README.md b/data_engine/README.md index 704f460..e76bccb 100644 --- a/data_engine/README.md +++ b/data_engine/README.md @@ -1,34 +1,133 @@ # Data Engine ## Overview -This part of the code is used to build the DPO dataset, which you can use for direct training. -You only need to input the reward model, instruction model, and your dataset, and we will generate the DPO dataset for you. All you need to do is run the `run_engine.sh` script. \ -Instruct model: we use instruct model to generate raw answer to the given question in the dataset. \ -Reward model: the model we use to evaluate the answer generated by the instruct model. We get rewards to answers with the help of reward model and use these rewards to rank answers. After that, we can build DPO dataset. + +This module constructs a DPO dataset for direct training. Simply input your reward model, instruct model, and +dataset, then run the `run_engine.sh` script to generate the dataset. + +- **Instruct Model**: Used to generate raw answers to questions in the dataset. +- **Reward Model**: Evaluates the answers generated by the instruct model, providing rewards to rank them and build + the DPO dataset. ## Usage -Please refer to the `run_engine.sh` script. -You can specify reward model and instruction model you want to use for generating preference training dataset. The current supported list of reward models and instruction models are listed below: \ -llava-1.5-7b, RLAIF-V-7B, OmniLMM-12B, and RLAIF-V-12B. We are considering adding more models in the future. \ -If the model you wish to use is not listed, you may need to implement the corresponding code on yourself: \ -(for model loading, add code to `RLAIF-V/builder`; for answer sampling, refer to `RLAIF-V/llava/llava15_sample_data.py` to see how data is formatted (don't forget to pass `raw_images`) and add call it in `RLAIF-V/data_engine/answer_sampler.py`; for log probability calculation, change data formatting part in `RLAIF-V/data_engine/logps_calculator.py` and `get_multimodal_sample_logps` function in `RLAIF-V/muffin/eval/muffin_inference_logp.py`). +Refer to the `run_engine.sh` script for execution. Two pipelines are available for data construction: + +1. **Divide and Conquer Pipeline** (`divide_and_conquer`) +2. 
**DPO Reward Pipeline** (`dpo_reward`)
+
+Each has distinct requirements, explained below.
+
+---
+
+### Divide and Conquer Pipeline
+
+#### Process Method
+
+Detailed in the corresponding research paper.
+
+#### Required Models
+
+- **Split Model**: [Download here](https://thunlp.oss-cn-qingdao.aliyuncs.com/rlaifv_llama3_split_model.tar.gz)
+- **Question Transformation Model**: [Download here](https://thunlp.oss-cn-qingdao.aliyuncs.com/rlaifv_llama3_changeq_model.tar.gz)
+
+#### Custom Implementation
+
+We support `llava-1.5-7b` as the instruct model. For the reward model, three models are required:
+
+1. **Question Change Model** (`rlaifv_llama3_changeq_model`)
+2. **Question Split Model** (`rlaifv_llama3_split_model`)
+3. **Auto Check Model** (`OmniLMM-12B` or `MiniCPM-Llama3-V-2_5`)
+
+We do not recommend replacing the Question Change Model or the Question Split Model. If you wish to use a different instruct model or auto check model, a custom implementation is needed:
+
+1. **Generate Rollouts**
+   - Needed if you want a new instruct model.
+   - Add a model builder in `RLAIF-V/builder`.
+   - Add corresponding caller code in `RLAIF-V/builder/builder.py`.
+   - Refer to `RLAIF-V/muffin/llava15_gen_data.py` for sampling logic.
+
+2. **Reward Collection**
+   - Needed if you want a new auto check model.
+   - Implement a model builder, as for rollout generation.
+   - Update the `reward_calculate` function in `RLAIF-V/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py` to call your own model.
+
+#### Dataset Format
+
+The dataset should be in `.jsonl` format with the following fields:
+
+- `question`: The image-related question.
+- `question_id`: Optional.
+- `image`: Base64-encoded binary data (optional if `image_path` is present).
+- `image_path`: Path to the image (optional if `image` is present). If `image` is empty, `image_path` will be used.
+
+#### Script Parameters
+
+- `--reward_model_path`: Comma-separated paths to the auto check model, question change model, and split model (e.g.,
+  `/path/to/MiniCPM-Llama3-V-2_5,/path/to/changeq_model,/path/to/split_model`).
+- `--reward_model_name`: Name of the auto check model (e.g., `MiniCPM-Llama3-V-2_5`).
+- `--pipeline_name`: `divide_and_conquer`.
+- `--reward_model_python_path`: Python path for configuring the auto check model environment (required for MiniCPM-V).
+
+---
+
+### DPO Reward Pipeline
+
+#### Process Method
+
+Generates rewards using the DPO framework to rank answers. Higher-ranked answers are marked as "chosen," while
+lower-ranked ones are "rejected."
+
+#### Custom Implementation
+
+Supported models:
+
+- Instruct Models: `llava-1.5-7b`, `OmniLMM-12B`
+- Reward Models: `RLAIF-V-7B`, `RLAIF-V-12B`
+
+For other models, implement custom code:
+
+1. **Generate Rollouts**
+   - Needed for a new instruct model.
+   - Add a model builder in `RLAIF-V/builder`.
+   - Add caller code in `RLAIF-V/builder/builder.py`.
+   - Refer to `RLAIF-V/llava/llava15_sample_data.py` for sampling logic.
+   - Update `RLAIF-V/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py`.
+
+2. **Reward Collection**
+   - Needed for a new reward model.
+   - Add a model builder.
+   - Update `RLAIF-V/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py` and
+     `RLAIF-V/muffin/eval/muffin_inference_logp.py` to ensure compatibility.
+
+#### Dataset Format
+
+Recommended format: `.parquet`
+Fields (a construction sketch follows this list):
+
+- `idx`: Unique identifier for each entry (string format allowed).
+- `question`: Image-related question.
+- `image`: Dictionary with keys:
+  - `bytes`: Binary format.
+  - `path`: Optional but recommended.
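+
+A minimal construction sketch is shown below. It is only an illustration, assuming `pandas` with the
+`pyarrow` engine installed; the sample question and image path are hypothetical placeholders:
+
+```python
+# Sketch: pack (idx, question, image) records into the .parquet layout above.
+import pandas as pd
+
+samples = [("0", "What is the man holding?", "images/0001.jpg")]  # placeholder data
+
+records = []
+for idx, question, path in samples:
+    with open(path, "rb") as f:
+        img_bytes = f.read()           # raw binary image payload
+    records.append({
+        "idx": idx,                    # unique id; strings are allowed
+        "question": question,          # the image-related question
+        "image": {"bytes": img_bytes,  # binary format
+                  "path": path},       # optional but recommended
+    })
+
+pd.DataFrame(records).to_parquet("dpo_reward_input.parquet")
+```
+
+The resulting file can then be passed to the engine via `--dataset_path`.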
+ +#### Script Parameters + +- `--pipeline_name`: `dpo_reward`. + +--- -Additionally, **please double-check that the model name you provide is correct**, as we will not know which code to execute otherwise. +### Common Usage Notes -Next, your dataset should contain the following fields: -1. `idx`: A unique index for each data entry (this can be a string). -2. `question`: The question related to the image. -3. `image`: The column should follow this structure: - - `{'bytes': ..., 'path':...}` - - `bytes` should be in binary format. - - `path` is not strictly required, but to avoid errors, it's better to keep this field (you can set it as an empty string). +- Specify `--work_dir` to store intermediate and final outputs under a designated directory. +- Use `--debug` for detailed intermediate outputs saved under a `debug` directory. +- If errors occur, use `--run_stage` to rerun specific steps after resolving issues. +- The final dataset path will be displayed upon completion. -You can specify a `--work_dir` to store intermediate files and the final output under this directory (which will actually be a subdirectory within it). +--- -If you encounter errors during generation, you can pass the stage next to the stage that has been completed using the `--continue_from_stage` parameter (0, 1, or 2). When the value is 0, it will start from scratch. (For example, if you've completed stages 0 and 1 but encounter an error during stage 2, you can fix the issue and set `--continue_from_stage 2` to continue from that point.) You can check the `data_engine.py` file for details on what each stage does. +### Running the Script -Run: -```shell +```bash sh data_engine/run_data_engine.sh ``` diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md index 75cd4ea..eb4721a 100644 --- a/data_engine/README_zh.md +++ b/data_engine/README_zh.md @@ -1,33 +1,128 @@ -# Data Engine +# 数据引擎 -## Overview -此部分代码用于为您构建 DPO 数据集,您可以直接用它来进行训练。 -您只需输入奖励模型(reward model)、指令模型(instruct model)和数据集,我们将为您构建 DPO 数据集。您只需运行 `run_engine.sh` 脚本即可。\ -指令模型:我们使用指令模型来生成数据集中给定问题的原始答案。 \ -奖励模型:我们用来评估指令模型生成的答案的模型。我们借助奖励模型获得答案的奖励,并使用此奖励对答案进行排名。之后,我们可以构建 DPO 数据集。 +## 概述 -## Usage -请查看 `run_engine.sh` 脚本。 +此模块用于构建用于直接训练的 DPO 数据集。只需输入您的Reward model、Instruct model和数据集,然后运行 `run_engine.sh` 脚本即可生成数据集。 -您可以指定要用于生成偏好训练数据集的奖励模型和指令模型。当前支持的奖励模型和指令模型列表如下:\ -llava-1.5-7b、RLAIF-V-7B、OmniLMM-12B 和 RLAIF-V-12B。我们也在考虑添加更多模型。\ -如果您选择的模型不在模型列表中,您可能需要自行实现相关代码:(`RLAIF-V/builder` 用于模型加载;对于初始回答抽样,请参考`RLAIF-V/llava/llava15_sample_data.py`是如何对数据进行格式化的(请不要忘记传递`raw_images`)同时将您的调用代码添加到`RLAIF-V/data_engine/answer_sampler.py`中; 对于logps计算,请更改`RLAIF-V/data_engine/logps_calculator.py`中用于格式化数据的部分,和`RLAIF-V/muffin/eval/muffin_inference_logp.py`的`get_multimodal_sample_logps`函数)。 +- **Instruct model**:用于生成数据集中问题的初始答案。 +- **Reward model**:评估Instruct model生成的答案,提供奖励以对其排序并构建 DPO 数据集。 -另外,**请务必确认您提供的模型名称正确,否则我们无法确定该运行哪段代码**。 +## 使用说明 -接下来是您的数据集,它应该包含以下字段: -1. `idx`:每条数据的唯一索引(可以是字符串)。 -2. `question`:图像对应的问题。 -3. `image`:该列应遵循以下结构: - - {'bytes': ..., 'path':...} - - `bytes` 应为二进制格式。 - - `path` 字段不是必须的,但为了避免错误,建议您保留此字段(可以设置为空字符串)。 +参阅 `run_engine.sh` 脚本执行。数据构建支持以下两种pipeline: -您可以选择设置 `--work_dir`,我们将在该目录下保存中间文件和最终输出(实际上是该目录下的子目录)。 +1. **Divide and Conquer Pipeline**(`divide_and_conquer`) +2. 
**DPO Reward Pipeline**(`dpo_reward`) -如果在生成过程中遇到错误,您可以使用 `--continue_from_stage` 参数指定已完成阶段的下一个阶段(0、1、2)。如果值为 0,则从头开始。(例如,您完成了阶段 0 和阶段 1,在阶段 2 遇到错误,修复问题后设置 `--continue_from_stage 2` 以继续执行)。您可以查看文件 `data_engine.py` 了解每个阶段的具体内容。 +两种pipeline的需求各有不同,具体如下。 -运行: -```shell +--- + +### Divide and Conquer Pipeline + +#### 处理方法 + +具体流程详见论文。 + +#### 所需模型 + +- **拆分模型**:[点击下载](https://thunlp.oss-cn-qingdao.aliyuncs.com/rlaifv_llama3_split_model.tar.gz) +- **问题转换模型**:[点击下载](https://thunlp.oss-cn-qingdao.aliyuncs.com/rlaifv_llama3_changeq_model.tar.gz) + +#### 自定义实现 + +支持使用 `llava-1.5-7b` 作为Instruct model。Reward model需要以下三种模型: + +1. **问题转换模型**(现支持rlaifv_llama3_changeq_model) +2. **问题拆分模型**(现支持rlaifv_llama3_split_model) +3. **自动检查模型**(现支持 `OmniLMM-12B` 或 `MiniCPM-Llama3-V-2_5`) + +对于问题转换模型和问题拆分模型,不建议更换。如需使用其他自动检查模型或Instruct model,请自定义实现以下内容: + +1. **Sample Rollout** + - 若要添加新的Instruct model则需要以下修改。 + - 在 `RLAIF-V/builder` 中添加模型构建器。 + - 在 `RLAIF-V/builder/builder.py` 中添加对应调用代码。 + - 参考 `RLAIF-V/muffin/llava15_gen_data.py` 实现采样逻辑。 + +2. **Reward Collection** + - 若要添加新的自动检查模型则需要以下修改。 + - 如同生成 Rollout 一样,添加模型构建器和实用函数。 + - 更新 `RLAIF-V/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py` 中的 `reward_calculate` 方法。 + +#### 数据集格式 + +数据集需为 `.jsonl` 格式,包含以下字段: + +- `question`:与图像相关的问题。 +- `question_id`:可选字段。 +- `image`:Base64 编码的二进制数据(如果提供 `image_path` 则可省略)。 +- `image_path`:图像的路径(如果提供 `image` 则可省略)。如果 `image` 为空,将使用 `image_path`。 + +#### 脚本参数 + +- `--reward_model_path`:自动检查模型、问题转换模型和拆分模型的路径,使用逗号分隔(如:`/path/to/MiniCPM-Llama3-V-2_5,/path/to/changeq_model,/path/to/split_model`)。 +- `--reward_model_name`:自动检查模型名称(如:`MiniCPM-Llama3-V-2_5`)。 +- `--pipeline_name`:使用的pipeline名称,设置为 `divide_and_conquer`。 +- `--reward_model_python_path`:配置自动检查模型环境的 Python 路径(仅 MiniCPM-V 模型需要)。 + +--- + +### DPO Reward Pipeline + +#### 处理方法 + +使用 DPO 框架生成奖励以对答案排序。高分答案标记为Chosen,低分答案标记为Rejected。 + +#### 自定义实现 + +支持的模型: + +- Instruct model:`llava-1.5-7b`,`OmniLMM-12B` +- Reward model:`RLAIF-V-7B`,`RLAIF-V-12B` + +如需使用其他模型,请实现以下自定义代码: + +1. **Sample Rollout** + - 若要添加新的Instruct model则需要以下修改。 + - 在 `RLAIF-V/builder` 中添加模型构建器。 + - 在 `RLAIF-V/builder/builder.py` 中添加调用代码。 + - 参考 `RLAIF-V/llava/llava15_sample_data.py` 实现采样逻辑。 + - 更新 `RLAIF-V/data_engine/pipeline/dpo_reward_pipeline/answer_sampler.py`。 + +2. 
**Reward Collection** + - 若要添加新的Reward model则需要以下修改。 + - 类似Sample Rollout添加模型构建器。 + - 更新 `RLAIF-V/data_engine/pipeline/dpo_reward_pipeline/logps_calculator.py` 和 `RLAIF-V/muffin/eval/muffin_inference_logp.py`,以确保数据格式一致。 + +#### 数据集格式 + +推荐格式:`.parquet` +字段: + +- `idx`:每条记录的唯一标识符(可使用字符串)。 +- `question`:与图像相关的问题。 +- `image`:包含以下键的字典: + - `bytes`:二进制格式图像数据。 + - `path`:可选字段,推荐提供。 + +#### 脚本参数 + +- `--pipeline_name`:设置为 `dpo_reward`。 + +--- + +### 通用使用说明 + +- 使用 `--work_dir` 指定工作目录,将中间和最终输出存储到该目录下。 +- 使用 `--debug` 参数,可将详细中间输出保存到 `debug` 目录中。 +- 如果出现错误,可使用 `--run_stage` 参数在解决问题后重新运行特定步骤。 +- 脚本完成后将显示最终数据集路径。 + +--- + +### 运行脚本 + +```bash sh data_engine/run_data_engine.sh ``` diff --git a/data_engine/engine.py b/data_engine/engine.py index cd87f35..9a2746a 100644 --- a/data_engine/engine.py +++ b/data_engine/engine.py @@ -38,7 +38,7 @@ def run(**kwargs): # 0: sample answer sampled_answer_path = os.path.join(kwargs["work_dir"], "sampled_answer") - if kwargs.get("continue_from_stage", 1) <= 0: + if kwargs.get("run_stage", 0) == 0: dir_prepare(sampled_answer_path) sub_work_dir = os.path.join(intermediate_step_dir, "sample_answers") dir_prepare(sub_work_dir) @@ -58,7 +58,7 @@ def run(**kwargs): # 1: calculate rewards reward_output_dir = os.path.join(kwargs["work_dir"], "reward") - if kwargs.get("continue_from_stage", 1) <= 1: + if kwargs.get("run_stage", 0) == 1: dir_prepare(reward_output_dir) sub_work_dir = os.path.join(intermediate_step_dir, "calculate_rewards") dir_prepare(sub_work_dir) @@ -78,7 +78,7 @@ def run(**kwargs): # following code doesn't need multi CUDA if torch.distributed.get_rank() == 0: - if kwargs.get("continue_from_stage", 1) <= 2: + if kwargs.get("run_stage", 0) == 2: print_stage(2, "Pair build and filter") sub_work_dir = os.path.join(intermediate_step_dir, "pair_build_and_filter") dir_prepare(sub_work_dir) @@ -133,7 +133,7 @@ def run(**kwargs): parser.add_argument("--dataset_path", type=str, required=True, help="The path of the dataset.") parser.add_argument("--work_dir", type=str, required=True, help="The working directory.") parser.add_argument("--pipeline_name", type=str, required=True, help="The pipeline you choose to run.") - parser.add_argument("--continue_from_stage", type=int, default=1, help="The stage to continue from.") + parser.add_argument("--run_stage", type=int, default=0, help="The stage to run.") parser.add_argument("--sample_k", type=int, default=10, help="The sample number k.") parser.add_argument("--rank", type=int, default=3, help="The rank number. (specific to DPORewardPipeline)") parser.add_argument("--distance", type=int, default=25, help="The distance. 
(specific to DPORewardPipeline)") @@ -152,7 +152,7 @@ def run(**kwargs): dataset_path=args.dataset_path, work_dir=args.work_dir, pipeline_name=args.pipeline_name, - continue_from_stage=args.continue_from_stage, + run_stage=args.run_stage, sample_k=args.sample_k, rank=args.rank, distance=args.distance, diff --git a/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py b/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py index 2bf4627..bfa1b65 100644 --- a/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py +++ b/data_engine/pipeline/divide_and_conquer/divide_and_conquer_pipeline.py @@ -1,6 +1,7 @@ import os import subprocess import torch +import sys from data_engine.pipeline.pipeline import Pipeline from data_engine.util import dir_prepare @@ -17,6 +18,21 @@ def get_jsonl_file(path: str) -> list: return jsonl_files +def get_min_len_file(path: str, name_contains: list[str]) -> str: + file_dict = {} + min_len = sys.maxsize + for file in get_jsonl_file(path): + record = True + for name in name_contains: + if name not in file: + record = False + break + if record: + file_dict[len(file)] = file + min_len = min(min_len, len(file)) + return file_dict[min_len] + + class DivideAndConquerPipeline(Pipeline): @classmethod def judge_able_to_process(cls, pipeline_name: str) -> bool: @@ -67,14 +83,8 @@ def reward_calculate(cls, **kwargs) -> None: changeq = reward_model_path[1].strip() split = reward_model_path[2].strip() script_path = './script/data_gen/divide_and_conquer/llama3_8b_divide_and_conquer.sh' - file_dict = {} - min_len = 9999 - for file in get_jsonl_file(kwargs["sampled_answer_path"]): - if 'diverse_gen' not in file: - continue - file_dict[len(file)] = file - min_len = min(min_len, len(file)) - file_name = os.path.basename(file_dict[min_len]) + file_name = get_min_len_file(kwargs["sampled_answer_path"], ['diverse_gen']) + file_name = os.path.basename(file_name) answer_file = os.path.join(kwargs["sampled_answer_path"], file_name[:file_name.rfind('.')]) run_bash_script( script_path, @@ -88,13 +98,8 @@ def reward_calculate(cls, **kwargs) -> None: ) auto_check_model = reward_model_path[0].strip() - file_dict = {} - min_len = 9999 - for file in get_jsonl_file(kwargs["sampled_answer_path"]): - if "llama3-8b_divide.gq.qas.jsonl" in file and 'diverse_gen' in file: - file_dict[len(file)] = file - min_len = min(min_len, len(file)) - check_ques_file = file_dict[min_len] + check_ques_file = get_min_len_file(kwargs["sampled_answer_path"], + ['llama3-8b_divide.gq.qas.jsonl', 'diverse_gen']) if 'omni' in auto_check_model.lower() or 'omni' in kwargs["reward_model_name"].lower(): print("OmniLMM as auto check model") script_path = './script/data_gen/omnilmm/omnilmm_autocheck.sh' @@ -136,20 +141,8 @@ def pair_build_with_filter(cls, **kwargs) -> str: raise ValueError(f"Missing parameter '{param}' for pair_build_with_filter in DivideAndConquerPipeline.") if torch.distributed.get_rank() == 0: - file_dict = {} - min_len = 999 - for file in get_jsonl_file(kwargs["sampled_answer_path"]): - if 'llama3-8b_divide.gq.jsonl' not in file: - continue - file_dict[len(file)] = file - min_len = min(min_len, len(file)) - gq_file = file_dict[min_len] - file_dict = {} - min_len = 999 - for file in get_jsonl_file(kwargs["reward_path"]): - file_dict[len(file)] = file - min_len = min(min_len, len(file)) - feedback_file = file_dict[min_len] + gq_file = get_min_len_file(kwargs["sampled_answer_path"], ['llama3-8b_divide.gq.jsonl']) + feedback_file = 
get_min_len_file(kwargs["reward_path"], [])
             script_path = './script/data_gen/construct_pairs.sh'
             run_bash_script(
                 script_path,
@@ -159,18 +152,12 @@ def pair_build_with_filter(cls, **kwargs) -> str:
             )
 
             script_path = './utils/get_pairs_filter_shorten.py'
-            file_dict = {}
-            min_len = 999
-            for file in get_jsonl_file(kwargs["reward_path"]):
-                if 'llama3-8b_divide.gq.qas_pair_diff1_samp2.jsonl' not in file:
-                    continue
-                file_dict[len(file)] = file
-                min_len = min(min_len, len(file))
+            file_path = get_min_len_file(kwargs["reward_path"], ['llama3-8b_divide.gq.qas_pair_diff1_samp2.jsonl'])
             result_dir = os.path.join(kwargs["work_dir"], "dataset")
             dir_prepare(result_dir)
             subprocess.run([
                 'python', script_path,
-                '--path', os.path.join(kwargs["reward_path"], file_dict[min_len]),
+                '--path', os.path.join(kwargs["reward_path"], file_path),
                 '--save_path', os.path.join(result_dir, "result.jsonl")
             ], check=True)
             return os.path.join(result_dir, "result.jsonl")
diff --git a/data_engine/run_engine.sh b/data_engine/run_engine.sh
index 2348818..fe5164d 100644
--- a/data_engine/run_engine.sh
+++ b/data_engine/run_engine.sh
@@ -14,13 +14,49 @@ DISTRIBUTED_ARGS="
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
 "
-torchrun $DISTRIBUTED_ARGS data_engine/data_engine.py \
-    --reward_model_name reward model name \
-    --reward_model_path /path/to/your/reward/model \
-    --instruct_model_name instruct model name \
-    --instruct_model_path /path/to/yout/instruct/model \
-    --dataset_path /path/to/your/dataset \
-    --work_dir /path/to/your/work/dir \
-    --pipeline_name dpo_reward \
-    --continue_from_stage 0 \
-    --debug True
\ No newline at end of file
+
+reward_model_name="reward_model_name"
+reward_model_path="/path/to/your/reward/model"
+instruct_model_name="instruct_model_name"
+instruct_model_path="/path/to/your/instruct/model"
+dataset_path="/path/to/your/dataset"
+work_dir="/path/to/your/work/dir"
+pipeline_name="pipeline_name"
+reward_model_python_path="/path/to/minicpmv/python"  # only needed for MiniCPM-V
+
+
+torchrun $DISTRIBUTED_ARGS data_engine/engine.py \
+    --reward_model_name $reward_model_name \
+    --reward_model_path $reward_model_path \
+    --instruct_model_name $instruct_model_name \
+    --instruct_model_path $instruct_model_path \
+    --dataset_path $dataset_path \
+    --work_dir $work_dir \
+    --pipeline_name $pipeline_name \
+    --reward_model_python_path $reward_model_python_path \
+    --run_stage 0 \
+    --debug
+
+torchrun $DISTRIBUTED_ARGS data_engine/engine.py \
+    --reward_model_name $reward_model_name \
+    --reward_model_path $reward_model_path \
+    --instruct_model_name $instruct_model_name \
+    --instruct_model_path $instruct_model_path \
+    --dataset_path $dataset_path \
+    --work_dir $work_dir \
+    --pipeline_name $pipeline_name \
+    --reward_model_python_path $reward_model_python_path \
+    --run_stage 1 \
+    --debug
+
+torchrun $DISTRIBUTED_ARGS data_engine/engine.py \
+    --reward_model_name $reward_model_name \
+    --reward_model_path $reward_model_path \
+    --instruct_model_name $instruct_model_name \
+    --instruct_model_path $instruct_model_path \
+    --dataset_path $dataset_path \
+    --work_dir $work_dir \
+    --pipeline_name $pipeline_name \
+    --reward_model_python_path $reward_model_python_path \
+    --run_stage 2 \
+    --debug
\ No newline at end of file
From 0871cd7fc6988b28478c9962cef263ccf1e299a0 Mon Sep 17 00:00:00 2001
From: MagicYao
Date: Mon, 9 Dec 2024 20:23:35 +0800
Subject: [PATCH 17/18] [upgrade] refine README

---
 data_engine/README.md    | 5 ++---
 data_engine/README_zh.md | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/data_engine/README.md b/data_engine/README.md
index e76bccb..ad0b578 100644
--- a/data_engine/README.md
+++ b/data_engine/README.md
@@ -24,7 +24,7 @@
 
 #### Process Method
 
-Detailed in the corresponding research paper.
+Use the RLAIF-V divide-and-conquer strategy to collect AI feedback.
 
 #### Required Models
 
@@ -75,8 +75,7 @@
 
 #### Process Method
 
-Generates rewards using the DPO framework to rank answers. Higher-ranked answers are marked as "chosen," while
-lower-ranked ones are "rejected."
+Use the RLAIF-V self-feedback guidance produced by DPO-trained models.
 
 #### Custom Implementation
 
diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md
index eb4721a..14ea777 100644
--- a/data_engine/README_zh.md
+++ b/data_engine/README_zh.md
@@ -22,7 +22,7 @@
 
 #### 处理方法
 
-具体流程详见论文。
+使用 RLAIF-V 分而治之策略收集 AI 反馈。
 
 #### 所需模型
 
@@ -72,7 +72,7 @@
 
 #### 处理方法
 
-使用 DPO 框架生成奖励以对答案排序。高分答案标记为Chosen,低分答案标记为Rejected。
+将 RLAIF-V 自反馈指导与 DPO 训练模型结合使用。
 
 #### 自定义实现
 
From 8c46e3a62b22467c603c504f223ef91dea9a33ef Mon Sep 17 00:00:00 2001
From: MagicYao
Date: Mon, 9 Dec 2024 22:16:15 +0800
Subject: [PATCH 18/18] [upgrade] refine README

---
 data_engine/README_zh.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_engine/README_zh.md b/data_engine/README_zh.md
index 14ea777..0970193 100644
--- a/data_engine/README_zh.md
+++ b/data_engine/README_zh.md
@@ -72,7 +72,7 @@
 
 #### 处理方法
 
-将 RLAIF-V 自反馈指导与 DPO 训练模型结合使用。
+使用 RLAIF-V 基于 DPO-aligned 模型构造的自反馈信号。
 
 #### 自定义实现