From 0684f05e4ea14f2caba763fd84a2b097dd91806f Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Thu, 26 Sep 2024 14:03:06 +0200
Subject: [PATCH 1/5] Add transformers version pinning for qwenvl2

---
 node-hub/dora-qwenvl/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node-hub/dora-qwenvl/pyproject.toml b/node-hub/dora-qwenvl/pyproject.toml
index 8df2d80b5..e302b10aa 100644
--- a/node-hub/dora-qwenvl/pyproject.toml
+++ b/node-hub/dora-qwenvl/pyproject.toml
@@ -16,7 +16,7 @@ dora-rs = "^0.3.6"
 numpy = "< 2.0.0"
 torch = "^2.4.0"
 torchvision = "^0.19"
-transformers = { git = "https://github.com/huggingface/transformers" }
+transformers = "^4.45"
 qwen-vl-utils = "^0.0.2"
 accelerate = "^0.33"
 # flash_attn = "^2.6.1" # Install using: pip install -U flash-attn --no-build-isolation

From 56cfb6d90062423091a255d13075610caa681a2b Mon Sep 17 00:00:00 2001
From: LyonRust <echo_ai@foxmail.com>
Date: Tue, 1 Oct 2024 16:49:40 +0800
Subject: [PATCH 2/5] Support multi-image recording in llama factory
 recorder

---
 .../llama_factory_recorder/main.py            | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
index ceeeebbf7..840044283 100644
--- a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
+++ b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
@@ -42,7 +42,7 @@ def write_dict_to_json(file_path, key: str, new_data):
 
 
 def save_image_and_add_to_json(
-    image_array, root_path, llama_root_path, jsonl_file, messages
+    frame_dict: dict, root_path, llama_root_path, jsonl_file, messages
 ):
     """
     Saves an image from a NumPy array and adds a new JSON object as a line to a JSONL file.
@@ -69,17 +69,19 @@ def save_image_and_add_to_json(
             if os.path.isfile(os.path.join(llama_root_path / root_path, name))
         ]
     )
+    image_paths = []
+    for (event_id, data) in frame_dict.items():
+        # Define the image filename
+        image_filename = f"{event_id}-{image_id}.png"
+        image_path = os.path.join(root_path, image_filename)
 
-    # Define the image filename
-    image_filename = f"{image_id}.png"
-    image_path = os.path.join(root_path, image_filename)
-
-    # Save the image
-    image = Image.fromarray(image_array)
-    image.save(llama_root_path / image_path)
+        # Save the image
+        image = Image.fromarray(data)
+        image.save(llama_root_path / image_path)
+        image_paths.append(image_path)
 
     # Create the JSON entry with 'messages' and 'images'
-    new_entry = {"messages": messages, "images": [image_path]}
+    new_entry = {"messages": messages, "images": image_paths}
 
     # Add the entry to the JSONL file with UTF-8 encoding
     with open(jsonl_file, "a", encoding="utf-8") as f:
@@ -123,7 +125,7 @@ def main():
     )
 
     question = DEFAULT_QUESTION
-    frame = None
+    frames = {}
 
     for event in node:
         event_type = event["type"]
@@ -131,7 +133,7 @@ def main():
         if event_type == "INPUT":
             event_id = event["id"]
 
-            if event_id == "image":
+            if "image" in event_id:
                 storage = event["value"]
                 metadata = event["metadata"]
                 encoding = metadata["encoding"]
@@ -153,7 +155,7 @@ def main():
                     .reshape((height, width, channels))
                 )
                 if encoding == "bgr8":
-                    frame = frame[:, :, ::-1]  # OpenCV image (BGR to RGB)
+                    frames[event_id] = frame[:, :, ::-1]  # OpenCV image (BGR to RGB)
                 elif encoding == "rgb8":
                     pass
                 else:
@@ -164,12 +166,12 @@ def main():
                 if text != "":
                     question = text
             elif event_id == "ground_truth":
-                if frame is None:
+                if len(frames.keys()) == 0:
                     continue
                 ground_truth = event["value"][0].as_py()
 
                 messages = [
-                    {"content": "<image>" + question, "role": "user"},
+                    {"content": "<image>"  * len(frames.keys()) + question, "role": "user"},
                     {
                         "content": ground_truth,
                         "role": "assistant",
@@ -177,7 +179,7 @@ def main():
                 ]
 
                 save_image_and_add_to_json(
-                    image_array=frame,
+                    frame_dict=frames,
                     root_path=entry_name,
                     llama_root_path=llama_factory_root_path,
                     jsonl_file=default_record_json_path,

From d383a486ae508d6092e04c222501d215845e63d9 Mon Sep 17 00:00:00 2001
From: LyonRust <echo_ai@foxmail.com>
Date: Mon, 7 Oct 2024 16:38:28 +0800
Subject: [PATCH 3/5] Add ADAPTER_PATH env var to load an adapter in qwenvl2

---
 node-hub/dora-qwenvl/dora_qwenvl/main.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/node-hub/dora-qwenvl/dora_qwenvl/main.py b/node-hub/dora-qwenvl/dora_qwenvl/main.py
index bb182e0b1..1075a1772 100644
--- a/node-hub/dora-qwenvl/dora_qwenvl/main.py
+++ b/node-hub/dora-qwenvl/dora_qwenvl/main.py
@@ -12,6 +12,7 @@
     "DEFAULT_QUESTION",
     "Describe this image",
 )
+ADAPTER_PATH = os.getenv("ADAPTER_PATH", "")
 
 # Check if flash_attn is installed
 try:
@@ -23,7 +24,7 @@
         device_map="auto",
         attn_implementation="flash_attention_2",
     )
-except ImportError:
+except ImportError or ModuleNotFoundError:
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         CUSTOM_MODEL_PATH,
         torch_dtype="auto",
@@ -31,8 +32,12 @@
     )
 
 
+if ADAPTER_PATH != "":
+    model.load_adapter(ADAPTER_PATH, "dora")
+
+
 # default processor
-processor = AutoProcessor.from_pretrained(DEFAULT_PATH)
+processor = AutoProcessor.from_pretrained(CUSTOM_MODEL_PATH)
 
 
 def generate(frames: dict, question):

From 33307b4e8a5460d038c9cca6ebfaed76b4a04b37 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Mon, 7 Oct 2024 11:18:06 +0200
Subject: [PATCH 4/5] Fix linting

---
 node-hub/dora-qwenvl/dora_qwenvl/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node-hub/dora-qwenvl/dora_qwenvl/main.py b/node-hub/dora-qwenvl/dora_qwenvl/main.py
index 1075a1772..e03b31c14 100644
--- a/node-hub/dora-qwenvl/dora_qwenvl/main.py
+++ b/node-hub/dora-qwenvl/dora_qwenvl/main.py
@@ -24,7 +24,7 @@
         device_map="auto",
         attn_implementation="flash_attention_2",
     )
-except ImportError or ModuleNotFoundError:
+except (ImportError, ModuleNotFoundError):
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         CUSTOM_MODEL_PATH,
         torch_dtype="auto",

From 5917b9ea27cbee4cbd6a6ca51d2861f318dc0a12 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Mon, 7 Oct 2024 13:18:03 +0200
Subject: [PATCH 5/5] Fix badly formatted llama factory recorder main.py

---
 .../llama-factory-recorder/llama_factory_recorder/main.py  | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
index 840044283..e9a26c645 100644
--- a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
+++ b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
@@ -70,7 +70,7 @@ def save_image_and_add_to_json(
         ]
     )
     image_paths = []
-    for (event_id, data) in frame_dict.items():
+    for event_id, data in frame_dict.items():
         # Define the image filename
         image_filename = f"{event_id}-{image_id}.png"
         image_path = os.path.join(root_path, image_filename)
@@ -171,7 +171,10 @@ def main():
                 ground_truth = event["value"][0].as_py()
 
                 messages = [
-                    {"content": "<image>"  * len(frames.keys()) + question, "role": "user"},
+                    {
+                        "content": "<image>" * len(frames.keys()) + question,
+                        "role": "user",
+                    },
                     {
                         "content": ground_truth,
                         "role": "assistant",