From 0684f05e4ea14f2caba763fd84a2b097dd91806f Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Thu, 26 Sep 2024 14:03:06 +0200 Subject: [PATCH 1/5] Add transformers version pinning for qwenvl2 --- node-hub/dora-qwenvl/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-hub/dora-qwenvl/pyproject.toml b/node-hub/dora-qwenvl/pyproject.toml index 8df2d80b5..e302b10aa 100644 --- a/node-hub/dora-qwenvl/pyproject.toml +++ b/node-hub/dora-qwenvl/pyproject.toml @@ -16,7 +16,7 @@ dora-rs = "^0.3.6" numpy = "< 2.0.0" torch = "^2.4.0" torchvision = "^0.19" -transformers = { git = "https://github.com/huggingface/transformers" } +transformers = "^4.45" qwen-vl-utils = "^0.0.2" accelerate = "^0.33" # flash_attn = "^2.6.1" # Install using: pip install -U flash-attn --no-build-isolation From 56cfb6d90062423091a255d13075610caa681a2b Mon Sep 17 00:00:00 2001 From: LyonRust Date: Tue, 1 Oct 2024 16:49:40 +0800 Subject: [PATCH 2/5] Support multi image recording within llama factory recorder --- .../llama_factory_recorder/main.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py index ceeeebbf7..840044283 100644 --- a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py +++ b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py @@ -42,7 +42,7 @@ def write_dict_to_json(file_path, key: str, new_data): def save_image_and_add_to_json( - image_array, root_path, llama_root_path, jsonl_file, messages + frame_dict: dict, root_path, llama_root_path, jsonl_file, messages ): """ Saves an image from a NumPy array and adds a new JSON object as a line to a JSONL file. @@ -69,17 +69,19 @@ def save_image_and_add_to_json( if os.path.isfile(os.path.join(llama_root_path / root_path, name)) ] ) + image_paths = [] + for (event_id, data) in frame_dict.items(): + # Define the image filename + image_filename = f"{event_id}-{image_id}.png" + image_path = os.path.join(root_path, image_filename) - # Define the image filename - image_filename = f"{image_id}.png" - image_path = os.path.join(root_path, image_filename) - - # Save the image - image = Image.fromarray(image_array) - image.save(llama_root_path / image_path) + # Save the image + image = Image.fromarray(data) + image.save(llama_root_path / image_path) + image_paths.append(image_path) # Create the JSON entry with 'messages' and 'images' - new_entry = {"messages": messages, "images": [image_path]} + new_entry = {"messages": messages, "images": image_paths} # Add the entry to the JSONL file with UTF-8 encoding with open(jsonl_file, "a", encoding="utf-8") as f: @@ -123,7 +125,7 @@ def main(): ) question = DEFAULT_QUESTION - frame = None + frames = {} for event in node: event_type = event["type"] @@ -131,7 +133,7 @@ def main(): if event_type == "INPUT": event_id = event["id"] - if event_id == "image": + if "image" in event_id: storage = event["value"] metadata = event["metadata"] encoding = metadata["encoding"] @@ -153,7 +155,7 @@ def main(): .reshape((height, width, channels)) ) if encoding == "bgr8": - frame = frame[:, :, ::-1] # OpenCV image (BGR to RGB) + frames[event_id] = frame[:, :, ::-1] # OpenCV image (BGR to RGB) elif encoding == "rgb8": pass else: @@ -164,12 +166,12 @@ def main(): if text != "": question = text elif event_id == "ground_truth": - if frame is None: + if len(frames.keys()) == 0: continue ground_truth = event["value"][0].as_py() messages = [ - {"content": "" + question, "role": "user"}, + {"content": "" * len(frames.keys()) + question, "role": "user"}, { "content": ground_truth, "role": "assistant", @@ -177,7 +179,7 @@ def main(): ] save_image_and_add_to_json( - image_array=frame, + frame_dict=frames, root_path=entry_name, llama_root_path=llama_factory_root_path, jsonl_file=default_record_json_path, From d383a486ae508d6092e04c222501d215845e63d9 Mon Sep 17 00:00:00 2001 From: LyonRust Date: Mon, 7 Oct 2024 16:38:28 +0800 Subject: [PATCH 3/5] Add adapter config path within qwenvl2 --- node-hub/dora-qwenvl/dora_qwenvl/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/node-hub/dora-qwenvl/dora_qwenvl/main.py b/node-hub/dora-qwenvl/dora_qwenvl/main.py index bb182e0b1..1075a1772 100644 --- a/node-hub/dora-qwenvl/dora_qwenvl/main.py +++ b/node-hub/dora-qwenvl/dora_qwenvl/main.py @@ -12,6 +12,7 @@ "DEFAULT_QUESTION", "Describe this image", ) +ADAPTER_PATH = os.getenv("ADAPTER_PATH", "") # Check if flash_attn is installed try: @@ -23,7 +24,7 @@ device_map="auto", attn_implementation="flash_attention_2", ) -except ImportError: +except ImportError or ModuleNotFoundError: model = Qwen2VLForConditionalGeneration.from_pretrained( CUSTOM_MODEL_PATH, torch_dtype="auto", @@ -31,8 +32,12 @@ ) +if ADAPTER_PATH != "": + model.load_adapter(ADAPTER_PATH, "dora") + + # default processor -processor = AutoProcessor.from_pretrained(DEFAULT_PATH) +processor = AutoProcessor.from_pretrained(CUSTOM_MODEL_PATH) def generate(frames: dict, question): From 33307b4e8a5460d038c9cca6ebfaed76b4a04b37 Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Mon, 7 Oct 2024 11:18:06 +0200 Subject: [PATCH 4/5] Fix linting --- node-hub/dora-qwenvl/dora_qwenvl/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-hub/dora-qwenvl/dora_qwenvl/main.py b/node-hub/dora-qwenvl/dora_qwenvl/main.py index 1075a1772..e03b31c14 100644 --- a/node-hub/dora-qwenvl/dora_qwenvl/main.py +++ b/node-hub/dora-qwenvl/dora_qwenvl/main.py @@ -24,7 +24,7 @@ device_map="auto", attn_implementation="flash_attention_2", ) -except ImportError or ModuleNotFoundError: +except (ImportError, ModuleNotFoundError): model = Qwen2VLForConditionalGeneration.from_pretrained( CUSTOM_MODEL_PATH, torch_dtype="auto", From 5917b9ea27cbee4cbd6a6ca51d2861f318dc0a12 Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Mon, 7 Oct 2024 13:18:03 +0200 Subject: [PATCH 5/5] Fix badly formatted llama factory recorder main.py --- .../llama-factory-recorder/llama_factory_recorder/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py index 840044283..e9a26c645 100644 --- a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py +++ b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py @@ -70,7 +70,7 @@ def save_image_and_add_to_json( ] ) image_paths = [] - for (event_id, data) in frame_dict.items(): + for event_id, data in frame_dict.items(): # Define the image filename image_filename = f"{event_id}-{image_id}.png" image_path = os.path.join(root_path, image_filename) @@ -171,7 +171,10 @@ def main(): ground_truth = event["value"][0].as_py() messages = [ - {"content": "" * len(frames.keys()) + question, "role": "user"}, + { + "content": "" * len(frames.keys()) + question, + "role": "user", + }, { "content": ground_truth, "role": "assistant",