From 7628b61d22f2c546229ed608bd03022ea2a182b6 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Mon, 25 Mar 2024 17:08:21 +0100
Subject: [PATCH 1/9] Adding LLM example

---
 .../python-operator-dataflow/dataflow.yml     |   3 -
 .../python-operator-dataflow/dataflow_llm.yml |  69 ++++
 .../dataflow_record.yml                       |  87 +++++
 .../python-operator-dataflow/file_saver_op.py |  44 +++
 .../python-operator-dataflow/keyboard_op.py   |  94 ++++++
 examples/python-operator-dataflow/llm_op.py   | 319 ++++++++++++++++++
 examples/python-operator-dataflow/merge.py    |  12 +
 .../python-operator-dataflow/microphone_op.py |  36 ++
 .../object_detection.py                       |  53 +--
 examples/python-operator-dataflow/plot.py     | 199 +++++------
 .../sentence_transformers_op.py               | 100 ++++++
 examples/python-operator-dataflow/utils.py    |  25 +-
 examples/python-operator-dataflow/webcam.py   |  33 +-
 .../python-operator-dataflow/whisper_op.py    |  25 ++
 14 files changed, 924 insertions(+), 175 deletions(-)
 create mode 100644 examples/python-operator-dataflow/dataflow_llm.yml
 create mode 100644 examples/python-operator-dataflow/dataflow_record.yml
 create mode 100644 examples/python-operator-dataflow/file_saver_op.py
 create mode 100644 examples/python-operator-dataflow/keyboard_op.py
 create mode 100644 examples/python-operator-dataflow/llm_op.py
 create mode 100644 examples/python-operator-dataflow/merge.py
 create mode 100644 examples/python-operator-dataflow/microphone_op.py
 create mode 100644 examples/python-operator-dataflow/sentence_transformers_op.py
 create mode 100644 examples/python-operator-dataflow/whisper_op.py

diff --git a/examples/python-operator-dataflow/dataflow.yml b/examples/python-operator-dataflow/dataflow.yml
index ca069f7d3..92bf5f2b3 100644
--- a/examples/python-operator-dataflow/dataflow.yml
+++ b/examples/python-operator-dataflow/dataflow.yml
@@ -10,12 +10,10 @@ nodes:
   - id: object_detection
     operator:
       python: object_detection.py
-      send_stdout_as: stdout
       inputs:
         image: webcam/image
       outputs:
         - bbox
-        - stdout
 
   - id: plot
     operator:
@@ -23,4 +21,3 @@ nodes:
       inputs:
         image: webcam/image
         bbox: object_detection/bbox
-        object_detection_stdout: object_detection/stdout
diff --git a/examples/python-operator-dataflow/dataflow_llm.yml b/examples/python-operator-dataflow/dataflow_llm.yml
new file mode 100644
index 000000000..6d3be9a58
--- /dev/null
+++ b/examples/python-operator-dataflow/dataflow_llm.yml
@@ -0,0 +1,69 @@
+nodes:
+  - id: webcam
+    operator:
+      python: webcam.py
+      inputs:
+        tick: dora/timer/millis/50
+      outputs:
+        - image
+
+  - id: object_detection
+    operator:
+      python: object_detection.py
+      inputs:
+        image: webcam/image
+      outputs:
+        - bbox
+
+  - id: plot
+    operator:
+      python: plot.py
+      inputs:
+        image: webcam/image
+        bbox: object_detection/bbox
+        line: llm/line
+        keyboard_buffer: keyboard/buffer
+        user_message: keyboard/submitted
+        assistant_message: llm/assistant_message
+
+  ## Speech to text
+  - id: keyboard
+    custom:
+      source: keyboard_op.py
+      outputs:
+        - buffer
+        - submitted
+        - record
+        - ask
+        - send
+        - change
+
+  ## Code Modifier
+  - id: vectordb
+    operator:
+      python: sentence_transformers_op.py
+      inputs:
+        query: keyboard/change
+        saved_file: file_saver/saved_file
+      outputs:
+        - raw_file
+
+  - id: llm
+    operator:
+      python: llm_op.py
+      inputs:
+        code_modifier: vectordb/raw_file
+        assistant: keyboard/ask
+        message_sender: keyboard/send
+      outputs:
+        - modified_file
+        - line
+        - assistant_message
+
+  - id: file_saver
+    operator:
+      python: file_saver_op.py
+      inputs:
+        file: llm/modified_file
+      outputs:
+        - saved_file
\ No newline at end of file
diff --git a/examples/python-operator-dataflow/dataflow_record.yml b/examples/python-operator-dataflow/dataflow_record.yml
new file mode 100644
index 000000000..faf978af4
--- /dev/null
+++ b/examples/python-operator-dataflow/dataflow_record.yml
@@ -0,0 +1,87 @@
+nodes:
+  - id: webcam
+    operator:
+      python: webcam.py
+      inputs:
+        tick: dora/timer/millis/50
+      outputs:
+        - image
+
+  - id: object_detection
+    operator:
+      python: object_detection.py
+      inputs:
+        image: webcam/image
+      outputs:
+        - bbox
+
+  - id: plot
+    operator:
+      python: plot.py
+      inputs:
+        image: webcam/image
+        bbox: object_detection/bbox
+        line: llm/line
+        keyboard_buffer: keyboard/buffer
+        user_message: keyboard/submitted
+        assistant_message: llm/assistant_message
+
+  ## Speech to text
+  - id: keyboard
+    custom:
+      source: keyboard_op.py
+      outputs:
+        - buffer
+        - submitted
+        - record
+        - ask
+        - send
+        - change
+      inputs:
+        recording: whisper/text
+
+  - id: microphone
+    operator:
+      python: microphone_op.py
+      inputs:
+        record: keyboard/record
+      outputs:
+        - audio
+
+  - id: whisper
+    operator:
+      python: whisper_op.py
+      inputs:
+        audio: microphone/audio
+      outputs:
+        - text
+
+  ## Code Modifier
+  - id: vectordb
+    operator:
+      python: sentence_transformers_op.py
+      inputs:
+        query: keyboard/change
+        saved_file: file_saver/saved_file
+      outputs:
+        - raw_file
+
+  - id: llm
+    operator:
+      python: llm_op.py
+      inputs:
+        code_modifier: vectordb/raw_file
+        assistant: keyboard/ask
+        message_sender: keyboard/send
+      outputs:
+        - modified_file
+        - line
+        - assistant_message
+
+  - id: file_saver
+    operator:
+      python: file_saver_op.py
+      inputs:
+        file: llm/modified_file
+      outputs:
+        - saved_file
\ No newline at end of file
diff --git a/examples/python-operator-dataflow/file_saver_op.py b/examples/python-operator-dataflow/file_saver_op.py
new file mode 100644
index 000000000..df1128f82
--- /dev/null
+++ b/examples/python-operator-dataflow/file_saver_op.py
@@ -0,0 +1,44 @@
+import pyarrow as pa
+
+from dora import DoraStatus
+
+
+class Operator:
+    """
+    Infering object from images
+    """
+
+    def __init__(self):
+        self.last_file = ""
+        self.last_path = ""
+        self.last_netadata = None
+
+    def on_event(
+        self,
+        dora_event,
+        send_output,
+    ) -> DoraStatus:
+        if dora_event["type"] == "INPUT" and dora_event["id"] == "file":
+            input = dora_event["value"][0].as_py()
+
+            with open(input["path"], "r") as file:
+                self.last_file = file.read()
+                self.last_path = input["path"]
+                self.last_metadata = dora_event["metadata"]
+            with open(input["path"], "w") as file:
+                file.write(input["raw"])
+
+            send_output(
+                "saved_file",
+                pa.array(
+                    [
+                        {
+                            "raw": input["raw"],
+                            "path": input["path"],
+                            "origin": dora_event["id"],
+                        }
+                    ]
+                ),
+                dora_event["metadata"],
+            )
+        return DoraStatus.CONTINUE
diff --git a/examples/python-operator-dataflow/keyboard_op.py b/examples/python-operator-dataflow/keyboard_op.py
new file mode 100644
index 000000000..79a1cd4c3
--- /dev/null
+++ b/examples/python-operator-dataflow/keyboard_op.py
@@ -0,0 +1,94 @@
+from pynput import keyboard
+from pynput.keyboard import Key, Events
+import pyarrow as pa
+from dora import Node
+from tkinter import Tk
+import tkinter as tk
+
+
+node = Node()
+buffer_text = ""
+ctrl = False
+submitted_text = []
+cursor = 0
+
+NODE_TOPIC = ["record", "send", "ask", "change"]
+
+with keyboard.Events() as events:
+    while True:
+        dora_event = node.next(0.01)
+        if (
+            dora_event is not None
+            and dora_event["type"] == "INPUT"
+            and dora_event["id"] == "recording"
+        ):
+            buffer_text += dora_event["value"][0].as_py()
+            node.send_output("buffer", pa.array([buffer_text]))
+            continue
+
+        event = events.get(1.0)
+        if event is not None and isinstance(event, Events.Press):
+            if hasattr(event.key, "char"):
+                cursor = 0
+                if ctrl and event.key.char == "v":
+                    r = Tk()
+                    r.update()
+                    try:
+                        selection = r.clipboard_get()
+                        r.withdraw()
+                        r.update()
+                    except tk.TclError:
+                        selection = ""
+                    r.destroy()
+                    buffer_text += selection
+                    node.send_output("buffer", pa.array([buffer_text]))
+                elif ctrl and event.key.char == "c":
+                    r = Tk()
+                    r.clipboard_clear()
+                    r.clipboard_append(buffer_text)
+                    r.update()
+                    r.destroy()
+                elif ctrl and event.key.char == "x":
+                    r = Tk()
+                    r.clipboard_clear()
+                    r.clipboard_append(buffer_text)
+                    r.update()
+                    r.destroy()
+                    buffer_text = ""
+                    node.send_output("buffer", pa.array([buffer_text]))
+                else:
+                    buffer_text += event.key.char
+                    node.send_output("buffer", pa.array([buffer_text]))
+            else:
+                if event.key == Key.backspace:
+                    buffer_text = buffer_text[:-1]
+                    node.send_output("buffer", pa.array([buffer_text]))
+                elif event.key == Key.esc:
+                    buffer_text = ""
+                    node.send_output("buffer", pa.array([buffer_text]))
+                elif event.key == Key.enter:
+                    node.send_output("submitted", pa.array([buffer_text]))
+                    first_word = buffer_text.split(" ")[0]
+                    if first_word in NODE_TOPIC:
+                        node.send_output(first_word, pa.array([buffer_text]))
+                    submitted_text.append(buffer_text)
+                    buffer_text = ""
+                    node.send_output("buffer", pa.array([buffer_text]))
+                elif event.key == Key.ctrl:
+                    ctrl = True
+                elif event.key == Key.space:
+                    buffer_text += " "
+                    node.send_output("buffer", pa.array([buffer_text]))
+                elif event.key == Key.up:
+                    if len(submitted_text) > 0:
+                        cursor = max(cursor - 1, -len(submitted_text))
+                        buffer_text = submitted_text[cursor]
+                        node.send_output("buffer", pa.array([buffer_text]))
+                elif event.key == Key.down:
+                    if len(submitted_text) > 0:
+                        cursor = min(cursor + 1, 0)
+                        buffer_text = submitted_text[cursor]
+                        node.send_output("buffer", pa.array([buffer_text]))
+        elif event is not None and isinstance(event, Events.Release):
+            if event.key == Key.ctrl:
+                ctrl = False
diff --git a/examples/python-operator-dataflow/llm_op.py b/examples/python-operator-dataflow/llm_op.py
new file mode 100644
index 000000000..2d0510962
--- /dev/null
+++ b/examples/python-operator-dataflow/llm_op.py
@@ -0,0 +1,319 @@
+from dora import DoraStatus
+import pylcs
+import os
+import pyarrow as pa
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import json
+
+import re
+import time
+
+MODEL_NAME_OR_PATH = "TheBloke/deepseek-coder-6.7B-instruct-GPTQ"
+# MODEL_NAME_OR_PATH = "hanspeterlyngsoeraaschoujensen/deepseek-math-7b-instruct-GPTQ"
+
+CODE_MODIFIER_TEMPLATE = """
+### Instruction
+Respond with the small modified code only. No explaination.
+
+```python
+{code}
+```
+
+{user_message}
+
+### Response:
+"""
+
+
+MESSAGE_SENDER_TEMPLATE = """
+### Instruction
+You're a json expert. Format your response as a json with a topic and a data field in a ```json block.  No explaination needed. No code needed.
+The schema for those json are:
+- line: Int[4]
+
+The response should look like this: 
+```json
+
+ [
+  {{ "topic": "line", "data": [10, 10, 90, 10] }},
+]
+```
+
+{user_message}
+
+### Response:
+"""
+
+ASSISTANT_TEMPLATE = """
+### Instruction
+You're a helpuf assistant named dora. 
+Reply with a short message. No code needed. 
+
+User {user_message}
+
+### Response:
+"""
+
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME_OR_PATH,
+    device_map="auto",
+    trust_remote_code=True,
+    revision="main",
+)
+
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH, use_fast=True)
+
+
+def extract_python_code_blocks(text):
+    """
+    Extracts Python code blocks from the given text that are enclosed in triple backticks with a python language identifier.
+
+    Parameters:
+    - text: A string that may contain one or more Python code blocks.
+
+    Returns:
+    - A list of strings, where each string is a block of Python code extracted from the text.
+    """
+    pattern = r"```python\n(.*?)\n```"
+    matches = re.findall(pattern, text, re.DOTALL)
+    if len(matches) == 0:
+        pattern = r"```python\n(.*?)(?:\n```|$)"
+        matches = re.findall(pattern, text, re.DOTALL)
+        if len(matches) == 0:
+            return [text]
+
+    return matches
+
+
+def extract_json_code_blocks(text):
+    """
+    Extracts json code blocks from the given text that are enclosed in triple backticks with a json language identifier.
+
+    Parameters:
+    - text: A string that may contain one or more json code blocks.
+
+    Returns:
+    - A list of strings, where each string is a block of json code extracted from the text.
+    """
+    pattern = r"```json\n(.*?)\n```"
+    matches = re.findall(pattern, text, re.DOTALL)
+    if len(matches) == 0:
+        pattern = r"```json\n(.*?)(?:\n```|$)"
+        matches = re.findall(pattern, text, re.DOTALL)
+        if len(matches) == 0:
+            return [text]
+
+    return matches
+
+
+def remove_last_line(python_code):
+    """
+    Removes the last line from a given string of Python code.
+
+    Parameters:
+    - python_code: A string representing Python source code.
+
+    Returns:
+    - A string with the last line removed.
+    """
+    lines = python_code.split("\n")  # Split the string into lines
+    if lines:  # Check if there are any lines to remove
+        lines.pop()  # Remove the last line
+    return "\n".join(lines)  # Join the remaining lines back into a string
+
+
+def calculate_similarity(source, target):
+    """
+    Calculate a similarity score between the source and target strings.
+    This uses the edit distance relative to the length of the strings.
+    """
+    edit_distance = pylcs.edit_distance(source, target)
+    max_length = max(len(source), len(target))
+    # Normalize the score by the maximum possible edit distance (the length of the longer string)
+    similarity = 1 - (edit_distance / max_length)
+    return similarity
+
+
+def find_best_match_location(source_code, target_block):
+    """
+    Find the best match for the target_block within the source_code by searching line by line,
+    considering blocks of varying lengths.
+    """
+    source_lines = source_code.split("\n")
+    target_lines = target_block.split("\n")
+
+    best_similarity = 0
+    best_start_index = 0
+    best_end_index = -1
+
+    # Iterate over the source lines to find the best matching range for all lines in target_block
+    for start_index in range(len(source_lines) - len(target_lines) + 1):
+        for end_index in range(start_index + len(target_lines), len(source_lines) + 1):
+            current_window = "\n".join(source_lines[start_index:end_index])
+            current_similarity = calculate_similarity(current_window, target_block)
+            if current_similarity > best_similarity:
+                best_similarity = current_similarity
+                best_start_index = start_index
+                best_end_index = end_index
+
+    # Convert line indices back to character indices for replacement
+    char_start_index = len("\n".join(source_lines[:best_start_index])) + (
+        1 if best_start_index > 0 else 0
+    )
+    char_end_index = len("\n".join(source_lines[:best_end_index]))
+
+    return char_start_index, char_end_index
+
+
+def replace_code_in_source(source_code, replacement_block: str):
+    """
+    Replace the best matching block in the source_code with the replacement_block, considering variable block lengths.
+    """
+    replacement_block = extract_python_code_blocks(replacement_block)[0]
+    replacement_block = remove_last_line(replacement_block)
+    start_index, end_index = find_best_match_location(source_code, replacement_block)
+    if start_index != -1 and end_index != -1:
+        # Replace the best matching part with the replacement block
+        new_source = (
+            source_code[:start_index] + replacement_block + source_code[end_index:]
+        )
+        return new_source
+    else:
+        return source_code
+
+
+class Operator:
+
+    def on_event(
+        self,
+        dora_event,
+        send_output,
+    ) -> DoraStatus:
+        if dora_event["type"] == "INPUT" and dora_event["id"] == "code_modifier":
+            input = dora_event["value"][0].as_py()
+
+            with open(input["path"], "r", encoding="utf8") as f:
+                code = f.read()
+
+            user_message = input["user_message"]
+            start_llm = time.time()
+            output = self.ask_llm(
+                CODE_MODIFIER_TEMPLATE.format(code=code, user_message=user_message)
+            )
+
+            source_code = replace_code_in_source(code, output)
+            print("response time:", time.time() - start_llm, flush=True)
+            send_output(
+                "modified_file",
+                pa.array(
+                    [
+                        {
+                            "raw": source_code,
+                            "path": input["path"],
+                            "response": output,
+                            "prompt": input["user_message"],
+                        }
+                    ]
+                ),
+                dora_event["metadata"],
+            )
+            print("response: ", output, flush=True)
+            send_output(
+                "assistant_message",
+                pa.array([output]),
+                dora_event["metadata"],
+            )
+        elif dora_event["type"] == "INPUT" and dora_event["id"] == "message_sender":
+            user_message = dora_event["value"][0].as_py()
+            output = self.ask_llm(
+                MESSAGE_SENDER_TEMPLATE.format(user_message=user_message)
+            )
+            outputs = extract_json_code_blocks(output)[0]
+            try:
+                outputs = json.loads(outputs)
+                if not isinstance(outputs, list):
+                    outputs = [outputs]
+                for output in outputs:
+                    if not isinstance(output["data"], list):
+                        output["data"] = [output["data"]]
+
+                    if output["topic"] in [
+                        "line",
+                    ]:
+                        send_output(
+                            output["topic"],
+                            pa.array(output["data"]),
+                            dora_event["metadata"],
+                        )
+                    else:
+                        print("Could not find the topic: {}".format(output["topic"]))
+            except:
+                print("Could not parse json")
+            # if data is not iterable, put data in a list
+        elif dora_event["type"] == "INPUT" and dora_event["id"] == "assistant":
+            user_message = dora_event["value"][0].as_py()
+            output = self.ask_llm(ASSISTANT_TEMPLATE.format(user_message=user_message))
+            send_output(
+                "assistant_message",
+                pa.array([output]),
+                dora_event["metadata"],
+            )
+        return DoraStatus.CONTINUE
+
+    def ask_llm(self, prompt):
+
+        # Generate output
+        # prompt = PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt))
+        input = tokenizer(prompt, return_tensors="pt")
+        input_ids = input.input_ids.cuda()
+
+        # add attention mask here
+        attention_mask = input["attention_mask"]
+
+        output = model.generate(
+            inputs=input_ids,
+            temperature=0.7,
+            do_sample=True,
+            top_p=0.95,
+            top_k=40,
+            max_new_tokens=512,
+            attention_mask=attention_mask,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+        # Get the tokens from the output, decode them, print them
+
+        # Get text between im_start and im_end
+        return tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt) :]
+
+
+if __name__ == "__main__":
+    op = Operator()
+
+    # Path to the current file
+    current_file_path = __file__
+
+    # Directory of the current file
+    current_directory = os.path.dirname(current_file_path)
+
+    path = current_directory + "object_detection.py"
+    with open(path, "r", encoding="utf8") as f:
+        raw = f.read()
+
+    op.on_event(
+        {
+            "type": "INPUT",
+            "id": "message_sender",
+            "value": pa.array(
+                [
+                    {
+                        "path": path,
+                        "user_message": "send a star ",
+                    },
+                ]
+            ),
+            "metadata": [],
+        },
+        print,
+    )
diff --git a/examples/python-operator-dataflow/merge.py b/examples/python-operator-dataflow/merge.py
new file mode 100644
index 000000000..cd60745d5
--- /dev/null
+++ b/examples/python-operator-dataflow/merge.py
@@ -0,0 +1,12 @@
+import pyarrow as pa
+
+with pa.memory_map("image.arrow", "r") as source:
+    df_i = pa.ipc.open_file(source).read_all()
+
+with pa.memory_map("bbox.arrow", "r") as source:
+    df_b = pa.ipc.open_file(source).read_all()
+
+df_i = df_i.to_pandas()
+df_b = df_b.to_pandas()
+
+df = df_i.merge(df_b, on="trace_id")
diff --git a/examples/python-operator-dataflow/microphone_op.py b/examples/python-operator-dataflow/microphone_op.py
new file mode 100644
index 000000000..6c3b37ad2
--- /dev/null
+++ b/examples/python-operator-dataflow/microphone_op.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pyarrow as pa
+import sounddevice as sd
+
+from dora import DoraStatus
+
+# Set the parameters for recording
+SAMPLE_RATE = 16000
+MAX_DURATION = 5
+
+
+class Operator:
+    """
+    Microphone operator that records the audio
+    """
+
+    def on_event(
+        self,
+        dora_event,
+        send_output,
+    ) -> DoraStatus:
+        if dora_event["type"] == "INPUT":
+            audio_data = sd.rec(
+                int(SAMPLE_RATE * MAX_DURATION),
+                samplerate=SAMPLE_RATE,
+                channels=1,
+                dtype=np.int16,
+                blocking=True,
+            )
+
+            audio_data = audio_data.ravel().astype(np.float32) / 32768.0
+            if len(audio_data) > 0:
+                send_output("audio", pa.array(audio_data), dora_event["metadata"])
+        elif dora_event["type"] == "INPUT":
+            print("Microphone is not recording", dora_event["value"][0].as_py())
+        return DoraStatus.CONTINUE
diff --git a/examples/python-operator-dataflow/object_detection.py b/examples/python-operator-dataflow/object_detection.py
index a6bd9cafb..087dfc27b 100755
--- a/examples/python-operator-dataflow/object_detection.py
+++ b/examples/python-operator-dataflow/object_detection.py
@@ -1,61 +1,40 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-
 import numpy as np
 import pyarrow as pa
 
 from dora import DoraStatus
 from ultralytics import YOLO
 
-pa.array([])
 
 CAMERA_WIDTH = 640
 CAMERA_HEIGHT = 480
 
 
+model = YOLO("yolov8n.pt")
+
+
 class Operator:
     """
     Infering object from images
     """
 
-    def __init__(self):
-        self.model = YOLO("yolov8n.pt")
-
     def on_event(
         self,
         dora_event,
         send_output,
     ) -> DoraStatus:
         if dora_event["type"] == "INPUT":
-            return self.on_input(dora_event, send_output)
-        return DoraStatus.CONTINUE
+            frame = (
+                dora_event["value"].to_numpy().reshape((CAMERA_HEIGHT, CAMERA_WIDTH, 3))
+            )
+            frame = frame[:, :, ::-1]  # OpenCV image (BGR to RGB)
+            results = model(frame, verbose=False)  # includes NMS
+            # Process results
+            boxes = np.array(results[0].boxes.xyxy.cpu())
+            conf = np.array(results[0].boxes.conf.cpu())
+            label = np.array(results[0].boxes.cls.cpu())
+            # concatenate them together
+            arrays = np.concatenate((boxes, conf[:, None], label[:, None]), axis=1)
+
+            send_output("bbox", pa.array(arrays.ravel()), dora_event["metadata"])
 
-    def on_input(
-        self,
-        dora_input,
-        send_output,
-    ) -> DoraStatus:
-        """Handle image
-        Args:
-            dora_input (dict) containing the "id", value, and "metadata"
-            send_output Callable[[str, bytes | pa.Array, Optional[dict]], None]:
-                Function for sending output to the dataflow:
-                - First argument is the `output_id`
-                - Second argument is the data as either bytes or `pa.Array`
-                - Third argument is dora metadata dict
-                e.g.: `send_output("bbox", pa.array([100], type=pa.uint8()), dora_event["metadata"])`
-        """
-
-        frame = dora_input["value"].to_numpy().reshape((CAMERA_HEIGHT, CAMERA_WIDTH, 3))
-        frame = frame[:, :, ::-1]  # OpenCV image (BGR to RGB)
-        results = self.model(frame)  # includes NMS
-        # Process results
-        boxes = np.array(results[0].boxes.xyxy.cpu())
-        conf = np.array(results[0].boxes.conf.cpu())
-        label = np.array(results[0].boxes.cls.cpu())
-        # concatenate them together
-        arrays = np.concatenate((boxes, conf[:, None], label[:, None]), axis=1)
-
-        send_output("bbox", pa.array(arrays.ravel()), dora_input["metadata"])
         return DoraStatus.CONTINUE
diff --git a/examples/python-operator-dataflow/plot.py b/examples/python-operator-dataflow/plot.py
index f7f2ddd02..180af6fa4 100755
--- a/examples/python-operator-dataflow/plot.py
+++ b/examples/python-operator-dataflow/plot.py
@@ -1,22 +1,8 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import os
-
 import cv2
-import numpy as np
-import pyarrow as pa
-
-from dora import DoraStatus
-from utils import LABELS
-
-pa.array([])
 
-CI = os.environ.get("CI")
-CAMERA_WIDTH = 640
-CAMERA_HEIGHT = 480
 
-font = cv2.FONT_HERSHEY_SIMPLEX
+from dora import DoraStatus
+from utils import LABELS, put_text, CAMERA_HEIGHT, CAMERA_WIDTH, FONT, CI
 
 
 class Operator:
@@ -25,114 +11,91 @@ class Operator:
     """
 
     def __init__(self):
-        self.image = []
         self.bboxs = []
-        self.bounding_box_messages = 0
-        self.image_messages = 0
-        self.object_detection_stdout = []
+        self.buffer = ""
+        self.submitted = []
+        self.lines = []
 
     def on_event(
         self,
         dora_event,
         send_output,
-    ) -> DoraStatus:
+    ):
         if dora_event["type"] == "INPUT":
-            return self.on_input(dora_event, send_output)
-        return DoraStatus.CONTINUE
-
-    def on_input(
-        self,
-        dora_input,
-        send_output,
-    ) -> DoraStatus:
-        """
-        Put image and bounding box on cv2 window.
-
-        Args:
-            dora_input["id"] (str): Id of the dora_input declared in the yaml configuration
-            dora_input["value"] (arrow array): message of the dora_input
-            send_output Callable[[str, bytes | pa.Array, Optional[dict]], None]:
-                Function for sending output to the dataflow:
-                - First argument is the `output_id`
-                - Second argument is the data as either bytes or `pa.Array`
-                - Third argument is dora metadata dict
-                e.g.: `send_output("bbox", pa.array([100], type=pa.uint8()), dora_event["metadata"])`
-        """
-        if dora_input["id"] == "image":
-            frame = (
-                dora_input["value"]
-                .to_numpy()
-                .reshape((CAMERA_HEIGHT, CAMERA_WIDTH, 3))
-                .copy()  # copy the image because we want to modify it below
-            )
-            self.image = frame
-
-            self.image_messages += 1
-            print("received " + str(self.image_messages) + " images")
-
-        elif dora_input["id"] == "object_detection_stdout":
-            stdout = dora_input["value"][0].as_py()
-            self.object_detection_stdout += [stdout]
-            ## Only keep last 10 stdout
-            self.object_detection_stdout = self.object_detection_stdout[-10:]
-            return DoraStatus.CONTINUE
-
-        elif dora_input["id"] == "bbox" and len(self.image) != 0:
-            bboxs = dora_input["value"].to_numpy()
-            self.bboxs = np.reshape(bboxs, (-1, 6))
-
-            self.bounding_box_messages += 1
-            print("received " + str(self.bounding_box_messages) + " bounding boxes")
-            return DoraStatus.CONTINUE
-        else:
-            return DoraStatus.CONTINUE
-
-        for bbox in self.bboxs:
-            [
-                min_x,
-                min_y,
-                max_x,
-                max_y,
-                confidence,
-                label,
-            ] = bbox
-            cv2.rectangle(
-                self.image,
-                (int(min_x), int(min_y)),
-                (int(max_x), int(max_y)),
-                (0, 255, 0),
-                2,
-            )
-
-            cv2.putText(
-                self.image,
-                LABELS[int(label)] + f", {confidence:0.2f}",
-                (int(max_x), int(max_y)),
-                font,
-                0.75,
-                (0, 255, 0),
-                2,
-                1,
-            )
-
-        for i, log in enumerate(self.object_detection_stdout):
-            cv2.putText(
-                self.image,
-                log,
-                (10, 10 + 20 * i),
-                font,
-                0.5,
-                (0, 255, 0),
-                2,
-                1,
-            )
-
-        if CI != "true":
-            cv2.imshow("frame", self.image)
-            if cv2.waitKey(1) & 0xFF == ord("q"):
-                return DoraStatus.STOP
+            id = dora_event["id"]
+            value = dora_event["value"]
+            if id == "image":
+
+                image = (
+                    value.to_numpy().reshape((CAMERA_HEIGHT, CAMERA_WIDTH, 3)).copy()
+                )
+
+                for bbox in self.bboxs:
+                    [
+                        min_x,
+                        min_y,
+                        max_x,
+                        max_y,
+                        confidence,
+                        label,
+                    ] = bbox
+                    cv2.rectangle(
+                        image,
+                        (int(min_x), int(min_y)),
+                        (int(max_x), int(max_y)),
+                        (0, 255, 0),
+                    )
+                    cv2.putText(
+                        image,
+                        f"{LABELS[int(label)]}, {confidence:0.2f}",
+                        (int(max_x), int(max_y)),
+                        FONT,
+                        0.45,
+                        (0, 255, 0),
+                        2,
+                        1,
+                    )
+
+                put_text(
+                    image,
+                    self.buffer,
+                    (20, 12 * 25),
+                    (190, 250, 0),
+                )
+
+                for i, text in enumerate(self.submitted[::-1]):
+                    put_text(
+                        image,
+                        text["content"],
+                        (20, 25 + (10 - i) * 25),
+                        (0, 255, 190),
+                    )
+
+                for line in self.lines:
+                    cv2.line(
+                        image,
+                        (int(line[0]), int(line[1])),
+                        (int(line[2]), int(line[3])),
+                        (0, 0, 255),
+                        2,
+                    )
+
+                if CI != "true":
+                    cv2.imshow("frame", image)
+                    if cv2.waitKey(1) & 0xFF == ord("q"):
+                        return DoraStatus.STOP
+            elif id == "bbox":
+                self.bboxs = value.to_numpy().reshape((-1, 6))
+            elif id == "keyboard_buffer":
+                self.buffer = value[0].as_py()
+            elif id == "line":
+                self.lines += [value.to_pylist()]
+            elif "message" in id:
+                self.submitted += [
+                    {
+                        "role": id,
+                        "content": value[0].as_py(),
+                    }
+                ]
 
         return DoraStatus.CONTINUE
-
-    def __del__(self):
-        cv2.destroyAllWindows()
diff --git a/examples/python-operator-dataflow/sentence_transformers_op.py b/examples/python-operator-dataflow/sentence_transformers_op.py
new file mode 100644
index 000000000..cd045211d
--- /dev/null
+++ b/examples/python-operator-dataflow/sentence_transformers_op.py
@@ -0,0 +1,100 @@
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import util
+
+from dora import DoraStatus
+import os
+import sys
+import inspect
+import torch
+import pyarrow as pa
+
+SHOULD_NOT_BE_INCLUDED = [
+    "utils.py",
+    "sentence_transformers_op.py",
+    "chatgpt_op.py",
+    "llm_op.py",
+]
+
+SHOULD_BE_INCLUDED = [
+    "webcam.py",
+    "object_detection.py",
+    "plot.py",
+]
+
+
+## Get all python files path in given directory
+def get_all_functions(path):
+    raw = []
+    paths = []
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            if file.endswith(".py"):
+                if file not in SHOULD_BE_INCLUDED:
+                    continue
+                path = os.path.join(root, file)
+                with open(path, "r", encoding="utf8") as f:
+                    ## add file folder to system path
+                    sys.path.append(root)
+                    ## import module from path
+                    raw.append(f.read())
+                    paths.append(path)
+
+    return raw, paths
+
+
+def search(query_embedding, corpus_embeddings, paths, raw, k=5, file_extension=None):
+    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
+    top_results = torch.topk(cos_scores, k=min(k, len(cos_scores)), sorted=True)
+    out = []
+    for score, idx in zip(top_results[0], top_results[1]):
+        out.extend([raw[idx], paths[idx], score])
+    return out
+
+
+class Operator:
+    """ """
+
+    def __init__(self):
+        ## TODO: Add a initialisation step
+        self.model = SentenceTransformer("BAAI/bge-large-en-v1.5")
+        self.encoding = []
+        # file directory
+        path = os.path.dirname(os.path.abspath(__file__))
+
+        self.raw, self.path = get_all_functions(path)
+        # Encode all files
+        self.encoding = self.model.encode(self.raw)
+
+    def on_event(
+        self,
+        dora_event,
+        send_output,
+    ) -> DoraStatus:
+        if dora_event["type"] == "INPUT":
+            if dora_event["id"] == "query":
+                values = dora_event["value"].to_pylist()
+
+                query_embeddings = self.model.encode(values)
+                output = search(
+                    query_embeddings,
+                    self.encoding,
+                    self.path,
+                    self.raw,
+                )
+                [raw, path, score] = output[0:3]
+                send_output(
+                    "raw_file",
+                    pa.array([{"raw": raw, "path": path, "user_message": values[0]}]),
+                    dora_event["metadata"],
+                )
+            else:
+                input = dora_event["value"][0].as_py()
+                index = self.path.index(input["path"])
+                self.raw[index] = input["raw"]
+                self.encoding[index] = self.model.encode([input["raw"]])[0]
+
+        return DoraStatus.CONTINUE
+
+
+if __name__ == "__main__":
+    operator = Operator()
diff --git a/examples/python-operator-dataflow/utils.py b/examples/python-operator-dataflow/utils.py
index dabc915e1..4ec04ed09 100644
--- a/examples/python-operator-dataflow/utils.py
+++ b/examples/python-operator-dataflow/utils.py
@@ -1,5 +1,28 @@
+import os
+import cv2
+
+
+def put_text(image, text, position, color):
+    cv2.putText(
+        image,
+        text,
+        position,
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.45,
+        color,
+        2,
+        1,
+    )
+
+
+CI = os.environ.get("CI")
+
+CAMERA_WIDTH = 640
+CAMERA_HEIGHT = 480
+
+FONT = cv2.FONT_HERSHEY_SIMPLEX
 LABELS = [
-    "ABC",
+    "person",
     "bicycle",
     "car",
     "motorcycle",
diff --git a/examples/python-operator-dataflow/webcam.py b/examples/python-operator-dataflow/webcam.py
index 8b55db404..9867ad7c4 100755
--- a/examples/python-operator-dataflow/webcam.py
+++ b/examples/python-operator-dataflow/webcam.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
 import os
 import time
 
@@ -13,6 +10,7 @@
 CAMERA_WIDTH = 640
 CAMERA_HEIGHT = 480
 CAMERA_INDEX = int(os.getenv("CAMERA_INDEX", 0))
+CI = os.environ.get("CI")
 
 font = cv2.FONT_HERSHEY_SIMPLEX
 
@@ -27,6 +25,7 @@ def __init__(self):
         self.start_time = time.time()
         self.video_capture.set(cv2.CAP_PROP_FRAME_WIDTH, CAMERA_WIDTH)
         self.video_capture.set(cv2.CAP_PROP_FRAME_HEIGHT, CAMERA_HEIGHT)
+        self.failure_count = 0
 
     def on_event(
         self,
@@ -38,20 +37,22 @@ def on_event(
             ret, frame = self.video_capture.read()
             if ret:
                 frame = cv2.resize(frame, (CAMERA_WIDTH, CAMERA_HEIGHT))
-
+                self.failure_count = 0
             ## Push an error image in case the camera is not available.
             else:
-                frame = np.zeros((CAMERA_HEIGHT, CAMERA_WIDTH, 3), dtype=np.uint8)
-                cv2.putText(
-                    frame,
-                    "No Webcam was found at index %d" % (CAMERA_INDEX),
-                    (int(30), int(30)),
-                    font,
-                    0.75,
-                    (255, 255, 255),
-                    2,
-                    1,
-                )
+                if self.failure_count > 10:
+                    frame = np.zeros((CAMERA_HEIGHT, CAMERA_WIDTH, 3), dtype=np.uint8)
+                    cv2.putText(
+                        frame,
+                        "No Webcam was found at index %d" % (CAMERA_INDEX),
+                        (int(30), int(30)),
+                        font,
+                        0.75,
+                        (255, 255, 255),
+                        2,
+                        1,
+                    )
+                    self.failure_count += 1
 
             send_output(
                 "image",
@@ -63,7 +64,7 @@ def on_event(
         else:
             print("received unexpected event:", event_type)
 
-        if time.time() - self.start_time < 20:
+        if time.time() - self.start_time < 200 or CI != "true":
             return DoraStatus.CONTINUE
         else:
             return DoraStatus.STOP
diff --git a/examples/python-operator-dataflow/whisper_op.py b/examples/python-operator-dataflow/whisper_op.py
new file mode 100644
index 000000000..feab8b92e
--- /dev/null
+++ b/examples/python-operator-dataflow/whisper_op.py
@@ -0,0 +1,25 @@
+import pyarrow as pa
+import whisper
+
+from dora import DoraStatus
+
+
+model = whisper.load_model("base")
+
+
+class Operator:
+    """
+    Transforming Speech to Text using OpenAI Whisper model
+    """
+
+    def on_event(
+        self,
+        dora_event,
+        send_output,
+    ) -> DoraStatus:
+        if dora_event["type"] == "INPUT":
+            audio = dora_event["value"].to_numpy()
+            audio = whisper.pad_or_trim(audio)
+            result = model.transcribe(audio, language="en")
+            send_output("text", pa.array([result["text"]]), dora_event["metadata"])
+        return DoraStatus.CONTINUE

From e27a88713bf2377b9b6df4f3e8d0a4e8c3122391 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Mon, 25 Mar 2024 17:11:51 +0100
Subject: [PATCH 2/9] adding requirements

---
 examples/python-operator-dataflow/requirements.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/python-operator-dataflow/requirements.txt b/examples/python-operator-dataflow/requirements.txt
index a8f02b0a0..4f0330c64 100644
--- a/examples/python-operator-dataflow/requirements.txt
+++ b/examples/python-operator-dataflow/requirements.txt
@@ -44,4 +44,10 @@ seaborn>=0.11.0
 # roboflow
 
 opencv-python>=4.1.1
-maturin
\ No newline at end of file
+maturin
+
+whisper
+sounddevice
+pynput
+sentence-transformers
+transformers
\ No newline at end of file

From c382aa55cab21f7c1cbece3fb5363c9b5d1e97b4 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Tue, 2 Apr 2024 18:04:24 +0200
Subject: [PATCH 3/9] Fix added operator

---
 examples/python-operator-dataflow/README.md   | 135 +++++++++++++++++-
 .../python-operator-dataflow/dataflow.yml     |   3 +
 .../python-operator-dataflow/dataflow_llm.yml |  18 +++
 .../dataflow_record.yml                       |  87 -----------
 .../python-operator-dataflow/keyboard_op.py   |  33 +----
 examples/python-operator-dataflow/llm_op.py   |  41 +++---
 examples/python-operator-dataflow/plot.py     |  45 ++++--
 .../python-operator-dataflow/requirements.txt |   8 +-
 examples/python-operator-dataflow/utils.py    |  23 ---
 examples/python-operator-dataflow/webcam.py   |   2 +
 10 files changed, 209 insertions(+), 186 deletions(-)
 delete mode 100644 examples/python-operator-dataflow/dataflow_record.yml

diff --git a/examples/python-operator-dataflow/README.md b/examples/python-operator-dataflow/README.md
index 3c4bd335d..e3b6eb0f3 100644
--- a/examples/python-operator-dataflow/README.md
+++ b/examples/python-operator-dataflow/README.md
@@ -13,21 +13,144 @@ The [`dataflow.yml`](./dataflow.yml) defines a simple dataflow graph with the fo
 ## Getting started
 
 ```bash
-cargo run --example python-dataflow 
+cargo run --example python-operator-dataflow
 ```
 
 ## Installation
 
-To install, you should run the `install.sh` script.
+```bash
+conda create -n example_env python=3.12
+pip install -r requirements.txt
+```
+
+## Run the dataflow
+
+- Start the object detection dataflow alone:
+
+```bash
+dora start dataflow.yml
+```
+
+- Start the llm dataflow:
 
 ```bash
-install.sh
+dora start dataflow_llm.yml
 ```
 
-## Run the dataflow as a standalone
+Within the window you can ask question such as:
 
-- Start the `dora-coordinator`:
+```bash
+ask how are you
+change bounding box plot to red
+change confidence value to percentage
+change object detection to only detect person
+send 200 200 200 400 to topic line
+record
+```
 
+```bash
+wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/keyboard_op.py
+wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/microphone_op.py
+wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/whisper_op.py
+wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/sentence_transformers_op.py
+wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/llm_op.py
+wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/file_saver_op.py
 ```
-../../target/release/dora-daemon --run-dataflow dataflow.yml
+
+and adding the following to the dataflow configuration:
+
+```yaml
+nodes:
+  - id: webcam
+    operator:
+      python: webcam.py
+      inputs:
+        tick: dora/timer/millis/50
+      outputs:
+        - image
+
+  - id: object_detection
+    operator:
+      python: object_detection.py
+      inputs:
+        image: webcam/image
+      outputs:
+        - bbox
+
+  - id: plot
+    operator:
+      python: plot.py
+      inputs:
+        image: webcam/image
+        bbox: object_detection/bbox
+        line: llm/line
+        keyboard_buffer: keyboard/buffer
+        user_message: keyboard/submitted
+        assistant_message: llm/assistant_message
+
+  ## Speech to text
+  - id: keyboard
+    custom:
+      source: keyboard_op.py
+      outputs:
+        - buffer
+        - submitted
+        - record
+        - ask
+        - send
+        - change
+      inputs:
+        recording: whisper/text
+
+  - id: microphone
+    operator:
+      python: microphone_op.py
+      inputs:
+        record: keyboard/record
+      outputs:
+        - audio
+
+  - id: whisper
+    operator:
+      python: whisper_op.py
+      inputs:
+        audio: microphone/audio
+      outputs:
+        - text
+
+  ## Code Modifier
+  - id: vectordb
+    operator:
+      python: sentence_transformers_op.py
+      inputs:
+        query: keyboard/change
+        saved_file: file_saver/saved_file
+      outputs:
+        - raw_file
+
+  - id: llm
+    operator:
+      python: llm_op.py
+      inputs:
+        code_modifier: vectordb/raw_file
+        assistant: keyboard/ask
+        message_sender: keyboard/send
+      outputs:
+        - modified_file
+        - line
+        - assistant_message
+
+  - id: file_saver
+    operator:
+      python: file_saver_op.py
+      inputs:
+        file: llm/modified_file
+      outputs:
+        - saved_file
 ```
+
+The keyboard, microphone, whisper node, works in a very similar fashion as the object detection dataflow and I'll let you check it out by yourself.
+
+The code modification flow works by first comparing an instruction with a vectordb of operators source code and then feeding the most similar operator to an llm with the instruction for code modification.
+
+The end result is then saved using the file saver
diff --git a/examples/python-operator-dataflow/dataflow.yml b/examples/python-operator-dataflow/dataflow.yml
index 92bf5f2b3..400f881f5 100644
--- a/examples/python-operator-dataflow/dataflow.yml
+++ b/examples/python-operator-dataflow/dataflow.yml
@@ -9,11 +9,13 @@ nodes:
 
   - id: object_detection
     operator:
+      send_stdout_as: stdout
       python: object_detection.py
       inputs:
         image: webcam/image
       outputs:
         - bbox
+        - stdout
 
   - id: plot
     operator:
@@ -21,3 +23,4 @@ nodes:
       inputs:
         image: webcam/image
         bbox: object_detection/bbox
+        assistant_message: object_detection/stdout
diff --git a/examples/python-operator-dataflow/dataflow_llm.yml b/examples/python-operator-dataflow/dataflow_llm.yml
index 6d3be9a58..faf978af4 100644
--- a/examples/python-operator-dataflow/dataflow_llm.yml
+++ b/examples/python-operator-dataflow/dataflow_llm.yml
@@ -37,6 +37,24 @@ nodes:
         - ask
         - send
         - change
+      inputs:
+        recording: whisper/text
+
+  - id: microphone
+    operator:
+      python: microphone_op.py
+      inputs:
+        record: keyboard/record
+      outputs:
+        - audio
+
+  - id: whisper
+    operator:
+      python: whisper_op.py
+      inputs:
+        audio: microphone/audio
+      outputs:
+        - text
 
   ## Code Modifier
   - id: vectordb
diff --git a/examples/python-operator-dataflow/dataflow_record.yml b/examples/python-operator-dataflow/dataflow_record.yml
deleted file mode 100644
index faf978af4..000000000
--- a/examples/python-operator-dataflow/dataflow_record.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-nodes:
-  - id: webcam
-    operator:
-      python: webcam.py
-      inputs:
-        tick: dora/timer/millis/50
-      outputs:
-        - image
-
-  - id: object_detection
-    operator:
-      python: object_detection.py
-      inputs:
-        image: webcam/image
-      outputs:
-        - bbox
-
-  - id: plot
-    operator:
-      python: plot.py
-      inputs:
-        image: webcam/image
-        bbox: object_detection/bbox
-        line: llm/line
-        keyboard_buffer: keyboard/buffer
-        user_message: keyboard/submitted
-        assistant_message: llm/assistant_message
-
-  ## Speech to text
-  - id: keyboard
-    custom:
-      source: keyboard_op.py
-      outputs:
-        - buffer
-        - submitted
-        - record
-        - ask
-        - send
-        - change
-      inputs:
-        recording: whisper/text
-
-  - id: microphone
-    operator:
-      python: microphone_op.py
-      inputs:
-        record: keyboard/record
-      outputs:
-        - audio
-
-  - id: whisper
-    operator:
-      python: whisper_op.py
-      inputs:
-        audio: microphone/audio
-      outputs:
-        - text
-
-  ## Code Modifier
-  - id: vectordb
-    operator:
-      python: sentence_transformers_op.py
-      inputs:
-        query: keyboard/change
-        saved_file: file_saver/saved_file
-      outputs:
-        - raw_file
-
-  - id: llm
-    operator:
-      python: llm_op.py
-      inputs:
-        code_modifier: vectordb/raw_file
-        assistant: keyboard/ask
-        message_sender: keyboard/send
-      outputs:
-        - modified_file
-        - line
-        - assistant_message
-
-  - id: file_saver
-    operator:
-      python: file_saver_op.py
-      inputs:
-        file: llm/modified_file
-      outputs:
-        - saved_file
\ No newline at end of file
diff --git a/examples/python-operator-dataflow/keyboard_op.py b/examples/python-operator-dataflow/keyboard_op.py
index 79a1cd4c3..2d179ac63 100644
--- a/examples/python-operator-dataflow/keyboard_op.py
+++ b/examples/python-operator-dataflow/keyboard_op.py
@@ -2,8 +2,6 @@
 from pynput.keyboard import Key, Events
 import pyarrow as pa
 from dora import Node
-from tkinter import Tk
-import tkinter as tk
 
 
 node = Node()
@@ -30,35 +28,8 @@
         if event is not None and isinstance(event, Events.Press):
             if hasattr(event.key, "char"):
                 cursor = 0
-                if ctrl and event.key.char == "v":
-                    r = Tk()
-                    r.update()
-                    try:
-                        selection = r.clipboard_get()
-                        r.withdraw()
-                        r.update()
-                    except tk.TclError:
-                        selection = ""
-                    r.destroy()
-                    buffer_text += selection
-                    node.send_output("buffer", pa.array([buffer_text]))
-                elif ctrl and event.key.char == "c":
-                    r = Tk()
-                    r.clipboard_clear()
-                    r.clipboard_append(buffer_text)
-                    r.update()
-                    r.destroy()
-                elif ctrl and event.key.char == "x":
-                    r = Tk()
-                    r.clipboard_clear()
-                    r.clipboard_append(buffer_text)
-                    r.update()
-                    r.destroy()
-                    buffer_text = ""
-                    node.send_output("buffer", pa.array([buffer_text]))
-                else:
-                    buffer_text += event.key.char
-                    node.send_output("buffer", pa.array([buffer_text]))
+                buffer_text += event.key.char
+                node.send_output("buffer", pa.array([buffer_text]))
             else:
                 if event.key == Key.backspace:
                     buffer_text = buffer_text[:-1]
diff --git a/examples/python-operator-dataflow/llm_op.py b/examples/python-operator-dataflow/llm_op.py
index 2d0510962..5f5957688 100644
--- a/examples/python-operator-dataflow/llm_op.py
+++ b/examples/python-operator-dataflow/llm_op.py
@@ -33,10 +33,7 @@
 
 The response should look like this: 
 ```json
-
- [
-  {{ "topic": "line", "data": [10, 10, 90, 10] }},
-]
+  {{ "topic": "line", "data": [10, 10, 90, 10] }}
 ```
 
 {user_message}
@@ -83,6 +80,8 @@ def extract_python_code_blocks(text):
         matches = re.findall(pattern, text, re.DOTALL)
         if len(matches) == 0:
             return [text]
+        else:
+            matches = [remove_last_line(matches[0])]
 
     return matches
 
@@ -172,7 +171,6 @@ def replace_code_in_source(source_code, replacement_block: str):
     Replace the best matching block in the source_code with the replacement_block, considering variable block lengths.
     """
     replacement_block = extract_python_code_blocks(replacement_block)[0]
-    replacement_block = remove_last_line(replacement_block)
     start_index, end_index = find_best_match_location(source_code, replacement_block)
     if start_index != -1 and end_index != -1:
         # Replace the best matching part with the replacement block
@@ -232,23 +230,20 @@ def on_event(
             )
             outputs = extract_json_code_blocks(output)[0]
             try:
-                outputs = json.loads(outputs)
-                if not isinstance(outputs, list):
-                    outputs = [outputs]
-                for output in outputs:
-                    if not isinstance(output["data"], list):
-                        output["data"] = [output["data"]]
-
-                    if output["topic"] in [
-                        "line",
-                    ]:
-                        send_output(
-                            output["topic"],
-                            pa.array(output["data"]),
-                            dora_event["metadata"],
-                        )
-                    else:
-                        print("Could not find the topic: {}".format(output["topic"]))
+                output = json.loads(outputs)
+                if not isinstance(output["data"], list):
+                    output["data"] = [output["data"]]
+
+                if output["topic"] in [
+                    "line",
+                ]:
+                    send_output(
+                        output["topic"],
+                        pa.array(output["data"]),
+                        dora_event["metadata"],
+                    )
+                else:
+                    print("Could not find the topic: {}".format(output["topic"]))
             except:
                 print("Could not parse json")
             # if data is not iterable, put data in a list
@@ -270,7 +265,7 @@ def ask_llm(self, prompt):
         input_ids = input.input_ids.cuda()
 
         # add attention mask here
-        attention_mask = input["attention_mask"]
+        attention_mask = input["attention_mask"].cuda()
 
         output = model.generate(
             inputs=input_ids,
diff --git a/examples/python-operator-dataflow/plot.py b/examples/python-operator-dataflow/plot.py
index 180af6fa4..c7b0a0bed 100755
--- a/examples/python-operator-dataflow/plot.py
+++ b/examples/python-operator-dataflow/plot.py
@@ -1,8 +1,17 @@
+import os
 import cv2
-
+import time
 
 from dora import DoraStatus
-from utils import LABELS, put_text, CAMERA_HEIGHT, CAMERA_WIDTH, FONT, CI
+from utils import LABELS
+
+
+CI = os.environ.get("CI")
+
+CAMERA_WIDTH = 640
+CAMERA_HEIGHT = 480
+
+FONT = cv2.FONT_HERSHEY_SIMPLEX
 
 
 class Operator:
@@ -50,26 +59,34 @@ def on_event(
                         f"{LABELS[int(label)]}, {confidence:0.2f}",
                         (int(max_x), int(max_y)),
                         FONT,
-                        0.45,
+                        0.5,
                         (0, 255, 0),
-                        2,
-                        1,
                     )
 
-                put_text(
-                    image,
-                    self.buffer,
-                    (20, 12 * 25),
-                    (190, 250, 0),
+                cv2.putText(
+                    image, self.buffer, (20, 14 + 21 * 14), FONT, 0.5, (190, 250, 0), 1
                 )
 
-                for i, text in enumerate(self.submitted[::-1]):
-                    put_text(
+                i = 0
+                for text in self.submitted[::-1]:
+                    color = (
+                        (0, 255, 190)
+                        if text["role"] == "user_message"
+                        else (0, 190, 255)
+                    )
+                    cv2.putText(
                         image,
                         text["content"],
-                        (20, 25 + (10 - i) * 25),
-                        (0, 255, 190),
+                        (
+                            20,
+                            14 + (19 - i) * 14,
+                        ),
+                        FONT,
+                        0.5,
+                        color,
+                        1,
                     )
+                    i += 1
 
                 for line in self.lines:
                     cv2.line(
diff --git a/examples/python-operator-dataflow/requirements.txt b/examples/python-operator-dataflow/requirements.txt
index 4f0330c64..3fb39579d 100644
--- a/examples/python-operator-dataflow/requirements.txt
+++ b/examples/python-operator-dataflow/requirements.txt
@@ -46,8 +46,12 @@ seaborn>=0.11.0
 opencv-python>=4.1.1
 maturin
 
-whisper
+openai-whisper
 sounddevice
 pynput
 sentence-transformers
-transformers
\ No newline at end of file
+transformers
+pylcs
+accelerate
+optimum
+auto-gptq>=0.7.1
\ No newline at end of file
diff --git a/examples/python-operator-dataflow/utils.py b/examples/python-operator-dataflow/utils.py
index 4ec04ed09..a40bd6d6f 100644
--- a/examples/python-operator-dataflow/utils.py
+++ b/examples/python-operator-dataflow/utils.py
@@ -1,26 +1,3 @@
-import os
-import cv2
-
-
-def put_text(image, text, position, color):
-    cv2.putText(
-        image,
-        text,
-        position,
-        cv2.FONT_HERSHEY_SIMPLEX,
-        0.45,
-        color,
-        2,
-        1,
-    )
-
-
-CI = os.environ.get("CI")
-
-CAMERA_WIDTH = 640
-CAMERA_HEIGHT = 480
-
-FONT = cv2.FONT_HERSHEY_SIMPLEX
 LABELS = [
     "person",
     "bicycle",
diff --git a/examples/python-operator-dataflow/webcam.py b/examples/python-operator-dataflow/webcam.py
index 9867ad7c4..c7d451919 100755
--- a/examples/python-operator-dataflow/webcam.py
+++ b/examples/python-operator-dataflow/webcam.py
@@ -52,7 +52,9 @@ def on_event(
                         2,
                         1,
                     )
+                else:
                     self.failure_count += 1
+                    return DoraStatus.CONTINUE
 
             send_output(
                 "image",

From 09340b744b6b7f1c59e94a57ebe9ddada634dc4c Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Tue, 2 Apr 2024 18:14:12 +0200
Subject: [PATCH 4/9] Minor fix

---
 examples/python-operator-dataflow/README.md   | 105 +-----------------
 .../python-operator-dataflow/dataflow_llm.yml |   2 +-
 examples/python-operator-dataflow/merge.py    |  12 --
 .../python-operator-dataflow/requirements.txt |   2 +-
 .../sentence_transformers_op.py               |   8 --
 examples/python-operator-dataflow/webcam.py   |   2 +-
 6 files changed, 5 insertions(+), 126 deletions(-)
 delete mode 100644 examples/python-operator-dataflow/merge.py

diff --git a/examples/python-operator-dataflow/README.md b/examples/python-operator-dataflow/README.md
index e3b6eb0f3..e678f6241 100644
--- a/examples/python-operator-dataflow/README.md
+++ b/examples/python-operator-dataflow/README.md
@@ -19,7 +19,7 @@ cargo run --example python-operator-dataflow
 ## Installation
 
 ```bash
-conda create -n example_env python=3.12
+conda create -n example_env python=3.11
 pip install -r requirements.txt
 ```
 
@@ -48,109 +48,8 @@ send 200 200 200 400 to topic line
 record
 ```
 
-```bash
-wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/keyboard_op.py
-wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/microphone_op.py
-wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/whisper_op.py
-wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/sentence_transformers_op.py
-wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/llm_op.py
-wget https://raw.githubusercontent.com/dora-rs/dora/v0.3.2/examples/python-operator-dataflow/file_saver_op.py
-```
-
-and adding the following to the dataflow configuration:
-
-```yaml
-nodes:
-  - id: webcam
-    operator:
-      python: webcam.py
-      inputs:
-        tick: dora/timer/millis/50
-      outputs:
-        - image
-
-  - id: object_detection
-    operator:
-      python: object_detection.py
-      inputs:
-        image: webcam/image
-      outputs:
-        - bbox
-
-  - id: plot
-    operator:
-      python: plot.py
-      inputs:
-        image: webcam/image
-        bbox: object_detection/bbox
-        line: llm/line
-        keyboard_buffer: keyboard/buffer
-        user_message: keyboard/submitted
-        assistant_message: llm/assistant_message
-
-  ## Speech to text
-  - id: keyboard
-    custom:
-      source: keyboard_op.py
-      outputs:
-        - buffer
-        - submitted
-        - record
-        - ask
-        - send
-        - change
-      inputs:
-        recording: whisper/text
-
-  - id: microphone
-    operator:
-      python: microphone_op.py
-      inputs:
-        record: keyboard/record
-      outputs:
-        - audio
-
-  - id: whisper
-    operator:
-      python: whisper_op.py
-      inputs:
-        audio: microphone/audio
-      outputs:
-        - text
-
-  ## Code Modifier
-  - id: vectordb
-    operator:
-      python: sentence_transformers_op.py
-      inputs:
-        query: keyboard/change
-        saved_file: file_saver/saved_file
-      outputs:
-        - raw_file
-
-  - id: llm
-    operator:
-      python: llm_op.py
-      inputs:
-        code_modifier: vectordb/raw_file
-        assistant: keyboard/ask
-        message_sender: keyboard/send
-      outputs:
-        - modified_file
-        - line
-        - assistant_message
-
-  - id: file_saver
-    operator:
-      python: file_saver_op.py
-      inputs:
-        file: llm/modified_file
-      outputs:
-        - saved_file
-```
-
 The keyboard, microphone, whisper node, works in a very similar fashion as the object detection dataflow and I'll let you check it out by yourself.
 
 The code modification flow works by first comparing an instruction with a vectordb of operators source code and then feeding the most similar operator to an llm with the instruction for code modification.
 
-The end result is then saved using the file saver
+The end result is then saved using a file saver.
diff --git a/examples/python-operator-dataflow/dataflow_llm.yml b/examples/python-operator-dataflow/dataflow_llm.yml
index faf978af4..41bc72050 100644
--- a/examples/python-operator-dataflow/dataflow_llm.yml
+++ b/examples/python-operator-dataflow/dataflow_llm.yml
@@ -84,4 +84,4 @@ nodes:
       inputs:
         file: llm/modified_file
       outputs:
-        - saved_file
\ No newline at end of file
+        - saved_file
diff --git a/examples/python-operator-dataflow/merge.py b/examples/python-operator-dataflow/merge.py
deleted file mode 100644
index cd60745d5..000000000
--- a/examples/python-operator-dataflow/merge.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import pyarrow as pa
-
-with pa.memory_map("image.arrow", "r") as source:
-    df_i = pa.ipc.open_file(source).read_all()
-
-with pa.memory_map("bbox.arrow", "r") as source:
-    df_b = pa.ipc.open_file(source).read_all()
-
-df_i = df_i.to_pandas()
-df_b = df_b.to_pandas()
-
-df = df_i.merge(df_b, on="trace_id")
diff --git a/examples/python-operator-dataflow/requirements.txt b/examples/python-operator-dataflow/requirements.txt
index 3fb39579d..71ccb110c 100644
--- a/examples/python-operator-dataflow/requirements.txt
+++ b/examples/python-operator-dataflow/requirements.txt
@@ -54,4 +54,4 @@ transformers
 pylcs
 accelerate
 optimum
-auto-gptq>=0.7.1
\ No newline at end of file
+auto-gptq>=0.7.1
diff --git a/examples/python-operator-dataflow/sentence_transformers_op.py b/examples/python-operator-dataflow/sentence_transformers_op.py
index cd045211d..c4619cf7e 100644
--- a/examples/python-operator-dataflow/sentence_transformers_op.py
+++ b/examples/python-operator-dataflow/sentence_transformers_op.py
@@ -4,17 +4,9 @@
 from dora import DoraStatus
 import os
 import sys
-import inspect
 import torch
 import pyarrow as pa
 
-SHOULD_NOT_BE_INCLUDED = [
-    "utils.py",
-    "sentence_transformers_op.py",
-    "chatgpt_op.py",
-    "llm_op.py",
-]
-
 SHOULD_BE_INCLUDED = [
     "webcam.py",
     "object_detection.py",
diff --git a/examples/python-operator-dataflow/webcam.py b/examples/python-operator-dataflow/webcam.py
index c7d451919..43ce7e207 100755
--- a/examples/python-operator-dataflow/webcam.py
+++ b/examples/python-operator-dataflow/webcam.py
@@ -66,7 +66,7 @@ def on_event(
         else:
             print("received unexpected event:", event_type)
 
-        if time.time() - self.start_time < 200 or CI != "true":
+        if time.time() - self.start_time < 20 or CI != "true":
             return DoraStatus.CONTINUE
         else:
             return DoraStatus.STOP

From 191f7793e567f653ba41d167918e43824af2a77c Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Tue, 2 Apr 2024 18:19:53 +0200
Subject: [PATCH 5/9] Pinning dora dependencies

---
 examples/python-operator-dataflow/README.md        | 1 +
 examples/python-operator-dataflow/requirements.txt | 1 +
 2 files changed, 2 insertions(+)

diff --git a/examples/python-operator-dataflow/README.md b/examples/python-operator-dataflow/README.md
index e678f6241..351e2776c 100644
--- a/examples/python-operator-dataflow/README.md
+++ b/examples/python-operator-dataflow/README.md
@@ -20,6 +20,7 @@ cargo run --example python-operator-dataflow
 
 ```bash
 conda create -n example_env python=3.11
+conda activate test_env
 pip install -r requirements.txt
 ```
 
diff --git a/examples/python-operator-dataflow/requirements.txt b/examples/python-operator-dataflow/requirements.txt
index 71ccb110c..ec7e481a7 100644
--- a/examples/python-operator-dataflow/requirements.txt
+++ b/examples/python-operator-dataflow/requirements.txt
@@ -55,3 +55,4 @@ pylcs
 accelerate
 optimum
 auto-gptq>=0.7.1
+dora-rs==0.3.2
\ No newline at end of file

From c5a1c1a4eef617f375ec40dbcc0044dcff8b17c4 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Wed, 3 Apr 2024 10:31:38 +0200
Subject: [PATCH 6/9] Adding separate reqiorements to avoid GPU depencies
 issues

---
 examples/python-operator-dataflow/README.md            | 4 +++-
 examples/python-operator-dataflow/requirements_llm.txt | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 examples/python-operator-dataflow/requirements_llm.txt

diff --git a/examples/python-operator-dataflow/README.md b/examples/python-operator-dataflow/README.md
index 351e2776c..767e1c10a 100644
--- a/examples/python-operator-dataflow/README.md
+++ b/examples/python-operator-dataflow/README.md
@@ -13,6 +13,7 @@ The [`dataflow.yml`](./dataflow.yml) defines a simple dataflow graph with the fo
 ## Getting started
 
 ```bash
+pip install -r requirements.txt
 cargo run --example python-operator-dataflow
 ```
 
@@ -22,6 +23,7 @@ cargo run --example python-operator-dataflow
 conda create -n example_env python=3.11
 conda activate test_env
 pip install -r requirements.txt
+pip install -r requirements_llm.txt
 ```
 
 ## Run the dataflow
@@ -32,7 +34,7 @@ pip install -r requirements.txt
 dora start dataflow.yml
 ```
 
-- Start the llm dataflow:
+- Start the llm dataflow (Only works on Windows and Linux):
 
 ```bash
 dora start dataflow_llm.yml
diff --git a/examples/python-operator-dataflow/requirements_llm.txt b/examples/python-operator-dataflow/requirements_llm.txt
new file mode 100644
index 000000000..4cc16fa36
--- /dev/null
+++ b/examples/python-operator-dataflow/requirements_llm.txt
@@ -0,0 +1,9 @@
+openai-whisper
+sounddevice
+pynput
+sentence-transformers
+transformers
+pylcs
+accelerate
+optimum
+auto-gptq>=0.7.1

From c641d97d18a93c89e82b76029d9d5b637cdd5670 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Wed, 3 Apr 2024 11:58:26 +0200
Subject: [PATCH 7/9] Increase the wait time for the daemon for the CI

---
 examples/multiple-daemons/run.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs
index 4b1417cc4..bd0722135 100644
--- a/examples/multiple-daemons/run.rs
+++ b/examples/multiple-daemons/run.rs
@@ -55,7 +55,7 @@ async fn main() -> eyre::Result<()> {
         } else if retries > 20 {
             bail!("daemon not connected after {retries} retries");
         } else {
-            std::thread::sleep(Duration::from_millis(100));
+            std::thread::sleep(Duration::from_millis(500));
             retries += 1
         }
     }

From ad9bec342ff314fe2ccd3d51cce405ed550032f7 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Wed, 3 Apr 2024 11:59:23 +0200
Subject: [PATCH 8/9] Remove the need to download the weights as
 https://github.com/ultralytics/ultralytics/pull/7432 has been merged

---
 examples/python-operator-dataflow/requirements.txt | 11 -----------
 examples/python-operator-dataflow/run.rs           |  7 -------
 2 files changed, 18 deletions(-)

diff --git a/examples/python-operator-dataflow/requirements.txt b/examples/python-operator-dataflow/requirements.txt
index ec7e481a7..68020faa6 100644
--- a/examples/python-operator-dataflow/requirements.txt
+++ b/examples/python-operator-dataflow/requirements.txt
@@ -45,14 +45,3 @@ seaborn>=0.11.0
 
 opencv-python>=4.1.1
 maturin
-
-openai-whisper
-sounddevice
-pynput
-sentence-transformers
-transformers
-pylcs
-accelerate
-optimum
-auto-gptq>=0.7.1
-dora-rs==0.3.2
\ No newline at end of file
diff --git a/examples/python-operator-dataflow/run.rs b/examples/python-operator-dataflow/run.rs
index c10dfc9a5..dfc2e8345 100644
--- a/examples/python-operator-dataflow/run.rs
+++ b/examples/python-operator-dataflow/run.rs
@@ -1,5 +1,4 @@
 use dora_core::{get_pip_path, get_python_path, run};
-use dora_download::download_file;
 use dora_tracing::set_up_tracing;
 use eyre::{bail, ContextCompat, WrapErr};
 use std::path::Path;
@@ -74,12 +73,6 @@ async fn main() -> eyre::Result<()> {
     .await
     .context("maturin develop failed")?;
 
-    download_file(
-        "https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt",
-        Path::new("yolov8n.pt"),
-    )
-    .await
-    .context("Could not download weights.")?;
     let dataflow = Path::new("dataflow.yml");
     run_dataflow(dataflow).await?;
 

From 3edc3f82093e60af73d67b7f20ff8817bd323211 Mon Sep 17 00:00:00 2001
From: haixuanTao <tao.xavier@outlook.com>
Date: Thu, 4 Apr 2024 17:57:01 +0200
Subject: [PATCH 9/9] Remove unnecessary condition

---
 examples/python-operator-dataflow/microphone_op.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/python-operator-dataflow/microphone_op.py b/examples/python-operator-dataflow/microphone_op.py
index 6c3b37ad2..b6fb6e63a 100644
--- a/examples/python-operator-dataflow/microphone_op.py
+++ b/examples/python-operator-dataflow/microphone_op.py
@@ -31,6 +31,4 @@ def on_event(
             audio_data = audio_data.ravel().astype(np.float32) / 32768.0
             if len(audio_data) > 0:
                 send_output("audio", pa.array(audio_data), dora_event["metadata"])
-        elif dora_event["type"] == "INPUT":
-            print("Microphone is not recording", dora_event["value"][0].as_py())
         return DoraStatus.CONTINUE