forked from OthersideAI/self-operating-computer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoperate.py
187 lines (159 loc) · 5.78 KB
/
operate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import sys
import os
import time
import asyncio
from prompt_toolkit.shortcuts import message_dialog
from prompt_toolkit import prompt
from operate.exceptions import ModelNotRecognizedException
import platform
# from operate.models.prompts import USER_QUESTION, get_system_prompt
from operate.models.prompts import (
USER_QUESTION,
get_system_prompt,
)
from operate.config import Config
from operate.utils.style import (
ANSI_GREEN,
ANSI_RESET,
ANSI_YELLOW,
ANSI_RED,
ANSI_BRIGHT_MAGENTA,
ANSI_BLUE,
style,
)
from operate.utils.operating_system import OperatingSystem
from operate.models.apis import get_next_action
# Load configuration
# Module-level singletons shared by `main` and `operate` in this file.
# `config` holds the `verbose` flag (assigned in `main`, read in both
# functions) and exposes `validation(model, voice_mode)`.
config = Config()
# `operating_system` receives the `press`, `write` and `mouse` calls that
# `operate` issues for each operation returned by the model.
operating_system = OperatingSystem()
def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
    """
    Run one Self-Operating Computer session.

    Obtains an objective (from `terminal_prompt`, the microphone, or an
    interactive prompt), builds the system message for the model, and then
    alternates between asking the model for the next operations and
    executing them with `operate` until a stop condition is reached.

    Parameters:
    - model: The model used for generating responses.
    - terminal_prompt: Objective supplied on the command line. When truthy,
      the intro dialog and the interactive/voice prompt are skipped.
    - voice_mode: When True, `WhisperMic` is initialised and used to capture
      the objective if no `terminal_prompt` was given.
    - verbose_mode: Stored on `config.verbose`; enables debug prints.

    Returns:
    None

    Raises:
    SystemExit: with status 1 if `voice_mode` is set but `whisper_mic`
    cannot be imported.
    """
    config.verbose = verbose_mode
    config.validation(model, voice_mode)

    # The microphone is only created on demand so that the optional audio
    # dependency is not required for text-only runs.
    mic = None
    if voice_mode:
        try:
            from whisper_mic import WhisperMic

            mic = WhisperMic()
        except ImportError:
            print(
                "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'"
            )
            sys.exit(1)

    # The intro dialog is only shown when no prompt was given directly.
    if terminal_prompt:
        print("Running direct prompt...")
    else:
        message_dialog(
            title="Self-Operating Computer",
            text="An experimental framework to enable multimodal models to operate computers",
            style=style,
        ).run()

    # Clear the console ("cls" on Windows, terminal reset sequence elsewhere).
    if platform.system() == "Windows":
        os.system("cls")
    else:
        print("\033c", end="")

    # Resolve the objective: argument first, then typed input, then voice.
    if terminal_prompt:
        objective = terminal_prompt
    elif not voice_mode:
        print(
            f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}"
        )
        print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
        objective = prompt(style=style)
    else:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
        )
        try:
            objective = mic.listen()
        except Exception as e:
            print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}")
            return  # Nothing to do without an objective

    messages = [
        {"role": "system", "content": get_system_prompt(model, objective)}
    ]

    # The loop runs while the counter is <= 10, i.e. at most 11 rounds.
    max_loop_count = 10
    session_id = None
    loop_count = 0
    while loop_count <= max_loop_count:
        if config.verbose:
            print("[Self Operating Computer] loop_count", loop_count)
        try:
            operations, session_id = asyncio.run(
                get_next_action(model, messages, objective, session_id)
            )
            finished = operate(operations, model)
        except (ModelNotRecognizedException, Exception) as e:
            # Both failure kinds are reported the same way and end the run.
            print(
                f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}"
            )
            break
        else:
            if finished:
                break
            loop_count += 1
def operate(operations, model):
    """
    Execute the operations returned by the model, one at a time.

    Each operation is a mapping with an "operation" type and a "thought",
    plus type-specific fields: "keys" for press/hotkey, "content" for write,
    "x"/"y" for click and "summary" for done. After every executed action the
    thought and action details are printed.

    Parameters:
    - operations: Iterable of operation mappings produced by the model.
    - model: Model name, used only in the printed status header.

    Returns:
    True if the caller should stop (a "done" operation was reached, or an
    operation had a missing, non-string or unrecognised type); False once
    every operation in the batch has been executed.
    """
    if config.verbose:
        print("[Self Operating Computer][operate]")

    for operation in operations:
        if config.verbose:
            print("[Self Operating Computer][operate] operation", operation)

        # wait one second
        time.sleep(1)

        # A missing or non-string "operation" value is normalised to "" so it
        # reaches the unknown-operation branch (which prints the offending
        # response) instead of raising AttributeError on `.lower()`.
        raw_type = operation.get("operation")
        operate_type = raw_type.lower() if isinstance(raw_type, str) else ""
        operate_thought = operation.get("thought")
        operate_detail = ""
        if config.verbose:
            print("[Self Operating Computer][operate] operate_type", operate_type)

        if operate_type in ("press", "hotkey"):
            keys = operation.get("keys")
            operate_detail = keys
            operating_system.press(keys)
        elif operate_type == "write":
            content = operation.get("content")
            operate_detail = content
            operating_system.write(content)
        elif operate_type == "click":
            click_detail = {"x": operation.get("x"), "y": operation.get("y")}
            operate_detail = click_detail
            operating_system.mouse(click_detail)
        elif operate_type == "done":
            summary = operation.get("summary")
            print(
                f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]"
            )
            print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n")
            return True
        else:
            print(
                f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}"
            )
            print(
                f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}"
            )
            return True

        # Report what was just done before moving on to the next operation.
        print(
            f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]"
        )
        print(f"{operate_thought}")
        print(f"{ANSI_BLUE}Action: {ANSI_RESET}{operate_type} {operate_detail}\n")

    return False