Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple display support #1161

Merged
merged 8 commits into from
Apr 10, 2024
138 changes: 107 additions & 31 deletions interpreter/core/computer/display/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
import warnings
from contextlib import redirect_stdout
from io import BytesIO

import io
import subprocess
from PIL import Image
import requests

from ...utils.lazy_import import lazy_import
from ..utils.recipient_utils import format_to_recipient
import cv2
from screeninfo import get_monitors # for getting info about connected monitors


# Still experimenting with this
# from utils.get_active_window import get_active_window
Expand All @@ -20,6 +24,7 @@
np = lazy_import("numpy")
plt = lazy_import("matplotlib.pyplot")


from ..utils.computer_vision import find_text_in_image, pytesseract_get_text


Expand Down Expand Up @@ -56,20 +61,30 @@ def center(self):
"""
return self.width // 2, self.height // 2

def view(self, show=True, quadrant=None):
def info(self):
    """
    Return information about every connected monitor/display,
    as reported by get_displays().
    """
    displays = get_displays()
    return displays


def view(self, show=True, quadrant=None, screen=0, combine_screens=True):
    """
    Redirects to self.screenshot

    :param show: forwarded to self.screenshot as ``show``.
    :param quadrant: forwarded to self.screenshot as ``quadrant``.
    :param screen: forwarded to self.screenshot as ``screen`` (which display to capture).
    :param combine_screens: forwarded to self.screenshot as ``combine_screens``.
    :return: whatever self.screenshot returns.
    """
    # Forward by keyword: screenshot() takes `screen` as its first positional
    # parameter, so passing (show, quadrant) positionally would bind them to
    # the wrong arguments.
    return self.screenshot(
        screen=screen,
        show=show,
        quadrant=quadrant,
        combine_screens=combine_screens,
    )

# def get_active_window(self):
# return get_active_window()

def screenshot(
self, show=True, quadrant=None, active_app_only=False, force_image=False
self, screen=0, show=True, quadrant=None, active_app_only=False, force_image=False,combine_screens=True
):
"""
Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
:param screen: specify which display; 0 for primary and 1 and above for secondary.
:param combine_screens: If True, a collage of all display screens will be returned. Otherwise, a list of display screens will be returned.
"""
if not self.computer.emit_images and force_image == False:
text = self.get_text_as_list_of_lists()
Expand All @@ -91,10 +106,7 @@ def screenshot(
region = self.get_active_window()["region"]
screenshot = pyautogui.screenshot(region=region)
else:
if platform.system() == "Darwin":
screenshot = take_screenshot_to_pil()
else:
screenshot = pyautogui.screenshot()
screenshot = take_screenshot_to_pil(screen=screen, combine_screens=combine_screens) # this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
# message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will recieve it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
# print(message)

Expand All @@ -121,18 +133,26 @@ def screenshot(

# Open the image file with PIL
# IPython interactive mode auto-displays plots, causing RGBA handling issues, possibly MacOS-specific.
screenshot = screenshot.convert("RGB")
if isinstance(screenshot, list):
screenshot = [img.convert("RGB") for img in screenshot] # if screenshot is a list (i.e combine_screens=False).
else:
screenshot = screenshot.convert("RGB")

if show:
# Show the image using matplotlib
plt.imshow(np.array(screenshot))
if isinstance(screenshot, list):
for img in screenshot:
plt.imshow(np.array(img))
plt.show()
else:
plt.imshow(np.array(screenshot))

with warnings.catch_warnings():
# It displays an annoying message about Agg not being able to display something or WHATEVER
warnings.simplefilter("ignore")
plt.show()

return screenshot
return screenshot # this will be a list if combine_screens == False

def find(self, description, screenshot=None):
if description.startswith('"') and description.endswith('"'):
Expand Down Expand Up @@ -260,22 +280,78 @@ def get_text_as_list_of_lists(self, screenshot=None):
)


import io
import subprocess

from PIL import Image


def take_screenshot_to_pil(filename="temp_screenshot.png"):
    """
    Capture the screen with the `screencapture` command and return it as a PIL image.

    :param filename: path of the temporary file `screencapture` writes to; it is
        deleted before this function returns.
    :return: a PIL image opened from the captured bytes held in memory.
    :raises subprocess.CalledProcessError: if `screencapture` exits with a non-zero status.
    """
    # Capture the screenshot and save it to a temporary file
    subprocess.run(["screencapture", "-x", filename], check=True)

    try:
        # Read the whole file into memory so the image no longer depends on the file
        with open(filename, "rb") as f:
            image_data = f.read()
    finally:
        # Always clean up the temporary file, even if reading it failed
        if os.path.exists(filename):
            os.remove(filename)

    return Image.open(io.BytesIO(image_data))
def take_screenshot_to_pil(screen=0, combine_screens=True):
    """
    Take a screenshot of one display, or of every display, using pyautogui.

    :param screen: index into the list returned by get_monitors(); -1 captures every display.
    :param combine_screens: only used when screen == -1. If True, the captures are placed
        side by side (left to right, in get_monitors() order) on one labelled image;
        otherwise the list of individual captures is returned.
    :return: a single PIL image, or a list of PIL images when screen == -1 and
        combine_screens is False.
    :raises IndexError: if screen is neither -1 nor a valid index into the monitor list.
    """
    # Get information about all screens
    monitors = get_monitors()

    def _grab(monitor):
        # Capture exactly the region of the virtual desktop covered by this monitor
        return pyautogui.screenshot(
            region=(monitor.x, monitor.y, monitor.width, monitor.height)
        )

    if screen != -1:
        # Reject other negative values explicitly instead of letting them
        # silently index from the end of the monitor list.
        if not 0 <= screen < len(monitors):
            raise IndexError(
                f"screen={screen} is out of range; {len(monitors)} display(s) detected"
            )
        return _grab(monitors[screen])

    # All screens: take a screenshot of each one
    screenshots = [_grab(monitor) for monitor in monitors]
    if not combine_screens:
        return screenshots

    # Combine all screenshots horizontally on one canvas that can contain them all
    total_width = sum(img.width for img in screenshots)
    max_height = max(img.height for img in screenshots)
    canvas = np.zeros((max_height, total_width, 3), dtype=np.uint8)

    font = cv2.FONT_HERSHEY_SIMPLEX
    base_font_scale = 4
    font_color = (255, 255, 255)  # white is the same in RGB and BGR, so no colour conversion is needed
    thickness = 2

    x_offset = 0
    for i, img in enumerate(screenshots):
        # Paste this screenshot at the top of its horizontal slot.
        # convert("RGB") drops any alpha channel so the array matches the canvas depth.
        canvas[0:img.height, x_offset:x_offset + img.width] = np.array(img.convert("RGB"))
        x_offset += img.width

        text = "Primary Monitor" if i == 0 else f"Monitor {i}"

        # Measure the label at the base scale, then derive the scale actually used
        text_size = cv2.getTextSize(text, font, base_font_scale, thickness)[0]
        font_scale = min(img.width / text_size[0], img.height / text_size[1])

        # Recalculate the text size with the new font scale
        text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]

        # x_offset already points past this monitor, so step back half its width
        text_x = x_offset - img.width // 2 - text_size[0] // 2
        text_y = max_height // 2 - text_size[1] // 2

        cv2.putText(canvas, text, (text_x, text_y), font, font_scale, font_color, thickness)

    return Image.fromarray(canvas)


def get_displays():
    """Return the monitor objects reported by get_monitors()."""
    return get_monitors()

12 changes: 6 additions & 6 deletions interpreter/core/respond.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,12 @@ def respond(interpreter):
)
code = re.sub(r"import computer\.\w+\n", "pass\n", code)
# If it does this it sees the screenshot twice (which is expected jupyter behavior)
if code.split("\n")[-1] in [
"computer.display.view()",
"computer.display.screenshot()",
"computer.view()",
"computer.screenshot()",
]:
if any(code.split("\n")[-1].startswith(text) for text in [
"computer.display.view",
"computer.display.screenshot",
"computer.view",
"computer.screenshot",
]):
code = code + "\npass"

# sync up some things (is this how we want to do this?)
Expand Down
3 changes: 2 additions & 1 deletion interpreter/terminal_interface/profiles/defaults/01.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@
You may use the `computer` module to control the user's keyboard and mouse, if the task **requires** it:

```python
computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
computer.display.info() # Returns a list of connected monitors/Displays and their info (x and y cordinates, width, height, width_mm, height_mm, name). Use this to verify the monitors connected before using computer.display.view() when neccessary
computer.display.view() # Shows you what's on the screen (primary display by default), returns a `pil_image` `in case you need it (rarely). To get a specific display, use the parameter screen=DISPLAY_NUMBER (0 for primary monitor 1 and above for secondary monitors). **You almost always want to do this first!**
computer.keyboard.hotkey(" ", "command") # Opens spotlight
computer.keyboard.write("hello")
computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
Expand Down
3 changes: 2 additions & 1 deletion interpreter/terminal_interface/profiles/defaults/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
```python
computer.browser.search(query) # Silently searches Google for the query, returns result. The user's browser is unaffected. (does not open a browser!)

computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
computer.display.info() # Returns a list of connected monitors/Displays and their info (x and y cordinates, width, height, width_mm, height_mm, name). Use this to verify the monitors connected before using computer.display.view() when neccessary
computer.display.view() # Shows you what's on the screen (primary display by default), returns a `pil_image` `in case you need it (rarely). To get a specific display, use the parameter screen=DISPLAY_NUMBER (0 for primary monitor 1 and above for secondary monitors). **You almost always want to do this first!**

computer.keyboard.hotkey(" ", "command") # Opens spotlight (very useful)
computer.keyboard.write("hello")
Expand Down
12 changes: 6 additions & 6 deletions interpreter/terminal_interface/terminal_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,12 +361,12 @@ def terminal_interface(interpreter, message):
# (unless we figure out how to do this AFTER taking the screenshot)
# otherwise it will try to click this notification!

if action in [
"computer.screenshot()",
"computer.display.screenshot()",
"computer.display.view()",
"computer.view()",
]:
if any(action.startswith(text) for text in [
"computer.screenshot",
"computer.display.screenshot",
"computer.display.view",
"computer.view"
]):
description = "Viewing screen..."
elif action == "computer.mouse.click()":
description = "Clicking..."
Expand Down
Loading
Loading