Compare commits
1 Commits
main
...
djlu/qwen3
| Author | SHA1 | Date |
|---|---|---|
|
|
23874ca9e9 |
|
|
@ -204,11 +204,4 @@ reference/
|
|||
draft/
|
||||
manual_examine.py
|
||||
run_human_examine.sh
|
||||
quick_start.py
|
||||
result_multi_apps_pengxiang_transformers12evaluation_examples/settings/proxy/dataimpulse.json
|
||||
evaluation_examples/settings/proxy/dataimpulse.json
|
||||
|
||||
# Local test configurations (not for public repo)
|
||||
evaluation_examples/spiderman.json
|
||||
evaluation_examples/test_50_random_proportional.json
|
||||
evaluation_examples/test_chrome.json
|
||||
quick_start.py
|
||||
|
|
@ -228,7 +228,3 @@ Special thanks to the following institutions that provided feedback and particip
|
|||
Special thanks to the following students who participated in the specific fixes: [Mengqi Yuan](https://yuanmengqi.github.io/), [Danyang Zhang](https://zdy023.github.io/), [Xinzhuang Xiong](https://thisisxxz.com/), [Zhennan Shen](https://scholar.google.com/citations?user=JPwg5MwAAAAJ&hl=en), [Zilong Zhou](https://github.com/adlsdztony), Yanxu Chen, [Jiaqi Deng](https://millank0817.github.io/), [Tianbao Xie](https://tianbaoxie.com/), Junda Chen, [Jixuan Chen](https://chenjix.github.io/), [Haoyuan Wu](https://www.linkedin.com/in/haoyuan-wu-240878291/).
|
||||
|
||||
Special thanks to the following students who participated in running the re-evaluation: [Mengqi Yuan](https://yuanmengqi.github.io/), [Zilong Zhou](https://github.com/adlsdztony), [Xinyuan Wang](https://xinyuanwangcs.github.io/), [Bowen Wang](https://bowenbryanwang.github.io/).
|
||||
|
||||
## You might also be interested
|
||||
|
||||
- **OSWorld-MCP**: Benchmarking MCP Tool Invocation in Computer-Use Agents. [Website](https://osworld-mcp.github.io/)
|
||||
|
|
|
|||
|
|
@ -238,17 +238,12 @@ class PythonController:
|
|||
"returncode": -1
|
||||
}
|
||||
|
||||
def execute_action(self, action):
|
||||
def execute_action(self, action: Dict[str, Any]):
|
||||
"""
|
||||
Executes an action on the server computer.
|
||||
"""
|
||||
# Handle string actions
|
||||
if action in ['WAIT', 'FAIL', 'DONE']:
|
||||
return
|
||||
|
||||
# Handle dictionary actions
|
||||
if type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']:
|
||||
return
|
||||
|
||||
action_type = action["action_type"]
|
||||
parameters = action["parameters"] if "parameters" in action else {param: action[param] for param in action if param != 'action_type'}
|
||||
|
|
|
|||
|
|
@ -813,7 +813,7 @@ class SetupController:
|
|||
|
||||
def _update_browse_history_setup(self, **config):
|
||||
cache_path = os.path.join(self.cache_dir, "history_new.sqlite")
|
||||
db_url = "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938/history_empty.sqlite?download=true"
|
||||
db_url = "https://drive.usercontent.google.com/u/0/uc?id=1Lv74QkJYDWVX0RIgg0Co-DUcoYpVL0oX&export=download" # google drive
|
||||
if not os.path.exists(cache_path):
|
||||
max_retries = 3
|
||||
downloaded = False
|
||||
|
|
@ -839,82 +839,80 @@ class SetupController:
|
|||
else:
|
||||
logger.info("File already exists in cache directory")
|
||||
# copy a new history file in the tmp folder
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
db_path = os.path.join(tmp_dir, "history_empty.sqlite")
|
||||
shutil.copy(cache_path, db_path)
|
||||
db_path = cache_path
|
||||
|
||||
history = config['history']
|
||||
history = config['history']
|
||||
|
||||
for history_item in history:
|
||||
url = history_item['url']
|
||||
title = history_item['title']
|
||||
visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds'])
|
||||
for history_item in history:
|
||||
url = history_item['url']
|
||||
title = history_item['title']
|
||||
visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds'])
|
||||
|
||||
# Chrome use ms from 1601-01-01 as timestamp
|
||||
epoch_start = datetime(1601, 1, 1)
|
||||
chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000)
|
||||
# Chrome use ms from 1601-01-01 as timestamp
|
||||
epoch_start = datetime(1601, 1, 1)
|
||||
chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000)
|
||||
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
''', (url, title, 1, 0, chrome_timestamp, 0))
|
||||
cursor.execute('''
|
||||
INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
''', (url, title, 1, 0, chrome_timestamp, 0))
|
||||
|
||||
url_id = cursor.lastrowid
|
||||
url_id = cursor.lastrowid
|
||||
|
||||
cursor.execute('''
|
||||
INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
''', (url_id, chrome_timestamp, 0, 805306368, 0, 0))
|
||||
cursor.execute('''
|
||||
INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
''', (url_id, chrome_timestamp, 0, 805306368, 0, 0))
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
logger.info('Fake browsing history added successfully.')
|
||||
logger.info('Fake browsing history added successfully.')
|
||||
|
||||
controller = PythonController(self.vm_ip, self.server_port)
|
||||
controller = PythonController(self.vm_ip, self.server_port)
|
||||
|
||||
# get the path of the history file according to the platform
|
||||
os_type = controller.get_vm_platform()
|
||||
# get the path of the history file according to the platform
|
||||
os_type = controller.get_vm_platform()
|
||||
|
||||
if os_type == 'Windows':
|
||||
if os_type == 'Windows':
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"""import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[
|
||||
'output'].strip()
|
||||
elif os_type == 'Darwin':
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"""import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[
|
||||
'output'].strip()
|
||||
elif os_type == 'Linux':
|
||||
if "arm" in platform.machine():
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"""import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[
|
||||
"import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[
|
||||
'output'].strip()
|
||||
elif os_type == 'Darwin':
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"""import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[
|
||||
'output'].strip()
|
||||
elif os_type == 'Linux':
|
||||
if "arm" in platform.machine():
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[
|
||||
'output'].strip()
|
||||
else:
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[
|
||||
'output'].strip()
|
||||
else:
|
||||
raise Exception('Unsupported operating system')
|
||||
chrome_history_path = controller.execute_python_command(
|
||||
"import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[
|
||||
'output'].strip()
|
||||
else:
|
||||
raise Exception('Unsupported operating system')
|
||||
|
||||
form = MultipartEncoder({
|
||||
"file_path": chrome_history_path,
|
||||
"file_data": (os.path.basename(chrome_history_path), open(db_path, "rb"))
|
||||
})
|
||||
headers = {"Content-Type": form.content_type}
|
||||
logger.debug(form.content_type)
|
||||
form = MultipartEncoder({
|
||||
"file_path": chrome_history_path,
|
||||
"file_data": (os.path.basename(chrome_history_path), open(db_path, "rb"))
|
||||
})
|
||||
headers = {"Content-Type": form.content_type}
|
||||
logger.debug(form.content_type)
|
||||
|
||||
# send request to server to upload file
|
||||
try:
|
||||
logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
|
||||
response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form)
|
||||
if response.status_code == 200:
|
||||
logger.info("Command executed successfully: %s", response.text)
|
||||
else:
|
||||
logger.error("Failed to upload file. Status code: %s", response.text)
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error("An error occurred while trying to send the request: %s", e)
|
||||
# send request to server to upload file
|
||||
try:
|
||||
logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
|
||||
response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form)
|
||||
if response.status_code == 200:
|
||||
logger.info("Command executed successfully: %s", response.text)
|
||||
else:
|
||||
logger.error("Failed to upload file. Status code: %s", response.text)
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error("An error occurred while trying to send the request: %s", e)
|
||||
|
||||
self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True)
|
||||
self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True)
|
||||
|
|
|
|||
|
|
@ -391,12 +391,12 @@ class DesktopEnv(gym.Env):
|
|||
logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
|
||||
# handle the special actions
|
||||
if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']):
|
||||
if action == 'WAIT' or (type(action) == dict and action.get('action_type') == 'WAIT'):
|
||||
if action == 'WAIT':
|
||||
time.sleep(pause)
|
||||
elif action == 'FAIL' or (type(action) == dict and action.get('action_type') == 'FAIL'):
|
||||
elif action == 'FAIL':
|
||||
done = True
|
||||
info = {"fail": True}
|
||||
elif action == 'DONE' or (type(action) == dict and action.get('action_type') == 'DONE'):
|
||||
elif action == 'DONE':
|
||||
done = True
|
||||
info = {"done": True}
|
||||
|
||||
|
|
@ -404,7 +404,7 @@ class DesktopEnv(gym.Env):
|
|||
# the set of all possible actions defined in the action representation
|
||||
self.controller.execute_action(action)
|
||||
elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
|
||||
if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']):
|
||||
if action in ['WAIT', 'FAIL', 'DONE']:
|
||||
self.controller.execute_action(action)
|
||||
else:
|
||||
# the set of all possible python commands insides `pyautogui`
|
||||
|
|
@ -434,16 +434,13 @@ class DesktopEnv(gym.Env):
|
|||
self.is_environment_used = True
|
||||
|
||||
if self.evaluator['func'] == "infeasible":
|
||||
if len(self.action_history) > 0:
|
||||
last_action = self.action_history[-1]
|
||||
if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
|
||||
return 1
|
||||
return 0
|
||||
if len(self.action_history) > 0 and self.action_history[-1] == "FAIL":
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
else:
|
||||
if len(self.action_history) > 0:
|
||||
last_action = self.action_history[-1]
|
||||
if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
|
||||
return 0
|
||||
if len(self.action_history) > 0 and self.action_history[-1] == "FAIL":
|
||||
return 0
|
||||
|
||||
if type(self.metric) == list:
|
||||
# Multiple metrics to evaluate whether the task is successfully completed
|
||||
|
|
|
|||
|
|
@ -1,499 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
from typing import Callable, Any, Optional, Tuple
|
||||
from typing import List, Dict, Union
|
||||
|
||||
import gymnasium as gym
|
||||
|
||||
from desktop_env.controllers.python import PythonController
|
||||
from desktop_env.controllers.setup import SetupController
|
||||
from desktop_env.evaluators import metrics, getters
|
||||
from desktop_env.providers import create_vm_manager_and_provider
|
||||
|
||||
logger = logging.getLogger("desktopenv.env")
|
||||
|
||||
Metric = Callable[[Any, Any], float]
|
||||
Getter = Callable[[gym.Env, Dict[str, Any]], Any]
|
||||
|
||||
MAX_RETRIES = 5 # Maximum retries for environment setup
|
||||
|
||||
|
||||
|
||||
def _fix_pyautogui_less_than_bug(command: str) -> str:
|
||||
"""
|
||||
Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
|
||||
|
||||
This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
|
||||
References:
|
||||
- https://github.com/asweigart/pyautogui/issues/198
|
||||
- https://github.com/xlang-ai/OSWorld/issues/257
|
||||
|
||||
Args:
|
||||
command (str): The original pyautogui command
|
||||
|
||||
Returns:
|
||||
str: The fixed command with '<' characters handled properly
|
||||
"""
|
||||
# Pattern to match press('<') or press('\u003c') calls
|
||||
press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
|
||||
|
||||
# Handle press('<') calls
|
||||
def replace_press_less_than(match):
|
||||
return 'pyautogui.hotkey("shift", ",")'
|
||||
|
||||
# First handle press('<') calls
|
||||
command = re.sub(press_pattern, replace_press_less_than, command)
|
||||
|
||||
# Pattern to match typewrite calls with quoted strings
|
||||
typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
|
||||
|
||||
# Then handle typewrite calls
|
||||
def process_typewrite_match(match):
|
||||
quote_char = match.group(1)
|
||||
content = match.group(2)
|
||||
|
||||
# Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
|
||||
# This handles cases where '<' is represented as escaped Unicode
|
||||
try:
|
||||
# Attempt to decode unicode escapes
|
||||
decoded_content = content.encode('utf-8').decode('unicode_escape')
|
||||
content = decoded_content
|
||||
except UnicodeDecodeError:
|
||||
# If decoding fails, proceed with original content to avoid breaking existing logic
|
||||
pass # English comment: Graceful degradation - fall back to original content if decoding fails
|
||||
|
||||
# Check if content contains '<'
|
||||
if '<' not in content:
|
||||
return match.group(0)
|
||||
|
||||
# Split by '<' and rebuild
|
||||
parts = content.split('<')
|
||||
result_parts = []
|
||||
|
||||
for i, part in enumerate(parts):
|
||||
if i == 0:
|
||||
# First part
|
||||
if part:
|
||||
result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
|
||||
else:
|
||||
# Add hotkey for '<' and then typewrite for the rest
|
||||
result_parts.append('pyautogui.hotkey("shift", ",")')
|
||||
if part:
|
||||
result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
|
||||
|
||||
return '; '.join(result_parts)
|
||||
|
||||
command = re.sub(typewrite_pattern, process_typewrite_match, command)
|
||||
|
||||
return command
|
||||
|
||||
|
||||
class DesktopEnv(gym.Env):
|
||||
"""
|
||||
DesktopEnv with OpenAI Gym interface. It provides a desktop environment for setting and evaluating desktop automation tasks.
|
||||
"""
|
||||
def __init__(
    self,
    provider_name: str = "vmware",
    region: str = None,
    path_to_vm: str = None,
    snapshot_name: str = "init_state",
    action_space: str = "pyautogui",
    cache_dir: str = "cache",
    screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
    headless: bool = False,
    require_a11y_tree: bool = True,
    require_terminal: bool = False,
    os_type: str = "Ubuntu",
    enable_proxy: bool = False,
    client_password: str = "",
):
    """
    Args:
        provider_name (str): virtualization provider name, default to "vmware"
        region (str): the region for allocate machines, work for cloud services, default to "us-east-1"
        path_to_vm (str): path to .vmx file
        snapshot_name (str): snapshot name to revert to, default to "init_state"
        action_space (str): "computer_13" | "pyautogui"
        cache_dir (str): cache directory to cache task-related stuffs like
            reference file for evaluation
        screen_size (Tuple[int]): screen size of the VM
        headless (bool): whether to run the VM in headless mode
        require_a11y_tree (bool): whether to require accessibility tree
        require_terminal (bool): whether to require terminal output
        os_type (str): operating system type, default to "Ubuntu"
        enable_proxy (bool): whether to enable proxy support, default to False
        client_password (str): password used by in-VM setup; when empty a
            provider-specific default is chosen below
    """
    # Initialize VM manager and virtualization provider
    self.region = region
    self.provider_name = provider_name
    self.enable_proxy = enable_proxy  # Store proxy enablement setting
    # Empty password means "use the provider's default".
    if client_password == "":
        if self.provider_name == "aws":
            self.client_password = "osworld-public-evaluation"
        else:
            self.client_password = "password"
    else:
        self.client_password = client_password

    self.screen_width = screen_size[0]
    self.screen_height = screen_size[1]

    # Default ports for the in-VM services; may be overridden per-VM by
    # _start_emulator (e.g. for the Docker provider).
    self.server_port = 5000
    self.chromium_port = 9222
    self.vnc_port = 8006
    self.vlc_port = 8080

    # Initialize with default (no proxy) provider
    self.current_use_proxy = False
    self.manager, self.provider = None, None
    self.os_type = os_type
    self.path_to_vm = path_to_vm
    # Track whether environment has been used (step/setup) to optimize snapshot revert
    # docker, aws, gcp, azure are always unused as the emulator starts from a clean state
    # vmware, virtualbox are always used as the emulator starts from a dirty state
    if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun", "volcengine"}:
        self.is_environment_used = False
    elif self.provider_name in {"vmware", "virtualbox"}:
        self.is_environment_used = True
    else:
        raise ValueError(f"Invalid provider name: {self.provider_name}")

    self.snapshot_name = snapshot_name
    self.cache_dir_base: str = cache_dir
    self.headless = headless
    self.require_a11y_tree = require_a11y_tree
    self.require_terminal = require_terminal

    # mode: human or machine
    self.instruction = None
    assert action_space in ["computer_13", "pyautogui", "claude_computer_use", "autoglm_computer_use"]
    self.action_space = action_space  # todo: refactor it to the ActType

    # episodic stuffs, like counters, will be updated or reset
    # when calling self.reset()
    self._traj_no: int = -1
    self._step_no: int = 0
    self.action_history: List[Dict[str, any]] = []
|
||||
|
||||
def start(self):
    """Lazily create the VM manager/provider, resolve the VM path, and boot the emulator."""
    # The manager/provider pair is only built on the first call.
    if not self.manager and not self.provider:
        logger.info("Initializing...")
        self.manager, self.provider = create_vm_manager_and_provider(
            self.provider_name, self.region, use_proxy=False
        )

    if self.path_to_vm:
        # Local hypervisors take a real filesystem path, so normalize it fully;
        # cloud providers use the identifier as-is.
        if self.provider_name in {"vmware", "virtualbox"}:
            expanded = os.path.expandvars(os.path.expanduser(self.path_to_vm))
            self.path_to_vm = os.path.abspath(expanded)
    else:
        # No explicit path supplied: ask the manager for one.
        self.path_to_vm = self.manager.get_vm_path(
            os_type=self.os_type,
            region=self.region,
            screen_size=(self.screen_width, self.screen_height),
        )

    self._start_emulator()
|
||||
|
||||
def _start_emulator(self):
    """
    Power on the VM, discover its IP (and ports, if the provider encodes
    them), and construct the python/setup controllers.

    Raises:
        Exception: re-raises whatever the provider throws; the emulator is
            stopped first on a best-effort basis.
    """
    try:
        # Power on the virtual machine
        self.provider.start_emulator(self.path_to_vm, self.headless, self.os_type)

        # Get the ip from the virtual machine, and setup the controller.
        # The provider may return "ip" or "ip:server:chromium:vnc:vlc".
        vm_ip_ports = self.provider.get_ip_address(self.path_to_vm).split(':')
        self.vm_ip = vm_ip_ports[0]
        # Get the ports from the virtual machine (for Docker provider only)
        if len(vm_ip_ports) > 1:
            self.server_port = int(vm_ip_ports[1])
            self.chromium_port = int(vm_ip_ports[2])
            self.vnc_port = int(vm_ip_ports[3])
            self.vlc_port = int(vm_ip_ports[4])
        self.controller = PythonController(vm_ip=self.vm_ip, server_port=self.server_port)
        self.setup_controller = SetupController(vm_ip=self.vm_ip, server_port=self.server_port, chromium_port=self.chromium_port, vlc_port=self.vlc_port, cache_dir=self.cache_dir_base, client_password=self.client_password, screen_width=self.screen_width, screen_height=self.screen_height)

    except Exception as e:
        # Best-effort cleanup: stop the (possibly half-started) emulator,
        # but never let the cleanup failure mask the original error.
        try:
            self.provider.stop_emulator(self.path_to_vm)
        except Exception as stop_err:
            logger.warning(f"Cleanup after interrupt failed: {stop_err}")
        raise
|
||||
|
||||
def _revert_to_snapshot(self):
    """Revert the VM to self.snapshot_name and re-register the VM path if the provider returned a new one."""
    # Revert to certain snapshot of the virtual machine, and refresh the path to vm and ip of vm
    # due to the fact it could be changed when implemented by cloud services
    path_to_vm = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
    if path_to_vm and not path_to_vm == self.path_to_vm:
        # path_to_vm has to be a new path: swap the manager's registration
        # from the old identifier to the new one and claim it for this process.

        self.manager.delete_vm(self.path_to_vm, self.region)
        self.manager.add_vm(path_to_vm, self.region)
        self.manager.occupy_vm(path_to_vm, os.getpid(), self.region)
        self.path_to_vm = path_to_vm
|
||||
|
||||
def _save_state(self, snapshot_name=None):
    """Save the current virtual machine state to a certain snapshot name (provider-defined default when None)."""
    self.provider.save_state(self.path_to_vm, snapshot_name)
|
||||
|
||||
def close(self):
    """Close (release) the virtual machine by stopping the emulator."""
    self.provider.stop_emulator(self.path_to_vm)
|
||||
|
||||
def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
    """
    Reset the environment and, when task_config is given, set up that task.

    Retries the task setup up to MAX_RETRIES times; reverts to the snapshot
    only when the environment was actually used since the last reset.

    Args:
        task_config (Optional[Dict[str, Any]]): task description (id,
            instruction, config, optional "proxy" flag); None resets
            counters without a task setup.
        seed, options: unused — kept for Gym API compatibility.

    Returns:
        Dict[str, Any]: the first observation (see _get_obs).
    """

    # Reset to certain task in OSWorld
    logger.info("Resetting environment...")
    logger.info("Switching task...")
    logger.info("Setting counters...")
    self._traj_no += 1
    self._step_no = 0
    self.action_history.clear()

    for attempt in range(MAX_RETRIES):
        # Only revert to snapshot if environment has been used (step/setup)
        # This optimization is especially important for cloud providers like AWS
        # where unnecessary snapshot operations are costly and time-consuming

        if task_config is not None:
            # Only consider task proxy requirement if proxy is enabled at system level
            task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
            if not self.enable_proxy and task_config.get("proxy", False):
                logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")

            if task_use_proxy != self.current_use_proxy:
                # keep because get_info_from_website depend on this
                self.current_use_proxy = task_use_proxy

        if self.is_environment_used:
            logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
            self._revert_to_snapshot()
            logger.info("Starting emulator...")
            self._start_emulator()
            logger.info("Emulator started.")
            # Reset the usage flag after reverting
            self.is_environment_used = False
        else:
            logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))

        if task_config is not None:
            if task_config.get("proxy", False) and self.enable_proxy:
                # If using proxy and proxy is enabled, set up the proxy configuration
                self.setup_controller._proxy_setup(self.client_password)
            self._set_task_info(task_config)
            self.setup_controller.reset_cache_dir(self.cache_dir)
            logger.info("Setting up environment...")
            success = self.setup_controller.setup(self.config, task_config.get("proxy", False) and self.enable_proxy)
            if success:
                # Mark environment as used when setup is successfully executed
                if self.config:  # Only mark as used if there were actual setup operations
                    self.is_environment_used = True
                break
            else:
                # Setup failed: log and retry the whole revert+setup cycle.
                logger.error(
                    "Environment setup failed, retrying (%d/%d)...",
                    attempt + 1,
                    MAX_RETRIES,
                )
                time.sleep(5)
        else:
            # No task to set up: a single revert pass is enough.
            break

    logger.info("Environment setup complete.")

    observation = self._get_obs()
    return observation
|
||||
|
||||
def _get_obs(self):
    """Collect the current observation: screenshot, optional a11y tree, optional terminal output, and the instruction."""
    # Screenshot is always fetched; the optional channels are None unless
    # they were requested at construction time.
    screenshot = self.controller.get_screenshot()
    a11y_tree = self.controller.get_accessibility_tree() if self.require_a11y_tree else None
    terminal_out = self.controller.get_terminal_output() if self.require_terminal else None
    return {
        "screenshot": screenshot,
        "accessibility_tree": a11y_tree,
        "terminal": terminal_out,
        "instruction": self.instruction
    }
|
||||
|
||||
@property
def vm_platform(self):
    # Platform string reported by the in-VM server; compared elsewhere
    # against 'Windows' / 'Darwin' / 'Linux'.
    return self.controller.get_vm_platform()
|
||||
|
||||
@property
def vm_screen_size(self):
    # Screen size as reported by the in-VM server.
    return self.controller.get_vm_screen_size()
|
||||
|
||||
def _set_task_info(self, task_config: Dict[str, Any]):
    """Set task info (proxy logic is handled in reset method)"""
    self.task_id: str = task_config["id"]
    self.instruction = task_config["instruction"]
    # Each task gets its own cache directory under the shared base.
    self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id)
    os.makedirs(self.cache_dir, exist_ok=True)
    # A task without a "config" section has no setup steps.
    self.config = task_config.get("config", [])

    self._set_evaluator_info(task_config)
|
||||
|
||||
def _set_evaluator_info(self, task_config: Dict[str, Any]):
    """Set evaluator information from task config"""
    if "evaluator" not in task_config:
        return
    # evaluator dict
    # func -> metric function string, or list of metric function strings
    # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or"
    # result -> result getter config, or list of result getter configs
    # expected (optional) -> expected getter config, or list of expected getter configs
    # options (optional) -> metric options, or list of metric options
    # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length
    # even if one of the metrics does not need expected or options field, it should be included in the list with None
    self.evaluator = task_config["evaluator"]
    # Resolve metric function(s) by name from the metrics module.
    self.metric: Metric = [getattr(metrics, func) for func in self.evaluator["func"]] \
        if isinstance(self.evaluator["func"], list) \
        else getattr(metrics, self.evaluator["func"])
    self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
    # Resolve result getter(s): getters.get_<type>; None placeholder(s) when absent.
    if "result" in self.evaluator and len(self.evaluator["result"]) > 0:
        self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
                                      self.evaluator["result"]] \
            if isinstance(self.evaluator["result"], list) \
            else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
    else:
        self.result_getter = [None] * len(self.metric) \
            if isinstance(self.metric, list) \
            else None

    # Resolve expected getter(s) the same way; entries may be None.
    if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0:
        self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
                                        self.evaluator["expected"]] \
            if isinstance(self.evaluator["expected"], list) \
            else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
    else:
        self.expected_getter = [None] * len(self.metric) \
            if isinstance(self.metric, list) \
            else None
    # Metric options: per-metric dicts (None -> {}), or a single dict, or
    # an all-empty default matching the metric shape.
    self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [opt if opt else {} for opt in
                                                                        self.evaluator["options"]] \
        if isinstance(self.evaluator.get("options", {}), list) \
        else self.evaluator["options"] \
        if "options" in self.evaluator \
        else [{}] * len(self.metric) \
        if isinstance(self.metric, list) \
        else {}

    # When func is a list, all parallel lists must be the same length.
    assert (not isinstance(self.evaluator["func"], list)
            or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len(
                self.metric_options)))
|
||||
|
||||
def step(self, action, pause=2):
    """
    Execute one action in the environment (Gym-style).

    Args:
        action: a special string ('WAIT' | 'FAIL' | 'DONE'), a dict action
            (optionally carrying 'action_type', and 'command' for
            pyautogui-style spaces), or a raw python/pyautogui command string.
        pause (int): seconds slept after executing the action (and for 'WAIT').

    Returns:
        tuple: (observation, reward, done, info); reward is currently
            always 0 and done is only set by FAIL/DONE actions.
    """
    self._step_no += 1
    self.action_history.append(action)

    # Mark environment as used when step is called
    self.is_environment_used = True

    reward = 0  # todo: Define reward calculation for each example
    done = False  # todo: Define episode termination condition for each example
    info = {}
    logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
    # handle the special actions
    # BUGFIX: use .get('action_type') instead of action['action_type'] — a dict
    # action without that key used to raise KeyError here, while every other
    # check in this method (and in the pyautogui branch below) uses .get().
    if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']):
        if action == 'WAIT' or (type(action) == dict and action.get('action_type') == 'WAIT'):
            time.sleep(pause)
        elif action == 'FAIL' or (type(action) == dict and action.get('action_type') == 'FAIL'):
            done = True
            info = {"fail": True}
        elif action == 'DONE' or (type(action) == dict and action.get('action_type') == 'DONE'):
            done = True
            info = {"done": True}

    if self.action_space == "computer_13":
        # the set of all possible actions defined in the action representation
        self.controller.execute_action(action)
    elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
        if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']):
            self.controller.execute_action(action)
        else:
            # the set of all possible python commands insides `pyautogui`
            if type(action) == str:
                # Fix PyAutoGUI '<' character bug before execution
                fixed_command = _fix_pyautogui_less_than_bug(action)
                self.controller.execute_python_command(fixed_command)
            elif type(action) == dict:
                # Fix PyAutoGUI '<' character bug before execution
                fixed_command = _fix_pyautogui_less_than_bug(action['command'])
                self.controller.execute_python_command(fixed_command)

    time.sleep(pause)
    observation = self._get_obs()

    return observation, reward, done, info
|
||||
|
||||
def evaluate(self):
|
||||
"""
|
||||
Evaluate whether the task is successfully completed.
|
||||
"""
|
||||
|
||||
postconfig = self.evaluator.get("postconfig", [])
|
||||
self.setup_controller.setup(postconfig, self.enable_proxy)
|
||||
# Mark environment as used if there were postconfig setup operations
|
||||
if postconfig:
|
||||
self.is_environment_used = True
|
||||
|
||||
if self.evaluator['func'] == "infeasible":
|
||||
if len(self.action_history) > 0:
|
||||
last_action = self.action_history[-1]
|
||||
if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
|
||||
return 1
|
||||
return 0
|
||||
else:
|
||||
if len(self.action_history) > 0:
|
||||
last_action = self.action_history[-1]
|
||||
if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
|
||||
return 0
|
||||
|
||||
if type(self.metric) == list:
|
||||
# Multiple metrics to evaluate whether the task is successfully completed
|
||||
results = []
|
||||
assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same"
|
||||
if "expected" in self.evaluator:
|
||||
assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same"
|
||||
for idx, metric in enumerate(self.metric):
|
||||
try:
|
||||
config = self.evaluator["result"][idx]
|
||||
result_state = self.result_getter[idx](self, config)
|
||||
except FileNotFoundError:
|
||||
logger.error("File not found!")
|
||||
if self.metric_conj == 'and':
|
||||
return 0
|
||||
|
||||
if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]:
|
||||
expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx])
|
||||
metric: int = metric(result_state, expected_state, **self.metric_options[idx])
|
||||
else:
|
||||
metric: int = metric(result_state, **self.metric_options[idx])
|
||||
|
||||
if self.metric_conj == 'and' and float(metric) == 0.0:
|
||||
return 0
|
||||
elif self.metric_conj == 'or' and float(metric) == 1.0:
|
||||
return 1
|
||||
else:
|
||||
results.append(metric)
|
||||
|
||||
return sum(results) / len(results) if self.metric_conj == 'and' else max(results)
|
||||
else:
|
||||
# Single metric to evaluate whether the task is successfully completed
|
||||
try:
|
||||
result_state = self.result_getter(self, self.evaluator["result"])
|
||||
except FileNotFoundError:
|
||||
logger.error("File not found!")
|
||||
return 0
|
||||
|
||||
if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]:
|
||||
expected_state = self.expected_getter(self, self.evaluator["expected"])
|
||||
metric: float = self.metric(result_state, expected_state, **self.metric_options)
|
||||
else:
|
||||
metric: float = self.metric(result_state, **self.metric_options)
|
||||
|
||||
return metric
|
||||
|
||||
def render(self, mode='rgb_array'):
|
||||
if mode == 'rgb_array':
|
||||
return self.controller.get_screenshot()
|
||||
else:
|
||||
raise ValueError('Unsupported render mode: {}'.format(mode))
|
||||
|
|
@ -16,7 +16,6 @@ from .chrome import (
|
|||
get_active_tab_info,
|
||||
get_enable_do_not_track,
|
||||
get_enable_enhanced_safety_browsing,
|
||||
get_enable_safe_browsing,
|
||||
get_new_startup_page,
|
||||
get_find_unpacked_extension_path,
|
||||
get_data_delete_automacally,
|
||||
|
|
|
|||
|
|
@ -827,8 +827,8 @@ def get_active_tab_info(env, config: Dict[str, str]):
|
|||
|
||||
try:
|
||||
logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
|
||||
page.goto(active_tab_url, wait_until='load', timeout=timeout_ms)
|
||||
page.wait_for_load_state('load', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||
page.goto(active_tab_url, wait_until='networkidle', timeout=timeout_ms)
|
||||
page.wait_for_load_state('networkidle', timeout=timeout_ms) # Wait for the 'load' event to complete
|
||||
|
||||
active_tab_info = {
|
||||
'title': page.title(),
|
||||
|
|
@ -1304,40 +1304,6 @@ def get_enable_enhanced_safety_browsing(env, config: Dict[str, str]):
|
|||
return "Google"
|
||||
|
||||
|
||||
def get_enable_safe_browsing(env, config: Dict[str, str]):
|
||||
os_type = env.vm_platform
|
||||
if os_type == 'Windows':
|
||||
preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
|
||||
'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip()
|
||||
elif os_type == 'Darwin':
|
||||
preference_file_path = env.controller.execute_python_command(
|
||||
"import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[
|
||||
'output'].strip()
|
||||
elif os_type == 'Linux':
|
||||
if "arm" in platform.machine():
|
||||
preference_file_path = env.controller.execute_python_command(
|
||||
"import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[
|
||||
'output'].strip()
|
||||
else:
|
||||
preference_file_path = env.controller.execute_python_command(
|
||||
"import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[
|
||||
'output'].strip()
|
||||
|
||||
else:
|
||||
raise Exception('Unsupported operating system')
|
||||
|
||||
try:
|
||||
content = env.controller.get_file(preference_file_path)
|
||||
data = json.loads(content)
|
||||
|
||||
safebrowsing = data.get('safebrowsing', {})
|
||||
is_enhanced = bool(safebrowsing.get('enhanced', False))
|
||||
is_enabled = bool(safebrowsing.get('enabled', False))
|
||||
return "true" if (is_enhanced or is_enabled) else "false"
|
||||
except Exception as e:
|
||||
logger.error(f"Error: {e}")
|
||||
return "false"
|
||||
|
||||
def get_new_startup_page(env, config: Dict[str, str]):
|
||||
os_type = env.vm_platform
|
||||
if os_type == 'Windows':
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ import functools
|
|||
import itertools
|
||||
import logging
|
||||
import os.path
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
# import operator
|
||||
from numbers import Number
|
||||
|
|
@ -746,18 +744,6 @@ def compare_table(result: str, expected: str = None, **options) -> float:
|
|||
# }}} function compare_table #
|
||||
|
||||
|
||||
def _normalize_city_string(value: Any) -> str:
|
||||
"""Lowercase, strip punctuation, and remove accents for tolerant matching."""
|
||||
if value is None:
|
||||
return ""
|
||||
if not isinstance(value, str):
|
||||
value = str(value)
|
||||
normalized = unicodedata.normalize("NFKD", value)
|
||||
normalized = "".join(ch for ch in normalized if not unicodedata.combining(ch))
|
||||
normalized = re.sub(r"[^a-z0-9]+", " ", normalized.lower())
|
||||
return normalized.strip()
|
||||
|
||||
|
||||
def compare_conference_city_in_order(actual_city_list_path, expected_city):
|
||||
expected_city_list = expected_city["expected"]
|
||||
wb = openpyxl.load_workbook(actual_city_list_path)
|
||||
|
|
@ -766,35 +752,38 @@ def compare_conference_city_in_order(actual_city_list_path, expected_city):
|
|||
for row in sheet["C2:C22"]:
|
||||
for cell in row:
|
||||
actual_city_list.append(cell.value)
|
||||
|
||||
# expected_city is the city that we want to compare with the actual city list
|
||||
# must in order index
|
||||
# debug
|
||||
try:
|
||||
for i, actual_city in enumerate(actual_city_list):
|
||||
actual_normalized = _normalize_city_string(actual_city)
|
||||
expected_entry = expected_city_list[i]
|
||||
for i in range(len(actual_city_list)):
|
||||
if isinstance(expected_city_list[i], str):
|
||||
if expected_city_list[i] not in actual_city_list[i]:
|
||||
logger.debug(
|
||||
f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}"
|
||||
)
|
||||
print(
|
||||
f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}"
|
||||
)
|
||||
return 0.0
|
||||
|
||||
elif isinstance(expected_city_list[i], List):
|
||||
if not any(
|
||||
possible_str in actual_city_list[i]
|
||||
for possible_str in expected_city_list[i]
|
||||
):
|
||||
logger.debug(
|
||||
f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}"
|
||||
)
|
||||
print(
|
||||
f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}"
|
||||
)
|
||||
return 0.0
|
||||
|
||||
if isinstance(expected_entry, str):
|
||||
expected_candidates = [expected_entry]
|
||||
elif isinstance(expected_entry, List):
|
||||
expected_candidates = expected_entry
|
||||
else:
|
||||
raise TypeError("Expected city should be a string or a list of strings")
|
||||
|
||||
matched = False
|
||||
for candidate in expected_candidates:
|
||||
normalized_candidate = _normalize_city_string(candidate)
|
||||
if normalized_candidate and normalized_candidate in actual_normalized:
|
||||
matched = True
|
||||
break
|
||||
|
||||
if not matched:
|
||||
logger.debug(
|
||||
f"Expected city {expected_entry}; Actual city {actual_city}"
|
||||
)
|
||||
print(f"Expected city {expected_entry}; Actual city {actual_city}")
|
||||
return 0.0
|
||||
|
||||
except Exception as exc:
|
||||
logger.error(f"Error comparing conference cities: {exc}")
|
||||
except:
|
||||
return 0.0
|
||||
|
||||
return 1.0
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import os
|
|||
|
||||
# Default TTL minutes for instance auto-termination (cloud-side scheduler)
|
||||
# Can be overridden via environment variable DEFAULT_TTL_MINUTES
|
||||
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "180"))
|
||||
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60"))
|
||||
|
||||
# Master switch for TTL feature
|
||||
ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true"
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from desktop_env.providers.aws.config import ENABLE_TTL, DEFAULT_TTL_MINUTES, AW
|
|||
from desktop_env.providers.aws.scheduler_utils import schedule_instance_termination
|
||||
|
||||
|
||||
INSTANCE_TYPE = "t3.xlarge"
|
||||
INSTANCE_TYPE = "t3.medium"
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
|
@ -40,9 +40,9 @@ DEFAULT_REGION = "us-east-1"
|
|||
# todo: public the AMI images
|
||||
IMAGE_ID_MAP = {
|
||||
"us-east-1": {
|
||||
(1920, 1080): "ami-0d23263edb96951d8",
|
||||
# (1920, 1080): "ami-0d23263edb96951d8"
|
||||
# For CoACT-1, uncomment to use the following AMI
|
||||
# (1920, 1080): "ami-0b505e9d0d99ba88c"
|
||||
(1920, 1080): "ami-0b505e9d0d99ba88c"
|
||||
},
|
||||
"ap-east-1": {
|
||||
(1920, 1080): "ami-06850864d18fad836"
|
||||
|
|
|
|||
|
|
@ -108,52 +108,13 @@ class AWSProvider(Provider):
|
|||
# Step 1: Retrieve the original instance details
|
||||
instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm])
|
||||
instance = instance_details['Reservations'][0]['Instances'][0]
|
||||
# Resolve security groups with fallbacks
|
||||
security_groups = [sg['GroupId'] for sg in instance.get('SecurityGroups', []) if 'GroupId' in sg]
|
||||
if not security_groups:
|
||||
env_sg = os.getenv('AWS_SECURITY_GROUP_ID')
|
||||
if env_sg:
|
||||
security_groups = [env_sg]
|
||||
logger.info("SecurityGroups missing on instance; using AWS_SECURITY_GROUP_ID from env")
|
||||
else:
|
||||
raise ValueError("No security groups found on instance and AWS_SECURITY_GROUP_ID not set")
|
||||
|
||||
# Resolve subnet with fallbacks
|
||||
subnet_id = instance.get('SubnetId')
|
||||
if not subnet_id:
|
||||
nis = instance.get('NetworkInterfaces', []) or []
|
||||
if nis and isinstance(nis, list):
|
||||
for ni in nis:
|
||||
if isinstance(ni, dict) and ni.get('SubnetId'):
|
||||
subnet_id = ni.get('SubnetId')
|
||||
break
|
||||
if not subnet_id:
|
||||
env_subnet = os.getenv('AWS_SUBNET_ID')
|
||||
if env_subnet:
|
||||
subnet_id = env_subnet
|
||||
logger.info("SubnetId missing on instance; using AWS_SUBNET_ID from env")
|
||||
else:
|
||||
raise ValueError("SubnetId not available on instance, NetworkInterfaces, or environment")
|
||||
|
||||
# Resolve instance type with fallbacks
|
||||
instance_type = instance.get('InstanceType') or os.getenv('AWS_INSTANCE_TYPE') or 't3.large'
|
||||
if instance.get('InstanceType') is None:
|
||||
logger.info(f"InstanceType missing on instance; using '{instance_type}' from env/default")
|
||||
security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']]
|
||||
subnet_id = instance['SubnetId']
|
||||
instance_type = instance['InstanceType']
|
||||
|
||||
# Step 2: Terminate the old instance (skip if already terminated/shutting-down)
|
||||
state = (instance.get('State') or {}).get('Name')
|
||||
if state in ['shutting-down', 'terminated']:
|
||||
logger.info(f"Old instance {path_to_vm} is already in state '{state}', skipping termination.")
|
||||
else:
|
||||
try:
|
||||
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
|
||||
logger.info(f"Old instance {path_to_vm} has been terminated.")
|
||||
except ClientError as e:
|
||||
error_code = getattr(getattr(e, 'response', {}), 'get', lambda *_: None)('Error', {}).get('Code') if hasattr(e, 'response') else None
|
||||
if error_code in ['InvalidInstanceID.NotFound', 'IncorrectInstanceState']:
|
||||
logger.info(f"Ignore termination error for {path_to_vm}: {error_code}")
|
||||
else:
|
||||
raise
|
||||
# Step 2: Terminate the old instance
|
||||
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
|
||||
logger.info(f"Old instance {path_to_vm} has been terminated.")
|
||||
|
||||
# Step 3: Launch a new instance from the snapshot(AMI) with performance optimization
|
||||
logger.info(f"Launching a new instance from AMI {snapshot_name}...")
|
||||
|
|
|
|||
|
|
@ -29,11 +29,13 @@ UBUNTU_X86_URL = "https://huggingface.co/datasets/xlangai/ubuntu_osworld/resolve
|
|||
WINDOWS_X86_URL = "https://huggingface.co/datasets/xlangai/windows_osworld/resolve/main/Windows-x86.zip"
|
||||
|
||||
# Determine the platform and CPU architecture to decide the correct VM image to download
|
||||
# sometimes the system is 'Darwin' but the machine is x86-based.
|
||||
if platform.machine().lower() in ['amd64', 'x86_64']:
|
||||
URL = UBUNTU_X86_URL
|
||||
elif platform.system() == 'Darwin': # macOS
|
||||
if platform.system() == 'Darwin': # macOS
|
||||
# if os.uname().machine == 'arm64': # Apple Silicon
|
||||
URL = UBUNTU_ARM_URL
|
||||
# else:
|
||||
# url = UBUNTU_X86_URL
|
||||
elif platform.machine().lower() in ['amd64', 'x86_64']:
|
||||
URL = UBUNTU_X86_URL
|
||||
else:
|
||||
raise Exception("Unsupported platform or architecture")
|
||||
|
||||
|
|
@ -123,12 +125,12 @@ def _install_vm(vm_name, vms_dir, downloaded_file_name, os_type, original_vm_nam
|
|||
# Download the virtual machine image
|
||||
logger.info("Downloading the virtual machine image...")
|
||||
downloaded_size = 0
|
||||
# sometimes the system is 'Darwin' but the machine is x86-based.
|
||||
|
||||
if os_type == "Ubuntu":
|
||||
if platform.machine().lower() in ['amd64', 'x86_64']:
|
||||
URL = UBUNTU_X86_URL
|
||||
elif platform.system() == 'Darwin':
|
||||
if platform.system() == 'Darwin':
|
||||
URL = UBUNTU_ARM_URL
|
||||
elif platform.machine().lower() in ['amd64', 'x86_64']:
|
||||
URL = UBUNTU_X86_URL
|
||||
elif os_type == "Windows":
|
||||
if platform.machine().lower() in ['amd64', 'x86_64']:
|
||||
URL = WINDOWS_X86_URL
|
||||
|
|
|
|||
|
|
@ -1370,7 +1370,7 @@ def open_file():
|
|||
if window_found:
|
||||
return "File opened and window activated successfully"
|
||||
else:
|
||||
return f"Failed to find window for {file_name} within {TIMEOUT} seconds.", 500
|
||||
return f"Failed to find window for {file_name} within {timeout} seconds.", 500
|
||||
|
||||
except Exception as e:
|
||||
return f"Failed to open {path}. Error: {e}", 500
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "35253b65-1c19-4304-8aa4-6884b8218fc0",
|
||||
"snapshot": "chrome",
|
||||
"instruction": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me using Chrome's built-in feature?",
|
||||
"instruction": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
|
||||
"source": "https://www.laptopmag.com/articles/how-to-create-desktop-shortcuts-for-web-pages-using-chrome",
|
||||
"config": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -4,20 +4,6 @@
|
|||
"instruction": "I want Chrome to warn me whenever I visit a potentially harmful or unsafe website. Can you enable this safety feature?",
|
||||
"source": "https://www.quora.com/How-do-I-set-the-security-settings-for-the-Google-Chrome-browser-for-the-best-security#:~:text=Enable%20Safe%20Browsing:%20Chrome%20has%20a%20built%2Din,Security%20%3E%20Security%20%3E%20Enable%20Safe%20Browsing.",
|
||||
"config": [
|
||||
{
|
||||
"type": "execute",
|
||||
"parameters": {
|
||||
"command": "echo {CLIENT_PASSWORD} | sudo -S apt update -y && echo {CLIENT_PASSWORD} | sudo -S apt install jq -y",
|
||||
"shell": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "execute",
|
||||
"parameters": {
|
||||
"command": "mkdir -p /home/user/.config/google-chrome/Default && if [ ! -f /home/user/.config/google-chrome/Default/Preferences ]; then echo '{}' > /home/user/.config/google-chrome/Default/Preferences; fi && cd /home/user/.config/google-chrome/Default && jq '. + {\"safebrowsing\":{\"enabled\":false,\"enhanced\":false}}' Preferences > temp && mv temp Preferences",
|
||||
"shell": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "launch",
|
||||
"parameters": {
|
||||
|
|
@ -71,7 +57,7 @@
|
|||
],
|
||||
"func": "exact_match",
|
||||
"result": {
|
||||
"type": "enable_safe_browsing"
|
||||
"type": "enable_enhanced_safety_browsing"
|
||||
},
|
||||
"expected": {
|
||||
"type": "rule",
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@
|
|||
"chrome"
|
||||
],
|
||||
"evaluator": {
|
||||
"func": "is_expected_url_pattern_match",
|
||||
"func": "is_expected_active_tab",
|
||||
"result": {
|
||||
"type": "active_url_from_accessTree",
|
||||
"goto_prefix": "https://www."
|
||||
|
|
@ -51,9 +51,8 @@
|
|||
"expected": {
|
||||
"type": "rule",
|
||||
"rules": {
|
||||
"expected": [
|
||||
"^https://(www\\.)?dmv\\.virginia\\.gov/licenses-ids/license/applying/eligibility"
|
||||
]
|
||||
"type": "url",
|
||||
"url": "https://www.dmv.virginia.gov/licenses-ids/license/applying/eligibility"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
|
|
|||
|
|
@ -44,35 +44,37 @@
|
|||
],
|
||||
"evaluator": {
|
||||
"func": [
|
||||
"is_expected_url_pattern_match",
|
||||
"is_expected_url_pattern_match"
|
||||
"exact_match",
|
||||
"exact_match"
|
||||
],
|
||||
"conj": "or",
|
||||
"result": [
|
||||
{
|
||||
"type": "active_url_from_accessTree",
|
||||
"goto_prefix": "https://www."
|
||||
"type": "url_dashPart",
|
||||
"goto_prefix": "https://www.",
|
||||
"partIndex": -1,
|
||||
"needDeleteId": false,
|
||||
"returnType": "string"
|
||||
},
|
||||
{
|
||||
"type": "active_url_from_accessTree",
|
||||
"goto_prefix": "https://www."
|
||||
"type": "url_dashPart",
|
||||
"goto_prefix": "https://www.",
|
||||
"partIndex": -1,
|
||||
"needDeleteId": false,
|
||||
"returnType": "string"
|
||||
}
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"type": "rule",
|
||||
"rules": {
|
||||
"expected": [
|
||||
"^https://(www\\.)?drugs\\.com/tamiflu\\.html#side-effects"
|
||||
]
|
||||
"expected": "tamiflu.html#side-effects"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "rule",
|
||||
"rules": {
|
||||
"expected": [
|
||||
"^https://(www\\.)?drugs\\.com/sfx/tamiflu-side-effects\\.html"
|
||||
]
|
||||
"expected": "tamiflu-side-effects.html"
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@
|
|||
"type": "rule",
|
||||
"rules": {
|
||||
"expected": [
|
||||
"united\\.com/en/us/checked-bag-fee-calculator(/.*)?"
|
||||
"united.com/en/us/checked-bag-fee-calculator"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,8 +82,8 @@
|
|||
],
|
||||
"func": "check_palette_and_structure_sim",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/06ca5602-62ca-47f6-ad4f-da151cde54cc/computer.png",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/computer.png",
|
||||
"dest": "computer.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -11,6 +11,10 @@
|
|||
{
|
||||
"url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/2a729ded-3296-423d-aec4-7dd55ed5fbb3/dog_with_background.png",
|
||||
"path": "/home/user/Desktop/dog_with_background.png"
|
||||
},
|
||||
{
|
||||
"url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/2a729ded-3296-423d-aec4-7dd55ed5fbb3/dog_cutout_gold.png",
|
||||
"path": "/home/user/Desktop/dog_cutout_gold.png"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -82,8 +86,8 @@
|
|||
],
|
||||
"func": "check_structure_sim",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/2a729ded-3296-423d-aec4-7dd55ed5fbb3/dog_cutout_gold.png",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/dog_cutout_gold.png",
|
||||
"dest": "dog_cutout_gold.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -82,8 +82,8 @@
|
|||
],
|
||||
"func": "check_saturation_increase_and_structure_sim",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/554785e9-4523-4e7a-b8e1-8016f565f56a/woman_sitting_by_the_tree2.png",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/woman_sitting_by_the_tree2.png",
|
||||
"dest": "woman_sitting_by_the_tree2.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -88,8 +88,8 @@
|
|||
],
|
||||
"func": "check_image_mirror",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/72f83cdc-bf76-4531-9a1b-eb893a13f8aa/berry.jpeg",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/berry.png",
|
||||
"dest": "berry.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -86,8 +86,8 @@
|
|||
],
|
||||
"func": "check_green_background",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/734d6579-c07d-47a8-9ae2-13339795476b/white_background_with_object.png",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/white_background_with_object.png",
|
||||
"dest": "white_background_with_object.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@
|
|||
"evaluator": {
|
||||
"func": "check_file_exists_and_structure_sim",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4/The_Lost_River_Of_Dreams.jpg",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/The_Lost_River_Of_Dreams.jpg",
|
||||
"dest": "The_Lost_River_Of_Dreams.jpg"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -82,8 +82,8 @@
|
|||
],
|
||||
"func": "check_brightness_decrease_and_structure_sim",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/7a4deb26-d57d-4ea9-9a73-630f66a7b568/woman_sitting_by_the_tree.png",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/woman_sitting_by_the_tree.png",
|
||||
"dest": "woman_sitting_by_the_tree.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -104,8 +104,8 @@
|
|||
}
|
||||
},
|
||||
{
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/d16c99dc-2a1e-46f2-b350-d97c86c85c15/dog_with_background.png",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/dog_with_background.png",
|
||||
"dest": "dog_with_background.png"
|
||||
}
|
||||
],
|
||||
|
|
|
|||
|
|
@ -88,8 +88,8 @@
|
|||
],
|
||||
"func": "check_contrast_increase_and_structure_sim",
|
||||
"expected": {
|
||||
"type": "cloud_file",
|
||||
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/f723c744-e62c-4ae6-98d1-750d3cd7d79d/file_1X42_kOanL74vu_p6QdcZuiyzDQi3kA7F.jpg",
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/berries.png",
|
||||
"dest": "berries.png"
|
||||
},
|
||||
"result": {
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "01b269ae-2111-4a07-81fd-3fcd711993b0",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Fill all the blank cells in B1:E30 with the value in the cell above it. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Fill all the blank cells in B1:E30 with the value in the cell above it.",
|
||||
"source": "https://www.youtube.com/shorts/VrUzPTIwQ04",
|
||||
"config": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "0bf05a7d-b28b-44d2-955a-50b41e24012a",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "I would like to copy all the numbers in the 'Old ID' column to the 'New 7 Digit Id' column, and pad them with zeros in front, to fill them up to seven digits. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "I would like to copy all the numbers in the 'Old ID' column to the 'New 7 Digit Id' column, and pad them with zeros in front, to fill them up to seven digits.",
|
||||
"source": "https://www.youtube.com/shorts/FPAQaDTS8VY",
|
||||
"config": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "357ef137-7eeb-4c80-a3bb-0951f26a8aff",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "I have calculated the total work hours from the everday hours. And I have an hourly rate. Now I want to multiply the total hours with the hourly rate to get a total earned amount. However, I can't get a correct answer by directly multiply the two cells. Here the \"total hours\" is of time and \"hourly rate\" is just a number. How can I get the correct product of them? Help me fill in the cell the correct answer. Don't touch irrelevant blank regions.",
|
||||
"instruction": "I have calculated the total work hours from the everday hours. And I have an hourly rate. Now I want to multiply the total hours with the hourly rate to get a total earned amount. However, I can't get a correct answer by directly multiply the two cells. Here the \"total hours\" is of time and \"hourly rate\" is just a number. How can I get the correct product of them?",
|
||||
"source": "https://www.reddit.com/r/excel/comments/17zny8u/calculating_total_amount_earned_from_total_hours/",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -83,4 +83,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "37608790-6147-45d0-9f20-1137bb35703d",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "The information are mixed in one field. Help me split them and fill in the columns of First Name, Last Name and Rank. Finish the work and don't touch the original data.",
|
||||
"instruction": "The information are mixed in one field. Help me split them and fill in the columns of First Name, Last Name and Rank",
|
||||
"source": "https://www.youtube.com/shorts/uzPo_CPCHH8",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -82,4 +82,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "4e6fcf72-daf3-439f-a232-c434ce416af6",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Please calculate the ages of the employees according to their birthday. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Please calculate the ages of the employees according to their birthday.",
|
||||
"source": "https://www.youtube.com/shorts/0uxJccNCKcE",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -134,4 +134,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "4f07fbe9-70de-4927-a4d5-bb28bc12c52c",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Here I want to use the numerical value from a cell in the text. I can set its number of decimal digits to 2 in the original value cell but don't know how to fix it in the text as well. Please help me to do this. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Here I want to use the numerical value from a cell in the text. I can set its number of decimal digits to 2 in the original value cell but don't know how to fix it in the text as well. Please help me to do this.",
|
||||
"source": "https://superuser.com/questions/1081048/libreoffice-calc-how-to-pad-number-to-fixed-decimals-when-used-within-formula",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -115,4 +115,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "6054afcb-5bab-4702-90a0-b259b5d3217c",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Some data are missed by now and are filled by 'N/A' temporarily. Please hide them in the table for now. Do not delete any cells and filter is not needed.",
|
||||
"instruction": "Some data are missed by now and are filled by 'N/A' temporarily. Please hide them in the table for now. Do not delete them and filter is no needed.",
|
||||
"source": "https://www.youtube.com/shorts/JTbZ8sRxkdU",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -90,4 +90,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "7a4e4bc8-922c-4c84-865c-25ba34136be1",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Reorder the columns to be \"Date\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\". Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Reorder the columns to be \"Date\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
|
||||
"source": "https://www.youtube.com/shorts/bvUhr1AHs44",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -82,4 +82,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "7efeb4b1-3d19-4762-b163-63328d66303b",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Fill the Sequence Numbers as \"No. #\" in the \"Seq No.\" column. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Fill the Sequence Numbers as \"No. #\" in the \"Seq No.\" column",
|
||||
"source": "https://www.youtube.com/shorts/4jzXfZNhfmk",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -82,4 +82,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Given a partial calendar, please highlight all the weekends (Satureday & Sunday) by setting the cell background as red (#ff0000). Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Given a partial calendar, please highlight all the weekends (Satureday & Sunday) by setting the cell background as red (#ff0000).",
|
||||
"source": "https://www.youtube.com/shorts/Hbcwu6IQ1ns",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -90,4 +90,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "a9f325aa-8c05-4e4f-8341-9e4358565f4f",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "I want to copy the movie titles in 'Garbage Movie Titles' column to the 'Clean Movie Titles' column. But please remove the adundant whitespaces and canonicalize the letter cases by capitalizing the first letter of each words and leave other letters as lower case. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "I want to copy the movie titles in 'Garbage Movie Titles' column to the 'Clean Movie Titles' column. But please remove the adundant whitespaces and canonicalize the letter cases by capitalizing the first letter of each words and leave other letters as lower case.",
|
||||
"source": "https://www.youtube.com/shorts/A0gmEBRKXWs",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -82,4 +82,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "abed40dc-063f-4598-8ba5-9fe749c0615d",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Check the names in column \"Names with duplicates\" and put the unique ones in column \"Unique Names\". Keep the original order of the first occurrences. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "Check the names in column \"Names with duplicates\" and put the unique ones in column \"Unique Names\". Keep the original order of the first occurrences.",
|
||||
"source": "https://help.libreoffice.org/7.6/ro/text/scalc/guide/remove_duplicates.html?&DbPAR=SHARED&System=UNIX",
|
||||
"config": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "d681960f-7bc3-4286-9913-a8812ba3261a",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "According to the scale table shown above, calculate and give each student a grade in the table below. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "According to the scale table shown above, calculate and give each student a grade in the table below",
|
||||
"source": "https://www.youtube.com/shorts/d7U1S_IsTVM",
|
||||
"config": [
|
||||
{
|
||||
|
|
@ -82,4 +82,4 @@
|
|||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "ecb0df7a-4e8d-4a03-b162-053391d3afaf",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "In the column \"Pass/Fail/Held\", one from the texts \"Pass\", \"Fail\", and \"Held\" should be filled. For convinience, enable data validation for the cells in this column so that the texts to fill can be directly selected from a drop down list. Finish the work and don't touch irrelevant regions, even if they are blank.",
|
||||
"instruction": "In the column \"Pass/Fail/Held\", one from the texts \"Pass\", \"Fail\", and \"Held\" should be filled. For convinience, enable data validation for the cells in this column so that the texts to fill can be directly selected from a drop down list.",
|
||||
"source": "https://www.youtube.com/shorts/tXOovKn0H68",
|
||||
"config": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@
|
|||
"rules": {
|
||||
"expected": [
|
||||
"Zoom Chrome Extension",
|
||||
"Speechify — Voice AI Assistant",
|
||||
"Speechify Text to Speech Voice Reader",
|
||||
"React Developer Tools",
|
||||
"Momentum",
|
||||
"Google Translate"
|
||||
|
|
|
|||
|
|
@ -40,8 +40,8 @@
|
|||
},
|
||||
"result": {
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/essay_submission.zip",
|
||||
"dest": "essay_submission.zip"
|
||||
"path": "/home/user/Recruitment_and_retention_of_health_professionals_across_Europe.zip",
|
||||
"dest": "Recruitment_and_retention_of_health_professionals_across_Europe.zip"
|
||||
}
|
||||
},
|
||||
"proxy": false,
|
||||
|
|
|
|||
|
|
@ -1,135 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Thread-safe results logging for OSWorld evaluations.
|
||||
Appends task completion results to results.json in real-time.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import fcntl
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
def extract_domain_from_path(result_path: str) -> str:
|
||||
"""
|
||||
Extract domain/application from result directory path.
|
||||
Expected structure: results/{action_space}/{observation_type}/{model}/{domain}/{task_id}/
|
||||
"""
|
||||
path_parts = Path(result_path).parts
|
||||
if len(path_parts) >= 2:
|
||||
return path_parts[-2] # Second to last part should be domain
|
||||
return "unknown"
|
||||
|
||||
|
||||
def append_task_result(
|
||||
task_id: str,
|
||||
domain: str,
|
||||
score: float,
|
||||
result_dir: str,
|
||||
args: Any,
|
||||
error_message: Optional[str] = None
|
||||
) -> None:
|
||||
"""
|
||||
Thread-safely append a task result to results.json.
|
||||
|
||||
Args:
|
||||
task_id: UUID of the task
|
||||
domain: Application domain (chrome, vlc, etc.)
|
||||
score: Task score (0.0 or 1.0)
|
||||
result_dir: Full path to the task result directory
|
||||
args: Command line arguments object
|
||||
error_message: Error message if task failed
|
||||
"""
|
||||
# Create result entry
|
||||
result_entry = {
|
||||
"application": domain,
|
||||
"task_id": task_id,
|
||||
"status": "error" if error_message else "success",
|
||||
"score": score,
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
}
|
||||
|
||||
if error_message:
|
||||
result_entry["err_message"] = error_message
|
||||
|
||||
# Determine summary directory and results file path
|
||||
# Extract base result directory from args
|
||||
base_result_dir = Path(args.result_dir)
|
||||
summary_dir = base_result_dir / "summary"
|
||||
results_file = summary_dir / "results.json"
|
||||
|
||||
# Ensure summary directory exists
|
||||
summary_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Thread-safe JSON append with file locking
|
||||
try:
|
||||
with open(results_file, 'a+') as f:
|
||||
# Lock the file for exclusive access
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
|
||||
|
||||
try:
|
||||
# Move to beginning to read existing content
|
||||
f.seek(0)
|
||||
content = f.read().strip()
|
||||
|
||||
# Parse existing JSON array or create new one
|
||||
if content:
|
||||
try:
|
||||
existing_results = json.loads(content)
|
||||
if not isinstance(existing_results, list):
|
||||
existing_results = []
|
||||
except json.JSONDecodeError:
|
||||
existing_results = []
|
||||
else:
|
||||
existing_results = []
|
||||
|
||||
# Add new result
|
||||
existing_results.append(result_entry)
|
||||
|
||||
# Write back the complete JSON array
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
json.dump(existing_results, f, indent=2)
|
||||
f.write('\n') # Add newline for readability
|
||||
|
||||
finally:
|
||||
# Always unlock the file
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_UN)
|
||||
|
||||
print(f"📝 Logged result: {domain}/{task_id} -> {result_entry['status']} (score: {score})")
|
||||
|
||||
except Exception as e:
|
||||
# Don't let logging errors break the main evaluation
|
||||
print(f"⚠️ Failed to log result for {task_id}: {e}")
|
||||
|
||||
|
||||
def log_task_completion(example: Dict, result: float, result_dir: str, args: Any) -> None:
|
||||
"""
|
||||
Convenience wrapper for logging successful task completion.
|
||||
|
||||
Args:
|
||||
example: Task configuration dictionary
|
||||
result: Task score
|
||||
result_dir: Path to task result directory
|
||||
args: Command line arguments
|
||||
"""
|
||||
task_id = example.get('id', 'unknown')
|
||||
domain = extract_domain_from_path(result_dir)
|
||||
append_task_result(task_id, domain, result, result_dir, args)
|
||||
|
||||
|
||||
def log_task_error(example: Dict, error_msg: str, result_dir: str, args: Any) -> None:
|
||||
"""
|
||||
Convenience wrapper for logging task errors.
|
||||
|
||||
Args:
|
||||
example: Task configuration dictionary
|
||||
error_msg: Error message
|
||||
result_dir: Path to task result directory
|
||||
args: Command line arguments
|
||||
"""
|
||||
task_id = example.get('id', 'unknown')
|
||||
domain = extract_domain_from_path(result_dir)
|
||||
append_task_result(task_id, domain, 0.0, result_dir, args, error_msg)
|
||||
|
|
@ -4,22 +4,18 @@ import logging
|
|||
import os
|
||||
import time
|
||||
from wrapt_timeout_decorator import *
|
||||
from lib_results_logger import log_task_completion
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
|
||||
def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
|
||||
# Reset environment first to get fresh VM IP
|
||||
env.reset(task_config=example)
|
||||
|
||||
# Reset agent with fresh VM IP (for snapshot reverts)
|
||||
try:
|
||||
agent.reset(runtime_logger, vm_ip=env.vm_ip)
|
||||
agent.reset(runtime_logger)
|
||||
except Exception as e:
|
||||
agent.reset(vm_ip=env.vm_ip)
|
||||
agent.reset()
|
||||
|
||||
env.reset(task_config=example)
|
||||
|
||||
time.sleep(60) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
|
|
@ -33,7 +29,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
|||
)
|
||||
for action in actions:
|
||||
# Capture the timestamp before executing the action
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S%f")
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
|
||||
|
|
@ -59,16 +55,11 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
|||
logger.info("The episode is done.")
|
||||
break
|
||||
step_idx += 1
|
||||
time.sleep(20) # Wait for the environment to settle
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
|
||||
# Log task completion to results.json
|
||||
log_task_completion(example, result, example_result_dir, args)
|
||||
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
|
||||
|
||||
|
|
@ -105,67 +96,6 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
|
|||
|
||||
|
||||
|
||||
def run_single_example_agi(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
agent.reset(runtime_logger)
|
||||
env.reset(task_config=example)
|
||||
time.sleep(60) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
done = False
|
||||
step_idx = 0
|
||||
env.controller.start_recording()
|
||||
while not done and step_idx < max_steps:
|
||||
response, actions = agent.predict(
|
||||
instruction,
|
||||
obs
|
||||
)
|
||||
|
||||
done = not response.get('state_correct', False)
|
||||
|
||||
for action in actions:
|
||||
# Capture the timestamp before executing the action
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
obs, reward, done, info, step_info = agent.step(action)
|
||||
|
||||
if not done:
|
||||
if not response.get('state_correct', False):
|
||||
done = True
|
||||
|
||||
logger.info("Reward: %.2f", reward)
|
||||
logger.info("Done: %s", done)
|
||||
# Save screenshot and trajectory information
|
||||
with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
|
||||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
|
||||
# Remove pending checks if they exist which will cause issues with json serialization
|
||||
if action.get('pending_checks', None):
|
||||
del action['pending_checks']
|
||||
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({
|
||||
"step_num": step_idx + 1,
|
||||
"action_timestamp": action_timestamp,
|
||||
"action": action,
|
||||
"reward": reward,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
|
||||
}))
|
||||
f.write("\n")
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
step_idx += 1
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
|
||||
|
||||
def run_single_example_openaicua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
agent.reset(runtime_logger)
|
||||
|
|
@ -256,25 +186,23 @@ def run_single_example_opencua(agent, env, example, max_steps, instruction, args
|
|||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({
|
||||
"step_num": step_idx + 1,
|
||||
"action": action,
|
||||
"natural_language_action": info_dict.get("action"),
|
||||
"action_timestamp": action_timestamp,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"reward": reward,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
|
||||
}, ensure_ascii=False))
|
||||
}))
|
||||
f.write("\n")
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
step_idx += 1
|
||||
|
||||
time.sleep(20) # Wait for the environment to settle
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
|
|
@ -325,321 +253,17 @@ def run_single_example_autoglm(agent, env, example, max_steps, instruction, args
|
|||
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
|
||||
}))
|
||||
f.write("\n")
|
||||
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
|
||||
# Invalid Action
|
||||
if not actions:
|
||||
obs = env._get_obs() # update observation
|
||||
if not done: # not completed the task yet
|
||||
env.action_history.append('FAIL')
|
||||
|
||||
step_idx += 1
|
||||
|
||||
if not done: # not completed the task yet
|
||||
env.action_history.append('FAIL')
|
||||
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
|
||||
def run_single_example_mano(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
agent.reset(runtime_logger)
|
||||
env.reset(task_config=example)
|
||||
time.sleep(60) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
done = False
|
||||
step_idx = 0
|
||||
env.controller.start_recording()
|
||||
|
||||
with open(os.path.join(example_result_dir, f"step_0.png"),
|
||||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
while not done and step_idx < max_steps:
|
||||
response, actions = agent.predict(
|
||||
instruction,
|
||||
obs
|
||||
)
|
||||
if len(actions) > 1:
|
||||
if (("pyautogui.hotkey('shift')" in actions[0] or "pyautogui.hotkey('ctrl')" in actions[0])
|
||||
and "pyautogui.click" in actions[1]):
|
||||
hotkey_type = 'shift' if "shift" in actions[0] else 'ctrl'
|
||||
action = f"pyautogui.keyDown('{hotkey_type}')\n{actions[1]}\npyautogui.keyUp('{hotkey_type}')"
|
||||
actions = [action]
|
||||
|
||||
for action in actions:
|
||||
# Capture the timestamp before executing the action
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
|
||||
logger.info("Reward: %.2f", reward)
|
||||
logger.info("Done: %s", done)
|
||||
# Save screenshot and trajectory information
|
||||
with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
|
||||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({
|
||||
"step_num": step_idx + 1,
|
||||
"action_timestamp": action_timestamp,
|
||||
"action": action,
|
||||
"reward": reward,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png",
|
||||
"response":response
|
||||
}))
|
||||
f.write("\n")
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
step_idx += 1
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
|
||||
def run_single_example_uipath(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
try:
|
||||
agent.reset(runtime_logger)
|
||||
except Exception as e:
|
||||
agent.reset()
|
||||
|
||||
env.reset(task_config=example)
|
||||
|
||||
time.sleep(60) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
done = False
|
||||
step_idx = 0
|
||||
env.controller.start_recording()
|
||||
while not done and step_idx < max_steps:
|
||||
response, actions = agent.predict(
|
||||
instruction,
|
||||
obs,
|
||||
args,
|
||||
step_idx
|
||||
)
|
||||
for action in actions:
|
||||
# Capture the timestamp before executing the action
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
|
||||
logger.info("Reward: %.2f", reward)
|
||||
logger.info("Done: %s", done)
|
||||
# Save screenshot and trajectory information
|
||||
with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
|
||||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({
|
||||
"step_num": step_idx + 1,
|
||||
"action_timestamp": action_timestamp,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"reward": reward,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
|
||||
}))
|
||||
f.write("\n")
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
step_idx += 1
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
|
||||
|
||||
from mm_agents.os_symphony.utils.common_utils import draw_coordinates
|
||||
from mm_agents.os_symphony.utils.process_context import set_current_result_dir
|
||||
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
def run_single_example_os_symphony(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
set_current_result_dir(example_result_dir)
|
||||
|
||||
agent.reset(result_dir=example_result_dir)
|
||||
env.reset(task_config=example)
|
||||
time.sleep(30) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
done = False
|
||||
step_idx = 0
|
||||
# env.controller.start_recording()
|
||||
start_time = time.time()
|
||||
|
||||
while not done and step_idx < max_steps:
|
||||
response, actions = agent.predict(
|
||||
instruction,
|
||||
obs,
|
||||
step_idx == max_steps - 1
|
||||
)
|
||||
for action in actions:
|
||||
# Save screenshot and trajectory information
|
||||
if "reflection" in response and response["reflection"].get("is_milestone"):
|
||||
img_name = f"step_{step_idx + 1}_milestone.png"
|
||||
else:
|
||||
img_name = f"step_{step_idx + 1}.png"
|
||||
|
||||
with open(os.path.join(example_result_dir, img_name),
|
||||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
if "coordinates" in response and response["coordinates"]:
|
||||
draw_coordinates(
|
||||
image_bytes=obs['screenshot'],
|
||||
coordinates=response["coordinates"],
|
||||
save_path=os.path.join(example_result_dir, img_name[:-4] + "_draw.png")
|
||||
)
|
||||
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
logger.info("Done: %s", done)
|
||||
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
|
||||
f.write(json.dumps({
|
||||
"instruction": instruction,
|
||||
"step_num": step_idx + 1,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": img_name
|
||||
}))
|
||||
f.write("\n")
|
||||
with open(os.path.join(example_result_dir, f"traj_{step_idx+1}.json"), "w", encoding="utf-8") as f:
|
||||
json.dump({
|
||||
"step_num": step_idx + 1,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": img_name
|
||||
}, f, indent=4, ensure_ascii=False)
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
time.sleep(60)
|
||||
break
|
||||
step_idx += 1
|
||||
end_time = time.time()
|
||||
result = float(env.evaluate())
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
|
||||
with open(os.path.join(example_result_dir, "time.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{end_time-start_time:.2f}\n")
|
||||
|
||||
|
||||
def run_single_example_evocua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
"""
|
||||
Unified run function for EvoCUAAgent (supporting both S1 and S2 modes).
|
||||
"""
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
|
||||
# Reset Environment
|
||||
env.reset(task_config=example)
|
||||
|
||||
# Reset Agent
|
||||
# Handle agent reset signature differences if any
|
||||
try:
|
||||
agent.reset(runtime_logger, vm_ip=env.vm_ip)
|
||||
except Exception:
|
||||
try:
|
||||
agent.reset(runtime_logger)
|
||||
except Exception:
|
||||
agent.reset()
|
||||
|
||||
time.sleep(60) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
done = False
|
||||
step_idx = 0
|
||||
|
||||
env.controller.start_recording()
|
||||
while not done and step_idx < max_steps:
|
||||
# EvoCUAAgent.predict unified signature: returns (response, actions)
|
||||
# It handles both modes internally.
|
||||
predict_res = agent.predict(instruction, obs)
|
||||
|
||||
# Check return signature logic
|
||||
if len(predict_res) == 3:
|
||||
# Compatibility with S1 original signature if agent was updated to match
|
||||
response, actions, info_dict = predict_res
|
||||
else:
|
||||
response, actions = predict_res
|
||||
info_dict = {}
|
||||
|
||||
logger.info(f"Step {step_idx + 1} Actions: {actions}")
|
||||
|
||||
# Break if no actions (fail-safe)
|
||||
if not actions or (len(actions) == 1 and (actions[0] == "" or "error" in actions[0].lower())):
|
||||
# Allow "FAIL" or "DONE" to process through execution loop if agent outputs them as actions
|
||||
if not (actions and actions[0] in ["FAIL", "DONE"]):
|
||||
logger.warning("No valid actions returned. Breaking loop.")
|
||||
break
|
||||
|
||||
for action in actions:
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S%f")
|
||||
logger.info("Executing action: %s", action)
|
||||
|
||||
# Execute
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
|
||||
logger.info("Reward: %.2f", reward)
|
||||
logger.info("Done: %s", done)
|
||||
|
||||
# Save screenshot
|
||||
screenshot_file = f"step_{step_idx + 1}_{action_timestamp}.png"
|
||||
with open(os.path.join(example_result_dir, screenshot_file), "wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
|
||||
# Log Trajectory
|
||||
log_entry = {
|
||||
"step_num": step_idx + 1,
|
||||
"action_timestamp": action_timestamp,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"reward": reward,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": screenshot_file
|
||||
}
|
||||
# Add natural language info if available (S1 style)
|
||||
if info_dict:
|
||||
log_entry["natural_language_action"] = info_dict.get("action")
|
||||
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
|
||||
f.write(json.dumps(log_entry, ensure_ascii=False))
|
||||
f.write("\n")
|
||||
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
|
||||
step_idx += 1
|
||||
|
||||
time.sleep(20) # Wait for environment to settle
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
|
||||
log_task_completion(example, result, example_result_dir, args)
|
||||
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
|
||||
|
|
|
|||
|
|
@ -1,84 +0,0 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from wrapt_timeout_decorator import *
|
||||
from mm_agents.os_symphony.utils.common_utils import draw_coordinates
|
||||
from mm_agents.os_symphony.utils.process_context import set_current_result_dir
|
||||
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
|
||||
set_current_result_dir(example_result_dir)
|
||||
|
||||
agent.reset(result_dir=example_result_dir)
|
||||
env.reset(task_config=example)
|
||||
time.sleep(30) # Wait for the environment to be ready
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
done = False
|
||||
step_idx = 0
|
||||
# env.controller.start_recording()
|
||||
start_time = time.time()
|
||||
|
||||
while not done and step_idx < max_steps:
|
||||
response, actions = agent.predict(
|
||||
instruction,
|
||||
obs,
|
||||
step_idx == max_steps - 1
|
||||
)
|
||||
for action in actions:
|
||||
# Save screenshot and trajectory information
|
||||
if "reflection" in response and response["reflection"].get("is_milestone"):
|
||||
img_name = f"step_{step_idx + 1}_milestone.png"
|
||||
else:
|
||||
img_name = f"step_{step_idx + 1}.png"
|
||||
|
||||
with open(os.path.join(example_result_dir, img_name),
|
||||
"wb") as _f:
|
||||
_f.write(obs['screenshot'])
|
||||
if "coordinates" in response and response["coordinates"]:
|
||||
draw_coordinates(
|
||||
image_bytes=obs['screenshot'],
|
||||
coordinates=response["coordinates"],
|
||||
save_path=os.path.join(example_result_dir, img_name[:-4] + "_draw.png")
|
||||
)
|
||||
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
logger.info("Done: %s", done)
|
||||
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
|
||||
f.write(json.dumps({
|
||||
"instruction": instruction,
|
||||
"step_num": step_idx + 1,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": img_name
|
||||
}))
|
||||
f.write("\n")
|
||||
with open(os.path.join(example_result_dir, f"traj_{step_idx+1}.json"), "w", encoding="utf-8") as f:
|
||||
json.dump({
|
||||
"step_num": step_idx + 1,
|
||||
"action": action,
|
||||
"response": response,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": img_name
|
||||
}, f, indent=4, ensure_ascii=False)
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
time.sleep(60)
|
||||
break
|
||||
step_idx += 1
|
||||
end_time = time.time()
|
||||
result = float(env.evaluate())
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
|
||||
with open(os.path.join(example_result_dir, "time.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{end_time-start_time:.2f}\n")
|
||||
|
|
@ -1134,12 +1134,10 @@ class PromptAgent:
|
|||
|
||||
return actions
|
||||
|
||||
def reset(self, _logger=None, vm_ip=None, **kwargs):
|
||||
def reset(self, _logger=None):
|
||||
global logger
|
||||
logger = _logger if _logger is not None else logging.getLogger("desktopenv.agent")
|
||||
|
||||
self.vm_ip = vm_ip
|
||||
|
||||
self.thoughts = []
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.observations = []
|
||||
|
|
|
|||
|
|
@ -1,219 +0,0 @@
|
|||
import base64
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Tuple, Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class Timer:
|
||||
"""Context manager for timing code blocks."""
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.time()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.duration = time.time() - self.start
|
||||
|
||||
|
||||
class AGIAgent:
    """Agent that communicates with your private AGI server for decision-making.

    The agent forwards base64-encoded screenshots to a remote HTTP service,
    which returns the next action string. Per-task state lives on the server
    in a session that is created lazily on the first `predict()` call.
    """

    def __init__(
        self,
        env,
        server_url: str = "https://your-private-agi-endpoint",  # Contact the authors for access to a private deployment endpoint.
        platform: str = "ubuntu",
        action_space: str = "pyautogui",
        observation_type: str = "screenshot",
        max_trajectory_length: int = 100,
        client_password: str = "",
        provider_name: str = "aws",
        screen_width: int = 1920,
        screen_height: int = 1080,
        timeout: int = 1800,
    ):
        """Initialize the AGI client.

        Args:
            env: The desktop environment
            server_url: URL of your private AGI server (trailing slashes stripped)
            platform: Target OS identifier of the controlled machine
            action_space: Action format used by the environment (pyautogui strings)
            observation_type: Kind of observation forwarded to the server
            max_trajectory_length: Trajectory-length bound (stored; not enforced here)
            client_password: Password for the controlled client, if any
            provider_name: Provider hosting the environment VM
            screen_width: Screen width of the controlled machine, in pixels
            screen_height: Screen height of the controlled machine, in pixels
            timeout: HTTP timeout in seconds applied to every request to the server
        """
        self.env = env
        self.server_url = server_url.rstrip("/")
        self.platform = platform
        self.action_space = action_space
        self.observation_type = observation_type
        self.max_trajectory_length = max_trajectory_length
        self.client_password = client_password
        self.provider_name = provider_name
        self.screen_width = screen_width
        self.screen_height = screen_height

        # Session management: created lazily in predict(), cleared in reset().
        self.session_id: Optional[str] = None
        self.instruction: Optional[str] = None

        # HTTP client shared by all requests; released via close().
        self.client = httpx.Client(timeout=timeout)

        # Tracking lists. NOTE(review): never appended to within this class —
        # the server manages the conversation history (see predict()).
        self.thoughts = []
        self.actions = []
        self.observations = []

        logger.info(f"Initialized AGIAgent with server URL: {self.server_url}")

    def reset(self, runtime_logger=None):
        """Reset the agent's local state.

        Session creation is deferred to the first `predict()` call; this only
        clears local bookkeeping and swaps in the per-run logger.

        Args:
            runtime_logger: Optional logger for runtime information
        """
        # Rebind the module-level logger so that all methods in this module
        # log to the per-run logger when one is provided.
        global logger
        logger = runtime_logger if runtime_logger is not None else logging.getLogger("desktopenv.agent")

        # Clear local state
        self.thoughts = []
        self.actions = []
        self.observations = []
        self.session_id = None

        logger.info("AGIAgent reset complete")

    def _create_session(self, instruction: str) -> str:
        """Create a new session on the server.

        Args:
            instruction: The task instruction

        Returns:
            The session ID

        Raises:
            Exception: Re-raises any HTTP/transport error after logging it.

        Equivalent curl request:
        curl -X POST {server_url}/sessions \
            -H "Content-Type: application/json" \
            -d '{"task_description": "{instruction}"}'
        """
        try:
            # print(f"Creating session with instruction: {instruction}")
            # print(f"Server URL: {self.server_url}")
            response = self.client.post(
                f"{self.server_url}/sessions",
                json={"task_description": instruction}
            )
            response.raise_for_status()
            session_id = response.json()["session_id"]
            logger.info(f"Created session: {session_id}")
            return session_id
        except Exception as e:
            logger.error(f"Failed to create session: {e}")
            raise

    def predict(self, instruction: str, obs: Dict) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
        """Predict the next action based on the current observation.

        Args:
            instruction: The task instruction
            obs: Observation dictionary containing 'screenshot' key with image bytes

        Returns:
            Tuple of (predict_info dict, list of action dicts)

        Raises:
            Exception: Re-raises any error from the server call after logging it.
        """
        # Create session on first prediction
        if self.session_id is None:
            self.instruction = instruction
            self.session_id = self._create_session(instruction)

        # input("Session created, press Enter to continue")

        # Encode screenshot to base64 (server expects a PNG payload field).
        screenshot_bytes = obs["screenshot"]
        screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")

        # Call the server; time the round trip for the usage report below.
        with Timer() as model_timer:
            try:
                response = self.client.post(
                    f"{self.server_url}/sessions/{self.session_id}/step",
                    json={
                        "screenshot_base64_png": screenshot_b64,
                        "error": None  # Could be populated from previous step errors
                    }
                )
                response.raise_for_status()
                result = response.json()
                parsed_action = result["parsed_response"]

                logger.info(f"Server returned action: {parsed_action[:100]}...")

            except Exception as e:
                logger.error(f"Error calling server: {e}")
                raise

        # Format response as expected by lib_run_single
        actions = [{
            "action_space": "pyautogui",
            "action": parsed_action,
            "pending_checks": [],
            "call_id": ""
        }]

        # state_correct is False when the server signals terminal FAIL/DONE.
        state_correct = parsed_action not in ["FAIL", "DONE"]

        predict_info = {
            "model_usage": {
                "model_time": model_timer.duration,
                "prompt_tokens": 0,  # Server doesn't expose these
                "completion_tokens": 0,
            },
            "messages": [],  # Server manages conversation history
            "response": parsed_action,
            "state_correct": state_correct,
        }

        return predict_info, actions

    def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict, Dict]:
        """Execute an action in the environment.

        Args:
            action: Action dictionary with 'action' key containing PyAutoGUI command

        Returns:
            Tuple of (observation, reward, done, info, step_info)

        Raises:
            Exception: Re-raises any error from the environment step after logging.
        """
        try:
            # An empty/falsy action ends the episode without touching the env.
            if not action:
                logger.warning("Empty action received, terminating episode")
                # Get observation without executing action
                obs = self.env._get_obs()
                return obs, 0.0, True, {}, {"step_time": 0.0, "action": action}

            action_str = action.get("action", "")
            logger.info(f"Executing action: {action_str[:100]}...")

            with Timer() as step_timer:
                # Execute the action directly (it's already a PyAutoGUI command string)
                obs, reward, terminated, info = self.env.step(action_str)

            logger.debug(f"Action completed in {step_timer.duration:.2f}s")
            if terminated:
                logger.info("Environment signaled termination")

            return obs, reward, terminated, info, {
                "step_time": step_timer.duration,
                "action": action
            }

        except Exception as e:
            logger.exception(f"Environment step failed: {str(e)}")
            raise

    def close(self):
        """Close the HTTP client."""
        self.client.close()
|
||||
|
|
@ -17,7 +17,7 @@ from anthropic.types.beta import (
|
|||
BetaMessageParam,
|
||||
BetaTextBlockParam,
|
||||
)
|
||||
from .utils import COMPUTER_USE_BETA_FLAG, PROMPT_CACHING_BETA_FLAG,SYSTEM_PROMPT, SYSTEM_PROMPT_WINDOWS, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME, get_model_name
|
||||
from .utils import COMPUTER_USE_BETA_FLAG, PROMPT_CACHING_BETA_FLAG,SYSTEM_PROMPT, SYSTEM_PROMPT_WINDOWS, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME
|
||||
from .utils import _response_to_params, _inject_prompt_caching, _maybe_filter_to_n_most_recent_images
|
||||
|
||||
import logging
|
||||
|
|
@ -30,18 +30,14 @@ API_RETRY_INTERVAL = 5
|
|||
class AnthropicAgent:
|
||||
def __init__(self,
|
||||
platform: str = "Ubuntu",
|
||||
model: str = "claude-sonnet-4-5-20250929",
|
||||
provider: APIProvider = APIProvider.ANTHROPIC,
|
||||
model: str = "claude-3-5-sonnet-20241022",
|
||||
provider: APIProvider = APIProvider.BEDROCK,
|
||||
max_tokens: int = 4096,
|
||||
api_key: str = os.environ.get("ANTHROPIC_API_KEY", None),
|
||||
system_prompt_suffix: str = "",
|
||||
only_n_most_recent_images: Optional[int] = 10,
|
||||
action_space: str = "claude_computer_use",
|
||||
screen_size: tuple[int, int] = (1920, 1080),
|
||||
no_thinking: bool = False,
|
||||
use_isp: bool = False,
|
||||
temperature: Optional[float] = None,
|
||||
top_p: Optional[float] = None,
|
||||
*args, **kwargs
|
||||
):
|
||||
self.platform = platform
|
||||
|
|
@ -56,24 +52,10 @@ class AnthropicAgent:
|
|||
self.only_n_most_recent_images = only_n_most_recent_images
|
||||
self.messages: list[BetaMessageParam] = []
|
||||
self.screen_size = screen_size
|
||||
self.no_thinking = no_thinking
|
||||
self.use_isp = use_isp
|
||||
self.temperature = temperature
|
||||
self.top_p = top_p
|
||||
|
||||
self.resize_factor = (
|
||||
screen_size[0] / 1280, # Assuming 1280 is the base width
|
||||
screen_size[1] / 720 # Assuming 720 is the base height
|
||||
)
|
||||
|
||||
def _get_sampling_params(self):
|
||||
"""Get sampling parameters (temperature and/or top_p) - let API validate exclusivity"""
|
||||
params = {}
|
||||
if self.temperature is not None:
|
||||
params['temperature'] = self.temperature
|
||||
if self.top_p is not None:
|
||||
params['top_p'] = self.top_p
|
||||
return params
|
||||
|
||||
def add_tool_result(self, tool_call_id: str, result: str, screenshot: bytes = None):
|
||||
"""Add tool result to message history"""
|
||||
|
|
@ -102,21 +84,6 @@ class AnthropicAgent:
|
|||
"content": tool_result_content
|
||||
})
|
||||
|
||||
def _extract_raw_response_string(self, response) -> str:
|
||||
"""Extract and concatenate raw response content into a single string."""
|
||||
raw_response_str = ""
|
||||
if response.content:
|
||||
for block in response.content:
|
||||
if hasattr(block, 'text') and block.text:
|
||||
raw_response_str += f"[TEXT] {block.text}\n"
|
||||
elif hasattr(block, 'thinking') and block.thinking:
|
||||
raw_response_str += f"[THINKING] {block.thinking}\n"
|
||||
elif hasattr(block, 'name') and hasattr(block, 'input'):
|
||||
raw_response_str += f"[TOOL_USE] {block.name}: {block.input}\n"
|
||||
else:
|
||||
raw_response_str += f"[OTHER] {str(block)}\n"
|
||||
return raw_response_str.strip()
|
||||
|
||||
def parse_actions_from_tool_call(self, tool_call: Dict) -> str:
|
||||
result = ""
|
||||
function_args = (
|
||||
|
|
@ -227,23 +194,13 @@ class AnthropicAgent:
|
|||
result += (f"pyautogui.keyUp('{key}')\n")
|
||||
expected_outcome = f"Key {key} pressed."
|
||||
elif action == "type":
|
||||
for char in text:
|
||||
if char == '\n':
|
||||
result += "pyautogui.press('enter')\n"
|
||||
elif char == "'":
|
||||
result += 'pyautogui.press("\'")\n'
|
||||
elif char == '\\':
|
||||
result += "pyautogui.press('\\\\')\n"
|
||||
elif char == '"':
|
||||
result += "pyautogui.press('\"')\n"
|
||||
else:
|
||||
result += f"pyautogui.press('{char}')\n"
|
||||
result += (
|
||||
f"pyautogui.typewrite(\"\"\"{text}\"\"\", interval=0.01)\n"
|
||||
)
|
||||
expected_outcome = f"Text {text} written."
|
||||
|
||||
# Handle scroll actions
|
||||
elif action == "scroll":
|
||||
if text is not None:
|
||||
result += (f"pyautogui.keyDown('{text.lower()}')\n")
|
||||
if coordinate is None:
|
||||
if scroll_direction in ("up", "down"):
|
||||
result += (
|
||||
|
|
@ -264,8 +221,6 @@ class AnthropicAgent:
|
|||
result += (
|
||||
f"pyautogui.hscroll({scroll_amount if scroll_direction == 'right' else -scroll_amount}, {x}, {y})\n"
|
||||
)
|
||||
if text is not None:
|
||||
result += (f"pyautogui.keyUp('{text.lower()}')\n")
|
||||
expected_outcome = "Scroll action finished"
|
||||
|
||||
# Handle click actions
|
||||
|
|
@ -330,7 +285,7 @@ class AnthropicAgent:
|
|||
expected_outcome = "Call user"
|
||||
elif action == "screenshot":
|
||||
result += "pyautogui.sleep(0.1)\n"
|
||||
expected_outcome = "Screenshot taken"
|
||||
expected_outcome = "Screenshot taken"
|
||||
else:
|
||||
raise ValueError(f"Invalid action: {action}")
|
||||
|
||||
|
|
@ -348,9 +303,6 @@ class AnthropicAgent:
|
|||
screenshot_bytes = obs["screenshot"]
|
||||
screenshot_image = Image.open(io.BytesIO(screenshot_bytes))
|
||||
|
||||
# Store original unresized screenshot for zoom processing
|
||||
obs["screenshot_original"] = screenshot_bytes
|
||||
|
||||
# Calculate new size based on resize factor
|
||||
new_width, new_height = 1280, 720
|
||||
|
||||
|
|
@ -382,45 +334,23 @@ class AnthropicAgent:
|
|||
]
|
||||
})
|
||||
|
||||
# Add tool_result for ALL tool_use blocks in the last message
|
||||
if self.messages:
|
||||
last_message_content = self.messages[-1]["content"]
|
||||
tool_use_blocks = [block for block in last_message_content if block.get("type") == "tool_use"]
|
||||
|
||||
for i, tool_block in enumerate(tool_use_blocks):
|
||||
tool_input = tool_block.get("input", {})
|
||||
action = tool_input.get("action")
|
||||
is_last_tool = i == len(tool_use_blocks) - 1
|
||||
|
||||
include_screenshot = None
|
||||
|
||||
if obs:
|
||||
if action == "screenshot":
|
||||
# Screenshot action always gets regular screenshot
|
||||
include_screenshot = obs.get("screenshot")
|
||||
elif is_last_tool:
|
||||
# Auto-screenshot: last tool gets regular screenshot (unless it's zoom, handled above)
|
||||
include_screenshot = obs.get("screenshot")
|
||||
|
||||
self.add_tool_result(
|
||||
tool_block["id"],
|
||||
f"Success",
|
||||
screenshot=include_screenshot
|
||||
)
|
||||
if self.messages and "tool_use" in [content_block["type"] for content_block in self.messages[-1]["content"]]:
|
||||
self.add_tool_result(
|
||||
self.messages[-1]["content"][-1]["id"],
|
||||
f"Success",
|
||||
screenshot=obs.get("screenshot") if obs else None
|
||||
)
|
||||
|
||||
enable_prompt_caching = False
|
||||
betas = [COMPUTER_USE_BETA_FLAG]
|
||||
|
||||
# Add interleaved thinking beta if ISP is requested
|
||||
if self.use_isp:
|
||||
betas.append("interleaved-thinking-2025-05-14")
|
||||
logger.info(f"Added interleaved thinking beta. Betas: {betas}")
|
||||
betas = ["computer-use-2025-01-24"]
|
||||
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
|
||||
betas = ["computer-use-2025-01-24"]
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
betas = [COMPUTER_USE_BETA_FLAG]
|
||||
|
||||
image_truncation_threshold = 10
|
||||
if self.provider == APIProvider.ANTHROPIC:
|
||||
client = Anthropic(api_key=self.api_key, max_retries=4).with_options(
|
||||
default_headers={"anthropic-beta": COMPUTER_USE_BETA_FLAG}
|
||||
)
|
||||
client = Anthropic(api_key=self.api_key, max_retries=4)
|
||||
enable_prompt_caching = True
|
||||
elif self.provider == APIProvider.VERTEX:
|
||||
client = AnthropicVertex()
|
||||
|
|
@ -438,7 +368,7 @@ class AnthropicAgent:
|
|||
if enable_prompt_caching:
|
||||
betas.append(PROMPT_CACHING_BETA_FLAG)
|
||||
_inject_prompt_caching(self.messages)
|
||||
image_truncation_threshold = 20
|
||||
image_truncation_threshold = 50
|
||||
system["cache_control"] = {"type": "ephemeral"}
|
||||
|
||||
if self.only_n_most_recent_images:
|
||||
|
|
@ -448,65 +378,49 @@ class AnthropicAgent:
|
|||
min_removal_threshold=image_truncation_threshold,
|
||||
)
|
||||
|
||||
# Configure tool settings - use modern computer tool for all models
|
||||
tool_config = {
|
||||
'name': 'computer',
|
||||
'type': 'computer_20250124',
|
||||
'display_width_px': 1280,
|
||||
'display_height_px': 720,
|
||||
'display_number': 1
|
||||
}
|
||||
|
||||
tools = [
|
||||
tool_config,
|
||||
] if self.platform == 'Ubuntu' else [
|
||||
tool_config,
|
||||
]
|
||||
|
||||
# Configure thinking mode based on user preferences
|
||||
if self.no_thinking:
|
||||
# Disable thinking mode - omit the thinking parameter
|
||||
extra_body = {}
|
||||
actual_max_tokens = self.max_tokens # Use default when no thinking
|
||||
logger.info("Thinking mode: DISABLED")
|
||||
else:
|
||||
# Enable thinking mode (regular or interleaved)
|
||||
# Use consistent 2048 budget for both regular and ISP thinking
|
||||
budget_tokens = 2048
|
||||
|
||||
# For regular thinking: max_tokens > budget_tokens (API requirement)
|
||||
# For ISP: budget_tokens can exceed max_tokens (represents total across all thinking blocks)
|
||||
if self.max_tokens <= budget_tokens:
|
||||
required_max_tokens = budget_tokens + 500 # Give some headroom
|
||||
logger.warning(f"Regular thinking requires max_tokens > budget_tokens. Increasing max_tokens from {self.max_tokens} to {required_max_tokens}")
|
||||
actual_max_tokens = required_max_tokens
|
||||
else:
|
||||
actual_max_tokens = self.max_tokens
|
||||
|
||||
extra_body = {
|
||||
"thinking": {"type": "enabled", "budget_tokens": budget_tokens}
|
||||
}
|
||||
if self.use_isp:
|
||||
logger.info("Thinking mode: INTERLEAVED SCRATCHPAD (ISP)")
|
||||
else:
|
||||
logger.info("Thinking mode: REGULAR SCRATCHPAD")
|
||||
|
||||
try:
|
||||
if self.model_name == "claude-3-5-sonnet-20241022":
|
||||
tools = [
|
||||
{'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
# {'type': 'bash_20241022', 'name': 'bash'},
|
||||
# {'name': 'str_replace_editor', 'type': 'text_editor_20241022'}
|
||||
] if self.platform == 'Ubuntu' else [
|
||||
{'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
]
|
||||
elif self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
tools = [
|
||||
{'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
# {'type': 'bash_20250124', 'name': 'bash'},
|
||||
# {'name': 'str_replace_editor', 'type': 'text_editor_20250124'}
|
||||
] if self.platform == 'Ubuntu' else [
|
||||
{'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
]
|
||||
extra_body = {
|
||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||
}
|
||||
response = None
|
||||
|
||||
for attempt in range(API_RETRY_TIMES):
|
||||
try:
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=actual_max_tokens,
|
||||
messages=self.messages,
|
||||
model=get_model_name(self.provider, self.model_name),
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body,
|
||||
**self._get_sampling_params()
|
||||
)
|
||||
|
||||
if self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body
|
||||
)
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
)
|
||||
logger.info(f"Response: {response}")
|
||||
break
|
||||
except (APIError, APIStatusError, APIResponseValidationError) as e:
|
||||
|
|
@ -536,20 +450,26 @@ class AnthropicAgent:
|
|||
try:
|
||||
logger.warning("Retrying with backup API key...")
|
||||
|
||||
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4).with_options(
|
||||
default_headers={"anthropic-beta": COMPUTER_USE_BETA_FLAG}
|
||||
)
|
||||
response = backup_client.beta.messages.create(
|
||||
max_tokens=actual_max_tokens,
|
||||
messages=self.messages,
|
||||
model=get_model_name(self.provider, self.model_name),
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body,
|
||||
**self._get_sampling_params()
|
||||
)
|
||||
|
||||
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
|
||||
if self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
response = backup_client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body
|
||||
)
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
response = backup_client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
)
|
||||
logger.info("Successfully used backup API key")
|
||||
except Exception as backup_e:
|
||||
backup_error_msg = str(backup_e)
|
||||
|
|
@ -577,16 +497,9 @@ class AnthropicAgent:
|
|||
logger.exception(f"Error in Anthropic API: {str(e)}")
|
||||
return None, None
|
||||
|
||||
if response is None:
|
||||
logger.error("Response is None after API call - this should not happen")
|
||||
return None, None
|
||||
|
||||
response_params = _response_to_params(response)
|
||||
logger.info(f"Received response params: {response_params}")
|
||||
|
||||
# Convert raw response to concatenated string for trajectory logging
|
||||
raw_response_str = self._extract_raw_response_string(response)
|
||||
|
||||
# Store response in message history
|
||||
self.messages.append({
|
||||
"role": "assistant",
|
||||
|
|
@ -605,8 +518,7 @@ class AnthropicAgent:
|
|||
"input": cast(dict[str, Any], content_block["input"]),
|
||||
"id": content_block["id"],
|
||||
"action_type": content_block.get("type"),
|
||||
"command": self.parse_actions_from_tool_call(content_block),
|
||||
"raw_response": raw_response_str # Add raw response to each action
|
||||
"command": self.parse_actions_from_tool_call(content_block)
|
||||
})
|
||||
elif content_block["type"] == "text":
|
||||
reasonings.append(content_block["text"])
|
||||
|
|
@ -614,23 +526,10 @@ class AnthropicAgent:
|
|||
reasonings = reasonings[0]
|
||||
else:
|
||||
reasonings = ""
|
||||
|
||||
# Check if the model indicated the task is infeasible
|
||||
if raw_response_str and "[INFEASIBLE]" in raw_response_str:
|
||||
logger.info("Detected [INFEASIBLE] pattern in response, triggering FAIL action")
|
||||
# Override actions with FAIL
|
||||
actions = [{
|
||||
"action_type": "FAIL",
|
||||
"raw_response": raw_response_str
|
||||
}]
|
||||
|
||||
logger.info(f"Received actions: {actions}")
|
||||
logger.info(f"Received reasonings: {reasonings}")
|
||||
if len(actions) == 0:
|
||||
actions = [{
|
||||
"action_type": "DONE",
|
||||
"raw_response": raw_response_str
|
||||
}]
|
||||
actions = ["DONE"]
|
||||
return reasonings, actions
|
||||
except Exception as e:
|
||||
logger.warning(f"parse_actions_from_tool_call parsing failed (attempt {parse_retry+1}/3), will retry API request: {e}")
|
||||
|
|
@ -640,17 +539,25 @@ class AnthropicAgent:
|
|||
response = None
|
||||
for attempt in range(API_RETRY_TIMES):
|
||||
try:
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=actual_max_tokens,
|
||||
messages=self.messages,
|
||||
model=get_model_name(self.provider, self.model_name),
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body,
|
||||
**self._get_sampling_params()
|
||||
)
|
||||
|
||||
if self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body
|
||||
)
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
)
|
||||
logger.info(f"Response: {response}")
|
||||
break # Success, exit retry loop
|
||||
except (APIError, APIStatusError, APIResponseValidationError) as e2:
|
||||
|
|
@ -662,20 +569,13 @@ class AnthropicAgent:
|
|||
raise
|
||||
response_params = _response_to_params(response)
|
||||
logger.info(f"Received response params: {response_params}")
|
||||
|
||||
# Update raw response string for retry case (will be used in next loop iteration)
|
||||
raw_response_str = self._extract_raw_response_string(response)
|
||||
|
||||
self.messages.append({
|
||||
"role": "assistant",
|
||||
"content": response_params
|
||||
})
|
||||
if parse_retry == max_parse_retry - 1:
|
||||
logger.error(f"parse_actions_from_tool_call parsing failed 3 times consecutively, terminating: {e}")
|
||||
actions = [{
|
||||
"action_type": "FAIL",
|
||||
"raw_response": f"Failed to parse actions from tool call after {max_parse_retry} attempts: {e}"
|
||||
}]
|
||||
actions = ["FAIL"]
|
||||
return reasonings, actions
|
||||
def reset(self, _logger = None, *args, **kwargs):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ from datetime import datetime
|
|||
from .tools import ToolResult
|
||||
|
||||
|
||||
COMPUTER_USE_BETA_FLAG = "computer-use-2025-01-24"
|
||||
COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22"
|
||||
PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31"
|
||||
|
||||
|
||||
|
|
@ -47,25 +47,12 @@ PROVIDER_TO_DEFAULT_MODEL_NAME: dict[(APIProvider, str), str] = {
|
|||
(APIProvider.ANTHROPIC, "claude-4-opus-20250514"): "claude-4-opus-20250514",
|
||||
(APIProvider.BEDROCK, "claude-4-opus-20250514"): "us.anthropic.claude-opus-4-20250514-v1:0",
|
||||
(APIProvider.VERTEX, "claude-4-opus-20250514"): "claude-4-opus-v1@20250514",
|
||||
# Add mapping for the alternative model name format
|
||||
(APIProvider.ANTHROPIC, "claude-opus-4-20250514"): "claude-opus-4-20250514",
|
||||
(APIProvider.ANTHROPIC, "claude-opus-4-1-20250805"): "claude-opus-4-1-20250805",
|
||||
(APIProvider.ANTHROPIC, "claude-4-sonnet-20250514"): "claude-4-sonnet-20250514",
|
||||
(APIProvider.ANTHROPIC, "claude-sonnet-4-20250514"): "claude-sonnet-4-20250514",
|
||||
(APIProvider.BEDROCK, "claude-4-sonnet-20250514"): "us.anthropic.claude-sonnet-4-20250514-v1:0",
|
||||
(APIProvider.VERTEX, "claude-4-sonnet-20250514"): "claude-sonnet-4-v1@20250514",
|
||||
}
|
||||
|
||||
|
||||
def get_model_name(provider: APIProvider, model_name: str) -> str:
    """Resolve the model identifier to send in API requests.

    For direct API usage the configured model name is passed through
    unchanged; the ``provider`` argument is accepted for interface
    compatibility but not consulted.
    """
    return model_name
|
||||
|
||||
|
||||
# This system prompt is optimized for the Docker environment in this repository and
|
||||
# specific tool combinations enabled.
|
||||
# We encourage modifying this system prompt to ensure the model has context for the
|
||||
|
|
@ -80,15 +67,8 @@ SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
|||
* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
|
||||
* DO NOT ask users for clarification during task execution. DO NOT stop to request more information from users. Always take action using available tools.
|
||||
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
|
||||
* TASK FEASIBILITY: You can declare a task infeasible at any point during execution - whether at the beginning after taking a screenshot, or later after attempting some actions and discovering barriers. Carefully evaluate whether the task is feasible given the current system state, available applications, and task requirements. If you determine that a task cannot be completed due to:
|
||||
- Missing required applications or dependencies that cannot be installed
|
||||
- Insufficient permissions or system limitations
|
||||
- Contradictory or impossible requirements
|
||||
- Any other fundamental barriers that make completion impossible
|
||||
Then you MUST output exactly "[INFEASIBLE]" (including the square brackets) anywhere in your response to trigger the fail action. The system will automatically detect this pattern and terminate the task appropriately.
|
||||
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
||||
* Home directory of this Ubuntu system is '/home/user'.
|
||||
* If you need a password for sudo, the password of the computer is 'osworld-public-evaluation'.
|
||||
</SYSTEM_CAPABILITY>
|
||||
|
||||
<IMPORTANT>
|
||||
|
|
@ -102,7 +82,6 @@ SYSTEM_PROMPT_WINDOWS = f"""<SYSTEM_CAPABILITY>
|
|||
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
||||
* Home directory of this Windows system is 'C:\\Users\\user'.
|
||||
* When you want to open some applications on Windows, please use Double Click on it instead of clicking once.
|
||||
* If you need a password for sudo, The password of the computer is 'osworld-public-evaluation'.
|
||||
</SYSTEM_CAPABILITY>"""
|
||||
|
||||
|
||||
|
|
@ -175,30 +154,21 @@ def _inject_prompt_caching(
|
|||
one cache breakpoint is left for tools/system prompt, to be shared across sessions
|
||||
"""
|
||||
|
||||
breakpoints_remaining = 2 # Use full budget for recent messages
|
||||
messages_processed = 0
|
||||
|
||||
breakpoints_remaining = 3
|
||||
for message in reversed(messages):
|
||||
if message["role"] == "user" and isinstance(
|
||||
content := message["content"], list
|
||||
):
|
||||
messages_processed += 1
|
||||
# Check if this message would fit within the remaining budget
|
||||
if breakpoints_remaining >= len(content):
|
||||
# We have enough budget, spend it and add cache_control
|
||||
breakpoints_remaining -= len(content)
|
||||
if breakpoints_remaining:
|
||||
breakpoints_remaining -= 1
|
||||
# Use type ignore to bypass TypedDict check until SDK types are updated
|
||||
content[-1]["cache_control"] = BetaCacheControlEphemeralParam( # type: ignore
|
||||
{"type": "ephemeral"}
|
||||
)
|
||||
else:
|
||||
# Check if this is the first message (contains image + text with task description)
|
||||
is_first_message = messages_processed == len([msg for msg in messages if msg["role"] == "user"])
|
||||
|
||||
if not is_first_message:
|
||||
# Not enough budget, remove any existing cache_control from this message
|
||||
content[-1].pop("cache_control", None)
|
||||
# Continue to clean up older messages that might have cache_control from previous turns
|
||||
content[-1].pop("cache_control", None)
|
||||
# we'll only every have one extra turn per loop
|
||||
break
|
||||
|
||||
|
||||
def _maybe_filter_to_n_most_recent_images(
|
||||
|
|
@ -250,105 +220,6 @@ def _maybe_filter_to_n_most_recent_images(
|
|||
tool_result["content"] = new_content
|
||||
|
||||
|
||||
def validate_model_support(model_name: str, api_key: str = None, temperature: float = None, top_p: float = None, no_thinking: bool = False, use_isp: bool = False) -> bool:
    """
    Validate model support with the same API call pattern as the main agent.

    Sends a minimal beta messages request (tiny prompt, computer-use tool
    attached) and reports whether it succeeds. Performs network I/O and may
    sleep between retries; progress is reported via print().

    Args:
        model_name: The model name to validate
        api_key: Optional API key, defaults to ANTHROPIC_API_KEY env var
        temperature: Optional temperature parameter for testing
        top_p: Optional top_p parameter for testing
        no_thinking: Disable thinking mode (matches AnthropicAgent)
        use_isp: Use interleaved scratchpad mode (matches AnthropicAgent)

    Returns:
        True if model is supported and API call succeeds, False otherwise
    """
    print(f"🔍 Validating model support: {model_name}")

    try:
        from anthropic import Anthropic
        import os
        import time

        # Same client setup as the main agent (SDK-level max_retries=4, plus
        # the explicit 5-attempt retry loop below).
        client = Anthropic(
            api_key=api_key or os.environ.get("ANTHROPIC_API_KEY"),
            max_retries=4
        ).with_options(default_headers={"anthropic-beta": COMPUTER_USE_BETA_FLAG})

        # Same message format as main agent - always use structured format with cache_control
        messages = [{"role": "user", "content": [{"type": "text", "text": "Respond with 'OK'", "cache_control": {"type": "ephemeral"}}]}]

        # Same betas configuration as main agent
        betas = [COMPUTER_USE_BETA_FLAG]
        if use_isp:
            betas.append("interleaved-thinking-2025-05-14")

        system = [{"type": "text", "text": "You are Claude. Respond with 'OK'."}]

        # Same tools configuration as main agent - use modern computer tool for all models
        tools = [{"name": "computer", "type": "computer_20250124",
                  "display_width_px": 1280, "display_height_px": 720, "display_number": 1}]

        # Same thinking configuration as main agent
        max_tokens = 50  # Base validation max_tokens
        if no_thinking:
            extra_body = {}
            actual_max_tokens = max_tokens
        else:
            budget_tokens = 2048
            # Same logic as main agent: the API requires max_tokens to exceed
            # the thinking budget, so bump it with some headroom if needed.
            if max_tokens <= budget_tokens:
                actual_max_tokens = budget_tokens + 500
            else:
                actual_max_tokens = max_tokens
            extra_body = {
                "thinking": {"type": "enabled", "budget_tokens": budget_tokens}
            }

        # Sampling parameters (same logic as main agent): only pass the knobs
        # that were explicitly supplied.
        sampling_params = {}
        if temperature is not None:
            sampling_params['temperature'] = temperature
        if top_p is not None:
            sampling_params['top_p'] = top_p

        # Retry logic with 5 attempts, 5 second delays
        for attempt in range(5):
            try:
                # Same API call pattern as main agent; the response body is
                # not inspected — reaching here without an exception suffices.
                client.beta.messages.create(
                    max_tokens=actual_max_tokens,
                    messages=messages,
                    model=get_model_name(APIProvider.ANTHROPIC, model_name),
                    system=system,
                    tools=tools,
                    betas=betas,
                    extra_body=extra_body,
                    **sampling_params
                )

                print(f"✅ Model {model_name} validated successfully")
                return True
            except Exception as e:
                if attempt < 4:  # Don't sleep/retry after the final attempt
                    print(f"🔄 Validation attempt {attempt + 1}/5 failed: {e}")
                    print(f"⏳ Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    print(f"❌ All validation attempts failed. Final error: {e}")

        # All five attempts raised.
        return False

    except ValueError:
        return False
    except Exception as e:
        print(f"❌ API validation setup failed: {e}")
        return False
|
||||
|
||||
|
||||
def _response_to_params(
|
||||
response: BetaMessage,
|
||||
) -> list[BetaContentBlockParam]:
|
||||
|
|
|
|||
|
|
@ -1,7 +0,0 @@
|
|||
"""
|
||||
AutoGLM agent implementation
|
||||
"""
|
||||
|
||||
from .main import AutoGLMAgent
|
||||
|
||||
__all__ = ["AutoGLMAgent"]
|
||||
|
|
@ -1,265 +0,0 @@
|
|||
import logging
|
||||
import re
|
||||
from base64 import b64encode
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from typing import Dict, List
|
||||
|
||||
from .prompt.accessibility_tree_handle import linearize_accessibility_tree, trim_accessibility_tree
|
||||
from .prompt.grounding_agent import GroundingAgent as Agent
|
||||
from .tools.package.google_chrome import BrowserTools
|
||||
from .prompt.procedural_memory import Prompt
|
||||
|
||||
# Shared agent logger; rebound by AutoGLMAgent.reset() when a per-run logger
# is supplied.
logger = logging.getLogger("desktopenv.agent")

# Observation types that consist of text only (no screenshot payload).
pure_text_settings = ["a11y_tree"]
|
||||
|
||||
def resize_image(image, w, h):
    """Decode raw image bytes, resize to exactly (w, h), and re-encode as PNG.

    Args:
        image: raw encoded image bytes (any format Pillow can open).
        w: target width in pixels.
        h: target height in pixels.

    Returns:
        The resized image re-encoded as PNG bytes.
    """
    resized = Image.open(BytesIO(image)).resize((w, h))
    out = BytesIO()
    # keep PNG so downstream base64 data-URLs ("data:image/png;...") stay valid
    resized.save(out, format='PNG')
    return out.getvalue()
|
||||
|
||||
def parse_code_from_string(input_string):
    """Extract executable snippets or control commands from a model response.

    Returns a list of strings: either the bare control command ("WAIT",
    "DONE", "FAIL") when the whole response is one, or the contents of each
    ```-fenced code block.  A fenced block whose final line is a control
    command is split into the code part and the command.
    """
    commands = ["WAIT", "DONE", "FAIL"]  # fixme: updates this part when we have more commands

    whole = input_string.strip()
    if whole in ["WAIT", "DONE", "FAIL"]:
        return [whole]

    # Matches both ```code``` and ```python code```, capturing the code part
    # non-greedily; DOTALL lets snippets span multiple lines.
    fenced = re.findall(r"```(?:\w+\s+)?(.*?)```", input_string, re.DOTALL)

    codes = []
    for snippet in fenced:
        snippet = snippet.strip()
        if snippet in commands:
            codes.append(snippet)
            continue
        lines = snippet.split("\n")
        if lines[-1] in commands:
            # code followed by a trailing control command: emit both
            if len(lines) > 1:
                codes.append("\n".join(lines[:-1]))
            codes.append(lines[-1])
        else:
            codes.append(snippet)
    return codes
|
||||
|
||||
|
||||
class AutoGLMAgent:
    """Drives a desktop environment with the AutoGLM computer-use action space.

    Per step the agent (1) builds chat messages from the current observation
    (open windows, current app, optional a11y tree and screenshot), (2) calls
    ``gen_func`` to obtain a model response, and (3) grounds the response into
    executable commands via the GroundingAgent / per-app tool modules.
    """

    def __init__(
        self,
        action_space="autoglm_computer_use",
        observation_type="a11y_tree",
        max_trajectory_length=3,
        a11y_tree_max_items=300,
        with_image: bool = True,
        screen_size = (1920, 1080),
        image_size=(1920, 1080),
        with_atree: bool = False,
        glm41v_format: bool = True,
        relative_coordinate: bool = True,
        client_password="password",
        gen_func=None,
        tool_in_sys_msg: bool = True,
    ):
        """Configure the agent.

        Args:
            action_space: only "autoglm_computer_use" is supported.
            observation_type: only "a11y_tree" is supported.
            max_trajectory_length: stored but unused here; history length is
                governed by format_history(max_turns=...) instead.
            a11y_tree_max_items: stored but unused here; prepare() trims the
                tree with a hard-coded limit of 300.
            with_image: attach a screenshot to each user message.
            screen_size: physical screen resolution.
            image_size: resolution the screenshot is resized to before upload.
            with_atree: include the linearized a11y tree in the prompt.
            glm41v_format: forwarded to the prompt builder.
            relative_coordinate: True when the model emits 0-1000 coordinates.
            client_password: forwarded to the prompt builder.
            gen_func: callable(messages) -> response str; must be set before
                predict() is called.
            tool_in_sys_msg: put tool definitions in the system message rather
                than in each per-turn user message.
        """
        self.action_space = action_space
        self.observation_type = observation_type
        assert action_space in ["autoglm_computer_use"], "Invalid action space"
        assert observation_type in ["a11y_tree"], "Invalid observation type"
        self.max_trajectory_length = max_trajectory_length
        self.a11y_tree_max_items = a11y_tree_max_items
        self.with_image = with_image
        self.screen_size = screen_size
        self.image_size = image_size
        self.with_atree = with_atree
        self.glm41v_format = glm41v_format
        self.relative_coordinate = relative_coordinate
        self.client_password = client_password
        self.gen_func = gen_func
        self.tool_in_sys_msg = tool_in_sys_msg

        # app identifier -> tool class name used when grounding tool calls
        self.tool_list = {
            "libreoffice_calc": "CalcTools",
            "libreoffice_impress": "ImpressTools",
            "libreoffice_writer": "WriterTools",
            "code": "CodeTools",
            "vlc": "VLCTools",
            "google_chrome": "BrowserTools",
        }

        # NOTE(review): class-level switch on the grounding agent — every
        # AutoGLMAgent instance shares the last value assigned here.
        Agent.relative_coordinate = relative_coordinate

        # per-step records (instruction, response, action, exe_result, obs...)
        self.contents = []

    @property
    def turn_number(self):
        # Number of steps taken so far in the current episode.
        return len(self.contents)

    def prepare(self, instruction: str, obs: Dict, history: List, last_result: str = "") -> List:
        """
        Predict the next action(s) based on the current observation.
        """
        # Back-fill the previous step's execution result before building the
        # new prompt, so format_history() can show it next turn.
        if "exe_result" in obs and not last_result:
            last_result = obs["exe_result"]
        if self.contents:
            self.contents[-1]["exe_result"] = last_result

        cur_app = obs["cur_app"]
        logger.info(f"current app is {cur_app}")

        # Normalize the app name into a tool key; None when no tool matches.
        if cur_app:
            tool_name = cur_app.strip().lower().replace("-", "_")
            tool_name = tool_name if tool_name in self.tool_list.keys() else None
        else:
            tool_name = None

        setup_prompt, func_def_prompt, note_prompt = Prompt.construct_procedural_memory(
            Agent, app_name=tool_name, client_password=self.client_password, with_image=self.with_image, with_atree=self.with_atree, relative_coordinate=self.relative_coordinate, glm41v_format=self.glm41v_format
        )
        # Tool definitions go either in the system message or (below) appended
        # to the per-turn user prompt.
        if self.tool_in_sys_msg:
            system_message = setup_prompt + "\n\n" + func_def_prompt + "\n\n" + note_prompt
        else:
            system_message = setup_prompt + "\n\n" + note_prompt
        system_message += "\n\n**IMPORTANT** You are asked to complete the following task: {}".format(instruction)

        messages = [
            {
                "role": "system",
                "content": system_message,
            }
        ]
        messages.extend(history)

        # Tabulate the open windows for the prompt.
        if obs["apps"]:
            app_str = "Window ID App Name Title\n"
            for window_id, app in obs["apps"].items():
                app_str += f"{window_id} {app['app_name']} {app['title']}\n"
        else:
            app_str = "None"

        # Truncate long execution results to keep the prompt bounded.
        last_result = last_result.strip() if last_result else "None"
        last_result = last_result[:2000] + "..." if len(last_result) > 2000 else last_result

        # NOTE(review): the a11y tree is linearized even when with_atree is
        # False (result unused in that case) — confirm this is intentional.
        tree = linearize_accessibility_tree(obs["accessibility_tree"], "Ubuntu")
        tree = trim_accessibility_tree(tree, 300)

        app_info = obs["app_info"].strip() if obs["app_info"] else "None"
        app_info = app_info[:5000] + "..." if len(app_info) > 5000 else app_info

        prompt = "* Apps: {}\n\n* Current App: {}{}\n\n* App Info: {}\n\n* Previous Action Result: {}".format(
            app_str.strip(),
            obs["cur_window_id"].strip() if obs["cur_window_id"] in app_str else "None",
            '\n\n* A11y Tree: {}'.format(tree.strip()) if self.with_atree else "",
            app_info,
            last_result if last_result else "None",
        ) + (
            "\n\n" + func_def_prompt if not self.tool_in_sys_msg else ""
        )

        # Attach the (resized) screenshot ahead of the text when enabled.
        content = [{"type": "text", "text": prompt}]
        if self.with_image and obs.get('screenshot'):
            screenshot = resize_image(obs['screenshot'], self.image_size[0], self.image_size[1])
            content = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{b64encode(screenshot).decode('utf-8')}",
                        "detail": "high",
                    },
                }
            ] + content

        messages.append({"role": "user", "content": content})

        return messages

    def execute(self, response, obs):
        """Ground a model response into a list of executable commands.

        Returns [] when the response cannot be parsed or grounded.
        """
        try:
            actions = parse_code_from_string(response)
            action = actions[0]
            logger.info(f"The pesudo action is {action}")

            # SECURITY: eval() executes model-generated text verbatim — this
            # trusts the model completely; consider a restricted dispatcher.
            if "Agent." in action:
                actions = [
                    eval(action),
                ]
            elif "BrowserTools." in action:  # TODO: special check for BrowserTools
                actions = [
                    eval(action),
                ]
            else:
                # Anything else is treated as a per-app tool call and wrapped
                # with the matching tool-module import.
                actions = Agent.tool_commands(action, obs["cur_app"].strip().replace("-", "_").lower())
            logger.info(f"The grounded action is {actions[0]}")
        except Exception as e:
            print("Failed to parse action from response", e)
            actions = []

        return actions

    def format_history(self, max_turns=30):
        """Rebuild chat history from self.contents, truncating long fields.

        Returns at most max_turns (user, assistant) message pairs.
        """
        history = []
        for ix in range(self.turn_number):
            # The environment state itself is omitted; only the previous
            # action's result is echoed back to the model.
            if ix == 0:
                env_input = "**Environment State (Omitted)**"
            else:
                env_input = (
                    f"**Environment State (Omitted)**\nPrevious Action Result: {self.contents[ix - 1]['exe_result']}"
                )

            env_input = env_input[:2000] + "..." if len(env_input) > 2000 else env_input
            response = (
                self.contents[ix]["response"][:1500] + "..."
                if len(self.contents[ix]["response"]) > 1500
                else self.contents[ix]["response"]
            )
            history.append({"role": "user", "content": [{"type": "text", "text": env_input}]})
            history.append({"role": "assistant", "content": [{"type": "text", "text": response}]})

        return history[-max_turns * 2:]

    def predict(self, instruction: str, obs: Dict) -> List:
        """Run one agent step.

        NOTE(review): annotated as -> List but actually returns the tuple
        (response, actions).
        """
        history = self.format_history()
        messages = self.prepare(instruction, obs, history)

        assert self.gen_func is not None, "gen_func is not set"
        try:
            response = self.gen_func(messages)
        except Exception as e:
            logger.error("Failed to call gen_func, Error: " + str(e))
            response = ""

        logger.info("RESPONSE: %s", response)

        actions = self.execute(response, obs)

        # update the contents
        self.contents.append(
            {
                "instruction": instruction,
                "index": len(self.contents),
                "response": response,
                "action": "Parse error" if not actions else actions[0],
                "exe_result": "Invalid action" if not actions else "",
                **obs,
            }
        )
        return response, actions

    def reset(self, _logger=None):
        """Clear episode state and rebind the module-level logger."""
        global logger
        logger = _logger if _logger is not None else logging.getLogger("desktopenv.aguvis_agent")

        self.contents = []
|
||||
|
|
@ -1,329 +0,0 @@
|
|||
import io
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import List, Tuple
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from .deduplicate_node import filter_similar_nodes
|
||||
|
||||
# XML namespace URIs used by the AT-SPI accessibility dumps.  Ubuntu and
# Windows dumps use parallel namespaces for attributes, widget state,
# on-screen geometry (component), and editable values.
# NOTE(review): attributes_ns_ubuntu points at the *windows* URI — confirm
# this duplication is intentional and not a copy-paste slip.
attributes_ns_ubuntu = "https://accessibility.windows.example.org/ns/attributes"
attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
state_ns_windows = "https://accessibility.windows.example.org/ns/state"
component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
component_ns_windows = "https://accessibility.windows.example.org/ns/component"
value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
value_ns_windows = "https://accessibility.windows.example.org/ns/value"
class_ns_windows = "https://accessibility.windows.example.org/ns/class"
|
||||
|
||||
|
||||
def find_leaf_nodes(xlm_file_str):
    """Parse an XML string and return its leaf elements in document order.

    Args:
        xlm_file_str: XML document as a string; falsy input yields [].

    Returns:
        List of Element objects that have no children.
    """
    if not xlm_file_str:
        return []

    root = ET.fromstring(xlm_file_str)

    # Iterative depth-first walk; pushing children reversed keeps the
    # left-to-right document order of the original recursion.
    leaves = []
    stack = [root]
    while stack:
        current = stack.pop()
        children = list(current)
        if children:
            stack.extend(reversed(children))
        else:
            leaves.append(current)
    return leaves
|
||||
|
||||
|
||||
def judge_node(node: ET, platform="Ubuntu", check_image=False) -> bool:
    """Return True when an accessibility-tree node is worth keeping.

    A node survives when all of the following hold:
      * its tag is one of the interactive/textual widget kinds listed below;
      * it is visible (Ubuntu additionally requires the "showing" state);
      * it carries a name or text (or an image flag when check_image is set);
      * its on-screen coordinates are non-negative and its size positive.

    Args:
        node: element parsed from the AT-SPI XML dump.
        platform: "Ubuntu" or "Windows"; selects the namespace URIs.
        check_image: also keep otherwise-textless nodes with image="true".

    Raises:
        ValueError: if platform is neither "Ubuntu" nor "Windows".
    """
    import ast  # local import keeps this fix self-contained

    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    # Tag filter: suffix matches catch role families ("push-button", ...);
    # the explicit set catches exact role names.
    keeps: bool = (
        node.tag.startswith("document")
        or node.tag.endswith("item")
        or node.tag.endswith("button")
        or node.tag.endswith("heading")
        or node.tag.endswith("label")
        or node.tag.endswith("scrollbar")
        or node.tag.endswith("searchbox")
        or node.tag.endswith("textbox")
        or node.tag.endswith("link")
        or node.tag.endswith("tabelement")
        or node.tag.endswith("textfield")
        or node.tag.endswith("textarea")
        or node.tag.endswith("menu")
        or node.tag
        in {
            "alert",
            "canvas",
            "check-box",
            "combo-box",
            "entry",
            "icon",
            "image",
            "paragraph",
            "scroll-bar",
            "section",
            "slider",
            "static",
            "table-cell",
            "terminal",
            "text",
            "netuiribbontab",
            "start",
            "trayclockwclass",
            "traydummysearchcontrol",
            "uiimage",
            "uiproperty",
            "uiribboncommandbar",
        }
    )
    # Visibility (platform-specific) AND some textual/image content.
    keeps = (
        keeps
        and (
            platform == "Ubuntu"
            and node.get("{{{:}}}showing".format(_state_ns), "false") == "true"
            and node.get("{{{:}}}visible".format(_state_ns), "false") == "true"
            or platform == "Windows"
            and node.get("{{{:}}}visible".format(_state_ns), "false") == "true"
        )
        and (
            node.get("name", "") != ""
            or node.text is not None
            and len(node.text) > 0
            or check_image
            and node.get("image", "false") == "true"
        )
    )

    # SECURITY FIX: these "(x, y)" strings come from an external XML dump, so
    # parse them with ast.literal_eval instead of eval — identical result for
    # tuple literals, but cannot execute arbitrary code.
    coordinates: Tuple[int, int] = ast.literal_eval(node.get("{{{:}}}screencoord".format(_component_ns), "(-1, -1)"))
    sizes: Tuple[int, int] = ast.literal_eval(node.get("{{{:}}}size".format(_component_ns), "(-1, -1)"))
    # Finally require sane on-screen geometry.
    keeps = keeps and coordinates[0] >= 0 and coordinates[1] >= 0 and sizes[0] > 0 and sizes[1] > 0
    return keeps
|
||||
|
||||
|
||||
def filter_nodes(root: ET, platform="Ubuntu", check_image=False):
    """Collect every node in the tree that judge_node() deems worth keeping."""
    return [candidate for candidate in root.iter() if judge_node(candidate, platform, check_image)]
|
||||
|
||||
|
||||
def draw_bounding_boxes(nodes, image_file_content, down_sampling_ratio=1.0, platform="Ubuntu"):
    """Draw indexed red boxes for the given a11y nodes onto a screenshot.

    Args:
        nodes: elements (already filtered) to annotate.
        image_file_content: raw screenshot bytes.
        down_sampling_ratio: scale applied to both the image and coordinates.
        platform: "Ubuntu" or "Windows"; selects the namespace URIs.

    Returns:
        (marks, drew_nodes, text_table, image_bytes): marks are [x, y, w, h]
        lists in ORIGINAL (pre-downsampling) coordinates, drew_nodes are the
        nodes actually drawn, text_table is a TSV "index tag name text"
        listing, and image_bytes is the annotated PNG.

    Raises:
        ValueError: for an unknown platform value.
    """

    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
        _value_ns = value_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
        _value_ns = value_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    # Load the screenshot image
    image_stream = io.BytesIO(image_file_content)
    image = Image.open(image_stream)
    if float(down_sampling_ratio) != 1.0:
        image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio)))
    draw = ImageDraw.Draw(image)
    marks = []
    drew_nodes = []
    text_informations: List[str] = ["index\ttag\tname\ttext"]

    try:
        # Adjust the path to the font file you have or use a default one
        font = ImageFont.truetype("arial.ttf", 15)
    except IOError:
        # Fallback to a basic font if the specified font can't be loaded
        font = ImageFont.load_default()

    index = 1

    # Loop over all the visible nodes and draw their bounding boxes
    for _node in nodes:
        coords_str = _node.attrib.get("{{{:}}}screencoord".format(_component_ns))
        size_str = _node.attrib.get("{{{:}}}size".format(_component_ns))

        if coords_str and size_str:
            try:
                # Parse the coordinates and size from the strings
                coords = tuple(map(int, coords_str.strip("()").split(", ")))
                size = tuple(map(int, size_str.strip("()").split(", ")))

                import copy

                # Preserve pre-downsampling geometry for the returned marks.
                original_coords = copy.deepcopy(coords)
                original_size = copy.deepcopy(size)

                if float(down_sampling_ratio) != 1.0:
                    # Downsample the coordinates and size
                    coords = tuple(int(coord * down_sampling_ratio) for coord in coords)
                    size = tuple(int(s * down_sampling_ratio) for s in size)

                # Check for negative sizes
                if size[0] <= 0 or size[1] <= 0:
                    raise ValueError(f"Size must be positive, got: {size}")

                # Calculate the bottom-right corner of the bounding box
                bottom_right = (coords[0] + size[0], coords[1] + size[1])

                # Check that bottom_right > coords (x1 >= x0, y1 >= y0)
                if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]:
                    raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}")

                # Check if the area only contains one color
                cropped_image = image.crop((*coords, *bottom_right))
                if len(set(list(cropped_image.getdata()))) == 1:
                    continue

                # Draw rectangle on image
                draw.rectangle([coords, bottom_right], outline="red", width=1)

                # Draw index number at the bottom left of the bounding box with black background
                text_position = (coords[0], bottom_right[1])  # Adjust Y to be above the bottom right
                text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_position, str(index), font=font, anchor="lb")
                # offset: int = bottom_right[1]-text_bbox[3]
                # text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)

                # draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                draw.rectangle(text_bbox, fill="black")
                draw.text(text_position, str(index), font=font, anchor="lb", fill="white")

                # each mark is an x, y, w, h tuple
                marks.append([original_coords[0], original_coords[1], original_size[0], original_size[1]])
                drew_nodes.append(_node)

                # Pick the node's display text: own text, or (on Windows) the
                # value attribute of edit controls; CSV-style double quoting.
                if _node.text:
                    node_text = _node.text if '"' not in _node.text else '"{:}"'.format(_node.text.replace('"', '""'))
                elif _node.get("{{{:}}}class".format(class_ns_windows), "").endswith("EditWrapper") and _node.get(
                    "{{{:}}}value".format(_value_ns)
                ):
                    node_text = _node.get("{{{:}}}value".format(_value_ns), "")
                    node_text = node_text if '"' not in node_text else '"{:}"'.format(node_text.replace('"', '""'))
                else:
                    node_text = '""'
                text_information: str = "{:d}\t{:}\t{:}\t{:}".format(index, _node.tag, _node.get("name", ""), node_text)
                text_informations.append(text_information)

                index += 1

            except ValueError:
                # Node with malformed/degenerate geometry: skip it silently.
                pass

    output_image_stream = io.BytesIO()
    image.save(output_image_stream, format="PNG")
    image_content = output_image_stream.getvalue()

    return marks, drew_nodes, "\n".join(text_informations), image_content
|
||||
|
||||
|
||||
def print_nodes_with_indent(nodes, indent=0):
    """Recursively pretty-print an element subtree, two extra spaces per level."""
    pad = " " * indent
    for child in nodes:
        print(pad, child.tag, child.attrib)
        print_nodes_with_indent(child, indent + 2)
|
||||
|
||||
|
||||
def find_active_applications(tree, state_ns):
    """Return the application names to keep for linearization.

    Applications owning a frame marked active (plus "gnome-shell") are kept;
    when no frame is active the shell defaults ["gjs", "gnome-shell"] are
    returned instead.

    Args:
        tree: ElementTree whose root's children are <application> elements.
        state_ns: namespace URI of the "active" state attribute.
    """
    active_attr = "{{{:}}}active".format(state_ns)
    active_apps = [
        application.attrib.get("name")
        for application in list(tree.getroot())
        for frame in application
        if frame.attrib.get(active_attr, "false") == "true"
    ]
    if active_apps:
        return active_apps + ["gnome-shell"]
    return ["gjs", "gnome-shell"]
|
||||
|
||||
|
||||
def linearize_accessibility_tree(accessibility_tree, platform="Ubuntu"):
    """Flatten an AT-SPI XML dump into a tab-separated table of visible nodes.

    Only applications owning the active frame (plus the shell) are kept; each
    surviving node becomes one "tag<TAB>text<TAB>position<TAB>size" row, and
    near-duplicate rows are then dropped via filter_similar_nodes().

    Args:
        accessibility_tree: the XML dump as a string.
        platform: "Ubuntu" or "Windows"; selects the namespace URIs.

    Returns:
        The linearized table as a string, or "" on any parse failure.

    Raises:
        ValueError: for an unknown platform value.
    """
    if platform == "Ubuntu":
        _attributes_ns = attributes_ns_ubuntu
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
        _value_ns = value_ns_ubuntu
    elif platform == "Windows":
        _attributes_ns = attributes_ns_windows
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
        _value_ns = value_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    # BUGFIX: these patterns were written as f-strings with no placeholders,
    # making "\(" an invalid escape sequence; a raw string is the correct
    # form.  Compiled once and hoisted out of the per-node loop.
    coord_pattern = re.compile(r"\((\d+), (\d+)\)")

    try:
        tree = ET.ElementTree(ET.fromstring(accessibility_tree))
        keep_apps = find_active_applications(tree, _state_ns)

        # Remove inactive applications
        for application in list(tree.getroot()):
            if application.get("name") not in keep_apps:
                tree.getroot().remove(application)

        filtered_nodes = filter_nodes(tree.getroot(), platform, check_image=True)
        linearized_accessibility_tree = ["tag\ttext\tposition (center x & y)\tsize (w & h)"]

        # Linearize the accessibility tree nodes into a table format
        for node in filtered_nodes:
            try:
                # Merge the node's text and name into a single display string.
                text = node.text if node.text is not None else ""
                text = text.strip()
                name = node.get("name", "").strip()
                if text == "":
                    text = name
                elif name != "" and text != name:
                    text = f"{name} ({text})"

                text = text.replace("\n", "\\n")
                pos = node.get("{{{:}}}screencoord".format(_component_ns), "")
                size = node.get("{{{:}}}size".format(_component_ns), "")

                # Report the center of the widget rather than its corner.
                x, y = coord_pattern.match(pos).groups()
                w, h = coord_pattern.match(size).groups()
                x_mid, y_mid = int(x) + int(w) // 2, int(y) + int(h) // 2

                linearized_accessibility_tree.append(
                    "{:}\t{:}\t{:}\t{:}".format(node.tag, text, f"({x_mid}, {y_mid})", size)
                )
            except Exception:
                # Node with malformed/absent geometry: skip it.
                continue

        # Filter out similar nodes
        linearized_accessibility_tree = filter_similar_nodes("\n".join(linearized_accessibility_tree))
    except Exception as e:
        print(f"Error in linearize_accessibility_tree: {e}")
        linearized_accessibility_tree = ""

    return linearized_accessibility_tree
|
||||
|
||||
|
||||
def trim_accessibility_tree(linearized_accessibility_tree, max_items):
    """Cap the linearized tree at max_items lines, appending "..." when cut.

    The input is returned unchanged when it already fits.
    """
    rows = linearized_accessibility_tree.strip().split("\n")
    if len(rows) <= max_items:
        return linearized_accessibility_tree
    return "\n".join(rows[:max_items]) + "\n..."
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
import re
|
||||
|
||||
|
||||
def parse_line(line):
    """Parse one linearized a11y row, e.g. "label Google Chrome (191, 13) (104, 17)".

    Returns a dict with the node type, text, bounding box (x1, y1, x2, y2),
    center, size, and the raw line — or None when the line does not match.
    """
    m = re.match(r"^(\S+)\s+(.+?)\s+\((\d+), (\d+)\)\s+\((\d+), (\d+)\)", line)
    if m is None:
        return None
    node_type, text = m.group(1), m.group(2)
    cx, cy, w, h = (int(g) for g in m.groups()[2:])
    # The row stores the center; recover the top-left corner for the bbox.
    left = cx - w // 2
    top = cy - h // 2
    return {
        "type": node_type,
        "text": text.strip(),
        "bbox": (left, top, left + w, top + h),
        "center": (cx, cy),
        "size": (w, h),
        "raw": line,
    }
|
||||
|
||||
|
||||
def iou(box1, box2):
    """Intersection-over-union of two (x1, y1, x2, y2) boxes; 0 for empty union."""
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    inter_area = overlap_w * overlap_h
    union = (
        (box1[2] - box1[0]) * (box1[3] - box1[1])
        + (box2[2] - box2[0]) * (box2[3] - box2[1])
        - inter_area
    )
    if union == 0:
        return 0
    return inter_area / union
|
||||
|
||||
|
||||
def norm_text(s):
    """Normalize text for comparison: lowercase and drop all whitespace."""
    return "".join(s.lower().split())
|
||||
|
||||
|
||||
def text_similarity(a, b):
    """Crude similarity score: 1.0 when the normalized texts match, else 0."""
    # normalization inlined from norm_text(): lowercase + strip all whitespace
    na = re.sub(r"\s+", "", a.lower())
    nb = re.sub(r"\s+", "", b.lower())
    return 1.0 if na == nb else 0
|
||||
|
||||
|
||||
def filter_similar_nodes(linearized_accessibility_tree):
    """Drop rows whose bounding box AND text nearly duplicate an earlier row.

    Lines that cannot be parsed (e.g. the header row) are preserved unchanged
    and never participate in the similarity comparison.
    """
    lines = [ln for ln in linearized_accessibility_tree.split("\n") if ln.strip()]
    # parse all nodes
    nodes = []
    for ln in lines:
        node = parse_line(ln)
        if node:
            nodes.append(node)
        else:
            # keep lines that fail to parse
            nodes.append({"raw": ln, "invalid": True})
    filtered = []
    removed = [False] * len(nodes)
    # thresholds are tunable
    IOU_THRESH = 0.2
    TEXT_THRESH = 0.9
    for i, ni in enumerate(nodes):
        if ni.get("invalid"):
            filtered.append(ni["raw"])
            continue
        if removed[i]:
            continue
        # Compare against every LATER node; the earlier occurrence wins.
        for j in range(i + 1, len(nodes)):
            nj = nodes[j]
            if nj.get("invalid"):
                continue
            iou_val = iou(ni["bbox"], nj["bbox"])
            text_sim = text_similarity(ni["text"], nj["text"])
            if iou_val > IOU_THRESH and text_sim > TEXT_THRESH:
                # the two are near-identical: drop the later one
                removed[j] = True
                # print(f"removed: {nj['raw']} (too similar to {ni['raw']})")
        # keep nodes not flagged for removal
        if not removed[i]:
            filtered.append(ni["raw"])
    return "\n".join(filtered)
|
||||
|
||||
|
||||
# Example usage / manual smoke test for filter_similar_nodes().
if __name__ == "__main__":
    linearized_accessibility_tree = "tag\ttext\tposition (center x & y)\tsize (w & h)\nicon\t\t(1853, 1001)\t(64, 64)\nlabel\tHome\t(1853, 1045)\t(40, 17)\nlabel\tActivities\t(49, 13)\t(63, 17)\ntext\tActivities\t(49, 13)\t(63, 17)\nlabel\tApr 17 17∶04\t(995, 13)\t(117, 27)\ntext\tApr 17 17∶04\t(995, 13)\t(87, 18)\nmenu\tSystem\t(1867, 13)\t(106, 27)\npush-button\tGoogle Chrome\t(35, 65)\t(70, 64)\npush-button\tThunderbird Mail\t(35, 133)\t(70, 64)\npush-button\tVisual Studio Code\t(35, 201)\t(70, 64)\npush-button\tVLC media player\t(35, 269)\t(70, 64)\npush-button\tLibreOffice Writer\t(35, 337)\t(70, 64)\npush-button\tLibreOffice Calc\t(35, 405)\t(70, 64)\npush-button\tLibreOffice Impress\t(35, 473)\t(70, 64)\npush-button\tGNU Image Manipulation Program\t(35, 541)\t(70, 64)\npush-button\tFiles\t(35, 609)\t(70, 64)\npush-button\tUbuntu Software\t(35, 677)\t(70, 64)\npush-button\tHelp\t(35, 745)\t(70, 64)\npush-button\tTrash\t(35, 816)\t(70, 64)\ntoggle-button\tShow Applications\t(35, 1045)\t(70, 70)"
    result = filter_similar_nodes(linearized_accessibility_tree)
    print(result)
|
||||
|
|
@ -1,260 +0,0 @@
|
|||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
def agent_action(func):
    """Mark *func* as an exposed agent action and return it unchanged."""
    setattr(func, "is_agent_action", True)
    return func
|
||||
|
||||
|
||||
switch_window_code = """import subprocess;
|
||||
import pyautogui;
|
||||
pyautogui.press('escape');
|
||||
time.sleep(0.5);
|
||||
subprocess.run(['wmctrl', '-ia', 'WINDOW_ID'])
|
||||
subprocess.run(['wmctrl', '-ir', 'WINDOW_ID', '-b', 'add,maximized_vert,maximized_horz'])
|
||||
print('Switch to WINDOW_ID')"""
|
||||
|
||||
launch_app_commands = {
|
||||
# Web Browser
|
||||
"chrome": "google-chrome --remote-debugging-port=1337",
|
||||
# File Manager
|
||||
"files": "nautilus",
|
||||
# Terminal
|
||||
"terminal": 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus" && gnome-terminal',
|
||||
# Utilities
|
||||
"gedit": "gedit",
|
||||
# Office
|
||||
"libreoffice writer": "libreoffice --writer",
|
||||
"libreoffice calc": "libreoffice --calc",
|
||||
"libreoffice impress": "libreoffice --impress",
|
||||
# System
|
||||
"settings": 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus" && gnome-control-center',
|
||||
# Multimedia
|
||||
"vlc": "vlc",
|
||||
"gimp": "gimp",
|
||||
# IDE
|
||||
"vs code": "code",
|
||||
# Email
|
||||
"thunderbird": "thunderbird",
|
||||
}
|
||||
|
||||
|
||||
class GroundingAgent:
    """Grounds high-level AutoGLM actions into concrete pyautogui/tool command strings."""

    # Maps a normalized app identifier to the tool class name referenced in
    # the generated command string.
    tool_list = {
        "libreoffice_calc": "CalcTools",
        "libreoffice_impress": "ImpressTools",
        "libreoffice_writer": "WriterTools",
        "code": "CodeTools",
        "vlc": "VLCTools",
        "google_chrome": "BrowserTools",
    }

    relative_coordinate = True  # whether the coordinates are relative (0-1000) or absolute (e.g. 1920x1080)
|
||||
|
||||
@classmethod
|
||||
def tool_commands(cls, code: str, tool_name: str):
|
||||
command = f"from {tool_name} import *; "
|
||||
command += code
|
||||
|
||||
tool_class = cls.tool_list[tool_name]
|
||||
command += f"; {tool_class}.print_result()"
|
||||
|
||||
return [
|
||||
command,
|
||||
]
|
||||
|
||||
@classmethod
|
||||
@agent_action
|
||||
def click(
|
||||
cls,
|
||||
coordinate: List,
|
||||
num_clicks: int = 1,
|
||||
button_type: str = "left",
|
||||
):
|
||||
"""
|
||||
Click on the element
|
||||
|
||||
Args:
|
||||
coordinate (List): [x, y], coordinate of the element to click on
|
||||
num_clicks (int): number of times to click the element
|
||||
button_type (str): which mouse button to press ("left", "middle", or "right")
|
||||
"""
|
||||
command = ""
|
||||
x, y = coordinate
|
||||
if cls.relative_coordinate:
|
||||
x, y = round(x * 1920 / 1000), round(y * 1080 / 1000)
|
||||
command += f"""pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); print("Click Success")""" # TODO: 最大化窗口需要一次调用
|
||||
return command
|
||||
|
||||
@classmethod
|
||||
@agent_action
|
||||
def type(
|
||||
cls,
|
||||
coordinate: Optional[List] = None,
|
||||
text: str = "",
|
||||
overwrite: bool = False,
|
||||
enter: bool = False,
|
||||
):
|
||||
"""
|
||||
Type text into the element
|
||||
|
||||
Args:
|
||||
coordinate (List): [x, y], coordinate of the element to type into. If None, typing starts at current cursor location
|
||||
text (str): the text to type
|
||||
overwrite (bool): True to overwrite existing text, False otherwise
|
||||
enter (bool): True to press enter after typing, False otherwise
|
||||
"""
|
||||
|
||||
command = ""
|
||||
|
||||
if coordinate is not None:
|
||||
# Start typing at the center of the element
|
||||
x, y = coordinate
|
||||
if cls.relative_coordinate:
|
||||
x, y = round(x * 1920 / 1000), round(y * 1080 / 1000)
|
||||
command += f"pyautogui.click({x}, {y}); "
|
||||
|
||||
if overwrite:
|
||||
command += f"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); "
|
||||
|
||||
command += f"pyautogui.write({repr(text)}); "
|
||||
|
||||
if enter:
|
||||
command += "pyautogui.press('enter'); "
|
||||
|
||||
command += "print('Type Success')"
|
||||
|
||||
return command
|
||||
|
||||
@classmethod
|
||||
@agent_action
|
||||
def drag_and_drop(cls, drag_from_coordinate: List, drop_on_coordinate: List):
|
||||
"""
|
||||
Drag element1 and drop it on element2
|
||||
|
||||
Args:
|
||||
drag_from_coordinate (List): [x, y], coordinate of element to drag
|
||||
drop_on_coordinate (List): [x, y], coordinate of element to drop on
|
||||
"""
|
||||
x1, y1 = drag_from_coordinate
|
||||
if cls.relative_coordinate:
|
||||
x1, y1 = round(x1 * 1920 / 1000), round(y1 * 1080 / 1000)
|
||||
x2, y2 = drop_on_coordinate
|
||||
if cls.relative_coordinate:
|
||||
x2, y2 = round(x2 * 1920 / 1000), round(y2 * 1080 / 1000)
|
||||
|
||||
command = f"pyautogui.moveTo({x1}, {y1}); "
|
||||
# TODO: specified duration?
|
||||
command += f"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); "
|
||||
|
||||
command += "print('Drag and Drop Success')"
|
||||
|
||||
return command
|
||||
|
||||
@classmethod
@agent_action
def scroll(cls, coordinate: List, direction: str):
    """
    Scroll at the given position in the requested direction.

    Args:
        coordinate (List): [x, y], coordinate of the element to scroll in
        direction (str): the direction to scroll ("up" or "down")
    """
    pos_x, pos_y = coordinate
    if cls.relative_coordinate:
        # Normalized 0-1000 coordinates -> 1920x1080 pixel space.
        pos_x, pos_y = round(pos_x * 1920 / 1000), round(pos_y * 1080 / 1000)
    # pyautogui: positive click count scrolls up, negative scrolls down.
    clicks = 100 if direction == "up" else -100
    return (
        f"import pyautogui; pyautogui.moveTo({pos_x}, {pos_y}); "
        f"pyautogui.scroll({clicks}); print('Scroll Success')"
    )
|
||||
|
||||
@classmethod
@agent_action
def open_app(cls, app_name: str):
    """
    Open a specified application

    Supported apps: chrome, files, terminal, gedit, libreoffice writer,
    libreoffice calc, libreoffice impress, vs code, vlc, gimp, settings, thunderbird

    Args:
        app_name (str): name of the application to open
    """
    normalized = app_name.lower().strip()

    # Unknown app: emit code that just reports the problem.
    if normalized not in launch_app_commands:
        return f"print(f'{normalized} is not supported or recognized')"

    # Known app: return a structured OPEN_APP action (a dict, not a code
    # string) that the executor handles specially.
    return {
        "action_type": "OPEN_APP",
        "parameters": {"launch_app_command": launch_app_commands[normalized], "app_name": normalized},
    }
|
||||
|
||||
@classmethod
@agent_action
def switch_window(cls, window_id: str):
    """
    Switch focus to the window with the given window id.

    Args:
        window_id (str): the window id to switch to from the provided list of open windows
    """
    # The script template carries a WINDOW_ID placeholder; substitute it here.
    script = switch_window_code
    return script.replace("WINDOW_ID", window_id)
|
||||
|
||||
@classmethod
@agent_action
def hotkey(cls, keys: List):
    """
    Press a hotkey combination

    Args:
        keys (List): the keys to press in combination (e.g. ['ctrl', 'c'] for copy, ['prtsc'] for screenshot)
    """
    # Single-quote each key so it becomes a string literal argument in the
    # generated pyautogui call.
    quoted = ", ".join(f"'{key}'" for key in keys)
    # Escape the quotes for the human-readable message embedded inside the
    # generated single-quoted f-string.
    display = quoted.replace("'", "\\'")
    return f"import pyautogui; pyautogui.hotkey({quoted}); print(f'Press Hotkey: {display}')"
|
||||
|
||||
@classmethod
@agent_action
def quote(cls, content: str):
    """
    Quote information from the current page for memory

    Args:
        content (str): text summarized or copied from the page for later operation
    """
    # Use repr() to embed the content as a safely escaped string literal.
    # The previous triple-quoted form (print("""{content}""")) produced
    # broken or injectable code whenever content contained '"""', a trailing
    # '"', or backslash escapes. repr() matches how `type` embeds text via
    # repr(text), and the printed output is unchanged for ordinary content.
    return f"print({repr(content)})"
|
||||
|
||||
@classmethod
@agent_action
def wait(cls):
    """
    Pause for a while; the executor interprets the WAIT sentinel.
    """
    return "WAIT"
|
||||
|
||||
@classmethod
@agent_action
def exit(cls, success: bool):
    """
    End the current task

    Args:
        success (bool): True if successfully finish a task, False otherwise
    """
    # Map the outcome onto the executor's DONE / FAIL sentinels.
    return "DONE" if success else "FAIL"
|
||||
|
|
@ -1,194 +0,0 @@
|
|||
import inspect
|
||||
import json
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
# Parent of this file's directory (the package root, despite the name);
# used below to locate the tools/apis/*.json tool specs.
current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
def generate_func(json_data):
    """
    Render OpenAI-style tool specs into Python stub source code for prompts.

    Args:
        json_data (list): entries of the form
            {"type": "function", "function": {...}} (see tools/apis/*.json).

    Returns:
        tuple: (code, cls_name) — the generated stub source, and the name of
        the last class encountered ("" when every function is class-less).
    """
    # Collect all class names and the functions belonging to each of them
    class_funcs = {}
    no_class_funcs = []
    cls_name = ""

    for item in json_data:
        if item["type"] == "function":
            func = item["function"]
            func_parts = func["name"].split(".")

            # A dotted name ("Class.method") is grouped under its class;
            # anything else is treated as a free function.
            if len(func_parts) == 2:
                class_name, func_name = func_parts
                if class_name not in class_funcs:
                    class_funcs[class_name] = []
                class_funcs[class_name].append(item)
            else:
                no_class_funcs.append(item)

    code = ""

    # Generate the functions that belong to a class
    for class_name, funcs in class_funcs.items():
        code += f"class {class_name}:\n"
        # NOTE(review): only the LAST class name survives into the return
        # value; with multiple classes in one spec, earlier names are lost.
        cls_name = class_name
        for item in funcs:
            func = item["function"]
            func_name = func["name"].split(".")[-1]
            description = func["description"]
            params = func["parameters"]["properties"]
            required = func["parameters"].get("required", [])

            # Build the parameter list
            param_list = ["cls"]
            # Required parameters first
            for param_name in required:
                param_list.append(f"{param_name}")
            # Then the optional parameters
            for param_name in params:
                if param_name not in required:
                    param_list.append(f"{param_name}")  # optional parameters get no default value here

            # Build the function definition
            func_def = f" def {func_name}({', '.join(param_list)}):\n"

            # Build the docstring
            docstring = f' """\n {description}\n\n Args:\n'
            if len(param_list) == 1:  # only the cls parameter
                docstring += " None\n"
            else:
                # Document the required parameters first
                for param_name in required:
                    param_type = params[param_name]["type"]
                    param_desc = params[param_name].get("description", "")
                    docstring += f" {param_name} ({param_type}): {param_desc}\n"
                # Then the optional parameters
                for param_name in params:
                    if param_name not in required:
                        param_type = params[param_name]["type"]
                        param_desc = params[param_name].get("description", "")
                        docstring += f" {param_name} ({param_type}, optional): {param_desc}\n"

            docstring += ' """\n'

            code += func_def + docstring + "\n"

        code += "\n"

    # Generate the functions that have no class
    for item in no_class_funcs:
        func = item["function"]
        func_name = func["name"]
        description = func["description"]
        params = func["parameters"]["properties"]
        required = func["parameters"].get("required", [])

        # Build the parameter list
        param_list = []
        # Required parameters first
        for param_name in required:
            param_list.append(f"{param_name}")
        # Then the optional parameters
        for param_name in params:
            if param_name not in required:
                param_list.append(f"{param_name}")

        # Build the function definition
        func_def = f"def {func_name}({', '.join(param_list)}):\n"

        # Build the docstring
        docstring = f' """\n {description}\n\n Args:\n'
        if not param_list:
            docstring += " None\n"
        else:
            # Document the required parameters first
            for param_name in required:
                param_type = params[param_name]["type"]
                param_desc = params[param_name].get("description", "")
                docstring += f" {param_name} ({param_type}): {param_desc}\n"
            # Then the optional parameters
            for param_name in params:
                if param_name not in required:
                    param_type = params[param_name]["type"]
                    param_desc = params[param_name].get("description", "")
                    docstring += f" {param_name} ({param_type}, optional): {param_desc}\n"

        docstring += ' """\n'

        code += func_def + docstring + "\n"

    return code.strip(), cls_name
|
||||
|
||||
|
||||
# System-prompt skeleton; Prompt.construct_procedural_memory fills in
# {observation_list} and {format_hint}.
setup_prompt = """You are a GUI operation agent. You will be given a task and your action history, with current observation ({observation_list}). You should help me control the computer, output the best action step by step to accomplish the task.
You should first generate a plan, reflect on the current observation, then generate actions to complete the task in python-style pseudo code using the predefined functions.

* Output Format:
{format_hint}"""

# Wrapper for the generated Agent/tool stub code shown to the model;
# {class_content} is the stub source produced by generate_func / inspect.
func_def_template = """* Available Functions:
```python
{class_content}
```"""

# Usage rules appended to every prompt; {relative_coordinate_hint} and
# {client_password} are filled in by Prompt.construct_procedural_memory.
note_prompt = """* Note:
- Your code should only be wrapped in ```python```.
- Only **ONE-LINE-OF-CODE** at a time.
- Each code block is context independent, and variables from the previous round cannot be used in the next round.
{relative_coordinate_hint}- Return with `Agent.exit(success=True)` immediately after the task is completed.
- The computer's environment is Linux, e.g., Desktop path is '/home/user/Desktop'
- My computer's password is '{client_password}', feel free to use it when you need sudo rights"""
|
||||
|
||||
|
||||
class Prompt:
    @staticmethod
    def construct_procedural_memory(agent_class, app_name=None, client_password="password", with_image=True, with_atree=False, relative_coordinate=True, glm41v_format=True):
        """
        Build the three prompt sections for the agent.

        Args:
            agent_class: class whose @agent_action methods are rendered as
                available functions (detected via the is_agent_action marker).
            app_name (str, optional): when given, tools/apis/<app_name>.json
                is loaded and its generated stubs are appended.
            client_password (str): password substituted into the note section.
            with_image (bool): include "screenshot" in the observation list.
            with_atree (bool): include the a11y-tree entry in the observation list.
            relative_coordinate (bool): include the 0-1000 coordinate hint.
            glm41v_format (bool): use the <answer>-wrapped output format.

        Returns:
            tuple: (setup_prompt, function_definitions_prompt, note_prompt).
        """
        agent_class_content = "Class Agent:"
        for attr_name in dir(agent_class):
            attr = getattr(agent_class, attr_name)
            # Only methods marked by the @agent_action decorator are exposed.
            if callable(attr) and hasattr(attr, "is_agent_action"):
                # Use inspect to get the full function signature
                signature = inspect.signature(attr)
                agent_class_content += f"""
    def {attr_name}{signature}:
        '''{attr.__doc__}'''
    """

        if app_name is not None:
            # Append stubs generated from the app's JSON tool spec.
            tool_path = os.path.join(current_dir, "tools", "apis", f"{app_name.lower()}.json")
            with open(tool_path, "r") as f:
                json_data = json.load(f)

            tool_class_content, tool_class_name = generate_func(json_data)

            agent_class_content += "\n\n{}".format(tool_class_content)

        func_def_prompt = func_def_template.format(class_content=agent_class_content.strip())

        # --- dynamic observation list ---
        obs_items = []
        if with_image:
            obs_items.append("screenshot")
        obs_items.append("current app name")
        if with_atree:
            obs_items.append("a11y tree (based on AT-SPI library)")
        obs_items.append("app info")
        obs_items.append("last action result")
        observation_list = ", ".join(obs_items)

        setup_prompt_formatted = setup_prompt.format(
            observation_list=observation_list,
            format_hint="<think>\n{**YOUR-PLAN-AND-THINKING**}</think>\n<answer>```python\n{**ONE-LINE-OF-CODE**}\n```</answer>" if glm41v_format else "<think>\n{**YOUR-PLAN-AND-THINKING**}\n</think>\n```python\n{**ONE-LINE-OF-CODE**}\n```"
        )

        note_prompt_formatted = note_prompt.format(
            relative_coordinate_hint="- The coordinate [x, y] should be normalized to 0-1000, which usually should be the center of a specific target element.\n" if relative_coordinate else "",
            client_password=client_password
        )

        return setup_prompt_formatted, func_def_prompt, note_prompt_formatted
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: render the prompt sections for the VLC toolset.
    from grounding_agent import GroundingAgent

    sections = Prompt.construct_procedural_memory(GroundingAgent, "vlc")
    print(sections)
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
from .func import generate_func
|
||||
|
||||
__all__ = ["generate_func"]
|
||||
|
|
@ -1,236 +0,0 @@
|
|||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.launch_vscode",
|
||||
"description": "Launch VS Code with specified path",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "File path or directory to open"
|
||||
}
|
||||
},
|
||||
"required": ["path"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.compare_files",
|
||||
"description": "Compare two files in VS Code",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file1": {
|
||||
"type": "string",
|
||||
"description": "First file path"
|
||||
},
|
||||
"file2": {
|
||||
"type": "string",
|
||||
"description": "Second file path"
|
||||
}
|
||||
},
|
||||
"required": ["file1", "file2"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.add_folder",
|
||||
"description": "Add folder to active VS Code window",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"folder": {
|
||||
"type": "string",
|
||||
"description": "Folder path to add"
|
||||
}
|
||||
},
|
||||
"required": ["folder"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.goto_file",
|
||||
"description": "Open file at specific position",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "File path to open"
|
||||
},
|
||||
"line": {
|
||||
"type": "integer",
|
||||
"description": "Line number",
|
||||
"default": 1
|
||||
},
|
||||
"character": {
|
||||
"type": "integer",
|
||||
"description": "Character position",
|
||||
"default": 1
|
||||
}
|
||||
},
|
||||
"required": ["file_path"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.perform_merge",
|
||||
"description": "Perform three-way merge",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path1": {
|
||||
"type": "string",
|
||||
"description": "First version file path"
|
||||
},
|
||||
"path2": {
|
||||
"type": "string",
|
||||
"description": "Second version file path"
|
||||
},
|
||||
"base": {
|
||||
"type": "string",
|
||||
"description": "Base version file path"
|
||||
},
|
||||
"result": {
|
||||
"type": "string",
|
||||
"description": "Output file path"
|
||||
}
|
||||
},
|
||||
"required": ["path1", "path2", "base", "result"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.remove_folder",
|
||||
"description": "Remove folder from active VS Code window",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"folder": {
|
||||
"type": "string",
|
||||
"description": "Folder path to remove"
|
||||
}
|
||||
},
|
||||
"required": ["folder"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.install_extension",
|
||||
"description": "Install or update VS Code extension",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"extension_id": {
|
||||
"type": "string",
|
||||
"description": "Extension identifier"
|
||||
},
|
||||
"pre_release": {
|
||||
"type": "boolean",
|
||||
"description": "Install pre-release version",
|
||||
"default": false
|
||||
}
|
||||
},
|
||||
"required": ["extension_id"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.uninstall_extension",
|
||||
"description": "Uninstall VS Code extension",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"extension_id": {
|
||||
"type": "string",
|
||||
"description": "Extension identifier"
|
||||
}
|
||||
},
|
||||
"required": ["extension_id"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.list_extensions",
|
||||
"description": "List installed extensions",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"show_versions": {
|
||||
"type": "boolean",
|
||||
"description": "Show extension versions",
|
||||
"default": false
|
||||
},
|
||||
"category": {
|
||||
"type": "string",
|
||||
"description": "Filter by category"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.update_extensions",
|
||||
"description": "Update all extensions to latest version",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.disable_extension",
|
||||
"description": "Disable extension for next VS Code instance",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"extension_id": {
|
||||
"type": "string",
|
||||
"description": "Extension identifier"
|
||||
}
|
||||
},
|
||||
"required": ["extension_id"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CodeTools.toggle_sync",
|
||||
"description": "Toggle VS Code synchronization",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "Sync state",
|
||||
"enum": ["on", "off"]
|
||||
}
|
||||
},
|
||||
"required": ["state"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
def generate_func(json_data):
    """
    Render OpenAI-style tool specs into Python stub source code.

    Args:
        json_data (list): entries of the form
            {"type": "function", "function": {...}}.

    Returns:
        str: generated stub source (class wrappers for dotted names,
        free functions otherwise), stripped of trailing whitespace.
    """
    # Collect all class names and the functions belonging to each of them
    class_funcs = {}
    no_class_funcs = []

    for item in json_data:
        if item["type"] == "function":
            func = item["function"]
            func_parts = func["name"].split(".")

            # A dotted name ("Class.method") is grouped under its class;
            # anything else is treated as a free function.
            if len(func_parts) == 2:
                class_name, func_name = func_parts
                if class_name not in class_funcs:
                    class_funcs[class_name] = []
                class_funcs[class_name].append(item)
            else:
                no_class_funcs.append(item)

    code = ""

    # Generate the functions that belong to a class
    for class_name, funcs in class_funcs.items():
        code += f"class {class_name}:\n"
        for item in funcs:
            func = item["function"]
            func_name = func["name"].split(".")[-1]
            description = func["description"]
            params = func["parameters"]["properties"]
            required = func["parameters"].get("required", [])

            # Build the parameter list
            param_list = ["cls"]
            # Required parameters first
            for param_name in required:
                param_list.append(f"{param_name}")
            # Then the optional parameters
            for param_name in params:
                if param_name not in required:
                    param_list.append(f"{param_name}")  # optional parameters get no default value here

            # Build the function definition
            func_def = f" def {func_name}({', '.join(param_list)}):\n"

            # Build the docstring
            docstring = f' """\n {description}\n\n Args:\n'
            if len(param_list) == 1:  # only the cls parameter
                docstring += " None\n"
            else:
                # Document the required parameters first
                for param_name in required:
                    param_type = params[param_name]["type"]
                    param_desc = params[param_name].get("description", "")
                    docstring += f" {param_name} ({param_type}): {param_desc}\n"
                # Then the optional parameters
                for param_name in params:
                    if param_name not in required:
                        param_type = params[param_name]["type"]
                        param_desc = params[param_name].get("description", "")
                        docstring += f" {param_name} ({param_type}, optional): {param_desc}\n"

            docstring += ' """\n'

            code += func_def + docstring + "\n"

        code += "\n"

    # Generate the functions that have no class
    for item in no_class_funcs:
        func = item["function"]
        func_name = func["name"]
        description = func["description"]
        params = func["parameters"]["properties"]
        required = func["parameters"].get("required", [])

        # Build the parameter list
        param_list = []
        # Required parameters first
        for param_name in required:
            param_list.append(f"{param_name}")
        # Then the optional parameters
        for param_name in params:
            if param_name not in required:
                param_list.append(f"{param_name}")

        # Build the function definition
        func_def = f"def {func_name}({', '.join(param_list)}):\n"

        # Build the docstring
        docstring = f' """\n {description}\n\n Args:\n'
        if not param_list:
            docstring += " None\n"
        else:
            # Document the required parameters first
            for param_name in required:
                param_type = params[param_name]["type"]
                param_desc = params[param_name].get("description", "")
                docstring += f" {param_name} ({param_type}): {param_desc}\n"
            # Then the optional parameters
            for param_name in params:
                if param_name not in required:
                    param_type = params[param_name]["type"]
                    param_desc = params[param_name].get("description", "")
                    docstring += f" {param_name} ({param_type}, optional): {param_desc}\n"

        docstring += ' """\n'

        code += func_def + docstring + "\n"

    return code.strip()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import json

    # Quick manual check: render the LibreOffice Calc tool spec to stub code.
    with open("libreoffice_calc.json", "r") as fh:
        spec = json.load(fh)
    print(generate_func(spec))
|
||||
|
|
@ -1,134 +0,0 @@
|
|||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_profile_settings",
|
||||
"description": "Opens profile settings page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_password_settings",
|
||||
"description": "Opens password/autofill settings page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_privacy_settings",
|
||||
"description": "Opens privacy settings page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_appearance_settings",
|
||||
"description": "Opens appearance settings page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_search_engine_settings",
|
||||
"description": "Opens search engine settings page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.bring_back_last_tab",
|
||||
"description": "Restores last-closed tab (Ctrl+Shift+T).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.print",
|
||||
"description": "Opens print dialog (Ctrl+P).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.delete_browsing_data",
|
||||
"description": "Opens clear browsing data dialog (Ctrl+Shift+Del).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_extensions",
|
||||
"description": "Opens extensions management page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.bookmark_page",
|
||||
"description": "Bookmarks current page (Ctrl+D).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "BrowserTools.open_bookmarks",
|
||||
"description": "Opens bookmarks page.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,634 +0,0 @@
|
|||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.get_workbook_info",
|
||||
"description": "Get workbook info: file path, name, sheets, and active sheet",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.save",
|
||||
"description": "Save workbook to current location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.get_column_data",
|
||||
"description": "Get all data from specified column",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"column_name": {
|
||||
"type": "string",
|
||||
"description": "Column name (e.g. 'A', 'B')"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"column_name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.switch_active_sheet",
|
||||
"description": "Switch to sheet (creates if not exists)",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"sheet_name": {
|
||||
"type": "string",
|
||||
"description": "Sheet name"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"sheet_name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.set_column_values",
|
||||
"description": "Set values to column (values only, not formulas)",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"column_name": {
|
||||
"type": "string",
|
||||
"description": "Column name (e.g. 'A', 'B')"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"description": "Values to write"
|
||||
},
|
||||
"start_index": {
|
||||
"type": "integer",
|
||||
"description": "First row index (default: 2)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"column_name",
|
||||
"data"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.highlight_range",
|
||||
"description": "Highlight range with color",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"range_str": {
|
||||
"type": "string",
|
||||
"description": "Range (e.g. 'A1:B10')"
|
||||
},
|
||||
"color": {
|
||||
"type": "integer",
|
||||
"description": "Color value (default: 0xFF0000)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"range_str"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.transpose_range",
|
||||
"description": "Transpose range and paste to target cell",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"source_range": {
|
||||
"type": "string",
|
||||
"description": "Source range (e.g. 'A1:B10')"
|
||||
},
|
||||
"target_cell": {
|
||||
"type": "string",
|
||||
"description": "Target cell (e.g. 'A1')"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"source_range",
|
||||
"target_cell"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.export_to_csv",
|
||||
"description": "Export to CSV with same path/name",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.sort_column",
|
||||
"description": "Sort column data",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"column_name": {
|
||||
"type": "string",
|
||||
"description": "Column name (e.g. 'A', 'B')"
|
||||
},
|
||||
"ascending": {
|
||||
"type": "boolean",
|
||||
"description": "Sort ascending (default: true)"
|
||||
},
|
||||
"start_index": {
|
||||
"type": "integer",
|
||||
"description": "First row index (default: 2)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"column_name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.set_validation_list",
|
||||
"description": "Set validation list for column",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"column_name": {
|
||||
"type": "string",
|
||||
"description": "Column name (e.g. 'A', 'B')"
|
||||
},
|
||||
"values": {
|
||||
"type": "array",
|
||||
"description": "Validation values"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"column_name",
|
||||
"values"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.hide_row_data",
|
||||
"description": "Hide rows containing value",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": "Value to hide (default: 'N/A')"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.reorder_columns",
|
||||
"description": "Reorder columns by specified order",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"column_order": {
|
||||
"type": "array",
|
||||
"description": "Column names in desired order (e.g. ['A', 'B', 'C'])"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"column_order"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.create_pivot_table",
|
||||
"description": "Create pivot table from source sheet",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"source_sheet": {
|
||||
"type": "string",
|
||||
"description": "Source sheet name"
|
||||
},
|
||||
"table_name": {
|
||||
"type": "string",
|
||||
"description": "Pivot table name"
|
||||
},
|
||||
"row_fields": {
|
||||
"type": "array",
|
||||
"description": "Row labels (e.g. ['A', 'B'])"
|
||||
},
|
||||
"col_fields": {
|
||||
"type": "array",
|
||||
"description": "Column labels (e.g. ['A', 'B'])"
|
||||
},
|
||||
"value_fields": {
|
||||
"type": "array",
|
||||
"description": "Value fields (e.g. ['A', 'B'])"
|
||||
},
|
||||
"aggregation_function": {
|
||||
"type": "string",
|
||||
"description": "Aggregation function (sum, count, average, min, max)"
|
||||
},
|
||||
"target_cell": {
|
||||
"type": "string",
|
||||
"description": "Target cell (default: 'A1')"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"source_sheet",
|
||||
"table_name",
|
||||
"value_fields"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.merge_cells",
|
||||
"description": "Merge cells in range",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"range_str": {
|
||||
"type": "string",
|
||||
"description": "Cell range (e.g. 'A1:B10')"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"range_str"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.set_cell_value",
|
||||
"description": "Set cell value",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"cell": {
|
||||
"type": "string",
|
||||
"description": "Cell reference (e.g. 'A1')"
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": "Cell value"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"cell",
|
||||
"value"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.format_range",
|
||||
"description": "Apply formatting to range",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"range_str": {
|
||||
"type": "string",
|
||||
"description": "Range (e.g. 'A1:B10')"
|
||||
},
|
||||
"background_color": {
|
||||
"type": "string",
|
||||
"description": "Background color (e.g. '#0000ff')"
|
||||
},
|
||||
"font_color": {
|
||||
"type": "string",
|
||||
"description": "Font color (e.g. '#ffffff')"
|
||||
},
|
||||
"bold": {
|
||||
"type": "boolean",
|
||||
"description": "Bold text"
|
||||
},
|
||||
"alignment": {
|
||||
"type": "string",
|
||||
"description": "Text alignment (left, center, right)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"range_str"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.create_chart",
|
||||
"description": "Create chart from data range",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"chart_type": {
|
||||
"type": "string",
|
||||
"description": "Chart type (bar, column, line, pie, scatter, area)"
|
||||
},
|
||||
"data_range": {
|
||||
"type": "string",
|
||||
"description": "Data range (e.g. 'A1:B10')"
|
||||
},
|
||||
"title": {
|
||||
"type": "string",
|
||||
"description": "Chart title"
|
||||
},
|
||||
"x_axis_title": {
|
||||
"type": "string",
|
||||
"description": "X axis title"
|
||||
},
|
||||
"y_axis_title": {
|
||||
"type": "string",
|
||||
"description": "Y axis title"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"chart_type",
|
||||
"data_range"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.freeze_panes",
|
||||
"description": "Freeze rows/columns",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"rows": {
|
||||
"type": "integer",
|
||||
"description": "Rows to freeze from top"
|
||||
},
|
||||
"columns": {
|
||||
"type": "integer",
|
||||
"description": "Columns to freeze from left"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.rename_sheet",
|
||||
"description": "Rename worksheet",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"old_name": {
|
||||
"type": "string",
|
||||
"description": "Current sheet name"
|
||||
},
|
||||
"new_name": {
|
||||
"type": "string",
|
||||
"description": "New sheet name"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"old_name",
|
||||
"new_name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.copy_sheet",
|
||||
"description": "Copy worksheet",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"source_sheet": {
|
||||
"type": "string",
|
||||
"description": "Source sheet name"
|
||||
},
|
||||
"new_sheet_name": {
|
||||
"type": "string",
|
||||
"description": "New sheet name (optional)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"source_sheet"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.reorder_sheets",
|
||||
"description": "Change sheet order",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"sheet_name": {
|
||||
"type": "string",
|
||||
"description": "Sheet to move"
|
||||
},
|
||||
"position": {
|
||||
"type": "integer",
|
||||
"description": "New position (0-based)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"sheet_name",
|
||||
"position"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.set_chart_legend_position",
|
||||
"description": "Set chart legend position",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"position": {
|
||||
"type": "string",
|
||||
"description": "Legend position (top, bottom, left, right, none)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"position"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.set_number_format",
|
||||
"description": "Apply number format to range",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"range_str": {
|
||||
"type": "string",
|
||||
"description": "Range (e.g. 'A1:B10')"
|
||||
},
|
||||
"format_type": {
|
||||
"type": "string",
|
||||
"description": "Format type (general, number, currency, accounting, date, time, percentage, fraction, scientific, text)"
|
||||
},
|
||||
"decimal_places": {
|
||||
"type": "integer",
|
||||
"description": "Decimal places (optional)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"range_str",
|
||||
"format_type"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.adjust_column_width",
|
||||
"description": "Adjust column width",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"columns": {
|
||||
"type": "string",
|
||||
"description": "Column range (e.g. 'A:C')"
|
||||
},
|
||||
"width": {
|
||||
"type": "number",
|
||||
"description": "Width in characters"
|
||||
},
|
||||
"autofit": {
|
||||
"type": "boolean",
|
||||
"description": "Autofit to content"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"columns"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.adjust_row_height",
|
||||
"description": "Adjust row height",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"rows": {
|
||||
"type": "string",
|
||||
"description": "Row range (e.g. '1:10')"
|
||||
},
|
||||
"height": {
|
||||
"type": "number",
|
||||
"description": "Height in points"
|
||||
},
|
||||
"autofit": {
|
||||
"type": "boolean",
|
||||
"description": "Autofit to content"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"rows"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.export_to_pdf",
|
||||
"description": "Export to PDF",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "PDF save path (default: same as original)"
|
||||
},
|
||||
"sheets": {
|
||||
"type": "array",
|
||||
"description": "Sheets to include (default: all)"
|
||||
},
|
||||
"open_after_export": {
|
||||
"type": "boolean",
|
||||
"description": "Open PDF after export (default: false)"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "CalcTools.set_zoom_level",
|
||||
"description": "Set worksheet zoom level",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"zoom_percentage": {
|
||||
"type": "integer",
|
||||
"description": "Zoom percentage (10-400)"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"zoom_percentage"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,559 +0,0 @@
|
|||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.save",
|
||||
"description": "Save current presentation",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.go_to_slide",
|
||||
"description": "Navigate to specific slide",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.get_slide_count",
|
||||
"description": "Get total slide count",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.duplicate_slide",
|
||||
"description": "Duplicate slide and place at end",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index to duplicate (1-based)"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_slide_font",
|
||||
"description": "Set font for all text in slide",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"font_name": {
|
||||
"type": "string",
|
||||
"description": "Font name (e.g., 'Arial', 'Times New Roman')"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "font_name"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.write_text",
|
||||
"description": "Write text to textbox",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "Text content"
|
||||
},
|
||||
"page_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
},
|
||||
"bold": {
|
||||
"type": "boolean",
|
||||
"description": "Bold text (default: false)"
|
||||
},
|
||||
"italic": {
|
||||
"type": "boolean",
|
||||
"description": "Italic text (default: false)"
|
||||
},
|
||||
"size": {
|
||||
"type": "integer",
|
||||
"description": "Font size"
|
||||
},
|
||||
"append": {
|
||||
"type": "boolean",
|
||||
"description": "Append to existing text (default: false)"
|
||||
}
|
||||
},
|
||||
"required": ["content", "page_index", "box_index"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_style",
|
||||
"description": "Set text style for textbox",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
},
|
||||
"bold": {
|
||||
"type": "boolean",
|
||||
"description": "Bold text"
|
||||
},
|
||||
"italic": {
|
||||
"type": "boolean",
|
||||
"description": "Italic text"
|
||||
},
|
||||
"underline": {
|
||||
"type": "boolean",
|
||||
"description": "Underline text"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.configure_auto_save",
|
||||
"description": "Configure auto-save settings",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean",
|
||||
"description": "Enable auto-save"
|
||||
},
|
||||
"interval_minutes": {
|
||||
"type": "number",
|
||||
"description": "Auto-save interval in minutes (min: 1)"
|
||||
}
|
||||
},
|
||||
"required": ["enabled", "interval_minutes"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_background_color",
|
||||
"description": "Set textbox background color",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
},
|
||||
"color": {
|
||||
"type": "string",
|
||||
"description": "Color name or hex code"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index", "color"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_text_color",
|
||||
"description": "Set text color for textbox",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
},
|
||||
"color": {
|
||||
"type": "string",
|
||||
"description": "Color name or hex code"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index", "color"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.delete_content",
|
||||
"description": "Delete textbox from slide",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_slide_orientation",
|
||||
"description": "Set slide orientation",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"orientation": {
|
||||
"type": "string",
|
||||
"description": "Slide orientation",
|
||||
"enum": ["portrait", "landscape"]
|
||||
}
|
||||
},
|
||||
"required": ["orientation"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.position_box",
|
||||
"description": "Position textbox or image on slide",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Box index (0-based)"
|
||||
},
|
||||
"position": {
|
||||
"type": "string",
|
||||
"description": "Position on slide",
|
||||
"enum": ["left", "right", "center", "top", "bottom", "top-left", "top-right", "bottom-left", "bottom-right"]
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index", "position"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.insert_file",
|
||||
"description": "Insert video or audio file",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "File path"
|
||||
},
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"position": {
|
||||
"type": "object",
|
||||
"description": "Position coordinates",
|
||||
"properties": {
|
||||
"x": {
|
||||
"type": "number",
|
||||
"description": "X position (% of slide width)"
|
||||
},
|
||||
"y": {
|
||||
"type": "number",
|
||||
"description": "Y position (% of slide height)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"size": {
|
||||
"type": "object",
|
||||
"description": "Size dimensions",
|
||||
"properties": {
|
||||
"width": {
|
||||
"type": "number",
|
||||
"description": "Width (% of slide width)"
|
||||
},
|
||||
"height": {
|
||||
"type": "number",
|
||||
"description": "Height (% of slide height)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"autoplay": {
|
||||
"type": "boolean",
|
||||
"description": "Auto-play media"
|
||||
}
|
||||
},
|
||||
"required": ["file_path"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_slide_background",
|
||||
"description": "Set slide background color or image",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based). If not provided, applies to all slides"
|
||||
},
|
||||
"color": {
|
||||
"type": "string",
|
||||
"description": "Background color"
|
||||
},
|
||||
"image_path": {
|
||||
"type": "string",
|
||||
"description": "Background image path (overrides color)"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.save_as",
|
||||
"description": "Save document to specified location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "File save path with filename and extension"
|
||||
},
|
||||
"overwrite": {
|
||||
"type": "boolean",
|
||||
"description": "Overwrite existing file (default: false)"
|
||||
}
|
||||
},
|
||||
"required": ["file_path"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.insert_image",
|
||||
"description": "Insert image to slide",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"image_path": {
|
||||
"type": "string",
|
||||
"description": "Image file path"
|
||||
},
|
||||
"width": {
|
||||
"type": "number",
|
||||
"description": "Image width in cm"
|
||||
},
|
||||
"height": {
|
||||
"type": "number",
|
||||
"description": "Image height in cm"
|
||||
},
|
||||
"position": {
|
||||
"type": "object",
|
||||
"description": "Position coordinates",
|
||||
"properties": {
|
||||
"x": {
|
||||
"type": "number",
|
||||
"description": "X position (% of slide width)"
|
||||
},
|
||||
"y": {
|
||||
"type": "number",
|
||||
"description": "Y position (% of slide height)"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "image_path"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.configure_display_settings",
|
||||
"description": "Configure presentation display settings",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"use_presenter_view": {
|
||||
"type": "boolean",
|
||||
"description": "Use presenter view"
|
||||
},
|
||||
"primary_monitor_only": {
|
||||
"type": "boolean",
|
||||
"description": "Use primary monitor only"
|
||||
},
|
||||
"monitor_for_presentation": {
|
||||
"type": "integer",
|
||||
"description": "Monitor number for presentation"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_slide_number_color",
|
||||
"description": "Set slide number color",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"color": {
|
||||
"type": "string",
|
||||
"description": "Color name or hex code"
|
||||
}
|
||||
},
|
||||
"required": ["color"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_text_strikethrough",
|
||||
"description": "Apply strikethrough formatting to text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
},
|
||||
"line_numbers": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
},
|
||||
"description": "Line numbers for strikethrough (1-based)"
|
||||
},
|
||||
"apply": {
|
||||
"type": "boolean",
|
||||
"description": "Apply or remove strikethrough"
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index", "line_numbers", "apply"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.set_textbox_alignment",
|
||||
"description": "Set text alignment for textbox",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Slide index (1-based)"
|
||||
},
|
||||
"box_index": {
|
||||
"type": "integer",
|
||||
"description": "Textbox index (0-based)"
|
||||
},
|
||||
"alignment": {
|
||||
"type": "string",
|
||||
"description": "Text alignment",
|
||||
"enum": ["left", "center", "right", "justify"]
|
||||
}
|
||||
},
|
||||
"required": ["slide_index", "box_index", "alignment"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "ImpressTools.export_to_image",
|
||||
"description": "Export presentation or slide to image",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "Image save path with filename and extension"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"description": "Image format",
|
||||
"enum": ["png", "jpeg", "jpg", "gif", "bmp", "tiff"]
|
||||
},
|
||||
"slide_index": {
|
||||
"type": "integer",
|
||||
"description": "Specific slide index (1-based). If not provided, exports all slides"
|
||||
}
|
||||
},
|
||||
"required": ["file_path", "format"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,412 +0,0 @@
|
|||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.save",
|
||||
"description": "Save document to current location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.write_text",
|
||||
"description": "Write text at cursor position",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "Text to write"
|
||||
},
|
||||
"bold": {
|
||||
"type": "boolean",
|
||||
"description": "Apply bold formatting"
|
||||
},
|
||||
"italic": {
|
||||
"type": "boolean",
|
||||
"description": "Apply italic formatting"
|
||||
},
|
||||
"size": {
|
||||
"type": "number",
|
||||
"description": "Font size"
|
||||
}
|
||||
},
|
||||
"required": ["text"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_color",
|
||||
"description": "Change text color using regex pattern",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"description": "Regex pattern to match"
|
||||
},
|
||||
"color": {
|
||||
"type": "number",
|
||||
"description": "Hex color code (e.g., 0x000000)"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["pattern", "color"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.find_and_replace",
|
||||
"description": "Find and replace text using regex",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"description": "Regex pattern to find"
|
||||
},
|
||||
"replacement": {
|
||||
"type": "string",
|
||||
"description": "Replacement text"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["pattern", "replacement"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_font",
|
||||
"description": "Change font family",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"font_name": {
|
||||
"type": "string",
|
||||
"description": "Font name (e.g., 'Arial', 'Times New Roman')"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["font_name"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_line_spacing",
|
||||
"description": "Set line spacing",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"spacing_value": {
|
||||
"type": "number",
|
||||
"description": "Spacing value (1.0=single, 2.0=double)"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["spacing_value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.remove_highlighting",
|
||||
"description": "Remove text highlighting",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.find_highlighted_text",
|
||||
"description": "Find text with specific highlight color",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"highlight_color": {
|
||||
"type": "string",
|
||||
"description": "Color name (e.g., 'yellow') or hex code"
|
||||
}
|
||||
},
|
||||
"required": ["highlight_color"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.insert_formula_at_cursor",
|
||||
"description": "Insert formula at cursor",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"formula": {
|
||||
"type": "string",
|
||||
"description": "Formula to insert"
|
||||
}
|
||||
},
|
||||
"required": ["formula"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.insert_image_at_cursor",
|
||||
"description": "Insert image at cursor",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"image_path": {
|
||||
"type": "string",
|
||||
"description": "Full path to image file"
|
||||
},
|
||||
"width": {
|
||||
"type": "integer",
|
||||
"description": "Display width in pixels"
|
||||
},
|
||||
"height": {
|
||||
"type": "integer",
|
||||
"description": "Display height in pixels"
|
||||
}
|
||||
},
|
||||
"required": ["image_path"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_strikethrough",
|
||||
"description": "Apply strikethrough formatting",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"description": "Regex pattern to match"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["pattern"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_font_size",
|
||||
"description": "Change font size",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"font_size": {
|
||||
"type": "number",
|
||||
"description": "Font size in points"
|
||||
},
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"description": "Regex pattern to match"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["font_size", "pattern"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.export_to_pdf",
|
||||
"description": "Export document to PDF",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"output_path": {
|
||||
"type": "string",
|
||||
"description": "PDF save path"
|
||||
},
|
||||
"output_filename": {
|
||||
"type": "string",
|
||||
"description": "PDF filename"
|
||||
},
|
||||
"include_comments": {
|
||||
"type": "boolean",
|
||||
"description": "Include comments in PDF"
|
||||
},
|
||||
"quality": {
|
||||
"type": "string",
|
||||
"description": "Export quality ('standard', 'high', 'print')"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_paragraph_alignment",
|
||||
"description": "Set paragraph alignment",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"alignment": {
|
||||
"type": "string",
|
||||
"description": "Alignment type ('left', 'center', 'right', 'justify')"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["alignment"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.capitalize_words",
|
||||
"description": "Capitalize first letter of each word",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.set_default_font",
|
||||
"description": "Set default font for new text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"font_name": {
|
||||
"type": "string",
|
||||
"description": "Default font name"
|
||||
},
|
||||
"font_size": {
|
||||
"type": "number",
|
||||
"description": "Default font size in points"
|
||||
}
|
||||
},
|
||||
"required": ["font_name"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.add_page_numbers",
|
||||
"description": "Add page numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"position": {
|
||||
"type": "string",
|
||||
"description": "Position ('bottom_left', 'bottom_center', 'bottom_right', 'top_left', 'top_center', 'top_right')"
|
||||
},
|
||||
"start_number": {
|
||||
"type": "integer",
|
||||
"description": "Starting page number"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"description": "Number format (e.g., '1', 'Page 1', '1 of N')"
|
||||
}
|
||||
},
|
||||
"required": ["position"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.insert_page_break",
|
||||
"description": "Insert page break",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"position": {
|
||||
"type": "string",
|
||||
"description": "Insert location ('at_cursor', 'end_of_document')"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "WriterTools.change_text_case",
|
||||
"description": "Change text case",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"case_type": {
|
||||
"type": "string",
|
||||
"description": "Case type ('lowercase', 'uppercase')"
|
||||
},
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"description": "Regex pattern to match"
|
||||
},
|
||||
"paragraph_indices": {
|
||||
"type": "array",
|
||||
"description": "Target paragraph indices (0-based). Applies to all if omitted"
|
||||
}
|
||||
},
|
||||
"required": ["case_type", "pattern"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,166 +0,0 @@
|
|||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_playlist",
|
||||
"description": "Get current playlist with track info",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.play",
|
||||
"description": "Start playing current media",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.pause",
|
||||
"description": "Pause current media",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.next",
|
||||
"description": "Switch to next track",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.previous",
|
||||
"description": "Switch to previous track",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.add_to_playlist",
|
||||
"description": "Add media file to playlist",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"uri": {
|
||||
"type": "string",
|
||||
"description": "Media file URI (file:// or https://)"
|
||||
}
|
||||
},
|
||||
"required": ["uri"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_current_time",
|
||||
"description": "Get current playback position in seconds",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_media_duration",
|
||||
"description": "Get media duration in seconds",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.toggle_fullscreen",
|
||||
"description": "Toggle or set fullscreen mode",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enable": {
|
||||
"type": "boolean",
|
||||
"description": "Force fullscreen on/off, omit to toggle"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_settings",
|
||||
"description": "Get VLC settings",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.set_settings",
|
||||
"description": "Set VLC settings",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"field": {
|
||||
"type": "string",
|
||||
"description": "Setting name (e.g. qt-max-volume, qt-minimal-view)"
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": "Setting value (use 0/1 for booleans)"
|
||||
}
|
||||
},
|
||||
"required": ["field", "value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_media_files",
|
||||
"description": "Get media files from path",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Directory path"
|
||||
},
|
||||
"suffix": {
|
||||
"type": "array",
|
||||
"description": "File extensions, default: ['mp4','avi','mkv','mov','mp3','m4a','wav']"
|
||||
}
|
||||
},
|
||||
"required": ["path"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,260 +0,0 @@
|
|||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class CodeTools:
    """Wrappers around the Visual Studio Code (`code`) command-line interface.

    Every public classmethod shells out to the `code` CLI, stores a
    human-readable outcome message in the class attribute ``ret``, and
    returns that message.
    """

    # Result message of the most recent command (also printed by print_result).
    ret = ""

    @classmethod
    def print_result(cls):
        """Print the result message of the last executed command."""
        print(cls.ret)

    @classmethod
    def _run_code_cli(cls, command, success_msg, error_label):
        """Run a `code` CLI command and record the outcome in ``ret``.

        Factored out of the public methods, which previously repeated the
        same try/except boilerplate verbatim.

        Args:
            command (list[str]): Argument vector passed to subprocess.run.
            success_msg (str): Message stored in ``ret`` on success.
            error_label (str): Verb phrase used in the CalledProcessError message.

        Returns:
            str: The stored result message.
        """
        try:
            subprocess.run(command, check=True)
            cls.ret = success_msg
        except subprocess.CalledProcessError as e:
            cls.ret = f"Error {error_label}: {e}"
        except Exception as e:
            # e.g. FileNotFoundError when the `code` binary is not on PATH.
            cls.ret = f"Unexpected error: {e}"
        return cls.ret

    @classmethod
    def launch_vscode(cls, path):
        """Open a file or directory in an existing VS Code window.

        Args:
            path (str): File or directory path to open.
        """
        return cls._run_code_cli(["code", "-r", path], "Successfully launched VS Code", "launching VS Code")

    @classmethod
    def env_info(cls):
        """No environment information is available; record the placeholder."""
        cls.ret = "None"

    @classmethod
    def compare_files(cls, file1, file2):
        """Open a side-by-side diff of two files in VS Code.

        Args:
            file1 (str): Path of the first file.
            file2 (str): Path of the second file.
        """
        return cls._run_code_cli(["code", "-d", file1, file2], "The compared files are opened in VSCode", "comparing files")

    @classmethod
    def add_folder(cls, folder):
        """Add a folder to the last active VS Code window.

        Args:
            folder (str): Folder path to add.
        """
        return cls._run_code_cli(["code", "-a", folder], "Successfully added folder", "adding folder")

    @classmethod
    def goto_file(cls, file_path, line=1, character=1):
        """Open a file at a specific line and character position.

        Args:
            file_path (str): File path to open.
            line (int): 1-based line number.
            character (int): 1-based character position.
        """
        return cls._run_code_cli(
            ["code", "-g", f"{file_path}:{line}:{character}"],
            f"Successfully opened file, line: {line}, character: {character}",
            "going to file",
        )

    @classmethod
    def perform_merge(cls, path1, path2, base, result):
        """Run a three-way merge in VS Code.

        Args:
            path1 (str): Path of the first version.
            path2 (str): Path of the second version.
            base (str): Path of the common base version.
            result (str): Path where the merge result is saved.
        """
        return cls._run_code_cli(["code", "-m", path1, path2, base, result], "Successfully performed merge", "performing merge")

    @classmethod
    def remove_folder(cls, folder):
        """Remove a folder from the last active VS Code window.

        Args:
            folder (str): Folder path to remove.
        """
        return cls._run_code_cli(["code", "--remove", folder], "Successfully removed folder", "removing folder")

    @classmethod
    def install_extension(cls, extension_id, pre_release=False):
        """Install or update a VS Code extension.

        Args:
            extension_id (str): Extension identifier (publisher.name).
            pre_release (bool): Install the pre-release version when True.
        """
        command = ["code", "--install-extension", extension_id]
        if pre_release:
            command.append("--pre-release")
        return cls._run_code_cli(command, "Successfully installed extension", "installing extension")

    @classmethod
    def uninstall_extension(cls, extension_id):
        """Uninstall a VS Code extension.

        Args:
            extension_id (str): Extension identifier (publisher.name).
        """
        return cls._run_code_cli(["code", "--uninstall-extension", extension_id], "Successfully uninstalled extension", "uninstalling extension")

    @classmethod
    def list_extensions(cls, show_versions=False, category=None):
        """List installed VS Code extensions.

        Unlike the other commands this captures and returns stdout, so it
        keeps its own try/except instead of using _run_code_cli.

        Args:
            show_versions (bool): Include each extension's version when True.
            category (str, optional): Filter extensions by category.
        """
        try:
            command = ["code", "--list-extensions"]
            if show_versions:
                command.append("--show-versions")
            if category:
                command.extend(["--category", category])
            cls.ret = subprocess.run(command, check=True, capture_output=True, text=True).stdout
        except subprocess.CalledProcessError as e:
            cls.ret = f"Error listing extensions: {e}"
        except Exception as e:
            cls.ret = f"Unexpected error: {e}"
        return cls.ret

    @classmethod
    def update_extensions(cls):
        """Update all installed VS Code extensions to the latest version."""
        return cls._run_code_cli(["code", "--update-extensions"], "Successfully updated extensions", "updating extensions")

    @classmethod
    def disable_extension(cls, extension_id):
        """Disable an extension for the next VS Code instance.

        Args:
            extension_id (str): Extension identifier (publisher.name).
        """
        return cls._run_code_cli(["code", "--disable-extension", extension_id], "Successfully disabled extension", "disabling extension")

    @classmethod
    def toggle_sync(cls, state):
        """Turn VS Code settings sync on or off.

        Args:
            state (str): Either 'on' or 'off'.
        """
        return cls._run_code_cli(["code", "--sync", state], "Successfully toggled sync", "toggling sync")
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
class BrowserTools:
    """Helpers for driving a Chromium-based browser.

    Methods return one of two payload kinds:
      * an action dict ({"action_type": "OPEN_CHROME_TAB", ...}) consumed
        by the action executor, or
      * a string of Python source that presses a keyboard shortcut via
        pyautogui when executed on the target machine.
    """

    # Result message of the most recent command (printed by print_result).
    ret = ""

    @classmethod
    def print_result(cls):
        """Print the result message of the last executed command."""
        print(cls.ret)

    @classmethod
    def env_info(cls):
        """No environment information is available; record the placeholder."""
        cls.ret = "None"

    @classmethod
    def _open_tab(cls, url):
        """Build the action dict that opens *url* in a new browser tab."""
        return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": [url]}}

    @classmethod
    def open_profile_settings(cls):
        """Open the profile settings page in the browser."""
        return cls._open_tab("chrome://settings/people")

    @classmethod
    def open_password_settings(cls):
        """Open the password/autofill settings page in the browser."""
        return cls._open_tab("chrome://settings/autofill")

    @classmethod
    def open_privacy_settings(cls):
        """Open the privacy settings page in the browser."""
        return cls._open_tab("chrome://settings/privacy")

    @classmethod
    def open_appearance_settings(cls):
        """Open the appearance settings page in the browser."""
        return cls._open_tab("chrome://settings/appearance")

    @classmethod
    def open_search_engine_settings(cls):
        """Open the search engine settings page in the browser."""
        return cls._open_tab("chrome://settings/search")

    @classmethod
    def bring_back_last_tab(cls):
        """Reopen the most recently closed tab (Ctrl+Shift+T)."""
        return "import pyautogui; pyautogui.hotkey('ctrl', 'shift', 't'); print('Brought back last tab')"

    @classmethod
    def print(cls):
        """Open the print dialog for the current page (Ctrl+P).

        NOTE: the method name intentionally mirrors the browser action and
        is part of the public interface, so it is kept even though it
        shadows the builtin ``print`` within this class namespace.
        """
        return "import pyautogui; pyautogui.hotkey('ctrl', 'p'); print('Opened print option')"

    @classmethod
    def delete_browsing_data(cls):
        """Open the clear-browsing-data dialog (Ctrl+Shift+Del)."""
        return "import pyautogui; pyautogui.hotkey('ctrl', 'shift', 'del'); print('Deleted browsing data')"

    @classmethod
    def open_extensions(cls):
        """Open the extensions page in the browser."""
        return cls._open_tab("chrome://extensions")

    @classmethod
    def bookmark_page(cls):
        """Bookmark the current page (Ctrl+D)."""
        return "import pyautogui; pyautogui.hotkey('ctrl', 'd'); print('Bookmarked page')"

    @classmethod
    def open_bookmarks(cls):
        """Open the bookmarks manager page in the browser."""
        return cls._open_tab("chrome://bookmarks")
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,753 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
|
||||
import uno
|
||||
from com.sun.star.awt.FontSlant import ITALIC, NONE, OBLIQUE
|
||||
from com.sun.star.awt.FontWeight import BOLD, NORMAL
|
||||
from com.sun.star.beans import PropertyValue
|
||||
from com.sun.star.style.ParagraphAdjust import CENTER, LEFT, RIGHT
|
||||
from com.sun.star.text.ControlCharacter import PARAGRAPH_BREAK
|
||||
from com.sun.star.text.TextContentAnchorType import AS_CHARACTER
|
||||
|
||||
|
||||
class WriterTools:
|
||||
localContext = uno.getComponentContext()
|
||||
resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
|
||||
ctx = resolver.resolve("uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext")
|
||||
desktop = ctx.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
|
||||
doc = desktop.getCurrentComponent()
|
||||
text = doc.Text
|
||||
cursor = text.createTextCursor()
|
||||
ret = ""
|
||||
|
||||
    @classmethod
    def close_other_window(cls):
        """Close every open document except the current one.

        Iterates all desktop components and closes any whose URL differs
        from the current document's URL. The True argument presumably
        closes without a save prompt — confirm against the UNO
        XCloseable.close(DeliverOwnership) API.
        """
        components = cls.desktop.getComponents().createEnumeration()
        current_url = cls.doc.getURL()
        while components.hasMoreElements():
            doc = components.nextElement()
            if doc.getURL() != current_url:
                doc.close(True)
|
||||
|
||||
@classmethod
|
||||
def save(cls):
|
||||
"""保存文档到当前位置"""
|
||||
try:
|
||||
if cls.doc.hasLocation():
|
||||
cls.doc.store()
|
||||
else:
|
||||
raise Exception("文档没有保存位置,请使用另存为功能")
|
||||
return True
|
||||
except Exception as e:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def maximize_window(cls):
|
||||
"""
|
||||
将窗口设置为工作区最大尺寸
|
||||
使用工作区域大小(考虑任务栏等)
|
||||
"""
|
||||
window = cls.doc.getCurrentController().getFrame().getContainerWindow()
|
||||
toolkit = window.getToolkit()
|
||||
device = toolkit.createScreenCompatibleDevice(0, 0)
|
||||
workarea = toolkit.getWorkArea()
|
||||
window.setPosSize(workarea.X, workarea.Y, workarea.Width, workarea.Height, 15)
|
||||
|
||||
    @classmethod
    def print_result(cls):
        """Print the result message stored by the most recent operation."""
        print(cls.ret)
|
||||
|
||||
@classmethod
|
||||
def write_text(cls, text, bold=False, italic=False, size=None):
|
||||
"""写入文本"""
|
||||
cls.cursor.CharWeight = 150 if bold else 100
|
||||
cls.cursor.CharPosture = ITALIC if italic else NONE
|
||||
if size:
|
||||
cls.cursor.CharHeight = size
|
||||
cls.text.insertString(cls.cursor, text, False)
|
||||
cls.ret = "Success"
|
||||
|
||||
@classmethod
|
||||
def get_paragraphs(cls, start_index=0, count=None):
|
||||
"""Retrieves paragraphs from the document as a list."""
|
||||
text = cls.doc.getText()
|
||||
paragraphs = text.createEnumeration()
|
||||
paragraph_list = []
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
paragraph_list.append(paragraph.getString())
|
||||
if start_index < 0:
|
||||
start_index = 0
|
||||
elif start_index >= len(paragraph_list):
|
||||
cls.ret = []
|
||||
if count is not None:
|
||||
end_index = min(start_index + count, len(paragraph_list))
|
||||
cls.ret = paragraph_list[start_index:end_index]
|
||||
else:
|
||||
cls.ret = paragraph_list[start_index:]
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def env_info(cls):
|
||||
paras = cls.get_paragraphs()
|
||||
para_str = ""
|
||||
for i, para in enumerate(paras):
|
||||
para = para[:500] + "..." if len(para) > 500 else para
|
||||
para_str += "Paragraph " + str(i) + ": " + para.strip() + "\n"
|
||||
cls.ret = para_str
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_color(cls, pattern, color, paragraph_indices=None):
|
||||
"""
|
||||
Changes the color of matched text in the document for specified paragraphs.
|
||||
|
||||
Args:
|
||||
pattern (str): Regular expression pattern to match text
|
||||
color (int): Hex color code (e.g., 0x000000 for black)
|
||||
paragraph_indices (list, optional): List of paragraph indices to modify (0-based).
|
||||
If None, applies to all paragraphs.
|
||||
"""
|
||||
try:
|
||||
enum = cls.doc.Text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraphs.append(enum.nextElement())
|
||||
if not paragraph_indices:
|
||||
paragraphs_to_process = range(len(paragraphs))
|
||||
else:
|
||||
paragraphs_to_process = paragraph_indices
|
||||
regex = re.compile(pattern)
|
||||
for idx in paragraphs_to_process:
|
||||
if idx < 0 or idx >= len(paragraphs):
|
||||
continue
|
||||
paragraph = paragraphs[idx]
|
||||
if not paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
continue
|
||||
para_text = paragraph.getString()
|
||||
matches = regex.finditer(para_text)
|
||||
for match in matches:
|
||||
para_cursor = cls.text.createTextCursorByRange(paragraph.getStart())
|
||||
para_cursor.goRight(match.start(), False)
|
||||
para_cursor.goRight(match.end() - match.start(), True)
|
||||
para_cursor.CharColor = color
|
||||
cls.ret = "Success"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def find_and_replace(cls, pattern, replacement, paragraph_indices=None):
|
||||
"""
|
||||
Finds all occurrences of a specified text pattern and replaces them with another text in the document.
|
||||
|
||||
Args:
|
||||
pattern (str): The pattern to match in the document, should be a regular expression
|
||||
replacement (str): The text to replace the found text with
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing)
|
||||
|
||||
Returns:
|
||||
str: Success message with number of replacements made
|
||||
"""
|
||||
try:
|
||||
enum = cls.doc.Text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraphs.append(enum.nextElement())
|
||||
total_replacements = 0
|
||||
if not paragraph_indices:
|
||||
paragraphs_to_process = list(range(len(paragraphs)))
|
||||
else:
|
||||
paragraphs_to_process = [i for i in paragraph_indices if 0 <= i < len(paragraphs)]
|
||||
regex = re.compile(pattern)
|
||||
for idx in paragraphs_to_process:
|
||||
if idx >= len(paragraphs):
|
||||
continue
|
||||
paragraph = paragraphs[idx]
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
text_content = paragraph.getString()
|
||||
new_text, count = regex.subn(replacement, text_content)
|
||||
if count > 0:
|
||||
paragraph.setString(new_text)
|
||||
total_replacements += count
|
||||
cls.ret = f"Successfully made {total_replacements} replacements"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error during find and replace: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_font(cls, font_name, paragraph_indices=None):
|
||||
"""
|
||||
Changes the font of text in the document or specified paragraphs.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to apply (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
enum = text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraphs.append(enum.nextElement())
|
||||
if not paragraph_indices:
|
||||
paragraph_indices = range(len(paragraphs))
|
||||
for idx in paragraph_indices:
|
||||
if 0 <= idx < len(paragraphs):
|
||||
paragraph = paragraphs[idx]
|
||||
cursor = text.createTextCursorByRange(paragraph)
|
||||
cursor.CharFontName = font_name
|
||||
cls.ret = "Success"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def set_line_spacing(cls, spacing_value, paragraph_indices=None):
|
||||
"""
|
||||
Sets the line spacing for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
spacing_value (float): The line spacing value to apply (1.0 for single spacing, 2.0 for double spacing, etc.)
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
paragraph_enum = text.createEnumeration()
|
||||
line_spacing_value = int(spacing_value * 100)
|
||||
current_index = 0
|
||||
|
||||
while paragraph_enum.hasMoreElements():
|
||||
paragraph = paragraph_enum.nextElement()
|
||||
|
||||
if not paragraph_indices or current_index in paragraph_indices:
|
||||
line_spacing = uno.createUnoStruct("com.sun.star.style.LineSpacing")
|
||||
line_spacing.Mode = 0
|
||||
line_spacing.Height = line_spacing_value
|
||||
paragraph.ParaLineSpacing = line_spacing
|
||||
|
||||
if paragraph.String.strip():
|
||||
current_index += 1
|
||||
|
||||
cls.ret = "Success"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def remove_highlighting(cls, paragraph_indices=None):
|
||||
"""
|
||||
Removes ALL highlighting from text in the document for specified paragraphs.
|
||||
|
||||
Args:
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
paragraphs = text.createEnumeration()
|
||||
target_indices = set(paragraph_indices) if paragraph_indices else None
|
||||
current_index = 0
|
||||
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if target_indices is None or current_index in target_indices:
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
para_cursor = text.createTextCursorByRange(paragraph)
|
||||
# Remove all highlighting by setting back color to -1
|
||||
para_cursor.CharBackColor = -1
|
||||
|
||||
# Additional cleanup for individual text portions (optional)
|
||||
text_portions = paragraph.createEnumeration()
|
||||
while text_portions.hasMoreElements():
|
||||
text_portion = text_portions.nextElement()
|
||||
if hasattr(text_portion, "CharBackColor"):
|
||||
portion_cursor = text.createTextCursorByRange(text_portion)
|
||||
portion_cursor.CharBackColor = -1
|
||||
current_index += 1
|
||||
|
||||
cls.ret = "Successfully removed all highlighting"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error removing highlighting: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def find_highlighted_text(cls, highlight_color):
|
||||
"""
|
||||
Finds all text in the document that has a specific highlight color applied to it.
|
||||
|
||||
Args:
|
||||
highlight_color (str): The highlight color to search for. Can be a color name (e.g., 'yellow', 'green') or hex code.
|
||||
|
||||
Returns:
|
||||
list: A list of strings containing all text segments with the specified highlight color.
|
||||
"""
|
||||
color_map = {
|
||||
"yellow": 16776960,
|
||||
"green": 65280,
|
||||
"blue": 255,
|
||||
"red": 16711680,
|
||||
"cyan": 65535,
|
||||
"magenta": 16711935,
|
||||
"black": 0,
|
||||
"white": 16777215,
|
||||
"gray": 8421504,
|
||||
"lightgray": 12632256,
|
||||
}
|
||||
target_color = None
|
||||
if highlight_color.lower() in color_map:
|
||||
target_color = color_map[highlight_color.lower()]
|
||||
elif highlight_color.startswith("#") and len(highlight_color) == 7:
|
||||
try:
|
||||
hex_color = highlight_color[1:]
|
||||
r = int(hex_color[0:2], 16)
|
||||
g = int(hex_color[2:4], 16)
|
||||
b = int(hex_color[4:6], 16)
|
||||
target_color = (r << 16) + (g << 8) + b
|
||||
except ValueError:
|
||||
cls.ret = f"Invalid hex color format: {highlight_color}"
|
||||
return []
|
||||
else:
|
||||
cls.ret = f"Unsupported color format: {highlight_color}"
|
||||
return []
|
||||
highlighted_text = []
|
||||
text = cls.doc.getText()
|
||||
enum_paragraphs = text.createEnumeration()
|
||||
while enum_paragraphs.hasMoreElements():
|
||||
paragraph = enum_paragraphs.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
enum_portions = paragraph.createEnumeration()
|
||||
while enum_portions.hasMoreElements():
|
||||
text_portion = enum_portions.nextElement()
|
||||
if hasattr(text_portion, "CharBackColor") and text_portion.CharBackColor == target_color:
|
||||
if text_portion.getString().strip():
|
||||
highlighted_text.append(text_portion.getString())
|
||||
cls.ret = f"Found {len(highlighted_text)} text segments with highlight color {highlight_color}"
|
||||
return highlighted_text
|
||||
|
||||
@classmethod
|
||||
def insert_formula_at_cursor(cls, formula):
|
||||
"""
|
||||
Inserts a formula at the current cursor position in the document.
|
||||
|
||||
Args:
|
||||
formula (str): The formula to insert at the current cursor position.
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
embedded_obj = cls.doc.createInstance("com.sun.star.text.TextEmbeddedObject")
|
||||
embedded_obj.setPropertyValue("CLSID", "078B7ABA-54FC-457F-8551-6147e776a997")
|
||||
embedded_obj.setPropertyValue("AnchorType", AS_CHARACTER)
|
||||
cls.text.insertTextContent(cls.cursor, embedded_obj, False)
|
||||
math_obj = embedded_obj.getEmbeddedObject()
|
||||
math_obj.Formula = formula
|
||||
cls.ret = "Formula inserted successfully"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error inserting formula: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def insert_image_at_cursor(cls, image_path, width=None, height=None):
|
||||
"""
|
||||
Inserts an image at the current cursor position in the document.
|
||||
|
||||
Args:
|
||||
image_path (str): Full path to the image file to insert
|
||||
width (int, optional): Width to display the image in pixels
|
||||
height (int, optional): Height to display the image in pixels
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
if image_path.startswith("~"):
|
||||
image_path = os.path.expanduser(image_path)
|
||||
if not os.path.exists(image_path):
|
||||
cls.ret = f"Error: Image file not found at {image_path}"
|
||||
return cls.ret
|
||||
image_path = os.path.abspath(image_path)
|
||||
if os.name == "nt":
|
||||
file_url = "file:///" + image_path.replace("\\", "/")
|
||||
else:
|
||||
file_url = "file://" + image_path
|
||||
graphic = cls.doc.createInstance("com.sun.star.text.GraphicObject")
|
||||
graphic.GraphicURL = file_url
|
||||
graphic.AnchorType = AS_CHARACTER
|
||||
if width is not None:
|
||||
graphic.Width = width * 100
|
||||
if height is not None:
|
||||
graphic.Height = height * 100
|
||||
cls.text.insertTextContent(cls.cursor, graphic, False)
|
||||
cls.ret = "Success: Image inserted"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_strikethrough(cls, pattern, paragraph_indices=None):
|
||||
"""
|
||||
Sets the strikethrough formatting for text matching the specified pattern in the document.
|
||||
|
||||
Args:
|
||||
pattern (str): The regular expression pattern to match in the document
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error information
|
||||
"""
|
||||
try:
|
||||
paragraphs = cls.doc.getText().createEnumeration()
|
||||
para_index = 0
|
||||
found_matches = 0
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
if paragraph_indices and para_index not in paragraph_indices:
|
||||
para_index += 1
|
||||
continue
|
||||
para_text = paragraph.getString()
|
||||
matches = list(re.finditer(pattern, para_text))
|
||||
for match in matches:
|
||||
text_range = paragraph.getStart()
|
||||
cursor = cls.doc.getText().createTextCursorByRange(text_range)
|
||||
cursor.goRight(match.start(), False)
|
||||
cursor.goRight(match.end() - match.start(), True)
|
||||
cursor.CharStrikeout = 1
|
||||
found_matches += 1
|
||||
para_index += 1
|
||||
cls.ret = f"Successfully applied strikethrough to {found_matches} matches of pattern: {pattern}"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error applying strikethrough: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_font_size(cls, font_size, pattern, paragraph_indices=None):
|
||||
"""
|
||||
Changes the font size of specified text in the document.
|
||||
|
||||
Args:
|
||||
font_size (float): The font size to apply (in points).
|
||||
pattern (str): The pattern to match in the document, should be a regular expression.
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Result message indicating success or failure.
|
||||
"""
|
||||
try:
|
||||
regex = re.compile(pattern)
|
||||
paragraphs = cls.doc.getText().createEnumeration()
|
||||
current_index = 0
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if paragraph_indices and current_index not in paragraph_indices:
|
||||
current_index += 1
|
||||
continue
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
para_cursor = cls.text.createTextCursorByRange(paragraph)
|
||||
para_text = paragraph.getString()
|
||||
matches = list(regex.finditer(para_text))
|
||||
for match in reversed(matches):
|
||||
start_pos = match.start()
|
||||
end_pos = match.end()
|
||||
para_cursor.gotoStart(False)
|
||||
para_cursor.goRight(start_pos, False)
|
||||
para_cursor.goRight(end_pos - start_pos, True)
|
||||
para_cursor.CharHeight = font_size
|
||||
current_index += 1
|
||||
cls.ret = f"Successfully changed font size to {font_size} for text matching '{pattern}'"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error changing font size: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
    @classmethod
    def export_to_pdf(cls, output_path=None, output_filename=None, include_comments=False, quality="standard"):
        """
        Exports the current document to PDF format.

        Args:
            output_path (str, optional): The full path where the PDF should be saved.
                If not provided, uses the same location as the original document.
            output_filename (str, optional): The filename to use for the PDF.
                If not provided, uses the original document's filename with .pdf extension.
            include_comments (bool, optional): Whether to include comments in the exported PDF.
                Defaults to False.
            quality (str, optional): The quality of the PDF export ('standard', 'high', 'print').
                Defaults to 'standard'.

        Returns:
            str: Path to the exported PDF file on success, or an error message.
                Note: on success the plain path is returned while cls.ret holds a
                human-readable message; on failure both carry the error text.
        """
        try:
            doc_url = cls.doc.getURL()
            # An unsaved document has no URL, so an explicit destination is required.
            if not doc_url and not output_path:
                return "Error: Document has not been saved and no output path provided"
            if doc_url:
                doc_path = uno.fileUrlToSystemPath(os.path.dirname(doc_url))
                doc_filename = os.path.basename(doc_url)
                doc_name = os.path.splitext(doc_filename)[0]
            else:
                doc_path = ""
                doc_name = "export"
            # Explicit arguments win over values derived from the document URL.
            final_path = output_path if output_path else doc_path
            final_filename = output_filename if output_filename else f"{doc_name}.pdf"
            if not final_filename.lower().endswith(".pdf"):
                final_filename += ".pdf"
            full_output_path = os.path.join(final_path, final_filename)
            output_url = uno.systemPathToFileUrl(full_output_path)
            export_props = []
            # NOTE(review): SelectPdfVersion chooses the PDF/A conformance level of
            # the writer_pdf_Export filter, not a quality preset — confirm these
            # values produce the intended 'high'/'print' behavior.
            if quality == "high":
                export_props.append(PropertyValue(Name="SelectPdfVersion", Value=1))
            elif quality == "print":
                export_props.append(PropertyValue(Name="SelectPdfVersion", Value=2))
            else:
                export_props.append(PropertyValue(Name="SelectPdfVersion", Value=0))
            export_props.append(PropertyValue(Name="ExportNotes", Value=include_comments))
            export_props.extend(
                [
                    PropertyValue(Name="FilterName", Value="writer_pdf_Export"),
                    PropertyValue(Name="Overwrite", Value=True),
                ]
            )
            cls.doc.storeToURL(output_url, tuple(export_props))
            cls.ret = f"PDF exported to: {full_output_path}"
            return full_output_path
        except Exception as e:
            cls.ret = f"Error exporting to PDF: {str(e)}"
            return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_paragraph_alignment(cls, alignment, paragraph_indices=None):
|
||||
"""
|
||||
Sets the text alignment for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
alignment (str): The alignment to apply ('left', 'center', 'right', 'justify').
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
alignment_map = {"left": LEFT, "center": CENTER, "right": RIGHT, "justify": 3}
|
||||
if alignment.lower() not in alignment_map:
|
||||
cls.ret = f"Error: Invalid alignment '{alignment}'. Use 'left', 'center', 'right', or 'justify'."
|
||||
return cls.ret
|
||||
alignment_value = alignment_map[alignment.lower()]
|
||||
text = cls.doc.getText()
|
||||
paragraph_enum = text.createEnumeration()
|
||||
paragraphs = []
|
||||
while paragraph_enum.hasMoreElements():
|
||||
paragraph = paragraph_enum.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
paragraphs.append(paragraph)
|
||||
if paragraph_indices:
|
||||
valid_indices = [i for i in paragraph_indices if 0 <= i < len(paragraphs)]
|
||||
if len(valid_indices) != len(paragraph_indices):
|
||||
cls.ret = f"Warning: Some paragraph indices were out of range (0-{len(paragraphs) - 1})"
|
||||
for idx in valid_indices:
|
||||
paragraphs[idx].ParaAdjust = alignment_value
|
||||
else:
|
||||
for paragraph in paragraphs:
|
||||
paragraph.ParaAdjust = alignment_value
|
||||
cls.ret = f"Successfully applied '{alignment}' alignment to paragraphs"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error setting paragraph alignment: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def capitalize_words(cls, paragraph_indices=None):
|
||||
"""
|
||||
Capitalizes the first letter of each word for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
enum = text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraph = enum.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
paragraphs.append(paragraph)
|
||||
if not paragraph_indices:
|
||||
target_paragraphs = list(range(len(paragraphs)))
|
||||
else:
|
||||
target_paragraphs = paragraph_indices
|
||||
valid_indices = [idx for idx in target_paragraphs if 0 <= idx < len(paragraphs)]
|
||||
for idx in valid_indices:
|
||||
paragraph = paragraphs[idx]
|
||||
text_content = paragraph.getString()
|
||||
if not text_content.strip():
|
||||
continue
|
||||
capitalized_text = " ".join(word.capitalize() if word else "" for word in text_content.split(" "))
|
||||
para_cursor = text.createTextCursorByRange(paragraph.getStart())
|
||||
para_cursor.gotoRange(paragraph.getEnd(), True)
|
||||
para_cursor.setString(capitalized_text)
|
||||
cls.ret = f"Successfully capitalized words in {len(valid_indices)} paragraphs"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error capitalizing words: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_default_font(cls, font_name, font_size=None):
|
||||
"""
|
||||
Sets the default font for new text in the document without changing existing text.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to set as default (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
font_size (float, optional): The default font size in points.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
style_families = cls.doc.getStyleFamilies()
|
||||
paragraph_styles = style_families.getByName("ParagraphStyles")
|
||||
default_style_names = ["Default", "Standard", "Normal"]
|
||||
standard_style = None
|
||||
for style_name in default_style_names:
|
||||
if paragraph_styles.hasByName(style_name):
|
||||
standard_style = paragraph_styles.getByName(style_name)
|
||||
break
|
||||
if standard_style is None:
|
||||
style_names = paragraph_styles.getElementNames()
|
||||
if style_names:
|
||||
standard_style = paragraph_styles.getByName(style_names[0])
|
||||
else:
|
||||
raise Exception("Could not find default paragraph style")
|
||||
standard_style.setPropertyValue("CharFontName", font_name)
|
||||
standard_style.setPropertyValue("CharFontNameAsian", font_name)
|
||||
standard_style.setPropertyValue("CharFontNameComplex", font_name)
|
||||
if font_size is not None:
|
||||
standard_style.setPropertyValue("CharHeight", float(font_size))
|
||||
standard_style.setPropertyValue("CharHeightAsian", float(font_size))
|
||||
standard_style.setPropertyValue("CharHeightComplex", float(font_size))
|
||||
cls.cursor.setPropertyValue("CharFontName", font_name)
|
||||
cls.cursor.setPropertyValue("CharFontNameAsian", font_name)
|
||||
cls.cursor.setPropertyValue("CharFontNameComplex", font_name)
|
||||
if font_size is not None:
|
||||
cls.cursor.setPropertyValue("CharHeight", float(font_size))
|
||||
cls.cursor.setPropertyValue("CharHeightAsian", float(font_size))
|
||||
cls.cursor.setPropertyValue("CharHeightComplex", float(font_size))
|
||||
cls.ret = f"Default font set to '{font_name}'" + (f" with size {font_size}pt" if font_size else "")
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error setting default font: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
    @classmethod
    def add_page_numbers(cls, position, start_number=1, format=None):
        """
        Adds page numbers to the document at the specified position.

        Args:
            position (str): Position of the page numbers ('bottom_left', 'bottom_center', 'bottom_right',
                'top_left', 'top_center', 'top_right')
            start_number (int, optional): The starting page number. Defaults to 1.
            format (str, optional): Format of the page numbers (e.g., '1', 'Page 1', '1 of N').
                Defaults to simple number format.  (The name shadows the `format`
                builtin; kept as-is for interface compatibility.)

        Returns:
            str: Success message or error message
        """
        try:
            page_styles = cls.doc.StyleFamilies.getByName("PageStyles")
            default_style = page_styles.getByName("Standard")
            try:
                # Best effort: setting the offset may not be supported everywhere.
                default_style.setPropertyValue("PageNumberOffset", start_number)
            except:
                pass
            # Pick header vs footer from the position prefix and make sure it is enabled.
            if position.startswith("top"):
                default_style.HeaderIsOn = True
                target = default_style.HeaderText
            else:
                default_style.FooterIsOn = True
                target = default_style.FooterText
            # Clear any existing header/footer content before inserting fields.
            cursor = target.createTextCursor()
            cursor.gotoStart(False)
            cursor.gotoEnd(True)
            cursor.setString("")
            cursor.gotoStart(False)
            if position.endswith("_left"):
                cursor.ParaAdjust = LEFT
            elif position.endswith("_center"):
                cursor.ParaAdjust = CENTER
            elif position.endswith("_right"):
                cursor.ParaAdjust = RIGHT
            # NumberingType 4 is ARABIC (plain 1, 2, 3, ...) in com.sun.star.style.NumberingType.
            if not format or format == "1":
                page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
                page_number.NumberingType = 4
                target.insertTextContent(cursor, page_number, False)
            # Precedence note: parsed as  format == "Page 1"  or  ("Page" in format and "of" not in format).
            elif format == "Page 1" or "Page" in format and "of" not in format:
                target.insertString(cursor, "Page ", False)
                page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
                page_number.NumberingType = 4
                target.insertTextContent(cursor, page_number, False)
            elif format == "1 of N" or format == "Page {page} of {total}" or "of" in format:
                # 'N of M' style: current page field, literal " of ", total page count field.
                if "Page" in format:
                    target.insertString(cursor, "Page ", False)
                page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
                page_number.NumberingType = 4
                target.insertTextContent(cursor, page_number, False)
                target.insertString(cursor, " of ", False)
                page_count = cls.doc.createInstance("com.sun.star.text.TextField.PageCount")
                page_count.NumberingType = 4
                target.insertTextContent(cursor, page_count, False)
            else:
                # Unrecognized format strings fall back to a bare page number.
                page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
                page_number.NumberingType = 4
                target.insertTextContent(cursor, page_number, False)
            cls.ret = "Successfully added page numbers"
            return cls.ret
        except Exception as e:
            cls.ret = f"Error adding page numbers: {str(e)}"
            return cls.ret
|
||||
|
||||
    @classmethod
    def insert_page_break(cls, position="at_cursor"):
        """
        Inserts a page break at the specified position.

        Args:
            position (str): Where to insert the page break: 'at_cursor' for current cursor position,
                'end_of_document' for end of document. Defaults to 'at_cursor'.

        Returns:
            bool: True on success, False on failure (the message is stored in
                cls.ret; note this differs from sibling methods that return cls.ret).
        """
        try:
            if position == "end_of_document":
                # Append a fresh paragraph at the end so the break has a paragraph to attach to.
                cls.cursor.gotoEnd(False)
                cls.text.insertControlCharacter(cls.cursor, PARAGRAPH_BREAK, False)
            # Select back to the start of the current paragraph and mark it to start on a new page.
            cls.cursor.gotoStartOfParagraph(True)
            cls.cursor.BreakType = uno.Enum("com.sun.star.style.BreakType", "PAGE_BEFORE")
            cls.ret = "Page break inserted successfully"
            return True
        except Exception as e:
            cls.ret = f"Error inserting page break: {str(e)}"
            return False
|
||||
|
|
@ -1,233 +0,0 @@
|
|||
import json
|
||||
import os
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
|
||||
class VLCTools:
    """Helpers for driving a local VLC instance through its HTTP interface
    (http://localhost:8080/requests) and its vlcrc configuration file."""

    host = "localhost"
    port = 8080
    base_url = f"http://{host}:{port}/requests"
    password = "password"
    auth = HTTPBasicAuth("", password)  # VLC HTTP auth uses an empty username
    ret = ""

    @classmethod
    def print_result(cls):
        """Print the result of the last operation."""
        print(cls.ret)

    @classmethod
    def _make_request(cls, endpoint, params=None):
        """GET an HTTP-interface endpoint; return the response, or None on any failure."""
        url = f"{cls.base_url}/{endpoint}"
        try:
            response = requests.get(url, params=params, auth=cls.auth)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException:
            return None

    @classmethod
    def _get_status(cls):
        """Fetch and parse status.xml; return the XML root element, or None."""
        response = cls._make_request("status.xml")
        if response:
            return ET.fromstring(response.content)
        return None

    @classmethod
    def env_info(cls):
        cls.ret = "None"

    @classmethod
    def get_playlist(cls):
        """Return a formatted listing of the current playlist, or None on error."""
        response = cls._make_request("playlist.xml")
        if response:
            info = ET.fromstring(response.content)
            playlist_node = info.find('.//node[@name="Playlist"]')
            if playlist_node is not None:
                playlist_items = []
                for leaf in playlist_node.findall("leaf"):
                    # BUG FIX: the 'duration' attribute can be missing; the old
                    # code did leaf.get("duration") + "s", raising TypeError.
                    duration = leaf.get("duration")
                    item = {
                        "name": leaf.get("name"),
                        "uri": leaf.get("uri"),
                        "duration": (duration + "s") if duration is not None else None,
                    }
                    playlist_items.append(item)
                cls.ret = f"Playlist: {playlist_items}"
                return cls.ret
        cls.ret = "Error getting playlist"
        return None

    @classmethod
    def play(cls):
        """Start (or resume) playback."""
        response = cls._make_request("status.xml", {"command": "pl_play"})
        if response:
            cls.ret = "Start playing the media"
            return cls.ret
        cls.ret = "Error playing the media"
        return None

    @classmethod
    def pause(cls):
        """Pause playback."""
        response = cls._make_request("status.xml", {"command": "pl_pause"})
        if response:
            cls.ret = "Pause the media"
            return cls.ret
        cls.ret = "Error pausing the media"
        return None

    @classmethod
    def next(cls):
        """Jump to the next playlist entry."""
        response = cls._make_request("status.xml", {"command": "pl_next"})
        if response:
            cls.ret = "Switch to next media"
            return cls.ret
        cls.ret = "Error switching to next media"
        return None

    @classmethod
    def previous(cls):
        """Jump to the previous playlist entry."""
        response = cls._make_request("status.xml", {"command": "pl_previous"})
        if response:
            cls.ret = "Switch to previous media"
            return cls.ret
        cls.ret = "Error switching to previous media"
        return None

    @classmethod
    def add_to_playlist(cls, uri):
        """Add a local path or http(s) URL to the playlist and start playing it."""
        if uri.startswith("http"):
            encoded_uri = uri
        else:
            # Local paths must be percent-encoded file:// URIs for VLC.
            encoded_uri = "file://" + quote(uri.replace("file://", ""))

        response = cls._make_request("status.xml", {"command": "in_play", "input": encoded_uri})
        if response:
            cls.ret = f"Add {uri} to playlist"
            return cls.ret
        cls.ret = f"Error adding {uri} to playlist"
        return None

    @classmethod
    def get_current_time(cls):
        """Return the current playback position in whole seconds, or None."""
        status = cls._get_status()
        if status is not None:
            time = status.find("time")
            cls.ret = int(time.text) if time is not None else None
            return cls.ret
        return None

    @classmethod
    def get_media_duration(cls):
        """Return a message with the current media's duration in seconds, or None."""
        status = cls._get_status()
        if status is not None:
            length = status.find("length")
            if length is not None:
                cls.ret = f"Media duration: {length.text} seconds"
                return cls.ret
        cls.ret = "Error getting media duration"
        return None

    @classmethod
    def get_settings(cls):
        """Parse ~/.config/vlc/vlcrc into a dict and return it as pretty-printed JSON."""
        settings = {}
        with open(Path.home() / ".config/vlc/vlcrc", "r") as f:
            for line in f:
                if line:
                    # BUG FIX: split on the FIRST '=' only.  Values such as URLs
                    # contain '=' themselves; the old line.split("=") raised
                    # ValueError and a bare except silently dropped the setting.
                    key, sep, value = line.partition("=")
                    if not sep:
                        continue
                    if key.strip().startswith("#"):
                        # Commented-out defaults are not active settings.
                        continue
                    settings[key.strip()] = value.strip()
        cls.ret = json.dumps(settings, indent=4, ensure_ascii=False)
        return cls.ret

    @classmethod
    def set_settings(cls, field, value):
        """Set (or append) a key in ~/.config/vlc/vlcrc, uncommenting it if present."""
        with open(Path.home() / ".config/vlc/vlcrc", "r") as rf:
            settings = rf.read()

        # Match an existing (possibly commented-out) entry for the field.
        pattern = re.compile(r"#? *" + re.escape(field) + r"=.*")
        if pattern.search(settings):
            settings = pattern.sub(f"{field}={value}", settings)
        else:
            settings += f"{field}={value}\n"

        with open(Path.home() / ".config/vlc/vlcrc", "w") as wf:
            wf.write(settings)

        cls.ret = f"Set {field} to {value}"
        return cls.ret

    @classmethod
    def toggle_fullscreen(cls, enable=None):
        """
        Toggle fullscreen mode or set it explicitly based on the enable parameter.

        Args:
            enable (bool, optional): If provided, explicitly set fullscreen mode (True for fullscreen, False for windowed)

        Returns:
            str: Success or error message
        """
        # NOTE(review): the VLC HTTP interface documents 'fullscreen' only as a
        # toggle; 'fullscreen off' may not be honored as an explicit disable —
        # confirm against the VLC HTTP requests documentation.
        if enable is not None:
            command = "fullscreen" if enable else "fullscreen off"
        else:
            command = "fullscreen"
        response = cls._make_request("status.xml", {"command": command})
        if response:
            action = "enabled" if enable is True else "disabled" if enable is False else "toggled"
            cls.ret = f"Fullscreen mode {action}"
            return cls.ret
        cls.ret = "Error changing fullscreen mode"
        return None

    @classmethod
    def get_media_files(cls, path, suffix=None):
        """
        Gets the media files for the specified path.

        Args:
            path (str): The path to the media files
            suffix (List[str], optional): The suffix of the media files.
                Defaults to ['mp4', 'avi', 'mkv', 'mov', 'mp3', 'm4a', 'wav']

        Returns:
            list: Full paths of matching files, or None on error (message in cls.ret).
        """
        if suffix is None:
            suffix = ["mp4", "avi", "mkv", "mov", "mp3", "m4a", "wav"]

        if not path:
            cls.ret = "Path cannot be empty"
            return None

        if not os.path.exists(path):
            cls.ret = f"Path not found: {path}"
            return None

        media_files = []
        # Lowercase for case-insensitive extension comparison.
        suffix = [s.lower() for s in suffix]

        try:
            for root, _, files in os.walk(path):
                for file in files:
                    if any(file.lower().endswith(f".{s}") for s in suffix):
                        media_files.append(os.path.join(root, file))
        except Exception as e:
            cls.ret = f"Error while scanning directory: {str(e)}"
            return None

        cls.ret = media_files
        return cls.ret
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
# aworldGUIAgent-v1
|
||||
|
||||
aworldGUIAgent-v1 is built on the [AWorld Framework](https://github.com/inclusionAI/AWorld) and is specifically designed to tackle complex desktop automation tasks within the [OSWorld-verified](https://os-world.github.io/) benchmark.
|
||||
|
||||
The core logic for our agent's perception and reasoning is adapted from the great work of the [Agent-S project](https://github.com/simular-ai/Agent-S). We have built upon their foundation by introducing a suite of new executable tools that enhance the agent's ability to interact with the OS environment.
|
||||
|
||||
## Quick Start
|
||||
|
||||
Follow these steps to set up the environment and reproduce our results.
|
||||
|
||||
1. **Create Environment & Set Up OSWorld**:
|
||||
* First, create a dedicated Conda environment with **Python 3.11**.
|
||||
```bash
|
||||
conda create -n osworld_env python=3.11
|
||||
conda activate osworld_env
|
||||
```
|
||||
* Next, follow the official setup guide in the [OSWorld README](https://github.com/xlang-ai/OSWorld) to install OSWorld and its dependencies.
|
||||
|
||||
2. **Install AWorld Framework**:
|
||||
* Install the specific version of the AWorld Framework into the **same environment**.
|
||||
```bash
|
||||
# Make sure your osworld_env is still activated
|
||||
git clone https://github.com/inclusionAI/AWorld.git
|
||||
cd AWorld
|
||||
git checkout osworld_benchmark
|
||||
python setup.py install
|
||||
```
|
||||
|
||||
3. **Run the Evaluation Script**:
|
||||
* Our results were achieved using `openai/o3` for reasoning and `bytedance/ui-tars-1.5-7b` for visual grounding, both accessed via OpenRouter.
|
||||
* Remember to replace placeholders like `YOUR_BASE_URL`, `YOUR_API_KEY`, and `/path/to/your/vm/Ubuntu.vmx` with your actual credentials and paths.
|
||||
|
||||
```bash
|
||||
# Activate your OSWorld conda environment (e.g., osworld_env)
|
||||
conda activate osworld_env
|
||||
|
||||
# Run the evaluation with the recommended settings
|
||||
python run_multienv_aworldguiagent.py \
|
||||
--headless \
|
||||
--ground_url YOUR_BASE_URL \
|
||||
--ground_api_key YOUR_API_KEY \
|
||||
--ground_model bytedance/ui-tars-1.5-7b \
|
||||
--ground_provider open_router \
|
||||
--model_url YOUR_BASE_URL \
|
||||
--model_api_key YOUR_API_KEY \
|
||||
--model_temperature 1.0 \
|
||||
--provider_name vmware \
|
||||
--path_to_vm /path/to/your/vm/Ubuntu.vmx \
|
||||
--max_steps 50 \
|
||||
--model_provider open_router \
|
||||
--model openai/o3 \
|
||||
--grounding_width 1920 \
|
||||
--grounding_height 1080 \
|
||||
--test_all_meta_path evaluation_examples/test_all.json \
|
||||
--result_dir ./results \
|
||||
--observation_type screenshot \
|
||||
--num_envs 1 \
|
||||
--region us-east-1 \
|
||||
--client_password osworld-public-evaluation
|
||||
```
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
This work would not have been possible without building upon the foundations of several incredible open-source projects.
|
||||
|
||||
- **AWorld Framework**: We thank the developers of the [AWorld Framework](https://github.com/inclusionAI/AWorld) for providing a powerful and flexible platform for agent development. The AWorld Framework is designed for agent training and is especially suited for complex multi-agent scenarios. If you have requirements for designing or experimenting with multi-agent systems, we highly recommend you explore the AWorld Framework further.
|
||||
|
||||
- **Agent-S**: We extend our sincere gratitude to the creators of the [Agent-S project](https://github.com/simular-ai/Agent-S). The core agent logic in our implementation is adapted and enhanced from their codebase. We built upon their work by adding a suite of executable tools to improve the agent's interaction with the OS environment, which effectively boosted the stability and capability of our CUA Agent.
|
||||
|
||||
- **OSWorld Benchmark**: We are grateful to the creators of the [OSWorld Benchmark](https://os-world.github.io/) for developing a challenging and comprehensive testbed for GUI agents.
|
||||
|
|
@ -1,99 +0,0 @@
|
|||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
import logging
|
||||
import platform
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from mm_agents.aworldguiagent.grounding import ACI
|
||||
from mm_agents.aworldguiagent.workflow import Worker
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class UIAgent:
    """Base class for UI automation agents."""

    # FIX: removed a stray empty string expression ('""""""') that followed the
    # class docstring in the original — it was a dead no-op statement.

    def __init__(
        self,
        engine_params: Dict,
        grounding_agent: ACI,
        # NOTE: this default is evaluated once at import time, and the parameter
        # deliberately shadows the `platform` module inside this method.
        platform: str = platform.system().lower(),
    ):
        """Initialize UIAgent

        Args:
            engine_params: Configuration parameters for the LLM engine
            grounding_agent: Instance of ACI class for UI interaction
            platform: Operating system platform (macos, linux, windows)
        """
        self.engine_params = engine_params
        self.grounding_agent = grounding_agent
        self.platform = platform

    def reset(self) -> None:
        """Reset agent state. Subclasses override; the base implementation is a no-op."""

    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
        """Generate next action prediction

        Args:
            instruction: Natural language instruction
            observation: Current UI state observation

        Returns:
            Tuple containing agent info dictionary and list of actions
        """
|
||||
|
||||
|
||||
class AworldGUIAgent(UIAgent):
    """Agent that uses no hierarchy for less inference time"""

    def __init__(
        self,
        engine_params: Dict,
        grounding_agent: ACI,
        platform: str = platform.system().lower(),
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """Initialize a minimalist AgentS2 without hierarchy

        Args:
            engine_params: Configuration parameters for the LLM engine
            grounding_agent: Instance of ACI class for UI interaction
            platform: Operating system platform (darwin, linux, windows)
            max_trajectory_length: Maximum number of image turns to keep
            enable_reflection: Creates a reflection agent to assist the worker agent
        """
        super().__init__(engine_params, grounding_agent, platform)
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection
        self.reset()

    def reset(self) -> None:
        """Reset agent state and initialize the worker component."""
        self.executor = Worker(
            engine_params=self.engine_params,
            grounding_agent=self.grounding_agent,
            platform=self.platform,
            max_trajectory_length=self.max_trajectory_length,
            enable_reflection=self.enable_reflection,
        )

    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
        """Generate the next action(s) for the given instruction and observation.

        Args:
            instruction: Natural language instruction
            observation: Current UI state observation

        Returns:
            Tuple of (info dict from the executor, list of action strings).
        """
        executor_info, actions = self.executor.generate_next_action(
            instruction=instruction, obs=observation
        )

        # SIMPLIFIED: the original built this dict with a nested comprehension
        # over a one-element list ({**{k: v for d in [executor_info or {}]
        # for k, v in d.items()}}); a plain copy is equivalent.
        info = dict(executor_info or {})

        return info, actions
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,947 +0,0 @@
|
|||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
GENERATOR_SYS_PROMPT = """You are an expert in graphical user interfaces and Python code. You are responsible for executing the task: `TASK_DESCRIPTION`.
|
||||
You are working in Ubuntu.
|
||||
You are provided with:
|
||||
1. A screenshot of the current time step.
|
||||
2. The history of your previous interactions with the UI.
|
||||
3. Access to the following class and methods to interact with the UI:
|
||||
class Agent:
|
||||
|
||||
def click(self, element_description: str, num_clicks: int = 1, button_type: str = 'left', hold_keys: List = []):
|
||||
'''Click on the element
|
||||
Args:
|
||||
element_description:str, a detailed descriptions of which element to click on. This description should be at least a full sentence.
|
||||
num_clicks:int, number of times to click the element
|
||||
button_type:str, which mouse button to press can be "left", "middle", or "right"
|
||||
hold_keys:List, list of keys to hold while clicking
|
||||
'''
|
||||
|
||||
def done(self, return_value: Union[Dict, str, List, Tuple, int, float, bool, NoneType] = None):
|
||||
'''End the current task with a success and the required return value'''
|
||||
|
||||
def drag_and_drop(self, starting_description: str, ending_description: str, hold_keys: List = []):
|
||||
'''Drag from the starting description to the ending description
|
||||
Args:
|
||||
starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
|
||||
ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
|
||||
hold_keys:List list of keys to hold while dragging
|
||||
'''
|
||||
|
||||
def fail(self):
    '''Abort the current task as failed and trigger a replan of the whole task.'''

def hold_and_press(self, hold_keys: List, press_keys: List):
    '''Press a sequence of keys while holding other keys down.

    Args:
        hold_keys (List): Keys to keep held for the duration.
        press_keys (List): Keys to press one after another.
    '''

def hotkey(self, keys: List):
    '''Press a combination of keys simultaneously.

    Args:
        keys (List): Keys forming the combination, given as a list (e.g. ['ctrl', 'c']).
    '''

def open(self, app_or_filename: str):
    '''Open an application or file by name. Always use this action to open applications or files on the desktop; do not open them manually.

    Args:
        app_or_filename (str): Name of the application or file to open.
    '''

def save_to_knowledge(self, text: List[str]):
    '''Store facts, elements, or text snippets in the long-term knowledge bank for reuse during this task (e.g. copy/paste material, saved elements).

    Args:
        text (List[str]): Items to save to the knowledge bank.
    '''

def scroll(self, element_description: str, clicks: int, shift: bool = False):
    '''Scroll within a described element.

    Args:
        element_description (str): Very detailed, full-sentence description of the element to scroll in.
        clicks (int): Number of scroll clicks; positive scrolls up, negative scrolls down.
        shift (bool): Use shift+scroll for horizontal scrolling.
    '''

def set_cell_values(self, cell_values: Dict[str, Any], app_name: str, sheet_name: str):
    '''Set individual cell values in a spreadsheet that is already open. For example, passing {"A2": "hello"} writes "hello" into cell A2.

    Args:
        cell_values (Dict[str, Any]): Mapping of cell coordinates ("A1", "B2", ...) to values.
            Supported value types include: float, int, string, bool, formulas.
        app_name (str): Name of the spreadsheet application file, e.g. "Some_sheet.xlsx".
        sheet_name (str): Name of the sheet within the spreadsheet, e.g. "Sheet1".
    '''

def switch_applications(self, app_code):
    '''Switch to a different application that is already open.

    Args:
        app_code (str): Code name of the application to switch to, taken from the provided list of open applications.
    '''

def type(self, element_description: str, text: str = '', overwrite: bool = False, enter: bool = False):
    '''Type text into a described element.

    Args:
        element_description (str): Detailed, full-sentence description of the element to type into.
        text (str): Text to type.
        overwrite (bool): If True, clear all existing text in the element before typing.
        enter (bool): If True, press the Enter key after typing.
    '''

def wait(self, time: float):
    '''Pause execution for the given duration.

    Args:
        time (float): Amount of time to wait, in seconds.
    '''
def code_launch_vscode(self, path):
    '''Launch Visual Studio Code on a file or directory, opening it in an existing window.

    Args:
        path (str): File or directory path to open.
    '''

def code_compare_files(self, file1, file2):
    '''Open a VSCode diff view comparing two files.

    Args:
        file1 (str): Path of the first file.
        file2 (str): Path of the second file.
    '''

def code_add_folder(self, folder):
    '''Add a folder to the most recently active VSCode window.

    Args:
        folder (str): Folder path to add.
    '''

def code_goto_file(self, file_path, line=1, character=1):
    '''Open a file in VSCode positioned at a specific line and character.

    Args:
        file_path (str): Path of the file to open.
        line (int): Line number to jump to.
        character (int): Character (column) position on that line.
    '''

def code_perform_merge(self, path1, path2, base, result):
    '''Perform a three-way merge in VSCode.

    Args:
        path1 (str): Path of the first version.
        path2 (str): Path of the second version.
        base (str): Path of the common base version.
        result (str): Path where the merge result is saved.
    '''

def code_remove_folder(self, folder):
    '''Remove a folder from the most recently active VSCode window.

    Args:
        folder (str): Folder path to remove.
    '''

def code_install_extension(self, extension_id, pre_release=False):
    '''Install a VSCode extension, or update it if already installed.

    Args:
        extension_id (str): Identifier of the extension.
        pre_release (bool): Install the pre-release version if True.
    '''

def code_uninstall_extension(self, extension_id):
    '''Uninstall an extension from VSCode.

    Args:
        extension_id (str): Identifier of the extension.
    '''

def code_list_extensions(self, show_versions=False, category=None):
    '''List the extensions installed in VSCode.

    Args:
        show_versions (bool): Include each extension's version if True.
        category (str): Restrict the listing to one category.
    '''

def code_update_extensions(self):
    '''Update every installed VSCode extension to its latest version.'''

def code_disable_extension(self, extension_id):
    '''Disable a specific extension for the next VSCode window that opens.

    Args:
        extension_id (str): Identifier of the extension.
    '''

def code_toggle_sync(self, state):
    '''Turn VSCode settings synchronization on or off.

    Args:
        state (str): 'on' or 'off' to enable or disable synchronization.
    '''
def libreoffice_calc_save(self):
    '''Save the current workbook to its current location.

    Returns:
        bool: True if the save succeeded, False otherwise.
    '''

def libreoffice_calc_get_workbook_info(self):
    '''Describe the current workbook.

    Returns:
        dict: Workbook information — file path, file name, sheets, and the active sheet.
    '''

def libreoffice_calc_get_column_data(self, column_name):
    '''Read every value from one column.

    Args:
        column_name (str): Name of the column to read.

    Returns:
        list: Values found in the specified column.
    '''

def libreoffice_calc_set_column_as_text(self, column_name):
    '''Format an entire column as text, converting its numeric values to text form and applying text formatting.

    Args:
        column_name (str): Column letter to format as text (e.g. 'A', 'B', 'C').

    Returns:
        str: Success message or error description, e.g.
            "Successfully set column A as text format".
    '''

def libreoffice_calc_get_active_sheet_data(self):
    '''Dump the active sheet with detailed per-cell coordinate information.

    Each cell record carries its address, value, row/column numbers, column
    letter, and an empty-cell indicator.

    Returns:
        dict: Complete sheet data, for example::

            {
                "data": [
                    [
                        {"address": "A1", "value": "", "row": 1, "col": 1, "col_name": "A", "is_empty": true},
                        {"address": "B1", "value": "Age", "row": 1, "col": 2, "col_name": "B", "is_empty": false}
                    ],
                    [
                        {"address": "A2", "value": "Ryan", "row": 2, "col": 1, "col_name": "A", "is_empty": false},
                        {"address": "B2", "value": 5.0, "row": 2, "col": 2, "col_name": "B", "is_empty": false}
                    ]
                ],
                "rows": 2,
                "columns": 2,
                "range": "A1:B2"
            }
    '''

def libreoffice_calc_switch_active_sheet(self, sheet_name):
    '''Switch to the named sheet and make it active, creating it if it does not exist.

    Args:
        sheet_name (str): Name of the sheet to switch to or create.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_set_column_values(self, column_name, data, start_index=2):
    '''Write a list of values down one column.

    Args:
        column_name (str): Name of the column to write.
        data (list): Values to write into the column.
        start_index (int): Index of the first row to write; defaults to 2,
            skipping the first (header) row.

    Returns:
        bool: True on success, False otherwise.
    '''
def libreoffice_calc_highlight_range(self, range_str, color=0xFF0000):
    '''Highlight the specified range with the specified color.

    Args:
        range_str (str): Range to highlight, in the format of "A1:B10"
        color (int): Color to highlight with, as an RGB hex integer;
            default is 0xFF0000 (red)

    Returns:
        bool: True if successful, False otherwise'''
def libreoffice_calc_transpose_range(self, source_range, target_cell):
    '''Transpose a cell range and paste the result at a target cell.

    Args:
        source_range (str): Range to transpose, in the format of "A1:B10".
        target_cell (str): Cell where the transposed data is pasted, in the format of "A1".

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_export_to_csv(self):
    '''Export the current document to a CSV file.

    Returns:
        bool: True on success, False otherwise.
    '''
def libreoffice_calc_sort_column(self, column_name, ascending=True, start_index=2):
    '''Sorts the data in the specified column in ascending or descending order.

    Args:
        column_name (str): The name of the column to sort (e.g. 'A') or the title
        ascending (bool): Whether to sort in ascending order (default True)
        start_index (int): The index of the first row to sort; default is 2,
            leaving the first (header) row in place

    Returns:
        bool: True if successful, False otherwise'''
def libreoffice_calc_set_validation_list(self, column_name, values):
    '''Attach a validation list to the specified column.

    Args:
        column_name (str): Name of the column to set the validation list for.
        values (list): Values to use for the validation list.

    Returns:
        None
    '''

def libreoffice_calc_hide_row_data(self, value="N/A"):
    '''Hide every row that contains the specified value.

    Args:
        value (str): Value that marks a row for hiding; defaults to 'N/A'.

    Returns:
        None
    '''

def libreoffice_calc_reorder_columns(self, column_order):
    '''Rearrange the sheet's columns into the specified order.

    Args:
        column_order (list): Column names in the desired order.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_create_pivot_table(self,
    source_sheet,
    table_name,
    row_fields=None,
    col_fields=None,
    value_fields=None,
    aggregation_function="sum",
    target_cell="A1",
):
    '''Create a pivot table in the active worksheet based on data from the active sheet.'''
def libreoffice_calc_merge_cells(self, sheet_name, range_str):
    '''Merges a specified range of cells within a specific worksheet.

    This function connects to a running LibreOffice Calc instance,
    selects a worksheet by its name, and merges the cells defined
    by the given range string.

    Args:
        sheet_name (str): The name of the worksheet where the cells will be
            merged, e.g., 'Sheet1' or 'Q4_Report'.
        range_str (str): The cell range to merge, specified in A1 notation,
            e.g., 'A1:B10'.

    Returns:
        bool: True if the cells were successfully merged, False if an
            error occurred.
    '''
def libreoffice_calc_set_cell_value(self, cell, value):
    '''Write a value into one cell of the active worksheet.

    Args:
        cell (str): Cell reference (e.g., 'A1').
        value (str): Value to set in the cell.

    Returns:
        bool: True on success, False otherwise.
    '''
def libreoffice_calc_format_range(self, range_str, background_color=None, font_color=None, bold=None, alignment=None):
    '''Apply formatting to the specified range in the active worksheet.

    Args:
        range_str (str): Range to format, in the format of 'A1:B10'
        background_color (str, optional): Background color in hex format (e.g., '#0000ff')
        font_color (str, optional): Font color in hex format (e.g., '#ffffff')
        bold (bool, optional): Whether to make the text bold
        alignment (str, optional): Text alignment (left, center, right)

    Returns:
        bool: True if successful, False otherwise'''
def libreoffice_calc_freeze_panes(self, rows=0, columns=0):
    '''Freeze rows and/or columns in the active worksheet.

    Args:
        rows (int): Number of rows to freeze, counted from the top.
        columns (int): Number of columns to freeze, counted from the left.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_rename_sheet(self, old_name, new_name):
    '''Rename a worksheet.

    Args:
        old_name (str): Current name of the worksheet to rename.
        new_name (str): New name for the worksheet.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_copy_sheet(self, source_sheet, new_sheet_name=None):
    '''Create a copy of an existing worksheet in the workbook.

    Args:
        source_sheet (str): Name of the worksheet to copy.
        new_sheet_name (str, optional): Name for the copy; auto-generated when omitted.

    Returns:
        str: Name of the newly created worksheet, or None on failure.
    '''

def libreoffice_calc_reorder_sheets(self, sheet_name, position):
    '''Move a worksheet to a new position within the workbook.

    Args:
        sheet_name (str): Name of the worksheet to move.
        position (int): Destination position (0-based index).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_set_chart_legend_position(self, position):
    '''Set the position of the legend in a chart in the active worksheet.

    Args:
        position (str): Legend position — 'top', 'bottom', 'left', 'right', or 'none'.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_set_number_format(self, range_str, format_type, decimal_places=None):
    '''Apply a specific number format to a range of cells in the active worksheet.

    Args:
        range_str (str): Range to format, e.g. 'A1:B10'.
        format_type (str): Type of number format to apply.
        decimal_places (int, optional): Number of decimal places to display.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_adjust_column_width(self, columns, width=None, autofit=False):
    '''Adjust the width of the specified columns in the active worksheet.

    Args:
        columns (str): Column span to adjust, e.g. 'A:C' for columns A through C.
        width (float, optional): Width to set, in characters.
        autofit (bool, optional): Auto-size the columns to fit their content.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_adjust_row_height(self, rows, height=None, autofit=False):
    '''Adjust the height of the specified rows in the active worksheet.

    Args:
        rows (str): Row span to adjust, e.g. '1:10' for rows 1 through 10.
        height (float, optional): Height to set, in points.
        autofit (bool, optional): Auto-size the rows to fit their content.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_export_to_pdf(self, file_path=None, sheets=None, open_after_export=False):
    '''Export the current document, or selected worksheets, to a PDF file.

    Args:
        file_path (str, optional): Destination path for the PDF; defaults to the
            current document's path when omitted.
        sheets (list, optional): Names of worksheets to include; all worksheets
            when omitted.
        open_after_export (bool, optional): Open the PDF after exporting.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_calc_set_zoom_level(self, zoom_percentage):
    '''Adjust the zoom level of the current worksheet so cells appear larger or smaller.

    Args:
        zoom_percentage (int): Zoom percentage (e.g. 75 for 75%, 100 for normal
            size, 150 for enlarged). The valid range is typically 10-400.

    Returns:
        bool: True on success, False otherwise.
    '''
def libreoffice_impress_save(self):
    '''Save the presentation to its current location.'''

def libreoffice_impress_go_to_slide(self, slide_index):
    '''Navigate to a specific slide in the presentation by index.

    Args:
        slide_index (int): Slide to navigate to (1-based indexing).

    Returns:
        bool: True if navigation succeeded, False otherwise.
    '''

def libreoffice_impress_get_slide_count(self):
    '''Return the total number of slides in the current presentation as an integer.'''

def libreoffice_impress_duplicate_slide(self, slide_index):
    '''Duplicate a slide, placing the copy at the end of the presentation.

    Args:
        slide_index (int): Slide to duplicate (1-based indexing).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_slide_font(self, slide_index, font_name):
    '''Apply a font to every text element on a slide, including the title.

    Args:
        slide_index (int): Slide to modify (1-based indexing).
        font_name (str): Font to apply (e.g., 'Arial', 'Times New Roman', 'Calibri').

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_write_text(self, content, page_index, box_index, bold=False, italic=False, size=None, append=False):
    '''Write text into a specific textbox on a slide.

    Args:
        content: Text content to add.
        page_index: Slide index (1-based indexing).
        box_index: Textbox index on the slide (0-based indexing).
        bold: Make the text bold (default False).
        italic: Make the text italic (default False).
        size: Text size; the box's current font size when None.
        append: Append rather than replace (default False). Set it to keep the
            original text or to preserve formats such as a bullet at the beginning.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_style(self, slide_index, box_index, bold=None, italic=None, underline=None):
    '''Set style properties on a textbox of a slide.

    Args:
        slide_index: Slide to modify (1-based indexing).
        box_index: Textbox to modify (0-based indexing).
        bold: Make the text bold.
        italic: Make the text italic.
        underline: Underline the text.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_configure_auto_save(self, enabled, interval_minutes):
    '''Enable or disable auto-save for the current document and set its interval.

    Args:
        enabled: True to enable auto-save, False to disable it.
        interval_minutes: Minutes between auto-saves (minimum 1 minute).

    Returns:
        bool: True on success, False otherwise.
    '''
def libreoffice_impress_set_background_color(self, slide_index, box_index, color):
    '''Set the background color of a textbox on a slide.

    Args:
        slide_index (int): Slide containing the textbox (1-based indexing).
        box_index (int): Textbox to modify (0-based indexing).
        color (str): Color to apply (e.g., 'red', 'green', 'blue', 'yellow', or hex color code).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_text_color(self, slide_index, box_index, color):
    '''Set the text color of a textbox on a slide.

    Args:
        slide_index (int): Slide to modify (1-based indexing).
        box_index (int): Textbox to modify (0-based indexing).
        color (str): Color to apply (e.g., 'red', 'green', 'blue', 'black', or hex color code).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_delete_content(self, slide_index, box_index):
    '''Delete a textbox from a slide.

    Args:
        slide_index: Slide to modify (1-based indexing).
        box_index: Textbox to delete (0-based indexing).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_slide_orientation(self, orientation):
    '''Switch the presentation's slides between portrait and landscape orientation.

    Args:
        orientation: Desired orientation, 'portrait' or 'landscape'.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_position_box(self, slide_index, box_index, position):
    '''Place a textbox or image at a predefined location on a slide.

    Args:
        slide_index: Slide containing the box (1-based indexing).
        box_index: Box to position (0-based indexing).
        position: Predefined location on the slide (left, right, center, top, bottom, etc.).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_insert_file(self, file_path, slide_index=None, position=None, size=None, autoplay=False):
    '''Insert a video file into the current or a specified slide.

    Args:
        file_path (str): Full path to the video file to insert.
        slide_index (int, optional): Destination slide (1-based indexing);
            the current slide when omitted.
        position (dict, optional): Placement as percentages of the slide
            dimensions, {'x': float, 'y': float}.
        size (dict, optional): Dimensions as percentages of the slide
            dimensions, {'width': float, 'height': float}.
        autoplay (bool, optional): Automatically play the video when the slide is shown.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_slide_background(self, slide_index=None, color=None, image_path=None):
    '''Set a background color or image for one slide or for all slides.

    Args:
        slide_index (int, optional): Slide to modify (1-based indexing);
            applies to all slides when omitted.
        color (str, optional): Background color (e.g., 'red', 'green', 'blue',
            or hex color code).
        image_path (str, optional): Path to a background image; overrides color
            when provided.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_save_as(self, file_path, overwrite=False):
    '''Save the document to a specified location under a given filename.

    Args:
        file_path: Full destination path, including the filename and extension.
        overwrite: Overwrite the file if it already exists (default False).

    Returns:
        bool: True on success, False otherwise.
    '''
def libreoffice_impress_insert_image(self, slide_index, image_path, width=None, height=None, position=None):
    '''Insert an image into a specific slide of the presentation.

    Args:
        slide_index (int): Destination slide (1-based indexing).
        image_path (str): Full path to the image file.
        width (float, optional): Image width in centimeters.
        height (float, optional): Image height in centimeters.
        position (dict, optional): Placement as percentages of the slide size:
            {'x': float, 'y': float}, where 'x' is a percentage of slide width
            and 'y' a percentage of slide height.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_configure_display_settings(self, use_presenter_view=None, primary_monitor_only=None, monitor_for_presentation=None):
    '''Configure how LibreOffice Impress uses the available displays.

    Args:
        use_presenter_view (bool, optional): Whether to use presenter view;
            set False to disable it.
        primary_monitor_only (bool, optional): Restrict the presentation to the
            primary monitor.
        monitor_for_presentation (int, optional): Monitor to present on
            (1 for primary, 2 for secondary, etc.).

    Returns:
        bool: True if the settings were applied, False otherwise.
    '''

def libreoffice_impress_set_text_strikethrough(self, slide_index, box_index, line_numbers, apply):
    '''Apply or remove strike-through on specific lines of a textbox.

    Args:
        slide_index (int): Slide containing the text (1-based indexing).
        box_index (int): Textbox containing the text (0-based indexing).
        line_numbers (list): Lines to affect (1-based indexing).
        apply (bool): True to apply strike-through, False to remove it.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_textbox_alignment(self, slide_index, box_index, alignment):
    '''Set the text alignment of a textbox on a slide.

    Args:
        slide_index: Slide to modify (1-based indexing).
        box_index: Textbox to modify (0-based indexing).
        alignment: One of 'left', 'center', 'right', or 'justify'.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_set_slide_number_color(self, color):
    '''Set the color of the slide numbers in the presentation.

    Args:
        color (str): Color to apply (e.g., 'red', 'green', 'blue', 'black',
            or hex color code).

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_impress_export_to_image(self, file_path, format, slide_index=None):
    '''Export the presentation, or one slide, to an image file.

    Args:
        file_path (str): Full destination path, including the filename and extension.
        format (str): Image format to export to (e.g., 'png', 'jpeg', 'gif').
        slide_index (int, optional): Slide to export (1-based indexing); the whole
            presentation is exported as a series of images when omitted.

    Returns:
        bool: True if the export succeeded, False otherwise.
    '''
def libreoffice_writer_save(self):
    '''Save the document to its current location.'''

def libreoffice_writer_write_text(self, text, bold=False, italic=False, size=None):
    '''Write text into the document, optionally bold, italic, or at a given size.'''

def libreoffice_writer_set_color(self, pattern, color, paragraph_indices=None):
    '''Recolor document text that matches a regular expression.

    Args:
        pattern (str): Regular expression pattern to match text.
        color (int): Hex color code (e.g., 0x000000 for black).
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.
    '''

def libreoffice_writer_find_and_replace(self, pattern, replacement, paragraph_indices=None):
    '''Replace every occurrence of a pattern in the document with other text.

    Args:
        pattern (str): Pattern to match, as a regular expression.
        replacement (str): Text to substitute for each match.
        paragraph_indices (list, optional): Paragraph indices to modify (0-based).

    Returns:
        str: Success message reporting the number of replacements made.
    '''

def libreoffice_writer_set_font(self, font_name, paragraph_indices=None):
    '''Change the font of the document or of selected paragraphs.

    Args:
        font_name (str): Font to apply (e.g., 'Times New Roman', 'Arial', 'Calibri').
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.
    '''

def libreoffice_writer_set_line_spacing(self, spacing_value, paragraph_indices=None):
    '''Set the line spacing of selected paragraphs.

    Args:
        spacing_value (float): Spacing factor (1.0 single, 2.0 double, etc.).
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.
    '''

def libreoffice_writer_remove_highlighting(self, paragraph_indices=None):
    '''Strip ALL text highlighting from selected paragraphs.

    Args:
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.

    Returns:
        str: Success or error message.
    '''
def libreoffice_writer_find_highlighted_text(self, highlight_color):
    '''Collect every text segment in the document that carries a given highlight color.

    Args:
        highlight_color (str): Highlight color to search for — a color name
            (e.g., 'yellow', 'green') or a hex code.

    Returns:
        list: Matching text segments as strings.
    '''

def libreoffice_writer_insert_formula_at_cursor(self, formula):
    '''Insert a formula at the current cursor position.

    Args:
        formula (str): Formula to insert.

    Returns:
        bool: True on success, False otherwise.
    '''

def libreoffice_writer_insert_image_at_cursor(self, image_path, width=None, height=None):
    '''Insert an image at the current cursor position.

    Args:
        image_path (str): Full path to the image file.
        width (int, optional): Display width in pixels.
        height (int, optional): Display height in pixels.

    Returns:
        str: Success or error message.
    '''

def libreoffice_writer_set_strikethrough(self, pattern, paragraph_indices=None):
    '''Apply strikethrough to document text matching a regular expression.

    Args:
        pattern (str): Regular expression pattern to match in the document.
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.

    Returns:
        str: Success message or error information.
    '''

def libreoffice_writer_set_font_size(self, font_size, pattern, paragraph_indices=None):
    '''Change the font size of document text matching a regular expression.

    Args:
        font_size (float): Font size to apply, in points.
        pattern (str): Pattern to match, as a regular expression.
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.

    Returns:
        str: Result message indicating success or failure.
    '''

def libreoffice_writer_export_to_pdf(self, output_path=None, output_filename=None, include_comments=False, quality="standard"):
    '''Export the current document to PDF format.

    Args:
        output_path (str, optional): Where to save the PDF; the original
            document's location when omitted.
        output_filename (str, optional): Filename for the PDF; the original
            document's filename with a .pdf extension when omitted.
        include_comments (bool, optional): Include comments in the exported PDF
            (default False).
        quality (str, optional): Export quality — 'standard', 'high', or 'print'
            (default 'standard').

    Returns:
        str: Path to the exported PDF file, or an error message.
    '''

def libreoffice_writer_set_paragraph_alignment(self, alignment, paragraph_indices=None):
    '''Set the text alignment of selected paragraphs.

    Args:
        alignment (str): One of 'left', 'center', 'right', 'justify'.
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.

    Returns:
        str: Success or error message.
    '''

def libreoffice_writer_capitalize_words(self, paragraph_indices=None):
    '''Capitalize the first letter of each word in selected paragraphs.

    Args:
        paragraph_indices (list, optional): Paragraph indices to modify (0-based);
            applies to all paragraphs when omitted.

    Returns:
        str: Success or error message.
    '''
def libreoffice_writer_set_default_font(self, font_name, font_size=None):
|
||||
'''Sets the default font for new text in the document without changing existing text.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to set as default (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
font_size (float, optional): The default font size in points.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_add_page_numbers(self, position, start_number=1, format=None):
|
||||
'''Adds page numbers to the document at the specified position.
|
||||
|
||||
Args:
|
||||
position (str): Position of the page numbers ('bottom_left', 'bottom_center', 'bottom_right',
|
||||
'top_left', 'top_center', 'top_right')
|
||||
start_number (int, optional): The starting page number. Defaults to 1.
format (str, optional): Format of the page numbers (e.g., '1', 'Page 1', '1 of N').
Defaults to simple number format.

Returns:
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_insert_page_break(self, position="at_cursor"):
|
||||
'''Inserts a page break at the specified position.
|
||||
|
||||
Args:
|
||||
position (str): Where to insert the page break: 'at_cursor' for current cursor position,
|
||||
'end_of_document' for end of document. Defaults to 'at_cursor'.'''
|
||||
|
||||
Your response should be formatted like this:
|
||||
(Previous action verification)
|
||||
Carefully analyze based on the screenshot if the previous action was successful. If the previous action was not successful, provide a reason for the failure.
|
||||
|
||||
(Screenshot Analysis)
|
||||
Closely examine and describe the current state of the desktop along with the currently open applications.
|
||||
|
||||
(Next Action)
|
||||
Based on the current screenshot and the history of your previous interaction with the UI, decide on the next action in natural language to accomplish the given task.
|
||||
|
||||
(Grounded Action)
|
||||
Translate the next action into code using the provided API methods. Format the code like this:
|
||||
```python
|
||||
agent.click("The menu button at the top right of the window", 1, "left")
|
||||
```
|
||||
Note for the code:
|
||||
1. Only perform one action at a time.
|
||||
2. Do not put anything other than python code in the block. You can only use one function call at a time. Do not put more than one function call in the block.
|
||||
3. You must use only the available methods provided above to interact with the UI, do not invent new methods.
|
||||
4. Only return one code block every time. There must be a single line of code in the code block.
|
||||
5. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the subtask is completed or `agent.fail()` if it cannot be completed.
|
||||
6. Whenever possible, your grounded action should use hot-keys with the agent.hotkey() action instead of clicking or dragging.
|
||||
7. My computer's password is 'osworld-public-evaluation', feel free to use it when you need sudo rights.
|
||||
8. Before performing any calculations on elements in a table or inserting charts, always use libreoffice_calc_get_column_data or libreoffice_calc_get_active_sheet_data to obtain accurate column coordinates and element values from the table, ensuring precise execution of subsequent calculations or chart insertions.
|
||||
9. Generate agent.fail() as your grounded action if you get exhaustively stuck on the task and believe it is impossible.
|
||||
10. Generate agent.done() as your grounded action when your believe the task is fully complete.
|
||||
11. Do not use the "command" + "tab" hotkey on MacOS.
|
||||
"""
|
||||
|
||||
|
||||
REFLECTION_SYS_PROMPT = """
|
||||
You are an expert computer use agent designed to reflect on the trajectory of a task and provide feedback on what has happened so far.
|
||||
You have access to the Task Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. The last image is the screen's display after the last action.
|
||||
Your task is to generate a reflection. Your generated reflection must fall under one of the cases listed below:
|
||||
|
||||
**Your judgment must be based solely on a critical comparison between the agent's stated plan/reasoning and the visual evidence presented in the screenshot history.** Do not take the agent's claims of success at face value. **If there is no visual proof in the screenshot, the action did not happen.**
|
||||
|
||||
Case 1. The trajectory is not going according to plan. This occurs when there is a mismatch between the intended action and the visual outcome, when the agent hallucinates information, or when it is stuck. You must trigger Case 1 if you detect any of the following:
|
||||
Failed Action: The previous action did not produce its expected visual change on the screen (e.g., a window failed to open, text was not pasted).
|
||||
Unsupported Conclusion (Hallucination): The agent makes a claim or states a result (like a number or a fact) that is not visibly supported by the current or any previous screenshot. This is a critical failure.
|
||||
Repetitive Cycle: The agent is repeating actions without making meaningful progress.
|
||||
Case 2. The trajectory is going according to plan. In this case, simply tell the agent to continue proceeding as planned. DO NOT encourage a specific action in particular.
|
||||
Case 3. You believe the current task has been completed. In this case, tell the agent that the task has been successfully completed.
|
||||
|
||||
To be successful, you must follow the rules below:
|
||||
- **Your output MUST be based on one of the case options above**.
|
||||
- DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.
|
||||
- Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially lookout for cycles of actions that are continually repeated with no progress.
|
||||
- Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.
|
||||
"""
|
||||
|
|
@ -1,194 +0,0 @@
|
|||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
import re
|
||||
import base64
|
||||
from aworld.core.common import Observation, ActionModel
|
||||
from aworld.models.model_response import ModelResponse
|
||||
from aworld.core.agent.base import AgentResult
|
||||
from aworld.memory.main import InMemoryMemoryStore
|
||||
|
||||
def encode_image(image_content):
    """Return the base64 encoding of an image as a UTF-8 string.

    Accepts either a filesystem path (str), in which case the file is read
    in binary mode, or the raw image bytes themselves.
    """
    if not isinstance(image_content, str):
        return base64.b64encode(image_content).decode("utf-8")
    with open(image_content, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
|
||||
|
||||
|
||||
def extract_first_agent_function(code_string):
    """Return the first `agent.<method>(...)` call found in *code_string*.

    Arguments may contain single- or double-quoted strings; bare
    parentheses inside the argument list are not supported. Returns None
    when no such call is present.
    """
    # Allow quoted strings or any character that is not a paren/quote
    # between the call's parentheses.
    call_pattern = r'agent\.[a-zA-Z_]+\((?:[^()\'"]|\'[^\']*\'|"[^"]*")*\)'
    found = re.search(call_pattern, code_string)
    return found.group(0) if found is not None else None
|
||||
|
||||
|
||||
def parse_single_code_from_string(input_string):
    """Extract the first code snippet from a fenced-code model response.

    Recognizes the bare control words WAIT/DONE/FAIL both as the whole
    input and as the trailing line of a fenced block. Returns the string
    "fail" when no fenced block is found.

    NOTE(review): an identical definition of this function appears again
    later in this module and shadows this one — the duplicate should be
    removed.
    """
    input_string = input_string.strip()
    if input_string.strip() in ["WAIT", "DONE", "FAIL"]:
        return input_string.strip()

    # This regular expression will match both ```code``` and ```python code```
    # and capture the `code` part. It uses a non-greedy match for the content inside.
    pattern = r"```(?:\w+\s+)?(.*?)```"
    # Find all non-overlapping matches in the string
    matches = re.findall(pattern, input_string, re.DOTALL)

    # The regex above captures the content inside the triple backticks.
    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
    # so the code inside backticks can span multiple lines.

    # matches now contains all the captured code snippets

    codes = []

    for match in matches:
        match = match.strip()
        commands = [
            "WAIT",
            "DONE",
            "FAIL",
        ]  # fixme: updates this part when we have more commands

        if match in commands:
            codes.append(match.strip())
        elif match.split("\n")[-1] in commands:
            # A block whose last line is a control word contributes both the
            # code above it and the control word itself.
            if len(match.split("\n")) > 1:
                codes.append("\n".join(match.split("\n")[:-1]))
            codes.append(match.split("\n")[-1])
        else:
            codes.append(match)

    if len(codes) <= 0:
        return "fail"
    return codes[0]
|
||||
|
||||
|
||||
def sanitize_code(code):
    """Make a multi-line double-quoted argument safe for later eval().

    When *code* spans several lines, the first double-quoted literal is
    promoted to a triple-quoted one so embedded newlines do not break
    parsing. Single-line code is returned unchanged.
    """
    if "\n" not in code:
        return code
    # Capture the outermost double-quoted text (DOTALL lets it span lines).
    quoted_spans = re.findall(r'(".*?")', code, flags=re.DOTALL)
    if quoted_spans:
        first_span = quoted_spans[0]
        # Replace only the first occurrence.
        code = code.replace(first_span, '"""' + first_span[1:-1] + '"""', 1)
    return code
|
||||
|
||||
def prune_image_messages(memory_store: InMemoryMemoryStore, max_trajectory_length: int):
    """
    Inspect the messages in ``memory_store`` and keep images only in the
    newest ``max_trajectory_length`` image-bearing messages.
    For older image-bearing messages, the image parts are removed from
    their content.

    Args:
        memory_store (InMemoryMemoryStore): The in-memory store instance.
        max_trajectory_length (int): Maximum number of image-bearing messages to keep.
    """
    # Step 1: fetch every message via the store's get_all method
    all_items = memory_store.get_all()

    # Step 2: collect all messages whose content contains an image part
    image_messages = []
    for item in all_items:
        if isinstance(item.content, list):
            if any(isinstance(part, dict) and part.get('type') == 'image_url' for part in item.content):
                image_messages.append(item)

    # Step 3: nothing to do if the image-message count is within the limit
    if len(image_messages) <= max_trajectory_length:
        print("Number of image messages does not exceed the limit. No pruning needed.")
        return

    # Step 4: decide which older messages should lose their images.
    # get_all() is assumed to return items in insertion order, so the head
    # of the list holds the oldest messages.
    num_to_prune = len(image_messages) - max_trajectory_length
    messages_to_prune = image_messages[:num_to_prune]

    print(f"Found {len(image_messages)} image messages. Pruning the oldest {num_to_prune}.")

    # Step 5: strip the image parts from each pruned message and persist
    # the change via the store's update method.
    for item_to_prune in messages_to_prune:

        # Build a new content list holding only the non-image parts
        new_content = [
            part for part in item_to_prune.content
            if not (isinstance(part, dict) and part.get('type') == 'image_url')
        ]

        # Optional: collapse to a plain string when a single text part remains
        if len(new_content) == 1 and new_content[0].get('type') == 'text':
            final_content = new_content[0].get('text', '')
        else:
            final_content = new_content

        # Update the message object's content attribute
        item_to_prune.content = final_content

        # Persist the change back into the store
        memory_store.update(item_to_prune)

        print(f"Pruned image from message with ID: {item_to_prune.id}")
|
||||
|
||||
def reps_action_result(resp: ModelResponse) -> AgentResult:
    """Parse a model response into an AgentResult.

    Expects the response content to carry ``<thoughts>...</thoughts>`` and
    ``<answer>...</answer>`` sections; the answer becomes the action name
    and the thoughts become the policy info. When either section is
    missing, the raw content is used as the action name with empty policy
    info (same fallback the original achieved by catching the
    AttributeError from ``.group`` on a failed match — made explicit here
    so real bugs are no longer swallowed by a broad except).
    """
    full_response = resp.content
    # Extract thoughts section
    thoughts_match = re.search(r"<thoughts>(.*?)</thoughts>", full_response, re.DOTALL)
    # Extract answer section
    answer_match = re.search(r"<answer>(.*?)</answer>", full_response, re.DOTALL)

    if thoughts_match is None or answer_match is None:
        # Fall back to the raw content when the expected tags are absent.
        action = ActionModel(action_name=full_response, policy_info="")
        return AgentResult(actions=[action], current_state=None)

    action = ActionModel(
        action_name=answer_match.group(1).strip(),
        policy_info=thoughts_match.group(1).strip(),
    )
    return AgentResult(actions=[action], current_state=None)
|
||||
|
||||
def parse_single_code_from_string(input_string):
    """Extract the first code snippet from a fenced-code model response.

    Recognizes the bare control words WAIT/DONE/FAIL, both as the whole
    input and as the trailing line of a fenced block. Returns the string
    "fail" when no fenced block is found.
    """
    input_string = input_string.strip()
    if input_string.strip() in ["WAIT", "DONE", "FAIL"]:
        return input_string.strip()

    # Matches ```...``` and ```lang ...``` fences; DOTALL lets the captured
    # body span multiple lines.
    fence_pattern = r"```(?:\w+\s+)?(.*?)```"
    snippets = re.findall(fence_pattern, input_string, re.DOTALL)

    control_words = ("WAIT", "DONE", "FAIL")  # fixme: updates this part when we have more commands
    extracted = []
    for snippet in snippets:
        snippet = snippet.strip()
        lines = snippet.split("\n")
        if snippet in control_words:
            extracted.append(snippet)
        elif lines[-1] in control_words:
            # Code followed by a trailing control word contributes both.
            if len(lines) > 1:
                extracted.append("\n".join(lines[:-1]))
            extracted.append(lines[-1])
        else:
            extracted.append(snippet)

    return extracted[0] if extracted else "fail"
|
||||
|
|
@ -1,230 +0,0 @@
|
|||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
import logging
|
||||
import textwrap
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from aworld.config.conf import AgentConfig
|
||||
from aworld.agents.llm_agent import Agent
|
||||
from aworld.core.common import Observation
|
||||
|
||||
from aworld.core.task import Task
|
||||
from aworld.core.context.base import Context
|
||||
from aworld.core.event.base import Message
|
||||
from aworld.models.llm import get_llm_model
|
||||
from aworld.utils.common import sync_exec
|
||||
|
||||
from mm_agents.aworldguiagent.grounding import ACI
|
||||
from mm_agents.aworldguiagent.prompt import GENERATOR_SYS_PROMPT, REFLECTION_SYS_PROMPT
|
||||
from mm_agents.aworldguiagent.utils import encode_image, extract_first_agent_function, parse_single_code_from_string, sanitize_code
|
||||
from mm_agents.aworldguiagent.utils import prune_image_messages, reps_action_result
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class Worker:
    """Single-level worker agent: plans, reflects on, and grounds UI actions.

    Wraps two LLM agents — a generator that produces the next UI action and
    an optional reflection agent that critiques the trajectory — plus a
    grounding agent (ACI) that resolves described UI targets to coordinates.
    """

    def __init__(
        self,
        engine_params: Dict,
        grounding_agent: ACI,
        platform: str = "ubuntu",
        max_trajectory_length: int = 16,
        enable_reflection: bool = True,
    ):
        """
        Worker receives the main task and generates actions, without the need of hierarchical planning
        Args:
            engine_params: Dict
                Parameters for the multimodal engine
            grounding_agent: Agent
                The grounding agent to use
            platform: str
                OS platform the agent runs on (darwin, linux, windows)
            max_trajectory_length: int
                The amount of images turns to keep
            enable_reflection: bool
                Whether to enable reflection
        """
        # super().__init__(engine_params, platform)

        self.grounding_agent = grounding_agent
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection
        # Extended thinking is only enabled for models known to support it.
        self.use_thinking = engine_params.get("model", "") in [
            "claude-3-7-sonnet-20250219"
        ]

        self.generator_agent_config = AgentConfig(
            llm_provider=engine_params.get("engine_type", "openai"),
            llm_model_name=engine_params.get("model", "openai/o3"),
            llm_temperature=engine_params.get("temperature", 1.0),
            llm_base_url=engine_params.get("base_url", "https://openrouter.ai/api/v1"),
            llm_api_key=engine_params.get("api_key", ""),
        )

        self.reset()

    def reset(self):
        """Recreate both LLM agents and clear all per-episode state."""

        self.generator_agent = Agent(
            name="generator_agent",
            conf=self.generator_agent_config,
            system_prompt=GENERATOR_SYS_PROMPT,
            resp_parse_func=reps_action_result
        )

        self.reflection_agent = Agent(
            name="reflection_agent",
            conf=self.generator_agent_config,
            system_prompt=REFLECTION_SYS_PROMPT,
            resp_parse_func=reps_action_result
        )

        self.turn_count = 0
        self.worker_history = []      # full plan text produced each turn
        self.reflections = []         # reflection text per turn (when enabled)
        self.cost_this_turn = 0
        self.screenshot_inputs = []   # raw screenshots observed each turn

        # Minimal Task/Context/Message scaffolding required by the aworld
        # agent API; no real task routing happens through them.
        self.dummy_task = Task()
        self.dummy_context = Context()
        self.dummy_context.set_task(self.dummy_task)
        self.dummy_message = Message(headers={'context': self.dummy_context})

        self.planning_model = get_llm_model(self.generator_agent_config)

        self.first_done = False
        self.first_image = None

    def generate_next_action(
        self,
        instruction: str,
        obs: Dict,
    ) -> Tuple[Dict, List]:
        """
        Predict the next action(s) based on the current observation.

        Args:
            instruction: Natural-language task description.
            obs: Observation dict; must contain "screenshot" and, after the
                first turn, "action_response".

        Returns:
            Tuple of (executor_info dict with plan/reflection details,
            [exec_code]) where exec_code is the result of evaluating the
            grounded agent call.
        """
        agent = self.grounding_agent
        generator_message = (
            ""
            if self.turn_count > 0
            else "The initial screen is provided. No action has been taken yet."
        )

        # Load the task into the system prompt (first turn only).
        if self.turn_count == 0:
            self.generator_agent.system_prompt = self.generator_agent.system_prompt.replace(
                "TASK_DESCRIPTION", instruction)

        # Get the per-step reflection
        reflection = None
        reflection_thoughts = None
        if self.enable_reflection:
            # First turn: seed the reflection agent with the task text and
            # the initial screenshot.
            if self.turn_count == 0:
                text_content = textwrap.dedent(
                    f"""
                    Task Description: {instruction}
                    Current Trajectory below:
                    """
                )
                updated_sys_prompt = (
                    self.reflection_agent.system_prompt + "\n" + text_content
                )
                self.reflection_agent.system_prompt = updated_sys_prompt

                image_content = [
                    {
                        "type": "text",
                        "text": f"The initial screen is provided. No action has been taken yet."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/png;base64," + encode_image(obs["screenshot"])
                        }
                    }
                ]
                self.reflection_agent._init_context(context=self.dummy_context)

                sync_exec(
                    self.reflection_agent._add_human_input_to_memory,
                    image_content,
                    self.dummy_context,
                    "message"
                )

            # Later turns: feed the previous plan plus its execution result
            # back to the reflection agent and collect its critique.
            else:

                image = "data:image/png;base64," + encode_image(obs["screenshot"])
                reflection_message = self.worker_history[-1] + "\n" + f"Here is function execute result: {obs['action_response']}.\n"

                reflection_observation = Observation(content=reflection_message, image=image)

                self.reflection_agent._init_context(context=self.dummy_context)
                reflection_actions = self.reflection_agent.policy(reflection_observation, message=self.dummy_message)

                reflection = reflection_actions[0].action_name
                reflection_thoughts = reflection_actions[0].policy_info

                self.reflections.append(reflection)

                generator_message += f"Here is your function execute result: {obs['action_response']}.\n"

                generator_message += f"REFLECTION: You may use this reflection on the previous action and overall trajectory:\n{reflection}\n"
                logger.info("REFLECTION: %s", reflection)

        # NOTE(review): first_done is initialized False in reset() and never
        # set True in this class — the guard appears to be vestigial.
        if self.first_done:
            pass

        else:
            # Add finalized message to conversation
            generator_message += f"\nCurrent Text Buffer = [{','.join(agent.notes)}]\n"

        image = "data:image/png;base64," + encode_image(obs["screenshot"])
        generator_observation = Observation(content=generator_message, image=image)

        self.generator_agent._init_context(context=self.dummy_context)
        generator_actions = self.generator_agent.policy(generator_observation, message=self.dummy_message)

        plan = generator_actions[0].action_name
        plan_thoughts = generator_actions[0].policy_info

        # Bound prompt size by keeping only the newest screenshots in each
        # agent's memory. Fix: use the configured trajectory length instead
        # of the previous hard-coded 16, which silently ignored the
        # max_trajectory_length constructor argument.
        prune_image_messages(self.generator_agent.memory.memory_store, self.max_trajectory_length)
        prune_image_messages(self.reflection_agent.memory.memory_store, self.max_trajectory_length)

        self.worker_history.append(plan)

        logger.info("FULL PLAN:\n %s", plan)

        # self.generator_agent.add_message(plan, role="assistant")
        # Use the grounding agent to convert agent_action("desc") into agent_action([x, y])

        try:
            agent.assign_coordinates(plan, obs)
            plan_code = parse_single_code_from_string(plan.split("Grounded Action")[-1])
            plan_code = sanitize_code(plan_code)
            plan_code = extract_first_agent_function(plan_code)
            # NOTE(review): eval() runs model-produced code. It is narrowed
            # to a single agent.* call by extract_first_agent_function, but
            # this remains a trust boundary on the model output.
            exec_code = eval(plan_code)

        except Exception as e:
            # Any grounding/parsing failure degrades to a 1-second wait.
            logger.error("Error in parsing plan code: %s", e)
            plan_code = "agent.wait(1.0)"
            exec_code = eval(plan_code)

        executor_info = {
            "full_plan": plan,
            "executor_plan": plan,
            "plan_thoughts": plan_thoughts,
            "plan_code": plan_code,
            "reflection": reflection,
            "reflection_thoughts": reflection_thoughts,
        }
        self.turn_count += 1

        self.screenshot_inputs.append(obs["screenshot"])

        return executor_info, [exec_code]
|
||||
|
|
@ -1,161 +0,0 @@
|
|||
COMPUTER_USE_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
||||
|
||||
## Output Format
|
||||
```
|
||||
Thought: ...
|
||||
Action: ...
|
||||
```
|
||||
|
||||
## Action Space
|
||||
|
||||
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
|
||||
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||
|
||||
## Note
|
||||
- Use {language} in `Thought` part.
|
||||
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
||||
- My computer's password is 'password', feel free to use it when you need sudo rights.
|
||||
|
||||
## User Instruction
|
||||
{instruction}
|
||||
"""
|
||||
|
||||
COMPUTER_USE_PROMPT_WITH_CALL_USER = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
||||
|
||||
## Output Format
|
||||
```
|
||||
Thought: ...
|
||||
Action: ...
|
||||
```
|
||||
|
||||
## Action Space
|
||||
|
||||
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
|
||||
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
||||
|
||||
## Note
|
||||
- Use {language} in `Thought` part.
|
||||
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
||||
- My computer's password is 'password', feel free to use it when you need sudo rights.
|
||||
|
||||
## User Instruction
|
||||
{instruction}
|
||||
"""
|
||||
|
||||
UITARS_ACTION_SPACE = """
|
||||
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
|
||||
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished()
|
||||
"""
|
||||
|
||||
UITARS_CALL_USR_ACTION_SPACE = """
|
||||
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
|
||||
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished()
|
||||
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
||||
"""
|
||||
|
||||
UITARS_NORMAL_ACTION_SPACE = """
|
||||
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
|
||||
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||
"""
|
||||
|
||||
UITARS_USR_PROMPT_NOTHOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
||||
## Output Format
|
||||
```
|
||||
Action: ...
|
||||
```
|
||||
## Action Space
|
||||
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
||||
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
|
||||
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished()
|
||||
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
||||
## User Instruction
|
||||
{instruction}
|
||||
"""
|
||||
|
||||
UITARS_USR_PROMPT_THOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
||||
|
||||
## Output Format
|
||||
```
|
||||
Thought: ...
|
||||
Action: ...
|
||||
```
|
||||
|
||||
## Action Space
|
||||
{action_space}
|
||||
|
||||
## Note
|
||||
- Use {language} in `Thought` part.
|
||||
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
||||
|
||||
## User Instruction
|
||||
{instruction}
|
||||
"""
|
||||
|
||||
|
||||
# Substrings (Chinese) that indicate the model is declaring the task
# impossible, refusing, apologizing, or trying to restart.
# NOTE(review): presumably matched against model output text to classify a
# response as a failed attempt — confirm against the caller.
FAILURE_INDICATORS = [
    # Direct inability expressions
    "无法", "不能", "不可以", "做不到", "实现不了", "完成不了","没法",

    # Regret/apology expressions
    "遗憾", "抱歉", "很抱歉", "非常抱歉", "对不起",

    # Not supported/available
    "不直接支持", "不支持", "不提供", "不具备", "没有权限", "权限不足", "不在这里面","不符合",#"不存在",

    # Cannot access/handle
    "无权访问", "访问不了", "处理不了", "操作不了", "执行不了", "没找到", "空空如也",

    # Not possible/feasible
    "不可能", "无法实现", "实现不了", "办不到", "做不了","找不到","存在技术限制","没有找到","没有内置",

    # System limitations
    "超出范围", "不在我的能力范围", "能力有限", "功能限制","没有成功","没成功","硬件的问题",

    # Refusal indicators
    "拒绝", "不允许", "禁止", "不合适", "不恰当",

    # Trying Restart
    "从头开始", "藏在", "浪费时间","一个更合理的思路","正确的方向","没有意义",#, "重新","重启",
]
|
||||
|
|
@ -1,202 +0,0 @@
|
|||
import asyncio
|
||||
from typing import List, Optional, Union, Dict, Any
|
||||
import json
|
||||
import os
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from omegaconf import DictConfig
|
||||
from dataclasses import dataclass, asdict
|
||||
import copy
|
||||
import logging
|
||||
import random
|
||||
|
||||
from prompts import COMPUTER_USE_PROMPT, COMPUTER_USE_PROMPT_WITH_CALL_USER
|
||||
from log_config import setup_logging
|
||||
|
||||
# Set up the unified logging system
|
||||
setup_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TaskLoader:
|
||||
    def __init__(self, task_cfg: DictConfig, storage_root):
        """Initialize the loader from an OmegaConf task config.

        Args:
            task_cfg: Config providing ``task_file``, ``osworld_root`` and
                ``resume`` entries.
            storage_root: Root directory of stored results; used by the
                resume filter when skipping tasks.
        """
        self.task_file = Path(task_cfg.task_file)
        #self.task_root = Path(task_cfg.task_root)
        self.osworld_root = Path(task_cfg.osworld_root)

        # SHA-1 of the most recently loaded task file, so refresh logic can
        # detect whether the file changed since the last poll.
        self._latest_sha: Optional[str] = None
        self.storage_root = storage_root
        self.resume = task_cfg.resume
|
||||
|
||||
def poll_for_tasks(self) -> List[Dict]:
|
||||
"""find new tasks json file
|
||||
return list of TaskInfo dict if there is new json
|
||||
else return []
|
||||
"""
|
||||
self._maybe_refresh_dataset()
|
||||
|
||||
tasks_list = [task.to_dict() for task in self._tasks]
|
||||
random.shuffle(tasks_list)
|
||||
|
||||
return tasks_list
|
||||
|
||||
def _maybe_refresh_dataset_bak(self):
|
||||
|
||||
# check new json
|
||||
latest_json = self._find_latest_json()
|
||||
|
||||
if latest_json is None:
|
||||
return False # no json file
|
||||
|
||||
sha = self._calc_sha1(latest_json)
|
||||
if sha == self._latest_sha:
|
||||
return False # no change
|
||||
|
||||
with open(latest_json) as f:
|
||||
data = json.load(f)
|
||||
|
||||
raw_tasks = [
|
||||
{"task_type": task_type, "task_id": task_id}
|
||||
for task_type, task_ids in data.items()
|
||||
for task_id in task_ids
|
||||
]
|
||||
|
||||
self._tasks = [build_task(raw, self.osworld_root) for raw in raw_tasks]
|
||||
self._latest_sha = sha
|
||||
|
||||
logger.info(f"当前任务文件: {str(latest_json)}")
|
||||
logger.info(f"任务总数: {len(raw_tasks)}")
|
||||
|
||||
return True
|
||||
|
||||
def _maybe_refresh_dataset(self):
|
||||
|
||||
latest_json = self.task_file
|
||||
print("Current tasks file: ", str(latest_json))
|
||||
|
||||
with open(latest_json) as f:
|
||||
data = json.load(f)
|
||||
|
||||
raw_tasks = [
|
||||
{"task_type": task_type, "task_id": task_id}
|
||||
for task_type, task_ids in data.items()
|
||||
for task_id in task_ids
|
||||
]
|
||||
|
||||
if self.resume:
|
||||
# 过滤已完成或类型不匹配的任务
|
||||
filtered_tasks = []
|
||||
storage_root = Path(self.storage_root)
|
||||
|
||||
for raw in raw_tasks:
|
||||
task_id = str(raw["task_id"])
|
||||
task_type_expected = raw["task_type"]
|
||||
|
||||
# 找到所有以 task_id 开头的子目录(允许有多个版本)
|
||||
candidate_dirs = [
|
||||
d for d in storage_root.iterdir()
|
||||
if d.is_dir() and d.name.startswith(task_id)
|
||||
]
|
||||
|
||||
# 默认认为任务未完成
|
||||
task_finished = False
|
||||
|
||||
for d in candidate_dirs:
|
||||
cfg_path = d / "task_config.json"
|
||||
if not cfg_path.exists():
|
||||
print("找不到config文件")
|
||||
continue
|
||||
|
||||
try:
|
||||
with cfg_path.open("r", encoding="utf-8") as cf:
|
||||
cfg = json.load(cf)
|
||||
except Exception:
|
||||
print("配置损坏,忽略此目录")
|
||||
continue
|
||||
|
||||
# 3.1 task_type 不同 => 不是同一个任务,直接跳过这目录
|
||||
if cfg.get("raw", {}).get("task_type") != task_type_expected:
|
||||
continue
|
||||
|
||||
# 3.2 task_type 相同,检查 reward.txt
|
||||
if (d / "reward.txt").exists():
|
||||
task_finished = True
|
||||
break # 已找到完成记录,无需再看其他目录
|
||||
if not task_finished:
|
||||
filtered_tasks.append(raw)
|
||||
self._tasks = [build_task(raw, self.osworld_root) for raw in filtered_tasks]
|
||||
print(f"Total number of tasks: {len(raw_tasks)}, Remained:{len(filtered_tasks)}")
|
||||
|
||||
else:
|
||||
self._tasks = [build_task(raw, self.osworld_root) for raw in raw_tasks]
|
||||
print(f"Total number of tasks: {len(raw_tasks)}")
|
||||
|
||||
return True
|
||||
|
||||
def _find_latest_json(self) -> Optional[Path]:
|
||||
files = list(self.task_root.glob("*.json"))
|
||||
return max(files, key=lambda p: p.stat().st_mtime) if files else None
|
||||
|
||||
@staticmethod
|
||||
def _calc_sha1(fp: Path, chunk_size=2<<20) -> str:
|
||||
h = hashlib.sha1()
|
||||
with fp.open("rb") as f:
|
||||
for chunk in iter(lambda: f.read(chunk_size), b""):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
@dataclass
class TaskInfo:
    # Everything needed to launch one task.
    messages: List  # seed chat messages (system + formatted user prompt)
    instruction: str  # natural-language instruction (possibly with a plan hint appended)
    task_config: Dict  # raw task JSON plus the {"raw": {...}} provenance entry

    def to_dict(self):
        """Return this record as a plain dict (via dataclasses.asdict)."""
        return asdict(self)
|
||||
|
||||
|
||||
def build_task(raw: Dict, osworld_root: Path, use_call_user: bool = False) -> TaskInfo:
    """Load one task config from disk and assemble its initial chat messages.

    Args:
        raw: {"task_type": ..., "task_id": ...} identifying the task.
        osworld_root: root directory holding <task_type>/<task_id>.json files.
        use_call_user: select the prompt variant that allows a call_user action.

    Returns:
        A TaskInfo bundling the seed messages, instruction and task config.
    """
    task_type = raw["task_type"]
    task_id = raw["task_id"]
    config_path = os.path.join(osworld_root, task_type, task_id + ".json")
    with open(config_path) as fh:
        task_data = json.load(fh)

    # Keep provenance so downstream code can recover the type/id pair.
    task_data["raw"] = {"task_type": task_type, "task_id": task_id}

    instruction = task_data["instruction"]

    # Append the human-authored single-action plan as a hint, when present.
    if "human-ground-truth" in task_data and "single-action" in task_data["human-ground-truth"]:
        plan_text = "\n".join(task_data["human-ground-truth"]["single-action"])
        instruction = instruction.strip() + "\nHere is an instruction to help you complete the task: \n" + plan_text

    system_prompt = COMPUTER_USE_PROMPT_WITH_CALL_USER if use_call_user else COMPUTER_USE_PROMPT
    user_text = system_prompt.format(instruction=instruction, language="English")

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [{"type": "text", "text": user_text}]},
    ]

    return TaskInfo(
        messages=messages,
        instruction=instruction,
        task_config=task_data,
    )
|
||||
|
|
@ -1,511 +0,0 @@
|
|||
import ast
|
||||
import base64
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from io import BytesIO
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
import openai
|
||||
|
||||
from openai import OpenAI
|
||||
from PIL import Image
|
||||
from requests.exceptions import SSLError
|
||||
from mm_agents.dart_gui.prompts import FAILURE_INDICATORS
|
||||
|
||||
# 设置日志系统
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sentinel action strings returned to the agent loop instead of pyautogui code.
FINISH_WORD = "finished"
WAIT_WORD = "wait"
ENV_FAIL_WORD = "error_env"
CALL_USER = "call_user"

# Qwen-VL image preprocessing constraints: dimensions are snapped to multiples
# of IMAGE_FACTOR and total pixel count is clamped to [MIN_PIXELS, MAX_PIXELS].
IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200  # maximum aspect ratio accepted by smart_resize

# Observation settings that carry no screenshot, only text.
pure_text_settings = ["a11y_tree"]

# Accessibility-tree XML namespaces (mirrors desktop_env/server/main.py).
attributes_ns_ubuntu = "https://accessibility.windows.example.org/ns/attributes"
attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
state_ns_windows = "https://accessibility.windows.example.org/ns/state"
component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
component_ns_windows = "https://accessibility.windows.example.org/ns/component"
value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
value_ns_windows = "https://accessibility.windows.example.org/ns/value"
class_ns_windows = "https://accessibility.windows.example.org/ns/class"
# More namespaces defined in OSWorld, please check desktop_env/server/main.py
|
||||
# 定义一个函数来解析每个 action
|
||||
def parse_action(action_str):
    """Parse a single action-call string into its name and keyword arguments.

    Example: "click(start_box='(1,2)')" ->
        {'function': 'click', 'args': {'start_box': '(1,2)'}}

    Only keyword arguments with constant values are captured; any non-constant
    value is recorded as None. Positional arguments are ignored (matching the
    action DSL, which is keyword-only).

    Returns:
        The {'function': ..., 'args': ...} dict, or None (with an error log)
        when *action_str* is not a single function-call expression.
    """
    try:
        node = ast.parse(action_str, mode='eval')

        if not isinstance(node, ast.Expression):
            raise ValueError("Not an expression")

        call = node.body

        if not isinstance(call, ast.Call):
            raise ValueError("Not a function call")

        # Function name: plain name or attribute access (e.g. pyautogui.press).
        if isinstance(call.func, ast.Name):
            func_name = call.func.id
        elif isinstance(call.func, ast.Attribute):
            func_name = call.func.attr
        else:
            func_name = None

        # Keyword arguments. Since Python 3.8, ast.parse represents every
        # literal as ast.Constant, so the deprecated ast.Str fallback that used
        # to live here was dead code and has been removed.
        kwargs = {}
        for kw in call.keywords:
            kwargs[kw.arg] = kw.value.value if isinstance(kw.value, ast.Constant) else None

        return {
            'function': func_name,
            'args': kwargs
        }

    except Exception as e:
        # Deliberately broad: any malformed action is logged and skipped.
        logger.error(f"Failed to parse action '{action_str}': {e}")
        return None
|
||||
|
||||
def escape_single_quotes(text):
    """Backslash-escape every single quote in *text* that is not already escaped."""
    unescaped_quote = r"(?<!\\)'"  # a quote not preceded by a backslash
    return re.sub(unescaped_quote, r"\\'", text)
|
||||
|
||||
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    quotient = round(number / factor)
    return quotient * factor
|
||||
|
||||
|
||||
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return factor * math.ceil(number / factor)
|
||||
|
||||
|
||||
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return factor * math.floor(number / factor)
|
||||
|
||||
def linear_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """Scale (height, width) so the pixel count lies in [min_pixels, max_pixels].

    The sqrt-based scale factor preserves the aspect ratio, so relative
    coordinates remain valid without conversion. Unlike smart_resize, the
    result is NOT snapped to multiples of *factor*; that parameter is accepted
    only for signature parity with smart_resize.
    """
    if width * height > max_pixels:
        # Shrink uniformly until under the pixel cap.
        shrink = math.sqrt(max_pixels / (width * height))
        width, height = int(width * shrink), int(height * shrink)
    if width * height < min_pixels:
        # Grow uniformly until the minimum pixel budget is met.
        grow = math.sqrt(min_pixels / (width * height))
        width, height = math.ceil(width * grow), math.ceil(height * grow)

    return height, width
|
||||
|
||||
def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.
    """
    aspect = max(height, width) / min(height, width)
    if aspect > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    # Snap each side to the nearest multiple of factor (at least one factor).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: shrink uniformly, rounding down to stay under the cap.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: grow uniformly, rounding up to reach the floor.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
|
||||
|
||||
def parse_action_to_structure_output(text, factor, origin_resized_height, origin_resized_width, model_type, max_pixels=16384*28*28, min_pixels=100*28*28):
    """Parse a raw "Thought/Action" model response into structured action dicts.

    Box coordinates are normalized to the 0-1 range: "qwen25vl" models emit
    absolute pixel coordinates (divided here by the smart_resize dimensions),
    other models emit coordinates already scaled, which are divided by *factor*.

    Returns:
        List of dicts with keys: reflection, thought, action_type,
        action_inputs (box params become stringified [x1, y1, x2, y2] floats),
        and text (the original response).

    Raises:
        AssertionError: if "Action:" is missing from *text*.
    """
    text = text.strip()
    if model_type == "qwen25vl":
        smart_resize_height, smart_resize_width = smart_resize(origin_resized_height, origin_resized_width, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels)

    # Select the regex that extracts the section preceding "Action:".
    if text.startswith("Thought:"):
        thought_pattern = r"Thought: (.+?)(?=\s*Action:|$)"
        thought_hint = "Thought: "
    elif text.startswith("Reflection:"):
        thought_pattern = r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)"
        thought_hint = "Reflection: "
    elif text.startswith("Action_Summary:"):
        thought_pattern = r"Action_Summary: (.+?)(?=\s*Action:|$)"
        thought_hint = "Action_Summary: "
    else:
        thought_pattern = r"Thought: (.+?)(?=\s*Action:|$)"
        thought_hint = "Thought: "
    reflection, thought = None, None
    thought_match = re.search(thought_pattern, text, re.DOTALL)
    if thought_match:
        if len(thought_match.groups()) == 1:
            thought = thought_match.group(1).strip()
        elif len(thought_match.groups()) == 2:
            # Reflection pattern: group 1 is the reflection, group 2 the summary.
            thought = thought_match.group(2).strip()
            reflection = thought_match.group(1).strip()
    assert "Action:" in text
    action_str = text.split("Action:")[-1]

    # Multiple actions are separated by blank lines.
    tmp_all_action = action_str.split("\n\n")
    all_action = []
    for action_str in tmp_all_action:
        if "type(content" in action_str:
            # Unwrap the quoted payload of type(content='...'), escape any
            # un-escaped single quotes, and re-wrap it as a valid literal.
            def escape_quotes(match):
                content = match.group(1)  # the content value
                return content

            # Replace the whole call with just its payload.
            pattern = r"type\(content='(.*?)'\)"  # matches type(content='...')
            content = re.sub(pattern, escape_quotes, action_str)

            # Escape and re-wrap.
            action_str = escape_single_quotes(content)
            action_str = "type(content='" + action_str + "')"

        if "finished(content" in action_str:
            # Same re-escaping treatment for finished(content='...').
            def escape_quotes(match):
                content = match.group(1)  # the content value
                return content

            pattern = r"finished\(content='(.*?)'\)"  # matches finished(content='...')
            content = re.sub(pattern, escape_quotes, action_str)

            # Escape and re-wrap.
            action_str = escape_single_quotes(content)
            action_str = "finished(content='" + action_str + "')"
        all_action.append(action_str)

    # Newlines inside an action are escaped so ast.parse sees one expression.
    parsed_actions = [parse_action(action.replace("\n","\\n").lstrip()) for action in all_action]
    actions = []
    for action_instance, raw_str in zip(parsed_actions, all_action):
        if action_instance == None:
            logger.error(f"Action can't parse: {raw_str}")
            # raise ValueError(f"Action can't parse: {raw_str}")
            continue
        action_type = action_instance["function"]
        params = action_instance["args"]

        # import pdb; pdb.set_trace()
        action_inputs = {}
        for param_name, param in params.items():
            if param == "": continue
            param = param.lstrip()  # drop leading whitespace
            # start_box / end_box params arrive as coordinate strings like '(x1,y1,x2,y2)'.
            action_inputs[param_name.strip()] = param

            if "start_box" in param_name or "end_box" in param_name:
                ori_box = param
                # Remove parentheses and split the string by commas
                numbers = ori_box.replace("(", "").replace(")", "").split(",")

                # Convert to float and scale by 1000
                # Qwen2.5vl output absolute coordinates, qwen2vl output relative coordinates
                if model_type == "qwen25vl":
                    float_numbers = []
                    for num_idx, num in enumerate(numbers):
                        num = float(num)
                        if (num_idx + 1) % 2 == 0:
                            # Even positions are y coordinates -> divide by height.
                            float_numbers.append(float(num/smart_resize_height))
                        else:
                            float_numbers.append(float(num/smart_resize_width))
                else:
                    float_numbers = [float(num) / factor for num in numbers]

                if len(float_numbers) == 2:
                    # A single point: duplicate into a degenerate [x, y, x, y] box.
                    float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
                action_inputs[param_name.strip()] = str(float_numbers)

        # import pdb; pdb.set_trace()
        actions.append(
            {
                "reflection": reflection,
                "thought": thought,
                "action_type": action_type,
                "action_inputs": action_inputs,
                "text": text
            })
    return actions
|
||||
|
||||
def parsing_response_to_pyautogui_code(responses, image_height: int, image_width:int, input_swap:bool=True) -> str:
    '''
    Convert the model's structured action output into a pyautogui code string
    executable inside OSWorld.

    Args:
        responses: one action dict, or a list of them, shaped like:
            {
                "action_type": "hotkey",
                "action_inputs": {
                    "hotkey": "v ctrl",
                    "start_box": None,
                    "end_box": None
                }
            }
            Boxes are strings of 0-1 relative coordinates "[x1, y1, x2, y2]".
        image_height: screenshot height in pixels (to de-normalize boxes).
        image_width: screenshot width in pixels.
        input_swap: when True, `type` actions paste via the clipboard
            (pyperclip + ctrl-v) instead of pyautogui.write.

    Returns:
        The generated pyautogui code string, or one of the sentinel strings
        "DONE", "FAIL", "WAIT".
    '''

    pyautogui_code = "import pyautogui\nimport time\n"
    if isinstance(responses, dict):
        responses = [responses]
    for response_id, response in enumerate(responses):
        observation = response.get("observation", "")
        thought = response.get("thought", "")

        if response_id == 0:
            # Embed observation/thought as a docstring header in the script.
            pyautogui_code += f"'''\nObservation:\n{observation}\n\nThought:\n{thought}\n'''\n"
        else:
            # Pause between consecutive actions of the same response.
            pyautogui_code += "\ntime.sleep(1)\n"

        action_dict = response
        response_text = action_dict.get("text", "")
        action_type = action_dict.get("action_type")
        action_inputs = action_dict.get("action_inputs", {})

        if action_type == "hotkey":
            # Accept either "key" or "hotkey" as the parameter name.
            if "key" in action_inputs:
                hotkey = action_inputs.get("key", "")
            else:
                hotkey = action_inputs.get("hotkey", "")

            # Normalize browser-style arrow names to pyautogui key names.
            if hotkey == "arrowleft":
                hotkey = "left"
            elif hotkey == "arrowright":
                hotkey = "right"
            elif hotkey == "arrowup":
                hotkey = "up"
            elif hotkey == "arrowdown":
                hotkey = "down"

            if hotkey:
                keys = hotkey.split()  # keys are space-separated
                convert_keys = []
                for key in keys:
                    if key == "space":
                        key = ' '
                    convert_keys.append(key)
                pyautogui_code += f"\npyautogui.hotkey({', '.join([repr(k) for k in convert_keys])})"

        elif action_type == "press":
            # Accept either "key" or "press" as the parameter name.
            if "key" in action_inputs:
                key_to_press = action_inputs.get("key", "")
            else:
                key_to_press = action_inputs.get("press", "")

            # BUGFIX: this normalization previously tested `hotkey`, a variable
            # left over from the "hotkey" branch — a NameError when a press
            # action came first, and the arrow/space mapping never applied to
            # the key actually pressed.
            if key_to_press == "arrowleft":
                key_to_press = "left"
            elif key_to_press == "arrowright":
                key_to_press = "right"
            elif key_to_press == "arrowup":
                key_to_press = "up"
            elif key_to_press == "arrowdown":
                key_to_press = "down"
            elif key_to_press == "space":
                key_to_press = " "

            if key_to_press:
                # Simulate pressing a single key
                pyautogui_code += f"\npyautogui.press({repr(key_to_press)})"

        elif action_type == "keyup":
            key_to_up = action_inputs.get("key", "")
            pyautogui_code += f"\npyautogui.keyUp({repr(key_to_up)})"

        elif action_type == "keydown":
            key_to_down = action_inputs.get("key", "")
            pyautogui_code += f"\npyautogui.keyDown({repr(key_to_down)})"

        elif action_type == "type":
            # Type text, either via clipboard paste (input_swap) or direct write.
            content = action_inputs.get("content", "")
            content = escape_single_quotes(content)
            stripped_content = content
            if content.endswith("\n") or content.endswith("\\n"):
                # Trailing newline means "press enter afterwards", not literal text.
                stripped_content = stripped_content.rstrip("\\n").rstrip("\n")
            if content:
                if input_swap:
                    pyautogui_code += "\nimport pyperclip"
                    pyautogui_code += f"\npyperclip.copy('{stripped_content}')"
                    pyautogui_code += "\npyautogui.hotkey('ctrl', 'v')"
                    pyautogui_code += "\ntime.sleep(0.5)\n"
                    if content.endswith("\n") or content.endswith("\\n"):
                        pyautogui_code += "\npyautogui.press('enter')"
                else:
                    pyautogui_code += f"\npyautogui.write('{stripped_content}', interval=0.1)"
                    pyautogui_code += "\ntime.sleep(0.5)\n"
                    if content.endswith("\n") or content.endswith("\\n"):
                        pyautogui_code += "\npyautogui.press('enter')"


        elif action_type in ["drag", "select"]:
            # Drag from the center of start_box to the center of end_box.
            start_box = action_inputs.get("start_box")
            end_box = action_inputs.get("end_box")
            if start_box and end_box:
                # NOTE: boxes are "[x1, y1, x2, y2]" strings produced by our own
                # parser, so eval here consumes trusted, pre-validated text.
                x1, y1, x2, y2 = eval(start_box)  # Assuming box is in [x1, y1, x2, y2]
                sx = round(float((x1 + x2) / 2) * image_width, 3)
                sy = round(float((y1 + y2) / 2) * image_height, 3)
                x1, y1, x2, y2 = eval(end_box)  # Assuming box is in [x1, y1, x2, y2]
                ex = round(float((x1 + x2) / 2) * image_width, 3)
                ey = round(float((y1 + y2) / 2) * image_height, 3)
                pyautogui_code += (
                    f"\npyautogui.moveTo({sx}, {sy})\n"
                    f"\npyautogui.dragTo({ex}, {ey}, duration=1.0)\n"
                )

        elif action_type == "scroll":
            # Scroll at the (optional) target point, in the given direction.
            start_box = action_inputs.get("start_box")
            if start_box:
                x1, y1, x2, y2 = eval(start_box)  # Assuming box is in [x1, y1, x2, y2]
                x = round(float((x1 + x2) / 2) * image_width, 3)
                y = round(float((y1 + y2) / 2) * image_height, 3)

                # # Click the target area first, then scroll
                # pyautogui_code += f"\npyautogui.click({x}, {y}, button='left')"
            else:
                x = None
                y = None
            direction = action_inputs.get("direction", "")

            if x is None:
                if "up" in direction.lower():
                    pyautogui_code += "\npyautogui.scroll(5)"
                elif "down" in direction.lower():
                    pyautogui_code += "\npyautogui.scroll(-5)"
            else:
                if "up" in direction.lower():
                    pyautogui_code += f"\npyautogui.scroll(5, x={x}, y={y})"
                elif "down" in direction.lower():
                    pyautogui_code += f"\npyautogui.scroll(-5, x={x}, y={y})"

        elif action_type in ["click", "left_single", "left_double", "right_single", "hover"]:
            # Mouse actions at the center of start_box.
            start_box = action_inputs.get("start_box")
            start_box = str(start_box)
            if start_box:
                start_box = eval(start_box)
                if start_box is None:
                    # NOTE(review): reachable when the model omitted start_box
                    # (str(None) is truthy); the len() below then raises.
                    logger.warning(f"[Warning] start_box is None and wired condition:\n{action_inputs}")

                if len(start_box) == 4:
                    x1, y1, x2, y2 = start_box  # Assuming box is in [x1, y1, x2, y2]
                elif len(start_box) == 2:
                    x1, y1 = start_box
                    x2 = x1
                    y2 = y1
                x = round(float((x1 + x2) / 2) * image_width, 3)
                y = round(float((y1 + y2) / 2) * image_height, 3)
                if action_type == "left_single" or action_type == "click":
                    pyautogui_code += f"\npyautogui.click({x}, {y}, button='left')"
                elif action_type == "left_double":
                    pyautogui_code += f"\npyautogui.doubleClick({x}, {y}, button='left')"
                elif action_type == "right_single":
                    pyautogui_code += f"\npyautogui.click({x}, {y}, button='right')"
                elif action_type == "hover":
                    pyautogui_code += f"\npyautogui.moveTo({x}, {y})"

        elif action_type in ["finished"]:
            pyautogui_code = "DONE"
            print(f"FINISHED:response_text: {response_text}")
            print(f"FINISHED:response: {str(response)}")
            # Downgrade to FAIL when the final answer contains a failure phrase.
            for failure_indicator in FAILURE_INDICATORS:
                if failure_indicator in response_text:
                    pyautogui_code = "FAIL"
                    break
        elif action_type in ["wait"]:
            pyautogui_code = "WAIT"

        elif action_type in ["call_user"]:
            pyautogui_code = "FAIL"
        else:
            pyautogui_code += f"\n# Unrecognized action type: {action_type}"

    return pyautogui_code
|
||||
|
||||
def add_box_token(input_string):
    """Wrap every start_box/end_box coordinate pair in <|box_start|>/<|box_end|>
    tokens and return the result as a single-text content list."""
    if "Action: " in input_string and "start_box=" in input_string:
        # Everything up to and including the first "Action: " is kept verbatim.
        prefix = input_string.split("Action: ")[0] + "Action: "
        raw_actions = input_string.split("Action: ")[1:]
        rebuilt = []
        for raw_action in raw_actions:
            raw_action = raw_action.strip()
            # Pull out every coordinate pair attached to a start_box/end_box.
            coord_pairs = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", raw_action)

            tokenized = raw_action
            for box_kind, x, y in coord_pairs:
                tokenized = tokenized.replace(
                    f"{box_kind}='({x},{y})'",
                    f"{box_kind}='<|box_start|>({x},{y})<|box_end|>'",
                )
            rebuilt.append(tokenized)

        final_string = prefix + "\n\n".join(rebuilt)
    else:
        final_string = input_string
    return [{"type": "text", "text": final_string}]
|
||||
|
||||
def pil_to_base64(image):
    """Convert PIL Image or bytes to base64 string"""
    if isinstance(image, bytes):
        # Raw bytes: encode directly.
        return base64.b64encode(image).decode("utf-8")
    # PIL image: serialize to PNG in memory, then encode.
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
|
@ -1,686 +0,0 @@
|
|||
"""
|
||||
Dart Agent - Custom agent for GUI automation using Dart models
|
||||
Based on UITARSAgent structure but using Dart-specific utilities and prompts
|
||||
"""
|
||||
import ast
|
||||
import base64
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Any
|
||||
from PIL import Image
|
||||
from openai import OpenAI
|
||||
import backoff
|
||||
import openai
|
||||
import requests
|
||||
from requests.exceptions import SSLError
|
||||
from google.api_core.exceptions import (
|
||||
BadRequest,
|
||||
InternalServerError,
|
||||
InvalidArgument,
|
||||
ResourceExhausted,
|
||||
)
|
||||
|
||||
# Import Dart-specific utilities and prompts
|
||||
from mm_agents.dart_gui.utils import (
|
||||
pil_to_base64,
|
||||
parse_action_to_structure_output,
|
||||
parsing_response_to_pyautogui_code,
|
||||
parse_action,
|
||||
escape_single_quotes,
|
||||
round_by_factor,
|
||||
ceil_by_factor,
|
||||
floor_by_factor,
|
||||
linear_resize,
|
||||
smart_resize,
|
||||
add_box_token,
|
||||
IMAGE_FACTOR,
|
||||
MIN_PIXELS,
|
||||
MAX_PIXELS,
|
||||
MAX_RATIO,
|
||||
FINISH_WORD,
|
||||
WAIT_WORD,
|
||||
ENV_FAIL_WORD,
|
||||
CALL_USER
|
||||
)
|
||||
|
||||
from mm_agents.dart_gui.prompts import (
|
||||
COMPUTER_USE_PROMPT,
|
||||
COMPUTER_USE_PROMPT_WITH_CALL_USER,
|
||||
UITARS_ACTION_SPACE,
|
||||
UITARS_CALL_USR_ACTION_SPACE,
|
||||
UITARS_USR_PROMPT_THOUGHT,
|
||||
UITARS_USR_PROMPT_NOTHOUGHT
|
||||
)
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
class DartAgent:
|
||||
def __init__(
    self,
    model: str,
    runtime_conf: Dict,
    platform="ubuntu",
    max_tokens=1000,
    top_p=0.9,
    top_k=1.0,
    temperature=0.0,
    action_space="pyautogui",
    observation_type="screenshot",
    max_trajectory_length=50,
    model_type="qwen25vl",
    **kwargs
):
    """Configure the Dart GUI agent.

    Args:
        model: model identifier sent to the inference endpoint.
        runtime_conf: dict of overrides for sampling, prompt style, history
            limits, and the Dart endpoint (dart_api_key / dart_base_url).
        platform: target OS of the controlled desktop.
        max_tokens, top_p, top_k, temperature: sampling defaults, each
            overridable through runtime_conf.
        action_space: action format expected by the environment.
        observation_type: observation modality (screenshot by default).
        max_trajectory_length: cap on trajectory steps.
        model_type: coordinate convention of the model ("qwen25vl" = absolute).
    """
    self.model = model
    self.platform = platform
    self.action_space = action_space
    self.observation_type = observation_type
    self.max_trajectory_length = max_trajectory_length
    self.model_type = model_type
    self.runtime_conf = runtime_conf

    # Extract runtime configuration parameters
    self.max_tokens = self.runtime_conf.get("max_tokens", max_tokens)
    self.top_p = self.runtime_conf.get("top_p", top_p)
    self.top_k = self.runtime_conf.get("top_k", top_k)
    self.temperature = self.runtime_conf.get("temperature", temperature)
    self.infer_mode = self.runtime_conf.get("infer_mode", "dart_mode")
    self.prompt_style = self.runtime_conf.get("prompt_style", "dart_style")
    self.input_swap = self.runtime_conf.get("input_swap", False)
    self.language = self.runtime_conf.get("language", "English")
    self.max_pixels = self.runtime_conf.get("max_pixels", MAX_PIXELS)
    self.min_pixels = self.runtime_conf.get("min_pixels", MIN_PIXELS)
    self.history_n = self.runtime_conf.get("history_n", 5)

    # Dart specific configurations
    self.max_images = self.runtime_conf.get("max_images", 5)
    self.max_texts = self.runtime_conf.get("max_texts", 35)

    # Initialize OpenAI client - use Dart API if provided
    dart_api_key = self.runtime_conf.get("dart_api_key", "")
    dart_base_url = self.runtime_conf.get("dart_base_url", "")

    if dart_base_url:
        # Is this a direct generation endpoint (URL contains /generate)?
        if '/generate' in dart_base_url:
            # Use the provided URL as-is; do not append /v1.
            logger.info(f"使用直接生成端点: {dart_base_url}")
            self.dart_direct_url = dart_base_url
            self.vlm = None  # no OpenAI client in direct mode
        else:
            # Conventional OpenAI-compatible endpoint; ensure a /v1 suffix.
            if not dart_base_url.endswith('/v1'):
                dart_base_url = dart_base_url.rstrip('/') + '/v1'

            self.vlm = OpenAI(
                base_url=dart_base_url,
                api_key=dart_api_key,
            )
            self.dart_direct_url = None
    else:
        # Fallback to environment variables
        base_url = os.environ.get('DART_API_URL', os.environ.get('DOUBAO_API_URL'))
        if base_url:
            if '/generate' in base_url:
                # Direct generation endpoint.
                self.dart_direct_url = base_url
                self.vlm = None
            else:
                if not base_url.endswith('/v1'):
                    base_url = base_url.rstrip('/') + '/v1'
                self.vlm = OpenAI(
                    base_url=base_url,
                    api_key=os.environ.get('DART_API_KEY', os.environ.get('DOUBAO_API_KEY')),
                )
                self.dart_direct_url = None
        else:
            # No endpoint configured at all.
            self.vlm = None
            self.dart_direct_url = None

    # Initialize trajectory storage - similar to trajectory_runner.py
    self.thoughts = []
    self.actions = []
    self.observations = []
    self.history_images = []
    self.history_responses = []

    # Message handling similar to trajectory_runner.py
    self.base_messages = []  # for model client (with base64 images)
    self.base_messages_for_save = []  # for storage (with file paths)
    self.prompt_dialogue = []  # for model client
    self.save_dialogue = []  # for storage
    self.save_dialogue_full = []  # full storage (keeps every image path)
    self.image_refs = []  # record image position

    # All image paths storage - to keep track of all images even when trimmed
    self.all_image_paths = []

    # Current screenshot file path for proper saving
    self.current_screenshot_path = None

    # Configure prompt and action space based on mode
    if self.infer_mode == "dart_mode":
        self.prompt_action_space = UITARS_ACTION_SPACE
        self.prompt_template = COMPUTER_USE_PROMPT
    else:
        # For qwen2vl_user mode
        self.prompt_action_space = UITARS_CALL_USR_ACTION_SPACE
        if self.prompt_style == "qwen2vl_user":
            self.prompt_template = UITARS_USR_PROMPT_THOUGHT
        elif self.prompt_style == "qwen2vl_no_thought":
            self.prompt_template = UITARS_USR_PROMPT_NOTHOUGHT
        else:
            self.prompt_template = UITARS_USR_PROMPT_THOUGHT

    # Denominator used when converting model coordinates back to 0-1 range.
    self.action_parse_res_factor = 1000

    logger.info(f"Initialized DartAgent with model: {self.model}, mode: {self.infer_mode}")
|
||||
|
||||
def reset(self, runtime_logger=None):
    """Clear all per-task state so the agent can start a fresh trajectory."""
    # Trajectory history and dialogue buffers all reset to fresh empty lists.
    for attr in (
        "thoughts", "actions", "observations",
        "history_images", "history_responses",
        "base_messages", "base_messages_for_save",
        "prompt_dialogue", "save_dialogue", "save_dialogue_full",
        "image_refs", "all_image_paths",
    ):
        setattr(self, attr, [])
    self.current_screenshot_path = None

    logger.info("DartAgent reset")
|
||||
|
||||
def set_base_messages(self, instruction: str):
    """Seed the conversation with the system message and the formatted task
    prompt (mirrors the message layout built in task_loader.py)."""
    from copy import deepcopy

    task_prompt = COMPUTER_USE_PROMPT.format(
        instruction=instruction,
        language=self.language,
    )
    self.base_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [{"type": "text", "text": task_prompt}]},
    ]

    # Independent copy for the on-disk version (file paths instead of base64).
    self.base_messages_for_save = deepcopy(self.base_messages)
|
||||
|
||||
def set_current_screenshot_path(self, screenshot_path: str):
    """Record where the latest screenshot was written on disk.

    The stored path is attached to the save-side message copies so that
    persisted dialogues reference files instead of base64 payloads.
    """
    self.current_screenshot_path = screenshot_path
||||
def predict(
    self, instruction: str, obs: Dict, last_action_after_obs: Dict = None
) -> tuple:
    """
    Predict the next action(s) based on the current observation.

    Args:
        instruction: Natural-language task instruction.
        obs: Observation dict; must contain "screenshot".
        last_action_after_obs: Unused here; kept for interface compatibility.

    Returns: (response_text, actions_list)
        actions_list is a list of pyautogui code strings, or one of the
        sentinel lists ["DONE"] / ["FAIL"] / ["WAIT"].
    """
    # Initialize base messages if not set (first call of the episode).
    if not self.base_messages:
        self.set_base_messages(instruction)

    # Store current observation
    self._add_observation(obs)

    # For first step, attach the screenshot to the initial user message;
    # afterwards each screenshot becomes its own user turn.
    if len(self.observations) == 1:
        self._set_first_frame(obs["screenshot"], self.current_screenshot_path)
    else:
        # For subsequent steps, add the new image to dialogue
        # This represents the result of the previous action
        self._add_image(obs["screenshot"], self.current_screenshot_path)

    # Build prompt messages (base_messages + prompt_dialogue)
    messages = self._build_messages()

    # Call model to get response; None means all retries were exhausted.
    prediction = self._call_model(messages)
    if prediction is None:
        return "client error", ["DONE"]

    # Store response and parse actions
    self._add_text(prediction)

    # Parse response to actions
    try:
        image_size = self._get_current_image_size()
        actions = self._parse_and_convert_actions(prediction, image_size)

        # Check for terminal actions.
        # NOTE(review): _parse_and_convert_actions returns strings while
        # _check_terminal_actions only inspects dict entries — confirm this
        # terminal check can ever fire.
        terminal_action = self._check_terminal_actions(actions)
        if terminal_action:
            self.actions.append(actions)
            return prediction, [terminal_action]

    except Exception as e:
        logger.error(f"Parsing action error: {prediction}, error: {e}")
        return f"Parsing action error: {prediction}, error: {e}", ["DONE"]

    self.actions.append(actions)
    # Check max steps: force failure once the response budget is used up.
    if len(self.history_responses) >= self.max_trajectory_length:
        actions = ["FAIL"]

    return prediction, actions
||||
# Retries predict() on transient provider errors: constant 30 s interval,
# at most 10 attempts before the exception propagates.
@backoff.on_exception(
    backoff.constant,
    (
        # General exceptions
        SSLError,
        # OpenAI exceptions
        openai.RateLimitError,
        openai.BadRequestError,
        openai.InternalServerError,
        # Google exceptions
        # NOTE(review): the bare InternalServerError below presumably refers
        # to the Google API exception imported at module level — verify it
        # does not shadow openai.InternalServerError.
        InvalidArgument,
        ResourceExhausted,
        InternalServerError,
        BadRequest,
    ),
    interval=30,
    max_tries=10,
)
def predict_with_backoff(self, instruction: str, obs: Dict, last_action_after_obs: Dict = None):
    """Predict with backoff for rate limiting and temporary errors.

    Thin retrying wrapper around predict(); same signature and return value.
    """
    return self.predict(instruction, obs, last_action_after_obs)
||||
def get_trajectory(self) -> List[Dict]:
    """Assemble the per-step trajectory for saving.

    Returns one record per stored observation; thought and action fall back
    to "" / [] when their lists are shorter than the observation list.
    """
    steps = []
    for idx, observation in enumerate(self.observations):
        thought = self.thoughts[idx] if idx < len(self.thoughts) else ""
        action = self.actions[idx] if idx < len(self.actions) else []
        steps.append({
            "observation": observation,
            "thought": thought,
            "action": action,
        })
    return steps
||||
def get_full_messages(self) -> List[Dict]:
    """Return the complete conversation for persistence.

    Concatenates the save-side base messages (system prompt + initial user
    turn) with the never-trimmed dialogue so every image turn is included.
    """
    return [*self.base_messages_for_save, *self.save_dialogue_full]
||||
def get_all_image_paths(self) -> List[str]:
    """Return a defensive copy of every screenshot path used so far."""
    return list(self.all_image_paths)
||||
|
||||
# ========== Private Methods ==========
|
||||
|
||||
def _validate_trajectory(self):
|
||||
"""Validate trajectory consistency"""
|
||||
assert len(self.observations) == len(self.actions) and len(self.actions) == len(
|
||||
self.thoughts
|
||||
), "The number of observations and actions should be the same."
|
||||
|
||||
def _add_observation(self, obs: Dict):
|
||||
"""Process observation and add to history"""
|
||||
# Store observation
|
||||
if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
|
||||
base64_image = obs["screenshot"]
|
||||
try:
|
||||
# Handle accessibility tree if needed
|
||||
linearized_accessibility_tree = None
|
||||
if self.observation_type == "screenshot_a11y_tree" and "accessibility_tree" in obs:
|
||||
# For now, we'll skip accessibility tree processing in Dart mode
|
||||
linearized_accessibility_tree = None
|
||||
except:
|
||||
linearized_accessibility_tree = None
|
||||
|
||||
if self.observation_type == "screenshot_a11y_tree":
|
||||
self.observations.append({
|
||||
"screenshot": base64_image,
|
||||
"accessibility_tree": linearized_accessibility_tree,
|
||||
})
|
||||
else:
|
||||
self.observations.append({
|
||||
"screenshot": base64_image,
|
||||
"accessibility_tree": None
|
||||
})
|
||||
else:
|
||||
raise ValueError("Invalid observation_type type: " + self.observation_type)
|
||||
|
||||
|
||||
def _build_messages(self) -> List[Dict]:
|
||||
"""Build messages for model API call - similar to trajectory_runner._build_messages"""
|
||||
return self.base_messages + self.prompt_dialogue
|
||||
|
||||
def _call_model(self, messages: List[Dict]) -> str:
    """Call model with retry logic.

    Tries up to three times; returns the model text on success, or None
    once every attempt has failed (callers treat None as a client error).
    """
    try_times = 3
    while try_times > 0:
        try:
            # If a direct generation endpoint is configured, use it.
            if hasattr(self, 'dart_direct_url') and self.dart_direct_url:
                prediction = self._call_direct_generate_endpoint(messages)
            else:
                # Otherwise go through the standard OpenAI-compatible client.
                response = self.vlm.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    frequency_penalty=1,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p
                )
                prediction = response.choices[0].message.content

            logger.info(f"Model response: {prediction}")
            return prediction

        except Exception as e:
            logger.error(f"Error when fetching response from client: {e}")
            try_times -= 1
            # Give up only after the retry budget is exhausted; otherwise
            # the while-loop re-issues the request.
            if try_times <= 0:
                logger.error("Reach max retry times to fetch response from client")
                return None
    # Defensive fallback; the loop above always returns before reaching here.
    return None
|
||||
|
||||
def _call_direct_generate_endpoint(self, messages: List[Dict]) -> str:
    """Call the direct generation endpoint over HTTP.

    POSTs an OpenAI-style payload to self.dart_direct_url with up to three
    attempts (60 s timeout each), then extracts the model text from
    whichever response schema the endpoint used.

    Raises:
        Exception: Re-raised after all attempts fail or on parse errors.
    """
    try:

        # Build the request payload (OpenAI chat-completions shaped).
        payload = {
            "messages": messages,
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "frequency_penalty": 1
        }

        # Attach the API key to the headers.
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.runtime_conf.get('dart_api_key', '')}"
        }

        # Retry mechanism: at most 3 attempts, 60 s timeout per request.
        max_retries = 3
        response = None

        for attempt in range(max_retries):
            try:
                logger.info(f"尝试第 {attempt + 1} 次请求...")
                response = requests.post(
                    self.dart_direct_url,
                    json=payload,
                    headers=headers,
                    timeout=60
                )
                response.raise_for_status()
                break  # Success: leave the retry loop.
            except Exception as e:
                logger.warning(f"第 {attempt + 1} 次请求失败: {e}")
                if attempt == max_retries - 1:  # Last attempt also failed.
                    logger.error(f"所有 {max_retries} 次重试都失败了")
                    raise e
                else:
                    logger.info(f"等待后重试...")
                    import time
                    time.sleep(2)  # Wait 2 seconds before retrying.

        # Parse the response body.
        result = response.json()

        # Try the possible response formats in order of preference.
        if 'choices' in result and len(result['choices']) > 0:
            # OpenAI-compatible format.
            return result['choices'][0]['message']['content']
        elif 'response' in result:
            # Plain "response" field.
            return result['response']
        elif 'text' in result:
            # "text" field.
            return result['text']
        elif 'content' in result:
            # "content" field.
            return result['content']
        else:
            # No recognised field: fall back to stringifying the whole body.
            logger.warning(f"未知的响应格式: {result}")
            return str(result)

    except Exception as e:
        logger.error(f"直接端点调用失败: {e}")
        raise e
|
||||
|
||||
def _add_text(self, assistant_txt: str):
    """Record an assistant response in history and all dialogue buffers.

    Mirrors trajectory_runner._add_text; box tokens are inserted so the
    text can be replayed to the model, then the buffers are trimmed.
    """
    self.history_responses.append(assistant_txt)
    self.thoughts.append(assistant_txt)

    # One shared message object is appended to every buffer, matching the
    # trajectory_runner behaviour.
    message = {
        "role": "assistant",
        "content": add_box_token(assistant_txt)
    }
    for bucket in (self.prompt_dialogue, self.save_dialogue, self.save_dialogue_full):
        bucket.append(message)

    self._trim()
|
||||
|
||||
def _set_first_frame(self, obs_img: bytes, frame_path: str = None):
    """Set first frame in base_messages - similar to trajectory_runner._set_first_frame.

    Appends the screenshot (base64) to the initial user message, records a
    file path in the save-side copy, and registers an image_ref so _trim
    can later evict it.
    """
    # Prompt-side copy carries the inline base64 image.
    self.base_messages[1]["content"].append(
        {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64," + pil_to_base64(obs_img)}
        }
    )

    # Use actual frame path if provided, otherwise use current_screenshot_path or placeholder
    if frame_path:
        first_frame_path = frame_path
    elif self.current_screenshot_path:
        first_frame_path = self.current_screenshot_path
    else:
        first_frame_path = "first_frame.png"

    # Store in all_image_paths
    self.all_image_paths.append(first_frame_path)

    # Save-side copy references the file path instead of base64 data.
    self.base_messages_for_save[1]["content"].append(
        {
            "type": "image_url",
            "image_url": first_frame_path
        }
    )

    # Register the image so _trim knows where to pop it from (message 1 of
    # base_messages, at the content index just appended).
    self.image_refs.append(
        {"source": "base", "msg_idx": 1,
         "content_idx": len(self.base_messages[1]["content"]) - 1}
    )
|
||||
|
||||
def _add_image(self, img_bytes: bytes, frame_path: str = None):
    """Add image to dialogue - similar to trajectory_runner._add_image.

    The prompt-side dialogue gets the inline base64 image; both save-side
    dialogues get a file path instead.  An image_ref is registered and the
    buffers are trimmed to the configured limits.
    """
    self.prompt_dialogue.append({
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64," + pil_to_base64(img_bytes)}
        }]
    })

    # Use actual frame path if provided, otherwise use current_screenshot_path
    if frame_path:
        image_url = frame_path
    elif self.current_screenshot_path:
        image_url = self.current_screenshot_path
    else:
        # Fallback to a placeholder - this should rarely happen in practice
        image_url = f"frame_{len(self.save_dialogue)}.png"

    # Store in all_image_paths for complete record
    self.all_image_paths.append(image_url)

    # Add to save_dialogue (trimmed version)
    self.save_dialogue.append({
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": image_url
        }]
    })

    # Add to save_dialogue_full (complete version - never trimmed)
    self.save_dialogue_full.append({
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": image_url
        }]
    })

    # Dialogue images are referenced by message index only (content_idx is
    # None because the whole user message is removed on eviction).
    self.image_refs.append(
        {"source": "dialogue", "msg_idx": len(self.prompt_dialogue) - 1,
         "content_idx": None}
    )

    self._trim()
|
||||
|
||||
def _trim(self):
    """Ensure image num ≤ max_images and assistant text num ≤ max_texts - similar to trajectory_runner._trim.

    Evicts oldest-first: images are dropped via image_refs, assistant
    texts by scanning prompt_dialogue from the front.
    """
    img_cnt = len(self.image_refs)
    txt_cnt = sum(m["role"] == "assistant" for m in self.prompt_dialogue)

    while img_cnt > self.max_images or txt_cnt > self.max_texts:
        # Too many images: drop the earliest one.
        if img_cnt > self.max_images:
            ref = self.image_refs.pop(0)
            if ref["source"] == "base":
                # Image lives inside a base message's content list.
                self.base_messages[ref["msg_idx"]]["content"].pop(ref["content_idx"])
            else:  # dialogue image: remove the whole user message
                self._remove_dialogue_msg(ref["msg_idx"])
            img_cnt -= 1
            continue

        # Too many texts: drop the earliest assistant message.
        if txt_cnt > self.max_texts:
            for i, m in enumerate(self.prompt_dialogue):
                if m["role"] == "assistant":
                    self._remove_dialogue_msg(i)
                    txt_cnt -= 1
                    break
|
||||
|
||||
def _remove_dialogue_msg(self, idx: int):
|
||||
"""Remove dialogue message and update refs - similar to trajectory_runner._remove_dialogue_msg"""
|
||||
self.prompt_dialogue.pop(idx)
|
||||
self.save_dialogue.pop(idx)
|
||||
# Note: save_dialogue_full is never trimmed, so we don't remove from it
|
||||
|
||||
# 更新 image_refs
|
||||
self.image_refs = [
|
||||
r if not (r["source"] == "dialogue" and r["msg_idx"] == idx)
|
||||
else None # 同一条被删掉的图引用直接丢弃
|
||||
for r in self.image_refs
|
||||
]
|
||||
self.image_refs = [
|
||||
(
|
||||
{**r, "msg_idx": r["msg_idx"] - 1}
|
||||
if r and r["source"] == "dialogue" and r["msg_idx"] > idx # idx后的图片索引均-1
|
||||
else r
|
||||
)
|
||||
for r in self.image_refs
|
||||
if r # 剔除 None
|
||||
]
|
||||
|
||||
def _get_current_image_size(self) -> tuple:
|
||||
"""Get current image size for coordinate conversion"""
|
||||
if len(self.observations) > 0:
|
||||
try:
|
||||
current_image_bytes = self.observations[-1]["screenshot"]
|
||||
if isinstance(current_image_bytes, bytes):
|
||||
current_image = Image.open(BytesIO(current_image_bytes))
|
||||
return (current_image.height, current_image.width)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error getting image size: {e}")
|
||||
|
||||
# Fallback to default screen size
|
||||
return (1080, 1920)
|
||||
|
||||
def _parse_and_convert_actions(self, prediction: str, image_size: tuple) -> List[str]:
    """Parse response and convert to pyautogui actions - similar to trajectory_runner._parse.

    Args:
        prediction: Raw model response text.
        image_size: (height, width) of the current screenshot, used for
            coordinate conversion.

    Returns:
        A list of pyautogui code strings; a failing conversion for an
        individual action yields the sentinel "FAIL" in its place.
    """
    image_height, image_width = image_size

    # Parse the response into structured action dicts (coordinates are on
    # a grid of action_parse_res_factor and rescaled to the screenshot).
    parsed_responses = parse_action_to_structure_output(
        prediction,
        factor=self.action_parse_res_factor,
        origin_resized_height=image_height,
        origin_resized_width=image_width,
        model_type=self.model_type,
        max_pixels=self.max_pixels,
        min_pixels=self.min_pixels
    )

    # Convert parsed responses to pyautogui code strings, one per action.
    actions = []
    for parsed_response in parsed_responses:
        try:
            pyautogui_code = parsing_response_to_pyautogui_code(
                parsed_response,
                image_height=image_height,
                image_width=image_width,
                input_swap=self.input_swap
            )

            actions.append(pyautogui_code)

        except Exception as e:
            # Per-action failures are recorded as "FAIL" rather than
            # aborting the whole step.
            logger.error(f"Error generating pyautogui code: {e}")
            actions.append("FAIL")

    return actions
|
||||
|
||||
|
||||
|
||||
def _check_terminal_actions(self, actions: List[str]) -> str:
|
||||
"""Check if any action is terminal and return appropriate code"""
|
||||
for action in actions:
|
||||
if isinstance(action, dict) and "action_type" in action:
|
||||
action_type = action["action_type"]
|
||||
if action_type == FINISH_WORD:
|
||||
return "DONE"
|
||||
elif action_type == WAIT_WORD:
|
||||
return "WAIT"
|
||||
elif action_type == ENV_FAIL_WORD:
|
||||
return "FAIL"
|
||||
elif action_type == CALL_USER:
|
||||
return "FAIL"
|
||||
return None
|
||||
|
|
@ -1,653 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import json
|
||||
import logging
|
||||
import backoff
|
||||
import openai
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
|
||||
from mm_agents.evocua.utils import (
|
||||
process_image,
|
||||
encode_image,
|
||||
rewrite_pyautogui_text_inputs,
|
||||
project_coordinate_to_absolute_scale,
|
||||
log_messages
|
||||
)
|
||||
|
||||
from mm_agents.evocua.prompts import (
|
||||
S1_SYSTEM_PROMPT,
|
||||
S1_INSTRUTION_TEMPLATE,
|
||||
S1_STEP_TEMPLATE,
|
||||
S1_ACTION_HISTORY_TEMPLATE,
|
||||
S2_ACTION_DESCRIPTION,
|
||||
S2_DESCRIPTION_PROMPT_TEMPLATE,
|
||||
S2_SYSTEM_PROMPT,
|
||||
build_s2_tools_def
|
||||
)
|
||||
|
||||
logger = logging.getLogger("desktopenv.evocua")
|
||||
|
||||
class EvoCUAAgent:
|
||||
"""
|
||||
EvoCUA - A Native GUI agent model for desktop automation.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    model: str = "EvoCUA-S2",
    max_tokens: int = 32768,
    top_p: float = 0.9,
    temperature: float = 0.0,
    action_space: str = "pyautogui",
    observation_type: str = "screenshot",
    max_steps: int = 50,
    prompt_style: str = "S2",  # "S1" or "S2"
    max_history_turns: int = 4,
    screen_size: Tuple[int, int] = (1920, 1080),
    coordinate_type: str = "relative",
    password: str = "osworld-public-evaluation",
    resize_factor: int = 32,
    **kwargs
):
    """Configure an EvoCUA agent.

    Args:
        model: Model name passed to the LLM backend.
        max_tokens / top_p / temperature: Sampling parameters.
        action_space: Only "pyautogui" is supported (asserted below).
        observation_type: Only "screenshot" is supported (asserted below).
        max_steps: Step budget before the agent forces a FAIL.
        prompt_style: "S1" or "S2" prompting pipeline.
        max_history_turns: How many past turns are replayed verbatim.
        screen_size: Fallback (width, height) when the screenshot size
            cannot be read.
        coordinate_type: "relative" (0..999 grid) or "absolute" pixels.
        password: Task-environment password injected into the S1 prompt.
        resize_factor: Image-resize rounding factor for S2 preprocessing.
        **kwargs: Ignored; absorbs extra runner configuration.
    """
    self.model = model
    self.max_tokens = max_tokens
    self.top_p = top_p
    self.temperature = temperature
    self.action_space = action_space
    self.observation_type = observation_type
    self.max_steps = max_steps

    self.prompt_style = prompt_style
    assert self.prompt_style in ["S1", "S2"], f"Invalid prompt_style: {self.prompt_style}"

    self.max_history_turns = max_history_turns

    self.screen_size = screen_size
    self.coordinate_type = coordinate_type
    self.password = password
    self.resize_factor = resize_factor

    # Action space assertion
    assert self.action_space == "pyautogui", f"Invalid action space: {self.action_space}"
    assert self.observation_type == "screenshot", f"Invalid observation type: {self.observation_type}"

    # Per-episode state (cleared again by reset()).
    self.thoughts = []
    self.actions = []
    self.observations = []
    self.responses = []
    self.screenshots = []  # Stores encoded string
    self.cots = []  # For S1 style history
||||
|
||||
def reset(self, _logger=None, vm_ip=None):
    """Start a fresh episode, optionally swapping in a task-scoped logger.

    Args:
        _logger: When truthy, replaces the module-level logger.
        vm_ip: Accepted for interface compatibility; unused.
    """
    global logger
    if _logger:
        logger = _logger

    # Clear all per-episode state in one sweep.
    for attr in ("thoughts", "actions", "observations",
                 "responses", "screenshots", "cots"):
        setattr(self, attr, [])
|
||||
|
||||
def predict(self, instruction: str, obs: Dict) -> List:
    """
    Main prediction loop.

    Reads the screenshot's real resolution (falling back to the configured
    screen_size), encodes the image per prompt style, and dispatches to the
    S1 or S2 pipeline.  Returns whatever the chosen pipeline returns —
    a (response, pyautogui_code) pair.
    """

    logger.info(f"========================== {self.model} ===================================")
    logger.info(f"Instruction: \n{instruction}")

    screenshot_bytes = obs["screenshot"]

    try:
        original_img = Image.open(BytesIO(screenshot_bytes))
        original_width, original_height = original_img.size
    except Exception as e:
        logger.warning(f"Failed to read screenshot size, falling back to screen_size: {e}")
        original_width, original_height = self.screen_size

    if self.prompt_style == "S1":
        # S1 uses the raw screenshot, base64-encoded as-is.
        raw_b64 = encode_image(screenshot_bytes)
        self.screenshots.append(raw_b64)
        return self._predict_s1(instruction, obs, raw_b64)
    else:
        # S2 resizes the screenshot (rounded to resize_factor) before encoding.
        processed_b64, p_width, p_height = process_image(screenshot_bytes, factor=self.resize_factor)
        self.screenshots.append(processed_b64)
        return self._predict_s2(
            instruction,
            obs,
            processed_b64,
            p_width,
            p_height,
            original_width,
            original_height,
        )
|
||||
|
||||
|
||||
def _predict_s2(self, instruction, obs, processed_b64, p_width, p_height, original_width, original_height):
    """Run one S2-style prediction step.

    Builds tool-calling messages, queries the LLM (shrinking the replayed
    history when the context overflows), parses the tool calls into
    pyautogui code, and enforces the max_steps budget.

    Returns:
        (response, pyautogui_code) where pyautogui_code is a list of code
        strings or the sentinels ["DONE"] / ["FAIL"].
    """
    current_step = len(self.actions)
    current_history_n = self.max_history_turns

    response = None

    # The tool description advertises either the processed pixel grid or
    # the fixed 1000x1000 relative grid, matching coordinate_type.
    if self.coordinate_type == "absolute":
        resolution_info = f"* The screen's resolution is {p_width}x{p_height}."
    else:
        resolution_info = "* The screen's resolution is 1000x1000."

    description_prompt = S2_DESCRIPTION_PROMPT_TEMPLATE.format(resolution_info=resolution_info)

    tools_def = build_s2_tools_def(description_prompt)

    system_prompt = S2_SYSTEM_PROMPT.format(tools_xml=json.dumps(tools_def))

    # Retry loop for context length: each failed attempt drops one history
    # turn until the request fits or history is exhausted.
    while True:
        messages = self._build_s2_messages(
            instruction,
            processed_b64,
            current_step,
            current_history_n,
            system_prompt
        )

        try:
            response = self.call_llm({
                "model": self.model,
                "messages": messages,
                "max_tokens": self.max_tokens,
                "top_p": self.top_p,
                "temperature": self.temperature,
            })
            break
        except Exception as e:
            # Handle Context Too Large
            if self._should_giveup_on_context_error(e) and current_history_n > 0:
                current_history_n -= 1
                logger.warning(f"Context too large, retrying with history_n={current_history_n}")
            else:
                # Any other error (or no history left): give up; response
                # stays None and parsing returns empty results.
                logger.error(f"Error in predict: {e}")
                break

    self.responses.append(response)

    low_level_instruction, pyautogui_code = self._parse_response_s2(
        response, p_width, p_height, original_width, original_height
    )

    # Step-budget enforcement: force FAIL unless the model already ended
    # the episode with DONE/FAIL on this step.
    current_step = len(self.actions) + 1
    first_action = pyautogui_code[0] if pyautogui_code else ""
    if current_step >= self.max_steps and str(first_action).upper() not in ("DONE", "FAIL"):
        logger.warning(f"Reached maximum steps {self.max_steps}. Forcing termination with FAIL.")
        low_level_instruction = "Fail the task because reaching the maximum step limit."
        pyautogui_code = ["FAIL"]

    logger.info(f"Low level instruction: {low_level_instruction}")
    logger.info(f"Pyautogui code: {pyautogui_code}")

    self.actions.append(low_level_instruction)
    return response, pyautogui_code
|
||||
|
||||
def _build_s2_messages(self, instruction, current_img, step, history_n, system_prompt):
    """Assemble the S2 chat messages.

    Layout: system prompt; then up to *history_n* replayed (screenshot,
    assistant-response) turns — the first replayed user turn also carries
    the instruction plus a textual summary of all older actions; finally
    the current screenshot (with the instruction text only when there is
    no replayed history to carry it).
    """
    messages = [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}]

    # Summarise the actions that fall outside the replayed history window.
    previous_actions = []
    history_start_idx = max(0, step - history_n)
    for i in range(history_start_idx):
        if i < len(self.actions):
            previous_actions.append(f"Step {i+1}: {self.actions[i]}")
    previous_actions_str = "\n".join(previous_actions) if previous_actions else "None"

    # Add History
    history_len = min(history_n, len(self.responses))
    if history_len > 0:
        hist_responses = self.responses[-history_len:]
        # Screenshots are offset by one: each replayed response is paired
        # with the screenshot it was produced FROM, not the one after it.
        hist_imgs = self.screenshots[-history_len-1:-1]

        for i in range(history_len):
            if i < len(hist_imgs):
                screenshot_b64 = hist_imgs[i]
                if i == 0:
                    # First history item: Inject Instruction + Previous Actions Context
                    img_url = f"data:image/png;base64,{screenshot_b64}"
                    instruction_prompt = f"""
Please generate the next move according to the UI screenshot, instruction and previous actions.

Instruction: {instruction}

Previous actions:
{previous_actions_str}"""
                    messages.append({
                        "role": "user",
                        "content": [
                            {"type": "image_url", "image_url": {"url": img_url}},
                            {"type": "text", "text": instruction_prompt}
                        ]
                    })
                else:
                    # Later history turns: screenshot only, context already set.
                    img_url = f"data:image/png;base64,{screenshot_b64}"
                    messages.append({
                        "role": "user",
                        "content": [
                            {"type": "image_url", "image_url": {"url": img_url}},
                        ]
                    })

            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": hist_responses[i]}]
            })

    # Current Turn
    # We re-use previous_actions_str logic for the case where history_len == 0

    if history_len == 0:
        # First turn logic: Include Instruction + Previous Actions
        instruction_prompt = f"""
Please generate the next move according to the UI screenshot, instruction and previous actions.

Instruction: {instruction}

Previous actions:
{previous_actions_str}"""
        messages.append({
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{current_img}"}},
                {"type": "text", "text": instruction_prompt}
            ]
        })
    else:
        # Subsequent turns logic (context already in first history message): Image Only
        messages.append({
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{current_img}"}}
            ]
        })

    return messages
|
||||
|
||||
|
||||
def _parse_response_s2(
    self,
    response: str,
    processed_width: int = None,
    processed_height: int = None,
    original_width: Optional[int] = None,
    original_height: Optional[int] = None,
) -> Tuple[str, List[str]]:
    """
    Parse LLM response and convert it to low level action and pyautogui code.

    Scans the response for "Action:" narration lines and <tool_call> JSON
    blocks (or bare JSON-object lines), converting each computer_use tool
    call into pyautogui code with coordinates rescaled to the original
    screenshot resolution.

    Returns:
        (low_level_instruction, pyautogui_code) — the human-readable action
        summary and the list of pyautogui code strings / terminal sentinels.
    """
    # Prefer the real screenshot resolution (passed from predict), fallback to configured screen_size.
    if not (original_width and original_height):
        original_width, original_height = self.screen_size
    low_level_instruction = ""
    pyautogui_code: List[str] = []

    if response is None or not response.strip():
        return low_level_instruction, pyautogui_code

    def adjust_coordinates(x: float, y: float) -> Tuple[int, int]:
        # Map model-space coordinates into original screenshot pixels.
        if not (original_width and original_height):
            return int(x), int(y)
        if self.coordinate_type == "absolute":
            # scale from processed pixels to original
            if processed_width and processed_height:
                x_scale = original_width / processed_width
                y_scale = original_height / processed_height
                return int(x * x_scale), int(y * y_scale)
            return int(x), int(y)
        # relative: scale from 0..999 grid
        x_scale = original_width / 999
        y_scale = original_height / 999
        return int(x * x_scale), int(y * y_scale)

    def process_tool_call(json_str: str) -> None:
        # Decode one tool-call JSON payload and append the matching
        # pyautogui code to the enclosing pyautogui_code list.
        try:
            tool_call = json.loads(json_str)
            if tool_call.get("name") == "computer_use":
                args = tool_call["arguments"]
                action = args["action"]

                def _clean_keys(raw_keys):
                    # Strip artifacts like "keys=[...]" or quoted-list
                    # fragments the model sometimes emits around key names.
                    keys = raw_keys if isinstance(raw_keys, list) else [raw_keys]
                    cleaned_keys = []
                    for key in keys:
                        if isinstance(key, str):
                            if key.startswith("keys=["):
                                key = key[6:]
                            if key.endswith("]"):
                                key = key[:-1]
                            if key.startswith("['") or key.startswith('["'):
                                key = key[2:] if len(key) > 2 else key
                            if key.endswith("']") or key.endswith('"]'):
                                key = key[:-2] if len(key) > 2 else key
                            key = key.strip()
                            cleaned_keys.append(key)
                        else:
                            cleaned_keys.append(key)
                    return cleaned_keys

                if action == "left_click" or action == "click":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        pyautogui_code.append(f"pyautogui.click({adj_x}, {adj_y})")
                    else:
                        pyautogui_code.append("pyautogui.click()")

                elif action == "right_click":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        pyautogui_code.append(
                            f"pyautogui.rightClick({adj_x}, {adj_y})"
                        )
                    else:
                        pyautogui_code.append("pyautogui.rightClick()")

                elif action == "middle_click":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        pyautogui_code.append(
                            f"pyautogui.middleClick({adj_x}, {adj_y})"
                        )
                    else:
                        pyautogui_code.append("pyautogui.middleClick()")

                elif action == "double_click":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        pyautogui_code.append(
                            f"pyautogui.doubleClick({adj_x}, {adj_y})"
                        )
                    else:
                        pyautogui_code.append("pyautogui.doubleClick()")

                elif action == "triple_click":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        pyautogui_code.append(
                            f"pyautogui.tripleClick({adj_x}, {adj_y})"
                        )
                    else:
                        pyautogui_code.append("pyautogui.tripleClick()")

                elif action == "type":
                    text = args.get("text", "")

                    # Undo model-side escaping (e.g. literal "\n") so the
                    # per-character replay below sees real characters.
                    try:
                        text = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
                    except Exception as e:
                        logger.error(f"Failed to unescape text: {e}")

                    logger.info(f"Pyautogui code[before rewrite]: {text}")

                    # Replay the text one keypress at a time, quoting
                    # characters that would break the generated code.
                    result = ""
                    for char in text:
                        if char == '\n':
                            result += "pyautogui.press('enter')\n"
                        elif char == "'":
                            result += 'pyautogui.press("\'")\n'
                        elif char == '\\':
                            result += "pyautogui.press('\\\\')\n"
                        elif char == '"':
                            result += "pyautogui.press('\"')\n"
                        else:
                            result += f"pyautogui.press('{char}')\n"

                    pyautogui_code.append(result)
                    logger.info(f"Pyautogui code[after rewrite]: {pyautogui_code}")

                elif action == "key":
                    keys = _clean_keys(args.get("keys", []))

                    keys_str = ", ".join([f"'{key}'" for key in keys])
                    if len(keys) > 1:
                        pyautogui_code.append(f"pyautogui.hotkey({keys_str})")
                    else:
                        pyautogui_code.append(f"pyautogui.press({keys_str})")

                elif action == "key_down":
                    keys = _clean_keys(args.get("keys", []))
                    for k in keys:
                        pyautogui_code.append(f"pyautogui.keyDown('{k}')")

                elif action == "key_up":
                    # Released in reverse order to mirror key_down nesting.
                    keys = _clean_keys(args.get("keys", []))
                    for k in reversed(keys):
                        pyautogui_code.append(f"pyautogui.keyUp('{k}')")

                elif action == "scroll":
                    pixels = args.get("pixels", 0)
                    pyautogui_code.append(f"pyautogui.scroll({pixels})")

                elif action == "wait":
                    pyautogui_code.append("WAIT")

                elif action == "terminate":
                    # Termination should respect status:
                    # - success -> DONE
                    # - failure -> FAIL
                    # Backward compatible: missing status defaults to success.
                    status = args.get("status", "success")
                    if str(status).lower() == "failure":
                        pyautogui_code.append("FAIL")
                    else:
                        pyautogui_code.append("DONE")

                elif action == "mouse_move":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        pyautogui_code.append(
                            f"pyautogui.moveTo({adj_x}, {adj_y})"
                        )
                    else:
                        pyautogui_code.append("pyautogui.moveTo(0, 0)")

                elif action == "left_click_drag":
                    if "coordinate" in args:
                        x, y = args["coordinate"]
                        adj_x, adj_y = adjust_coordinates(x, y)
                        duration = args.get("duration", 0.5)
                        pyautogui_code.append(
                            f"pyautogui.dragTo({adj_x}, {adj_y}, duration={duration})"
                        )
                    else:
                        pyautogui_code.append("pyautogui.dragTo(0, 0)")
        except (json.JSONDecodeError, KeyError) as e:
            logger.error(f"Failed to parse tool call: {e}")

    # Line-oriented scan of the response: narration, tool-call blocks,
    # and bare JSON-object lines.
    lines = response.split("\n")
    inside_tool_call = False
    current_tool_call: List[str] = []

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # First "Action:" line becomes the low-level instruction summary.
        if line.lower().startswith(("action:")):
            if not low_level_instruction:
                low_level_instruction = line.split("Action:")[-1].strip()
            continue

        if line.startswith("<tool_call>"):
            inside_tool_call = True
            continue
        elif line.startswith("</tool_call>"):
            if current_tool_call:
                process_tool_call("\n".join(current_tool_call))
                current_tool_call = []
            inside_tool_call = False
            continue

        if inside_tool_call:
            current_tool_call.append(line)
            continue

        # Bare JSON line outside any <tool_call> tags: accept it if it
        # looks like a tool invocation.
        if line.startswith("{") and line.endswith("}"):
            try:
                json_obj = json.loads(line)
                if "name" in json_obj and "arguments" in json_obj:
                    process_tool_call(line)
            except json.JSONDecodeError:
                pass

    # Flush an unterminated <tool_call> block at end of response.
    if current_tool_call:
        process_tool_call("\n".join(current_tool_call))

    # Derive a summary from the first generated action when the model gave
    # no "Action:" narration.
    if not low_level_instruction and len(pyautogui_code) > 0:
        first_action = pyautogui_code[0]
        if "." in first_action:
            action_type = first_action.split(".", 1)[1].split("(", 1)[0]
        else:
            action_type = first_action.lower()
        low_level_instruction = f"Performing {action_type} action"

    return low_level_instruction, pyautogui_code
|
||||
|
||||
|
||||
|
||||
def _predict_s1(self, instruction, obs, processed_b64):
    """Run one S1-mode prediction step.

    Builds a chat transcript (system prompt + reconstructed step history +
    current screenshot and instruction), queries the LLM, parses the reply,
    and records the step in the instance history lists.

    Args:
        instruction: Natural-language task instruction.
        obs: Raw observation for this step (stored in self.observations).
        processed_b64: Base64-encoded PNG of the current screenshot.

    Returns:
        tuple: (raw LLM response text, list of executable code strings).
    """
    messages = [{"role": "system", "content": S1_SYSTEM_PROMPT.format(password=self.password)}]

    # Reconstruct History Logic for S1 mode: steps older than the most recent
    # `max_history_turns` are compressed into one text-only assistant message;
    # recent steps keep their screenshots as separate user turns.
    history_step_texts = []

    for i in range(len(self.actions)):
        # cots may lag behind actions; fall back to an empty dict.
        cot = self.cots[i] if i < len(self.cots) else {}

        # Step Content string: "# Step N:" header plus the recorded action.
        step_content = S1_STEP_TEMPLATE.format(step_num=i+1) + S1_ACTION_HISTORY_TEMPLATE.format(action=cot.get('action', ''))

        if i > len(self.actions) - self.max_history_turns:
            # Recent history: Add User(Image) and Assistant(Text)
            if i < len(self.screenshots) - 1: # Screenshot exists for this step
                # NOTE(review): the `- 1` excludes the newest screenshot entry —
                # presumably the current frame is already appended to
                # self.screenshots by the caller; confirm the append order.
                img = self.screenshots[i]
                messages.append({
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}
                    ]
                })
            messages.append({"role": "assistant", "content": step_content})
        else:
            # Old history: Collect text
            history_step_texts.append(step_content)
            # If this is the last step before the recent window, flush collected texts
            if i == len(self.actions) - self.max_history_turns:
                messages.append({
                    "role": "assistant",
                    "content": "\n".join(history_step_texts)
                })

    # Current screenshot + task instruction form the final user turn.
    messages.append({
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{processed_b64}"}},
            {"type": "text", "text": S1_INSTRUTION_TEMPLATE.format(instruction=instruction)}
        ]
    })

    response = self.call_llm({
        "model": self.model,
        "messages": messages,
        "max_tokens": self.max_tokens
    })

    low_level, codes, cot_data = self._parse_response_s1(response)

    # Record this step so the next call can reconstruct history.
    self.observations.append(obs)
    self.cots.append(cot_data)
    self.actions.append(low_level)
    self.responses.append(response)

    return response, codes
||||
def _parse_response_s1(self, response):
    """Parse an S1-mode LLM reply into (action text, executable codes, sections).

    Extracts the Observation / Thought / Action markdown sections plus the last
    fenced code block, then translates the code into the executor vocabulary:
    terminate -> DONE/FAIL, wait -> WAIT, anything else -> projected and
    rewritten pyautogui code.
    """
    section_patterns = {
        'observation': r'#{1,2}\s*Observation\s*:?[\n\r]+(.*?)(?=^#{1,2}\s|$)',
        'thought': r'#{1,2}\s*Thought\s*:?[\n\r]+(.*?)(?=^#{1,2}\s|$)',
        'action': r'#{1,2}\s*Action\s*:?[\n\r]+(.*?)(?=^#{1,2}\s|$)',
    }

    sections = {}
    for section_name, section_pattern in section_patterns.items():
        found = re.search(section_pattern, response, re.DOTALL | re.MULTILINE)
        if found:
            sections[section_name] = found.group(1).strip()

    # The last fenced block wins; missing code degrades to "FAIL".
    fenced_blocks = re.findall(r'```(?:code|python)?\s*(.*?)\s*```', response, re.DOTALL | re.IGNORECASE)
    code = fenced_blocks[-1].strip() if fenced_blocks else "FAIL"
    sections['code'] = code

    # Translate the extracted code into executor commands.
    if "computer.terminate" in code:
        final_code = ["DONE"] if "success" in code.lower() else ["FAIL"]
    elif "computer.wait" in code:
        final_code = ["WAIT"]
    else:
        code = project_coordinate_to_absolute_scale(
            code,
            self.screen_size[0],
            self.screen_size[1],
            self.coordinate_type,
            self.resize_factor,
        )
        logger.info(f"[rewrite before]: {code}")
        final_code = [rewrite_pyautogui_text_inputs(code)]
        logger.info(f"[rewrite after]: {final_code}")

    return sections.get('action', 'Acting'), final_code, sections
||||
@staticmethod
|
||||
def _should_giveup_on_context_error(e):
|
||||
"""对于 context length 相关的错误,立即放弃重试,交给外层处理"""
|
||||
error_str = str(e)
|
||||
return "Too Large" in error_str or "context_length_exceeded" in error_str or "413" in error_str
|
||||
|
||||
# Retry transient failures every 30s, up to 10 attempts; context-length errors
# (see _should_giveup_on_context_error) abort immediately for the caller to handle.
@backoff.on_exception(backoff.constant, Exception, interval=30, max_tries=10, giveup=_should_giveup_on_context_error.__func__)
def call_llm(self, payload):
    """Unified OpenAI-compatible API call.

    Args:
        payload: Dict with keys "model", "messages", "max_tokens".

    Returns:
        str: Content of the first choice's assistant message.

    Raises:
        Exception: Re-raises any client/API failure after logging it
            (which then drives the backoff retry above).
    """
    # Get env vars; "url-xxx"/"sk-xxx" are placeholders used when unset.
    base_url = os.environ.get("OPENAI_BASE_URL", "url-xxx")
    api_key = os.environ.get("OPENAI_API_KEY", "sk-xxx")

    # A fresh client per call; cheap relative to the network round-trip.
    client = openai.OpenAI(base_url=base_url, api_key=api_key)

    messages = payload["messages"]
    # Logs the request with base64 images truncated.
    log_messages(messages, "LLM Request")

    params = {
        "model": payload["model"],
        "messages": messages,
        "max_tokens": payload["max_tokens"],
        "temperature": self.temperature,
        "top_p": self.top_p
    }

    try:
        resp = client.chat.completions.create(**params)
        content = resp.choices[0].message.content
        logger.info(f"LLM Response:\n{content}")
        return content
    except Exception as e:
        logger.error(f"LLM Call failed: {e}")
        raise e
|
|
@ -1,148 +0,0 @@
|
|||
# System prompt for S1 mode. `{password}` is filled by _predict_s1; `{{...}}`
# placeholders are escaped braces that survive .format() into the prompt text.
# NOTE(review): "correnctness", "currect" and "warped" (for "wrapped") are typos
# inside the runtime prompt string — fixing them changes what the model sees;
# confirm before editing.
S1_SYSTEM_PROMPT = """You are a GUI agent. You are given a task, a screenshot of the screen and your previous interactions with the computer. You need to perform a series of actions to complete the task. The password of the computer is "{password}", use it when you need sudo rights. You need to **wait** explicitly for installation, waiting website loading or running commands to finish. Don't terminate the task unless you are sure the task is finished. If you find that you can't finish the task, or the task is not finished exactly as the instruction indicates (you have made progress but not finished the task completely), or the task is impossible to complete, you must report **failure**.

For each step, provide your response in this format:
# Step: {{step number}}
## Thought:
{{thought}}
## Action:
{{action}}
## Code:
{{code}}

For the Thought section, you should include the following parts:
- Reflection on the task when there is previous action:
- Consider the correnctness of previous action and its outcomes
- If the previous action was correct, describe the change in the state of the computer and reason
- If the previous action was incorrect, reflect on what went wrong and why
- Step by Step Progress Assessment:
- Add necessary information according to the history screenshots, former actions and current screenshot.
- Analyze what parts of the task have already been completed and how they contribute to the overall goal.
- Make a plan on how to complete the task based on the history and currect screenshot.
- Next Action Prediction:
- Propose the most possible next action and state the reason
- For Text Input Actions:
- Note current cursor position
- Consolidate repetitive actions (specify count for multiple keypresses)
- Describe expected final text outcome
- Use first-person perspective in reasoning

For the action section, you should provide clear, concise, and actionable instructions in one sentence.
- If the action involves interacting with a specific target:
- Describe target explicitly (if multiple elements share that name, you should distinguish the target) without using coordinates
- Specify element names when possible (use original language if non-English)
- Describe features (shape, color, position) if name unavailable
- If the action involves keyboard actions like 'press', 'write', 'hotkey':
- Consolidate repetitive keypresses with count
- Specify expected text outcome for typing actions

For the code section, you should output the corresponding code for the action. The code should be either PyAutoGUI code or one of the following functions warped in the code block:
- {{"name": "computer.wait", "description": "Make the computer wait for 20 seconds for installation, running code, etc.", "parameters": {{"type": "object", "properties": {{}}, "required": []}}}}
- {{"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {{"type": "object", "properties": {{"status": {{"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, {{"answer": {{"type": "string", "description": "The answer of the task"}}}}, "required": ["status"]}}}}
Examples for the code section:
```python
pyautogui.click(x=123, y=456)
```
```code
computer.terminate(status="success")
```
```code
computer.terminate(status="success", answer='''text''')
```"""
||||
# S1 prompt templates for generating trajectories.
# Header for one replayed history step ({step_num} is 1-based).
S1_STEP_TEMPLATE = "# Step {step_num}:\n"
# NOTE(review): "INSTRUTION" is a typo for "INSTRUCTION"; the name is referenced
# elsewhere (e.g. _predict_s1), so renaming requires updating every call site.
S1_INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"

# Rendered after the step header to replay a past action in the transcript.
S1_ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
||||
# S2 Prompts — prompt material for the tool-calling (S2) mode.
# Per-action help text injected into the `action` parameter of the
# computer_use tool schema (see build_s2_tools_def).
# NOTE(review): `hscroll` and `answer` are described here but are absent from
# the enum in build_s2_tools_def — confirm whether the schema should list them.
S2_ACTION_DESCRIPTION = """
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
* `key_down`: Press and HOLD the specified key(s) down in order (no release). Use this for stateful holds like holding Shift while clicking.
* `key_up`: Release the specified key(s) in reverse order.
* `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `scroll`: Performs a scroll of the mouse scroll wheel.
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status.
* `answer`: Answer a question.
"""

# Tool-level description; {resolution_info} is filled by the caller.
S2_DESCRIPTION_PROMPT_TEMPLATE = """Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You must click on desktop icons to start applications.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
{resolution_info}
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
* If you tried clicking on a program or link but it failed to load even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked."""

# System prompt for S2 mode; {tools_xml} is filled with the serialized tool
# definitions, and the doubled braces survive .format() as literal JSON braces.
S2_SYSTEM_PROMPT = """# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{tools_xml}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{{"name": <function-name>, "arguments": <args-json-object>}}
</tool_call>

# Response format

Response format for every step:
1) Action: a short imperative describing what to do in the UI.
2) A single <tool_call>...</tool_call> block containing only the JSON: {{"name": <function-name>, "arguments": <args-json-object>}}.

Rules:
- Output exactly in the order: Action, <tool_call>.
- Be brief: one sentence for Action.
- Do not output anything else outside those parts.
- If finishing, use action=terminate in the tool call."""
||||
def build_s2_tools_def(description_prompt):
    """Assemble the `computer_use` tool definition (function-calling schema).

    Args:
        description_prompt: Human-readable description for the tool as a whole.

    Returns:
        dict: A completion-style tool spec whose parameters describe the S2
        action vocabulary.
    """
    # The `action` field selects which primitive the executor performs.
    action_schema = {
        "description": S2_ACTION_DESCRIPTION,
        "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag",
                 "right_click", "middle_click", "double_click", "triple_click", "scroll",
                 "wait", "terminate", "key_down", "key_up"],
        "type": "string"
    }

    # Remaining fields are conditionally required depending on `action`.
    parameter_properties = {
        "action": action_schema,
        "keys": {"description": "Required only by `action=key`.", "type": "array"},
        "text": {"description": "Required only by `action=type`.", "type": "string"},
        "coordinate": {"description": "The x,y coordinates for mouse actions.", "type": "array"},
        "pixels": {"description": "The amount of scrolling.", "type": "number"},
        "time": {"description": "The seconds to wait.", "type": "number"},
        "status": {
            "description": "The status of the task.",
            "type": "string",
            "enum": ["success", "failure"]
        }
    }

    return {
        "type": "function",
        "function": {
            "name_for_human": "computer_use",
            "name": "computer_use",
            "description": description_prompt,
            "parameters": {
                "properties": parameter_properties,
                "required": ["action"],
                "type": "object"
            },
            "args_format": "Format the arguments as a JSON object."
        }
    }
|
@ -1,302 +0,0 @@
|
|||
import base64
|
||||
import re
|
||||
import ast
|
||||
import logging
|
||||
from io import BytesIO
|
||||
import json
|
||||
from PIL import Image
|
||||
|
||||
from mm_agents.utils.qwen_vl_utils import smart_resize
|
||||
|
||||
logger = logging.getLogger("desktopenv.evocua.utils")
|
||||
|
||||
def encode_image(image_content):
    """Return the base64 (ASCII) string for raw image bytes."""
    encoded = base64.b64encode(image_content)
    return encoded.decode("utf-8")
||||
def process_image(image_bytes, factor=32):
    """
    Resize raw image bytes for VL-model consumption and re-encode as PNG.

    factor: resize grid factor — 32 for S2 mode, 28 for S1 mode default.

    Returns:
        tuple: (base64 PNG string, resized_width, resized_height).
    """
    source = Image.open(BytesIO(image_bytes))
    source_width, source_height = source.size

    # smart_resize returns (height, width) snapped to the model's grid factor.
    target_height, target_width = smart_resize(
        height=source_height,
        width=source_width,
        factor=factor,
        max_pixels=16 * 16 * 4 * 12800,  # generous upper bound on pixel count
    )

    resized = source.resize((target_width, target_height))

    out = BytesIO()
    resized.save(out, format="PNG")

    encoded = base64.b64encode(out.getvalue()).decode("utf-8")
    return encoded, target_width, target_height
||||
def _fallback_rewrite_pyautogui_text_inputs(code: str) -> str:
    """
    Regex-based fallback to handle malformed pyautogui.write/typewrite calls.

    Used when ast.parse fails on the model's code. Each write/typewrite call is
    replaced by a semicolon-joined run of pyautogui.press(...) calls, one per
    character of the (heuristically extracted) text argument.
    """
    logger.info(f"SyntaxError detected in code, using regex fallback. Original code: {code}")

    def _replacer(match):
        # Rewrites one matched call; returns it unchanged if it doesn't look
        # like a write/typewrite invocation after all.
        call_content = match.group(0)
        m = re.search(r'pyautogui\.(?:write|typewrite)\s*\(', call_content)
        if not m:
            return call_content

        # Everything after the opening parenthesis; strip a leading
        # message=/text= keyword if present.
        args_part = call_content[m.end():].strip()
        args_part = re.sub(r'^(?:message|text)\s*=\s*', '', args_part)

        # Heuristically peel off the surrounding quotes. The source may be
        # malformed (unterminated string, missing ')'), so each branch tries
        # to salvage as much literal text as possible.
        text_content = ""
        if args_part.startswith(("'''", '"""')):
            # Triple-quoted argument.
            quote_type = args_part[:3]
            content = args_part[3:]
            end_idx = content.rfind(quote_type)
            if end_idx != -1:
                text_content = content[:end_idx]
            else:
                # Unterminated triple quote: drop a trailing ')' if any.
                text_content = content[:-1] if content.endswith(')') else content
        elif args_part.startswith(("'", '"')):
            # Single-quoted argument.
            quote_type = args_part[0]
            content = args_part[1:]
            if content.endswith(quote_type + ")"):
                text_content = content[:-2]
            elif content.endswith(")"):
                if len(content) > 1 and content[-2] == quote_type:
                    text_content = content[:-2]
                else:
                    # Missing closing quote: keep text up to the ')'.
                    text_content = content[:-1]
            elif content.endswith(quote_type):
                text_content = content[:-1]
            else:
                text_content = content
        else:
            # Unquoted argument: take it verbatim minus a trailing ')'.
            text_content = args_part[:-1] if args_part.endswith(')') else args_part

        # Emit one press per character; newlines become 'enter'.
        new_cmds = []
        for char in text_content:
            p = "enter" if char == "\n" else char
            p_esc = p.replace("'", "\\'")
            new_cmds.append(f"pyautogui.press('{p_esc}')")

        return "; ".join(new_cmds)

    # First pass: match calls terminated by ';', end-of-string, or newline.
    pattern = r"pyautogui\.(?:write|typewrite)\s*\(.*?(?=\s*;|\s*$|\n)"
    new_code = re.sub(pattern, _replacer, code)

    # Second pass: if nothing matched but a write/typewrite is clearly present,
    # greedily rewrite from the call to the end of the line.
    if new_code == code and ("pyautogui.write" in code or "pyautogui.typewrite" in code):
        new_code = re.sub(r"pyautogui\.(?:write|typewrite)\s*\(.*", _replacer, code)

    return new_code
||||
def rewrite_pyautogui_text_inputs(code: str) -> str:
    """
    Expand pyautogui.write/typewrite string literals into per-character presses.

    Each top-level expression statement of the form ``pyautogui.write("ab")``
    or ``pyautogui.typewrite(message="ab")`` is replaced by one
    ``pyautogui.press(...)`` call per character; newline characters become
    ``press('enter')``. Calls with non-literal or missing text, and all other
    statements, are left untouched. The result is re-serialized with
    ast.unparse, so formatting is normalized.

    Args:
        code: Python source emitted by the model.

    Returns:
        str: The rewritten source. If the source cannot be parsed (or any
        other error occurs), the regex-based
        `_fallback_rewrite_pyautogui_text_inputs` result is returned instead.
    """
    try:
        tree = ast.parse(code)

        class _TextCallRewriter(ast.NodeTransformer):
            def _extract_text(self, call: ast.Call):
                """Return the literal text of a write/typewrite call, else None."""
                is_target = (
                    isinstance(call.func, ast.Attribute)
                    and isinstance(call.func.value, ast.Name)
                    and call.func.value.id == "pyautogui"
                    and call.func.attr in ("write", "typewrite")
                )
                if not is_target:
                    return None

                # The text may be positional or passed as message=/text=.
                message_node = call.args[0] if call.args else None
                if message_node is None:
                    for kw in call.keywords:
                        if kw.arg in ("message", "text"):
                            message_node = kw.value
                            break

                if isinstance(message_node, ast.Constant) and isinstance(message_node.value, str):
                    return message_node.value
                # Non-literal (variable, f-string, ...): leave the call as-is.
                return None

            def visit_Expr(self, node):
                self.generic_visit(node)
                if isinstance(node.value, ast.Call):
                    text = self._extract_text(node.value)
                    if text is not None:
                        press_calls = []
                        for char in text:
                            press_value = "enter" if char == "\n" else char
                            press_calls.append(
                                ast.Expr(
                                    value=ast.Call(
                                        func=ast.Attribute(
                                            value=ast.Name(id="pyautogui", ctx=ast.Load()),
                                            attr="press",
                                            ctx=ast.Load(),
                                        ),
                                        args=[ast.Constant(value=press_value)],
                                        keywords=[],
                                    )
                                )
                            )
                        # Empty literal: keep the original call instead of
                        # silently dropping the statement.
                        return press_calls if press_calls else node
                return node

        tree = _TextCallRewriter().visit(tree)
        tree = ast.fix_missing_locations(tree)
        return ast.unparse(tree)

    except Exception:
        # Best-effort by design: malformed model output falls back to the regex
        # rewriter instead of raising. (Was `except (SyntaxError, Exception)`,
        # which is equivalent but misleading — Exception already covers
        # SyntaxError.)
        return _fallback_rewrite_pyautogui_text_inputs(code)
|
||||
def project_coordinate_to_absolute_scale(pyautogui_code_relative_coordinates, screen_width, screen_height, coordinate_type="relative", resize_factor=28):
    """
    Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.

    Scans the generated code for single-line `pyautogui.*(...)` calls, parses
    their arguments, projects any (x, y) pair through the model coordinate
    space, and substitutes the rewritten call back into the source string.
    Calls whose arguments cannot be parsed are left untouched.
    """
    def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
        # Project one (x, y) from model space into screen pixels.
        if coordinate_type == "qwen25":
            # Qwen2.5-VL-style coordinates live on the smart-resized image grid.
            height, width = smart_resize(
                height=screen_height,
                width=screen_width,
                factor=resize_factor,
                min_pixels=3136,
                max_pixels=12845056
            )
            if 0 <= x <= 1 and 0 <= y <= 1:
                # If already normalized, treat like "relative"
                # NOTE(review): this scales by the *resized grid* size, not the
                # screen size, which seems to contradict the comment above —
                # confirm the intended behavior for normalized inputs.
                return int(round(x * width)), int(round(y * height))
            return int(x / width * screen_width), int(y / height * screen_height)
        else:
            # NOTE(review): despite the parameter default "relative", only
            # "qwen25" is actually implemented here.
            raise ValueError(f"Invalid coordinate type: {coordinate_type}. Expected 'qwen25'")

    # Match complete single-line pyautogui calls (no nested parentheses).
    pattern = r'(pyautogui\.\w+\([^\)]*\))'
    matches = re.findall(pattern, pyautogui_code_relative_coordinates)

    new_code = pyautogui_code_relative_coordinates

    for full_call in matches:
        # Split each call into function name and raw argument string.
        func_name_pattern = r'(pyautogui\.\w+)\((.*)\)'
        func_match = re.match(func_name_pattern, full_call, re.DOTALL)
        if not func_match:
            continue

        func_name = func_match.group(1)
        args_str = func_match.group(2)

        try:
            # Parse the argument list by wrapping it in a dummy call expression.
            parsed = ast.parse(f"func({args_str})").body[0].value
            parsed_args = parsed.args
            parsed_keywords = parsed.keywords

        except SyntaxError:
            continue

        # Positional-parameter order for the pyautogui mouse APIs that take x/y.
        function_parameters = {
            'click': ['x', 'y', 'clicks', 'interval', 'button', 'duration', 'pause'],
            'rightClick': ['x', 'y', 'duration', 'tween', 'pause'],
            'middleClick': ['x', 'y', 'duration', 'tween', 'pause'],
            'doubleClick': ['x', 'y', 'interval', 'button', 'duration', 'pause'],
            'tripleClick': ['x', 'y', 'interval', 'button', 'duration', 'pause'],
            'moveTo': ['x', 'y', 'duration', 'tween', 'pause'],
            'dragTo': ['x', 'y', 'duration', 'button', 'mouseDownUp', 'pause'],
        }

        func_base_name = func_name.split('.')[-1]

        param_names = function_parameters.get(func_base_name, [])

        # Map positional literal arguments onto their parameter names.
        args = {}
        for idx, arg in enumerate(parsed_args):
            if idx < len(param_names):
                param_name = param_names[idx]
                try:
                    arg_value = ast.literal_eval(arg)
                    args[param_name] = arg_value
                except:
                    # NOTE(review): bare except silently skips non-literal args
                    # (variables, expressions); consider narrowing to ValueError.
                    pass

        # Keyword arguments must all be literals, otherwise the whole call is skipped.
        try:
            for kw in parsed_keywords:
                param_name = kw.arg
                arg_value = ast.literal_eval(kw.value)
                args[param_name] = arg_value
        except Exception as e:
            logger.error(f"Error parsing keyword arguments: {e}")
            continue

        updated = False
        if 'x' in args and 'y' in args:
            try:
                x_rel = float(args['x'])
                y_rel = float(args['y'])
                # Only project if they look like relative coords (e.g. <= 1.0 or depending on type)
                # Projection applies unconditionally if type is relative
                x_abs, y_abs = _coordinate_projection(x_rel, y_rel, screen_width, screen_height, coordinate_type)

                # Apply coordinate transformation
                args['x'] = x_abs
                args['y'] = y_abs
                updated = True
            except (ValueError, TypeError):
                pass

        if updated:
            # Rebuild the call: positionals in canonical order until the first
            # missing parameter...
            reconstructed_args = []
            for idx, param_name in enumerate(param_names):
                if param_name in args:
                    arg_value = args[param_name]
                    if isinstance(arg_value, str):
                        arg_repr = f"'{arg_value}'"
                    else:
                        arg_repr = str(arg_value)
                    reconstructed_args.append(arg_repr)
                else:
                    break

            # ...then any original keyword arguments not already emitted positionally.
            used_params = set(param_names[:len(reconstructed_args)])
            for kw in parsed_keywords:
                if kw.arg not in used_params:
                    arg_value = args[kw.arg]
                    if isinstance(arg_value, str):
                        arg_repr = f"{kw.arg}='{arg_value}'"
                    else:
                        arg_repr = f"{kw.arg}={arg_value}"
                    reconstructed_args.append(arg_repr)

            new_args_str = ', '.join(reconstructed_args)
            new_full_call = f"{func_name}({new_args_str})"
            new_code = new_code.replace(full_call, new_full_call)

    return new_code
||||
|
||||
def log_messages(messages, prefix="LLM Messages"):
    """Log messages with truncated base64 images"""
    try:
        sanitized = []
        for message in messages:
            entry = message.copy()
            body = message.get("content")
            if isinstance(body, list):
                # Replace long data-URL payloads with a short placeholder so the
                # log stays readable.
                parts = []
                for part in body:
                    if isinstance(part, dict) and part.get("type") == "image_url":
                        part = part.copy()
                        url = part.get("image_url", {}).get("url", "")
                        if len(url) > 100:
                            part["image_url"] = {"url": url[:30] + "...[base64_truncated]..." + url[-10:]}
                    parts.append(part)
                entry["content"] = parts
            sanitized.append(entry)
        logger.info(f"{prefix}:\n{json.dumps(sanitized, indent=2, ensure_ascii=False)}")
    except Exception as e:
        # Logging must never break the calling request path.
        logger.warning(f"Failed to log messages: {e}")
|
|
@ -1,302 +0,0 @@
|
|||
# Grounding-agent tool schema (OpenAI function-calling format). Element targets
# are described in natural language ("instruction"/"..._description") rather
# than coordinates; a separate grounding step resolves them to pixels.
# NOTE(review): "Decribe" in the click description is a typo, but it is part of
# the prompt string sent to the model — confirm before changing it.
tools = [
    # click — click a described element.
    {
        "type": "function",
        "function": {
            "name": "click",
            "description": "Click on the element",
            "parameters": {
                "type": "object",
                "properties": {
                    "instruction": {
                        "type": "string",
                        "description": "Decribe the element you want to interact with in detail including the visual description and function description. And make it clear and concise. For example you can describe what the element looks like, and what will be the expected result when you interact with it."
                    },
                    "num_clicks": {
                        "type": "integer",
                        "description": "Number of times to click the element.",
                        "default": 1
                    },
                    "button_type": {
                        "type": "string",
                        "enum": ["left", "middle", "right"],
                        "description": "Which mouse button to press.",
                        "default": "left"
                    },
                    "hold_keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keys to hold while clicking",
                        "default": []
                    }
                },
                "required": ["instruction"]
            }
        }
    },
    # drag_and_drop — drag between two described locations.
    {
        "type": "function",
        "function": {
            "name": "drag_and_drop",
            "description": "Drag from the starting description to the ending description",
            "parameters": {
                "type": "object",
                "properties": {
                    "starting_description": {
                        "type": "string",
                        "description": "A very detailed description of where to start the drag action. This description should be at least a full sentence. And make it clear and concise."
                    },
                    "ending_description": {
                        "type": "string",
                        "description": "A very detailed description of where to end the drag action. This description should be at least a full sentence. And make it clear and concise."
                    },
                    "hold_keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keys to hold while dragging",
                        "default": []
                    }
                },
                "required": ["starting_description", "ending_description"]
            }
        }
    },
    # highlight_text_span — select text between two phrases.
    {
        "type": "function",
        "function": {
            "name": "highlight_text_span",
            "description": "Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.",
            "parameters": {
                "type": "object",
                "properties": {
                    "starting_phrase": {
                        "type": "string",
                        "description": "The phrase that denotes the start of the text span you want to highlight. If you only want to highlight one word, just pass in that single word."
                    },
                    "ending_phrase": {
                        "type": "string",
                        "description": "The phrase that denotes the end of the text span you want to highlight. If you only want to highlight one word, just pass in that single word."
                    }
                },
                "required": ["starting_phrase", "ending_phrase"]
            }
        }
    },
    # hold_and_press — hold modifier keys while pressing a key sequence.
    {
        "type": "function",
        "function": {
            "name": "hold_and_press",
            "description": "Hold a list of keys and press a list of keys",
            "parameters": {
                "type": "object",
                "properties": {
                    "hold_keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keys to hold"
                    },
                    "press_keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keys to press in a sequence"
                    }
                },
                "required": ["hold_keys", "press_keys"]
            }
        }
    },
    # hotkey — press a key combination.
    {
        "type": "function",
        "function": {
            "name": "hotkey",
            "description": "Press a hotkey combination",
            "parameters": {
                "type": "object",
                "properties": {
                    "keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List the keys to press in combination in a list format (e.g. ['ctrl', 'c'])"
                    }
                },
                "required": ["keys"]
            }
        }
    },
    # open — launch an application or file by name.
    {
        "type": "function",
        "function": {
            "name": "open",
            "description": "Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.",
            "parameters": {
                "type": "object",
                "properties": {
                    "app_or_filename": {
                        "type": "string",
                        "description": "The name of the application or filename to open"
                    }
                },
                "required": ["app_or_filename"]
            }
        }
    },
    # scroll — scroll inside a described element.
    {
        "type": "function",
        "function": {
            "name": "scroll",
            "description": "Scroll the element in the specified direction",
            "parameters": {
                "type": "object",
                "properties": {
                    "instruction": {
                        "type": "string",
                        "description": "A very detailed description of which element to enter scroll in. This description should be at least a full sentence. And make it clear and concise."
                    },
                    "clicks": {
                        "type": "integer",
                        "description": "The number of clicks to scroll can be positive (up) or negative (down)."
                    },
                    "shift": {
                        "type": "boolean",
                        "description": "Whether to use shift+scroll for horizontal scrolling",
                        "default": False
                    }
                },
                "required": ["instruction", "clicks"]
            }
        }
    },
    # set_cell_values — write values/formulas into an open spreadsheet.
    {
        "type": "function",
        "function": {
            "name": "set_cell_values",
            "description": """Use this to set individual cell values or formulas in a spreadsheet. For setting values: pass {"A2": "hello", "B2": "world"} to set text, or {"A1": 42, "B1": 3.14} for numbers. For setting formulas: start with '=' like {"A2": "=B2+C2", "C1": "=SUM(A1:A10)"}. The sheet must be opened before this command can be used.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "cell_values": {
                        "type": "object",
                        "description": """A dictionary of cell values or formulas to set in the spreadsheet. Keys are cell coordinates like "A1", "B2", etc. Examples: For values: {"A2": "hello", "B1": 42}. For formulas: {"A2": "=B2+C2", "C1": "=SUM(A1:A10)"}. Always start formulas with '='.""",
                        "additionalProperties": {
                            "type": ["number", "string"]
                        },
                        "default": {}
                    },
                    "app_name": {
                        "type": "string",
                        "description": "Spreadsheet application/file name (e.g., 'Some_sheet.xlsx')."
                    },
                    "sheet_name": {
                        "type": "string",
                        "description": "Sheet name (e.g., 'Sheet1')."
                    }
                },
                "required": ["cell_values", "app_name", "sheet_name"]
            }
        }
    },
    # switch_applications — focus an already-running app.
    {
        "type": "function",
        "function": {
            "name": "switch_applications",
            "description": "Switch to a different application that is already open",
            "parameters": {
                "type": "object",
                "properties": {
                    "app_code": {
                        "type": "string",
                        "description": "The code/name of the application to switch to from the open apps list."
                    }
                },
                "required": ["app_code"]
            }
        }
    },
    # type — type text, optionally into a described element.
    {
        "type": "function",
        "function": {
            "name": "type",
            "description": "Type text into a specific element",
            "parameters": {
                "type": "object",
                "properties": {
                    "element_description": {
                        "type": ["string", "null"],
                        "description": "Detailed, full-sentence description of the element to type into. If omitted, types into the focused element.",
                        "default": None
                    },
                    "text": {
                        "type": "string",
                        "description": "The text to type.",
                        "default": ""
                    },
                    "overwrite": {
                        "type": "boolean",
                        "description": "If true, clear existing text before typing.",
                        "default": False
                    },
                    "enter": {
                        "type": "boolean",
                        "description": "If true, press Enter after typing.",
                        "default": False
                    }
                },
                "required": ["text"]
            }
        }
    },
    # wait — pause for a number of seconds.
    {
        "type": "function",
        "function": {
            "name": "wait",
            "description": "Wait for a specified amount of time",
            "parameters": {
                "type": "object",
                "properties": {
                    "time": {
                        "type": "number",
                        "description": "Time to wait in seconds."
                    }
                },
                "required": ["time"]
            }
        }
    },
    # fast_open_terminal — save+close current file, then open a terminal.
    {
        "type": "function",
        "function": {
            "name": "fast_open_terminal",
            "description": "Save the file in focus, close it, and open a terminal.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": []
            }
        }
    }
]
||||
|
||||
def to_response_api_tools(completion_tools):
    """Flatten completion-style tool specs into Responses-API-style specs.

    Completion-style tools nest the schema under a 'function' key:
        {"type": "function", "function": {"name": "click", ...}}
    Response-style tools are flat:
        {"type": "function", "name": "click", ...}

    Entries that do not match the completion shape are passed through
    unchanged; a None/empty input yields an empty list.
    """
    flattened = []
    for entry in completion_tools or []:
        is_completion_shape = (
            isinstance(entry, dict)
            and entry.get("type") == "function"
            and isinstance(entry.get("function"), dict)
        )
        if not is_completion_shape:
            flattened.append(entry)
            continue
        spec = entry["function"]
        flattened.append({
            "type": "function",
            "name": spec.get("name"),
            "description": spec.get("description"),
            "parameters": spec.get("parameters"),
        })
    return flattened
|
||||
|
||||
response_api_tools = to_response_api_tools(tools)
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
|
||||
import base64
|
||||
import os
|
||||
from typing import Dict, Any, List, Union
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
class FormatMessage:
    """Builds chat messages in the OpenAI Responses API content-block format."""

    def __init__(self):
        # Content-block type tags used by the Responses API.
        self.text_key = "input_text"
        self.image_key = "input_image"

    def encode_image(self, image_content: bytes) -> str:
        """Return the base64 (utf-8 string) encoding of raw image bytes."""
        return base64.b64encode(image_content).decode('utf-8')

    def format_image(self, image: bytes, detail: str = "high") -> Dict[str, Any]:
        """Wrap raw PNG bytes as an input_image content block (data URL)."""
        data_url = f"data:image/png;base64,{self.encode_image(image)}"
        return {"type": self.image_key, "image_url": data_url, "detail": detail}

    def format_text_message(self, text: str) -> Dict[str, Any]:
        """Wrap a plain string as an input_text content block."""
        return {"type": self.text_key, "text": text}

    def create_system_message(self, content: str) -> Dict[str, Any]:
        """Build a system-role message holding a single text block."""
        return {"role": "system", "content": [self.format_text_message(content)]}

    def create_user_message(self, text: str = None, image: bytes = None, detail: str = "high", image_first: bool = False) -> Dict[str, Any]:
        """Build a user-role message from text and/or an image.

        At least one of `text` or `image` must be given. Text precedes the
        image unless `image_first` is True.

        Raises:
            ValueError: if neither text nor image is provided.
        """
        if text is None and image is None:
            raise ValueError("At least one of text or image must be provided")
        blocks = []
        if text is not None:
            blocks.append(self.format_text_message(text))
        if image is not None:
            blocks.append(self.format_image(image, detail))
        if image_first:
            blocks.reverse()
        return {"role": "user", "content": blocks}

    def create_assistant_message(self, text: str) -> Dict[str, Any]:
        """Build an assistant-role message with a single output_text block."""
        return {"role": "assistant", "content": [{"type": "output_text", "text": text}]}
|
||||
|
||||
|
||||
def encode_numpy_image_to_base64(image: np.ndarray) -> str:
    """PNG-encode a numpy image via cv2 and return it as a base64 string.

    Raises:
        ValueError: if cv2 cannot encode the array as PNG.
    """
    ok, png_buf = cv2.imencode('.png', image)
    if not ok:
        raise ValueError("Failed to encode image to png format")
    # buffer -> raw bytes -> base64 text
    return base64.b64encode(png_buf.tobytes()).decode('utf-8')
|
||||
|
||||
def encode_image_bytes(image_content):
    """Return the base64 (utf-8 string) encoding of raw image bytes."""
    encoded = base64.b64encode(image_content)
    return encoded.decode('utf-8')
|
||||
|
|
@ -1,616 +0,0 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Dict, List, Tuple, Callable
|
||||
from desktop_env.desktop_env import DesktopEnv
|
||||
from openai import OpenAI
|
||||
from mm_agents.gta1.format_message import FormatMessage
|
||||
from mm_agents.gta1.cua_tool import response_api_tools as CUA_TOOLS
|
||||
import inspect
|
||||
import concurrent.futures
|
||||
import re
|
||||
from mm_agents.utils.qwen_vl_utils import smart_resize
|
||||
from mm_agents.gta1.gta1_agent import OSWorldACI
|
||||
import httpx
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from mm_agents.gta1.format_message import encode_numpy_image_to_base64, encode_image_bytes
|
||||
|
||||
|
||||
GTA1_SERVICE_URL=os.getenv("GTA1_SERVICE_URL",None)
|
||||
|
||||
GTA1_GROUNDING_SYSTEM_PROMPT=(
|
||||
"You are a GUI agent. You are given a task and a screenshot of the screen. "
|
||||
"You need to perform a series of pyautogui actions to complete the task."
|
||||
)
|
||||
|
||||
CUA_SYSTEM_PROMPT_GPT5 = """# Role and Objective
|
||||
- An agent with strong computer knowledge and a good internet connection, designed to execute desktop computer tasks on Ubuntu precisely as instructed by the user.
|
||||
- Assumes tool calls will run to control the computer.
|
||||
- Has access to all its reasoning and knowledge for use in tasks.
|
||||
|
||||
# Instructions
|
||||
- Begin each user task with a concise checklist (3–7 items) of conceptual, non-implementation sub-tasks.
|
||||
- Revise the sub-tasks checklist as the task progresses, based on the latest screenshot and previous actions.
|
||||
- Interact solely using the provided tool actions; do not invent or assume any unlisted methods. Use only tools explicitly listed in the available actions for every step.
|
||||
- Base every action on observable elements in the latest screenshot; never anticipate or assume elements not yet present or visible.
|
||||
- For each step, you will receive a new screenshot, tool execution results, and the remaining number of steps allowed in the user task.
|
||||
- If an option or input is not specified in the user task (e.g., creating a new file without specifying a name), use the default settings.
|
||||
|
||||
## Action Execution Guidelines
|
||||
- Execute exactly one tool call per interaction.
|
||||
- Prefer the `hotkey` action (tool call) over `click` or `drag_and_drop` where possible.
|
||||
- For spreadsheet value or formula changes in LibreOffice Calc, Writer, Impress, always use `set_cell_values` for both single-cell and multi-cell value or formula editing.
|
||||
- When highlighting text, use only the `highlight_text_span` or `hotkey` (tool calls).
|
||||
- Dismiss "Authentication required" prompts by clicking "Cancel".
|
||||
- All tool calls are permitted within the provided action list; do not attempt actions outside this set.
|
||||
|
||||
# Additional Information
|
||||
- Leave windows/applications open at task completion.
|
||||
- Upon fully completing the user's task, briefly summarize results if applicable, then return `TERMINATE`.
|
||||
- **Feasibility First**: Confirm the task can be completed with available files, applications, and environments before starting.
|
||||
- **Strict Adherence**: Only perform actions the user has explicitly requested; avoid unnecessary steps.
|
||||
- **Completion Criteria**: Only return "TERMINATE" when all user requirements are met in full.
|
||||
- **Impossibility Handling**: Return "INFEASIBLE" if completion is blocked by environmental constraints.
|
||||
- **Screenshot Verification**: Always check the screenshot before proceeding.
|
||||
|
||||
# Additional Rules
|
||||
- The sudo password is "{CLIENT_PASSWORD}"; use it if sudo privileges are required.
|
||||
- Leave all windows and applications open after completing the task.
|
||||
- Only use `TERMINATE` when all user requirements have been fully satisfied; provide a brief summary of results if applicable.
|
||||
- Before proceeding, confirm that the task is feasible with the currently available files, applications, and environment; if it is impossible to complete due to environmental constraints, return `INFEASIBLE`.
|
||||
- Strictly follow user instructions, avoiding unnecessary or extraneous steps.
|
||||
- Always review the latest screenshot before every action.
|
||||
|
||||
# Execution Procedure
|
||||
- Briefly review prior actions, the current checklist, and the latest screenshot before each tool call.
|
||||
- Before each action, state in one line the purpose and required minimal inputs.
|
||||
- After each action, validate the result in 1–2 lines using the updated screenshot. If the action was unsuccessful, adapt your approach before proceeding.
|
||||
- Only return the selected action(s); do not elaborate or output other information.
|
||||
- Work deliberately and avoid unnecessary or extraneous steps; strictly adhere to user instructions.
|
||||
|
||||
Proceed methodically and efficiently, ensuring all user requirements are met before terminating."""
|
||||
|
||||
CUA_START_MESSAGE = """
|
||||
Please check the screenshot and see if the task is impossible to complete due to environmental constraints. If it is, reply with 'INFEASIBLE'.
|
||||
If it is possible to complete, please complete the task, and before making any tool call, you should reasoning the next move according to the UI screenshot and instruction, while refer to the previous actions (tool calls), screenshots, and observations for reflection.
|
||||
|
||||
User task:
|
||||
{instruction}
|
||||
|
||||
""".strip()
|
||||
|
||||
|
||||
CUA_DEFAULT_REPLY = """Note the user task is:
|
||||
|
||||
{instruction}
|
||||
|
||||
If you have completed the user task, reply with 'TERMINATE'.
|
||||
If the task is impossible to complete due to environmental constraints, reply with 'INFEASIBLE'."""
|
||||
|
||||
|
||||
GTA1_JUDGE_SYSTEM_PROMPT='''# Role and Objective
|
||||
Assess the planning and reasoning of a UI agent to determine the most effective action for advancing toward a specified task goal. You may use the computer password '{CLIENT_PASSWORD}' during this process if needed.
|
||||
|
||||
# Workflow Checklist
|
||||
Begin each assessment by generating a concise checklist (adapt as appropriate for task complexity) of evaluation steps to ensure a systematic and methodical analysis.
|
||||
# Inputs
|
||||
For each assessment, you will receive:
|
||||
- The task goal
|
||||
- The history of planning and actions performed
|
||||
- A current UI screenshot
|
||||
- A list of {N_PLANNING} alternative planning approaches for achieving the goal, in the current context. Each approach will be formatted as:
|
||||
- Thought: <summary, goal, screenshot observation>
|
||||
- Action: <proposed UI action>
|
||||
|
||||
# Action Function Definition
|
||||
Actions are formatted as function calls. The specification for these calls is provided here:
|
||||
{FUNCTION_CALL_DEFINITION}
|
||||
|
||||
# Assessment Criteria
|
||||
- Correctness: Does the proposed action logically advance the goal?
|
||||
- Effectiveness: Is immediate progress made?
|
||||
- Alignment: Does it support both the step and overall objective?
|
||||
- Planning Quality: Reasoning is clear, concise, and logical.
|
||||
- Appropriateness: Action is valid/executable in the current context.
|
||||
- Matchness: Does the action correspond exactly to names/nouns in the user task? Avoid generalization or conflation.
|
||||
- Exactness: Does the action relate to the user task? No extra or unnecessary steps are performed.
|
||||
- Completeness: If terminate, does the action complete the user task?
|
||||
|
||||
Be aware that some planning approaches may be similar—evaluate each on its own merits, and do not allow the frequency of similar approaches to bias your assessment.
|
||||
Carefully assess each approach and select the best one based on the above criteria.
|
||||
|
||||
# Output Format
|
||||
Produce a single, strictly valid JSON object with the following fields:
|
||||
- `explaining` (string, required): A concise (1–4 sentences) justification for why the chosen approach is optimal in light of the assessment criteria; or, if none are effective, briefly explain why.
|
||||
- `index` (integer, required): The 0-based index (0, 1, ..., {N_INDEX}) identifying the best approach. You must choose one of the approaches.
|
||||
Do not output anything except the required JSON object.
|
||||
|
||||
**Carefully evaluate each approach and select the best one based on the criteria.**'''
|
||||
|
||||
def make_single_request(client: OpenAI, logger: logging.Logger, *args, **kwargs):
    """Call ``client.responses.create`` with retries and exponential backoff.

    Retries up to 10 times; each failure waits min(2**retry, 16) seconds.
    (The original ``min(retry**2, 16)`` slept 0 s on the first failure and
    grew only polynomially — the cap of 16 indicates exponential backoff
    was intended.) The response is validated by touching ``output_text``
    (raises if the payload is malformed) before being returned.

    Args:
        client: OpenAI-compatible client exposing ``responses.create``.
        logger: optional logger for failure diagnostics (may be None).
        *args, **kwargs: forwarded verbatim to ``responses.create``.

    Returns:
        The response object, or None if every attempt failed.
    """
    for retry in range(10):
        try:
            response = client.responses.create(*args, **kwargs)
            # Probe the field so a malformed payload triggers the retry path.
            _ = response.output_text
            return response
        except Exception as e:
            if logger is not None:
                logger.warning("Error in response.create: %s", e)
            if os.getenv("VERBOSEDEBUG", None) is not None:
                print(f"Error in response.create: {e}")
            time.sleep(min(2 ** retry, 16))
    return None
|
||||
|
||||
def extract_answer_from_response(response):
|
||||
if not response or not isinstance(response, str):
|
||||
raise ValueError("Response must be a non-empty string")
|
||||
json_pattern = r'```json\s*(.*?)\s*```'
|
||||
json_match = re.search(json_pattern, response, re.DOTALL)
|
||||
|
||||
if json_match:
|
||||
json_str = json_match.group(1)
|
||||
try:
|
||||
answer = json.loads(json_str)
|
||||
if "explaining" in answer and "index" in answer:
|
||||
answer["index"] = int(answer["index"])
|
||||
return answer
|
||||
else:
|
||||
raise ValueError("JSON missing required fields 'explaining' or 'index'")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
direct_json_pattern = r'\{[\s\S]*?"explaining"[\s\S]*?"index"[\s\S]*?\}'
|
||||
direct_match = re.search(direct_json_pattern, response)
|
||||
|
||||
if direct_match:
|
||||
try:
|
||||
json_str = direct_match.group(0)
|
||||
json_str = json_str.replace(''', "'").replace(''', "'").replace('"', '"').replace('"', '"')
|
||||
answer = json.loads(json_str)
|
||||
answer["index"] = int(answer["index"])
|
||||
return answer
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
index_pattern = r'"index"\s*:\s*(\d+)'
|
||||
index_match = re.search(index_pattern, response)
|
||||
|
||||
explaining_pattern = r'"explaining"\s*:\s*"(.*?)"(?=,|\s*})'
|
||||
explaining_match = re.search(explaining_pattern, response, re.DOTALL)
|
||||
|
||||
if not explaining_match:
|
||||
explaining_pattern = r'"explaining"\s*:\s*(.*?)(?=,\s*"index"|\s*})'
|
||||
explaining_match = re.search(explaining_pattern, response, re.DOTALL)
|
||||
|
||||
if index_match and explaining_match:
|
||||
return {
|
||||
"index": int(index_match.group(1)),
|
||||
"explaining": explaining_match.group(1).strip('" \t\n')
|
||||
}
|
||||
if index_match:
|
||||
return {
|
||||
"index": int(index_match.group(1)),
|
||||
"explaining": "Explanation not found in response"
|
||||
}
|
||||
raise ValueError("Could not extract valid answer from response")
|
||||
|
||||
def select_response(summary_info, responses, client_password):
    """Ask a gpt-5 judge to pick the best candidate plan.

    Args:
        summary_info: 3-tuple (history, current observation, instruction),
            where history is a list of [obs, action_call, thought] triples.
        responses: candidate Responses-API results to choose between.
        client_password: sudo password substituted into the judge prompt.

    Returns:
        The selected element of ``responses``; falls back to
        ``responses[0]`` when the judge cannot be reached or its answer
        cannot be parsed after all retries.
    """
    # Unpack the packed triple (the parameter name is reused for the history list).
    summary_info, curr_obs, instruction = summary_info

    MAX_RETRY_TIMES = 10

    system_promt = GTA1_JUDGE_SYSTEM_PROMPT.format(N_PLANNING=len(responses), N_INDEX=len(responses)-1, CLIENT_PASSWORD=client_password, FUNCTION_CALL_DEFINITION=json.dumps(CUA_TOOLS,indent=2))

    message_formater = FormatMessage()
    messages = [
        message_formater.create_system_message(system_promt),
        message_formater.create_user_message(text=f"The goal of the task is:\n{instruction}\n\n\n"),

    ]

    if len(summary_info) == 0:
        messages.append(message_formater.create_user_message(text=f"No history available. The action just started.\n"))
    else:
        # Replay the action history. NOTE(review): this loop rebinds
        # `curr_obs`, so the "current screenshot" attached after the
        # if/else is the last history entry's observation, not the one
        # passed in — confirm this is intended.
        for idx, (curr_obs, action_call, content_text) in enumerate(summary_info):
            name = action_call['name']
            args = action_call['arguments']
            action = f"{name}({args})"
            # Optionally attach screenshots for the 5 most recent steps.
            if os.getenv("JUDGE_SCREENSHOT_PROMPT", None) is not None and idx >= len(summary_info) - 5:
                messages.append(message_formater.create_user_message(text=f"\n### {idx} Screenshot before taking the action:\n"))
                messages.append(message_formater.create_user_message(image=curr_obs['screenshot']))
                messages.append(message_formater.create_user_message(text=f"\n"))
            messages.append(message_formater.create_user_message(text=f"### Past step {idx}:\nThought:{content_text}\nAction:{action_call}\n\n\n"))
        # NOTE(review): the candidate plans are appended again (formatted
        # as Thought/Action) after the screenshot below; this earlier raw
        # dump of the response objects looks redundant — verify.
        messages.append(message_formater.create_user_message(text=f"Here are the different plans to compare:\n"))
        for idx, plan in enumerate(responses):
            messages.append(message_formater.create_user_message(text=f"### Index {idx}:\n{plan}\n\n\n"))

    messages.append(message_formater.create_user_message(text=f"Here are the current screenshot:\n"))
    messages.append(message_formater.create_user_message(image=curr_obs['screenshot']))
    messages.append(message_formater.create_user_message(text=f"Here are the different plans to compare for completing the task:\n"))
    for idx, rsp in enumerate(responses):
        content_text = rsp.output_text
        action = "No Action is performed."
        # Show the first function call (if any) from each candidate;
        # output items may be dicts or SDK objects.
        for i, o in enumerate(rsp.output):
            typ = o["type"] if isinstance(o, dict) else getattr(o, "type", None)
            if typ == 'function_call':
                name = o.name
                args = json.loads(o.arguments)
                action = f"{name}({args})"
                break
        messages.append(message_formater.create_user_message(text=f"### Index {idx}:\nThought:{content_text}\nAction:{action}\n\n\n"))

    messages.append(message_formater.create_user_message(text=f"Please select the best plan to complete the task."))

    # Route through the X-Api-Key proxy when configured.
    if os.getenv("X_API_KEY") and os.getenv("X_API_URL"):
        client = OpenAI(base_url=os.getenv("X_API_URL"), api_key="dummy", default_headers = {"X-Api-Key": os.getenv("X_API_KEY")})
    else:
        client = OpenAI()
    wait = 1
    for _ in range(MAX_RETRY_TIMES):
        try:
            prediction = client.responses.create(
                model="gpt-5",
                input=messages,
                reasoning={"effort": "high"},
                max_output_tokens=4096 * 4,
                timeout=100,
            )
            prediction = prediction.output_text
            if os.getenv("VERBOSEDEBUG", None) is not None:
                print(f"Prediction: {prediction}")
            prediction = extract_answer_from_response(prediction)
            return responses[prediction['index']]
        except:
            # Exponential backoff capped at 16 s; any failure (network,
            # parse error, out-of-range index) falls through to a retry.
            time.sleep(wait)
            wait *=2
            wait = min(wait,16)
            continue
    # All retries exhausted: default to the first candidate.
    return responses[0]
|
||||
|
||||
def call_openai_cua(client: OpenAI,
                    history_inputs: list,
                    cua_model: str,
                    logger: logging.Logger = None,
                    tts_step: int = 1,
                    summary_info: List[Any] = None,
                    client_password: str = "",
                    ) -> Tuple[Any, float]:
    """Sample one or more planner responses and return the chosen one.

    With ``tts_step == 1`` a single request is made. Otherwise up to
    ``tts_step`` candidates are sampled in parallel (test-time scaling)
    and a judge (``select_response``) picks the best.

    NOTE(review): the return annotation says ``Tuple[Any, float]`` but a
    single response object (or None) is returned — confirm and fix.

    Returns:
        A Responses-API result, or None if every request failed.
    """
    retry = 0
    response = None
    if tts_step == 1:
        response = make_single_request(client, logger,
                                       model=cua_model,
                                       tools=CUA_TOOLS,
                                       parallel_tool_calls=False,
                                       reasoning={"effort": "high"},
                                       max_output_tokens=4096 * 4,
                                       input=history_inputs,
                                       timeout=500)
    else:
        potential_responses = []
        retry = 0
        # Keep sampling until tts_step successful candidates are collected
        # or 5 rounds have elapsed (failed requests return None).
        while len(potential_responses) < tts_step and retry < 5:
            retry += 1
            with concurrent.futures.ThreadPoolExecutor(max_workers=tts_step-len(potential_responses)) as executor:
                futures = [executor.submit(make_single_request, client, logger,
                                           model=cua_model,
                                           tools=CUA_TOOLS,
                                           parallel_tool_calls=False,
                                           reasoning={"effort": "high"},
                                           max_output_tokens=4096 * 4,
                                           input=history_inputs,
                                           timeout=500) for _ in range(tts_step-len(potential_responses))]
                responses = [future.result() for future in concurrent.futures.as_completed(futures)]
                responses = [response for response in responses if response is not None]
                potential_responses.extend(responses)
        responses = potential_responses
        if os.getenv("VERBOSEDEBUG", None) is not None:
            print(f"Responses: {responses}")
        # Judge picks the best candidate given history + current screenshot.
        response = select_response(summary_info,responses,client_password)
    return response
|
||||
|
||||
def _tool_call_to_pyautogui(agent: OSWorldACI,
                            action_call: Dict[str, Any],
                            obs: Dict[str, Any],
                            request_vllm: Callable,
                            logger: logging.Logger = None) -> Tuple[str, str]:
    """Translate a model tool call into a pyautogui command string.

    Grounds description-based arguments to screen coordinates (stored on
    ``agent.coords1``/``agent.coords2``), then dispatches to the
    OSWorldACI method with the same name to build the command.

    Returns:
        (pyautogui_command, tool_output_template). On failure the command
        is "WAIT" and the template carries the error text (plus the tool
        signature when the dispatch target was resolved). The template's
        ``{max_steps}``/``{step_no}`` placeholders are filled by the caller.
    """
    tool_output = "Action (tool call) is executed. For your reference, you have maximum of {max_steps} steps, and current step is {step_no} out of {max_steps}."
    method = None
    try:
        name = action_call['name']
        args = action_call['arguments']
        # Default: no coordinates needed
        agent.coords1, agent.coords2 = None, None

        # Compute coordinates for description-based actions
        if name == "click" and isinstance(args.get("instruction"), str):
            agent.coords1 = agent.generate_coords(args["instruction"], obs, request_vllm)
        elif name == "type":
            element_description = args.get("element_description")
            if isinstance(element_description, str) and element_description:
                agent.coords1 = agent.generate_coords(element_description, obs, request_vllm)
        elif name == "scroll" and isinstance(args.get("instruction"), str):
            agent.coords1 = agent.generate_coords(args["instruction"], obs, request_vllm)
        elif name == "drag_and_drop":
            sd = args.get("starting_description")
            ed = args.get("ending_description")
            if isinstance(sd, str) and isinstance(ed, str):
                agent.coords1 = agent.generate_coords(sd, obs, request_vllm)
                agent.coords2 = agent.generate_coords(ed, obs, request_vllm)
        elif name == "highlight_text_span":
            sp = args.get("starting_phrase")
            ep = args.get("ending_phrase")
            if isinstance(sp, str) and isinstance(ep, str):
                agent.coords1 = agent.generate_text_coords(sp, obs, alignment="start")
                agent.coords2 = agent.generate_text_coords(ep, obs, alignment="end")

        # Dispatch to OSWorldACI method to build pyautogui command.
        # NOTE(review): if the agent has no such method, control falls
        # through to the final "WAIT" return but with the SUCCESS template
        # — confirm whether an unknown tool should report an error instead.
        if hasattr(agent, name):
            method = getattr(agent, name)
            # Some arguments may be missing; rely on method defaults
            return method(**args),tool_output
    except Exception as e:
        if os.getenv("VERBOSEDEBUG", None) is not None:
            print(f"Error in _tool_call_to_pyautogui: {e}")
        tool_output = "Error: " + str(e).replace("OSWorldACI.","").strip()
        # Include the real signature so the model can correct its call.
        if method is not None:
            sig = inspect.signature(method)
            tool_output += f"\nThe tool signature is: {method.__name__}{sig}"

    return "WAIT", tool_output
|
||||
|
||||
def request_vllm(image, prompt):
    """Ask the GTA1 grounding service for a click location.

    Sends the screenshot and ``prompt`` to the remote grounding model and
    parses a click coordinate out of its pyautogui-style reply.

    Args:
        image: screenshot as PNG bytes or a numpy array (H, W, 3).
        prompt: element description to ground.

    Returns:
        (x, y) normalized by the smart-resized frame dimensions; negative
        values when the reply contains no click verb at all.

    Raises:
        ValueError: if ``image`` is neither bytes nor a numpy array.
        RuntimeError: after 10 failed attempts against the service.
    """
    CLICK_REGEXES = [
        # pyautogui.click(x=123, y=456)
        re.compile(r"click\s*\(\s*x\s*=\s*(\d+)\s*,\s*y\s*=\s*(\d+)\s*\)", re.IGNORECASE),
        # pyautogui.click(123, 456) or click(123,456)
        re.compile(r"click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", re.IGNORECASE),
    ]

    def parse_xy_from_text(text: str):
        """Return (x, y) pixels, (-1, -1) if no click verb, None if unparsable.

        The original returned a list / tuple / None inconsistently, so a
        None result crashed the tuple unpack and was silently retried.
        """
        if "click" not in text.lower():
            return (-1, -1)
        for rx in CLICK_REGEXES:
            m = rx.search(text)
            if m:
                try:
                    return int(m.group(1)), int(m.group(2))
                except Exception:
                    continue
        return None

    if isinstance(image, bytes):
        image = np.array(Image.open(BytesIO(image)).convert('RGB'))
    if not isinstance(image, np.ndarray):
        raise ValueError(f"Invalid image type: {type(image)}")
    H, W, C = image.shape
    # The service model sees a smart-resized frame; its coordinates are
    # therefore normalized by the resized (H, W), not the raw shape.
    H, W = smart_resize(
        H,
        W,
        factor=28,
        min_pixels=1000,
        max_pixels=1000000000000,
    )
    assert C == 3
    image_base64 = encode_numpy_image_to_base64(image)
    messages = [
        {"role": "system", "content": GTA1_GROUNDING_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {
                    "type": "image", "image": f"data:image/png;base64,{image_base64}"
                },
                {
                    "type": "text",
                    "text": prompt
                },
            ],
        }]
    base_url = GTA1_SERVICE_URL
    payload = {
        "messages": messages,
        "max_new_tokens": 100,
        "temperature": 0.0,
        "top_p": 0.9,
    }
    for _ in range(10):
        resp = None
        try:
            # Context-managed client: the original leaked one open
            # httpx.Client per attempt.
            with httpx.Client() as httpx_client:
                r = httpx_client.post(f"{base_url}/call_llm", json=payload, timeout=10)
            r.raise_for_status()
            resp = r.json()
            result_items = [resp] if isinstance(resp, dict) else resp
            coords = parse_xy_from_text(result_items[0].get("response"))
            if coords is None:
                # Reply mentioned a click but no coordinates were parsable.
                raise ValueError("No click coordinates in grounding response")
            x, y = coords
            return x / W, y / H
        except Exception:
            if os.getenv("VERBOSEDEBUG", None) is not None:
                # `resp` is None when the request itself failed; the
                # original printed an unbound name here (NameError).
                print(resp)
            time.sleep(1)
    raise RuntimeError(f"Failed to execute grounding")
|
||||
|
||||
|
||||
|
||||
def _prune_history_images(messages: List[Dict[str, Any]], max_recent_images: int) -> None:
|
||||
"""Keep only the very first image message and the latest N image messages.
|
||||
|
||||
- Preserves the earliest image-containing message (initial screenshot)
|
||||
- Preserves up to `max_recent_images` most recent image messages
|
||||
- Removes any other image messages
|
||||
"""
|
||||
try:
|
||||
if max_recent_images is None:
|
||||
return
|
||||
if max_recent_images < 0:
|
||||
return
|
||||
|
||||
image_indices: List[int] = []
|
||||
for idx, msg in enumerate(messages):
|
||||
if isinstance(msg, dict) and isinstance(msg.get('content'), list):
|
||||
for blk in msg['content']:
|
||||
if isinstance(blk, dict) and blk.get('type') in ('image_url', 'input_image'):
|
||||
image_indices.append(idx)
|
||||
break
|
||||
|
||||
if len(image_indices) <= 1:
|
||||
return # Zero or one image message — nothing to prune
|
||||
|
||||
first_image_idx = image_indices[0]
|
||||
recent_keep: List[int] = image_indices[-max_recent_images:] if max_recent_images > 0 else []
|
||||
keep_set = set([first_image_idx] + recent_keep)
|
||||
delete_indices = [i for i in image_indices if i not in keep_set]
|
||||
|
||||
# Remove from end to avoid reindexing issues
|
||||
if os.getenv("VERBOSEDEBUG", None) is not None:
|
||||
print(f"Pruning history images: {delete_indices}")
|
||||
for i in sorted(delete_indices, reverse=True):
|
||||
messages.pop(i)
|
||||
except Exception:
|
||||
# Be conservative: never fail the main loop due to pruning
|
||||
pass
|
||||
|
||||
def run_cua_gpt5gta1(
    env: DesktopEnv,
    instruction: str,
    max_steps: int,
    save_path: str = './',
    sleep_after_execution: float = 0.3,
    client_password: str = "",
    cua_model: str = "gpt-5",
    tts_step: int = 8,
    purge_history_images: int = 8,
    request_vllm: Callable = request_vllm,
    logger: logging.Logger = None,
    **kwargs: Any,
):
    """Run the GPT-5 + GTA1 computer-use agent loop on a DesktopEnv task.

    Each iteration: request (at most) one tool call from the planner,
    ground and execute it in the VM, feed back the tool output and a
    fresh screenshot, and prune stale screenshots from the history.
    Terminates when the model replies TERMINATE/INFEASIBLE or the step
    budget is exhausted.

    Args:
        env: OSWorld desktop environment to act in.
        instruction: natural-language task description.
        max_steps: maximum number of agent steps.
        save_path: directory where per-step screenshots are written.
        sleep_after_execution: delay passed to env.step after each action.
        client_password: sudo password substituted into the system prompt.
        cua_model: planner model name.
        tts_step: parallel candidates per step (test-time scaling).
        purge_history_images: keep first plus this many recent screenshots.
        request_vllm: grounding callback mapping (image, prompt) -> (x, y).
        logger: required (dereferenced unconditionally) step logger.

    Returns:
        ("DONE" | "FAIL" | reasoning-string, trajectory list of actions).
    """
    # Route through the X-Api-Key proxy when configured.
    if os.getenv("X_API_KEY"):
        client = OpenAI(base_url=os.getenv("X_API_URL"), api_key="dummy", default_headers = {"X-Api-Key": os.getenv("X_API_KEY")})
    else:
        client = OpenAI()
    agent = OSWorldACI(platform="linux")
    message_formater = FormatMessage()
    default_reply = CUA_DEFAULT_REPLY.format(instruction=instruction)

    # 0 / reset & first screenshot
    os.makedirs(save_path, exist_ok=True)
    obs_bytes = env.controller.get_screenshot()
    with open(os.path.join(save_path, "initial_screenshot.png"), "wb") as f:
        f.write(obs_bytes)
    traj = []
    history_inputs = [
        message_formater.create_system_message(CUA_SYSTEM_PROMPT_GPT5.format(CLIENT_PASSWORD=client_password)),
        message_formater.create_user_message(text=CUA_START_MESSAGE.format(instruction=instruction),image=obs_bytes,image_first=False),
    ]

    curr_obs = {"screenshot": obs_bytes}

    # History triples [obs, action_call, thought] consumed by the judge.
    summary_info = []
    step_no = 0
    logger.info(f"--------------------------------CUA Step {step_no+1}--------------------------------")
    response = call_openai_cua(client, history_inputs, cua_model, logger=logger, tts_step=tts_step, summary_info=[summary_info,curr_obs,instruction], client_password=client_password)
    reasoning = ""
    # 1 / iterative dialogue
    while step_no < max_steps:
        step_no += 1

        # --- extract function calls and handle assistant content -------------
        calls: List[Dict[str, Any]] = []
        content_text = ""
        buffer_history = []

        # Collect function calls from chat completions tool_calls
        # (output items may be dicts or SDK objects).
        for i, o in enumerate(response.output):
            typ = o["type"] if isinstance(o, dict) else getattr(o, "type", None)
            if typ == 'function_call':
                buffer_history.append(o)
                calls.append({
                    'call_id': o.call_id,
                    'name': o.name,
                    'arguments': json.loads(o.arguments),
                })
            elif typ == 'message':
                content_text = o.content[0].text
                if os.getenv("VERBOSEDEBUG", None) is not None:
                    print(content_text)
                buffer_history.append(
                    {"role": o.role, "content": o.content}
                )
        # parallel_tool_calls=False is requested, so at most one call.
        assert len(calls) <= 1, f"Unexpected assistant content: {content_text} \n {calls}"

        history_inputs.extend(buffer_history)
        for action_call in calls:
            traj.append(action_call)
            logger.info(f"[Action Call]: {action_call}")
            py_cmd, tool_output = _tool_call_to_pyautogui(agent, action_call, curr_obs, request_vllm, logger=logger)
            summary_info.append([curr_obs, action_call, content_text])
            # --- execute in VM ---------------------------------------------------
            obs, *_ = env.step(py_cmd, sleep_after_execution)

            # --- send screenshot back -------------------------------------------
            with open(os.path.join(save_path, f"step_{step_no}.png"), "wb") as f:
                f.write(obs["screenshot"])

            history_inputs.append(
                {
                    'type': 'function_call_output',
                    'call_id': action_call['call_id'],
                    'output':tool_output.format(max_steps=max_steps, step_no=step_no)
                }
            )
            # Provide the screenshot as a separate user message so the model can actually see it
            history_inputs.append(
                message_formater.create_user_message(
                    text=f"Here is the screenshot after the {step_no}-th action (tool call) is executed.",
                    image=obs['screenshot']
                )
            )
            # Prune history to keep first image and at most N latest images
            if purge_history_images > 0:
                _prune_history_images(history_inputs, purge_history_images)
            curr_obs = obs
        # Handle plain assistant content string
        content_text = response.output_text or ''
        if isinstance(content_text, str) and content_text:
            if 'TERMINATE' in content_text:
                traj.append({"type": "TERMINATE"})
                logger.info(f"#Terminate message:\n{content_text}.")
                step_no-=1
                env.step("DONE", sleep_after_execution)
                return "DONE", traj
            elif 'INFEASIBLE' in content_text:
                traj.append({"type": "INFEASIBLE"})
                logger.info(f"Stop reason (unfinished):\n{content_text}.")
                step_no-=1
                env.step("FAIL", sleep_after_execution)
                return "FAIL", traj
            else:
                # Plain text without a tool call does not consume a step.
                if len(calls) < 1:
                    step_no-=1
        remaining_steps = max_steps - step_no
        if len(calls) < 1 or remaining_steps <= 1:
            remind_terminate_message = ""
            if remaining_steps <= 1:
                remind_terminate_message = "\n\n\nThe maximum number of steps has been reached. Please check the screenshot. Return 'TERMINATE' if the task is completed, or reply with 'INFEASIBLE' if the task is impossible to complete due to environmental constraints."
            # Re-state the task (and, on the final step, request a verdict).
            history_inputs.append(message_formater.create_user_message(text=default_reply + remind_terminate_message))

        assert len(calls) <= 1, f"Unexpected assistant content: {content_text} \n {calls}"

        logger.info(f"--------------------------------CUA Step {step_no+1}--------------------------------")
        response = call_openai_cua(client, history_inputs, cua_model, logger=logger, tts_step=tts_step, summary_info=[summary_info,curr_obs,instruction], client_password=client_password)
    # Step budget exhausted without TERMINATE: record as failed/infeasible.
    traj.append({"type": "INFEASIBLE"})
    env.step("FAIL", sleep_after_execution)
    return reasoning, traj
|
||||
|
|
@ -62,7 +62,7 @@ class LMMEngineOpenAI:
|
|||
self.model = model
|
||||
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if api_key is None and os.getenv("X_API_KEY") is None:
|
||||
if api_key is None:
|
||||
raise ValueError(
|
||||
"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY"
|
||||
)
|
||||
|
|
@ -72,10 +72,10 @@ class LMMEngineOpenAI:
|
|||
self.api_key = api_key
|
||||
self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit
|
||||
|
||||
if api_key:
|
||||
if not self.base_url:
|
||||
self.llm_client = OpenAI(api_key=self.api_key)
|
||||
else:
|
||||
self.llm_client = client = OpenAI(base_url=os.getenv("X_API_URL"), api_key="dummy", default_headers = {"X-Api-Key": os.getenv("X_API_KEY")})
|
||||
self.llm_client = OpenAI(base_url=self.base_url, api_key=self.api_key)
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60
|
||||
|
|
@ -425,7 +425,6 @@ class OSWorldACI:
|
|||
platform: 'linux',
|
||||
width: int = 1920,
|
||||
height: int = 1080,
|
||||
model: str = "o3",
|
||||
):
|
||||
self.platform = (
|
||||
platform # Dictates how the switch_applications agent action works.
|
||||
|
|
@ -433,7 +432,7 @@ class OSWorldACI:
|
|||
|
||||
engine_params_for_generation = engine_params = {
|
||||
"engine_type": 'openai',
|
||||
"model": model,
|
||||
"model": 'o3',
|
||||
"base_url": '',
|
||||
"api_key": os.environ.get("OPENAI_API_KEY", ""),
|
||||
}
|
||||
|
|
@ -1,190 +0,0 @@
|
|||
"""
|
||||
Hosted GBOX Agent Client
|
||||
Thin HTTP wrapper that calls the hosted GBOX service
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import requests
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
logger = logging.getLogger("hosted-gbox-agent")
|
||||
|
||||
|
||||
class HostedGboxAgent:
|
||||
"""
|
||||
Client wrapper for hosted GBOX service.
|
||||
Follows the same interface as other OSWorld agents but delegates execution to remote service.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
server_url: str,
|
||||
api_key: str,
|
||||
vm_ip: str,
|
||||
platform: str = "ubuntu",
|
||||
model: str = "claude-sonnet-4-5",
|
||||
max_steps: int = 15,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Initialize hosted agent client
|
||||
|
||||
Args:
|
||||
server_url: URL of hosted GBOX service (e.g., "http://44.201.221.203:8000")
|
||||
api_key: API key for authentication
|
||||
vm_ip: IP address of the VM to control
|
||||
platform: OS platform (ubuntu/windows)
|
||||
model: Claude model to use
|
||||
max_steps: Maximum steps per task
|
||||
"""
|
||||
self.server_url = server_url.rstrip('/')
|
||||
self.api_key = api_key
|
||||
self.vm_ip = vm_ip
|
||||
self.platform = platform
|
||||
self.model = model
|
||||
self.max_steps = max_steps
|
||||
self.runtime_logger = None
|
||||
|
||||
# HTTP client with timeout
|
||||
self.client = requests.Session()
|
||||
self.client.headers.update({"X-API-Key": api_key})
|
||||
|
||||
logger.info(f"Initialized hosted agent client for VM {vm_ip}")
|
||||
logger.info(f"Server: {server_url}, Model: {model}")
|
||||
|
||||
def reset(self, runtime_logger=None, vm_ip: str = None):
|
||||
"""
|
||||
Reset agent state (called by OSWorld before each task)
|
||||
|
||||
Args:
|
||||
runtime_logger: Logger instance for OSWorld runtime logs
|
||||
vm_ip: Updated VM IP (in case of snapshot revert)
|
||||
"""
|
||||
self.runtime_logger = runtime_logger
|
||||
|
||||
if vm_ip:
|
||||
self.vm_ip = vm_ip
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.info(f"[HOSTED] Updated VM IP to {vm_ip}")
|
||||
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.info(f"[HOSTED] Agent reset for VM {self.vm_ip}")
|
||||
|
||||
def predict(self, instruction: str, obs: Dict) -> Tuple[str, List[str]]:
|
||||
"""
|
||||
Execute task prediction (one call = full task execution)
|
||||
|
||||
Args:
|
||||
instruction: Task instruction
|
||||
obs: Observation dict (not used - agent fetches its own screenshots)
|
||||
|
||||
Returns:
|
||||
(reasoning_text, actions_list)
|
||||
- reasoning_text: Claude's reasoning/explanation
|
||||
- actions_list: ["DONE"] or ["FAIL"] or PyAutoGUI code
|
||||
"""
|
||||
try:
|
||||
# Prepare request (no screenshot needed - agent fetches its own)
|
||||
payload = {
|
||||
"vm_ip": self.vm_ip,
|
||||
"instruction": instruction,
|
||||
"platform": self.platform,
|
||||
"model": self.model,
|
||||
"max_steps": self.max_steps
|
||||
}
|
||||
|
||||
# Log request
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.info(f"[HOSTED] Sending task to service...")
|
||||
self.runtime_logger.info(f"[HOSTED] Instruction: {instruction[:100]}...")
|
||||
|
||||
# Call hosted service (this may take several minutes)
|
||||
response = self.client.post(
|
||||
f"{self.server_url}/execute",
|
||||
json=payload,
|
||||
timeout=3600 # 60 minutes timeout for full task execution
|
||||
)
|
||||
|
||||
# Check for errors
|
||||
if response.status_code == 401:
|
||||
raise RuntimeError("Authentication failed - invalid API key")
|
||||
elif response.status_code != 200:
|
||||
raise RuntimeError(f"Service returned {response.status_code}: {response.text}")
|
||||
|
||||
# Parse response
|
||||
result = response.json()
|
||||
reasoning = result.get("reasoning", "")
|
||||
actions = result.get("actions", ["FAIL"])
|
||||
logs = result.get("logs", "")
|
||||
session_id = result.get("session_id", "unknown")
|
||||
|
||||
# Forward server logs to OSWorld's runtime logger
|
||||
if logs and self.runtime_logger:
|
||||
for line in logs.split('\n'):
|
||||
if line.strip():
|
||||
self.runtime_logger.info(f"[SERVER] {line}")
|
||||
|
||||
# Log results
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.info(f"[HOSTED] Session ID: {session_id}")
|
||||
self.runtime_logger.info(f"[HOSTED] Actions: {actions}")
|
||||
self.runtime_logger.info(f"[HOSTED] Reasoning: {reasoning[:200]}...")
|
||||
|
||||
return reasoning, actions
|
||||
|
||||
except requests.Timeout:
|
||||
error_msg = "Service timeout (task took longer than 60 minutes)"
|
||||
logger.error(error_msg)
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.error(f"[HOSTED] {error_msg}")
|
||||
return f"ERROR: {error_msg}", ["FAIL"]
|
||||
|
||||
except requests.ConnectionError as e:
|
||||
error_msg = f"Cannot connect to service at {self.server_url}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.error(f"[HOSTED] {error_msg}")
|
||||
return f"ERROR: {error_msg}", ["FAIL"]
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Hosted agent error: {str(e)}"
|
||||
logger.error(error_msg, exc_info=True)
|
||||
if self.runtime_logger:
|
||||
self.runtime_logger.error(f"[HOSTED] {error_msg}")
|
||||
return f"ERROR: {error_msg}", ["FAIL"]
|
||||
|
||||
def close(self):
|
||||
"""Close HTTP session"""
|
||||
self.client.close()
|
||||
|
||||
def __del__(self):
|
||||
"""Cleanup on deletion"""
|
||||
try:
|
||||
self.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
# Factory function for compatibility with OSWorld runner
|
||||
def create_agent(vm_ip: str, **kwargs) -> HostedGboxAgent:
|
||||
"""
|
||||
Factory function to create hosted agent
|
||||
|
||||
Expects environment variables:
|
||||
- GBOX_SERVICE_URL: URL of hosted service
|
||||
- GBOX_SERVICE_API_KEY: API key for authentication
|
||||
"""
|
||||
server_url = os.getenv("GBOX_SERVICE_URL")
|
||||
api_key = os.getenv("GBOX_SERVICE_API_KEY")
|
||||
|
||||
if not server_url:
|
||||
raise ValueError("GBOX_SERVICE_URL environment variable not set")
|
||||
if not api_key:
|
||||
raise ValueError("GBOX_SERVICE_API_KEY environment variable not set")
|
||||
|
||||
return HostedGboxAgent(
|
||||
server_url=server_url,
|
||||
api_key=api_key,
|
||||
vm_ip=vm_ip,
|
||||
**kwargs
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,3 +0,0 @@
|
|||
from mm_agents.opencua.opencua_agent import OpenCUAAgent
|
||||
|
||||
__all__ = ["OpenCUAAgent"]
|
||||
|
|
@ -1,470 +0,0 @@
|
|||
"""
|
||||
OpenCUA Agent Implementation
|
||||
|
||||
This module implements an OpenCUA agent for desktop automation tasks, building upon
|
||||
existing frameworks and integrating multiple coordinate mapping systems.
|
||||
|
||||
Framework and Implementation Sources:
|
||||
- Main framework structure follows: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py
|
||||
- Agent implementation adapted from: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/aguvis_agent.py
|
||||
- Qwen2.5-VL coordinate mapping from: https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import ast
|
||||
import time
|
||||
import math
|
||||
import httpx
|
||||
import base64
|
||||
import backoff
|
||||
import traceback
|
||||
from loguru import logger
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from mm_agents.opencua.utils import (
|
||||
encode_image,
|
||||
smart_resize,
|
||||
)
|
||||
from mm_agents.opencua.prompts import (
|
||||
INSTRUTION_TEMPLATE,
|
||||
STEP_TEMPLATE,
|
||||
ACTION_HISTORY_TEMPLATE,
|
||||
THOUGHT_HISTORY_TEMPLATE,
|
||||
OBSERVATION_HISTORY_TEMPLATE,
|
||||
# OpenCUA-7B, 32B system prompts
|
||||
SYSTEM_PROMPT_V1_L1,
|
||||
SYSTEM_PROMPT_V1_L2,
|
||||
SYSTEM_PROMPT_V1_L3,
|
||||
# OpenCUA-72B system prompts
|
||||
build_sys_prompt,
|
||||
)
|
||||
|
||||
def parse_response_to_cot_and_action(input_string, screen_size, coordinate_type) -> Tuple[str, List[str], dict]:
|
||||
"""Parse response including Observation, Thought, Action and code block"""
|
||||
sections = {}
|
||||
try:
|
||||
|
||||
obs_match = re.search(r'^##\s*Observation\s*:?[\n\r]+(.*?)(?=^##\s*Thought:|^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
|
||||
if obs_match:
|
||||
sections['observation'] = obs_match.group(1).strip()
|
||||
|
||||
thought_match = re.search(r'^##\s*Thought\s*:?[\n\r]+(.*?)(?=^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
|
||||
if thought_match:
|
||||
sections['thought'] = thought_match.group(1).strip()
|
||||
|
||||
action_match = re.search(r'^##\s*Action\s*:?[\n\r]+(.*?)(?=^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
|
||||
if action_match:
|
||||
action = action_match.group(1).strip()
|
||||
sections['action'] = action.strip()
|
||||
|
||||
code_blocks = re.findall(r'```(?:code|python)?\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
|
||||
if not code_blocks:
|
||||
logger.error("No code blocks found in the input string")
|
||||
return f"<Error>: no code blocks found in the input string: {input_string}", ["FAIL"], sections
|
||||
code_block = code_blocks[-1].strip()
|
||||
sections['original_code'] = code_block
|
||||
|
||||
if "computer.wait" in code_block.lower():
|
||||
sections["code"] = "WAIT"
|
||||
return sections['action'], ["WAIT"], sections
|
||||
|
||||
elif "computer.terminate" in code_block.lower():
|
||||
lower_block = code_block.lower()
|
||||
if ("failure" in lower_block) or ("fail" in lower_block):
|
||||
sections['code'] = "FAIL"
|
||||
return code_block, ["FAIL"], sections
|
||||
elif "success" in lower_block:
|
||||
sections['code'] = "DONE"
|
||||
return code_block, ["DONE"], sections
|
||||
else:
|
||||
logger.error("Terminate action found but no specific status provided in code block")
|
||||
return f"<Error>: terminate action found but no specific status provided in code block: {input_string}", ["FAIL"], sections
|
||||
|
||||
# corrected_code = correct_pyautogui_arguments(code_block)
|
||||
corrected_code = code_block
|
||||
sections['code'] = corrected_code
|
||||
sections['code'] = project_coordinate_to_absolute_scale(corrected_code, screen_width=screen_size[0], screen_height=screen_size[1], coordinate_type=coordinate_type)
|
||||
|
||||
if ('code' not in sections or sections['code'] is None or sections['code'] == "") or ('action' not in sections or sections['action'] is None or sections['action'] == ""):
|
||||
logger.error("Missing required action or code section")
|
||||
return f"<Error>: no code parsed: {input_string}", ["FAIL"], sections
|
||||
|
||||
return sections['action'], [sections['code']], sections
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"<Error>: parsing response: {str(e)}\nTraceback:\n{traceback.format_exc()}\nInput string: {input_string}"
|
||||
logger.error(error_message)
|
||||
return error_message, ['FAIL'], sections
|
||||
|
||||
def project_coordinate_to_absolute_scale(pyautogui_code_relative_coordinates, screen_width, screen_height, coordinate_type="relative"):
|
||||
"""
|
||||
Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.
|
||||
"""
|
||||
def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
|
||||
if coordinate_type == "relative":
|
||||
return int(round(x * screen_width)), int(round(y * screen_height))
|
||||
elif coordinate_type == "qwen25":
|
||||
height, width = smart_resize(
|
||||
height=screen_height,
|
||||
width=screen_width,
|
||||
factor=28,
|
||||
min_pixels=3136,
|
||||
max_pixels=12845056
|
||||
)
|
||||
if 0 <= x <= 1 and 0 <= y <= 1:
|
||||
# If already normalized, treat like "relative"
|
||||
return int(round(x * width)), int(round(y * height))
|
||||
return int(x / width * screen_width), int(y / height * screen_height)
|
||||
else:
|
||||
raise ValueError(f"Invalid coordinate type: {coordinate_type}. Expected one of ['relative', 'relative1000', 'absolute', 'qwen25'].")
|
||||
|
||||
pattern = r'(pyautogui\.\w+\([^\)]*\))'
|
||||
matches = re.findall(pattern, pyautogui_code_relative_coordinates)
|
||||
|
||||
new_code = pyautogui_code_relative_coordinates
|
||||
|
||||
for full_call in matches:
|
||||
func_name_pattern = r'(pyautogui\.\w+)\((.*)\)'
|
||||
func_match = re.match(func_name_pattern, full_call, re.DOTALL)
|
||||
if not func_match:
|
||||
continue
|
||||
|
||||
func_name = func_match.group(1)
|
||||
args_str = func_match.group(2)
|
||||
|
||||
try:
|
||||
parsed = ast.parse(f"func({args_str})").body[0].value
|
||||
parsed_args = parsed.args
|
||||
parsed_keywords = parsed.keywords
|
||||
|
||||
except SyntaxError:
|
||||
return pyautogui_code_relative_coordinates
|
||||
|
||||
function_parameters = {
|
||||
'click': ['x', 'y', 'clicks', 'interval', 'button', 'duration', 'pause'],
|
||||
'rightClick': ['x', 'y', 'duration', 'tween', 'pause'],
|
||||
'middleClick': ['x', 'y', 'duration', 'tween', 'pause'],
|
||||
'doubleClick': ['x', 'y', 'interval', 'button', 'duration', 'pause'],
|
||||
'tripleClick': ['x', 'y', 'interval', 'button', 'duration', 'pause'],
|
||||
'moveTo': ['x', 'y', 'duration', 'tween', 'pause'],
|
||||
'dragTo': ['x', 'y', 'duration', 'button', 'mouseDownUp', 'pause'],
|
||||
}
|
||||
|
||||
func_base_name = func_name.split('.')[-1]
|
||||
|
||||
param_names = function_parameters.get(func_base_name, [])
|
||||
|
||||
args = {}
|
||||
for idx, arg in enumerate(parsed_args):
|
||||
if idx < len(param_names):
|
||||
param_name = param_names[idx]
|
||||
arg_value = ast.literal_eval(arg)
|
||||
args[param_name] = arg_value
|
||||
|
||||
try:
|
||||
for kw in parsed_keywords:
|
||||
param_name = kw.arg
|
||||
arg_value = ast.literal_eval(kw.value)
|
||||
args[param_name] = arg_value
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing keyword arguments: {e}")
|
||||
return pyautogui_code_relative_coordinates
|
||||
|
||||
updated = False
|
||||
if 'x' in args and 'y' in args:
|
||||
try:
|
||||
x_rel = float(args['x'])
|
||||
y_rel = float(args['y'])
|
||||
x_abs, y_abs = _coordinate_projection(x_rel, y_rel, screen_width, screen_height, coordinate_type)
|
||||
logger.warning(f"Projecting coordinates: ({x_rel}, {y_rel}) to ({x_abs}, {y_abs}) using {coordinate_type} projection.")
|
||||
args['x'] = x_abs
|
||||
args['y'] = y_abs
|
||||
updated = True
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if updated:
|
||||
reconstructed_args = []
|
||||
for idx, param_name in enumerate(param_names):
|
||||
if param_name in args:
|
||||
arg_value = args[param_name]
|
||||
if isinstance(arg_value, str):
|
||||
arg_repr = f"'{arg_value}'"
|
||||
else:
|
||||
arg_repr = str(arg_value)
|
||||
reconstructed_args.append(arg_repr)
|
||||
else:
|
||||
break
|
||||
|
||||
used_params = set(param_names[:len(reconstructed_args)])
|
||||
for kw in parsed_keywords:
|
||||
if kw.arg not in used_params:
|
||||
arg_value = args[kw.arg]
|
||||
if isinstance(arg_value, str):
|
||||
arg_repr = f"{kw.arg}='{arg_value}'"
|
||||
else:
|
||||
arg_repr = f"{kw.arg}={arg_value}"
|
||||
reconstructed_args.append(arg_repr)
|
||||
|
||||
new_args_str = ', '.join(reconstructed_args)
|
||||
new_full_call = f"{func_name}({new_args_str})"
|
||||
new_code = new_code.replace(full_call, new_full_call)
|
||||
|
||||
return new_code
|
||||
|
||||
def transform_agnet_action_to_code_block(action):
|
||||
if any(keyword in action for keyword in ["computer.terminate", "computer.wait", "browser.select_option", "browser.clear"]):
|
||||
return f"```code\n{action}\n```"
|
||||
else:
|
||||
return f"```python\n{action}\n```"
|
||||
|
||||
class OpenCUAAgent:
|
||||
"""
|
||||
OpenCUA Agent for desktop automation tasks.
|
||||
|
||||
This class implements a OpenCUA Model based agent that can observe
|
||||
desktop environments through screenshots and execute mouse/keyboard actions
|
||||
via PyAutoGUI to complete automation tasks.
|
||||
|
||||
Attributes:
|
||||
model (str): Name of the language model being used
|
||||
history_type (str): Type of history recording mechanism
|
||||
actions (list): History of executed actions
|
||||
observations (list): History of environment observations
|
||||
cots (list): Chain of thought reasoning records
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
model: str, # OpenCUA model name
|
||||
history_type: str, # History step type: action_history, thought_history, observation_history
|
||||
max_steps: int, # The max number of steps to finish the task
|
||||
max_image_history_length: int = 3, # The max number of images in the history
|
||||
platform: str = "ubuntu", # The platform of the computer
|
||||
max_tokens: int = 1500, # The max number of tokens in the response
|
||||
top_p: float = 0.9, # The top p value in the response
|
||||
temperature: float = 0, # The temperature value in the response
|
||||
action_space: str = "pyautogui", # The action space: pyautogui
|
||||
observation_type: str = "screenshot", # The observation type: screenshot
|
||||
cot_level: str = "l2", # The CoT level: l1, l2, l3
|
||||
screen_size: Tuple[int, int] = (1920, 1080), # The screen size
|
||||
coordinate_type: str = "relative", # The coordinate type: relative, absolute, qwen25
|
||||
use_old_sys_prompt: bool = False, # Whether to use the old system prompt
|
||||
password="osworld-public-evaluation", # The password for the ubuntu platform
|
||||
**kwargs
|
||||
):
|
||||
assert coordinate_type in ["relative", "absolute", "qwen25"]
|
||||
assert action_space in ["pyautogui"], "Invalid action space"
|
||||
assert observation_type in ["screenshot"], "Invalid observation type"
|
||||
assert history_type in ["action_history", "thought_history", "observation_history"]
|
||||
assert model is not None, "Model cannot be None"
|
||||
|
||||
self.model = model
|
||||
self.platform = platform
|
||||
self.max_tokens = max_tokens
|
||||
self.top_p = top_p
|
||||
self.temperature = temperature
|
||||
self.action_space = action_space
|
||||
self.observation_type = observation_type
|
||||
self.history_type = history_type
|
||||
self.coordinate_type = coordinate_type
|
||||
self.cot_level = cot_level
|
||||
self.screen_size = screen_size
|
||||
self.max_image_history_length = max_image_history_length
|
||||
self.max_steps = max_steps
|
||||
self.password = password
|
||||
|
||||
if history_type == "action_history":
|
||||
self.HISTORY_TEMPLATE = ACTION_HISTORY_TEMPLATE
|
||||
elif history_type == "thought_history":
|
||||
self.HISTORY_TEMPLATE = THOUGHT_HISTORY_TEMPLATE
|
||||
elif history_type == "observation_history":
|
||||
self.HISTORY_TEMPLATE = OBSERVATION_HISTORY_TEMPLATE
|
||||
else:
|
||||
raise ValueError(f"Invalid history type: {history_type}")
|
||||
|
||||
if use_old_sys_prompt:
|
||||
if cot_level == "l1":
|
||||
self.system_prompt = SYSTEM_PROMPT_V1_L1
|
||||
elif cot_level == "l2":
|
||||
self.system_prompt = SYSTEM_PROMPT_V1_L2
|
||||
elif cot_level == "l3":
|
||||
self.system_prompt = SYSTEM_PROMPT_V1_L3
|
||||
else:
|
||||
raise ValueError("Invalid cot_level. Choose from 'l1', 'l2', or 'l3'.")
|
||||
else:
|
||||
self.system_prompt = build_sys_prompt(
|
||||
level=self.cot_level,
|
||||
password=self.password,
|
||||
use_random=False
|
||||
)
|
||||
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.cots = []
|
||||
|
||||
def reset(self, _logger=None):
|
||||
global logger
|
||||
logger = _logger if _logger is not None else logging.getLogger("desktopenv.agent")
|
||||
|
||||
self.observations = []
|
||||
self.cots = []
|
||||
self.actions = []
|
||||
|
||||
def _scale_scroll_for_windows(self, code: str, factor: int = 50) -> str:
|
||||
""" pyautogui.scroll has a different scale on Ubuntu and Windows, multiple 'factor' when scrolling on Windows system"""
|
||||
if self.platform.lower() != "windows":
|
||||
return code
|
||||
|
||||
pattern_pos = re.compile(r'(pyautogui\.scroll\()\s*([-+]?\d+)\s*\)')
|
||||
code = pattern_pos.sub(lambda m: f"{m.group(1)}{int(m.group(2))*factor})", code)
|
||||
return code
|
||||
|
||||
def predict(self, instruction: str, obs: Dict, **kwargs) -> Tuple[str, List[str], Dict]:
|
||||
"""
|
||||
Predict the next action(s) based on the current observation.
|
||||
"""
|
||||
if "step_idx" in kwargs:
|
||||
logger.info(f"========= {self.model} Step {kwargs['step_idx']} =======")
|
||||
else:
|
||||
logger.info(f"========================== {self.model} ===================================")
|
||||
logger.info(f"Instruction: \n{instruction}")
|
||||
|
||||
messages = []
|
||||
messages.append({
|
||||
"role": "system",
|
||||
"content": self.system_prompt
|
||||
})
|
||||
instruction_prompt = INSTRUTION_TEMPLATE.format(instruction=instruction)
|
||||
|
||||
history_step_texts = []
|
||||
for i in range(len(self.actions)):
|
||||
if i > len(self.actions) - self.max_image_history_length:
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{encode_image(self.observations[i]['screenshot'])}"}
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
|
||||
observation=self.cots[i].get('observation'),
|
||||
thought=self.cots[i].get('thought'),
|
||||
action=self.cots[i].get('action')
|
||||
)
|
||||
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": history_content
|
||||
})
|
||||
else:
|
||||
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
|
||||
observation=self.cots[i].get('observation'),
|
||||
thought=self.cots[i].get('thought'),
|
||||
action=self.cots[i].get('action')
|
||||
)
|
||||
history_step_texts.append(history_content)
|
||||
if i == len(self.actions) - self.max_image_history_length:
|
||||
messages.append({
|
||||
"role":"assistant",
|
||||
"content": "\n".join(history_step_texts)
|
||||
})
|
||||
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{encode_image(obs['screenshot'])}"}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": instruction_prompt
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
max_retry = 5
|
||||
retry_count = 0
|
||||
low_level_instruction = None
|
||||
pyautogui_actions = None
|
||||
other_cot = {}
|
||||
|
||||
while retry_count < max_retry:
|
||||
try:
|
||||
response = self.call_llm({
|
||||
"model": self.model,
|
||||
"messages": messages,
|
||||
"max_tokens": self.max_tokens,
|
||||
"top_p": self.top_p,
|
||||
"temperature": self.temperature if retry_count==0 else max(0.2, self.temperature)
|
||||
}, self.model)
|
||||
|
||||
logger.info(f"Model Output: \n{response}")
|
||||
if not response:
|
||||
logger.error("No response found in the response.")
|
||||
raise ValueError(f"No response found in the response:\n{response}.")
|
||||
|
||||
low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
|
||||
if "<Error>" in low_level_instruction or not pyautogui_actions:
|
||||
logger.error(f"Error parsing response: {low_level_instruction}")
|
||||
raise ValueError(f"Error parsing response: {low_level_instruction}")
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during message preparation: {e}")
|
||||
retry_count += 1
|
||||
if retry_count == max_retry:
|
||||
logger.error("Maximum retries reached. Exiting.")
|
||||
return str(e), ['FAIL'], other_cot
|
||||
|
||||
pyautogui_actions = [
|
||||
self._scale_scroll_for_windows(code) for code in pyautogui_actions
|
||||
]
|
||||
logger.info(f"Action: \n{low_level_instruction}")
|
||||
logger.info(f"Code: \n{pyautogui_actions}")
|
||||
|
||||
self.observations.append(obs)
|
||||
self.actions.append(low_level_instruction)
|
||||
self.cots.append(other_cot)
|
||||
|
||||
current_step = len(self.actions)
|
||||
if current_step >= self.max_steps and 'computer.terminate' not in pyautogui_actions[0].lower():
|
||||
logger.warning(f"Reached maximum steps {self.max_steps}. Forcing termination.")
|
||||
low_level_instruction = 'Fail the task because reaching the maximum step limit.'
|
||||
pyautogui_actions = ['FAIL']
|
||||
other_cot['code'] = 'FAIL'
|
||||
|
||||
return response, pyautogui_actions, other_cot
|
||||
|
||||
|
||||
def call_llm(self, payload, model):
|
||||
"""Call the LLM API"""
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {os.environ['OPENCUA_API_KEY']}"
|
||||
}
|
||||
|
||||
for _ in range(20):
|
||||
response = httpx.post(
|
||||
f"https://{self.model}.app.msh.team/v1/chat/completions",
|
||||
headers=headers,
|
||||
json=payload,
|
||||
timeout=500,
|
||||
verify=False
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
logger.error("Failed to call LLM: " + response.text)
|
||||
logger.error("Retrying...")
|
||||
time.sleep(5)
|
||||
else:
|
||||
response = response.json()
|
||||
finish_reason = response["choices"][0].get("finish_reason")
|
||||
if finish_reason is not None and finish_reason == "stop": # for most of the time, length will not exceed max_tokens
|
||||
return response['choices'][0]['message']['content']
|
||||
else:
|
||||
logger.error("LLM did not finish properly, retrying...")
|
||||
time.sleep(5)
|
||||
|
|
@ -1,349 +0,0 @@
|
|||
import random
|
||||
|
||||
# System prompt for OpenCUA-7B, OpenCUA-32B
# System prompts used in the training data
# L1 = action only; L2 = thought + action; L3 = observation + thought + action.
SYSTEM_PROMPT_V1_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
# NOTE(review): this first SYSTEM_PROMPT_V1_L2 definition is dead code — it is
# unconditionally overwritten by the second assignment below at import time.
SYSTEM_PROMPT_V1_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
SYSTEM_PROMPT_V1_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()

# Testing prompt on OSWorld-Verified
# NOTE(review): this evaluation prompt uses enum value "failure" while the
# training-data prompts above use "fail" — confirm this mismatch is intended.
SYSTEM_PROMPT_V1_L2 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. The password of the computer is "osworld-public-evaluation". If the task is not possible to do, output the action computer.terminate(status='failure').

For each step, provide your response in this format:

Thought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning

Action:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize "—", maximize "□", close "X")\n - if the action involves keyboard actions like \'press\', \'write\', \'hotkey\':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions

Finally, output the action as PyAutoGUI code or the following functions:
- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
""".strip()
|
||||
|
||||
|
||||
# SYSTEM_PROMPT for OpenCUA-72B
# Paraphrased variants of the top-level agent instruction. Each contains a
# literal "{password}" placeholder that build_sys_prompt substitutes via
# str.format; variant 0 is the canonical one, the rest diversify prompts.
general_computer_instructions = [
    """
You are a GUI agent. You are given a task, a screenshot of the screen and your previous interactions with the computer. You need to perform a series of actions to complete the task. The password of the computer is "{password}", use it when you need sudo rights. You need to **wait** explicitly for installation, waiting website loading or running commands to finish. Don\'t terminate the task unless you are sure the task is finished. If you find that you can\'t finish the task, or the task is not finished exactly as the instruction indicates (you have made progress but not finished the task completely), or the task is impossible to complete, you must report **failure**.
""".strip(),
    """
You are acting as a GUI agent. A task description, a screenshot, and your past interactions will be supplied. Execute the necessary steps to fulfil the task. Whenever sudo operations are required, use the computer’s password "{password}". Insert an explicit **wait** after launching any installation, waiting website loading or long-running command to let it finish. Do not output terminate action unless you are certain the task is complete. If you realise the task can be finished or impossible to do, you should report **failure**.
""".strip(),
    """
Your mission as a GUI agent is to complete the provided task using the current screen image and the history of interactions. For commands requiring elevated privileges, supply "{password}" as the sudo password. Explicitly invoke **wait** after launching any installation or command that may take time to finish. Do not terminate the session unless success is certain. If the task cannot be fully executed, or turns out impossible, you must declare **failure**.
""".strip(),
]
|
||||
|
||||
# Per-step response-format skeletons shown to the model verbatim, one per
# reasoning level. The {curly} tokens here are placeholders in the prompt
# text for the model to fill in, not Python format fields.
l3_format_instruction = """For each step, provide your response in this format:
# Step: {step number}
## Observation:
{observation}
## Thought:
{thought}
## Action:
{action}
## Code:
{code}"""

# L2 drops the Observation section.
l2_format_instruction = """For each step, provide your response in this format:
# Step: {step number}
## Thought:
{thought}
## Action:
{action}
## Code:
{code}"""

# L1 keeps only Action and Code.
l1_format_instruction = """For each step, provide your response in this format:
# Step: {step number}
## Action:
{action}
## Code:
{code}"""
|
||||
|
||||
# Paraphrased variants of the "Observation" section instruction; variant 0 is
# canonical, the rest diversify prompts when build_sys_prompt samples randomly.
# NOTE(review): leading indentation inside these multi-line strings appears to
# have been stripped in this view — verify nesting against the original file.
observation_instructions = [
    """For the Observation section, you should include the following parts if helpful:
- Describe the current computer state based on the full screenshot in detail.
- Application Context:
- The active application
- The active window or page
- Overall layout and visible interface
- Key Elements:
- Menu items and toolbars
- Buttons and controls
- Text fields and content
- Dialog boxes or popups
- Error messages or notifications
- Loading states
- Other key elements
- Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).
""".strip(),

    """In the Observation section, outline everything visible on screen that could influence your next move:
• Current system state as seen in the screenshot.
• Application context:
- Which application is running in the foreground
- Specific window, tab, or page being displayed
- High-level layout of panels, sidebars, and work areas
• Salient interface elements:
- Menus, ribbons, and toolbars
- Actionable buttons, icons, toggles, and controls
- Input areas such as text boxes or code editors
- Pop-up dialogs, modals, alerts, or system notifications
- Progress bars, spinners, or other loading indicators
• Any text, labels, shapes, or on-screen cues that might help accomplish the task (cite names or visual traits when available).
""".strip(),

    # ── Variant 3 ──────────────────────────────────────────────────────────
    """Write the Observation section as a thorough snapshot of the UI:
- Start with a full-screen description: what the user sees at a glance.
- Give application details: title, active workspace, and structural layout.
- Enumerate critical elements:
* Navigation menus and context bars
* Primary and secondary buttons or icons
* Editable fields, lists, tables, or rich-text areas
* Dialogs, pop-ups, warnings, or confirmations
* Indicators of loading or processing activity
- Note any evidence, hints, or data (textual or visual) that could guide the task toward completion, referencing names, colors, shapes, or positions when explicit identifiers are missing.
""".strip(),
]
|
||||
|
||||
# Paraphrased variants of the "Thought" section instruction (reflection,
# progress audit, next-action prediction, text-input rules).
# NOTE(review): "correnctness" and "currect" below are typos in the runtime
# prompt text, preserved as-is; fixing them would change model input.
# NOTE(review): leading indentation inside these strings appears stripped in
# this view — verify nesting against the original file.
thought_instructions = [
    """For the Thought section, you should include the following parts:
- Reflection on the task when there is previous action:
- Consider the correnctness of previous action and its outcomes
- If the previous action was correct, describe the change in the state of the computer and reason
- If the previous action was incorrect, reflect on what went wrong and why
- Step by Step Progress Assessment:
- Add necessary information according to the history screenshots, former actions and current screenshot.
- Analyze what parts of the task have already been completed and how they contribute to the overall goal.
- Make a plan on how to complete the task based on the history and currect screenshot.
- Next Action Prediction:
- Propose the most possible next action and state the reason
- For Text Input Actions:
- Note current cursor position
- Consolidate repetitive actions (specify count for multiple keypresses)
- Describe expected final text outcome
- Use first-person perspective in reasoning
""".strip(),

    """
In the **Thought** block, cover these topics:

1. **Last-Step Reflection** (when a prior action exists)
• Was my previous action correct? What evidence shows this?
• If it succeeded, what state change occurred and why?
• If it failed, where did I go wrong?

2. **Incremental Progress Audit**
• Which sub-tasks are completed and how do they advance the mission?
• Make a plan to finish the task based on past actions and the current UI state.

3. **Foresight for the Coming Action**
• Predict the most logical next step.
• State the reason why it is the best choice given the current context.

4. **Guidance for Text Entry**
• Note the cursor location
• Compress multiple identical keystrokes (e.g., “press Backspace ×3”)
• Clarify the exact text expected after input

Use first-person inner dialogue throughout.
""".strip(),

    """
Compose your **Thought** section as an internal monologue that includes:

- **Retrospective** (if a prior step exists):
* Evaluate the accuracy and effect of the last action.
* If it was successful, reason about the resulting interface change.
* If it was faulty, diagnose the misstep and its cause.

- **Ongoing Progress Evaluation**:
* Outline which parts of the task are done and their impact on the overall objective.
* Suggest a plan to complete the task based on past history and the current screen.

- **Decision Framework for the Next Move**:
* Brainstorm possible next action given the present state.
* Explain why this action is the most logical choice.

- **Special Rules for Keyboard Input**:
* Specify current cursor focus or field.
* Merge repeated keypresses into counts for brevity.
* Describe the intended final text after typing.

Maintain a first-person voice for clarity of reasoning.
""".strip(),
]
|
||||
|
||||
# Paraphrased variants of the "Action" section instruction (one-sentence,
# coordinate-free action descriptions; keyboard consolidation rules).
# NOTE(review): leading indentation inside these strings appears stripped in
# this view — verify nesting against the original file.
action_instructions = [
    """For the action section, you should provide clear, concise, and actionable instructions in one sentence.
- If the action involves interacting with a specific target:
- Describe target explicitly (if multiple elements share that name, you should distinguish the target) without using coordinates
- Specify element names when possible (use original language if non-English)
- Describe features (shape, color, position) if name unavailable
- If the action involves keyboard actions like 'press', 'write', 'hotkey':
- Consolidate repetitive keypresses with count
- Specify expected text outcome for typing actions
""".strip(),

    """
Write the **Action** in one short, direct sentence.

• When clicking or otherwise interacting with a UI element:
- Name the element explicitly — and, if multiple elements share that name, add a distinguishing detail.
- Do **not** give coordinates.
- Use the element's label (keep original language when it isn't English).
- If unnamed, describe recognisable traits (shape, colour, on-screen position).

• When using the keyboard (press, type, hotkey):
- Collapse repeated key presses into counts.
- For typing, specify the text that should appear.
""".strip(),

    """
Provide the **Action** as a single, crisp imperative sentence.

- Mouse/GUI interactions:
* Identify the target by name, and if duplicate names exist, clarify which one you mean.
* Do not supply XY coordinates.
* Preserve non-English labels verbatim.
* If unnamed, describe the element's look or location (colour, shape, relative position).

- Keyboard operations (press, write, hotkey):
* Combine repeated keystrokes with a multiplier.
* State the exact text that will be entered.
""".strip(),
]
|
||||
|
||||
code_instrucion = """For the code section, you should output the corresponding code for the action. The code should be either PyAutoGUI code or one of the following functions warped in the code block:
|
||||
- {"name": "computer.wait", "description": "Make the computer wait for 20 seconds for installation, running code, etc.", "parameters": {"type": "object", "properties": {}, "required": []}}
|
||||
- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}, {"answer": {"type": "string", "description": "The answer of the task"}}, "required": ["status"]}}
|
||||
Examples for the code section:
|
||||
```python
|
||||
pyautogui.click(x=123, y=456)
|
||||
```
|
||||
```code
|
||||
computer.terminate(status="success")
|
||||
```
|
||||
```code
|
||||
computer.terminate(status="success", answer='''text''')
|
||||
```"""
|
||||
|
||||
# Outer V2 prompt templates assembled by build_sys_prompt via str.format.
# L1: action-only responses.
SYSTEM_PROMPT_V2_L1 = """
{general_computer_instruction}

{format_instruction}

{action_instruction}

{code_instruction}
""".strip()

# L2: adds the thought section.
SYSTEM_PROMPT_V2_L2 = """
{general_computer_instruction}

{format_instruction}

{thought_instruction}

{action_instruction}

{code_instruction}
""".strip()

# L3: adds observation + thought sections.
SYSTEM_PROMPT_V2_L3 = """
{general_computer_instruction}

{format_instruction}

{observation_instruction}

{thought_instruction}

{action_instruction}

{code_instruction}
""".strip()
|
||||
|
||||
|
||||
def build_sys_prompt(level, password="password", use_random=False):
    """Build the V2 system prompt for a given reasoning level.

    Args:
        level: One of "l1" (action only), "l2" (thought + action) or
            "l3" (observation + thought + action).
        password: Sudo password substituted into the general instruction.
        use_random: If True, sample each prompt component at random from its
            paraphrase list (used to diversify training prompts); otherwise
            always use the canonical first variant.

    Returns:
        str: The fully formatted system prompt.

    Raises:
        ValueError: If ``level`` is not one of "l1", "l2", "l3".
    """
    def pick(variants):
        # Canonical first variant unless random sampling is requested.
        return random.choice(variants) if use_random else variants[0]

    # BUGFIX: the original random branch skipped .format(password=...),
    # leaving a literal "{password}" placeholder in the emitted prompt.
    general = pick(general_computer_instructions).format(password=password)
    action = pick(action_instructions)

    if level == "l1":
        return SYSTEM_PROMPT_V2_L1.format(
            general_computer_instruction=general,
            format_instruction=l1_format_instruction,
            action_instruction=action,
            code_instruction=code_instrucion,
        )
    if level == "l2":
        return SYSTEM_PROMPT_V2_L2.format(
            general_computer_instruction=general,
            format_instruction=l2_format_instruction,
            thought_instruction=pick(thought_instructions),
            action_instruction=action,
            code_instruction=code_instrucion,
        )
    if level == "l3":
        return SYSTEM_PROMPT_V2_L3.format(
            general_computer_instruction=general,
            format_instruction=l3_format_instruction,
            observation_instruction=pick(observation_instructions),
            thought_instruction=pick(thought_instructions),
            action_instruction=action,
            code_instruction=code_instrucion,
        )
    raise ValueError("Invalid level. Choose from 'l1', 'l2', or 'l3'.")
|
||||
|
||||
|
||||
# Modeling prompt templates for generating trajectories
# Python str.format templates used to serialize per-step history into the
# conversation. The *_WITH_CODE variants append the executed code section.
# NOTE(review): "INSTRUTION_TEMPLATE" is misspelled but is a public name —
# renaming would break importers.
STEP_TEMPLATE = "# Step {step_num}:\n"
INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"

ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
THOUGHT_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n"
OBSERVATION_HISTORY_TEMPLATE = "## Observation:\n{observation}\n\n## Thought:\n{thought}\n\n## Action:\n{action}\n"

ACTION_HISTORY_TEMPLATE_WITH_CODE = "## Action:\n{action}\n\n## Code:\n{code}\n"
THOUGHT_HISTORY_TEMPLATE_WITH_CODE = "## Thought:\n{thought}\n\n## Action:\n{action}\n\n## Code:\n{code}\n"
OBSERVATION_HISTORY_TEMPLATE_WITH_CODE = "## Observation:\n{observation}\n\n## Thought:\n{thought}\n\n## Action:\n{action}\n\n## Code:\n{code}\n"
|
||||
|
|
@ -1,483 +0,0 @@
|
|||
# stdlib
import base64
import json
import math
import os
import re
import tempfile
import time
from io import BytesIO
from typing import List, Optional

# third-party
import requests
from loguru import logger
from PIL import Image
|
||||
|
||||
def encode_image(image_content):
    """Return *image_content* (raw bytes) as a base64-encoded ASCII string."""
    encoded = base64.b64encode(image_content)
    return encoded.decode("utf-8")
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 56 * 56,
    max_pixels: int = 14 * 14 * 4 * 1280,
    max_aspect_ratio_allowed: Optional[float] = None,
    size_can_be_smaller_than_factor: bool = False,
):
    """Rescale (height, width) so that:

    1. both dimensions are divisible by ``factor``;
    2. the total pixel count lies within [``min_pixels``, ``max_pixels``];
    3. the aspect ratio is preserved as closely as possible.

    Returns the (height, width) pair after rescaling.

    Raises:
        ValueError: If either side is smaller than ``factor`` (unless
            ``size_can_be_smaller_than_factor``), or if the aspect ratio
            exceeds ``max_aspect_ratio_allowed`` (when given).
    """
    if not size_can_be_smaller_than_factor and min(height, width) < factor:
        raise ValueError(
            f"height:{height} or width:{width} must be larger than factor:{factor} "
            f"(when size_can_be_smaller_than_factor is False)"
        )
    elif (
        max_aspect_ratio_allowed is not None
        and max(height, width) / min(height, width) > max_aspect_ratio_allowed
    ):
        raise ValueError(
            f"absolute aspect ratio must be smaller than {max_aspect_ratio_allowed}, "
            f"got {max(height, width) / min(height, width)}"
            f"(when max_aspect_ratio_allowed is not None)"
        )

    # Snap each side to the nearest multiple of `factor` (at least one).
    new_h = factor * max(1, round(height / factor))
    new_w = factor * max(1, round(width / factor))

    area = new_h * new_w
    if area > max_pixels:
        # Too large: shrink both sides by a common ratio, rounding down so
        # the result never exceeds max_pixels.
        shrink = math.sqrt((height * width) / max_pixels)
        new_h = factor * max(1, math.floor(height / shrink / factor))
        new_w = factor * max(1, math.floor(width / shrink / factor))
    elif area < min_pixels:
        # Too small: grow both sides, rounding up so we reach min_pixels.
        grow = math.sqrt(min_pixels / (height * width))
        new_h = factor * math.ceil(height * grow / factor)
        new_w = factor * math.ceil(width * grow / factor)
    return new_h, new_w
|
||||
|
||||
def call_openai_naive(model, payload, address_hint=None):
    """
    Naive OpenAI API call using requests.

    NOTE(review): the ``model`` parameter is immediately shadowed below by
    ``payload["model"]`` — the argument is ignored. ``payload["model"]`` is
    expected to be a client object exposing ``model_id`` and ``base_url``;
    if it is missing, ``model.base_url`` raises AttributeError on None.

    Returns a (content, infos) tuple, where ``infos`` carries finish_reason,
    choice count, optional tool_calls/choices/usage from the response.

    Raises:
        RuntimeError: If no HTTP 200 response was ever obtained after all
            retries.
    """
    # Extract fields from payload
    model = payload.get("model")
    # Replace the client object with a serializable model id for the wire.
    payload["model"] = model.model_id if hasattr(model, "model_id") else "None"
    # address_hint not used here
    base_url = model.base_url
    # logger.warning(f"Base URL: {base_url}, Payload model: {payload['model']}")
    url = f"{base_url}/chat/completions"
    headers = {
        "Content-Type": "application/json",
    }
    # Force a single choice regardless of what the caller put in payload.
    data = {
        **payload,
        "n": 1,
    }
    max_retry = 5
    chat_completions = None
    success = False
    # Retry until a completion with finish_reason == "stop" arrives or the
    # retry budget is exhausted.
    while success is False and max_retry > 0:
        try:
            json_data = json.dumps(data)
            response = requests.post(
                url, headers=headers, data=json_data, timeout=120, verify=False
            )
            if response.status_code == 200:
                chat_completions = response.json()
                try:
                    finish_reason = chat_completions["choices"][0].get("finish_reason")
                    if (
                        finish_reason is not None and finish_reason == "stop"
                    ):  # for most of the time, length will not exceed max_tokens
                        success = True
                    else:
                        # NOTE(review): if this was the last retry, the loop
                        # exits with a truncated (non-"stop") completion that
                        # is still returned below — confirm this is intended.
                        time.sleep(5)
                        max_retry -= 1
                except Exception as e:
                    logger.error(f"Error in processing chat completion: {e}")
                    time.sleep(5)
                    max_retry -= 1
            else:
                logger.error(f"Failed to call OpenAI API: {response.text}")
                time.sleep(5)
                max_retry -= 1
        except requests.exceptions.ReadTimeout:
            # timeout is normal, don't print trace
            max_retry -= 1
            logger.warning(f"Timeout in OpenAI API call, left retries: {max_retry}")
            time.sleep(5)

        except Exception as e:
            max_retry -= 1
            logger.exception(f"Failed to call OpenAI API: {e}")
            time.sleep(5)

    if chat_completions is None:
        raise RuntimeError("Failed to call OpenAI API, max_retry used up")
    # Collect auxiliary response metadata; fall back to a sentinel tuple if
    # the response shape is unexpected.
    try:
        infos = {}
        if "choices" in chat_completions:
            infos["finish_reason"] = chat_completions["choices"][0].get("finish_reason")
            infos["n"] = len(chat_completions["choices"])
            if "tool_calls" in chat_completions["choices"][0]["message"]:
                infos["tool_calls"] = chat_completions["choices"][0]["message"][
                    "tool_calls"
                ]
            infos["choices"] = chat_completions["choices"]  # for the case of n > 1
        if "usage" in chat_completions:
            infos["usage"] = chat_completions["usage"]
        return chat_completions["choices"][0]["message"]["content"], infos
    except Exception as e:
        logger.error(f"Error in processing chat completion {e}")
        return "", {"n": 1, "usage": 0, "finish_reason": f"error {e}"}
|
||||
|
||||
|
||||
def preprocess_for_naive_openai(self, payload):
    """Swap a string "model" entry for the agent's OpenAI client object.

    Mutates and returns *payload*. Non-string "model" values are left
    untouched; if the agent has no ``openai_client`` attribute, None is used.
    """
    model = payload["model"]
    if isinstance(model, str):
        payload["model"] = getattr(self, "openai_client", None)
    return payload
|
||||
|
||||
def encoded_img_to_pil_img(data_str):
    """Decode a base64 PNG string (optionally data-URL prefixed) to a PIL Image."""
    payload = base64.b64decode(data_str.replace("data:image/png;base64,", ""))
    return Image.open(BytesIO(payload))
|
||||
|
||||
|
||||
def save_to_tmp_img_file(data_str):
    """Decode a base64 PNG string and write it to a fresh temporary file.

    Returns the path of the saved image ("tmp_img.png" inside a new
    directory created by tempfile.mkdtemp).
    """
    raw = base64.b64decode(data_str.replace("data:image/png;base64,", ""))
    destination = os.path.join(tempfile.mkdtemp(), "tmp_img.png")
    Image.open(BytesIO(raw)).save(destination)
    return destination
|
||||
|
||||
|
||||
def bbox_to_center_1000(bbox: str) -> tuple[int, int]:
    """Parse a bounding-box string (0-1000 integer coords) and return its center.

    Accepts several serialization variants, with or without the
    <|box_start|>/<|box_end|> special tokens, e.g.
    '<|box_start|>(576,12),(592,42)<|box_end|>' or '[[576, 12, 592, 42]]'.
    The first matching pattern wins; the center is the integer midpoint.
    """
    patterns = (
        r"<\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>",
        r"<\|box_start\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|box_end\|>",
        # Technically malformed variants that models emit anyway:
        r"<\|box_start\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]<\|box_end\|>",
        r"<\|box_start\|>\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)<\|box_end\|>",
        # Versions without the special tokens:
        r"\((\d+),(\d+)\),\((\d+),(\d+)\)",
        r"\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]",
        r"\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]",
        r"\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)",
    )
    match = next(
        (m for m in (re.search(p, bbox) for p in patterns) if m), None
    )
    if match is None:
        raise ValueError(
            f"Bounding box coordinates not found in the input string: {bbox}"
        )
    left, top, right, bottom = (int(g) for g in match.groups())
    return (left + right) // 2, (top + bottom) // 2
|
||||
|
||||
|
||||
def bbox_to_center_1(bbox: str) -> tuple[int, int]:
    """Parse a bbox of normalized floats '[x1, y1, x2, y2]' and return its
    center on a 0-1000 integer grid.

    Raises:
        ValueError: if the input contains no bracketed quad of floats.
    """
    quad = re.search(
        r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]",
        bbox,
    )
    if quad is None:
        raise ValueError(
            f"Bounding box coordinates not found in the input string: {bbox}"
        )
    # Scale each normalized coordinate onto the 0-1000 grid before averaging.
    x1, y1, x2, y2 = (int(float(g) * 1000) for g in quad.groups())
    return (x1 + x2) // 2, (y1 + y2) // 2
|
||||
|
||||
def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
|
||||
if coordinate_type == "relative":
|
||||
return int(round(x * screen_width)), int(round(y * screen_height))
|
||||
elif coordinate_type == "absolute":
|
||||
return x, y
|
||||
elif coordinate_type == "qwen25":
|
||||
height, width = smart_resize(
|
||||
height=screen_height,
|
||||
width=screen_width,
|
||||
factor=28,
|
||||
min_pixels=3136,
|
||||
max_pixels=12845056,
|
||||
)
|
||||
return int(x / width * screen_width), int(y / height * screen_height)
|
||||
elif coordinate_type == "relative1000":
|
||||
if screen_width == 0 or screen_height == 0:
|
||||
raise ValueError(
|
||||
"Screen width and height must be greater than zero for relative1000 coordinates."
|
||||
)
|
||||
x_abs = int(round(x * screen_width / 1000))
|
||||
y_abs = int(round(y * screen_height / 1000))
|
||||
return x_abs, y_abs
|
||||
else:
|
||||
raise ValueError(f"Unsupported coordinate type: {coordinate_type}")
|
||||
|
||||
|
||||
def rescale_coord(
    coord: tuple[int, int],
    original_width: int,
    original_height: int,
    scaled_width=1000,
    scaled_height=1000,
) -> tuple[int, int]:
    """Map a point from the model's scaled canvas back to the original image.

    According to https://huggingface.co/spaces/maxiw/OS-ATLAS/blob/398c3256a4fec409a074e0e4b5ac1d1d5bf7c240/app.py#L36
    OS-ATLAS models emit coordinates on a 1000x1000 canvas, so the point must
    be stretched back to the source resolution before use.
    """
    sx = original_width / scaled_width
    sy = original_height / scaled_height
    x, y = coord
    return int(x * sx), int(y * sy)
|
||||
|
||||
|
||||
def _pyautogui_code_to_absolute_coordinates(
    pyautogui_code_relative_coordinates,
    logical_screen_size,
    coordinate_type="relative",
    model_input_size=None,
):
    """
    Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.

    Every `pyautogui.<func>(...)` call found in the code string has its
    x/y (or xOffset/yOffset) arguments projected via `_coordinate_projection`
    and the call text is rebuilt in place. On any parse failure the input
    string is returned unmodified.

    Args:
        pyautogui_code_relative_coordinates: Python source containing pyautogui calls.
        logical_screen_size: (width, height) of the target screen in pixels.
        coordinate_type: one of 'relative', 'relative1000', 'absolute', 'qwen25'.
        model_input_size: optional (width, height) the model saw; when given,
            projected coordinates are additionally scaled by screen/model ratio.

    Returns:
        The code string with coordinate arguments rewritten to absolute pixels.
    """
    import re
    import ast

    if coordinate_type not in ["relative", "relative1000", "absolute", "qwen25"]:
        raise ValueError(
            f"Invalid coordinate type: {coordinate_type}. Expected one of ['relative', 'relative1000', 'absolute', 'qwen25']."
        )

    screen_width, screen_height = logical_screen_size
    if model_input_size is not None:
        # Extra scaling applied on top of the projection when the model's
        # input canvas differs from the logical screen.
        model_width, model_height = model_input_size
        width_scale, height_scale = (
            screen_width / model_width,
            screen_height / model_height,
        )
    else:
        width_scale, height_scale = 1, 1

    # Matches a whole pyautogui call with its (paren-free) argument list.
    pattern = r"(pyautogui\.\w+\([^\)]*\))"

    matches = re.findall(pattern, pyautogui_code_relative_coordinates)

    new_code = pyautogui_code_relative_coordinates

    for full_call in matches:
        func_name_pattern = r"(pyautogui\.\w+)\((.*)\)"
        func_match = re.match(func_name_pattern, full_call, re.DOTALL)
        if not func_match:
            continue

        func_name = func_match.group(1)
        args_str = func_match.group(2)

        # Parse the argument list as a fake call so positional and keyword
        # arguments can be recovered with the ast module.
        try:
            parsed = ast.parse(f"func({args_str})").body[0].value
            parsed_args = parsed.args
            parsed_keywords = parsed.keywords
        except SyntaxError:
            # Unparseable call: bail out and leave the whole code untouched.
            return pyautogui_code_relative_coordinates

        # Positional-parameter order for the pyautogui functions that take
        # coordinates; used to name positional arguments.
        function_parameters = {
            "click": ["x", "y", "clicks", "interval", "button", "duration", "pause"],
            "moveTo": ["x", "y", "duration", "tween", "pause"],
            "moveRel": ["xOffset", "yOffset", "duration", "tween", "pause"],
            "dragTo": ["x", "y", "duration", "button", "mouseDownUp", "pause"],
            "dragRel": [
                "xOffset",
                "yOffset",
                "duration",
                "button",
                "mouseDownUp",
                "pause",
            ],
            "doubleClick": ["x", "y", "interval", "button", "duration", "pause"],
        }

        func_base_name = func_name.split(".")[-1]

        param_names = function_parameters.get(func_base_name, [])

        # Map positional arguments onto their parameter names.
        args = {}
        for idx, arg in enumerate(parsed_args):
            if idx < len(param_names):
                param_name = param_names[idx]
                arg_value = ast.literal_eval(arg)
                args[param_name] = arg_value

        # Merge keyword arguments (they override positionals of the same name).
        try:
            for kw in parsed_keywords:
                param_name = kw.arg
                arg_value = ast.literal_eval(kw.value)
                args[param_name] = arg_value
        except Exception as e:
            logger.error(f"Error parsing keyword arguments: {e}")
            return pyautogui_code_relative_coordinates

        updated = False
        # Project absolute-position arguments (x, y).
        if "x" in args and "y" in args:
            try:
                x_rel = float(args["x"])
                y_rel = float(args["y"])
                x_abs, y_abs = _coordinate_projection(
                    x_rel, y_rel, screen_width, screen_height, coordinate_type
                )
                # logger.warning(f"Projecting coordinates: ({x_rel}, {y_rel}) to ({x_abs}, {y_abs}) using {coordinate_type} projection.")
                args["x"] = x_abs * width_scale
                args["y"] = y_abs * height_scale
                updated = True
            except ValueError:
                pass

        # Project relative-movement arguments (xOffset, yOffset).
        if "xOffset" in args and "yOffset" in args:
            try:
                x_rel = float(args["xOffset"])
                y_rel = float(args["yOffset"])
                x_abs, y_abs = _coordinate_projection(
                    x_rel, y_rel, screen_width, screen_height, coordinate_type
                )
                args["xOffset"] = x_abs * width_scale
                args["yOffset"] = y_abs * height_scale
                updated = True
            except ValueError:
                pass

        if updated:
            # Rebuild the call text: positionals in declared order until the
            # first gap, then any remaining keywords.
            reconstructed_args = []
            for idx, param_name in enumerate(param_names):
                if param_name in args:
                    arg_value = args[param_name]
                    if isinstance(arg_value, str):
                        arg_repr = f"'{arg_value}'"
                    else:
                        arg_repr = str(arg_value)
                    reconstructed_args.append(arg_repr)
                else:
                    break

            used_params = set(param_names[: len(reconstructed_args)])
            for kw in parsed_keywords:
                if kw.arg not in used_params:
                    arg_value = args[kw.arg]
                    if isinstance(arg_value, str):
                        arg_repr = f"{kw.arg}='{arg_value}'"
                    else:
                        arg_repr = f"{kw.arg}={arg_value}"
                    reconstructed_args.append(arg_repr)

            new_args_str = ", ".join(reconstructed_args)
            new_full_call = f"{func_name}({new_args_str})"
            # NOTE(review): str.replace rewrites every identical call in the
            # code, not just this occurrence — presumably acceptable since the
            # projection is deterministic per call text.
            new_code = new_code.replace(full_call, new_full_call)

    return new_code
|
||||
|
||||
|
||||
def split_args(args_str: str) -> List[str]:
    """Split a call's argument string on top-level commas.

    Commas inside (possibly escape-containing) string literals are preserved;
    the pieces are returned without any whitespace trimming.
    """
    pieces: List[str] = []
    buf = ""
    in_str = False
    quote = ""
    prev = ""
    for ch in args_str:
        if ch in ('"', "'"):
            if not in_str:
                in_str, quote = True, ch
            elif prev != "\\" and ch == quote:
                # Unescaped matching quote closes the literal.
                in_str = False
        if ch == "," and not in_str:
            pieces.append(buf)
            buf = ""
        else:
            buf += ch
        prev = ch
    if buf:
        pieces.append(buf)
    return pieces
|
||||
|
||||
|
||||
def correct_pyautogui_arguments(code: str) -> str:
    """Rewrite commonly mis-named keyword arguments in pyautogui calls.

    For example pyautogui.write(text=...) becomes pyautogui.write(message=...),
    while press/hotkey calls with key=/keys= keywords have the keyword dropped
    so the value is passed positionally. Lines that are not recognized
    pyautogui calls pass through (stripped but otherwise unchanged).
    """
    fixes = {
        "write": {
            "incorrect_args": ["text", "content"],
            "correct_args": [],
            "keyword_arg": "message",
        },
        "press": {
            "incorrect_args": ["key", "button"],
            "correct_args": [],
            "keyword_arg": None,
        },
        "hotkey": {
            "incorrect_args": ["key1", "key2", "keys"],
            "correct_args": [],
            "keyword_arg": None,
        },
    }

    out = []
    for raw_line in code.strip().split("\n"):
        line = raw_line.strip()
        call = re.match(r"(pyautogui\.(\w+))\((.*)\)", line)
        if not call:
            out.append(line)
            continue
        prefix, fn, arg_text = call.group(1), call.group(2), call.group(3)
        if fn not in fixes:
            out.append(line)
            continue
        info = fixes[fn]
        fixed = []
        for piece in split_args(arg_text):
            piece = piece.strip()
            kv = re.match(r"(\w+)\s*=\s*(.*)", piece)
            if not kv:
                fixed.append(piece)
                continue
            name, value = kv.group(1), kv.group(2)
            if name in info["incorrect_args"]:
                # Replace the bogus keyword, or demote it to positional.
                if info["keyword_arg"]:
                    fixed.append(f"{info['keyword_arg']}={value}")
                else:
                    fixed.append(value)
            else:
                fixed.append(f"{name}={value}")
        out.append(f"{prefix}({', '.join(fixed)})")
    return "\n".join(out)
|
||||
|
||||
def image_message_from_obs(obs, for_training=False):
    """Build the image part of a chat message from an observation dict.

    Inference mode embeds the raw screenshot bytes as a base64 data URI;
    training mode references the screenshot by its file path instead.
    """
    if for_training:
        return {"type": "image_url", "image_url": {"url": obs["screenshot_path"]}}
    encoded = encode_image(obs["screenshot"])
    return {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/png;base64,{encoded}",
            "detail": "high",
        },
    }
|
||||
|
|
@ -0,0 +1,736 @@
|
|||
"""
|
||||
OpenCUA Agent Implementation
|
||||
|
||||
This module implements an OpenCUA agent for desktop automation tasks, building upon
|
||||
existing frameworks and integrating multiple coordinate mapping systems.
|
||||
|
||||
Framework and Implementation Sources:
|
||||
- Main framework structure follows: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py
|
||||
- Agent implementation adapted from: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/aguvis_agent.py
|
||||
- Qwen2.5-VL coordinate mapping from: https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import ast
|
||||
import time
|
||||
import math
|
||||
import httpx
|
||||
import base64
|
||||
import backoff
|
||||
from loguru import logger
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
# System prompts used in the training data
|
||||
AGNET_SYS_PROMPT_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
|
||||
# AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- 
{\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
|
||||
AGNET_SYS_PROMPT_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action 
involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()
|
||||
|
||||
# Testing prompt on OSWorld-Verified
|
||||
AGNET_SYS_PROMPT_L2 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. The password of the computer is "osworld-public-evaluation". If the task is not possible to do, output the action computer.terminate(status='failure').
|
||||
|
||||
For each step, provide your response in this format:
|
||||
|
||||
Thought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning
|
||||
|
||||
Action:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize "—", maximize "□", close "X")\n - if the action involves keyboard actions like \'press\', \'write\', \'hotkey\':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions
|
||||
|
||||
Finally, output the action as PyAutoGUI code or the following functions:
|
||||
- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
|
||||
- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
|
||||
""".strip()
|
||||
|
||||
|
||||
# Per-step header inserted before each history entry in the prompt.
STEP_TEMPLATE = "# Step {step_num}:\n"
# Wraps the task instruction shown to the model (sic: "INSTRUTION" typo is
# kept in the identifier for compatibility with existing callers).
INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"

# History rendering templates at increasing verbosity levels.
ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
THOUGHT_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n"
OBSERVATION_HISTORY_TEMPLATE = "## Observation:\n{observation}\n\n## Thought:\n{thought}\n\n## Action:\n{action}\n"
DETAIL_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n\n## Code:\n{code}\n"
|
||||
|
||||
|
||||
def encode_image(image_content):
    """Return *image_content* (bytes) as a base64-encoded ASCII string."""
    return str(base64.b64encode(image_content), "utf-8")
|
||||
|
||||
def parse_response_to_cot_and_action(input_string, screen_size, coordinate_type) -> Tuple[str, List[str], dict]:
    """Parse response including Observation, Thought, Action and code block.

    Splits a model response into its '## Observation', '## Thought' and
    '## Action' sections, then extracts the final fenced code block and
    converts it to executable pyautogui code with absolute coordinates.

    Returns:
        (action_text, [code_or_control], sections) where the control value is
        one of 'DONE', 'FAIL', 'WAIT' for terminate/no-code responses, or
        (None, None, {}) on a parse error.
    """
    try:
        sections = {}

        # Each section runs until the next '##' heading or end of string.
        obs_match = re.search(r'^##\s*Observation\s*:?[\n\r]+(.*?)(?=^##\s*Thought:|^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
        if obs_match:
            sections['observation'] = obs_match.group(1).strip()

        thought_match = re.search(r'^##\s*Thought\s*:?[\n\r]+(.*?)(?=^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
        if thought_match:
            sections['thought'] = thought_match.group(1).strip()

        action_match = re.search(r'^##\s*Action\s*:?[\n\r]+(.*?)(?=^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
        if action_match:
            action = action_match.group(1).strip()
            sections['action'] = action.strip()

        if "computer.terminate" in input_string.lower():
            # Look for code blocks that might contain terminate command
            code_blocks = re.findall(r'```(?:code|python)?\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
            if code_blocks:
                last_code = code_blocks[-1].strip().lower()
                if "fail" in last_code:
                    sections['code'] = "FAIL"
                    return "FAIL", ["FAIL"], sections
                elif "success" in last_code:
                    sections['code'] = "DONE"
                    return "DONE", ["DONE"], sections
            # Default to DONE if terminate is mentioned but no specific status
            sections['code'] = "DONE"
            return "DONE", ["DONE"], sections

        code_blocks = re.findall(r'```(?:python)\s*(.*?)\s*```', input_string, re.DOTALL)
        if code_blocks:
            # Use the LAST python block; earlier ones are assumed superseded.
            code = code_blocks[-1].strip()
            sections['original_code'] = transform_agnet_action_to_code_block(code)
            corrected_code = correct_pyautogui_arguments(code)
            sections['code'] = corrected_code
            # Overwrite with the coordinate-projected version for execution.
            sections['code'] = project_coordinate_to_absolute_scale(corrected_code, screen_width=screen_size[0], screen_height=screen_size[1], coordinate_type=coordinate_type)
        else:
            # No code blocks found
            sections['code'] = "WAIT"
            return "WAIT", ["WAIT"], sections

        if 'code' not in sections:
            logger.error("Missing required action or code section")
            return None, None, {}

        if 'action' not in sections:
            sections['action'] = ""

        return sections['action'], [sections['code']], sections

    except Exception as e:
        logger.exception(f"Error parsing response: {str(e)}\nInput string: {input_string}")
        return None, None, {}
|
||||
|
||||
def correct_pyautogui_arguments(code: str) -> str:
    """Correct the pyautogui arguments.

    Rewrites commonly mis-named keyword arguments line by line:
    write(text=/content=...) -> write(message=...); press/hotkey keywords
    (key=, keys=, ...) are demoted to positional values. Lines that are not
    recognized single pyautogui calls pass through stripped but unchanged.
    """
    # Per-function correction table: keywords in 'incorrect_args' are either
    # renamed to 'keyword_arg' or, when it is None, made positional.
    function_corrections = {
        'write': {
            'incorrect_args': ['text', 'content'],
            'correct_args': [],
            'keyword_arg': 'message'
        },
        'press': {
            'incorrect_args': ['key', 'button'],
            'correct_args': [],
            'keyword_arg': None
        },
        'hotkey': {
            'incorrect_args': ['key1', 'key2', 'keys'],
            'correct_args': [],
            'keyword_arg': None
        },
    }

    lines = code.strip().split('\n')
    corrected_lines = []

    for line in lines:
        line = line.strip()
        match = re.match(r'(pyautogui\.(\w+))\((.*)\)', line)
        if match:
            full_func_call = match.group(1)
            func_name = match.group(2)
            args_str = match.group(3)

            if func_name in function_corrections:
                func_info = function_corrections[func_name]
                # split_args keeps commas inside string literals intact.
                args = split_args(args_str)
                corrected_args = []

                for arg in args:
                    arg = arg.strip()
                    kwarg_match = re.match(r'(\w+)\s*=\s*(.*)', arg)
                    if kwarg_match:
                        arg_name = kwarg_match.group(1)
                        arg_value = kwarg_match.group(2)

                        if arg_name in func_info['incorrect_args']:
                            # Rename the keyword, or drop it to positional.
                            if func_info['keyword_arg']:
                                corrected_args.append(f"{func_info['keyword_arg']}={arg_value}")
                            else:
                                corrected_args.append(arg_value)
                        else:
                            corrected_args.append(f'{arg_name}={arg_value}')
                    else:
                        corrected_args.append(arg)

                corrected_args_str = ', '.join(corrected_args)
                corrected_line = f'{full_func_call}({corrected_args_str})'
                corrected_lines.append(corrected_line)
            else:
                corrected_lines.append(line)
        else:
            corrected_lines.append(line)

    corrected_code = '\n'.join(corrected_lines)
    return corrected_code
|
||||
|
||||
def split_args(args_str: str) -> List[str]:
    """Split the arguments string into a list of arguments.

    Top-level commas delimit the pieces; commas inside quoted string
    literals (respecting backslash escapes) do not split.
    """
    result: List[str] = []
    current = ''
    inside_quotes = False
    open_quote = ''
    last_char = ''
    for ch in args_str:
        if ch in ('"', "'"):
            if not inside_quotes:
                inside_quotes = True
                open_quote = ch
            elif last_char != '\\' and ch == open_quote:
                # Unescaped matching quote terminates the literal.
                inside_quotes = False
        if ch == ',' and not inside_quotes:
            result.append(current)
            current = ''
        else:
            current += ch
        last_char = ch
    if current:
        result.append(current)
    return result
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_aspect_ratio_allowed: Optional[float] = None,
    size_can_be_smaller_than_factor: bool = False,
):
    """Compute the canvas size a Qwen2.5-VL style model resizes an image to.

    The function is modified from
    https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
    and is needed to resize screenshots for Qwen2.5-VL based models.

    The returned (height, width) satisfies:
      1. both dimensions are divisible by `factor`;
      2. the pixel count lies within [min_pixels, max_pixels];
      3. the aspect ratio stays as close to the input as possible.
    """
    if not size_can_be_smaller_than_factor and (height < factor or width < factor):
        raise ValueError(
            f"height:{height} or width:{width} must be larger than factor:{factor} "
            f"(when size_can_be_smaller_than_factor is False)"
        )
    elif max_aspect_ratio_allowed is not None and max(height, width) / min(height, width) > max_aspect_ratio_allowed:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {max_aspect_ratio_allowed}, "
            f"got {max(height, width) / min(height, width)}"
            f"(when max_aspect_ratio_allowed is not None)"
        )
    # Snap both sides to the nearest multiple of `factor` (at least one unit).
    h_bar = max(1, round(height / factor)) * factor
    w_bar = max(1, round(width / factor)) * factor
    if h_bar * w_bar > max_pixels:
        # Too many pixels: shrink both sides by a common ratio, rounding down.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = max(1, math.floor(height / beta / factor)) * factor
        w_bar = max(1, math.floor(width / beta / factor)) * factor
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: grow both sides by a common ratio, rounding up.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
|
||||
|
||||
def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
|
||||
"""Project the coordinates to the absolute scale"""
|
||||
if coordinate_type == "relative":
|
||||
return int(round(x * screen_width)), int(round(y * screen_height))
|
||||
elif coordinate_type == "absolute":
|
||||
return x, y
|
||||
elif coordinate_type == "qwen25":
|
||||
if 0 <= x <= 1 and 0 <= y <= 1:
|
||||
# If already normalized, treat like "relative"
|
||||
return int(round(x * screen_width)), int(round(y * screen_height))
|
||||
|
||||
height, width = smart_resize(
|
||||
height=screen_height,
|
||||
width=screen_width,
|
||||
factor=28,
|
||||
min_pixels=3136,
|
||||
max_pixels=12845056 # We use this max_pixels setting in our training data
|
||||
)
|
||||
return int(x / width * screen_width), int(y / height * screen_height)
|
||||
else:
|
||||
raise ValueError(f"Unsupported coordinate type: {coordinate_type}")
|
||||
|
||||
def project_coordinate_to_absolute_scale(pyautogui_code_relative_coordinates, screen_width, screen_height, coordinate_type="relative"):
    """Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.

    Every `pyautogui.<func>(...)` call found in the code string has its
    x/y (or xOffset/yOffset) arguments projected via `_coordinate_projection`
    and the call text rebuilt in place. On any parse failure the input string
    is returned unmodified.
    """
    if coordinate_type not in ["relative", "relative1000", "absolute", "qwen25"]:
        raise ValueError(f"Invalid coordinate type: {coordinate_type}. Expected one of ['relative', 'relative1000', 'absolute', 'qwen25'].")

    # Matches a whole pyautogui call with its (paren-free) argument list.
    pattern = r'(pyautogui\.\w+\([^\)]*\))'
    matches = re.findall(pattern, pyautogui_code_relative_coordinates)

    new_code = pyautogui_code_relative_coordinates

    for full_call in matches:
        func_name_pattern = r'(pyautogui\.\w+)\((.*)\)'
        func_match = re.match(func_name_pattern, full_call, re.DOTALL)
        if not func_match:
            continue

        func_name = func_match.group(1)
        args_str = func_match.group(2)

        # Parse the argument list as a fake call so positional and keyword
        # arguments can be recovered with the ast module.
        try:
            parsed = ast.parse(f"func({args_str})").body[0].value
            parsed_args = parsed.args
            parsed_keywords = parsed.keywords
        except SyntaxError:
            # Unparseable call: bail out and leave the whole code untouched.
            return pyautogui_code_relative_coordinates

        # Positional-parameter order for the pyautogui functions that take
        # coordinates; used to name positional arguments.
        function_parameters = {
            'click': ['x', 'y', 'clicks', 'interval', 'button', 'duration', 'pause'],
            'moveTo': ['x', 'y', 'duration', 'tween', 'pause'],
            'moveRel': ['xOffset', 'yOffset', 'duration', 'tween', 'pause'],
            'dragTo': ['x', 'y', 'duration', 'button', 'mouseDownUp', 'pause'],
            'dragRel': ['xOffset', 'yOffset', 'duration', 'button', 'mouseDownUp', 'pause'],
            'doubleClick': ['x', 'y', 'interval', 'button', 'duration', 'pause'],
        }

        func_base_name = func_name.split('.')[-1]

        param_names = function_parameters.get(func_base_name, [])

        # Map positional arguments onto their parameter names.
        args = {}
        for idx, arg in enumerate(parsed_args):
            if idx < len(param_names):
                param_name = param_names[idx]
                arg_value = ast.literal_eval(arg)
                args[param_name] = arg_value

        # Merge keyword arguments (they override positionals of the same name).
        try:
            for kw in parsed_keywords:
                param_name = kw.arg
                arg_value = ast.literal_eval(kw.value)
                args[param_name] = arg_value
        except Exception as e:
            logger.error(f"Error parsing keyword arguments: {e}")
            return pyautogui_code_relative_coordinates

        updated = False
        # Project absolute-position arguments (x, y).
        if 'x' in args and 'y' in args:
            try:
                x_rel = float(args['x'])
                y_rel = float(args['y'])
                x_abs, y_abs = _coordinate_projection(x_rel, y_rel, screen_width, screen_height, coordinate_type)
                logger.warning(f"Projecting coordinates: ({x_rel}, {y_rel}) to ({x_abs}, {y_abs}) using {coordinate_type} projection.")
                args['x'] = x_abs
                args['y'] = y_abs
                updated = True
            except ValueError:
                pass

        # Project relative-movement arguments (xOffset, yOffset).
        if 'xOffset' in args and 'yOffset' in args:
            try:
                x_rel = float(args['xOffset'])
                y_rel = float(args['yOffset'])
                x_abs, y_abs = _coordinate_projection(x_rel, y_rel, screen_width, screen_height, coordinate_type)
                args['xOffset'] = x_abs
                args['yOffset'] = y_abs
                updated = True
            except ValueError:
                pass

        if updated:
            # Rebuild the call text: positionals in declared order until the
            # first gap, then any remaining keywords.
            reconstructed_args = []
            for idx, param_name in enumerate(param_names):
                if param_name in args:
                    arg_value = args[param_name]
                    if isinstance(arg_value, str):
                        arg_repr = f"'{arg_value}'"
                    else:
                        arg_repr = str(arg_value)
                    reconstructed_args.append(arg_repr)
                else:
                    break

            used_params = set(param_names[:len(reconstructed_args)])
            for kw in parsed_keywords:
                if kw.arg not in used_params:
                    arg_value = args[kw.arg]
                    if isinstance(arg_value, str):
                        arg_repr = f"{kw.arg}='{arg_value}'"
                    else:
                        arg_repr = f"{kw.arg}={arg_value}"
                    reconstructed_args.append(arg_repr)

            new_args_str = ', '.join(reconstructed_args)
            new_full_call = f"{func_name}({new_args_str})"
            # NOTE(review): str.replace rewrites every identical occurrence of
            # this call text in the code, not just this one — presumably
            # acceptable since the projection is deterministic per call text.
            new_code = new_code.replace(full_call, new_full_call)

    return new_code
|
||||
|
||||
def extract_positions_and_instructions(code, action) -> list[dict]:
    """
    Extract all `(x, y)` coordinates (both positional and keyword arguments)
    from pyautogui calls in the code, pairing each with the closest preceding
    comment as its instruction. If no comment precedes a call, the low-level
    `action` string is used as the instruction instead.

    Args:
        code (str): The Python code as a string.
        action (str): The low-level action description, used as the fallback
            instruction when a call has no preceding comment.

    Returns:
        list[dict]: A list of dictionaries with extracted positions and instructions.
            - function (str): The pyautogui function name (e.g. "pyautogui.click").
            - x (int or float): The x-coordinate.
            - y (int or float): The y-coordinate.
            - instruction (str): The preceding comment, or `action` as fallback.
    """

    def _to_number(text: str):
        # Coordinates may appear as ints or floats; preserve the written type.
        return float(text) if '.' in text else int(text)

    extracted = []
    # Fallback instruction until a comment line is seen.
    # BUGFIX: previously this was reset to `action` on *every* iteration,
    # so a comment could never survive to annotate the following call line.
    preceding_comment = action

    for line in code.splitlines():
        stripped = line.strip()

        # Remember a comment line so it can annotate the next matched call.
        if stripped.startswith("#"):
            preceding_comment = stripped.lstrip("#").strip()
            continue

        # Match pyautogui functions with positional arguments: func(x, y, ...)
        match = re.match(r"(pyautogui\.\w+)\((\d+(\.\d+)?),\s*(\d+(\.\d+)?).*?\)", line)
        if not match:
            # Match pyautogui functions with keyword arguments: func(..., x=.., y=.., ...)
            match = re.match(r"(pyautogui\.\w+)\(.*?x=(\d+(\.\d+)?),\s*y=(\d+(\.\d+)?).*?\)", line)

        if match:
            extracted.append({
                "function": match.group(1),  # pyautogui function name
                "x": _to_number(match.group(2)),  # x-coordinate
                # BUGFIX: y is group(4); group(3) is x's optional fractional
                # part, which is None for integer coordinates (int(None) raised).
                "y": _to_number(match.group(4)),  # y-coordinate
                "instruction": preceding_comment,  # Comment or action fallback
            })
            preceding_comment = action  # Reset to fallback after use

    logger.info(f"Grounding extracted:\n{extracted}")
    return extracted
|
||||
|
||||
def update_code_with_new_coordinates(code, updated_positions):
    """
    Replace old `(x, y)` coordinates (both positional and keyword arguments)
    with updated ones in the code, handling multiple occurrences correctly.

    Updates are consumed in order: the first matching call line consumes the
    first entry of `updated_positions`, and so on.

    Args:
        code (str): The original Python code as a string.
        updated_positions (list): A list of dicts with keys "function", "x",
            and "y", in the order the calls appear in the code.

    Returns:
        str: The updated Python code.
    """

    lines = code.splitlines()
    updated_code_lines = []
    position_index = 0  # Tracks which position update to use

    for line in lines:
        if position_index < len(updated_positions):
            # Get the next update position
            update = updated_positions[position_index]
            # BUGFIX: escape the function name — it contains a "." which would
            # otherwise match any character in the regex.
            func = re.escape(update['function'])
            function_pattern_positional = rf"{func}\(\d+(\.\d+)?,\s*\d+(\.\d+)?"
            function_pattern_keyword = rf"{func}\(.*?x=\d+(\.\d+)?,\s*y=\d+(\.\d+)?"

            if re.search(function_pattern_positional, line):
                # Replace positional arguments
                line = re.sub(
                    function_pattern_positional,
                    f"{update['function']}({update['x']}, {update['y']}",
                    line,
                    count=1
                )
                position_index += 1  # Move to the next update
            elif re.search(function_pattern_keyword, line):
                # Replace keyword arguments
                line = re.sub(
                    function_pattern_keyword,
                    f"{update['function']}(x={update['x']}, y={update['y']}",
                    line,
                    count=1
                )
                position_index += 1  # Move to the next update

        updated_code_lines.append(line)

    return "\n".join(updated_code_lines)
|
||||
|
||||
def transform_agnet_action_to_code_block(action):
    """Wrap an agent action in a markdown fence: not used in agent, for logging only."""
    # Non-pyautogui control actions are fenced as generic "code"; everything
    # else is assumed to be executable Python.
    non_python_markers = ("computer.terminate", "browser.select_option", "browser.clear")
    fence = "code" if any(marker in action for marker in non_python_markers) else "python"
    return f"```{fence}\n{action}\n```"
|
||||
|
||||
class OpenCUAAgent:
    """
    OpenCUA Agent for desktop automation tasks.

    This class implements a OpenCUA Model based agent that can observe
    desktop environments through screenshots and execute mouse/keyboard actions
    via PyAutoGUI to complete automation tasks.

    Attributes:
        model (str): Name of the language model being used
        history_type (str): Type of history recording mechanism
        actions (list): History of executed actions (low-level instructions)
        observations (list): History of environment observations
        cots (list): Chain of thought reasoning records (one dict per step)
    """
    def __init__(
        self,
        model: str,                                   # OpenCUA model name
        history_type: str,                            # History step type: action_history, thought_history, observation_history
        max_image_history_length: int = 3,            # The max number of images in the history
        platform: str = "ubuntu",                     # The platform of the computer
        max_tokens: int = 1500,                       # The max number of tokens in the response
        top_p: float = 0.9,                           # The top p value in the response
        temperature: float = 0,                       # The temperature value in the response
        action_space: str = "pyautogui",              # The action space: pyautogui
        observation_type: str = "screenshot",         # The observation type: screenshot
        cot_level: str = "l2",                        # The CoT level: l1, l2, l3
        screen_size: Tuple[int, int] = (1920, 1080),  # The screen size
        coordinate_type: str = "relative",            # The coordinate type: relative, absolute, qwen25
        **kwargs
    ):
        # Fail fast on unsupported configurations.
        assert coordinate_type in ["relative", "absolute", "qwen25"]
        assert action_space in ["pyautogui"], "Invalid action space"
        assert observation_type in ["screenshot"], "Invalid observation type"
        assert history_type in ["action_history", "thought_history", "observation_history"]
        assert model is not None, "Model cannot be None"

        self.model = model
        self.platform = platform
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.temperature = temperature
        self.action_space = action_space
        self.observation_type = observation_type
        self.history_type = history_type
        self.coordinate_type = coordinate_type
        self.cot_level = cot_level
        self.screen_size = screen_size
        self.max_image_history_length = max_image_history_length

        # Select how past steps are rendered into the conversation history.
        if history_type == "action_history":
            self.HISTORY_TEMPLATE = ACTION_HISTORY_TEMPLATE
        elif history_type == "thought_history":
            self.HISTORY_TEMPLATE = THOUGHT_HISTORY_TEMPLATE
        elif history_type == "observation_history":
            self.HISTORY_TEMPLATE = OBSERVATION_HISTORY_TEMPLATE
        else:
            raise ValueError(f"Invalid history type: {history_type}")

        # Select the system prompt matching the requested chain-of-thought level.
        if cot_level == "l3":
            self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L3
        elif cot_level == "l2":
            self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L2
        elif cot_level == "l1":
            self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L1
        else:
            raise ValueError(f"Invalid COT level: {cot_level}")

        # Per-episode histories; reset() clears them between tasks.
        self.actions = []
        self.observations = []
        self.cots = []

    def reset(self, _logger=None):
        """Clear per-episode state and optionally install a task-specific logger.

        NOTE: rebinds the *module-level* `logger`, so every helper in this
        module logs to the logger supplied by the harness.
        """
        global logger
        logger = _logger if _logger is not None else logging.getLogger("desktopenv.agent")

        self.observations = []
        self.cots = []
        self.actions = []

    def _scale_scroll_for_windows(self, code: str, factor: int = 50) -> str:
        """ pyautogui.scroll has a different scale on Ubuntu and Windows, multiple 'factor' when scrolling on Windows system"""
        # Non-Windows platforms keep the code unchanged.
        if self.platform.lower() != "windows":
            return code

        # Rewrite pyautogui.scroll(<int>) calls, multiplying the amount by `factor`.
        pattern_pos = re.compile(r'(pyautogui\.scroll\()\s*([-+]?\d+)\s*\)')
        code = pattern_pos.sub(lambda m: f"{m.group(1)}{int(m.group(2))*factor})", code)
        return code

    def predict(self, instruction: str, obs: Dict, **kwargs) -> Tuple[str, List[str], Dict]:
        """
        Predict the next action(s) based on the current observation.

        Builds the chat messages (system prompt, step history — with screenshots
        only for the most recent `max_image_history_length` steps — and the
        current screenshot + instruction), calls the model, and parses the
        response into pyautogui actions.

        Args:
            instruction: Natural-language task instruction.
            obs: Current observation; must contain a 'screenshot' (raw bytes).

        Returns:
            Tuple of (raw model response, list of pyautogui action strings,
            empty info dict). On failure the action list is ["DONE"] (no
            response) or ["FAIL"] (unparseable response).
        """
        if "step_idx" in kwargs:
            logger.info(f"========= {self.model} Step {kwargs['step_idx']} =======")
        else:
            logger.info(f"========================== {self.model} ===================================")
        logger.info(f"Instruction: \n{instruction}")

        messages = []
        messages.append({
            "role": "system",
            "content": self.SYSTEM_PROMPT
        })

        history_step_texts = []
        for i in range(len(self.actions)):
            # Recent steps (last max_image_history_length) carry their screenshot;
            # older steps are collapsed into a single text-only assistant message.
            if i > len(self.actions) - self.max_image_history_length:
                messages.append({
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{encode_image(self.observations[i]['screenshot'])}"}
                        }
                    ]
                })

                history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
                    observation=self.cots[i].get('observation'),
                    thought=self.cots[i].get('thought'),
                    action=self.cots[i].get('action')
                )

                messages.append({
                    "role": "assistant",
                    "content": history_content
                })
            else:
                history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
                    observation=self.cots[i].get('observation'),
                    thought=self.cots[i].get('thought'),
                    action=self.cots[i].get('action')
                )
                history_step_texts.append(history_content)
                # Flush the accumulated old-step texts right before the first
                # image-bearing step.
                if i == len(self.actions) - self.max_image_history_length:
                    messages.append({
                        "role":"assistant",
                        "content": "\n".join(history_step_texts)
                    })

        # Current screenshot plus the instruction prompt.
        messages.append({
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encode_image(obs['screenshot'])}"}
                },
                {
                    "type": "text",
                    "text": INSTRUTION_TEMPLATE.format(instruction=instruction)
                }
            ]
        })

        response = self.call_llm({
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "top_p": self.top_p,
            "temperature": self.temperature
        }, self.model)

        logger.info(f"Model Output: \n{response}")
        if not response:
            logger.error("No response found in the response.")
            return "ERROR", ["DONE"], {}

        low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
        if not pyautogui_actions or len(pyautogui_actions) == 0:
            logger.error("No pyautogui actions found in the response.")
            return response, ["FAIL"], {}

        # Windows needs scroll amounts rescaled (see _scale_scroll_for_windows).
        pyautogui_actions = [
            self._scale_scroll_for_windows(code) for code in pyautogui_actions
        ]

        self.observations.append(obs)
        logger.info(f"Parsed Low-level Action: \n{low_level_instruction}")
        logger.info(f"Parsed pyautogui Action: \n{pyautogui_actions}")

        self.actions.append(low_level_instruction)
        # The CoT is still recorded even when incomplete, so history indexes
        # stay aligned with actions/observations; the error is only logged.
        if 'action' not in other_cot or not other_cot['action'] or 'thought' not in other_cot or not other_cot['thought']:
            logger.error("Error! no action/thought in cot")
            logger.error(f"response: {response}")
            logger.error(f"cot: {other_cot}")
        self.cots.append(other_cot)

        # (debug) A commented-out dump of the full message structure lived here;
        # re-add it when diagnosing prompt construction issues.
        logger.info(f"New step cot: {other_cot}")

        return response, pyautogui_actions, {}


    @backoff.on_exception(
        backoff.constant,
        # here you should add more model exceptions as you want,
        # but you are forbidden to add "Exception", that is, a common type of exception
        # because we want to catch this kind of Exception in the outside to ensure
        # each example won't exceed the time limit
        (
            Exception
        ),
        interval=30,
        max_tries=10
    )
    def call_llm(self, payload, model):
        """Call the LLM API.

        POSTs `payload` to the endpoint in env var OPENCUA_URL, authorized via
        OPENCUA_API_KEY. Retries up to 30 times (5s pause) on non-200 status or
        a non-"stop" finish_reason; the backoff decorator additionally retries
        the whole call on exceptions. NOTE: implicitly returns None if all 30
        in-loop attempts are exhausted.
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ['OPENCUA_API_KEY']}"
        }

        for _ in range(30):
            response = httpx.post(
                os.environ['OPENCUA_URL'],
                headers=headers,
                json=payload,
                timeout=500,
                verify=False
            )

            if response.status_code != 200:
                logger.error("Failed to call LLM: " + response.text)
                logger.error("Retrying...")
                time.sleep(5)
            else:
                response = response.json()
                finish_reason = response["choices"][0].get("finish_reason")
                if finish_reason is not None and finish_reason == "stop": # for most of the time, length will not exceed max_tokens
                    return response['choices'][0]['message']['content']
                else:
                    logger.error("LLM did not finish properly, retrying...")
                    time.sleep(5)
|
||||
|
|
@ -1,350 +0,0 @@
|
|||
import logging
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, parse_code_from_string
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
|
||||
logger = logging.getLogger("desktopenv.coder_agent")
|
||||
|
||||
|
||||
def extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract a fenced code block and its language from an action string.

    Returns a (code_type, code) pair: code_type is "python", "bash", or None
    for an unlabeled fence; code is None when no fence is present at all.
    """
    code_type = None
    code = None
    # Check the most specific fences first; a bare ``` fence is the fallback.
    for marker, kind in (("```python", "python"), ("```bash", "bash"), ("```", None)):
        if marker in action:
            code_type = kind
            code = action.split(marker, 1)[1].split("```", 1)[0].strip()
            break

    logger.debug(
        f"Extracted code block: type={code_type}, length={len(code) if code else 0}"
    )
    return code_type, code
|
||||
|
||||
|
||||
def execute_code(code_type: str, code: str, env_controller) -> Dict:
    """Execute code based on its type, returning the controller's result dict."""
    # Log the full code being executed (untruncated)
    logger.info(f"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\nCode:\n{code}")

    try:
        if code_type == "bash":
            return env_controller.run_bash_script(code, timeout=30)
        if code_type == "python":
            return env_controller.run_python_script(code)
        return {"status": "error", "error": f"Unknown code type: {code_type}"}
    except Exception as e:
        # Surface controller failures as an error-shaped result instead of raising.
        logger.error(f"Error executing {code_type} code: {e}")
        return {"status": "error", "error": str(e)}
|
||||
|
||||
|
||||
def format_result(result: Dict, step_count: int) -> str:
    """Format an execution result dict into a context string for the LLM."""
    if not result:
        logger.warning(f"Step {step_count + 1}: No result returned from execution")
        return f"""
Step {step_count + 1} Error:
Error: No result returned from execution
"""

    status = result.get("status", "unknown")
    return_code = result.get("returncode", result.get("return_code", -1))

    # Bash responses carry "returncode" (output holds stdout+stderr merged,
    # error is empty); Python responses keep stdout in "output" and stderr in
    # "error". Either way the same two keys are read.
    output = result.get("output", "")
    error = result.get("error", "")

    logger.debug(f"Step {step_count + 1}: Status={status}, Return Code={return_code}")

    # Assemble the report with better structure for multi-line outputs.
    sections = [
        f"Step {step_count + 1} Result:\n",
        f"Status: {status}\n",
        f"Return Code: {return_code}\n",
    ]
    if output:
        sections.append(f"Output:\n{output}\n")
    if error:
        sections.append(f"Error:\n{error}\n")

    return "".join(sections)
|
||||
|
||||
|
||||
class CoderAgent:
    """A dedicated agent for executing code with a budget of steps.

    Wraps an LMMAgent conversation: each step asks the model for code, runs it
    through the environment controller, feeds the result back, and stops on a
    DONE/FAIL/INFEASIBLE signal or when the step budget is exhausted.
    """

    def __init__(self, engine_params: Dict, client_password: str, platform: str = "linux"):
        """Initialize the CodeAgent.

        Args:
            engine_params: LLM engine configuration; may also carry "budget"
                (default 20) and "temperature" (default 0.1).
            client_password: Password forwarded into the procedural memory prompt.
            platform: Target OS name used to build the system prompt.

        Raises:
            ValueError: If engine_params is None or empty.
        """
        if not engine_params:
            raise ValueError("engine_params cannot be None or empty")

        self.engine_params = engine_params
        self.budget = engine_params.get("budget", 20)
        self.temperature = engine_params.get("temperature", 0.1)
        self.agent = None  # Created by reset() below
        self.platform = platform
        self.client_password = client_password

        logger.info(f"CodeAgent initialized with budget={self.budget} and platform={self.platform}")
        self.reset()

    def reset(self):
        """Reset the code agent state by rebuilding the underlying LMMAgent."""
        logger.debug("Resetting CodeAgent state")
        self.agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.construct_coder_procedural_memory(platform=self.platform, client_password=self.client_password)
        )

    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:
        """Execute code for the given task with a budget of steps.

        Args:
            task_instruction: Natural-language description of the task.
            screenshot: Initial screenshot given to the model as context.
            env_controller: Controller exposing run_bash_script / run_python_script.

        Returns:
            Dict with task_instruction, completion_reason, summary,
            execution_history, execution_result_history, steps_executed, budget.

        Raises:
            ValueError: If env_controller is None.
            RuntimeError: If the LLM returns an empty response.
        """
        if env_controller is None:
            raise ValueError("env_controller is required for code execution")

        print(f"\n🚀 STARTING CODE EXECUTION")
        print("=" * 60)
        print(f"Task: {task_instruction}")
        print(f"Budget: {self.budget} steps")
        print("=" * 60)

        logger.info(f"Starting code execution for task: {task_instruction}")
        logger.info(f"Budget: {self.budget} steps")

        # Fresh conversation for every execution.
        self.reset()


        # Add initial task instruction and screenshot context as user message
        context = (
            f"Task: {task_instruction}\n\nCurrent screenshot is provided for context."
        )
        self.agent.add_message(context, image_content=screenshot, role="user")

        step_count = 0
        execution_history = []
        execution_result_history = []
        while step_count < self.budget:
            logger.info(f"Step {step_count + 1}/{self.budget}")

            # Get assistant response (thoughts and code)
            response = call_llm_safe(self.agent, temperature=self.temperature)

            # Log the latest message from the coding agent (untruncated)
            logger.info(
                f"CODING_AGENT_LATEST_MESSAGE - Step {step_count + 1}:\n{response}"
            )

            # Check if response is None or empty
            if not response or response.strip() == "":
                error_msg = f"Step {step_count + 1}: LLM returned empty response"
                logger.error(error_msg)
                raise RuntimeError(error_msg)

            # Parse the response to extract action
            action = parse_code_from_string(response)
            thoughts = response

            execution_history.append(
                {"step": step_count + 1, "action": action, "thoughts": thoughts}
            )

            # Check for completion signals (plain DONE/FAIL/INFEASIBLE actions).
            action_upper = action.upper().strip()
            if action_upper == "DONE":
                print(f"\n✅ TASK COMPLETED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task completion")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task completed successfully")
                completion_reason = "DONE"
                break
            elif action_upper == "FAIL":
                print(f"\n❌ TASK FAILED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task failure")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task failed by agent request")
                completion_reason = "FAIL"
                break
            elif action_upper == 'INFEASIBLE':
                print(f"\n❌ TASK INFEASIBLE - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task infeasible")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task infeasible by agent request")
                completion_reason = "INFEASIBLE"
                break

            # Extract and execute code; only the text after the last "(Answer)"
            # marker is considered for the code fence.
            code_type, code = extract_code_block(response.split("(Answer)")[-1])

            if code:
                result = execute_code(code_type, code, env_controller)
                execution_result_history.append(
                    {"step": step_count + 1, "result": result}
                )
                # Prepare formatted output and error for logging
                output = result.get("output", "")
                error = result.get("error", "")
                message = result.get("message", "")
                status = result.get("status", "")

                # Print execution result to terminal for immediate visibility
                print(f"\n⚡ CODE EXECUTION RESULT - Step {step_count + 1}")
                print("-" * 50)
                print(f"Status: {status}")
                if output:
                    print(f"Output:\n{output}")
                if error:
                    print(f"Error:\n{error}")
                if message and not output and not error:
                    print(f"Message:\n{message}")
                print("-" * 50)

                log_lines = [
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:",
                    f"Status: {status}" if status else None,
                ]

                if output:
                    log_lines.append(
                        "Output:\n" + ("-" * 40) + f"\n{output}\n" + ("-" * 40)
                    )
                if error:
                    log_lines.append(
                        "Error:\n" + ("!" * 40) + f"\n{error}\n" + ("!" * 40)
                    )
                if message and not output and not error:
                    log_lines.append(
                        "Message:\n" + ("-" * 40) + f"\n{message}\n" + ("-" * 40)
                    )

                # Remove None entries and join
                formatted_log = "\n".join([line for line in log_lines if line])
                logger.info(formatted_log)
            else:
                print(f"\n⚠️ NO CODE BLOCK FOUND - Step {step_count + 1}")
                print("-" * 50)
                print("Action did not contain executable code")
                print("-" * 50)

                logger.warning(f"Step {step_count + 1}: No code block found in action")
                result = {"status": "skipped", "message": "No code block found"}
                logger.info(
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:\n"
                    f"Status: skipped\n"
                    f"Message:\n{'-' * 40}\n{result['message']}\n{'-' * 40}"
                )
            # Add assistant's thoughts and code to message history
            self.agent.add_message(response, role="assistant")

            # Process result and add formatted environment results as user message
            result_context = format_result(result, step_count)
            self.agent.add_message(result_context, role="user")

            step_count += 1

        # Handle budget exhaustion: completion_reason is only bound when the
        # loop broke on a completion signal, hence the locals() check.
        if "completion_reason" not in locals():
            print(f"\n⏰ BUDGET EXHAUSTED - {step_count} steps completed")
            print("=" * 60)
            print(f"Maximum budget of {self.budget} steps reached")
            print("=" * 60)
            logger.info(f"Budget exhausted after {step_count} steps")
            completion_reason = f"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS"

        # Generate final summary
        logger.info("Generating execution summary")
        summary = self._generate_summary(execution_history, task_instruction)

        result = {
            "task_instruction": task_instruction,
            "completion_reason": completion_reason,
            "summary": summary,
            "execution_history": execution_history,
            "execution_result_history": execution_result_history,
            "steps_executed": step_count,
            "budget": self.budget
        }

        logger.info(f"Code execution completed: steps={step_count}")
        return result

    def _generate_summary(
        self, execution_history: List[Dict], task_instruction: str
    ) -> str:
        """Generate a factual summary of the code execution session via the LLM.

        Returns a plain-text summary string; on LLM failure a
        "Summary generation failed..." string is returned instead of raising.
        """
        if not execution_history:
            logger.info("No execution history to summarize")
            return "No actions were executed."

        logger.info(f"Generated summary for {len(execution_history)} steps")

        # Build detailed execution context for summary agent
        execution_context = f"Task: {task_instruction}\n\nExecution Steps:\n"

        for step in execution_history:
            step_num = step["step"]
            thoughts = step.get("thoughts", "")
            action = step.get("action", "")

            execution_context += f"\nStep {step_num}:\n"
            if thoughts:
                execution_context += f"Thoughts: {thoughts}\n"
            execution_context += f"Code: {action}\n"

        # Create summary prompt with same context as coding agent
        summary_prompt = f"""
{execution_context}

Please provide a concise summary of the code execution session. Focus on:

1. The code logic implemented at each step
2. The outputs and results produced by each code execution
3. The progression of the solution approach

Do not make judgments about success or failure. Simply describe what was attempted and what resulted.

Keep the summary under 150 words and use clear, factual language.
"""

        # Generate summary using LLM with dedicated summary system prompt
        try:
            summary_agent = LMMAgent(
                engine_params=self.engine_params,
                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,
            )
            summary_agent.add_message(summary_prompt, role="user")
            summary = call_llm_safe(summary_agent, temperature=self.temperature)

            if not summary or summary.strip() == "":
                summary = "Summary generation failed - no response from LLM"
                logger.warning("Summary generation failed - empty response from LLM")

        except Exception as e:
            summary = f"Summary generation failed: {str(e)}"
            logger.error(f"Error generating summary: {e}")

        return summary
|
||||
|
|
@ -1,109 +0,0 @@
|
|||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, smart_resize
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
class GrounderAgent:
    """
    Class designed for interacting with GUI, serving for Grounding Agent and VLMSearcher.

    Given a referring expression and a screenshot, queries a grounding model
    and converts its predicted coordinates from the model's coordinate space
    into actual screen coordinates.
    """
    def __init__(self, engine_params: Dict, screen_width: int, screen_height: int):
        # engine_params must provide "model", "grounding_width",
        # "grounding_height", and "grounding_smart_resize".
        self.engine_params_for_grounder = engine_params # grounder_params
        system_prompt, self.user_message = PROCEDURAL_MEMORY.construct_grounder_procedural_memory(model_name=engine_params["model"])
        self.grounding_model = LMMAgent(engine_params, system_prompt=system_prompt)
        # Width and height for Grounding Agent!
        self.width = engine_params['grounding_width']
        self.height = engine_params['grounding_height']
        print(f"[Grounder]: initialized width is {self.width}, height is {self.height}")
        # Width and height for actual screen!
        self.screen_width = screen_width
        self.screen_height = screen_height

    # Given the state and worker's referring expression, use the grounding model to generate (x,y)
    def generate_coords(self, ref_expr: str, obs: Dict, detail=False, expansion_pixels=400, **kwargs) -> List:
        """Ground `ref_expr` on the screenshot in `obs` and return [x, y].

        When detail=True, returns [screenshot, offset_x, offset_y] instead.
        Raises AssertionError if fewer than two numbers can be parsed from the
        model response. NOTE(review): expansion_pixels is currently unused,
        and the global offsets are always 0 in this code path.
        """
        cur_screenshot = obs["screenshot"]

        # store global offset
        global_offset_x = 0
        global_offset_y = 0

        # final coordinates for output
        final_global_x = 0
        final_global_y = 0

        cur_width, cur_height = self.screen_width, self.screen_height

        print(f"[Grounder] start to ground!")
        self.grounding_model.reset()

        # Configure the context: substitute the referring expression into the
        # user prompt template.
        prompt = self.user_message.replace("REF_EXPR", ref_expr)

        # Consistent with the system prompt presented in the paper of GTA-1.
        if 'gta' in self.engine_params_for_grounder['model']:
            self.grounding_model.add_system_prompt("You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.")

        self.grounding_model.add_message(
            text_content=prompt, image_content=cur_screenshot, put_text_last=True, role="user"
        )

        # Generate and parse coordinates
        response = call_llm_safe(self.grounding_model, temperature=0.05, **kwargs)
        print(f"[Grounder] prompt: {prompt}\nmodel: {self.engine_params_for_grounder['model']}, \nresponse: {response}")


        # 1. highest priority: (x1="...", y1="...", x="...", y="...")
        numericals = re.findall(r'(?:x1|y1|x|y)=["\']?(\d+)["\']?', response)
        # 2. second highest priority: just like <points>653 42</points> or [653, 42]
        if len(numericals) < 2:
            # Strip axis labels (x1/y2/...) so their digits are not mistaken
            # for coordinates, then take any remaining integers.
            clean_response = re.sub(r'[xXyY]\d', '', response)
            numericals = re.findall(r'\d+', clean_response)
        assert len(numericals) >= 2

        print(f"[Grounder] the parsed coordinates: {numericals}")

        # Map from grounding-model coordinate space to the real screen.
        local_x, local_y = self._resize_coordinates([int(numericals[0]), int(numericals[1])], width=cur_width, height=cur_height)

        # current global coordinates = local coordinates + global offset
        final_global_x = local_x + global_offset_x
        final_global_y = local_y + global_offset_y

        if detail:
            return [cur_screenshot, global_offset_x, global_offset_y]
        else:
            return [final_global_x, final_global_y]

    def dynamic_set_width_height(self, width: int, height: int):
        """Override the grounding model's assumed input width/height."""
        self.width = width
        self.height = height

    # Resize from grounding model dim into OSWorld dim (1920 * 1080)
    def _resize_coordinates(self, coordinates: List[int], width:int, height:int) -> List[int]:
        """
        Scale model-space coordinates into observation-space pixels.

        width, height: for current observation
        grounding_width, grounding_height: width and height for Grounding model (1000x1000 or 1280x800)
        When grounding_smart_resize is set, the model space is the
        smart-resized observation dimensions instead of the fixed grid.
        """
        grounding_width = self.engine_params_for_grounder["grounding_width"]
        grounding_height = self.engine_params_for_grounder["grounding_height"]
        grounding_smart_resize = self.engine_params_for_grounder["grounding_smart_resize"]


        if not grounding_smart_resize:
            return [
                round(coordinates[0] * width / grounding_width),
                round(coordinates[1] * height / grounding_height),
            ]
        else:
            # NOTE: smart_resize takes (height, width) and returns
            # (smart_height, smart_width) in that order.
            smart_height, smart_width = smart_resize(height, width)
            return [
                round(coordinates[0] * width / smart_width),
                round(coordinates[1] * height / smart_height)
            ]
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue