Compare commits

...

27 Commits

Author SHA1 Message Date
XinyuanWangCS 5b44be1c55 add running command 2025-07-30 14:13:52 +00:00
XinyuanWangCS e993663b5b add system password to system prompt 2025-07-30 14:02:54 +00:00
XinyuanWangCS 84d98d2c9d Merge branch 'main' into wxy/opencua 2025-07-30 14:00:23 +00:00
XinyuanWangCS de3411e56c Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-25 09:11:01 +00:00
XinyuanWangCS 462e79c9d1 update detail 2025-07-25 09:10:42 +00:00
XinyuanWangCS c38264c971 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-24 08:29:57 +00:00
XinyuanWangCS abf267eb11 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-22 11:26:41 +00:00
XinyuanWangCS acf08a15d1 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-22 06:20:24 +00:00
XinyuanWangCS 953d5028ea ui-tars-0717 2025-07-19 18:22:34 +00:00
XinyuanWangCS 66def2c7a0 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-19 17:17:35 +00:00
XinyuanWangCS e5a2398549 update parallel; clean code; use sleep 3s 2025-07-18 14:33:49 +00:00
XinyuanWangCS 4db72ec960 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-18 14:31:18 +00:00
XinyuanWangCS a68c981777 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-18 13:40:22 +00:00
XinyuanWangCS e40671e53b Merge remote-tracking branch 'origin' into wxy/opencua 2025-07-18 11:52:51 +00:00
XinyuanWangCS aba043b9e8 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-18 08:35:59 +00:00
XinyuanWangCS 80aad6c2d5 Merge branch 'main' into wxy/opencua 2025-07-17 11:18:44 +00:00
XinyuanWangCS 923b612a6d Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-17 04:30:56 +00:00
XinyuanWangCS 6a5e119918 Merge remote-tracking branch 'origin' into wxy/opencua 2025-07-16 08:35:23 +00:00
XinyuanWangCS 710201d03a modify opencua agent; add comment lines 2025-07-16 08:33:59 +00:00
XinyuanWangCS 185fbe1398 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-15 15:36:20 +00:00
XinyuanWangCS e8508e8e3b debug agent history overlap 2025-07-15 14:50:37 +00:00
XinyuanWangCS 51af29354b merge 2025-07-15 04:15:18 +00:00
XinyuanWangCS 73dc19c1ce show result 2025-07-15 04:11:40 +00:00
XinyuanWangCS 462e6caea0 debug opencua 2025-07-15 04:10:53 +00:00
XinyuanWangCS 8c024e4910 debug, modify url input 2025-07-14 11:16:21 +00:00
XinyuanWangCS 30c8738be9 update url 2025-07-14 18:00:04 +08:00
XinyuanWangCS f5c4563c8e OpenCUA Agent code base 2025-07-13 01:18:45 +08:00
2 changed files with 55 additions and 14 deletions

View File

@@ -14,21 +14,33 @@ import re
import os
import ast
import time
import json
import math
import copy
import httpx
import base64
import backoff
from io import BytesIO
from loguru import logger
from PIL import Image
from typing import Dict, List, Tuple, Optional
# System prompts used in the training data
AGNET_SYS_PROMPT_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
# AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()
+# Testing prompt on OSWorld-Verified
+AGNET_SYS_PROMPT_L2 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. The password of the computer is "osworld-public-evaluation". If the task is not possible to do, output the action computer.terminate(status='failure').
+For each step, provide your response in this format:
+Thought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning
+Action:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize "—", maximize "□", close "X")\n - if the action involves keyboard actions like \'press\', \'write\', \'hotkey\':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions
+Finally, output the action as PyAutoGUI code or the following functions:
+- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
+- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
+""".strip()
STEP_TEMPLATE = "# Step {step_num}:\n"
INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"
@@ -638,12 +650,12 @@ class OpenCUAAgent:
logger.info(f"Model Output: \n{response}")
if not response:
logger.error("No response found in the response.")
return "ERROR", [], {}
return "ERROR", ["DONE"], {}
low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
-if not pyautogui_actions:
+if not pyautogui_actions or len(pyautogui_actions) == 0:
logger.error("No pyautogui actions found in the response.")
-return response, [], {}
+return response, ["FAIL"], {}
pyautogui_actions = [
self._scale_scroll_for_windows(code) for code in pyautogui_actions

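The `_scale_scroll_for_windows` helper applied in the list comprehension above is not shown in this diff. By way of background, `pyautogui.scroll` units are interpreted differently across platforms (Windows counts raw wheel units, 120 per notch), so generated scroll amounts may need rescaling there. A minimal sketch of that idea, assuming a simple multiplicative factor rather than the repository's actual logic:

```python
import platform
import re

def scale_scroll_for_windows(code: str, factor: int = 50) -> str:
    """Rescale pyautogui.scroll(...) amounts on Windows (illustrative sketch only)."""
    if platform.system() != "Windows":
        return code
    return re.sub(
        r"pyautogui\.scroll\((-?\d+)\)",
        lambda m: f"pyautogui.scroll({int(m.group(1)) * factor})",
        code,
    )

# Prints pyautogui.scroll(-150) when run on Windows; unchanged elsewhere.
print(scale_scroll_for_windows("pyautogui.scroll(-3)"))
```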
View File

@@ -1,3 +1,35 @@
"""
This is the script to run OpenCUA agents on OSWorld tasks using AWS provider.
You should first host the OpenCUA model on your local machine or a server.
Command for OpenCUA-7B and OpenCUA-32B:
```
python run_multienv_opencua.py \
--headless \
--observation_type screenshot \
--model OpenCUA-32B \
--result_dir ./results --test_all_meta_path evaluation_examples/test_all_no_gdrive.json \
--max_steps 100 \
--num_envs 30 \
--coordinate_type qwen25
```
Command for OpenCUA-Qwen2-7B and OpenCUA-A3B:
```
python run_multienv_opencua.py \
--headless \
--observation_type screenshot \
--model OpenCUA-A3B \
--result_dir ./results \
--test_all_meta_path evaluation_examples/test_nogdrive.json \
--max_steps 100 \
--num_envs 10 \
--coordinate_type relative
```
"""
from __future__ import annotations
import argparse
import datetime
@@ -7,9 +39,7 @@ import os
import sys
import signal
import time
-from typing import List, Dict
-import math
-from tqdm import tqdm
+from typing import List
from multiprocessing import Process, Manager
from multiprocessing import current_process
import lib_run_single
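The `Manager`/`Process` imports above underpin the `--num_envs` parallelism: tasks go into a shared queue and worker processes, each owning one environment, drain it. A minimal sketch of that fan-out pattern, with a placeholder `worker` standing in for the script's actual per-environment loop:

```python
from multiprocessing import Manager, Process
from queue import Empty

def worker(task_queue, results):
    # Each worker would own one desktop environment and drain the queue.
    while True:
        try:
            task_id = task_queue.get_nowait()
        except Empty:
            return
        results[task_id] = "done"  # placeholder for running the real task

if __name__ == "__main__":
    with Manager() as manager:
        task_queue = manager.Queue()
        results = manager.dict()
        for task_id in ("task-1", "task-2", "task-3"):
            task_queue.put(task_id)
        workers = [Process(target=worker, args=(task_queue, results)) for _ in range(2)]
        for p in workers:
            p.start()
        for p in workers:
            p.join()
        print(dict(results))
```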
@@ -26,7 +56,7 @@ if os.path.exists(".env"):
from dotenv import load_dotenv
load_dotenv()
-# Logger Configs {{{ #
+# Logger Configs
def config() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Run end-to-end evaluation on the benchmark"
@@ -58,7 +88,7 @@ def config() -> argparse.Namespace:
parser.add_argument("--model", type=str, default="opencua")
parser.add_argument("--temperature", type=float, default=0)
parser.add_argument("--top_p", type=float, default=0.9)
parser.add_argument("--max_tokens", type=int, default=8196)
parser.add_argument("--max_tokens", type=int, default=2048)
parser.add_argument("--stop_token", type=str, default=None)
# OpenCUAagent config
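The sampling arguments above (`--temperature`, `--top_p`, `--max_tokens`) are forwarded to the hosted model at request time. Assuming the model is served behind an OpenAI-compatible chat endpoint (the agent file imports `httpx`; the URL, port, and model name below are placeholders, not values confirmed by this diff), a request might look roughly like this:

```python
import httpx

# Hypothetical call to a locally hosted OpenCUA model behind an
# OpenAI-compatible /v1/chat/completions endpoint (assumed, not shown in the diff).
payload = {
    "model": "OpenCUA-32B",  # must match the served model name
    "messages": [{"role": "user", "content": "hello"}],
    "temperature": 0,        # matches the script's default
    "top_p": 0.9,            # matches the script's default
    "max_tokens": 2048,      # matches the new default above
}
resp = httpx.post("http://localhost:8000/v1/chat/completions",
                  json=payload, timeout=60.0)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```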
@@ -129,7 +159,6 @@ stdout_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
-# }}} Logger Configs #
logger = logging.getLogger("desktopenv.experiment")
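For context on the handler setup above: `logging.Filter("desktopenv")` lets a handler pass only records from the `desktopenv` logger and its children. A standalone sketch of that filter-based routing, independent of the script's full configuration:

```python
import logging
import sys

root = logging.getLogger()
root.setLevel(logging.DEBUG)

stdout_handler = logging.StreamHandler(sys.stdout)
# logging.Filter("desktopenv") admits records named "desktopenv" or
# "desktopenv.<child>" and drops everything else.
stdout_handler.addFilter(logging.Filter("desktopenv"))
root.addHandler(stdout_handler)

logging.getLogger("desktopenv.experiment").info("shown on stdout")
logging.getLogger("urllib3").info("filtered out")
```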