Compare commits

...

27 Commits

Author SHA1 Message Date
XinyuanWangCS 5b44be1c55 add running command 2025-07-30 14:13:52 +00:00
XinyuanWangCS e993663b5b add system password to system prompt 2025-07-30 14:02:54 +00:00
XinyuanWangCS 84d98d2c9d Merge branch 'main' into wxy/opencua 2025-07-30 14:00:23 +00:00
XinyuanWangCS de3411e56c Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-25 09:11:01 +00:00
XinyuanWangCS 462e79c9d1 update detail 2025-07-25 09:10:42 +00:00
XinyuanWangCS c38264c971 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-24 08:29:57 +00:00
XinyuanWangCS abf267eb11 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-22 11:26:41 +00:00
XinyuanWangCS acf08a15d1 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-22 06:20:24 +00:00
XinyuanWangCS 953d5028ea ui-tars-0717 2025-07-19 18:22:34 +00:00
XinyuanWangCS 66def2c7a0 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-19 17:17:35 +00:00
XinyuanWangCS e5a2398549 update parallel; clean code; use sleep 3s 2025-07-18 14:33:49 +00:00
XinyuanWangCS 4db72ec960 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-18 14:31:18 +00:00
XinyuanWangCS a68c981777 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-18 13:40:22 +00:00
XinyuanWangCS e40671e53b Merge remote-tracking branch 'origin' into wxy/opencua 2025-07-18 11:52:51 +00:00
XinyuanWangCS aba043b9e8 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-18 08:35:59 +00:00
XinyuanWangCS 80aad6c2d5 Merge branch 'main' into wxy/opencua 2025-07-17 11:18:44 +00:00
XinyuanWangCS 923b612a6d Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-17 04:30:56 +00:00
XinyuanWangCS 6a5e119918 Merge remote-tracking branch 'origin' into wxy/opencua 2025-07-16 08:35:23 +00:00
XinyuanWangCS 710201d03a modify opencua agent; add comment lines 2025-07-16 08:33:59 +00:00
XinyuanWangCS 185fbe1398 Merge remote-tracking branch 'origin/main' into wxy/opencua 2025-07-15 15:36:20 +00:00
XinyuanWangCS e8508e8e3b debug agent history overlap 2025-07-15 14:50:37 +00:00
XinyuanWangCS 51af29354b merge 2025-07-15 04:15:18 +00:00
XinyuanWangCS 73dc19c1ce show result 2025-07-15 04:11:40 +00:00
XinyuanWangCS 462e6caea0 debug opencua 2025-07-15 04:10:53 +00:00
XinyuanWangCS 8c024e4910 debug, modify url input 2025-07-14 11:16:21 +00:00
XinyuanWangCS 30c8738be9 update url 2025-07-14 18:00:04 +08:00
XinyuanWangCS f5c4563c8e OpenCUA Agent code base 2025-07-13 01:18:45 +08:00
2 changed files with 55 additions and 14 deletions

View File

@@ -14,21 +14,33 @@ import re
import os
import ast
import time
import json
import math
import copy
import httpx
import base64
import backoff
from io import BytesIO
from loguru import logger
from PIL import Image
from typing import Dict, List, Tuple, Optional
# System prompts used in the training data
AGNET_SYS_PROMPT_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
# AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"—\", maximize \"□\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()
+# Testing prompt on OSWorld-Verified
+AGNET_SYS_PROMPT_L2 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. The password of the computer is "osworld-public-evaluation". If the task is not possible to do, output the action computer.terminate(status='failure').
+For each step, provide your response in this format:
+Thought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning
+Action:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize "—", maximize "□", close "X")\n - if the action involves keyboard actions like \'press\', \'write\', \'hotkey\':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions
+Finally, output the action as PyAutoGUI code or the following functions:
+- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
+- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
+""".strip()
STEP_TEMPLATE = "# Step {step_num}:\n"
INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"
@@ -638,12 +650,12 @@ class OpenCUAAgent:
logger.info(f"Model Output: \n{response}")
if not response:
logger.error("No response found in the response.")
return "ERROR", [], {}
return "ERROR", ["DONE"], {}
low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
-if not pyautogui_actions:
+if not pyautogui_actions or len(pyautogui_actions) == 0:
logger.error("No pyautogui actions found in the response.")
-return response, [], {}
+return response, ["FAIL"], {}
pyautogui_actions = [
self._scale_scroll_for_windows(code) for code in pyautogui_actions

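The `_scale_scroll_for_windows` helper applied in the list comprehension above is not shown in this diff. By way of background, `pyautogui.scroll` units are interpreted differently across platforms (Windows counts raw wheel units, 120 per notch), so generated scroll amounts may need rescaling there. A minimal sketch of that idea, assuming a simple multiplicative factor rather than the repository's actual logic:

```python
import platform
import re

def scale_scroll_for_windows(code: str, factor: int = 50) -> str:
    """Rescale pyautogui.scroll(...) amounts on Windows (illustrative sketch only)."""
    if platform.system() != "Windows":
        return code
    return re.sub(
        r"pyautogui\.scroll\((-?\d+)\)",
        lambda m: f"pyautogui.scroll({int(m.group(1)) * factor})",
        code,
    )

# Prints pyautogui.scroll(-150) when run on Windows; unchanged elsewhere.
print(scale_scroll_for_windows("pyautogui.scroll(-3)"))
```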
View File

@@ -1,3 +1,35 @@
"""
This is the script to run OpenCUA agents on OSWorld tasks using AWS provider.
You should first host the OpenCUA model on your local machine or a server.
Command for OpenCUA-7B and OpenCUA-32B:
```
python run_multienv_opencua.py \
--headless \
--observation_type screenshot \
--model OpenCUA-32B \
--result_dir ./results --test_all_meta_path evaluation_examples/test_all_no_gdrive.json \
--max_steps 100 \
--num_envs 30 \
--coordinate_type qwen25
```
Command for OpenCUA-Qwen2-7B and OpenCUA-A3B:
```
python run_multienv_opencua.py \
--headless \
--observation_type screenshot \
--model OpenCUA-A3B \
--result_dir ./results \
--test_all_meta_path evaluation_examples/test_nogdrive.json \
--max_steps 100 \
--num_envs 10 \
--coordinate_type relative
```
"""
from __future__ import annotations
import argparse
import datetime
@@ -7,9 +39,7 @@ import os
import sys
import signal
import time
-from typing import List, Dict
-import math
-from tqdm import tqdm
+from typing import List
from multiprocessing import Process, Manager
from multiprocessing import current_process
import lib_run_single
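The `Manager`/`Process` imports above underpin the `--num_envs` parallelism: tasks go into a shared queue and worker processes, each owning one environment, drain it. A minimal sketch of that fan-out pattern, with a placeholder `worker` standing in for the script's actual per-environment loop:

```python
from multiprocessing import Manager, Process
from queue import Empty

def worker(task_queue, results):
    # Each worker would own one desktop environment and drain the queue.
    while True:
        try:
            task_id = task_queue.get_nowait()
        except Empty:
            return
        results[task_id] = "done"  # placeholder for running the real task

if __name__ == "__main__":
    with Manager() as manager:
        task_queue = manager.Queue()
        results = manager.dict()
        for task_id in ("task-1", "task-2", "task-3"):
            task_queue.put(task_id)
        workers = [Process(target=worker, args=(task_queue, results)) for _ in range(2)]
        for p in workers:
            p.start()
        for p in workers:
            p.join()
        print(dict(results))
```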
@@ -26,7 +56,7 @@ if os.path.exists(".env"):
from dotenv import load_dotenv
load_dotenv()
-# Logger Configs {{{ #
+# Logger Configs
def config() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Run end-to-end evaluation on the benchmark"
@@ -58,7 +88,7 @@ def config() -> argparse.Namespace:
parser.add_argument("--model", type=str, default="opencua")
parser.add_argument("--temperature", type=float, default=0)
parser.add_argument("--top_p", type=float, default=0.9)
parser.add_argument("--max_tokens", type=int, default=8196)
parser.add_argument("--max_tokens", type=int, default=2048)
parser.add_argument("--stop_token", type=str, default=None)
# OpenCUAagent config
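The sampling arguments above (`--temperature`, `--top_p`, `--max_tokens`) are forwarded to the hosted model at request time. Assuming the model is served behind an OpenAI-compatible chat endpoint (the agent file imports `httpx`; the URL, port, and model name below are placeholders, not values confirmed by this diff), a request might look roughly like this:

```python
import httpx

# Hypothetical call to a locally hosted OpenCUA model behind an
# OpenAI-compatible /v1/chat/completions endpoint (assumed, not shown in the diff).
payload = {
    "model": "OpenCUA-32B",  # must match the served model name
    "messages": [{"role": "user", "content": "hello"}],
    "temperature": 0,        # matches the script's default
    "top_p": 0.9,            # matches the script's default
    "max_tokens": 2048,      # matches the new default above
}
resp = httpx.post("http://localhost:8000/v1/chat/completions",
                  json=payload, timeout=60.0)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```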
@@ -129,7 +159,6 @@ stdout_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
-# }}} Logger Configs #
logger = logging.getLogger("desktopenv.experiment")
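For context on the handler setup above: `logging.Filter("desktopenv")` lets a handler pass only records from the `desktopenv` logger and its children. A standalone sketch of that filter-based routing, independent of the script's full configuration:

```python
import logging
import sys

root = logging.getLogger()
root.setLevel(logging.DEBUG)

stdout_handler = logging.StreamHandler(sys.stdout)
# logging.Filter("desktopenv") admits records named "desktopenv" or
# "desktopenv.<child>" and drops everything else.
stdout_handler.addFilter(logging.Filter("desktopenv"))
root.addHandler(stdout_handler)

logging.getLogger("desktopenv.experiment").info("shown on stdout")
logging.getLogger("urllib3").info("filtered out")
```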