Compare commits

...

22 Commits

Author SHA1 Message Date
Zilong Zhou 6275277abe Delete mm_agents/anthropic/main.py 2025-07-16 17:48:19 +08:00
adlsdztony 9b2ecdf5c9 Merge branch 'feat/monitor' of https://github.com/xlang-ai/OSWorld into feat/monitor 2025-07-16 09:44:52 +00:00
adlsdztony 01008b54ce feat&fix: update configuration management to save model arguments and enhance UI display for model args 2025-07-16 09:44:49 +00:00
Zilong Zhou 8740543c91 Merge branch 'main' into feat/monitor 2025-07-14 13:33:43 +08:00
adlsdztony db2ef13ab1 Merge branch 'feat/claude-cua-support' into feat/monitor 2025-07-14 05:30:45 +00:00
adlsdztony 957d73f4ea feat&fix: add accuracy percentage display to score and style updates for UI 2025-07-14 05:29:55 +00:00
adlsdztony c5dfd3bc29 feat&fix: add configuration toggle button in UI and improve task loading performance 2025-07-14 04:53:34 +00:00
adlsdztony 7d17aa57f6 feat&fix: update environment configuration, enhance task statistics, and improve UI responsiveness 2025-07-14 04:28:01 +00:00
adlsdztony 4a0d605a83 feat&fix: add configuration management API endpoints and update UI for configuration selection 2025-07-14 02:36:09 +00:00
adlsdztony 033326ea6e fix: update logger usage to use global logger and improve error handling 2025-07-14 01:47:56 +00:00
Zilong Zhou f7c33ed33b Delete test_env/utils.py 2025-07-13 15:31:55 +08:00
Zilong Zhou 0c426390e8 Delete test_env/logger.py 2025-07-13 15:30:50 +08:00
adlsdztony 996fc9d54e feat: add notice about image size limitations for Anthropic API 2025-07-13 06:08:01 +00:00
adlsdztony 1493246fa6 feat: add setup instructions for Anthropic API integration 2025-07-13 06:04:06 +00:00
adlsdztony 7d4bcbd9d7 feat&fix: implement action parsing for tool calls and add screen size handling 2025-07-13 06:00:32 +00:00
adlsdztony 14fe4d9476 fix: update text formatting in action parsing and replace logger import 2025-07-13 04:51:53 +00:00
adlsdztony e75ac625fc feat&fix: implement action parsing for tool calls and update default action space 2025-07-13 04:33:31 +00:00
adlsdztony 03385db30e chore: remove run_test_env.py script 2025-07-12 12:35:18 +00:00
adlsdztony 88b080388d feat&fix: add tool result handling and update model default in evaluation script 2025-07-12 12:22:41 +00:00
adlsdztony 6549648c0d feat: add script for end-to-end evaluation with logging and task distribution 2025-07-12 17:08:46 +08:00
adlsdztony 15674876a0 Merge branch 'main' into feat/claude-cua-support 2025-07-12 17:05:49 +08:00
adlsdztony 3455c8fb73 feat: add claude support 2025-05-31 20:34:35 +08:00
12 changed files with 779 additions and 480 deletions

View File

@@ -1,442 +0,0 @@
import base64
import io
import logging
import os
import time
from typing import Any, cast, Optional, Dict

from PIL import Image
from anthropic import (
    Anthropic,
    AnthropicBedrock,
    AnthropicVertex,
    APIError,
    APIResponseValidationError,
    APIStatusError,
)
from anthropic.types.beta import (
    BetaMessageParam,
    BetaTextBlockParam,
)

from .utils import (
    COMPUTER_USE_BETA_FLAG,
    PROMPT_CACHING_BETA_FLAG,
    SYSTEM_PROMPT,
    SYSTEM_PROMPT_WINDOWS,
    APIProvider,
    PROVIDER_TO_DEFAULT_MODEL_NAME,
)
from .utils import _response_to_params, _inject_prompt_caching, _maybe_filter_to_n_most_recent_images

logger = logging.getLogger("desktopenv.agent")
class AnthropicAgent:
    def __init__(
        self,
        platform: str = "Ubuntu",
        model: str = "claude-3-5-sonnet-20241022",
        provider: APIProvider = APIProvider.BEDROCK,
        max_tokens: int = 4096,
        api_key: str = os.environ.get("ANTHROPIC_API_KEY", None),
        system_prompt_suffix: str = "",
        only_n_most_recent_images: Optional[int] = 10,
        action_space: str = "claude_computer_use",
        screen_size: tuple[int, int] = (1920, 1080),
        *args, **kwargs
    ):
        self.platform = platform
        self.action_space = action_space
        self.logger = logger
        self.class_name = self.__class__.__name__
        self.model_name = model
        self.provider = provider
        self.max_tokens = max_tokens
        self.api_key = api_key
        self.system_prompt_suffix = system_prompt_suffix
        self.only_n_most_recent_images = only_n_most_recent_images
        self.messages: list[BetaMessageParam] = []
        self.screen_size = screen_size
        # Scale factors from the 1280x720 space the model sees back to the real screen
        self.resize_factor = (
            screen_size[0] / 1280,  # 1280 is the base width
            screen_size[1] / 720,   # 720 is the base height
        )
    def add_tool_result(self, tool_call_id: str, result: str, screenshot: bytes = None):
        """Add a tool result (and optional screenshot) to the message history."""
        tool_result_content = [
            {
                "type": "tool_result",
                "tool_use_id": tool_call_id,
                "content": [{"type": "text", "text": result}],
            }
        ]
        # Attach the screenshot as a base64-encoded image block if provided
        if screenshot is not None:
            screenshot_base64 = base64.b64encode(screenshot).decode('utf-8')
            tool_result_content[0]["content"].append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": screenshot_base64,
                },
            })
        self.messages.append({
            "role": "user",
            "content": tool_result_content,
        })
    def parse_actions_from_tool_call(self, tool_call: Dict) -> str:
        result = ""
        function_args = tool_call["input"]
        action = function_args.get("action")
        if not action:
            # tool_call is a plain dict here, so fall back to its "name" field
            action = tool_call.get("name")
        action_conversion = {
            "left click": "click",
            "right click": "right_click",
        }
        action = action_conversion.get(action, action)
        text = function_args.get("text")
        coordinate = function_args.get("coordinate")
        scroll_direction = function_args.get("scroll_direction")
        scroll_amount = function_args.get("scroll_amount")
        duration = function_args.get("duration")
        # Rescale coordinates from the 1280x720 model space to the real screen
        if coordinate and self.resize_factor:
            coordinate = (
                int(coordinate[0] * self.resize_factor[0]),
                int(coordinate[1] * self.resize_factor[1]),
            )
        # Handle mouse move and drag actions
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ValueError(f"coordinate is required for {action}")
            if text is not None:
                raise ValueError(f"text is not accepted for {action}")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ValueError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) for i in coordinate):
                raise ValueError(f"{coordinate} must be a tuple of ints")
            x, y = coordinate[0], coordinate[1]
            if action == "mouse_move":
                result += f"pyautogui.moveTo({x}, {y}, duration={duration or 0.5})\n"
                expected_outcome = f"Mouse moved to ({x},{y})."
            elif action == "left_click_drag":
                result += f"pyautogui.dragTo({x}, {y}, duration={duration or 0.5})\n"
                expected_outcome = f"Cursor dragged to ({x},{y})."
        # Handle keyboard actions
        elif action in ("key", "type"):
            if text is None:
                raise ValueError(f"text is required for {action}")
            if coordinate is not None:
                raise ValueError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                raise ValueError(f"{text} must be a string")
            if action == "key":
                key_conversion = {
                    "page_down": "pagedown",
                    "page_up": "pageup",
                    "super_l": "win",
                    "super": "command",
                    "escape": "esc",
                }
                # Press every key in the combo down, then release in reverse order
                keys = text.split('+')
                for key in keys:
                    key = key.strip().lower()
                    key = key_conversion.get(key, key)
                    result += f"pyautogui.keyDown('{key}')\n"
                for key in reversed(keys):
                    key = key.strip().lower()
                    key = key_conversion.get(key, key)
                    result += f"pyautogui.keyUp('{key}')\n"
                expected_outcome = f"Key {text} pressed."
            elif action == "type":
                result += f"pyautogui.typewrite(\"\"\"{text}\"\"\", interval=0.01)\n"
                expected_outcome = f"Text {text} written."
        # Handle scroll actions
        elif action == "scroll":
            if coordinate is None:
                if scroll_direction in ("up", "down"):
                    result += f"pyautogui.scroll({scroll_amount if scroll_direction == 'up' else -scroll_amount})\n"
                elif scroll_direction in ("left", "right"):
                    result += f"pyautogui.hscroll({scroll_amount if scroll_direction == 'right' else -scroll_amount})\n"
            else:
                x, y = coordinate[0], coordinate[1]
                if scroll_direction in ("up", "down"):
                    result += f"pyautogui.scroll({scroll_amount if scroll_direction == 'up' else -scroll_amount}, {x}, {y})\n"
                elif scroll_direction in ("left", "right"):
                    result += f"pyautogui.hscroll({scroll_amount if scroll_direction == 'right' else -scroll_amount}, {x}, {y})\n"
            expected_outcome = "Scroll action finished"
        # Handle click actions
        elif action in ("left_click", "right_click", "double_click", "middle_click", "left_press"):
            if coordinate is not None:
                x, y = coordinate
                if action == "left_click":
                    result += f"pyautogui.click({x}, {y})\n"
                elif action == "right_click":
                    result += f"pyautogui.rightClick({x}, {y})\n"
                elif action == "double_click":
                    result += f"pyautogui.doubleClick({x}, {y})\n"
                elif action == "middle_click":
                    result += f"pyautogui.middleClick({x}, {y})\n"
                elif action == "left_press":
                    result += f"pyautogui.mouseDown({x}, {y})\n"
                    result += "time.sleep(1)\n"
                    result += f"pyautogui.mouseUp({x}, {y})\n"
            else:
                if action == "left_click":
                    result += "pyautogui.click()\n"
                elif action == "right_click":
                    result += "pyautogui.rightClick()\n"
                elif action == "double_click":
                    result += "pyautogui.doubleClick()\n"
                elif action == "middle_click":
                    result += "pyautogui.middleClick()\n"
                elif action == "left_press":
                    result += "pyautogui.mouseDown()\n"
                    result += "time.sleep(1)\n"
                    result += "pyautogui.mouseUp()\n"
            expected_outcome = "Click action finished"
        elif action == "wait":
            result += "pyautogui.sleep(0.5)\n"
            expected_outcome = "Wait for 0.5 seconds"
        elif action == "fail":
            result += "FAIL"
            expected_outcome = "Finished"
        elif action == "done":
            result += "DONE"
            expected_outcome = "Finished"
        elif action == "call_user":
            result += "CALL_USER"
            expected_outcome = "Call user"
        elif action == "screenshot":
            result += "pyautogui.sleep(0.1)\n"
            expected_outcome = "Screenshot taken"
        else:
            raise ValueError(f"Invalid action: {action}")
        return result
    def predict(self, task_instruction: str, obs: Dict = None, system: Any = None):
        system = BetaTextBlockParam(
            type="text",
            text=f"{SYSTEM_PROMPT_WINDOWS if self.platform == 'Windows' else SYSTEM_PROMPT}{' ' + self.system_prompt_suffix if self.system_prompt_suffix else ''}",
        )
        # Downscale the screenshot to the fixed 1280x720 base resolution the model expects
        if obs and "screenshot" in obs:
            screenshot_bytes = obs["screenshot"]
            screenshot_image = Image.open(io.BytesIO(screenshot_bytes))
            new_width, new_height = 1280, 720
            resized_image = screenshot_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            output_buffer = io.BytesIO()
            resized_image.save(output_buffer, format='PNG')
            obs["screenshot"] = output_buffer.getvalue()
        if not self.messages:
            init_screenshot = obs
            init_screenshot_base64 = base64.b64encode(init_screenshot["screenshot"]).decode('utf-8')
            self.messages.append({
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": init_screenshot_base64,
                        },
                    },
                    {"type": "text", "text": task_instruction},
                ],
            })
        # If the previous assistant turn issued a tool call, report its result back
        if self.messages and "tool_use" in [content_block["type"] for content_block in self.messages[-1]["content"]]:
            self.add_tool_result(
                self.messages[-1]["content"][-1]["id"],
                "Success",
                screenshot=obs.get("screenshot") if obs else None,
            )
        enable_prompt_caching = False
        betas = ["computer-use-2025-01-24"]
        if self.model_name in ("claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"):
            betas = ["computer-use-2025-01-24"]
        elif self.model_name == "claude-3-5-sonnet-20241022":
            betas = [COMPUTER_USE_BETA_FLAG]
        image_truncation_threshold = 10
        if self.provider == APIProvider.ANTHROPIC:
            client = Anthropic(api_key=self.api_key, max_retries=4)
            enable_prompt_caching = True
        elif self.provider == APIProvider.VERTEX:
            client = AnthropicVertex()
        elif self.provider == APIProvider.BEDROCK:
            client = AnthropicBedrock(
                # Authenticate by either providing the keys below or using the default AWS credential
                # providers, such as ~/.aws/credentials or the "AWS_SECRET_ACCESS_KEY" and
                # "AWS_ACCESS_KEY_ID" environment variables.
                aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'),
                aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                # aws_region changes the AWS region to which the request is made. By default we read
                # AWS_REGION, falling back to us-east-1. Note that ~/.aws/config is not read for the region.
                aws_region=os.getenv('AWS_DEFAULT_REGION'),
            )
        if enable_prompt_caching:
            betas.append(PROMPT_CACHING_BETA_FLAG)
            _inject_prompt_caching(self.messages)
            image_truncation_threshold = 50
            system["cache_control"] = {"type": "ephemeral"}
        if self.only_n_most_recent_images:
            _maybe_filter_to_n_most_recent_images(
                self.messages,
                self.only_n_most_recent_images,
                min_removal_threshold=image_truncation_threshold,
            )
        try:
            if self.model_name == "claude-3-5-sonnet-20241022":
                tools = [
                    {'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
                    # {'type': 'bash_20241022', 'name': 'bash'},
                    # {'name': 'str_replace_editor', 'type': 'text_editor_20241022'}
                ] if self.platform == 'Ubuntu' else [
                    {'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
                ]
            elif self.model_name in ("claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"):
                tools = [
                    {'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
                    # {'type': 'bash_20250124', 'name': 'bash'},
                    # {'name': 'str_replace_editor', 'type': 'text_editor_20250124'}
                ] if self.platform == 'Ubuntu' else [
                    {'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
                ]
            extra_body = {
                "thinking": {"type": "enabled", "budget_tokens": 1024}
            }
            response = None
            if self.model_name in ("claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"):
                response = client.beta.messages.create(
                    max_tokens=self.max_tokens,
                    messages=self.messages,
                    model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
                    system=[system],
                    tools=tools,
                    betas=betas,
                    extra_body=extra_body,
                )
            elif self.model_name == "claude-3-5-sonnet-20241022":
                response = client.beta.messages.create(
                    max_tokens=self.max_tokens,
                    messages=self.messages,
                    model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
                    system=[system],
                    tools=tools,
                    betas=betas,
                )
        except (APIError, APIStatusError, APIResponseValidationError) as e:
            self.logger.exception(f"Anthropic API error: {str(e)}")
            try:
                self.logger.warning("Retrying with backup API key...")
                backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
                if self.model_name in ("claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"):
                    response = backup_client.beta.messages.create(
                        max_tokens=self.max_tokens,
                        messages=self.messages,
                        model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
                        system=[system],
                        tools=tools,
                        betas=betas,
                        extra_body=extra_body,
                    )
                elif self.model_name == "claude-3-5-sonnet-20241022":
                    response = backup_client.beta.messages.create(
                        max_tokens=self.max_tokens,
                        messages=self.messages,
                        model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
                        system=[system],
                        tools=tools,
                        betas=betas,
                    )
                self.logger.info("Successfully used backup API key")
            except Exception as backup_e:
                self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
                return None, None
        except Exception as e:
            self.logger.exception(f"Error in Anthropic API: {str(e)}")
            return None, None
        response_params = _response_to_params(response)
        logger.info(f"Received response params: {response_params}")
        # Store the assistant response in the message history
        self.messages.append({
            "role": "assistant",
            "content": response_params,
        })
        actions: list[Any] = []
        reasonings: list[str] = []
        for content_block in response_params:
            if content_block["type"] == "tool_use":
                actions.append({
                    "name": content_block["name"],
                    "input": cast(dict[str, Any], content_block["input"]),
                    "id": content_block["id"],
                    "action_type": content_block.get("type"),
                    "command": self.parse_actions_from_tool_call(content_block),
                })
            elif content_block["type"] == "text":
                reasonings.append(content_block["text"])
        # Collapse the reasoning list to its first entry (or an empty string)
        reasonings = reasonings[0] if reasonings else ""
        logger.info(f"Received actions: {actions}")
        logger.info(f"Received reasonings: {reasonings}")
        if len(actions) == 0:
            actions = ["DONE"]
        return reasonings, actions
    def reset(self, *args, **kwargs):
        """Reset the agent's state."""
        self.messages = []
        self.logger.info(f"{self.class_name} reset.")
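
For orientation, here is a minimal sketch of how this (now-deleted) agent class was typically driven. The FakeEnv stand-in, the step budget, and the task instruction are illustrative assumptions, not part of this diff:

import io
from PIL import Image
from mm_agents.anthropic import AnthropicAgent

class FakeEnv:
    """Stand-in environment; a real env would execute the pyautogui snippet inside a VM."""
    def __init__(self):
        buf = io.BytesIO()
        Image.new("RGB", (1920, 1080)).save(buf, format="PNG")
        self.png = buf.getvalue()
    def step(self, command: str):
        return {"screenshot": self.png}

env = FakeEnv()
agent = AnthropicAgent(platform="Ubuntu", screen_size=(1920, 1080))
agent.reset()
obs = {"screenshot": env.png}
for _ in range(15):  # assumed step budget
    reasoning, actions = agent.predict("Open the file manager", obs=obs)
    if not actions or actions == ["DONE"]:
        break
    for action in actions:
        # "command" holds the generated pyautogui snippet; note that a model click at
        # (640, 360) in the 1280x720 space maps to pyautogui.click(960, 540) at this screen size
        obs = env.step(action["command"])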

View File

@@ -4,11 +4,11 @@
 # Monitor configuration
 TASK_CONFIG_PATH=../evaluation_examples/test_all.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_all
-ACTION_SPACE=pyautogui
-OBSERVATION_TYPE=screenshot
-MODEL_NAME=computer-use-preview
-MAX_STEPS=150
+RESULTS_BASE_PATH=../results
+# ACTION_SPACE=pyautogui
+# OBSERVATION_TYPE=screenshot
+# MODEL_NAME=computer-use-preview
+# MAX_STEPS=150
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
-FLASK_DEBUG=true
+FLASK_DEBUG=false

View File

@@ -1,14 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from functools import cache
import os
import json
import time
import subprocess
from datetime import datetime
from pathlib import Path
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
@@ -38,12 +41,57 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))

def initialize_default_config():
    """Initialize default configuration from the first available config in the results directory"""
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH, MAX_STEPS
    if os.path.exists(RESULTS_BASE_PATH):
        try:
            # Scan for the first available configuration
            for action_space in os.listdir(RESULTS_BASE_PATH):
                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
                if os.path.isdir(action_space_path):
                    for obs_type in os.listdir(action_space_path):
                        obs_path = os.path.join(action_space_path, obs_type)
                        if os.path.isdir(obs_path):
                            for model_name in os.listdir(obs_path):
                                model_path = os.path.join(obs_path, model_name)
                                if os.path.isdir(model_path):
                                    # Use the first available configuration as the default
                                    ACTION_SPACE = action_space
                                    OBSERVATION_TYPE = obs_type
                                    MODEL_NAME = model_name
                                    RESULTS_PATH = model_path
                                    # Read max_steps from args.json if available
                                    # (get_model_args is defined later in this module; a NameError
                                    # here is absorbed by the except below, triggering the fallback)
                                    model_args = get_model_args(action_space, obs_type, model_name)
                                    if model_args and 'max_steps' in model_args:
                                        MAX_STEPS = model_args['max_steps']
                                    print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME} (max_steps: {MAX_STEPS})")
                                    return
        except Exception as e:
            print(f"Error scanning results directory for default config: {e}")
    # Fall back to the original environment-based path if no configs were found
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME} (max_steps: {MAX_STEPS})")

# Initialize default configuration
initialize_default_config()

RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

if RESULTS_PATH not in TASK_STATUS_CACHE:
    # Initialize cache for this results path
    TASK_STATUS_CACHE[RESULTS_PATH] = {}

@cache
def load_task_list():
    with open(TASK_CONFIG_PATH, 'r') as f:
        return json.load(f)

@cache
def get_task_info(task_type, task_id):
    task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
    if os.path.exists(task_file):
@@ -183,8 +231,8 @@ def get_task_status_brief(task_type, task_id):
     # Check if the status is already cached
     current_time = time.time()
     last_cache_time = None
-    if cache_key in TASK_STATUS_CACHE:
-        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+    if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
+        cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
         last_cache_time = cached_time
         # If cached status is "Done", check if it's within the stability period
         if cached_status["status"].startswith("Done"):
@@ -312,7 +360,7 @@ def get_task_status_brief(task_type, task_id):
     # Cache the status if it is done or error
     if status.startswith("Done") or status == "Error":
         current_time = last_cache_time if last_cache_time else current_time
-        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
+        TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)

     return status_dict
@@ -434,6 +482,115 @@ def api_task_detail(task_type, task_id):
        "status": task_status
    })

@app.route('/api/config')
def api_config():
    """Get configuration information from environment variables - deprecated, use /api/current-config instead"""
    config_info = {
        "task_config_path": TASK_CONFIG_PATH,
        "results_base_path": RESULTS_BASE_PATH,
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "examples_base_path": EXAMPLES_BASE_PATH
    }
    return jsonify(config_info)
@app.route('/api/available-configs')
def api_available_configs():
    """Get all available configuration combinations by scanning the results directory"""
    configs = []
    if os.path.exists(RESULTS_BASE_PATH):
        try:
            # Scan action spaces
            for action_space in os.listdir(RESULTS_BASE_PATH):
                action_space_path = os.path.join(RESULTS_BASE_PATH, action_space)
                if os.path.isdir(action_space_path):
                    # Scan observation types
                    for obs_type in os.listdir(action_space_path):
                        obs_path = os.path.join(action_space_path, obs_type)
                        if os.path.isdir(obs_path):
                            # Scan model names
                            for model_name in os.listdir(obs_path):
                                model_path = os.path.join(obs_path, model_name)
                                if os.path.isdir(model_path):
                                    configs.append({
                                        "action_space": action_space,
                                        "observation_type": obs_type,
                                        "model_name": model_name,
                                        "path": model_path
                                    })
        except Exception as e:
            print(f"Error scanning results directory: {e}")
    return jsonify(configs)
@app.route('/api/current-config')
def api_current_config():
    """Get current configuration including args.json data"""
    config = {
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    }
    # Add model args from args.json
    model_args = get_model_args(ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    config["model_args"] = model_args if model_args else {}
    return jsonify(config)
@app.route('/api/set-config', methods=['POST'])
def api_set_config():
    """Set current configuration"""
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH, MAX_STEPS
    data = request.get_json()
    if not data:
        return jsonify({"error": "No data provided"}), 400
    # Update global variables
    ACTION_SPACE = data.get('action_space', ACTION_SPACE)
    OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
    MODEL_NAME = data.get('model_name', MODEL_NAME)
    # Update results path
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    # Update max_steps from args.json if available
    model_args = get_model_args(ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    if model_args and 'max_steps' in model_args:
        MAX_STEPS = model_args['max_steps']
    if RESULTS_PATH not in TASK_STATUS_CACHE:
        # Initialize cache for this results path
        TASK_STATUS_CACHE[RESULTS_PATH] = {}
    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    })
def get_model_args(action_space, observation_type, model_name):
    """Get model arguments from the args.json file"""
    args_file = os.path.join(RESULTS_BASE_PATH, action_space, observation_type, model_name, "args.json")
    if os.path.exists(args_file):
        try:
            with open(args_file, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error reading args.json: {e}")
    return None
if __name__ == '__main__':
    # Check if necessary directories exist
    if not os.path.exists(TASK_CONFIG_PATH):

@@ -447,4 +604,4 @@ if __name__ == '__main__':
     port = 8080
     debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
-    app.run(host=host, port=port, debug=debug)
+    app.run(host=host, port=port, debug=debug, threaded=True)
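
For reference, a minimal sketch of how the new configuration endpoints can be exercised from a client. The localhost:8080 base URL is an assumption, and the POST payload simply echoes an entry returned by /api/available-configs:

import requests

BASE = "http://localhost:8080"  # assumed host/port of a local monitor instance

# Configurations are discovered server-side by scanning the results directory
configs = requests.get(f"{BASE}/api/available-configs").json()
print(requests.get(f"{BASE}/api/current-config").json())

if configs:
    # Switch the monitor to the first discovered configuration
    resp = requests.post(f"{BASE}/api/set-config", json=configs[0])
    print(resp.json()["results_path"])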

View File

@@ -1,5 +1,63 @@
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }

.layout-container {
    position: relative;
    max-width: 1200px;
    margin: 20px auto;
    padding: 0 20px;
}

.main-content {
    background: #fff;
    border-radius: 14px;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    padding: 36px 44px;
}

/* Floating Config Sidebar */
.config-sidebar {
    position: fixed;
    top: 20px;
    left: -280px;
    width: 300px;
    height: calc(100vh - 40px);
    z-index: 1000;
    transition: left 0.3s ease;
}

.config-sidebar:hover {
    left: 0;
}

.config-toggle-btn {
    position: absolute;
    right: -50px;
    top: 50%;
    transform: translateY(-50%);
    width: 50px;
    height: 50px;
    background: linear-gradient(135deg, #007bff, #0056b3);
    border-radius: 0 25px 25px 0;
    display: flex;
    align-items: center;
    justify-content: center;
    color: white;
    font-size: 1.2em;
    cursor: pointer;
    box-shadow: 2px 0 10px rgba(0,0,0,0.2);
    transition: all 0.3s ease;
}

.config-toggle-btn:hover {
    background: linear-gradient(135deg, #0056b3, #004085);
    transform: translateY(-50%) scale(1.05);
}

.config-sidebar:hover .config-toggle-btn {
    opacity: 0.8;
}

.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
@@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    text-shadow: 0 1px 2px rgba(0,0,0,0.05);
}

.accuracy-percentage {
    font-size: 0.7em;
    font-weight: 600;
    color: #ffffff;
    margin-left: 8px;
    background: rgba(255, 255, 255, 0.1);
    padding: 4px 8px;
    border-radius: 12px;
    display: inline-block;
    vertical-align: middle;
}

.stat-card span {
    font-size: 2em;
@@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
 .task-type-stats {
     display: flex;
-    gap: 16px;
+    flex-wrap: wrap;
+    gap: 8px;
     align-items: center;
 }
.task-stat {
@@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    color: #b71c1c;
}

/* Task type statistics styles */
.task-stat.score {
    color: #ffc107;
    background: rgba(255, 193, 7, 0.1);
}

.task-stat.steps {
    color: #17a2b8;
    background: rgba(23, 162, 184, 0.1);
}

.task-stat.rate {
    color: #28a745;
    background: rgba(40, 167, 69, 0.1);
}

.tasks-container {
    padding: 20px;
    transition: all 0.4s cubic-bezier(.4,0,.2,1);
@@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
    background: #a5c7e5;
}

/* Configuration Panel Styles */
.config-panel {
    background: #fff;
    border-radius: 0 14px 14px 0;
    box-shadow: 0 8px 32px rgba(0,0,0,0.15);
    overflow: hidden;
    height: 100%;
    display: flex;
    flex-direction: column;
}

.config-header {
    display: flex;
    align-items: center;
    padding: 16px 20px;
    background: linear-gradient(135deg, #6c757d, #495057);
    color: white;
    flex-shrink: 0;
}

.config-header i {
    margin-right: 10px;
    font-size: 1.1em;
}

.config-header span {
    font-weight: 600;
    font-size: 1.1em;
}

.config-content {
    padding: 20px;
    flex: 1;
    overflow-y: auto;
}

.config-selector {
    margin-bottom: 20px;
    padding-bottom: 15px;
    border-bottom: 1px solid #dee2e6;
}

.selector-item {
    display: flex;
    flex-direction: column;
    gap: 8px;
}

.selector-item label {
    font-weight: 600;
    color: #495057;
    font-size: 0.9em;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

.selector-item select {
    padding: 8px 12px;
    border: 2px solid #e9ecef;
    border-radius: 6px;
    background: white;
    font-size: 0.9em;
    color: #495057;
    cursor: pointer;
    transition: all 0.3s ease;
}

.selector-item select:focus {
    outline: none;
    border-color: #007bff;
    box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
}

.selector-item select:hover {
    border-color: #007bff;
}

.config-list {
    display: flex;
    flex-direction: column;
    gap: 15px;
}

.config-item {
    display: flex;
    flex-direction: column;
    background: #f8f9fa;
    padding: 12px;
    border-radius: 8px;
    border-left: 4px solid #007bff;
    transition: all 0.3s ease;
}

.config-item:hover {
    transform: translateX(3px);
    box-shadow: 0 4px 12px rgba(0,123,255,0.15);
}

.config-label {
    font-weight: 600;
    color: #495057;
    font-size: 0.85em;
    margin-bottom: 6px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

.config-value {
    color: #007bff;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
    font-weight: 600;
    word-break: break-word;
}

.config-path {
    font-size: 0.8em;
    line-height: 1.3;
}

/* Responsive design for sidebar layout */
@media (max-width: 1024px) {
    .config-sidebar {
        left: -250px;
        width: 250px;
    }

    .config-toggle-btn {
        right: -40px;
        width: 40px;
        height: 40px;
        font-size: 1em;
    }
}

@media (max-width: 768px) {
    .layout-container {
        padding: 0 10px;
    }

    .main-content {
        padding: 20px 25px;
    }

    .config-sidebar {
        left: -220px;
        width: 220px;
        height: calc(100vh - 20px);
        top: 10px;
    }

    .config-toggle-btn {
        right: -35px;
        width: 35px;
        height: 35px;
        font-size: 0.9em;
    }

    .config-content {
        padding: 15px;
    }

    .config-item {
        padding: 10px;
    }
}

View File

@@ -1,5 +1,8 @@
 document.addEventListener('DOMContentLoaded', () => {
-    fetchTasks();
+    fetchAvailableConfigs().then(() => {
+        fetchConfig();
+        fetchTasks();
+    });

     // Bind filter functionality
     document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
     document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
@@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {

 let allTaskData = null;
 let currentFilter = 'all';
+let availableConfigs = [];
+let currentConfig = null;
+let categoryStats = {};

 function refreshPage() {
     // Save expanded state before refresh
@@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
     fetch('/api/tasks/brief')
         .then(response => response.json())
         .then(data => {
-            // Update stored data
             allTaskData = data;
+            categoryStats = calculateCategoryStats(data);
             // Only update statistics and task status, do not fully re-render
             updateStatistics(data);
             updateTaskStatus(data);
@@ -148,6 +154,7 @@ function fetchTasks() {
         .then(response => response.json())
         .then(data => {
             allTaskData = data;
+            categoryStats = calculateCategoryStats(data);
             renderTasks(data);
             updateStatistics(data);
         })
@@ -208,13 +215,15 @@ function updateStatistics(data) {
     document.getElementById('completed-tasks').textContent = completedTasks;
     document.getElementById('error-tasks').textContent = errorTasks;

-    // Update score display with formatted score
+    // Update score display with formatted score and accuracy percentage
     const scoreDisplay = document.getElementById('score-display');
     if (completedTasks > 0) {
         const scoreFormatted = totalScore.toFixed(2);
-        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
+        const averageScore = totalScore / completedTasks;
+        const accuracyPercentage = (averageScore * 100).toFixed(1);
+        scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
     } else {
-        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
+        scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
     }
// Highlight the currently selected statistics card
@@ -279,6 +288,10 @@ function renderTasks(data) {
         // Create header with task type name and statistics
         const typeHeader = document.createElement('div');
         typeHeader.className = 'task-type-header';
+
+        // Get category stats for this task type
+        const stats = categoryStats[taskType] || {};
+
         typeHeader.innerHTML = `
             <span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
             <div class="task-type-stats">
@@ -286,6 +299,8 @@ function renderTasks(data) {
                 <span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
                 <span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
                 <span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
+                ${stats.total_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.total_score} total score</span>` : ''}
+                ${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
             </div>
         `;
         typeSection.appendChild(typeHeader);
@@ -453,7 +468,200 @@ function renderTasks(data) {
        container.appendChild(typeSection);
    });
}

// Auto-refresh every 10 seconds
setInterval(() => {
    refreshPage();
}, 10000);
function fetchAvailableConfigs() {
    return fetch('/api/available-configs')
        .then(response => response.json())
        .then(data => {
            availableConfigs = data;
            populateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching available configs:', error);
            return [];
        });
}

function populateConfigSelect() {
    const select = document.getElementById('config-select');
    select.innerHTML = '';
    if (availableConfigs.length === 0) {
        select.innerHTML = '<option value="">No configurations found in results directory</option>';
        return;
    }
    // Add available configurations
    availableConfigs.forEach((config, index) => {
        const option = document.createElement('option');
        option.value = index;
        option.textContent = `${config.action_space} / ${config.observation_type} / ${config.model_name}`;
        select.appendChild(option);
    });
}

function changeConfiguration() {
    const select = document.getElementById('config-select');
    const selectedIndex = select.value;
    if (selectedIndex === '' || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
        return;
    }
    const selectedConfig = availableConfigs[selectedIndex];
    // Send configuration change request
    fetch('/api/set-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(selectedConfig)
    })
        .then(response => response.json())
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            // Refresh tasks with the new configuration
            fetchTasks();
        })
        .catch(error => {
            console.error('Error setting config:', error);
            displayConfigError();
        });
}

function fetchConfig() {
    return fetch('/api/current-config')
        .then(response => response.json())
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            updateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching config:', error);
            displayConfigError();
        });
}

function updateConfigSelect() {
    if (!currentConfig || availableConfigs.length === 0) return;
    const select = document.getElementById('config-select');
    const currentConfigIndex = availableConfigs.findIndex(config =>
        config.action_space === currentConfig.action_space &&
        config.observation_type === currentConfig.observation_type &&
        config.model_name === currentConfig.model_name
    );
    if (currentConfigIndex !== -1) {
        select.value = currentConfigIndex;
    } else if (availableConfigs.length > 0) {
        // Current config not found in available configs; default to the first one
        select.value = 0;
        console.warn('Current config not found in available configs, defaulting to first available config');
    }
}

function displayConfig(config) {
    document.getElementById('action-space').textContent = config.action_space || 'N/A';
    document.getElementById('observation-type').textContent = config.observation_type || 'N/A';
    document.getElementById('model-name').textContent = config.model_name || 'N/A';
    document.getElementById('max-steps').textContent = config.max_steps || 'N/A';
    // Display model args from args.json
    const modelArgsElement = document.getElementById('model-args');
    if (config.model_args && Object.keys(config.model_args).length > 0) {
        let argsHtml = '';
        Object.entries(config.model_args).forEach(([key, value]) => {
            // Skip max_steps as it's already displayed above
            if (key !== 'max_steps') {
                argsHtml += `<div class="config-item">
                    <span class="config-label">${key}:</span>
                    <span class="config-value">${JSON.stringify(value)}</span>
                </div>`;
            }
        });
        modelArgsElement.innerHTML = argsHtml;
        modelArgsElement.style.display = 'block';
    } else {
        modelArgsElement.style.display = 'none';
    }
}

function displayConfigError() {
    const configValues = document.querySelectorAll('.config-value');
    configValues.forEach(element => {
        element.textContent = 'Error loading';
        element.style.color = '#dc3545';
    });
}

function calculateCategoryStats(data) {
    const stats = {};
    Object.entries(data).forEach(([taskType, tasks]) => {
        let totalTasks = tasks.length;
        let completedTasks = 0;
        let runningTasks = 0;
        let errorTasks = 0;
        let totalScore = 0;
        let totalSteps = 0;
        let completedWithSteps = 0;
        tasks.forEach(task => {
            const status = task.status.status;
            if (['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'].includes(status)) {
                completedTasks++;
                // Accumulate the score if it parses to a value in [0, 1]
                if (task.status.result) {
                    const score = parseFloat(task.status.result);
                    if (!isNaN(score) && score >= 0 && score <= 1) {
                        totalScore += score;
                    }
                }
                // Accumulate steps for completed tasks
                if (task.status.progress && task.status.progress > 0) {
                    totalSteps += task.status.progress;
                    completedWithSteps++;
                }
            } else if (['Running', 'Preparing', 'Initializing'].includes(status)) {
                runningTasks++;
            } else if (status === 'Error') {
                errorTasks++;
            }
        });
        // Calculate averages
        const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
        const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
        const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;
        stats[taskType] = {
            total_tasks: totalTasks,
            completed_tasks: completedTasks,
            running_tasks: runningTasks,
            error_tasks: errorTasks,
            total_score: Math.round(totalScore * 100) / 100,
            avg_score: Math.round(avgScore * 10000) / 10000,
            avg_steps: Math.round(avgSteps * 10) / 10,
            completion_rate: Math.round(completionRate * 10) / 10
        };
    });
    return stats;
}

View File

@@ -12,19 +12,65 @@
     <link rel="stylesheet" href="/static/index.css">
 </head>
 <body>
-    <div class="main-container">
-        <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
-        <!-- Score Display Banner -->
-        <div class="score-banner">
-            <div class="score-content">
-                <i class="fas fa-star"></i>
-                <span class="score-label">Score:</span>
-                <span id="score-display" class="score-value">Loading...</span>
-            </div>
-        </div>
-        <div class="dashboard-stats">
+    <div class="layout-container">
+        <!-- Floating Config Button and Sidebar -->
+        <div class="config-sidebar" id="config-sidebar">
+            <div class="config-toggle-btn">
+                <i class="fas fa-cogs"></i>
+            </div>
+            <div class="config-panel">
+                <div class="config-header">
+                    <i class="fas fa-cogs"></i>
+                    <span>Configuration</span>
+                </div>
+                <div class="config-content">
+                    <div class="config-selector">
+                        <div class="selector-item">
+                            <label for="config-select">Select Configuration:</label>
+                            <select id="config-select" onchange="changeConfiguration()">
+                                <option value="">Loading configurations...</option>
+                            </select>
+                        </div>
+                    </div>
+                    <div class="config-list">
+                        <div class="config-item">
+                            <span class="config-label">Action Space:</span>
+                            <span class="config-value" id="action-space">Loading...</span>
+                        </div>
+                        <div class="config-item">
+                            <span class="config-label">Observation:</span>
+                            <span class="config-value" id="observation-type">Loading...</span>
+                        </div>
+                        <div class="config-item">
+                            <span class="config-label">Model:</span>
+                            <span class="config-value" id="model-name">Loading...</span>
+                        </div>
+                        <div class="config-item">
+                            <span class="config-label">Max Steps:</span>
+                            <span class="config-value" id="max-steps">Loading...</span>
+                        </div>
+                        <div id="model-args" style="display: none;">
+                            <!-- Model args from args.json will be populated here -->
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+
+        <!-- Main Content -->
+        <div class="main-content">
+            <h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
+            <!-- Score Display Banner -->
+            <div class="score-banner">
+                <div class="score-content">
+                    <i class="fas fa-star"></i>
+                    <span class="score-label">Score:</span>
+                    <span id="score-display" class="score-value">Loading...</span>
+                </div>
+            </div>
+            <div class="dashboard-stats">
                 <div class="stat-card">
                     <i class="fas fa-running"></i>
                     <span id="active-tasks">Loading...</span>
@@ -46,10 +92,11 @@
                     <div class="stat-label">Total Tasks</div>
                 </div>
             </div>
-        <div id="task-container">
-            <div class="loading-spinner">
-                <div class="spinner"></div>
-                <div>Loading task data...</div>
+            <div id="task-container">
+                <div class="loading-spinner">
+                    <div class="spinner"></div>
+                    <div>Loading task data...</div>
+                </div>
             </div>
         </div>
     </div>

run.py (+12)
View File

@@ -290,6 +290,18 @@ if __name__ == "__main__":
    ####### The complete version of the list of examples #######
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # save args to json in result_dir/action_space/observation_type/model/args.json
    path_to_args = os.path.join(
        args.result_dir,
        args.action_space,
        args.observation_type,
        args.model,
        "args.json",
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)
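
Each entry point now persists its argparse namespace next to the results; this is the same args.json that the monitor's get_model_args() reads back. A small sketch of the round trip, with illustrative values in place of the real argparse output:

import json
import os

# Illustrative values; the real file holds whatever argparse produced for the run
result_dir, action_space, observation_type, model = "./results", "pyautogui", "screenshot", "computer-use-preview"
path_to_args = os.path.join(result_dir, action_space, observation_type, model, "args.json")
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
with open(path_to_args, "w", encoding="utf-8") as f:
    json.dump({"model": model, "max_steps": 150}, f, indent=4)

# The monitor later recovers max_steps the same way
with open(path_to_args, encoding="utf-8") as f:
    print(json.load(f)["max_steps"])  # -> 150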

View File

@@ -342,6 +342,18 @@ if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # save args to json in result_dir/action_space/observation_type/model/args.json
    path_to_args = os.path.join(
        args.result_dir,
        args.action_space,
        args.observation_type,
        args.model,
        "args.json",
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

View File

@@ -333,6 +333,18 @@ if __name__ == "__main__":
    ####### The complete version of the list of examples #######
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # save args to json in result_dir/action_space/observation_type/model/args.json
    path_to_args = os.path.join(
        args.result_dir,
        args.action_space,
        args.observation_type,
        args.model,
        "args.json",
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

View File

@@ -12,12 +12,12 @@ from typing import List, Dict
 import math
 from tqdm import tqdm
 from multiprocessing import Process, Manager

-import lib_run_single
-from desktop_env.desktop_env import DesktopEnv
+# import lib_run_single
+# from desktop_env.desktop_env import DesktopEnv
 from mm_agents.anthropic import AnthropicAgent as PromptAgent
-# import fake_run_single as lib_run_single
-# from test_env import DesktopEnv
+import fake_run_single as lib_run_single
+from test_env import DesktopEnv

 # .env
 from dotenv import load_dotenv
@@ -352,6 +352,17 @@ if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # save args to json in result_dir/action_space/observation_type/model/args.json
    path_to_args = os.path.join(
        args.result_dir,
        args.action_space,
        args.observation_type,
        args.model,
        "args.json",
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

View File

@@ -464,6 +464,18 @@ if __name__ == "__main__":
    try:
        args = config()

        # save args to json in result_dir/action_space/observation_type/model/args.json
        path_to_args = os.path.join(
            args.result_dir,
            args.action_space,
            args.observation_type,
            args.model,
            "args.json",
        )
        os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
        with open(path_to_args, "w", encoding="utf-8") as f:
            json.dump(vars(args), f, indent=4)

        with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
            test_all_meta = json.load(f)

View File

@@ -321,6 +321,18 @@ if __name__ == "__main__":
    ####### The complete version of the list of examples #######
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # save args to json in result_dir/action_space/observation_type/model/args.json
    path_to_args = os.path.join(
        args.result_dir,
        args.action_space,
        args.observation_type,
        args.model,
        "args.json",
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)