Compare commits
22 Commits
main
...
feat/monit
| Author | SHA1 | Date |
|---|---|---|
|
|
6275277abe | |
|
|
9b2ecdf5c9 | |
|
|
01008b54ce | |
|
|
8740543c91 | |
|
|
db2ef13ab1 | |
|
|
957d73f4ea | |
|
|
c5dfd3bc29 | |
|
|
7d17aa57f6 | |
|
|
4a0d605a83 | |
|
|
033326ea6e | |
|
|
f7c33ed33b | |
|
|
0c426390e8 | |
|
|
996fc9d54e | |
|
|
1493246fa6 | |
|
|
7d4bcbd9d7 | |
|
|
14fe4d9476 | |
|
|
e75ac625fc | |
|
|
03385db30e | |
|
|
88b080388d | |
|
|
6549648c0d | |
|
|
15674876a0 | |
|
|
3455c8fb73 |
|
|
@ -1,442 +0,0 @@
|
|||
import base64
|
||||
import os
|
||||
import time
|
||||
from typing import Any, cast, Optional, Dict
|
||||
from PIL import Image
|
||||
import io
|
||||
|
||||
from anthropic import (
|
||||
Anthropic,
|
||||
AnthropicBedrock,
|
||||
AnthropicVertex,
|
||||
APIError,
|
||||
APIResponseValidationError,
|
||||
APIStatusError,
|
||||
)
|
||||
from anthropic.types.beta import (
|
||||
BetaMessageParam,
|
||||
BetaTextBlockParam,
|
||||
)
|
||||
from .utils import COMPUTER_USE_BETA_FLAG, PROMPT_CACHING_BETA_FLAG,SYSTEM_PROMPT, SYSTEM_PROMPT_WINDOWS, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME
|
||||
from .utils import _response_to_params, _inject_prompt_caching, _maybe_filter_to_n_most_recent_images
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
class AnthropicAgent:
    """Desktop agent backed by Anthropic's computer-use tool.

    The running conversation is kept in ``self.messages``. Each call to
    :meth:`predict` sends that history (plus the newest screenshot) to the
    model and converts every returned ``tool_use`` block into a pyautogui
    command string, or into one of the ``FAIL`` / ``DONE`` / ``CALL_USER``
    markers.

    Screenshots are downscaled to 1280x720 before being sent, and coordinates
    returned by the model are scaled back up to ``screen_size``.
    """

    # Size of the screenshots sent to the model; also the display size
    # declared in the tool definition, so model coordinates are in this space.
    _MODEL_WIDTH = 1280
    _MODEL_HEIGHT = 720

    # Models that use the 20241022 tool without the "thinking" request field.
    _LEGACY_MODELS = ("claude-3-5-sonnet-20241022",)
    # Models that use the 20250124 tool and are called with "thinking" enabled.
    _THINKING_MODELS = (
        "claude-3-7-sonnet-20250219",
        "claude-4-opus-20250514",
        "claude-4-sonnet-20250514",
    )

    # Alternate spellings of action names, normalised to handled names.
    _ACTION_ALIASES = {
        "left click": "left_click",
        "right click": "right_click",
    }

    # Key names used by the model -> key names passed to pyautogui.
    _KEY_ALIASES = {
        "page_down": "pagedown",
        "page_up": "pageup",
        "super_l": "win",
        "super": "command",
        "escape": "esc",
    }

    # Click action -> pyautogui function name.
    _CLICK_CALLS = {
        "left_click": "click",
        "right_click": "rightClick",
        "double_click": "doubleClick",
        "middle_click": "middleClick",
    }

    # Actions whose command does not depend on any argument.
    _FIXED_COMMANDS = {
        "wait": "pyautogui.sleep(0.5)\n",
        "fail": "FAIL",
        "done": "DONE",
        "call_user": "CALL_USER",
        "screenshot": "pyautogui.sleep(0.1)\n",
    }

    def __init__(self,
                 platform: str = "Ubuntu",
                 model: str = "claude-3-5-sonnet-20241022",
                 provider: APIProvider = APIProvider.BEDROCK,
                 max_tokens: int = 4096,
                 api_key: Optional[str] = None,
                 system_prompt_suffix: str = "",
                 only_n_most_recent_images: Optional[int] = 10,
                 action_space: str = "claude_computer_use",
                 screen_size: tuple[int, int] = (1920, 1080),
                 *args, **kwargs
                 ):
        """Store the configuration; no network access happens here.

        Args:
            platform: ``"Windows"`` selects the Windows system prompt; any
                other value selects the default one.
            model: Model name; must be one of the names this class knows
                how to build a tool definition for.
            provider: Which client (Anthropic, Vertex or Bedrock) to use.
            max_tokens: ``max_tokens`` sent with every request.
            api_key: Key for the direct Anthropic client. When ``None`` the
                ``ANTHROPIC_API_KEY`` environment variable is read at
                construction time (not at import time).
            system_prompt_suffix: Text appended to the system prompt.
            only_n_most_recent_images: Passed to the image-filtering helper;
                a falsy value disables filtering.
            action_space: Stored on the instance; not read by this class.
            screen_size: Real screen resolution ``(width, height)`` used to
                scale model coordinates.
            *args, **kwargs: Accepted and ignored.
        """
        self.platform = platform
        self.action_space = action_space
        self.logger = logger
        self.class_name = self.__class__.__name__
        self.model_name = model
        self.provider = provider
        self.max_tokens = max_tokens
        if api_key is None:
            api_key = os.environ.get("ANTHROPIC_API_KEY", None)
        self.api_key = api_key
        self.system_prompt_suffix = system_prompt_suffix
        self.only_n_most_recent_images = only_n_most_recent_images
        self.messages: list[BetaMessageParam] = []
        self.screen_size = screen_size
        # Factor from model-space coordinates to real screen coordinates.
        self.resize_factor = (
            screen_size[0] / self._MODEL_WIDTH,
            screen_size[1] / self._MODEL_HEIGHT,
        )

    @staticmethod
    def _tool_result_block(tool_call_id: str, result: str, screenshot: Optional[bytes] = None) -> dict:
        """Build one ``tool_result`` content block, optionally with a PNG screenshot."""
        content: list[dict] = [{"type": "text", "text": result}]
        if screenshot is not None:
            content.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": base64.b64encode(screenshot).decode('utf-8'),
                },
            })
        return {
            "type": "tool_result",
            "tool_use_id": tool_call_id,
            "content": content,
        }

    def add_tool_result(self, tool_call_id: str, result: str, screenshot: Optional[bytes] = None):
        """Append a user message carrying the result of one tool call.

        Args:
            tool_call_id: ``id`` of the ``tool_use`` block being answered.
            result: Text reported back to the model.
            screenshot: Optional PNG bytes attached to the result.
        """
        self.messages.append({
            "role": "user",
            "content": [self._tool_result_block(tool_call_id, result, screenshot)],
        })

    def _scale_coordinate(self, coordinate) -> tuple[int, int]:
        """Validate a model-space ``(x, y)`` pair and scale it to screen space.

        Raises:
            ValueError: If ``coordinate`` is not a two-element list/tuple of
                numbers.
        """
        if (
            not isinstance(coordinate, (list, tuple))
            or len(coordinate) != 2
            or not all(isinstance(value, (int, float)) for value in coordinate)
        ):
            raise ValueError(f"{coordinate} must be a tuple of length 2")
        return (
            int(coordinate[0] * self.resize_factor[0]),
            int(coordinate[1] * self.resize_factor[1]),
        )

    def parse_actions_from_tool_call(self, tool_call: Dict) -> str:
        """Translate one ``tool_use`` block into a command string.

        Args:
            tool_call: Mapping with an ``"input"`` mapping holding the action
                arguments (``action``, ``text``, ``coordinate``,
                ``scroll_direction``, ``scroll_amount``, ``duration``).

        Returns:
            pyautogui source code (newline terminated), or one of the
            markers ``"FAIL"``, ``"DONE"``, ``"CALL_USER"``.

        Raises:
            ValueError: If the action is unknown or its arguments are
                missing, unexpected, or of the wrong type.
        """
        function_args = tool_call["input"]

        # Fall back to the tool name when the input carries no action;
        # tool_call is a mapping, so it has no ``.function`` attribute.
        action = function_args.get("action") or tool_call.get("name")
        action = self._ACTION_ALIASES.get(action, action)

        text = function_args.get("text")
        coordinate = function_args.get("coordinate")
        scroll_direction = function_args.get("scroll_direction")
        scroll_amount = function_args.get("scroll_amount")
        duration = function_args.get("duration")

        # Model coordinates refer to the downscaled screenshot.
        if coordinate and self.resize_factor:
            coordinate = self._scale_coordinate(coordinate)

        if action in self._FIXED_COMMANDS:
            return self._FIXED_COMMANDS[action]

        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ValueError(f"coordinate is required for {action}")
            if text is not None:
                raise ValueError(f"text is not accepted for {action}")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ValueError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) for i in coordinate):
                raise ValueError(f"{coordinate} must be a tuple of ints")

            x, y = coordinate
            call = "moveTo" if action == "mouse_move" else "dragTo"
            return f"pyautogui.{call}({x}, {y}, duration={duration or 0.5})\n"

        if action in ("key", "type"):
            if text is None:
                raise ValueError(f"text is required for {action}")
            if coordinate is not None:
                raise ValueError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                raise ValueError(f"{text} must be a string")

            if action == "type":
                # repr() yields a valid Python literal even when the text
                # contains quotes, backslashes or newlines.
                return f"pyautogui.typewrite({text!r}, interval=0.01)\n"

            keys = []
            for part in text.split('+'):
                name = part.strip().lower()
                keys.append(self._KEY_ALIASES.get(name, name))
            # Press in order, release in reverse order.
            pressed = [f"pyautogui.keyDown({key!r})\n" for key in keys]
            released = [f"pyautogui.keyUp({key!r})\n" for key in reversed(keys)]
            return "".join(pressed + released)

        if action == "scroll":
            if scroll_direction not in ("up", "down", "left", "right"):
                # Unknown or missing direction yields an empty command.
                return ""
            if not isinstance(scroll_amount, (int, float)):
                raise ValueError(f"scroll_amount is required for {action}")
            call = "scroll" if scroll_direction in ("up", "down") else "hscroll"
            clicks = scroll_amount if scroll_direction in ("up", "right") else -scroll_amount
            if coordinate is None:
                return f"pyautogui.{call}({clicks})\n"
            x, y = coordinate
            return f"pyautogui.{call}({clicks}, {x}, {y})\n"

        if action in self._CLICK_CALLS or action == "left_press":
            if coordinate is not None:
                x, y = coordinate
                position = f"{x}, {y}"
            else:
                position = ""
            if action == "left_press":
                # Hold the button for one second before releasing it.
                return (
                    f"pyautogui.mouseDown({position})\n"
                    "time.sleep(1)\n"
                    f"pyautogui.mouseUp({position})\n"
                )
            return f"pyautogui.{self._CLICK_CALLS[action]}({position})\n"

        raise ValueError(f"Invalid action: {action}")

    def predict(self, task_instruction: str, obs: Optional[Dict] = None, system: Any = None):
        """Ask the model for the next step.

        Args:
            task_instruction: Task text; only used for the first message of
                a conversation.
            obs: Observation mapping. When it holds PNG bytes under
                ``"screenshot"`` they are replaced in place by a 1280x720
                version before being sent.
            system: Ignored; the system prompt is always rebuilt from the
                platform and ``system_prompt_suffix``.

        Returns:
            ``(reasoning, actions)`` where ``reasoning`` is the first text
            block of the response (``""`` if none) and ``actions`` is a list
            of action dicts, or ``["DONE"]`` when the model requested no tool
            call. ``(None, None)`` when the model is unsupported, the
            provider is unknown, or the API call fails.
        """
        if self.model_name in self._LEGACY_MODELS:
            tool_type = "computer_20241022"
            betas = [COMPUTER_USE_BETA_FLAG]
            extra_body = None
        elif self.model_name in self._THINKING_MODELS:
            tool_type = "computer_20250124"
            betas = ["computer-use-2025-01-24"]
            extra_body = {"thinking": {"type": "enabled", "budget_tokens": 1024}}
        else:
            self.logger.error(f"Unsupported model for computer use: {self.model_name}")
            return None, None

        prompt = SYSTEM_PROMPT_WINDOWS if self.platform == 'Windows' else SYSTEM_PROMPT
        if self.system_prompt_suffix:
            prompt = f"{prompt} {self.system_prompt_suffix}"
        system = BetaTextBlockParam(type="text", text=prompt)

        # Downscale the screenshot to the resolution declared to the model.
        if obs and obs.get("screenshot"):
            original = Image.open(io.BytesIO(obs["screenshot"]))
            resized = original.resize(
                (self._MODEL_WIDTH, self._MODEL_HEIGHT), Image.Resampling.LANCZOS
            )
            buffer = io.BytesIO()
            resized.save(buffer, format='PNG')
            obs["screenshot"] = buffer.getvalue()

        screenshot = obs.get("screenshot") if obs else None

        if not self.messages:
            # First turn: screenshot (when available) followed by the task.
            first_content: list[dict] = []
            if screenshot is not None:
                first_content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": base64.b64encode(screenshot).decode('utf-8'),
                    },
                })
            first_content.append({"type": "text", "text": task_instruction})
            self.messages.append({"role": "user", "content": first_content})

        # Answer every tool call of the previous assistant turn in a single
        # user message; the screenshot is attached to the last result.
        pending_ids = [
            block["id"]
            for block in self.messages[-1]["content"]
            if block["type"] == "tool_use"
        ]
        if pending_ids:
            results = [self._tool_result_block(tool_id, "Success") for tool_id in pending_ids[:-1]]
            results.append(self._tool_result_block(pending_ids[-1], "Success", screenshot))
            self.messages.append({"role": "user", "content": results})

        enable_prompt_caching = False
        image_truncation_threshold = 10
        if self.provider == APIProvider.ANTHROPIC:
            client = Anthropic(api_key=self.api_key, max_retries=4)
            enable_prompt_caching = True
        elif self.provider == APIProvider.VERTEX:
            client = AnthropicVertex()
        elif self.provider == APIProvider.BEDROCK:
            client = AnthropicBedrock(
                # Credentials and region come from the environment.
                aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'),
                aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                aws_region=os.getenv('AWS_DEFAULT_REGION'),
            )
        else:
            self.logger.error(f"Unsupported API provider: {self.provider}")
            return None, None

        if enable_prompt_caching:
            betas.append(PROMPT_CACHING_BETA_FLAG)
            _inject_prompt_caching(self.messages)
            image_truncation_threshold = 50
            system["cache_control"] = {"type": "ephemeral"}

        if self.only_n_most_recent_images:
            _maybe_filter_to_n_most_recent_images(
                self.messages,
                self.only_n_most_recent_images,
                min_removal_threshold=image_truncation_threshold,
            )

        tools = [
            {
                'name': 'computer',
                'type': tool_type,
                'display_width_px': self._MODEL_WIDTH,
                'display_height_px': self._MODEL_HEIGHT,
                'display_number': 1,
            },
        ]
        request_kwargs: dict[str, Any] = {
            "max_tokens": self.max_tokens,
            "messages": self.messages,
            "system": [system],
            "tools": tools,
            "betas": betas,
        }
        if extra_body is not None:
            request_kwargs["extra_body"] = extra_body

        try:
            response = client.beta.messages.create(
                model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
                **request_kwargs,
            )
        except (APIError, APIStatusError, APIResponseValidationError) as e:
            self.logger.exception(f"Anthropic API error: {str(e)}")
            # Retry once through the direct Anthropic API with the backup key.
            try:
                self.logger.warning("Retrying with backup API key...")
                backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
                response = backup_client.beta.messages.create(
                    model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
                    **request_kwargs,
                )
                self.logger.info("Successfully used backup API key")
            except Exception as backup_e:
                self.logger.exception(f"Backup API call also failed: {str(backup_e)}")
                return None, None
        except Exception as e:
            self.logger.exception(f"Error in Anthropic API: {str(e)}")
            return None, None

        response_params = _response_to_params(response)
        logger.info(f"Received response params: {response_params}")

        # Store the response so the next call can answer its tool calls.
        self.messages.append({
            "role": "assistant",
            "content": response_params
        })

        actions: list[Any] = []
        texts: list[str] = []
        for content_block in response_params:
            if content_block["type"] == "tool_use":
                actions.append({
                    "name": content_block["name"],
                    "input": cast(dict[str, Any], content_block["input"]),
                    "id": content_block["id"],
                    "action_type": content_block.get("type"),
                    "command": self.parse_actions_from_tool_call(content_block)
                })
            elif content_block["type"] == "text":
                texts.append(content_block["text"])

        # Only the first text block is reported as the reasoning.
        reasonings = texts[0] if texts else ""
        logger.info(f"Received actions: {actions}")
        logger.info(f"Received reasonings: {reasonings}")
        if len(actions) == 0:
            actions = ["DONE"]
        return reasonings, actions

    def reset(self, *args, **kwargs):
        """Clear the conversation history; all arguments are ignored."""
        self.messages = []
        self.logger.info(f"{self.class_name} reset.")
|
||||
12
monitor/.env
12
monitor/.env
|
|
@ -4,11 +4,11 @@
|
|||
# Monitor configuration
|
||||
TASK_CONFIG_PATH=../evaluation_examples/test_all.json
|
||||
EXAMPLES_BASE_PATH=../evaluation_examples/examples
|
||||
RESULTS_BASE_PATH=../results_all
|
||||
ACTION_SPACE=pyautogui
|
||||
OBSERVATION_TYPE=screenshot
|
||||
MODEL_NAME=computer-use-preview
|
||||
MAX_STEPS=150
|
||||
RESULTS_BASE_PATH=../results
|
||||
# ACTION_SPACE=pyautogui
|
||||
# OBSERVATION_TYPE=screenshot
|
||||
# MODEL_NAME=computer-use-preview
|
||||
# MAX_STEPS=150
|
||||
FLASK_PORT=80
|
||||
FLASK_HOST=0.0.0.0
|
||||
FLASK_DEBUG=true
|
||||
FLASK_DEBUG=false
|
||||
165
monitor/main.py
165
monitor/main.py
|
|
@ -1,14 +1,17 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from functools import cache
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from flask import Flask, render_template_string, jsonify, send_file, request, render_template
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
|
|
@ -38,12 +41,57 @@ OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
|
|||
MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
|
||||
MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
|
||||
|
||||
def initialize_default_config():
    """Pick the default configuration from the results directory.

    Walks ``RESULTS_BASE_PATH/<action_space>/<observation_type>/<model_name>``
    in sorted order and adopts the first model directory found, updating the
    module globals ``ACTION_SPACE``, ``OBSERVATION_TYPE``, ``MODEL_NAME``,
    ``RESULTS_PATH`` and, when that directory's ``args.json`` provides
    ``max_steps``, ``MAX_STEPS``.

    If no configuration is found, or scanning fails, ``RESULTS_PATH`` is
    built from the current values of the globals instead.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH, MAX_STEPS

    def _subdirs(parent):
        """Yield (name, path) for each sub-directory of *parent*, sorted by name."""
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    def _read_args(model_path):
        """Return the parsed args.json of *model_path*, or None if unavailable."""
        # Read here rather than through get_model_args: this function runs
        # at import time, before definitions further down the module exist.
        try:
            with open(os.path.join(model_path, "args.json"), 'r', encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as e:
            print(f"Error reading args.json: {e}")
            return None

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                for obs_type, obs_path in _subdirs(action_space_path):
                    for model_name, model_path in _subdirs(obs_path):
                        # Use the first available configuration as default
                        ACTION_SPACE = action_space
                        OBSERVATION_TYPE = obs_type
                        MODEL_NAME = model_name
                        RESULTS_PATH = model_path

                        model_args = _read_args(model_path)
                        if isinstance(model_args, dict) and 'max_steps' in model_args:
                            MAX_STEPS = model_args['max_steps']

                        print(f"Initialized default config: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME} (max_steps: {MAX_STEPS})")
                        return
        except OSError as e:
            print(f"Error scanning results directory for default config: {e}")

    # Fallback to the environment-based path if no configs were found
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    print(f"Using fallback config from environment: {ACTION_SPACE}/{OBSERVATION_TYPE}/{MODEL_NAME} (max_steps: {MAX_STEPS})")
|
||||
|
||||
# Initialize default configuration
|
||||
initialize_default_config()
|
||||
|
||||
RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
|
||||
|
||||
if RESULTS_PATH not in TASK_STATUS_CACHE:
|
||||
# Initialize cache for this results path
|
||||
TASK_STATUS_CACHE[RESULTS_PATH] = {}
|
||||
|
||||
@cache
|
||||
def load_task_list():
|
||||
with open(TASK_CONFIG_PATH, 'r') as f:
|
||||
return json.load(f)
|
||||
|
||||
@cache
|
||||
def get_task_info(task_type, task_id):
|
||||
task_file = os.path.join(EXAMPLES_BASE_PATH, task_type, f"{task_id}.json")
|
||||
if os.path.exists(task_file):
|
||||
|
|
@ -183,8 +231,8 @@ def get_task_status_brief(task_type, task_id):
|
|||
# Check if the status is already cached
|
||||
current_time = time.time()
|
||||
last_cache_time = None
|
||||
if cache_key in TASK_STATUS_CACHE:
|
||||
cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
|
||||
if cache_key in TASK_STATUS_CACHE[RESULTS_PATH]:
|
||||
cached_status, cached_time = TASK_STATUS_CACHE[RESULTS_PATH][cache_key]
|
||||
last_cache_time = cached_time
|
||||
# If cached status is "Done", check if it's within the stability period
|
||||
if cached_status["status"].startswith("Done"):
|
||||
|
|
@ -312,7 +360,7 @@ def get_task_status_brief(task_type, task_id):
|
|||
# Cache the status if it is done or error
|
||||
if status.startswith("Done") or status == "Error":
|
||||
current_time = last_cache_time if last_cache_time else current_time
|
||||
TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
|
||||
TASK_STATUS_CACHE[RESULTS_PATH][cache_key] = (status_dict, current_time)
|
||||
|
||||
return status_dict
|
||||
|
||||
|
|
@ -434,6 +482,115 @@ def api_task_detail(task_type, task_id):
|
|||
"status": task_status
|
||||
})
|
||||
|
||||
@app.route('/api/config')
def api_config():
    """Return the module-level configuration values as JSON.

    Deprecated: use /api/current-config instead.
    """
    return jsonify(dict(
        task_config_path=TASK_CONFIG_PATH,
        results_base_path=RESULTS_BASE_PATH,
        action_space=ACTION_SPACE,
        observation_type=OBSERVATION_TYPE,
        model_name=MODEL_NAME,
        max_steps=MAX_STEPS,
        examples_base_path=EXAMPLES_BASE_PATH,
    ))
|
||||
|
||||
@app.route('/api/available-configs')
def api_available_configs():
    """List every configuration found under the results directory.

    A configuration is a directory at
    ``RESULTS_BASE_PATH/<action_space>/<observation_type>/<model_name>``.
    Entries are returned sorted by those three names so the order is stable
    between requests. If scanning fails part-way, the error is printed and
    the entries collected so far are returned.
    """
    def _subdirs(parent):
        """Yield (name, path) for each sub-directory of *parent*, sorted by name."""
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    configs = []

    if os.path.exists(RESULTS_BASE_PATH):
        try:
            for action_space, action_space_path in _subdirs(RESULTS_BASE_PATH):
                for obs_type, obs_path in _subdirs(action_space_path):
                    for model_name, model_path in _subdirs(obs_path):
                        configs.append({
                            "action_space": action_space,
                            "observation_type": obs_type,
                            "model_name": model_name,
                            "path": model_path
                        })
        except OSError as e:
            print(f"Error scanning results directory: {e}")

    return jsonify(configs)
|
||||
|
||||
@app.route('/api/current-config')
def api_current_config():
    """Return the active configuration together with its args.json contents.

    ``model_args`` is an empty object when get_model_args returns nothing.
    """
    loaded_args = get_model_args(ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

    payload = dict(
        action_space=ACTION_SPACE,
        observation_type=OBSERVATION_TYPE,
        model_name=MODEL_NAME,
        max_steps=MAX_STEPS,
        results_path=RESULTS_PATH,
    )
    payload["model_args"] = loaded_args if loaded_args else {}

    return jsonify(payload)
|
||||
|
||||
@app.route('/api/set-config', methods=['POST'])
def api_set_config():
    """Switch the active configuration.

    Expects a JSON object with any of ``action_space``, ``observation_type``
    and ``model_name``; omitted fields keep their current value. Updates the
    module globals, recomputes ``RESULTS_PATH``, reloads ``MAX_STEPS`` from
    the configuration's args.json when it provides ``max_steps``, and makes
    sure a status cache exists for the new results path.

    Returns the resulting configuration as JSON, or a 400 response when the
    body is empty, is not a JSON object, or contains a value that is not a
    plain directory name.
    """
    global ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME, RESULTS_PATH, MAX_STEPS

    def _is_plain_name(value):
        """Return True if *value* is usable as a single path component."""
        if not isinstance(value, str) or value in ("", ".", ".."):
            return False
        separators = {"/", "\\", os.sep}
        if os.altsep:
            separators.add(os.altsep)
        return not any(sep in value for sep in separators)

    data = request.get_json()
    if not data:
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    # The values are joined into a filesystem path, so reject anything
    # supplied by the client that could escape the results directory.
    for field in ('action_space', 'observation_type', 'model_name'):
        if field in data and not _is_plain_name(data[field]):
            return jsonify({"error": f"Invalid value for {field}"}), 400

    # Update global variables
    ACTION_SPACE = data.get('action_space', ACTION_SPACE)
    OBSERVATION_TYPE = data.get('observation_type', OBSERVATION_TYPE)
    MODEL_NAME = data.get('model_name', MODEL_NAME)

    # Update results path
    RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)

    # Update max_steps from args.json if available.
    # NOTE(review): when the new configuration has no max_steps, MAX_STEPS
    # keeps the value of the previously selected configuration - confirm
    # whether it should be reset to a default instead.
    model_args = get_model_args(ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
    if model_args and 'max_steps' in model_args:
        MAX_STEPS = model_args['max_steps']

    if RESULTS_PATH not in TASK_STATUS_CACHE:
        # Initialize cache for this results path
        TASK_STATUS_CACHE[RESULTS_PATH] = {}

    return jsonify({
        "action_space": ACTION_SPACE,
        "observation_type": OBSERVATION_TYPE,
        "model_name": MODEL_NAME,
        "max_steps": MAX_STEPS,
        "results_path": RESULTS_PATH
    })
|
||||
|
||||
def get_model_args(action_space, observation_type, model_name):
    """Load the args.json of one configuration.

    Args:
        action_space: First directory level under ``RESULTS_BASE_PATH``.
        observation_type: Second directory level.
        model_name: Third directory level.

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist or
        cannot be read or parsed (the error is printed in that case).
    """
    args_file = os.path.join(RESULTS_BASE_PATH, action_space, observation_type, model_name, "args.json")
    try:
        # JSON is UTF-8; do not depend on the platform's locale encoding.
        with open(args_file, 'r', encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # A missing file is the normal "no arguments recorded" case.
        return None
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Error reading args.json: {e}")
        return None
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Check if necessary directories exist
|
||||
if not os.path.exists(TASK_CONFIG_PATH):
|
||||
|
|
@ -447,4 +604,4 @@ if __name__ == '__main__':
|
|||
port = 8080
|
||||
debug = os.getenv("FLASK_DEBUG", "false").lower() == "true"
|
||||
|
||||
app.run(host=host, port=port, debug=debug)
|
||||
app.run(host=host, port=port, debug=debug, threaded=True)
|
||||
|
|
@ -1,5 +1,63 @@
|
|||
/* filepath: /home/adlsdztony/codes/OSWorld/monitor/static/index.css */
|
||||
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 0; background: linear-gradient(135deg, #f4f6fa 0%, #e9f0f9 100%); }
|
||||
|
||||
.layout-container {
|
||||
position: relative;
|
||||
max-width: 1200px;
|
||||
margin: 20px auto;
|
||||
padding: 0 20px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
background: #fff;
|
||||
border-radius: 14px;
|
||||
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
|
||||
padding: 36px 44px;
|
||||
}
|
||||
|
||||
/* Floating Config Sidebar */
|
||||
.config-sidebar {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: -280px;
|
||||
width: 300px;
|
||||
height: calc(100vh - 40px);
|
||||
z-index: 1000;
|
||||
transition: left 0.3s ease;
|
||||
}
|
||||
|
||||
.config-sidebar:hover {
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.config-toggle-btn {
|
||||
position: absolute;
|
||||
right: -50px;
|
||||
top: 50%;
|
||||
transform: translateY(-50%);
|
||||
width: 50px;
|
||||
height: 50px;
|
||||
background: linear-gradient(135deg, #007bff, #0056b3);
|
||||
border-radius: 0 25px 25px 0;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
color: white;
|
||||
font-size: 1.2em;
|
||||
cursor: pointer;
|
||||
box-shadow: 2px 0 10px rgba(0,0,0,0.2);
|
||||
transition: all 0.3s ease;
|
||||
}
|
||||
|
||||
.config-toggle-btn:hover {
|
||||
background: linear-gradient(135deg, #0056b3, #004085);
|
||||
transform: translateY(-50%) scale(1.05);
|
||||
}
|
||||
|
||||
.config-sidebar:hover .config-toggle-btn {
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.main-container { max-width: 1100px; margin: 40px auto; background: #fff; border-radius: 14px; box-shadow: 0 8px 32px rgba(0,0,0,0.1); padding: 36px 44px; }
|
||||
h1 { font-size: 2.5em; margin-bottom: 24px; color: #1a237e; text-align: center; position: relative; }
|
||||
h1:after { content: ''; display: block; width: 80px; height: 4px; background: linear-gradient(90deg, #007bff, #00c6ff); margin: 12px auto 0; border-radius: 2px; }
|
||||
|
|
@ -125,6 +183,18 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
|||
text-shadow: 0 1px 2px rgba(0,0,0,0.05);
|
||||
}
|
||||
|
||||
.accuracy-percentage {
|
||||
font-size: 0.7em;
|
||||
font-weight: 600;
|
||||
color: #ffffff;
|
||||
margin-left: 8px;
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
padding: 4px 8px;
|
||||
border-radius: 12px;
|
||||
display: inline-block;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
|
||||
.stat-card span {
|
||||
font-size: 2em;
|
||||
|
|
@ -197,8 +267,9 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
|||
|
||||
.task-type-stats {
|
||||
display: flex;
|
||||
gap: 16px;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.task-stat {
|
||||
|
|
@ -228,6 +299,22 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
|||
color: #b71c1c;
|
||||
}
|
||||
|
||||
/* Task type statistics styles */
|
||||
.task-stat.score {
|
||||
color: #ffc107;
|
||||
background: rgba(255, 193, 7, 0.1);
|
||||
}
|
||||
|
||||
.task-stat.steps {
|
||||
color: #17a2b8;
|
||||
background: rgba(23, 162, 184, 0.1);
|
||||
}
|
||||
|
||||
.task-stat.rate {
|
||||
color: #28a745;
|
||||
background: rgba(40, 167, 69, 0.1);
|
||||
}
|
||||
|
||||
.tasks-container {
|
||||
padding: 20px;
|
||||
transition: all 0.4s cubic-bezier(.4,0,.2,1);
|
||||
|
|
@ -427,3 +514,174 @@ h2 { color: #0056b3; margin-top: 32px; font-size: 1.6em; }
|
|||
background: #a5c7e5;
|
||||
}
|
||||
|
||||
/* Configuration Panel Styles */
|
||||
.config-panel {
|
||||
background: #fff;
|
||||
border-radius: 0 14px 14px 0;
|
||||
box-shadow: 0 8px 32px rgba(0,0,0,0.15);
|
||||
overflow: hidden;
|
||||
height: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.config-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
padding: 16px 20px;
|
||||
background: linear-gradient(135deg, #6c757d, #495057);
|
||||
color: white;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.config-header i {
|
||||
margin-right: 10px;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
|
||||
.config-header span {
|
||||
font-weight: 600;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
|
||||
.config-content {
|
||||
padding: 20px;
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.config-selector {
|
||||
margin-bottom: 20px;
|
||||
padding-bottom: 15px;
|
||||
border-bottom: 1px solid #dee2e6;
|
||||
}
|
||||
|
||||
.selector-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.selector-item label {
|
||||
font-weight: 600;
|
||||
color: #495057;
|
||||
font-size: 0.9em;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.selector-item select {
|
||||
padding: 8px 12px;
|
||||
border: 2px solid #e9ecef;
|
||||
border-radius: 6px;
|
||||
background: white;
|
||||
font-size: 0.9em;
|
||||
color: #495057;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
}
|
||||
|
||||
.selector-item select:focus {
|
||||
outline: none;
|
||||
border-color: #007bff;
|
||||
box-shadow: 0 0 0 3px rgba(0,123,255,0.1);
|
||||
}
|
||||
|
||||
.selector-item select:hover {
|
||||
border-color: #007bff;
|
||||
}
|
||||
|
||||
.config-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
}
|
||||
|
||||
.config-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
background: #f8f9fa;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
border-left: 4px solid #007bff;
|
||||
transition: all 0.3s ease;
|
||||
}
|
||||
|
||||
.config-item:hover {
|
||||
transform: translateX(3px);
|
||||
box-shadow: 0 4px 12px rgba(0,123,255,0.15);
|
||||
}
|
||||
|
||||
.config-label {
|
||||
font-weight: 600;
|
||||
color: #495057;
|
||||
margin-bottom: 5px;
|
||||
font-size: 0.9em;
|
||||
text-transform: uppercase;
|
||||
color: #495057;
|
||||
font-size: 0.85em;
|
||||
margin-bottom: 6px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.config-value {
|
||||
color: #007bff;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 0.9em;
|
||||
font-weight: 600;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.config-path {
|
||||
font-size: 0.8em;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
/* Responsive design for sidebar layout */
|
||||
@media (max-width: 1024px) {
|
||||
.config-sidebar {
|
||||
left: -250px;
|
||||
width: 250px;
|
||||
}
|
||||
|
||||
.config-toggle-btn {
|
||||
right: -40px;
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
font-size: 1em;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.layout-container {
|
||||
padding: 0 10px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
padding: 20px 25px;
|
||||
}
|
||||
|
||||
.config-sidebar {
|
||||
left: -220px;
|
||||
width: 220px;
|
||||
height: calc(100vh - 20px);
|
||||
top: 10px;
|
||||
}
|
||||
|
||||
.config-toggle-btn {
|
||||
right: -35px;
|
||||
width: 35px;
|
||||
height: 35px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.config-content {
|
||||
padding: 15px;
|
||||
}
|
||||
|
||||
.config-item {
|
||||
padding: 10px;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
document.addEventListener('DOMContentLoaded', () => {
|
||||
fetchTasks();
|
||||
fetchAvailableConfigs().then(() => {
|
||||
fetchConfig();
|
||||
fetchTasks();
|
||||
});
|
||||
// Bind filter functionality
|
||||
document.getElementById('total-tasks').parentElement.addEventListener('click', () => setTaskFilter('all'));
|
||||
document.getElementById('active-tasks').parentElement.addEventListener('click', () => setTaskFilter('active'));
|
||||
|
|
@ -9,6 +12,9 @@ document.addEventListener('DOMContentLoaded', () => {
|
|||
|
||||
let allTaskData = null;
|
||||
let currentFilter = 'all';
|
||||
let availableConfigs = [];
|
||||
let currentConfig = null;
|
||||
let categoryStats = {};
|
||||
|
||||
function refreshPage() {
|
||||
// Save expanded state before refresh
|
||||
|
|
@ -31,8 +37,8 @@ function fetchTasksForRefresh() {
|
|||
fetch('/api/tasks/brief')
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
// Update stored data
|
||||
allTaskData = data;
|
||||
categoryStats = calculateCategoryStats(data);
|
||||
// Only update statistics and task status, do not fully re-render
|
||||
updateStatistics(data);
|
||||
updateTaskStatus(data);
|
||||
|
|
@ -148,6 +154,7 @@ function fetchTasks() {
|
|||
.then(response => response.json())
|
||||
.then(data => {
|
||||
allTaskData = data;
|
||||
categoryStats = calculateCategoryStats(data);
|
||||
renderTasks(data);
|
||||
updateStatistics(data);
|
||||
})
|
||||
|
|
@ -208,13 +215,15 @@ function updateStatistics(data) {
|
|||
document.getElementById('completed-tasks').textContent = completedTasks;
|
||||
document.getElementById('error-tasks').textContent = errorTasks;
|
||||
|
||||
// Update score display with formatted score
|
||||
// Update score display with formatted score and accuracy percentage
|
||||
const scoreDisplay = document.getElementById('score-display');
|
||||
if (completedTasks > 0) {
|
||||
const scoreFormatted = totalScore.toFixed(2);
|
||||
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span>`;
|
||||
const averageScore = totalScore / completedTasks;
|
||||
const accuracyPercentage = (averageScore * 100).toFixed(1);
|
||||
scoreDisplay.innerHTML = `<span>${scoreFormatted}</span> / <span>${completedTasks}</span> <span class="accuracy-percentage">(${accuracyPercentage}%)</span>`;
|
||||
} else {
|
||||
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span>';
|
||||
scoreDisplay.innerHTML = '<span>0.00</span> / <span>0</span> <span class="accuracy-percentage">(0.0%)</span>';
|
||||
}
|
||||
|
||||
// Highlight the currently selected statistics card
|
||||
|
|
@ -279,6 +288,10 @@ function renderTasks(data) {
|
|||
// Create header with task type name and statistics
|
||||
const typeHeader = document.createElement('div');
|
||||
typeHeader.className = 'task-type-header';
|
||||
|
||||
// Get category stats for this task type
|
||||
const stats = categoryStats[taskType] || {};
|
||||
|
||||
typeHeader.innerHTML = `
|
||||
<span class="task-type-name"><i class="fas fa-layer-group"></i> ${taskType}</span>
|
||||
<div class="task-type-stats">
|
||||
|
|
@ -286,6 +299,8 @@ function renderTasks(data) {
|
|||
<span class="task-stat"><i class="fas fa-tasks"></i> ${tasks.length} total</span>
|
||||
<span class="task-stat running"><i class="fas fa-running"></i> ${runningCount} active</span>
|
||||
<span class="task-stat completed"><i class="fas fa-check-circle"></i> ${completedCount} completed</span>
|
||||
${stats.total_score ? `<span class="task-stat score"><i class="fas fa-star"></i> ${stats.total_score} total score</span>` : ''}
|
||||
${stats.avg_steps ? `<span class="task-stat steps"><i class="fas fa-chart-line"></i> ${stats.avg_steps} avg steps</span>` : ''}
|
||||
</div>
|
||||
`;
|
||||
typeSection.appendChild(typeHeader);
|
||||
|
|
@ -453,7 +468,200 @@ function renderTasks(data) {
|
|||
container.appendChild(typeSection);
|
||||
});
|
||||
}
|
||||
// Auto-refresh: re-run refreshPage() every 10 seconds so the dashboard
// stays current without a manual reload.
setInterval(() => refreshPage(), 10 * 1000);
|
||||
|
||||
/**
 * Load the list of selectable configurations from `/api/available-configs`,
 * store it in the module-level `availableConfigs` and rebuild the dropdown.
 *
 * The returned promise never rejects: on any failure (network error, non-2xx
 * status, payload that is not an array) the error is logged, the list is
 * reset to empty and the promise resolves to `[]`, so callers can chain
 * follow-up requests unconditionally.
 *
 * @returns {Promise<Array<Object>>} the available configurations, or `[]` on failure.
 */
function fetchAvailableConfigs() {
    return fetch('/api/available-configs')
        .then(response => {
            // fetch() only rejects on network failure; surface HTTP errors explicitly.
            if (!response.ok) {
                throw new Error(`HTTP ${response.status} while fetching available configs`);
            }
            return response.json();
        })
        .then(data => {
            // The dropdown code relies on array semantics (length / forEach / index).
            if (!Array.isArray(data)) {
                throw new Error('Available configs payload is not an array');
            }
            availableConfigs = data;
            populateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching available configs:', error);
            availableConfigs = [];
            // Replace the "Loading configurations..." placeholder, but only if the
            // dropdown exists so this handler itself cannot reject.
            if (document.getElementById('config-select')) {
                populateConfigSelect();
            }
            return [];
        });
}
|
||||
|
||||
/**
 * Rebuild the `#config-select` dropdown from the module-level
 * `availableConfigs` list. Each option's value is the config's index in that
 * list and its label is "action_space / observation_type / model_name".
 * When the list is empty a single placeholder option is shown instead.
 */
function populateConfigSelect() {
    const dropdown = document.getElementById('config-select');
    dropdown.innerHTML = '';

    if (availableConfigs.length === 0) {
        dropdown.innerHTML = '<option value="">No configurations found in results directory</option>';
        return;
    }

    // One <option> per configuration, keyed by its position in availableConfigs.
    for (const [position, cfg] of availableConfigs.entries()) {
        const entry = document.createElement('option');
        entry.value = position;
        entry.textContent = `${cfg.action_space} / ${cfg.observation_type} / ${cfg.model_name}`;
        dropdown.appendChild(entry);
    }
}
|
||||
|
||||
/**
 * Handle a change of the `#config-select` dropdown: POST the chosen
 * configuration to `/api/set-config`, show the configuration the server
 * answers with, and reload the task list for it.
 *
 * Does nothing when the placeholder option (empty value) is selected or the
 * value is not a valid index into `availableConfigs`. On a network or HTTP
 * failure the error is logged and the sidebar shows the error state; the
 * previously stored `currentConfig` is left untouched.
 */
function changeConfiguration() {
    const select = document.getElementById('config-select');
    const rawValue = select.value;

    if (rawValue === '') {
        return;
    }

    // <select>.value is always a string; convert once and validate as an integer
    // index so values such as "1.5" or "abc" cannot slip through the range check.
    const selectedIndex = Number(rawValue);
    if (!Number.isInteger(selectedIndex) || selectedIndex < 0 || selectedIndex >= availableConfigs.length) {
        return;
    }

    const selectedConfig = availableConfigs[selectedIndex];

    // Send configuration change request
    fetch('/api/set-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(selectedConfig)
    })
        .then(response => {
            // Do not treat an HTTP error payload as the new configuration.
            if (!response.ok) {
                throw new Error(`HTTP ${response.status} while setting config`);
            }
            return response.json();
        })
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            // Refresh tasks with new configuration
            fetchTasks();
        })
        .catch(error => {
            console.error('Error setting config:', error);
            displayConfigError();
        });
}
|
||||
|
||||
/**
 * Fetch the configuration currently active on the server from
 * `/api/current-config`, store it in `currentConfig`, render it in the
 * sidebar and sync the dropdown selection.
 *
 * @returns {Promise<Object|undefined>} the configuration, or `undefined` when
 *     the request failed (the error is logged and the sidebar shows the
 *     error state; `currentConfig` keeps its previous value).
 */
function fetchConfig() {
    return fetch('/api/current-config')
        .then(response => {
            // fetch() resolves on 4xx/5xx as well; reject those explicitly so an
            // error payload is never stored as the current configuration.
            if (!response.ok) {
                throw new Error(`HTTP ${response.status} while fetching current config`);
            }
            return response.json();
        })
        .then(data => {
            currentConfig = data;
            displayConfig(data);
            updateConfigSelect();
            return data;
        })
        .catch(error => {
            console.error('Error fetching config:', error);
            displayConfigError();
        });
}
|
||||
|
||||
/**
 * Make the `#config-select` dropdown reflect `currentConfig`.
 *
 * A configuration matches when action_space, observation_type and model_name
 * are all equal. If no entry of `availableConfigs` matches, the first entry
 * is selected and a warning is logged. No-op while either `currentConfig` or
 * `availableConfigs` is still empty.
 */
function updateConfigSelect() {
    if (!currentConfig || availableConfigs.length === 0) {
        return;
    }

    const dropdown = document.getElementById('config-select');
    const isCurrent = candidate =>
        candidate.action_space === currentConfig.action_space &&
        candidate.observation_type === currentConfig.observation_type &&
        candidate.model_name === currentConfig.model_name;
    const matchIndex = availableConfigs.findIndex(isCurrent);

    if (matchIndex !== -1) {
        dropdown.value = matchIndex;
        return;
    }

    // Current config not found in available configs: fall back to the first one.
    // (The list is known to be non-empty because of the guard at the top.)
    dropdown.value = 0;
    console.warn('Current config not found in available configs, defaulting to first available config');
}
|
||||
|
||||
/**
 * Render a configuration object into the sidebar.
 *
 * The four summary fields (action space, observation type, model name, max
 * steps) fall back to 'N/A' when the value is missing or falsy. Every entry
 * of `config.model_args` except `max_steps` (already shown above) is added
 * as an extra label/value row inside `#model-args`; the container is hidden
 * when there is nothing to show.
 *
 * Keys and values come from a JSON file on disk, so they are written with
 * `textContent` rather than interpolated into `innerHTML` to rule out HTML
 * injection.
 *
 * @param {Object} config - configuration as returned by the config endpoints.
 */
function displayConfig(config) {
    // Write one summary field and clear the red colour that
    // displayConfigError() may have left on the element.
    const setField = (elementId, value) => {
        const element = document.getElementById(elementId);
        element.textContent = value || 'N/A';
        element.style.color = '';
    };

    setField('action-space', config.action_space);
    setField('observation-type', config.observation_type);
    setField('model-name', config.model_name);
    setField('max-steps', config.max_steps);

    // Display model args from args.json
    const modelArgsElement = document.getElementById('model-args');
    modelArgsElement.textContent = ''; // drop rows rendered for a previous config

    Object.entries(config.model_args || {}).forEach(([key, value]) => {
        // Skip max_steps as it's already displayed above
        if (key === 'max_steps') {
            return;
        }

        const row = document.createElement('div');
        row.className = 'config-item';

        const label = document.createElement('span');
        label.className = 'config-label';
        label.textContent = `${key}:`;

        const valueSpan = document.createElement('span');
        valueSpan.className = 'config-value';
        valueSpan.textContent = JSON.stringify(value);

        row.appendChild(label);
        row.appendChild(valueSpan);
        modelArgsElement.appendChild(row);
    });

    // Show the block only when at least one row was actually rendered.
    modelArgsElement.style.display = modelArgsElement.childElementCount > 0 ? 'block' : 'none';
}
|
||||
|
||||
/**
 * Put the configuration sidebar into its error state: every element with
 * class `config-value` shows the text 'Error loading' in red (#dc3545).
 */
function displayConfigError() {
    for (const valueNode of document.querySelectorAll('.config-value')) {
        valueNode.textContent = 'Error loading';
        valueNode.style.color = '#dc3545';
    }
}
|
||||
|
||||
/**
 * Aggregate per-category statistics from the task listing.
 *
 * @param {Object<string, Array<Object>>} data - maps a task type to its tasks.
 *     Each task is read as `task.status.status` (state string),
 *     `task.status.result` (score, parsed with parseFloat) and
 *     `task.status.progress` (step count).
 * @returns {Object<string, Object>} per task type: total_tasks,
 *     completed_tasks, running_tasks, error_tasks, total_score (2 decimals),
 *     avg_score (4 decimals, averaged over completed tasks), avg_steps
 *     (1 decimal, averaged over completed tasks with progress > 0) and
 *     completion_rate (percentage, 1 decimal).
 *
 * Tasks without a `status` object are counted in total_tasks only, instead of
 * aborting the whole computation with a TypeError. Scores outside [0, 1] or
 * that do not parse are ignored.
 */
function calculateCategoryStats(data) {
    const DONE_STATES = ['Done', 'Done (Message Exit)', 'Done (Max Steps)', 'Done (Thought Exit)'];
    const ACTIVE_STATES = ['Running', 'Preparing', 'Initializing'];
    const stats = {};

    Object.entries(data || {}).forEach(([taskType, tasks]) => {
        const taskList = Array.isArray(tasks) ? tasks : [];
        const totalTasks = taskList.length;
        let completedTasks = 0;
        let runningTasks = 0;
        let errorTasks = 0;
        let totalScore = 0;
        let totalSteps = 0;
        let completedWithSteps = 0;

        taskList.forEach(task => {
            const info = (task && task.status) || {};
            const status = info.status;

            if (DONE_STATES.includes(status)) {
                completedTasks++;

                // parseFloat never throws; unparsable or missing results yield NaN.
                const score = parseFloat(info.result);
                if (!isNaN(score) && score >= 0 && score <= 1) {
                    totalScore += score;
                }

                // Coerce to a number so a string step count is added, not concatenated.
                const steps = Number(info.progress);
                if (steps > 0) {
                    totalSteps += steps;
                    completedWithSteps++;
                }
            } else if (ACTIVE_STATES.includes(status)) {
                runningTasks++;
            } else if (status === 'Error') {
                errorTasks++;
            }
        });

        // Calculate averages (guarding every division against an empty denominator)
        const avgScore = completedTasks > 0 ? totalScore / completedTasks : 0;
        const avgSteps = completedWithSteps > 0 ? totalSteps / completedWithSteps : 0;
        const completionRate = totalTasks > 0 ? (completedTasks / totalTasks * 100) : 0;

        stats[taskType] = {
            total_tasks: totalTasks,
            completed_tasks: completedTasks,
            running_tasks: runningTasks,
            error_tasks: errorTasks,
            total_score: Math.round(totalScore * 100) / 100,
            avg_score: Math.round(avgScore * 10000) / 10000,
            avg_steps: Math.round(avgSteps * 10) / 10,
            completion_rate: Math.round(completionRate * 10) / 10
        };
    });

    return stats;
}
|
||||
|
|
|
|||
|
|
@ -12,19 +12,65 @@
|
|||
<link rel="stylesheet" href="/static/index.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="main-container">
|
||||
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
|
||||
|
||||
<!-- Score Display Banner -->
|
||||
<div class="score-banner">
|
||||
<div class="score-content">
|
||||
<i class="fas fa-star"></i>
|
||||
<span class="score-label">Score:</span>
|
||||
<span id="score-display" class="score-value">Loading...</span>
|
||||
<div class="layout-container">
|
||||
<!-- Floating Config Button and Sidebar -->
|
||||
<div class="config-sidebar" id="config-sidebar">
|
||||
<div class="config-toggle-btn">
|
||||
<i class="fas fa-cogs"></i>
|
||||
</div>
|
||||
<div class="config-panel">
|
||||
<div class="config-header">
|
||||
<i class="fas fa-cogs"></i>
|
||||
<span>Configuration</span>
|
||||
</div>
|
||||
<div class="config-content">
|
||||
<div class="config-selector">
|
||||
<div class="selector-item">
|
||||
<label for="config-select">Select Configuration:</label>
|
||||
<select id="config-select" onchange="changeConfiguration()">
|
||||
<option value="">Loading configurations...</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
<div class="config-list">
|
||||
<div class="config-item">
|
||||
<span class="config-label">Action Space:</span>
|
||||
<span class="config-value" id="action-space">Loading...</span>
|
||||
</div>
|
||||
<div class="config-item">
|
||||
<span class="config-label">Observation:</span>
|
||||
<span class="config-value" id="observation-type">Loading...</span>
|
||||
</div>
|
||||
<div class="config-item">
|
||||
<span class="config-label">Model:</span>
|
||||
<span class="config-value" id="model-name">Loading...</span>
|
||||
</div>
|
||||
<div class="config-item">
|
||||
<span class="config-label">Max Steps:</span>
|
||||
<span class="config-value" id="max-steps">Loading...</span>
|
||||
</div>
|
||||
<div id="model-args" style="display: none;">
|
||||
<!-- Model args from args.json will be populated here -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="dashboard-stats">
|
||||
<!-- Main Content -->
|
||||
<div class="main-content">
|
||||
<h1>OSWorld Monitor <span class="system-status online">System Online</span></h1>
|
||||
|
||||
<!-- Score Display Banner -->
|
||||
<div class="score-banner">
|
||||
<div class="score-content">
|
||||
<i class="fas fa-star"></i>
|
||||
<span class="score-label">Score:</span>
|
||||
<span id="score-display" class="score-value">Loading...</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="dashboard-stats">
|
||||
<div class="stat-card">
|
||||
<i class="fas fa-running"></i>
|
||||
<span id="active-tasks">Loading...</span>
|
||||
|
|
@ -46,10 +92,11 @@
|
|||
<div class="stat-label">Total Tasks</div>
|
||||
</div>
|
||||
</div>
|
||||
<div id="task-container">
|
||||
<div class="loading-spinner">
|
||||
<div class="spinner"></div>
|
||||
<div>Loading task data...</div>
|
||||
<div id="task-container">
|
||||
<div class="loading-spinner">
|
||||
<div class="spinner"></div>
|
||||
<div>Loading task data...</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
12
run.py
12
run.py
|
|
@ -290,6 +290,18 @@ if __name__ == "__main__":
|
|||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
|
|
|||
|
|
@ -342,6 +342,18 @@ if __name__ == "__main__":
|
|||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
|
|
|||
|
|
@ -333,6 +333,18 @@ if __name__ == "__main__":
|
|||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
|
|
|||
|
|
@ -12,12 +12,12 @@ from typing import List, Dict
|
|||
import math
|
||||
from tqdm import tqdm
|
||||
from multiprocessing import Process, Manager
|
||||
import lib_run_single
|
||||
from desktop_env.desktop_env import DesktopEnv
|
||||
# import lib_run_single
|
||||
# from desktop_env.desktop_env import DesktopEnv
|
||||
from mm_agents.anthropic import AnthropicAgent as PromptAgent
|
||||
|
||||
# import fake_run_single as lib_run_single
|
||||
# from test_env import DesktopEnv
|
||||
import fake_run_single as lib_run_single
|
||||
from test_env import DesktopEnv
|
||||
|
||||
# .env
|
||||
from dotenv import load_dotenv
|
||||
|
|
@ -352,6 +352,17 @@ if __name__ == "__main__":
|
|||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
args = config()
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
|
|
|||
|
|
@ -464,6 +464,18 @@ if __name__ == "__main__":
|
|||
|
||||
try:
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
|
|
|||
|
|
@ -321,6 +321,18 @@ if __name__ == "__main__":
|
|||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
|
|
|||
Loading…
Reference in New Issue