@@ -6,6 +6,9 @@ import os
 from io import BytesIO
 from typing import Dict, List, Tuple
 
+from http import HTTPStatus
+import dashscope
+from dashscope import MultiModalConversation
 import backoff
 import openai
 from PIL import Image
@@ -40,7 +43,7 @@ def process_image(image_bytes):
         height=height,
         width=width,
         factor=32,
-        max_pixels=16 * 16 * 4 * 1280,
+        max_pixels=16 * 16 * 4 * 12800,
     )
 
     image = image.resize((resized_width, resized_height))
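For context on the hunk above: `max_pixels` is the pixel budget that bounds the resized screenshot, and this change raises it tenfold (from 1,310,720 to 13,107,200 pixels). A minimal sketch of how such a budget constrains dimensions follows; the helper name `capped_resize` and its rounding details are illustrative assumptions, not the real `smart_resize`:

```python
import math
from typing import Tuple

def capped_resize(height: int, width: int, factor: int = 32,
                  max_pixels: int = 16 * 16 * 4 * 12800) -> Tuple[int, int]:
    # Hypothetical stand-in for a smart_resize-style helper: scale the image
    # down (if needed) so height * width <= max_pixels, then snap each side
    # to a multiple of `factor` so it tiles cleanly into vision patches.
    if height * width > max_pixels:
        scale = math.sqrt(max_pixels / (height * width))
        height, width = int(height * scale), int(width * scale)
    h = max(factor, (height // factor) * factor)
    w = max(factor, (width // factor) * factor)
    return h, w

# Under this sketch, the old budget (16*16*4*1280 = 1,310,720 px) shrinks a
# 1080x1920 screenshot to roughly 832x1504; the new budget (13,107,200 px)
# leaves it near full size, only snapped to 1056x1920.
```

The practical effect of the 10x bump is that desktop screenshots reach the model at near-native resolution instead of being downscaled, at the cost of more vision tokens per step.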
@@ -58,7 +61,7 @@ class Qwen3VLAgent:
         self,
         platform: str = "ubuntu",
         model: str = "qwen3-vl",
-        max_tokens: int = 1500,
+        max_tokens: int = 40960,
         top_p: float = 0.9,
         temperature: float = 0.0,
         action_space: str = "pyautogui",
@@ -66,6 +69,9 @@ class Qwen3VLAgent:
         history_n: int = 4,
         add_thought_prefix: bool = False,
         coordinate_type: str = "relative",
+        api_backend: str = "dashscope",  # "openai" or "dashscope"
+        enable_thinking: bool = True,  # Enable thinking mode for DashScope
+        thinking_budget: int = 32768,  # Token budget for reasoning
     ):
         self.platform = platform
         self.model = model
@@ -77,9 +83,13 @@ class Qwen3VLAgent:
         self.history_n = history_n
         self.add_thought_prefix = add_thought_prefix
         self.coordinate_type = coordinate_type
+        self.api_backend = api_backend
+        self.enable_thinking = enable_thinking
+        self.thinking_budget = thinking_budget
 
         assert action_space in ["pyautogui"], "Invalid action space"
         assert observation_type in ["screenshot"], "Invalid observation type"
+        assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'"
 
         self.thoughts = []
         self.actions = []
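The three hunks above thread the new backend and thinking knobs through `__init__`. A minimal instantiation sketch under the signature shown in the diff; argument values are illustrative, and any parameters omitted here are assumed to keep the defaults shown above:

```python
agent = Qwen3VLAgent(
    platform="ubuntu",
    model="qwen3-vl",
    max_tokens=40960,
    api_backend="dashscope",   # or "openai" for an OpenAI-compatible endpoint
    enable_thinking=True,      # DashScope-only: emit reasoning before the answer
    thinking_budget=32768,     # token budget for that reasoning phase
)
```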
@@ -527,6 +537,70 @@ Previous actions:
 
         return low_level_instruction, pyautogui_code
 
+    @staticmethod
+    def _to_dashscope_messages(messages):
+        """
+        Convert messages built for the OpenAI-compatible API into DashScope MultiModalConversation format.
+        - "text" part -> {"text": "..."}
+        - "image_url" -> {"image": "<url-or-data-uri>"}
+        - "video_url" -> {"video": "<url-or-data-uri>"}
+        """
+        ds_msgs = []
+        for m in messages:
+            role = m.get("role", "")
+            parts = m.get("content") if isinstance(m.get("content"), list) else []
+            ds_content = []
+            for p in parts:
+                ptype = p.get("type") if isinstance(p, dict) else None
+                if ptype == "text":
+                    ds_content.append({"text": p.get("text", "")})
+                elif ptype == "image_url":
+                    url = (p.get("image_url") or {}).get("url", "")
+                    # DashScope accepts http(s), file://, or data:image/*; keep as-is
+                    ds_content.append({"image": url})
+                elif ptype == "video_url":
+                    url = (p.get("video_url") or {}).get("url", "")
+                    ds_content.append({"video": url})
+                else:
+                    # If a raw string is ever passed as a part (no dict schema), tolerate it
+                    if isinstance(p, str):
+                        ds_content.append({"text": p})
+            # Also tolerate plain-string content (rare)
+            if not ds_content and isinstance(m.get("content"), str):
+                ds_content = [{"text": m["content"]}]
+            ds_msgs.append({"role": role, "content": ds_content})
+        return ds_msgs
+
+    @staticmethod
+    def _extract_text_from_dashscope_response(resp):
+        """Join all 'text' parts from the first choice, including reasoning if present."""
+        if hasattr(resp, "output"):
+            out = resp.output
+        else:
+            out = resp.get("output") if isinstance(resp, dict) else None
+        if not out:
+            return None
+        choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices")
+        if not choices:
+            return None
+        msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message")
+        if not msg:
+            return None
+        content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", [])
+        if not content:
+            return None
+
+        # Extract reasoning content if present (for thinking models)
+        reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None)
+
+        content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part)
+
+        # Format with thinking tags if reasoning exists
+        if reasoning_content is not None:
+            return f"<think>\n{reasoning_content}\n</think>\n\n{content_text}"
+        else:
+            return content_text
+
     @backoff.on_exception(
         backoff.constant,
         (
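To make the two new helpers concrete, here is a small sketch of the round trip on plain dicts; the message shapes are assumed from the docstrings above, and the base64 payload is elided:

```python
openai_style = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Click the OK button."},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    ],
}]
ds = Qwen3VLAgent._to_dashscope_messages(openai_style)
# -> [{"role": "user", "content": [{"text": "Click the OK button."},
#                                  {"image": "data:image/png;base64,..."}]}]

# _extract_text_from_dashscope_response also accepts dict-shaped responses:
fake_resp = {"output": {"choices": [{"message": {
    "reasoning_content": "The OK button sits at the bottom right.",
    "content": [{"text": "pyautogui.click(0.82, 0.91)"}],
}}]}}
print(Qwen3VLAgent._extract_text_from_dashscope_response(fake_resp))
# <think>
# The OK button sits at the bottom right.
# </think>
#
# pyautogui.click(0.82, 0.91)
```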
@@ -545,25 +619,93 @@ Previous actions:
     def call_llm(self, payload, model):
         messages = payload["messages"]
 
+        if self.api_backend == "openai":
+            return self._call_llm_openai(messages, model)
+        elif self.api_backend == "dashscope":
+            return self._call_llm_dashscope(messages, model)
+        else:
+            raise ValueError(f"Unknown API backend: {self.api_backend}")
+
+    def _call_llm_openai(self, messages, model):
+        """Call the LLM through the OpenAI SDK (works with any OpenAI-compatible endpoint)."""
         base_url = "https://poc-dashscope.aliyuncs.com/compatible-mode/v1"
         api_key = "sk-123"
         client = openai.OpenAI(base_url=base_url, api_key=api_key)
 
-        for _ in range(MAX_RETRY_TIMES):
-            logger.info("Generating content with Qwen model: %s", model)
+        for attempt in range(1, MAX_RETRY_TIMES + 1):
+            logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})")
             try:
                 response = client.chat.completions.create(
                     model=model,
                     messages=messages,
                     max_tokens=self.max_tokens,
-                    temperature=self.temperature,
-                    top_p=self.top_p,
+                    # temperature=self.temperature,
+                    # top_p=self.top_p,
                 )
                 return response.choices[0].message.content
             except Exception as e:
-                logger.error(f"Error calling Qwen model: {e}")
-                time.sleep(5)
-                continue
+                logger.error(f"[OpenAI] Error calling model: {e}")
+                if attempt < MAX_RETRY_TIMES:
+                    time.sleep(5)
+                    continue
+                break
         return ""
+
+    def _call_llm_dashscope(self, messages, model):
+        """Call the LLM through the DashScope SDK."""
+        dashscope.base_http_api_url = "https://poc-dashscope.aliyuncs.com/api/v1"
+        dashscope.api_key = "sk-123"
+
+        # Convert message schema
+        ds_messages = self._to_dashscope_messages(messages)
+
+        # Retry loop
+        last_err = None
+        for attempt in range(1, MAX_RETRY_TIMES + 1):
+            thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else ""
+            logger.info(f"[DashScope] Generating content with model: {model}{thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})")
+            try:
+                # Build API call parameters
+                call_params = {
+                    "model": model,
+                    "messages": ds_messages,
+                    "max_tokens": min(self.max_tokens, 2048),
+                    # "temperature": self.temperature,
+                    # "top_p": self.top_p,
+                    "vl_high_resolution_images": True,
+                }
+
+                # Add thinking parameters if enabled
+                if self.enable_thinking:
+                    call_params["enable_thinking"] = True
+                    call_params["thinking_budget"] = self.thinking_budget
+
+                resp = MultiModalConversation.call(**call_params)
+
+                if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK):
+                    code = getattr(resp, "code", "")
+                    msg = getattr(resp, "message", "")
+                    reqid = getattr(resp, "request_id", "")
+                    logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}")
+                    last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}")
+                    time.sleep(1.5 * attempt)
+                    continue
+
+                text = self._extract_text_from_dashscope_response(resp)
+                if not text:
+                    raise ValueError("DashScope response has no text content")
+                return text
+
+            except Exception as e:
+                last_err = e
+                logger.error(f"[DashScope] call failed: {e}")
+                if attempt < MAX_RETRY_TIMES:
+                    time.sleep(1.5 * attempt)
+                    continue
+                break
+
+        if last_err:
+            raise last_err
+        return ""
 
     def reset(self, _logger=None):
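Finally, a sketch of the new dispatch path end to end, assuming the payload shape that `call_llm` expects (`{"messages": [...]}`) and reusing the OpenAI-compatible message schema converted above:

```python
agent = Qwen3VLAgent(api_backend="dashscope", enable_thinking=True)
payload = {"messages": [{
    "role": "user",
    "content": [{"type": "text", "text": "Open the terminal."}],
}]}
text = agent.call_llm(payload, model="qwen3-vl")
# With enable_thinking=True, the DashScope branch returns the model's
# reasoning wrapped in <think>...</think> ahead of the final answer,
# exactly as assembled by _extract_text_from_dashscope_response.
```

One behavioral note worth flagging from the hunk above: the DashScope branch clamps the completion length with `min(self.max_tokens, 2048)`, so the new 40960 constructor default only takes full effect on the OpenAI-compatible path.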