Compare commits

...

2 Commits

Author SHA1 Message Date
Timothyxxx efefe1e21d support dashscopoe sdk to call qwen3-vl-plus 2025-10-13 07:19:38 +00:00
Timothyxxx 75902360dd support dashscopoe sdk to call qwen3-vl-plus 2025-10-13 07:18:47 +00:00
2 changed files with 153 additions and 11 deletions

View File

@@ -6,6 +6,9 @@ import os
from io import BytesIO
from typing import Dict, List, Tuple
from http import HTTPStatus
import dashscope
from dashscope import MultiModalConversation
import backoff
import openai
from PIL import Image
@@ -40,7 +43,7 @@ def process_image(image_bytes):
height=height,
width=width,
factor=32,
max_pixels=16 * 16 * 4 * 1280,
max_pixels=16 * 16 * 4 * 12800,
)
image = image.resize((resized_width, resized_height))
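# For reference: the new cap works out to 16 * 16 * 4 * 12800 = 13,107,200
# pixels, ten times the previous budget of 16 * 16 * 4 * 1280 = 1,310,720.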
@@ -58,7 +61,7 @@ class Qwen3VLAgent:
self,
platform: str = "ubuntu",
model: str = "qwen3-vl",
max_tokens: int = 1500,
max_tokens: int = 40960,
top_p: float = 0.9,
temperature: float = 0.0,
action_space: str = "pyautogui",
@@ -66,6 +69,9 @@ class Qwen3VLAgent:
history_n: int = 4,
add_thought_prefix: bool = False,
coordinate_type: str = "relative",
api_backend: str = "dashscope", # "openai" or "dashscope"
enable_thinking: bool = True, # Enable thinking mode for DashScope
thinking_budget: int = 32768, # Token budget for reasoning
):
self.platform = platform
self.model = model
@@ -77,9 +83,13 @@
self.history_n = history_n
self.add_thought_prefix = add_thought_prefix
self.coordinate_type = coordinate_type
self.api_backend = api_backend
self.enable_thinking = enable_thinking
self.thinking_budget = thinking_budget
assert action_space in ["pyautogui"], "Invalid action space"
assert observation_type in ["screenshot"], "Invalid observation type"
assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'"
self.thoughts = []
self.actions = []
@@ -527,6 +537,70 @@ Previous actions:
return low_level_instruction, pyautogui_code
@staticmethod
def _to_dashscope_messages(messages):
"""
Convert messages built for OpenAI compat into DashScope MultiModalConversation format.
- "text" part -> {"text": "..."}
- "image_url" -> {"image": "<url-or-data-uri>"}
- "video_url" -> {"video": "<url-or-data-uri>"}
"""
ds_msgs = []
for m in messages:
role = m.get("role", "")
parts = m.get("content", [])
ds_content = []
for p in parts:
    # Handle raw string parts up front; p.get() below would raise on a str
    if isinstance(p, str):
        ds_content.append({"text": p})
        continue
    ptype = p.get("type")
    if ptype == "text":
        ds_content.append({"text": p.get("text", "")})
    elif ptype == "image_url":
        url = (p.get("image_url") or {}).get("url", "")
        # DashScope accepts http(s), file://, or data:image/*; keep as-is
        ds_content.append({"image": url})
    elif ptype == "video_url":
        url = (p.get("video_url") or {}).get("url", "")
        ds_content.append({"video": url})
# Also tolerate plain-string content (rare)
if not ds_content and isinstance(m.get("content"), str):
ds_content = [{"text": m["content"]}]
ds_msgs.append({"role": role, "content": ds_content})
return ds_msgs
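# Illustrative round-trip (message shapes assumed, not taken from a live API):
# an OpenAI-style part list such as
#   [{"type": "text", "text": "Click the OK button"},
#    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]
# converts to the DashScope content list
#   [{"text": "Click the OK button"},
#    {"image": "data:image/png;base64,..."}]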
@staticmethod
def _extract_text_from_dashscope_response(resp):
"""Join all 'text' parts from the first choice, including reasoning if present."""
if hasattr(resp, "output"):
out = resp.output
else:
out = resp.get("output") if isinstance(resp, dict) else None
if not out:
return None
choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices")
if not choices:
return None
msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message")
if not msg:
return None
content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", [])
if not content:
return None
# Extract reasoning content if present (for thinking models)
reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None)
content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part)
# Format with thinking tags if reasoning exists
if reasoning_content is not None:
return f"<think>\n{reasoning_content}\n</think>\n\n{content_text}"
else:
return content_text
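# Illustrative example (field names as read by the accessors above): given
#   msg.reasoning_content = "The OK button sits near the top-left corner."
#   msg.content = [{"text": "pyautogui.click(120, 80)"}]
# the helper returns
#   "<think>\nThe OK button sits near the top-left corner.\n</think>\n\npyautogui.click(120, 80)"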
@backoff.on_exception(
backoff.constant,
(
@@ -545,25 +619,93 @@ Previous actions:
def call_llm(self, payload, model):
messages = payload["messages"]
if self.api_backend == "openai":
return self._call_llm_openai(messages, model)
elif self.api_backend == "dashscope":
return self._call_llm_dashscope(messages, model)
else:
raise ValueError(f"Unknown API backend: {self.api_backend}")
def _call_llm_openai(self, messages, model):
"""Call LLM using OpenAI SDK (compatible with OpenAI-compatible endpoints)."""
base_url = "https://poc-dashscope.aliyuncs.com/compatible-mode/v1"  # OpenAI-compatible endpoint
api_key = "sk-123"  # placeholder credential; a real key is required in practice
client = openai.OpenAI(base_url=base_url, api_key=api_key)
for _ in range(MAX_RETRY_TIMES):
logger.info("Generating content with Qwen model: %s", model)
for attempt in range(1, MAX_RETRY_TIMES + 1):
logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})")
try:
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=self.max_tokens,
temperature=self.temperature,
top_p=self.top_p,
# temperature=self.temperature,
# top_p=self.top_p,
)
return response.choices[0].message.content
except Exception as e:
logger.error(f"Error calling Qwen model: {e}")
time.sleep(5)
continue
logger.error(f"[OpenAI] Error calling model: {e}")
if attempt < MAX_RETRY_TIMES:
time.sleep(5)
continue
break
return ""
def _call_llm_dashscope(self, messages, model):
"""Call LLM using DashScope SDK."""
dashscope.base_http_api_url = "https://poc-dashscope.aliyuncs.com/api/v1"
dashscope.api_key = "sk-123"  # placeholder credential; a real key is required in practice
# Convert message schema
ds_messages = self._to_dashscope_messages(messages)
# Retry loop
last_err = None
for attempt in range(1, MAX_RETRY_TIMES + 1):
thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else ""
logger.info(f"[DashScope] Generating content with model: {model}{thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})")
try:
# Build API call parameters
call_params = {
"model": model,
"messages": ds_messages,
"max_tokens": min(self.max_tokens, 2048),
# "temperature": self.temperature,
# "top_p": self.top_p,
"vl_high_resolution_images": True,
}
# Add thinking parameters if enabled
if self.enable_thinking:
call_params["enable_thinking"] = True
call_params["thinking_budget"] = self.thinking_budget
resp = MultiModalConversation.call(**call_params)
if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK):
code = getattr(resp, "code", "")
msg = getattr(resp, "message", "")
reqid = getattr(resp, "request_id", "")
logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}")
last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}")
time.sleep(1.5 * attempt)  # linear backoff: 1.5 s, 3.0 s, 4.5 s, ...
continue
text = self._extract_text_from_dashscope_response(resp)
if not text:
raise ValueError("DashScope response has no text content")
return text
except Exception as e:
last_err = e
logger.error(f"[DashScope] call failed: {e}")
if attempt < MAX_RETRY_TIMES:
time.sleep(1.5 * attempt)
continue
break
if last_err:
raise last_err
return ""
def reset(self, _logger=None):
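
For context, here is a minimal usage sketch of the new backend switch. The class name and parameters come from the diff above; the import path and the example payload are assumptions, and the placeholder endpoint/key hardcoded in the diff would need real values:

    from qwen3_vl_agent import Qwen3VLAgent  # import path is an assumption

    agent = Qwen3VLAgent(
        model="qwen3-vl-plus",
        api_backend="dashscope",  # or "openai" for the compatible-mode endpoint
        enable_thinking=True,     # ask for <think>...</think> reasoning
        thinking_budget=32768,    # token budget for the reasoning segment
    )
    messages = [{"role": "user", "content": [{"type": "text", "text": "Open the terminal"}]}]
    reply = agent.call_llm({"messages": messages}, model=agent.model)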

View File

@@ -57,13 +57,13 @@ def config() -> argparse.Namespace:
parser.add_argument("--model", type=str, default="qwen3-vl")
parser.add_argument("--temperature", type=float, default=0)
parser.add_argument("--top_p", type=float, default=0.9)
parser.add_argument("--max_tokens", type=int, default=1500)
parser.add_argument("--max_tokens", type=int, default=40960)
parser.add_argument("--stop_token", type=str, default=None)
parser.add_argument(
"--coord",
type=str,
choices=["absolute", "relative"],
default="absolute",
default="relative",
help="Coordinate system for agent outputs (absolute or relative)",
)
parser.add_argument(