Add autoglm-os-9b-v (#344)

* update for autoglm-v

* Update run_autoglm.py

---------

Co-authored-by: hanyullai <hanyullai@outlook.com>
This commit is contained in:
Yanxiao Zhao 2025-09-24 19:43:28 +08:00 committed by GitHub
parent f59cf00cae
commit a4f8fe2f00
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 8425 additions and 2 deletions

View File

@ -253,14 +253,20 @@ def run_single_example_autoglm(agent, env, example, max_steps, instruction, args
"screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
}))
f.write("\n")
if done:
logger.info("The episode is done.")
break
if not done: # not completed the task yet
env.action_history.append('FAIL')
# Invalid Action
if not actions:
obs = env._get_obs() # update observation
step_idx += 1
if not done: # not completed the task yet
env.action_history.append('FAIL')
result = env.evaluate()
logger.info("Result: %.2f", result)
scores.append(result)

View File

@ -0,0 +1,7 @@
"""
AutoGLM agent implementation
"""
from .main import AutoGLMAgent
__all__ = ["AutoGLMAgent"]

265
mm_agents/autoglm_v/main.py Normal file
View File

@ -0,0 +1,265 @@
import logging
import re
from base64 import b64encode
from PIL import Image
from io import BytesIO
from typing import Dict, List
from .prompt.accessibility_tree_handle import linearize_accessibility_tree, trim_accessibility_tree
from .prompt.grounding_agent import GroundingAgent as Agent
from .tools.package.google_chrome import BrowserTools
from .prompt.procedural_memory import Prompt
# Module-level logger. AutoGLMAgent.reset() rebinds this global, so it is not
# guaranteed to stay the "desktopenv.agent" logger for the life of the process.
logger = logging.getLogger("desktopenv.agent")
# Observation types listed as text-only. Not referenced anywhere else in the
# visible part of this module.
pure_text_settings = ["a11y_tree"]
def resize_image(image, w, h):
    """Re-encode an image as a PNG of exactly ``w`` x ``h`` pixels.

    Args:
        image: Encoded image bytes in any format PIL can open.
        w: Target width in pixels.
        h: Target height in pixels.

    Returns:
        The resized image serialized as PNG bytes. The aspect ratio is not
        preserved; the image is stretched to the requested size.
    """
    target_size = (w, h)
    resized = Image.open(BytesIO(image)).resize(target_size)
    # Serialize into an in-memory buffer; the format must be named explicitly
    # (e.g. PNG or JPEG) because there is no file name to infer it from.
    output = BytesIO()
    resized.save(output, format='PNG')
    return output.getvalue()
def parse_code_from_string(input_string):
    """Extract executable snippets and control tokens from a model response.

    A response that consists solely of ``WAIT``, ``DONE`` or ``FAIL``
    (ignoring surrounding whitespace) is returned as a one-element list.
    Otherwise every triple-backtick fenced block is collected, with an
    optional leading language tag (a word followed by whitespace) dropped.
    When the last line of a fenced block is a control token, the block is
    split into the preceding code and the token.

    Args:
        input_string: Raw text produced by the model.

    Returns:
        A list of code strings and/or control tokens, in order of appearance.
        Empty when the response contains no fenced block.
    """
    # fixme: updates this part when we have more commands
    control_tokens = ("WAIT", "DONE", "FAIL")

    whole_response = input_string.strip()
    if whole_response in control_tokens:
        return [whole_response]

    # Non-greedy body so consecutive fences are matched separately; DOTALL lets
    # the body span several lines.
    fence = re.compile(r"```(?:\w+\s+)?(.*?)```", re.DOTALL)

    extracted = []
    for snippet in fence.findall(input_string):
        snippet = snippet.strip()
        if snippet in control_tokens:
            extracted.append(snippet)
            continue
        # Split off the final line to see whether the block ends with a token.
        body, newline, last_line = snippet.rpartition("\n")
        if newline and last_line in control_tokens:
            extracted.extend([body, last_line])
        else:
            extracted.append(snippet)
    return extracted
class AutoGLMAgent:
    """Agent that builds prompts for an AutoGLM model and grounds its replies.

    Each call to :meth:`predict` formats the stored turn history, builds the
    chat messages for the current observation, calls ``gen_func`` to obtain
    the model response, converts the response into executable actions and
    records the turn in ``self.contents``.
    """

    def __init__(
        self,
        action_space="autoglm_computer_use",
        observation_type="a11y_tree",
        max_trajectory_length=3,
        a11y_tree_max_items=300,
        with_image: bool = True,
        screen_size=(1920, 1080),
        image_size=(1920, 1080),
        with_atree: bool = False,
        glm41v_format: bool = True,
        relative_coordinate: bool = True,
        client_password="password",
        gen_func=None,
        tool_in_sys_msg: bool = True,
    ):
        """Store the configuration and reset the turn history.

        Args:
            action_space: Must be ``"autoglm_computer_use"``.
            observation_type: Must be ``"a11y_tree"``.
            max_trajectory_length: Stored on the instance; not read by any
                method of this class.
            a11y_tree_max_items: Maximum number of lines of the linearized
                accessibility tree placed in the prompt.
            with_image: Attach the (resized) screenshot to the user message.
            screen_size: Stored on the instance; not read by any method of
                this class.
            image_size: ``(width, height)`` the screenshot is resized to.
            with_atree: Include the linearized accessibility tree in the prompt.
            glm41v_format: Forwarded to the prompt builder to select the
                output-format hint.
            relative_coordinate: Whether model coordinates are relative; also
                written onto the shared grounding ``Agent`` class.
            client_password: Password quoted in the system prompt.
            gen_func: Callable taking the message list and returning the model
                response text. Required before calling :meth:`predict`.
            tool_in_sys_msg: Put function definitions in the system message
                (True) or append them to the user message (False).

        Raises:
            AssertionError: If ``action_space`` or ``observation_type`` is not
                a supported value.
        """
        self.action_space = action_space
        self.observation_type = observation_type
        assert action_space in ["autoglm_computer_use"], "Invalid action space"
        assert observation_type in ["a11y_tree"], "Invalid observation type"
        self.max_trajectory_length = max_trajectory_length
        self.a11y_tree_max_items = a11y_tree_max_items
        self.with_image = with_image
        self.screen_size = screen_size
        self.image_size = image_size
        self.with_atree = with_atree
        self.glm41v_format = glm41v_format
        self.relative_coordinate = relative_coordinate
        self.client_password = client_password
        self.gen_func = gen_func
        self.tool_in_sys_msg = tool_in_sys_msg

        # Maps the normalized current-app name to its tool class name.
        self.tool_list = {
            "libreoffice_calc": "CalcTools",
            "libreoffice_impress": "ImpressTools",
            "libreoffice_writer": "WriterTools",
            "code": "CodeTools",
            "vlc": "VLCTools",
            "google_chrome": "BrowserTools",
        }

        # NOTE: this mutates class-level state on the grounding agent, so the
        # setting is shared by every AutoGLMAgent in the process.
        Agent.relative_coordinate = relative_coordinate

        self.contents = []

    @property
    def turn_number(self):
        """Number of turns recorded so far."""
        return len(self.contents)

    def prepare(self, instruction: str, obs: Dict, history: List, last_result: str = "") -> List:
        """Build the chat messages for the current observation.

        Args:
            instruction: Task description, appended to the system message.
            obs: Observation dict. Reads ``cur_app``, ``apps``,
                ``cur_window_id`` and ``app_info``; optionally ``exe_result``
                and ``screenshot``; and ``accessibility_tree`` when
                ``with_atree`` is enabled.
            history: Earlier messages inserted after the system message.
            last_result: Result of the previous action. When empty,
                ``obs["exe_result"]`` is used if present and also written back
                to the most recent recorded turn.

        Returns:
            The message list: system message, ``history``, then one user
            message with the textual state and, optionally, the screenshot.
        """
        # NOTE(review): the nesting of the write-back below is reconstructed;
        # confirm it is meant to run only when obs supplies the result.
        if "exe_result" in obs and not last_result:
            last_result = obs["exe_result"]
            if self.contents:
                self.contents[-1]["exe_result"] = last_result

        cur_app = obs["cur_app"]
        logger.info(f"current app is {cur_app}")

        if cur_app:
            tool_name = cur_app.strip().lower().replace("-", "_")
            tool_name = tool_name if tool_name in self.tool_list.keys() else None
        else:
            tool_name = None

        setup_prompt, func_def_prompt, note_prompt = Prompt.construct_procedural_memory(
            Agent,
            app_name=tool_name,
            client_password=self.client_password,
            with_image=self.with_image,
            with_atree=self.with_atree,
            relative_coordinate=self.relative_coordinate,
            glm41v_format=self.glm41v_format,
        )
        if self.tool_in_sys_msg:
            system_message = setup_prompt + "\n\n" + func_def_prompt + "\n\n" + note_prompt
        else:
            system_message = setup_prompt + "\n\n" + note_prompt
        system_message += "\n\n**IMPORTANT** You are asked to complete the following task: {}".format(instruction)

        messages = [
            {
                "role": "system",
                "content": system_message,
            }
        ]
        messages.extend(history)

        if obs["apps"]:
            app_str = "Window ID App Name Title\n"
            for window_id, app in obs["apps"].items():
                app_str += f"{window_id} {app['app_name']} {app['title']}\n"
        else:
            app_str = "None"

        last_result = last_result.strip() if last_result else "None"
        last_result = last_result[:2000] + "..." if len(last_result) > 2000 else last_result

        # Linearizing the tree is only worth doing when it ends up in the
        # prompt. The limit comes from the constructor argument rather than a
        # hard-coded 300, so ``a11y_tree_max_items`` actually takes effect.
        tree_section = ""
        if self.with_atree:
            tree = linearize_accessibility_tree(obs["accessibility_tree"], "Ubuntu")
            tree = trim_accessibility_tree(tree, self.a11y_tree_max_items)
            tree_section = "\n\n* A11y Tree: {}".format(tree.strip())

        app_info = obs["app_info"].strip() if obs["app_info"] else "None"
        app_info = app_info[:5000] + "..." if len(app_info) > 5000 else app_info

        # A missing window id would raise TypeError on the ``in`` test below.
        cur_window_id = obs["cur_window_id"]
        if cur_window_id is not None and cur_window_id in app_str:
            current_window = cur_window_id.strip()
        else:
            current_window = "None"

        prompt = "* Apps: {}\n\n* Current App: {}{}\n\n* App Info: {}\n\n* Previous Action Result: {}".format(
            app_str.strip(),
            current_window,
            tree_section,
            app_info,
            last_result if last_result else "None",
        ) + ("\n\n" + func_def_prompt if not self.tool_in_sys_msg else "")

        content = [{"type": "text", "text": prompt}]
        if self.with_image and obs.get('screenshot'):
            screenshot = resize_image(obs['screenshot'], self.image_size[0], self.image_size[1])
            # The image goes before the text part of the user message.
            content = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{b64encode(screenshot).decode('utf-8')}",
                        "detail": "high",
                    },
                }
            ] + content

        messages.append({"role": "user", "content": content})
        return messages

    def execute(self, response, obs):
        """Turn a model response into a list of grounded actions.

        Only the first snippet parsed from the response is used. Snippets that
        mention ``Agent.`` or ``BrowserTools.`` are evaluated directly; any
        other snippet is wrapped as a tool command for the current app.

        Args:
            response: Raw model response text.
            obs: Observation dict; ``cur_app`` is read for tool commands.

        Returns:
            A list with the grounded action, or an empty list when parsing or
            grounding fails for any reason.
        """
        try:
            actions = parse_code_from_string(response)
            action = actions[0]
            logger.info(f"The pesudo action is {action}")

            # SECURITY NOTE: eval() runs model-generated text as Python in this
            # process. That is how the pseudo-code is dispatched to
            # Agent/BrowserTools; it must never be fed untrusted input.
            if "Agent." in action:
                actions = [
                    eval(action),
                ]
            elif "BrowserTools." in action:  # TODO: special check for BrowserTools
                actions = [
                    eval(action),
                ]
            else:
                actions = Agent.tool_commands(action, obs["cur_app"].strip().replace("-", "_").lower())
                logger.info(f"The grounded action is {actions[0]}")
        except Exception as e:
            # Best effort: an unparsable response yields no action. Report it
            # through the module logger so it lands next to the other agent logs.
            logger.error("Failed to parse action from response: %s", e)
            actions = []

        return actions

    def format_history(self, max_turns=30):
        """Render recorded turns as alternating user/assistant messages.

        The environment state of every past turn is replaced by a placeholder;
        from the second turn on, the previous turn's execution result is
        appended to it. User text is capped at 2000 characters and responses
        at 1500, each followed by ``"..."`` when cut.

        Args:
            max_turns: Number of most recent turns to keep.

        Returns:
            A list of message dicts, two per retained turn.
        """
        history = []
        for ix, turn in enumerate(self.contents):
            if ix == 0:
                env_input = "**Environment State (Omitted)**"
            else:
                env_input = (
                    f"**Environment State (Omitted)**\nPrevious Action Result: {self.contents[ix - 1]['exe_result']}"
                )
            if len(env_input) > 2000:
                env_input = env_input[:2000] + "..."

            response = turn["response"]
            if len(response) > 1500:
                response = response[:1500] + "..."

            history.append({"role": "user", "content": [{"type": "text", "text": env_input}]})
            history.append({"role": "assistant", "content": [{"type": "text", "text": response}]})

        return history[-max_turns * 2:]

    def predict(self, instruction: str, obs: Dict) -> tuple:
        """Query the model for the next action and record the turn.

        Args:
            instruction: Task description.
            obs: Current observation dict.

        Returns:
            A ``(response, actions)`` tuple: the raw model response (``""`` if
            the call failed) and the list of grounded actions (possibly empty).

        Raises:
            AssertionError: If ``gen_func`` was never set.
        """
        history = self.format_history()
        messages = self.prepare(instruction, obs, history)

        assert self.gen_func is not None, "gen_func is not set"
        try:
            response = self.gen_func(messages)
        except Exception as e:
            logger.error("Failed to call gen_func, Error: " + str(e))
            response = ""

        logger.info("RESPONSE: %s", response)

        actions = self.execute(response, obs)

        # update the contents
        # NOTE: ``**obs`` comes last, so observation keys override the fields
        # above when the names collide.
        self.contents.append(
            {
                "instruction": instruction,
                "index": len(self.contents),
                "response": response,
                "action": "Parse error" if not actions else actions[0],
                "exe_result": "Invalid action" if not actions else "",
                **obs,
            }
        )
        return response, actions

    def reset(self, _logger=None):
        """Clear the turn history and rebind the module-level logger.

        Args:
            _logger: Logger to use from now on. When omitted, the logger named
                ``"desktopenv.aguvis_agent"`` is used.
        """
        global logger
        # NOTE(review): the fallback name differs from the module default
        # ("desktopenv.agent"); confirm whether that is intended.
        logger = _logger if _logger is not None else logging.getLogger("desktopenv.aguvis_agent")

        self.contents = []

View File

@ -0,0 +1,329 @@
import io
import re
import xml.etree.ElementTree as ET
from typing import List, Tuple
from PIL import Image, ImageDraw, ImageFont
from .deduplicate_node import filter_similar_nodes
# XML namespace URIs used to look up namespaced attributes on accessibility
# tree nodes, one set per platform.
# NOTE(review): the Ubuntu attributes namespace below is identical to the
# Windows one, unlike every other pair - confirm this is intended.
attributes_ns_ubuntu = "https://accessibility.windows.example.org/ns/attributes"
attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
state_ns_windows = "https://accessibility.windows.example.org/ns/state"
component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
component_ns_windows = "https://accessibility.windows.example.org/ns/component"
value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
value_ns_windows = "https://accessibility.windows.example.org/ns/value"
# Only a Windows variant of the class namespace is defined.
class_ns_windows = "https://accessibility.windows.example.org/ns/class"
def find_leaf_nodes(xlm_file_str):
    """Return every childless element of an XML document.

    Args:
        xlm_file_str: XML document as a string. A falsy value (``""`` or
            ``None``) yields an empty list.

    Returns:
        The leaf elements in document order. A root without children is
        itself returned as the only leaf.
    """
    if not xlm_file_str:
        return []

    document_root = ET.fromstring(xlm_file_str)
    # iter() walks the tree depth-first in document order, root included.
    return [element for element in document_root.iter() if len(element) == 0]
def judge_node(node: ET.Element, platform="Ubuntu", check_image=False) -> bool:
    """Decide whether an accessibility-tree node is worth keeping.

    A node is kept when all of the following hold:

    * its tag starts with ``document``, ends with one of a fixed set of
      suffixes, or is one of a fixed set of exact tag names;
    * its state marks it visible (and, on Ubuntu, also showing);
    * it has a non-empty ``name``, non-empty text, or - when ``check_image``
      is set - an ``image="true"`` attribute;
    * its screen coordinate is non-negative and its size strictly positive.

    Args:
        node: Element of the accessibility tree.
        platform: ``"Ubuntu"`` or ``"Windows"``; selects the attribute
            namespaces.
        check_image: Accept image nodes that carry no name or text.

    Returns:
        True if the node should be kept, False otherwise. A node whose
        coordinate or size attribute cannot be parsed is rejected.

    Raises:
        ValueError: If ``platform`` is not a supported value.
    """
    # Local import: the geometry attributes are parsed as Python literals
    # without evaluating arbitrary expressions from the tree.
    import ast

    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    kept_suffixes = (
        "item",
        "button",
        "heading",
        "label",
        "scrollbar",
        "searchbox",
        "textbox",
        "link",
        "tabelement",
        "textfield",
        "textarea",
        "menu",
    )
    kept_tags = {
        "alert",
        "canvas",
        "check-box",
        "combo-box",
        "entry",
        "icon",
        "image",
        "paragraph",
        "scroll-bar",
        "section",
        "slider",
        "static",
        "table-cell",
        "terminal",
        "text",
        "netuiribbontab",
        "start",
        "trayclockwclass",
        "traydummysearchcontrol",
        "uiimage",
        "uiproperty",
        "uiribboncommandbar",
    }

    tag = node.tag
    if not (tag.startswith("document") or tag.endswith(kept_suffixes) or tag in kept_tags):
        return False

    def _state_is_true(flag):
        return node.get(f"{{{_state_ns}}}{flag}", "false") == "true"

    # Windows only requires "visible"; Ubuntu requires "showing" as well.
    if not _state_is_true("visible"):
        return False
    if platform == "Ubuntu" and not _state_is_true("showing"):
        return False

    has_name = node.get("name", "") != ""
    has_text = node.text is not None and len(node.text) > 0
    is_image = check_image and node.get("image", "false") == "true"
    if not (has_name or has_text or is_image):
        return False

    try:
        coordinates = ast.literal_eval(node.get(f"{{{_component_ns}}}screencoord", "(-1, -1)"))
        sizes = ast.literal_eval(node.get(f"{{{_component_ns}}}size", "(-1, -1)"))
        return coordinates[0] >= 0 and coordinates[1] >= 0 and sizes[0] > 0 and sizes[1] > 0
    except (ValueError, SyntaxError, TypeError, IndexError):
        # A node without usable geometry cannot be located on screen; drop it
        # instead of failing the whole tree.
        return False
def filter_nodes(root: ET, platform="Ubuntu", check_image=False):
    """Collect every node under ``root`` that ``judge_node`` accepts.

    Args:
        root: Element whose subtree (itself included) is scanned.
        platform: Forwarded to ``judge_node``.
        check_image: Forwarded to ``judge_node``.

    Returns:
        The accepted elements, in the order ``root.iter()`` yields them.
    """
    return [candidate for candidate in root.iter() if judge_node(candidate, platform, check_image)]
def draw_bounding_boxes(nodes, image_file_content, down_sampling_ratio=1.0, platform="Ubuntu"):
    """Draw numbered red boxes around nodes on a screenshot.

    Args:
        nodes: Accessibility-tree elements carrying ``screencoord`` and
            ``size`` attributes in the platform's component namespace.
        image_file_content: Encoded screenshot bytes.
        down_sampling_ratio: Scale factor applied to the image and to the
            coordinates used for drawing.
        platform: ``"Ubuntu"`` or ``"Windows"``; selects attribute namespaces.

    Returns:
        A 4-tuple ``(marks, drew_nodes, text, image_content)``: the
        ``[x, y, w, h]`` of each drawn node in original (unscaled)
        coordinates, the drawn nodes themselves, a tab-separated table
        (``index``, ``tag``, ``name``, ``text``) as one string, and the
        annotated image as PNG bytes.

    Raises:
        ValueError: If ``platform`` is not a supported value.
    """
    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
        _value_ns = value_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
        _value_ns = value_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")
    # Load the screenshot image
    image_stream = io.BytesIO(image_file_content)
    image = Image.open(image_stream)
    if float(down_sampling_ratio) != 1.0:
        image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio)))
    draw = ImageDraw.Draw(image)
    marks = []
    drew_nodes = []
    text_informations: List[str] = ["index\ttag\tname\ttext"]
    try:
        # Adjust the path to the font file you have or use a default one
        font = ImageFont.truetype("arial.ttf", 15)
    except IOError:
        # Fallback to a basic font if the specified font can't be loaded
        font = ImageFont.load_default()
    # Labels are 1-based and only advance for nodes that are actually drawn.
    index = 1
    # Loop over all the visible nodes and draw their bounding boxes
    for _node in nodes:
        coords_str = _node.attrib.get("{{{:}}}screencoord".format(_component_ns))
        size_str = _node.attrib.get("{{{:}}}size".format(_component_ns))
        if coords_str and size_str:
            try:
                # Parse the coordinates and size from the strings
                coords = tuple(map(int, coords_str.strip("()").split(", ")))
                size = tuple(map(int, size_str.strip("()").split(", ")))
                import copy
                # Keep the unscaled geometry; it is what gets reported in marks.
                original_coords = copy.deepcopy(coords)
                original_size = copy.deepcopy(size)
                if float(down_sampling_ratio) != 1.0:
                    # Downsample the coordinates and size
                    coords = tuple(int(coord * down_sampling_ratio) for coord in coords)
                    size = tuple(int(s * down_sampling_ratio) for s in size)
                # Check for negative sizes
                if size[0] <= 0 or size[1] <= 0:
                    raise ValueError(f"Size must be positive, got: {size}")
                # Calculate the bottom-right corner of the bounding box
                bottom_right = (coords[0] + size[0], coords[1] + size[1])
                # Check that bottom_right > coords (x1 >= x0, y1 >= y0)
                if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]:
                    raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}")
                # Check if the area only contains one color
                # (the crop is taken from the image as annotated so far, so
                # boxes drawn earlier are part of what is inspected)
                cropped_image = image.crop((*coords, *bottom_right))
                if len(set(list(cropped_image.getdata()))) == 1:
                    continue
                # Draw rectangle on image
                draw.rectangle([coords, bottom_right], outline="red", width=1)
                # Draw index number at the bottom left of the bounding box with black background
                text_position = (coords[0], bottom_right[1])  # Adjust Y to be above the bottom right
                text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_position, str(index), font=font, anchor="lb")
                # offset: int = bottom_right[1]-text_bbox[3]
                # text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)
                # draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                draw.rectangle(text_bbox, fill="black")
                draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
                # each mark is an x, y, w, h tuple
                marks.append([original_coords[0], original_coords[1], original_size[0], original_size[1]])
                drew_nodes.append(_node)
                # Text containing a double quote is wrapped in quotes with the
                # inner quotes doubled.
                if _node.text:
                    node_text = _node.text if '"' not in _node.text else '"{:}"'.format(_node.text.replace('"', '""'))
                elif _node.get("{{{:}}}class".format(class_ns_windows), "").endswith("EditWrapper") and _node.get(
                    "{{{:}}}value".format(_value_ns)
                ):
                    # No text: fall back to the value attribute for nodes
                    # whose class name ends in "EditWrapper".
                    node_text = _node.get("{{{:}}}value".format(_value_ns), "")
                    node_text = node_text if '"' not in node_text else '"{:}"'.format(node_text.replace('"', '""'))
                else:
                    node_text = '""'
                text_information: str = "{:d}\t{:}\t{:}\t{:}".format(index, _node.tag, _node.get("name", ""), node_text)
                text_informations.append(text_information)
                index += 1
            except ValueError:
                # Nodes with unparsable or invalid geometry are skipped silently.
                pass
    output_image_stream = io.BytesIO()
    image.save(output_image_stream, format="PNG")
    image_content = output_image_stream.getvalue()
    return marks, drew_nodes, "\n".join(text_informations), image_content
def print_nodes_with_indent(nodes, indent=0):
    """Print a tree of elements, one per line, indented by depth.

    Args:
        nodes: Iterable of elements; each element is iterated in turn to
            reach its children.
        indent: Number of spaces printed before the current level. Each
            nested level adds two.
    """
    padding = " " * indent
    child_indent = indent + 2
    for element in nodes:
        print(padding, element.tag, element.attrib)
        print_nodes_with_indent(element, child_indent)
def find_active_applications(tree, state_ns):
    """List the application names whose subtrees should be kept.

    An application counts as active when one of its direct children has the
    ``active`` state attribute set to ``"true"``.

    Args:
        tree: Parsed ``ElementTree`` whose root's children are applications.
        state_ns: Namespace URI of the state attributes.

    Returns:
        The active application names (one entry per active child, so a name
        may repeat) followed by ``"gnome-shell"``; or ``["gjs",
        "gnome-shell"]`` when no application is active.
    """
    active_attribute = "{{{:}}}active".format(state_ns)
    active_names = [
        application.attrib.get("name")
        for application in list(tree.getroot())
        for frame in application
        if frame.attrib.get(active_attribute, "false") == "true"
    ]
    if not active_names:
        return ["gjs", "gnome-shell"]
    return active_names + ["gnome-shell"]
def linearize_accessibility_tree(accessibility_tree, platform="Ubuntu"):
    """Flatten an accessibility-tree XML document into a tab-separated table.

    Applications that are not reported by ``find_active_applications`` are
    removed, the remaining nodes are filtered with ``filter_nodes``, and each
    surviving node becomes one row: tag, text, centre position and size.
    Near-duplicate rows are then removed with ``filter_similar_nodes``.

    Args:
        accessibility_tree: XML document as a string.
        platform: ``"Ubuntu"`` or ``"Windows"``; selects attribute namespaces.

    Returns:
        The table as a single string with a header row, or ``""`` when the
        document cannot be processed (the error is printed).

    Raises:
        ValueError: If ``platform`` is not a supported value.
    """
    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    # Raw string: "\(" and "\d" are regex escapes, not string escapes.
    int_pair = re.compile(r"\((\d+), (\d+)\)")
    coord_key = "{{{:}}}screencoord".format(_component_ns)
    size_key = "{{{:}}}size".format(_component_ns)

    try:
        tree = ET.ElementTree(ET.fromstring(accessibility_tree))
        keep_apps = find_active_applications(tree, _state_ns)

        # Remove inactive applications
        root = tree.getroot()
        for application in list(root):
            if application.get("name") not in keep_apps:
                root.remove(application)

        filtered_nodes = filter_nodes(root, platform, check_image=True)
        rows = ["tag\ttext\tposition (center x & y)\tsize (w & h)"]

        # Linearize the accessibility tree nodes into a table format
        for node in filtered_nodes:
            text = (node.text if node.text is not None else "").strip()
            name = node.get("name", "").strip()
            if text == "":
                text = name
            elif name != "" and text != name:
                text = f"{name} ({text})"
            text = text.replace("\n", "\\n")

            size = node.get(size_key, "")
            pos_match = int_pair.match(node.get(coord_key, ""))
            size_match = int_pair.match(size)
            if pos_match is None or size_match is None:
                # Skip nodes whose geometry is not "(int, int)".
                continue

            x, y = (int(value) for value in pos_match.groups())
            w, h = (int(value) for value in size_match.groups())
            x_mid, y_mid = x + w // 2, y + h // 2
            rows.append("{:}\t{:}\t{:}\t{:}".format(node.tag, text, f"({x_mid}, {y_mid})", size))

        # Filter out similar nodes
        linearized_accessibility_tree = filter_similar_nodes("\n".join(rows))
    except Exception as e:
        # Best effort: a broken tree must not stop the agent loop.
        print(f"Error in linearize_accessibility_tree: {e}")
        linearized_accessibility_tree = ""

    return linearized_accessibility_tree
def trim_accessibility_tree(linearized_accessibility_tree, max_items):
    """Cap a linearized tree at ``max_items`` lines.

    Args:
        linearized_accessibility_tree: Newline-separated table.
        max_items: Maximum number of lines to keep.

    Returns:
        The input unchanged when it has at most ``max_items`` lines after
        stripping; otherwise the first ``max_items`` lines of the stripped
        text followed by a line containing ``...``.
    """
    rows = linearized_accessibility_tree.strip().split("\n")
    if len(rows) <= max_items:
        return linearized_accessibility_tree
    return "\n".join(rows[:max_items]) + "\n..."

View File

@ -0,0 +1,100 @@
import re
def parse_line(line):
    """Parse one row of the linearized accessibility tree.

    The expected shape is ``<type> <text> (cx, cy) (w, h)``, for example
    ``label Google Chrome (191, 13) (104, 17)``.

    Args:
        line: One table row.

    Returns:
        A dict with ``type``, ``text``, ``bbox`` as ``(x1, y1, x2, y2)``,
        ``center``, ``size`` and the original ``raw`` line; or ``None`` when
        the row does not match the expected shape.
    """
    row_pattern = r"^(\S+)\s+(.+?)\s+\((\d+), (\d+)\)\s+\((\d+), (\d+)\)"
    matched = re.match(row_pattern, line)
    if matched is None:
        return None

    node_type, text = matched.group(1, 2)
    cx, cy, w, h = (int(number) for number in matched.group(3, 4, 5, 6))

    # The row stores the centre; derive the top-left corner, then add the
    # size to get the bottom-right one.
    left = cx - w // 2
    top = cy - h // 2
    return {
        "type": node_type,
        "text": text.strip(),
        "bbox": (left, top, left + w, top + h),
        "center": (cx, cy),
        "size": (w, h),
        "raw": line,
    }
def iou(box1, box2):
    """Compute the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: Box as ``(x1, y1, x2, y2)``.
        box2: Box as ``(x1, y1, x2, y2)``.

    Returns:
        Intersection area divided by union area, or ``0`` when the union
        area is zero.
    """
    overlap_width = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_height = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    overlap_area = overlap_width * overlap_height

    first_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    second_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    combined_area = first_area + second_area - overlap_area

    return overlap_area / combined_area if combined_area != 0 else 0
def norm_text(s):
    """Normalize text for comparison: lowercase it and drop all whitespace.

    Args:
        s: Text to normalize.

    Returns:
        The lowercased text with every run of whitespace removed.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub("", s.lower())
def text_similarity(a, b):
    """Score two texts as identical or different after normalization.

    This is a deliberately simple measure: exact equality of the normalized
    texts scores 1, anything else scores 0.

    Args:
        a: First text.
        b: Second text.

    Returns:
        ``1.0`` when the normalized texts are equal, otherwise ``0``.
    """
    return 1.0 if norm_text(a) == norm_text(b) else 0
def filter_similar_nodes(linearized_accessibility_tree):
    """Remove near-duplicate rows from a linearized accessibility tree.

    Two rows are duplicates when their boxes overlap with an IoU above
    ``IOU_THRESH`` and their texts are similar above ``TEXT_THRESH``; the
    later row is dropped. Rows that cannot be parsed (such as the header)
    are always kept, and blank lines are discarded.

    Args:
        linearized_accessibility_tree: Newline-separated table.

    Returns:
        The surviving rows joined by newlines, in their original order.
    """
    # Thresholds can be tuned as needed.
    IOU_THRESH = 0.2
    TEXT_THRESH = 0.9

    entries = []
    for row in linearized_accessibility_tree.split("\n"):
        if not row.strip():
            continue
        parsed = parse_line(row)
        # Rows that cannot be parsed are kept as-is.
        entries.append(parsed if parsed else {"raw": row, "invalid": True})

    dropped = set()
    kept_rows = []
    for position, current in enumerate(entries):
        if current.get("invalid"):
            kept_rows.append(current["raw"])
            continue
        if position in dropped:
            continue

        for later_position in range(position + 1, len(entries)):
            candidate = entries[later_position]
            if candidate.get("invalid"):
                continue
            overlap = iou(current["bbox"], candidate["bbox"])
            likeness = text_similarity(current["text"], candidate["text"])
            if overlap > IOU_THRESH and likeness > TEXT_THRESH:
                # The two rows are near-identical; drop the later one.
                dropped.add(later_position)

        # Only later rows can be marked above, so the current row survives.
        kept_rows.append(current["raw"])

    return "\n".join(kept_rows)
# Example usage: de-duplicate a sample linearized tree and print the result.
if __name__ == "__main__":
    linearized_accessibility_tree = "tag\ttext\tposition (center x & y)\tsize (w & h)\nicon\t\t(1853, 1001)\t(64, 64)\nlabel\tHome\t(1853, 1045)\t(40, 17)\nlabel\tActivities\t(49, 13)\t(63, 17)\ntext\tActivities\t(49, 13)\t(63, 17)\nlabel\tApr 171704\t(995, 13)\t(117, 27)\ntext\tApr 171704\t(995, 13)\t(87, 18)\nmenu\tSystem\t(1867, 13)\t(106, 27)\npush-button\tGoogle Chrome\t(35, 65)\t(70, 64)\npush-button\tThunderbird Mail\t(35, 133)\t(70, 64)\npush-button\tVisual Studio Code\t(35, 201)\t(70, 64)\npush-button\tVLC media player\t(35, 269)\t(70, 64)\npush-button\tLibreOffice Writer\t(35, 337)\t(70, 64)\npush-button\tLibreOffice Calc\t(35, 405)\t(70, 64)\npush-button\tLibreOffice Impress\t(35, 473)\t(70, 64)\npush-button\tGNU Image Manipulation Program\t(35, 541)\t(70, 64)\npush-button\tFiles\t(35, 609)\t(70, 64)\npush-button\tUbuntu Software\t(35, 677)\t(70, 64)\npush-button\tHelp\t(35, 745)\t(70, 64)\npush-button\tTrash\t(35, 816)\t(70, 64)\ntoggle-button\tShow Applications\t(35, 1045)\t(70, 70)"
    result = filter_similar_nodes(linearized_accessibility_tree)
    print(result)

View File

@ -0,0 +1,260 @@
import base64
import json
import logging
import os
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger("desktopenv.agent")


def agent_action(func):
    """Mark ``func`` as an agent action by setting ``is_agent_action`` on it.

    The function object itself is returned, unchanged apart from the marker.
    """
    func.is_agent_action = True
    return func


# Python snippet template for switching windows; every "WINDOW_ID" occurrence
# is substituted by GroundingAgent.switch_window.
# NOTE(review): the snippet calls time.sleep() but does not import time;
# presumably the executor imports it beforehand - confirm.
switch_window_code = """import subprocess;
import pyautogui;
pyautogui.press('escape');
time.sleep(0.5);
subprocess.run(['wmctrl', '-ia', 'WINDOW_ID'])
subprocess.run(['wmctrl', '-ir', 'WINDOW_ID', '-b', 'add,maximized_vert,maximized_horz'])
print('Switch to WINDOW_ID')"""

# Shell command used to launch each application name accepted by open_app.
launch_app_commands = {
    # Web Browser
    "chrome": "google-chrome --remote-debugging-port=1337",
    # File Manager
    "files": "nautilus",
    # Terminal
    "terminal": 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus" && gnome-terminal',
    # Utilities
    "gedit": "gedit",
    # Office
    "libreoffice writer": "libreoffice --writer",
    "libreoffice calc": "libreoffice --calc",
    "libreoffice impress": "libreoffice --impress",
    # System
    "settings": 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus" && gnome-control-center',
    # Multimedia
    "vlc": "vlc",
    "gimp": "gimp",
    # IDE
    "vs code": "code",
    # Email
    "thunderbird": "thunderbird",
}


class GroundingAgent:
    # Translates model-level actions into executable command strings (or, for
    # open_app, an action dict). All methods are classmethods; configuration
    # lives in the class attributes below.
    #
    # The signatures and docstrings of the @agent_action methods are read via
    # inspect.signature() and __doc__ when the prompt is built, so they are
    # part of the text sent to the model: change them deliberately.

    # Maps a normalized app name to the tool class used by tool_commands.
    tool_list = {
        "libreoffice_calc": "CalcTools",
        "libreoffice_impress": "ImpressTools",
        "libreoffice_writer": "WriterTools",
        "code": "CodeTools",
        "vlc": "VLCTools",
        "google_chrome": "BrowserTools",
    }

    relative_coordinate = True  # whether the coordinates are relative (0-1000) or absolute (e.g. 1920x1080)

    # Screen (width, height) in pixels that relative coordinates are scaled to.
    screen_size = (1920, 1080)

    @classmethod
    def _to_screen(cls, coordinate):
        """Return ``coordinate`` as absolute screen pixels.

        With ``relative_coordinate`` enabled the 0-1000 values are scaled to
        ``screen_size`` and rounded; otherwise they are returned unchanged.
        """
        x, y = coordinate
        if cls.relative_coordinate:
            width, height = cls.screen_size
            x, y = round(x * width / 1000), round(y * height / 1000)
        return x, y

    @classmethod
    def tool_commands(cls, code: str, tool_name: str):
        """Wrap a tool call in the import and result-printing boilerplate.

        Args:
            code: Snippet calling a function of the tool class.
            tool_name: Key of ``tool_list``; also used as the module name to
                import from.

        Returns:
            A one-element list holding the full command string.

        Raises:
            KeyError: If ``tool_name`` is not in ``tool_list``.
        """
        command = f"from {tool_name} import *; "
        command += code

        tool_class = cls.tool_list[tool_name]
        command += f"; {tool_class}.print_result()"

        return [
            command,
        ]

    @classmethod
    @agent_action
    def click(
        cls,
        coordinate: List,
        num_clicks: int = 1,
        button_type: str = "left",
    ):
        """
        Click on the element
        Args:
        coordinate (List): [x, y], coordinate of the element to click on
        num_clicks (int): number of times to click the element
        button_type (str): which mouse button to press ("left", "middle", or "right")
        """
        x, y = cls._to_screen(coordinate)
        # TODO: maximizing a window needs to happen in a single call
        return f"""pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); print("Click Success")"""

    @classmethod
    @agent_action
    def type(
        cls,
        coordinate: Optional[List] = None,
        text: str = "",
        overwrite: bool = False,
        enter: bool = False,
    ):
        """
        Type text into the element
        Args:
        coordinate (List): [x, y], coordinate of the element to type into. If None, typing starts at current cursor location
        text (str): the text to type
        overwrite (bool): True to overwrite existing text, False otherwise
        enter (bool): True to press enter after typing, False otherwise
        """
        command = ""

        if coordinate is not None:
            # Click the target first so it receives keyboard focus
            x, y = cls._to_screen(coordinate)
            command += f"pyautogui.click({x}, {y}); "

        if overwrite:
            command += "pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); "

        command += f"pyautogui.write({repr(text)}); "

        if enter:
            command += "pyautogui.press('enter'); "

        command += "print('Type Success')"

        return command

    @classmethod
    @agent_action
    def drag_and_drop(cls, drag_from_coordinate: List, drop_on_coordinate: List):
        """
        Drag element1 and drop it on element2
        Args:
        drag_from_coordinate (List): [x, y], coordinate of element to drag
        drop_on_coordinate (List): [x, y], coordinate of element to drop on
        """
        x1, y1 = cls._to_screen(drag_from_coordinate)
        x2, y2 = cls._to_screen(drop_on_coordinate)

        command = f"pyautogui.moveTo({x1}, {y1}); "
        # TODO: specified duration?
        command += f"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); "
        command += "print('Drag and Drop Success')"

        return command

    @classmethod
    @agent_action
    def scroll(cls, coordinate: List, direction: str):
        """
        Scroll the element in the specified direction
        Args:
        coordinate (List): [x, y], coordinate of the element to scroll in
        direction (str): the direction to scroll ("up" or "down")
        """
        x, y = cls._to_screen(coordinate)
        # Any direction other than "up" scrolls down.
        amount = 100 if direction == "up" else -100
        return f"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({amount}); print('Scroll Success')"

    @classmethod
    @agent_action
    def open_app(cls, app_name: str):
        """
        Open a specified application
        Supported apps: chrome, files, terminal, gedit, libreoffice writer,
        libreoffice calc, libreoffice impress, vs code, vlc, gimp, settings, thunderbird
        Args:
        app_name (str): name of the application to open
        """
        app_name = app_name.lower().strip()

        if app_name not in launch_app_commands:
            # repr() keeps the generated statement valid even when the name
            # contains quotes, braces or backslashes.
            message = f"{app_name} is not supported or recognized"
            command = f"print({message!r})"
        else:
            command = {
                "action_type": "OPEN_APP",
                "parameters": {"launch_app_command": launch_app_commands[app_name], "app_name": app_name},
            }

        return command

    @classmethod
    @agent_action
    def switch_window(cls, window_id: str):
        """
        Switch to the window with the given window id
        Args:
        window_id (str): the window id to switch to from the provided list of open windows
        """
        return switch_window_code.replace("WINDOW_ID", window_id)

    @classmethod
    @agent_action
    def hotkey(cls, keys: List):
        """
        Press a hotkey combination
        Args:
        keys (List): the keys to press in combination (e.g. ['ctrl', 'c'] for copy, ['prtsc'] for screenshot)
        """
        # add quotes around the keys
        keys = [f"'{key}'" for key in keys]
        # The quoted keys are echoed inside a single-quoted f-string, so their
        # quotes must be escaped there.
        key_str = ", ".join(keys).replace("'", "\\'")
        return f"import pyautogui; pyautogui.hotkey({', '.join(keys)}); print(f'Press Hotkey: {key_str}')"

    @classmethod
    @agent_action
    def quote(cls, content: str):
        """
        Quote information from the current page for memory
        Args:
        content (str): text summarized or copied from the page for later operation
        """
        # repr() yields a valid literal for any content, including text with
        # double quotes or backslashes that would break a triple-quoted string.
        return f"print({content!r})"

    @classmethod
    @agent_action
    def wait(cls):
        """
        Wait for a while
        """
        return "WAIT"

    @classmethod
    @agent_action
    def exit(cls, success: bool):
        """
        End the current task
        Args:
        success (bool): True if successfully finish a task, False otherwise
        """
        return "DONE" if success else "FAIL"

View File

@ -0,0 +1,194 @@
import inspect
import json
import os
import textwrap
# Parent of the directory that contains this file; tool API specs are loaded
# from "<current_dir>/tools/apis".
current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
def generate_func(json_data):
    """Render function specs as Python-style stubs with docstrings.

    Entries whose ``type`` is ``"function"`` are rendered. A name of the form
    ``Class.method`` becomes a method (with a leading ``cls`` parameter)
    inside ``class Class:``; any other name becomes a standalone function.
    Classes come first, in order of first appearance, followed by the
    standalone functions. Required parameters are listed before optional ones.

    Args:
        json_data: List of dicts, each with ``type`` and - for functions - a
            ``function`` dict holding ``name``, ``description`` and
            ``parameters`` (``properties`` plus optional ``required``).

    Returns:
        A ``(code, cls_name)`` tuple: the stripped stub text and the name of
        the last class rendered (``""`` when there is none).
    """

    def _render_stub(spec, display_name, leading_args, def_prefix):
        # Build one "def" line plus its docstring block, ending in a blank line.
        description = spec["description"]
        properties = spec["parameters"]["properties"]
        required = spec["parameters"].get("required", [])
        optional = [name for name in properties if name not in required]

        arguments = leading_args + [f"{name}" for name in required] + [f"{name}" for name in optional]
        pieces = [
            f"{def_prefix}def {display_name}({', '.join(arguments)}):\n",
            f' """\n {description}\n\n Args:\n',
        ]
        if not required and not optional:
            pieces.append(" None\n")
        else:
            for name in required:
                detail = properties[name]
                pieces.append(f" {name} ({detail['type']}): {detail.get('description', '')}\n")
            for name in optional:
                detail = properties[name]
                pieces.append(f" {name} ({detail['type']}, optional): {detail.get('description', '')}\n")
        pieces.append(' """\n')
        return "".join(pieces) + "\n"

    # Group the function entries by owning class; keep the rest in order.
    methods_by_class = {}
    standalone = []
    for entry in json_data:
        if entry["type"] != "function":
            continue
        name_parts = entry["function"]["name"].split(".")
        if len(name_parts) == 2:
            methods_by_class.setdefault(name_parts[0], []).append(entry)
        else:
            standalone.append(entry)

    chunks = []
    cls_name = ""

    # Functions that belong to a class
    for class_name, members in methods_by_class.items():
        cls_name = class_name
        chunks.append(f"class {class_name}:\n")
        for member in members:
            spec = member["function"]
            chunks.append(_render_stub(spec, spec["name"].split(".")[-1], ["cls"], " "))
        chunks.append("\n")

    # Functions without a class
    for entry in standalone:
        spec = entry["function"]
        chunks.append(_render_stub(spec, spec["name"], [], ""))

    return "".join(chunks).strip(), cls_name
# Opening section of the system prompt. Placeholders: {observation_list} (the
# comma-separated observation kinds) and {format_hint} (the expected output
# layout).
setup_prompt = """You are a GUI operation agent. You will be given a task and your action history, with current observation ({observation_list}). You should help me control the computer, output the best action step by step to accomplish the task.
You should first generate a plan, reflect on the current observation, then generate actions to complete the task in python-style pseudo code using the predefined functions.
* Output Format:
{format_hint}"""
# Wrapper for the available-function listing. Placeholder: {class_content}.
func_def_template = """* Available Functions:
```python
{class_content}
```"""
# Closing notes of the prompt. Placeholders: {relative_coordinate_hint} (a full
# bullet line or an empty string) and {client_password}.
note_prompt = """* Note:
- Your code should only be wrapped in ```python```.
- Only **ONE-LINE-OF-CODE** at a time.
- Each code block is context independent, and variables from the previous round cannot be used in the next round.
{relative_coordinate_hint}- Return with `Agent.exit(success=True)` immediately after the task is completed.
- The computer's environment is Linux, e.g., Desktop path is '/home/user/Desktop'
- My computer's password is '{client_password}', feel free to use it when you need sudo rights"""
class Prompt:
    """Builds the prompt sections handed to the model from the module-level templates."""

    @staticmethod
    def construct_procedural_memory(agent_class, app_name=None, client_password="password", with_image=True, with_atree=False, relative_coordinate=True, glm41v_format=True):
        """Assemble the setup, function-listing and notes sections of the prompt.

        Args:
            agent_class: Class to introspect. Every callable attribute that
                carries an ``is_agent_action`` attribute is listed with its
                signature and docstring.
            app_name: Optional application name. When given, the file
                ``tools/apis/<app_name lowercased>.json`` under the module-level
                ``current_dir`` is loaded and the stubs produced by
                ``generate_func`` are appended to the listing.
            client_password: Text substituted into the password note.
            with_image: When true, "screenshot" is included in the observation list.
            with_atree: When true, the a11y tree is included in the observation list.
            relative_coordinate: When true, the 0-1000 coordinate note is included.
            glm41v_format: Selects the output-format hint that wraps the code
                block in ``<answer>`` tags; otherwise the plain variant is used.

        Returns:
            A tuple ``(setup_prompt, func_def_prompt, note_prompt)`` of
            formatted strings.
        """
        # Collect the pieces and join once instead of growing a string with +=.
        sections = ["Class Agent:"]
        for attr_name in dir(agent_class):
            attr = getattr(agent_class, attr_name)
            if callable(attr) and hasattr(attr, "is_agent_action"):
                # Use inspect to get the full function signature
                signature = inspect.signature(attr)
                # NOTE(review): whitespace inside this template ends up in the
                # rendered prompt — confirm it matches the intended layout.
                sections.append(f"\ndef {attr_name}{signature}:\n'''{attr.__doc__}'''\n")
        agent_class_content = "".join(sections)

        if app_name is not None:
            tool_path = os.path.join(current_dir, "tools", "apis", f"{app_name.lower()}.json")
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding.
            with open(tool_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            # generate_func returns a pair here; only the generated code is used.
            tool_class_content, _ = generate_func(json_data)
            agent_class_content += "\n\n{}".format(tool_class_content)

        func_def_prompt = func_def_template.format(class_content=agent_class_content.strip())

        # --- dynamic observation list ---
        obs_items = []
        if with_image:
            obs_items.append("screenshot")
        obs_items.append("current app name")
        if with_atree:
            obs_items.append("a11y tree (based on AT-SPI library)")
        obs_items.append("app info")
        obs_items.append("last action result")
        observation_list = ", ".join(obs_items)

        # The hints are passed as format *values*, so their literal braces are
        # not interpreted as placeholders.
        if glm41v_format:
            format_hint = "<think>\n{**YOUR-PLAN-AND-THINKING**}</think>\n<answer>```python\n{**ONE-LINE-OF-CODE**}\n```</answer>"
        else:
            format_hint = "<think>\n{**YOUR-PLAN-AND-THINKING**}\n</think>\n```python\n{**ONE-LINE-OF-CODE**}\n```"
        setup_prompt_formatted = setup_prompt.format(
            observation_list=observation_list,
            format_hint=format_hint,
        )

        if relative_coordinate:
            relative_coordinate_hint = "- The coordinate [x, y] should be normalized to 0-1000, which usually should be the center of a specific target element.\n"
        else:
            relative_coordinate_hint = ""
        note_prompt_formatted = note_prompt.format(
            relative_coordinate_hint=relative_coordinate_hint,
            client_password=client_password,
        )

        return setup_prompt_formatted, func_def_prompt, note_prompt_formatted
if __name__ == "__main__":
    # Manual smoke test: build the prompt sections for GroundingAgent together
    # with the "vlc" tool definitions and print the resulting tuple.
    from grounding_agent import GroundingAgent
    print(Prompt.construct_procedural_memory(GroundingAgent, "vlc"))

View File

@ -0,0 +1,3 @@
# Re-export generate_func from the sibling func module as this package's public name.
from .func import generate_func
__all__ = ["generate_func"]

View File

@ -0,0 +1,236 @@
[
{
"type": "function",
"function": {
"name": "CodeTools.launch_vscode",
"description": "Launch VS Code with specified path",
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "File path or directory to open"
}
},
"required": ["path"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.compare_files",
"description": "Compare two files in VS Code",
"parameters": {
"type": "object",
"properties": {
"file1": {
"type": "string",
"description": "First file path"
},
"file2": {
"type": "string",
"description": "Second file path"
}
},
"required": ["file1", "file2"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.add_folder",
"description": "Add folder to active VS Code window",
"parameters": {
"type": "object",
"properties": {
"folder": {
"type": "string",
"description": "Folder path to add"
}
},
"required": ["folder"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.goto_file",
"description": "Open file at specific position",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "File path to open"
},
"line": {
"type": "integer",
"description": "Line number",
"default": 1
},
"character": {
"type": "integer",
"description": "Character position",
"default": 1
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.perform_merge",
"description": "Perform three-way merge",
"parameters": {
"type": "object",
"properties": {
"path1": {
"type": "string",
"description": "First version file path"
},
"path2": {
"type": "string",
"description": "Second version file path"
},
"base": {
"type": "string",
"description": "Base version file path"
},
"result": {
"type": "string",
"description": "Output file path"
}
},
"required": ["path1", "path2", "base", "result"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.remove_folder",
"description": "Remove folder from active VS Code window",
"parameters": {
"type": "object",
"properties": {
"folder": {
"type": "string",
"description": "Folder path to remove"
}
},
"required": ["folder"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.install_extension",
"description": "Install or update VS Code extension",
"parameters": {
"type": "object",
"properties": {
"extension_id": {
"type": "string",
"description": "Extension identifier"
},
"pre_release": {
"type": "boolean",
"description": "Install pre-release version",
"default": false
}
},
"required": ["extension_id"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.uninstall_extension",
"description": "Uninstall VS Code extension",
"parameters": {
"type": "object",
"properties": {
"extension_id": {
"type": "string",
"description": "Extension identifier"
}
},
"required": ["extension_id"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.list_extensions",
"description": "List installed extensions",
"parameters": {
"type": "object",
"properties": {
"show_versions": {
"type": "boolean",
"description": "Show extension versions",
"default": false
},
"category": {
"type": "string",
"description": "Filter by category"
}
}
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.update_extensions",
"description": "Update all extensions to latest version",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.disable_extension",
"description": "Disable extension for next VS Code instance",
"parameters": {
"type": "object",
"properties": {
"extension_id": {
"type": "string",
"description": "Extension identifier"
}
},
"required": ["extension_id"]
}
}
},
{
"type": "function",
"function": {
"name": "CodeTools.toggle_sync",
"description": "Toggle VS Code synchronization",
"parameters": {
"type": "object",
"properties": {
"state": {
"type": "string",
"description": "Sync state",
"enum": ["on", "off"]
}
},
"required": ["state"]
}
}
}
]

View File

@ -0,0 +1,117 @@
def _render_stub(func_name, func, leading_params, def_prefix):
    """Render one ``def`` stub (signature plus docstring) from a function schema.

    Args:
        func_name: Name to emit after ``def``.
        func: The ``"function"`` mapping of a tool entry; ``description`` is
            required, ``parameters`` and its ``properties`` / ``required``
            members are optional.
        leading_params: Parameter names placed before the schema parameters
            (``["cls"]`` for class members, ``[]`` for free functions).
        def_prefix: Text placed in front of the ``def`` keyword.

    Returns:
        The stub text, terminated by one blank line.
    """
    description = func["description"]
    # A tool that takes no arguments may omit "parameters" or "properties";
    # treat that the same as an empty parameter set instead of raising.
    parameters = func.get("parameters") or {}
    params = parameters.get("properties") or {}
    required = parameters.get("required") or []
    # Required parameters come first, then the remaining ones in schema order.
    # Optional parameters are listed without a default value.
    optional = [name for name in params if name not in required]

    signature = ", ".join([*leading_params, *required, *optional])
    # NOTE(review): the leading spaces inside these literals set the indentation
    # of the emitted stub text — confirm the widths are as intended.
    pieces = [
        f"{def_prefix}def {func_name}({signature}):\n",
        f' """\n {description}\n\n Args:\n',
    ]
    if not required and not optional:
        pieces.append(" None\n")
    else:
        for name in required:
            param_type = params[name]["type"]
            param_desc = params[name].get("description", "")
            pieces.append(f" {name} ({param_type}): {param_desc}\n")
        for name in optional:
            param_type = params[name]["type"]
            param_desc = params[name].get("description", "")
            pieces.append(f" {name} ({param_type}, optional): {param_desc}\n")
    pieces.append(' """\n')
    pieces.append("\n")
    return "".join(pieces)


def generate_func(json_data):
    """Turn a list of tool schema entries into Python-style stub source text.

    Entries whose ``type`` is not ``"function"`` are ignored. A function named
    ``Class.method`` (exactly one dot) is emitted as a member of
    ``class Class:`` with ``cls`` as its first parameter; any other name is
    emitted as a free function. Classes come first, in order of first
    appearance, followed by the free functions.

    Args:
        json_data: Iterable of mappings, each with a ``"type"`` key and, for
            functions, a ``"function"`` mapping holding ``name``,
            ``description`` and optionally ``parameters``.

    Returns:
        The generated source text with surrounding whitespace stripped
        (an empty string when there is nothing to emit).
    """
    # Group the function schemas by class name, keeping first-seen order.
    class_funcs = {}
    no_class_funcs = []
    for item in json_data:
        if item["type"] != "function":
            continue
        func = item["function"]
        name_parts = func["name"].split(".")
        if len(name_parts) == 2:
            class_funcs.setdefault(name_parts[0], []).append(func)
        else:
            no_class_funcs.append(func)

    chunks = []
    # Functions that belong to a class.
    for class_name, funcs in class_funcs.items():
        chunks.append(f"class {class_name}:\n")
        for func in funcs:
            chunks.append(_render_stub(func["name"].split(".")[-1], func, ["cls"], " "))
        chunks.append("\n")
    # Functions without a class.
    for func in no_class_funcs:
        chunks.append(_render_stub(func["name"], func, [], ""))
    return "".join(chunks).strip()
if __name__ == "__main__":
    # Manual check: load libreoffice_calc.json (path is relative to the current
    # working directory) and print the stubs generated from it.
    import json
    with open("libreoffice_calc.json", "r") as f:
        json_data = json.load(f)
    print(generate_func(json_data))

View File

@ -0,0 +1,134 @@
[
{
"type": "function",
"function": {
"name": "BrowserTools.open_profile_settings",
"description": "Opens profile settings page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.open_password_settings",
"description": "Opens password/autofill settings page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.open_privacy_settings",
"description": "Opens privacy settings page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.open_appearance_settings",
"description": "Opens appearance settings page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.open_search_engine_settings",
"description": "Opens search engine settings page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.bring_back_last_tab",
"description": "Restores last-closed tab (Ctrl+Shift+T).",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.print",
"description": "Opens print dialog (Ctrl+P).",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.delete_browsing_data",
"description": "Opens clear browsing data dialog (Ctrl+Shift+Del).",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.open_extensions",
"description": "Opens extensions management page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.bookmark_page",
"description": "Bookmarks current page (Ctrl+D).",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "BrowserTools.open_bookmarks",
"description": "Opens bookmarks page.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
}
]

View File

@ -0,0 +1,634 @@
[
{
"type": "function",
"function": {
"name": "CalcTools.get_workbook_info",
"description": "Get workbook info: file path, name, sheets, and active sheet",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.save",
"description": "Save workbook to current location",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.get_column_data",
"description": "Get all data from specified column",
"parameters": {
"type": "object",
"properties": {
"column_name": {
"type": "string",
"description": "Column name (e.g. 'A', 'B')"
}
},
"required": [
"column_name"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.switch_active_sheet",
"description": "Switch to sheet (creates if not exists)",
"parameters": {
"type": "object",
"properties": {
"sheet_name": {
"type": "string",
"description": "Sheet name"
}
},
"required": [
"sheet_name"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.set_column_values",
"description": "Set values to column (values only, not formulas)",
"parameters": {
"type": "object",
"properties": {
"column_name": {
"type": "string",
"description": "Column name (e.g. 'A', 'B')"
},
"data": {
"type": "array",
"description": "Values to write"
},
"start_index": {
"type": "integer",
"description": "First row index (default: 2)"
}
},
"required": [
"column_name",
"data"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.highlight_range",
"description": "Highlight range with color",
"parameters": {
"type": "object",
"properties": {
"range_str": {
"type": "string",
"description": "Range (e.g. 'A1:B10')"
},
"color": {
"type": "integer",
"description": "Color value (default: 0xFF0000)"
}
},
"required": [
"range_str"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.transpose_range",
"description": "Transpose range and paste to target cell",
"parameters": {
"type": "object",
"properties": {
"source_range": {
"type": "string",
"description": "Source range (e.g. 'A1:B10')"
},
"target_cell": {
"type": "string",
"description": "Target cell (e.g. 'A1')"
}
},
"required": [
"source_range",
"target_cell"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.export_to_csv",
"description": "Export to CSV with same path/name",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.sort_column",
"description": "Sort column data",
"parameters": {
"type": "object",
"properties": {
"column_name": {
"type": "string",
"description": "Column name (e.g. 'A', 'B')"
},
"ascending": {
"type": "boolean",
"description": "Sort ascending (default: true)"
},
"start_index": {
"type": "integer",
"description": "First row index (default: 2)"
}
},
"required": [
"column_name"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.set_validation_list",
"description": "Set validation list for column",
"parameters": {
"type": "object",
"properties": {
"column_name": {
"type": "string",
"description": "Column name (e.g. 'A', 'B')"
},
"values": {
"type": "array",
"description": "Validation values"
}
},
"required": [
"column_name",
"values"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.hide_row_data",
"description": "Hide rows containing value",
"parameters": {
"type": "object",
"properties": {
"value": {
"type": "string",
"description": "Value to hide (default: 'N/A')"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.reorder_columns",
"description": "Reorder columns by specified order",
"parameters": {
"type": "object",
"properties": {
"column_order": {
"type": "array",
"description": "Column names in desired order (e.g. ['A', 'B', 'C'])"
}
},
"required": [
"column_order"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.create_pivot_table",
"description": "Create pivot table from source sheet",
"parameters": {
"type": "object",
"properties": {
"source_sheet": {
"type": "string",
"description": "Source sheet name"
},
"table_name": {
"type": "string",
"description": "Pivot table name"
},
"row_fields": {
"type": "array",
"description": "Row labels (e.g. ['A', 'B'])"
},
"col_fields": {
"type": "array",
"description": "Column labels (e.g. ['A', 'B'])"
},
"value_fields": {
"type": "array",
"description": "Value fields (e.g. ['A', 'B'])"
},
"aggregation_function": {
"type": "string",
"description": "Aggregation function (sum, count, average, min, max)"
},
"target_cell": {
"type": "string",
"description": "Target cell (default: 'A1')"
}
},
"required": [
"source_sheet",
"table_name",
"value_fields"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.merge_cells",
"description": "Merge cells in range",
"parameters": {
"type": "object",
"properties": {
"range_str": {
"type": "string",
"description": "Cell range (e.g. 'A1:B10')"
}
},
"required": [
"range_str"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.set_cell_value",
"description": "Set cell value",
"parameters": {
"type": "object",
"properties": {
"cell": {
"type": "string",
"description": "Cell reference (e.g. 'A1')"
},
"value": {
"type": "string",
"description": "Cell value"
}
},
"required": [
"cell",
"value"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.format_range",
"description": "Apply formatting to range",
"parameters": {
"type": "object",
"properties": {
"range_str": {
"type": "string",
"description": "Range (e.g. 'A1:B10')"
},
"background_color": {
"type": "string",
"description": "Background color (e.g. '#0000ff')"
},
"font_color": {
"type": "string",
"description": "Font color (e.g. '#ffffff')"
},
"bold": {
"type": "boolean",
"description": "Bold text"
},
"alignment": {
"type": "string",
"description": "Text alignment (left, center, right)"
}
},
"required": [
"range_str"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.create_chart",
"description": "Create chart from data range",
"parameters": {
"type": "object",
"properties": {
"chart_type": {
"type": "string",
"description": "Chart type (bar, column, line, pie, scatter, area)"
},
"data_range": {
"type": "string",
"description": "Data range (e.g. 'A1:B10')"
},
"title": {
"type": "string",
"description": "Chart title"
},
"x_axis_title": {
"type": "string",
"description": "X axis title"
},
"y_axis_title": {
"type": "string",
"description": "Y axis title"
}
},
"required": [
"chart_type",
"data_range"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.freeze_panes",
"description": "Freeze rows/columns",
"parameters": {
"type": "object",
"properties": {
"rows": {
"type": "integer",
"description": "Rows to freeze from top"
},
"columns": {
"type": "integer",
"description": "Columns to freeze from left"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.rename_sheet",
"description": "Rename worksheet",
"parameters": {
"type": "object",
"properties": {
"old_name": {
"type": "string",
"description": "Current sheet name"
},
"new_name": {
"type": "string",
"description": "New sheet name"
}
},
"required": [
"old_name",
"new_name"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.copy_sheet",
"description": "Copy worksheet",
"parameters": {
"type": "object",
"properties": {
"source_sheet": {
"type": "string",
"description": "Source sheet name"
},
"new_sheet_name": {
"type": "string",
"description": "New sheet name (optional)"
}
},
"required": [
"source_sheet"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.reorder_sheets",
"description": "Change sheet order",
"parameters": {
"type": "object",
"properties": {
"sheet_name": {
"type": "string",
"description": "Sheet to move"
},
"position": {
"type": "integer",
"description": "New position (0-based)"
}
},
"required": [
"sheet_name",
"position"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.set_chart_legend_position",
"description": "Set chart legend position",
"parameters": {
"type": "object",
"properties": {
"position": {
"type": "string",
"description": "Legend position (top, bottom, left, right, none)"
}
},
"required": [
"position"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.set_number_format",
"description": "Apply number format to range",
"parameters": {
"type": "object",
"properties": {
"range_str": {
"type": "string",
"description": "Range (e.g. 'A1:B10')"
},
"format_type": {
"type": "string",
"description": "Format type (general, number, currency, accounting, date, time, percentage, fraction, scientific, text)"
},
"decimal_places": {
"type": "integer",
"description": "Decimal places (optional)"
}
},
"required": [
"range_str",
"format_type"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.adjust_column_width",
"description": "Adjust column width",
"parameters": {
"type": "object",
"properties": {
"columns": {
"type": "string",
"description": "Column range (e.g. 'A:C')"
},
"width": {
"type": "number",
"description": "Width in characters"
},
"autofit": {
"type": "boolean",
"description": "Autofit to content"
}
},
"required": [
"columns"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.adjust_row_height",
"description": "Adjust row height",
"parameters": {
"type": "object",
"properties": {
"rows": {
"type": "string",
"description": "Row range (e.g. '1:10')"
},
"height": {
"type": "number",
"description": "Height in points"
},
"autofit": {
"type": "boolean",
"description": "Autofit to content"
}
},
"required": [
"rows"
]
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.export_to_pdf",
"description": "Export to PDF",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "PDF save path (default: same as original)"
},
"sheets": {
"type": "array",
"description": "Sheets to include (default: all)"
},
"open_after_export": {
"type": "boolean",
"description": "Open PDF after export (default: false)"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "CalcTools.set_zoom_level",
"description": "Set worksheet zoom level",
"parameters": {
"type": "object",
"properties": {
"zoom_percentage": {
"type": "integer",
"description": "Zoom percentage (10-400)"
}
},
"required": [
"zoom_percentage"
]
}
}
}
]

View File

@ -0,0 +1,559 @@
[
{
"type": "function",
"function": {
"name": "ImpressTools.save",
"description": "Save current presentation",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.go_to_slide",
"description": "Navigate to specific slide",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
}
},
"required": ["slide_index"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.get_slide_count",
"description": "Get total slide count",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.duplicate_slide",
"description": "Duplicate slide and place at end",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index to duplicate (1-based)"
}
},
"required": ["slide_index"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_slide_font",
"description": "Set font for all text in slide",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"font_name": {
"type": "string",
"description": "Font name (e.g., 'Arial', 'Times New Roman')"
}
},
"required": ["slide_index", "font_name"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.write_text",
"description": "Write text to textbox",
"parameters": {
"type": "object",
"properties": {
"content": {
"type": "string",
"description": "Text content"
},
"page_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
},
"bold": {
"type": "boolean",
"description": "Bold text (default: false)"
},
"italic": {
"type": "boolean",
"description": "Italic text (default: false)"
},
"size": {
"type": "integer",
"description": "Font size"
},
"append": {
"type": "boolean",
"description": "Append to existing text (default: false)"
}
},
"required": ["content", "page_index", "box_index"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_style",
"description": "Set text style for textbox",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
},
"bold": {
"type": "boolean",
"description": "Bold text"
},
"italic": {
"type": "boolean",
"description": "Italic text"
},
"underline": {
"type": "boolean",
"description": "Underline text"
}
},
"required": ["slide_index", "box_index"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.configure_auto_save",
"description": "Configure auto-save settings",
"parameters": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enable auto-save"
},
"interval_minutes": {
"type": "number",
"description": "Auto-save interval in minutes (min: 1)"
}
},
"required": ["enabled", "interval_minutes"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_background_color",
"description": "Set textbox background color",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
},
"color": {
"type": "string",
"description": "Color name or hex code"
}
},
"required": ["slide_index", "box_index", "color"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_text_color",
"description": "Set text color for textbox",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
},
"color": {
"type": "string",
"description": "Color name or hex code"
}
},
"required": ["slide_index", "box_index", "color"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.delete_content",
"description": "Delete textbox from slide",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
}
},
"required": ["slide_index", "box_index"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_slide_orientation",
"description": "Set slide orientation",
"parameters": {
"type": "object",
"properties": {
"orientation": {
"type": "string",
"description": "Slide orientation",
"enum": ["portrait", "landscape"]
}
},
"required": ["orientation"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.position_box",
"description": "Position textbox or image on slide",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Box index (0-based)"
},
"position": {
"type": "string",
"description": "Position on slide",
"enum": ["left", "right", "center", "top", "bottom", "top-left", "top-right", "bottom-left", "bottom-right"]
}
},
"required": ["slide_index", "box_index", "position"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.insert_file",
"description": "Insert video or audio file",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "File path"
},
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"position": {
"type": "object",
"description": "Position coordinates",
"properties": {
"x": {
"type": "number",
"description": "X position (% of slide width)"
},
"y": {
"type": "number",
"description": "Y position (% of slide height)"
}
}
},
"size": {
"type": "object",
"description": "Size dimensions",
"properties": {
"width": {
"type": "number",
"description": "Width (% of slide width)"
},
"height": {
"type": "number",
"description": "Height (% of slide height)"
}
}
},
"autoplay": {
"type": "boolean",
"description": "Auto-play media"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_slide_background",
"description": "Set slide background color or image",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based). If not provided, applies to all slides"
},
"color": {
"type": "string",
"description": "Background color"
},
"image_path": {
"type": "string",
"description": "Background image path (overrides color)"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.save_as",
"description": "Save document to specified location",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "File save path with filename and extension"
},
"overwrite": {
"type": "boolean",
"description": "Overwrite existing file (default: false)"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.insert_image",
"description": "Insert image to slide",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"image_path": {
"type": "string",
"description": "Image file path"
},
"width": {
"type": "number",
"description": "Image width in cm"
},
"height": {
"type": "number",
"description": "Image height in cm"
},
"position": {
"type": "object",
"description": "Position coordinates",
"properties": {
"x": {
"type": "number",
"description": "X position (% of slide width)"
},
"y": {
"type": "number",
"description": "Y position (% of slide height)"
}
}
}
},
"required": ["slide_index", "image_path"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.configure_display_settings",
"description": "Configure presentation display settings",
"parameters": {
"type": "object",
"properties": {
"use_presenter_view": {
"type": "boolean",
"description": "Use presenter view"
},
"primary_monitor_only": {
"type": "boolean",
"description": "Use primary monitor only"
},
"monitor_for_presentation": {
"type": "integer",
"description": "Monitor number for presentation"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_slide_number_color",
"description": "Set slide number color",
"parameters": {
"type": "object",
"properties": {
"color": {
"type": "string",
"description": "Color name or hex code"
}
},
"required": ["color"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_text_strikethrough",
"description": "Apply strikethrough formatting to text",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
},
"line_numbers": {
"type": "array",
"items": {
"type": "integer"
},
"description": "Line numbers for strikethrough (1-based)"
},
"apply": {
"type": "boolean",
"description": "Apply or remove strikethrough"
}
},
"required": ["slide_index", "box_index", "line_numbers", "apply"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.set_textbox_alignment",
"description": "Set text alignment for textbox",
"parameters": {
"type": "object",
"properties": {
"slide_index": {
"type": "integer",
"description": "Slide index (1-based)"
},
"box_index": {
"type": "integer",
"description": "Textbox index (0-based)"
},
"alignment": {
"type": "string",
"description": "Text alignment",
"enum": ["left", "center", "right", "justify"]
}
},
"required": ["slide_index", "box_index", "alignment"]
}
}
},
{
"type": "function",
"function": {
"name": "ImpressTools.export_to_image",
"description": "Export presentation or slide to image",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Image save path with filename and extension"
},
"format": {
"type": "string",
"description": "Image format",
"enum": ["png", "jpeg", "jpg", "gif", "bmp", "tiff"]
},
"slide_index": {
"type": "integer",
"description": "Specific slide index (1-based). If not provided, exports all slides"
}
},
"required": ["file_path", "format"]
}
}
}
]

View File

@ -0,0 +1,412 @@
[
{
"type": "function",
"function": {
"name": "WriterTools.save",
"description": "Save document to current location",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.write_text",
"description": "Write text at cursor position",
"parameters": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "Text to write"
},
"bold": {
"type": "boolean",
"description": "Apply bold formatting"
},
"italic": {
"type": "boolean",
"description": "Apply italic formatting"
},
"size": {
"type": "number",
"description": "Font size"
}
},
"required": ["text"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_color",
"description": "Change text color using regex pattern",
"parameters": {
"type": "object",
"properties": {
"pattern": {
"type": "string",
"description": "Regex pattern to match"
},
"color": {
"type": "number",
"description": "Hex color code (e.g., 0x000000)"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["pattern", "color"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.find_and_replace",
"description": "Find and replace text using regex",
"parameters": {
"type": "object",
"properties": {
"pattern": {
"type": "string",
"description": "Regex pattern to find"
},
"replacement": {
"type": "string",
"description": "Replacement text"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["pattern", "replacement"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_font",
"description": "Change font family",
"parameters": {
"type": "object",
"properties": {
"font_name": {
"type": "string",
"description": "Font name (e.g., 'Arial', 'Times New Roman')"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["font_name"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_line_spacing",
"description": "Set line spacing",
"parameters": {
"type": "object",
"properties": {
"spacing_value": {
"type": "number",
"description": "Spacing value (1.0=single, 2.0=double)"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["spacing_value"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.remove_highlighting",
"description": "Remove text highlighting",
"parameters": {
"type": "object",
"properties": {
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.find_highlighted_text",
"description": "Find text with specific highlight color",
"parameters": {
"type": "object",
"properties": {
"highlight_color": {
"type": "string",
"description": "Color name (e.g., 'yellow') or hex code in '#RRGGBB' form (e.g., '#FFFF00')"
}
},
"required": ["highlight_color"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.insert_formula_at_cursor",
"description": "Insert formula at cursor",
"parameters": {
"type": "object",
"properties": {
"formula": {
"type": "string",
"description": "Formula to insert"
}
},
"required": ["formula"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.insert_image_at_cursor",
"description": "Insert image at cursor",
"parameters": {
"type": "object",
"properties": {
"image_path": {
"type": "string",
"description": "Full path to image file"
},
"width": {
"type": "integer",
"description": "Display width in pixels"
},
"height": {
"type": "integer",
"description": "Display height in pixels"
}
},
"required": ["image_path"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_strikethrough",
"description": "Apply strikethrough formatting",
"parameters": {
"type": "object",
"properties": {
"pattern": {
"type": "string",
"description": "Regex pattern to match"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["pattern"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_font_size",
"description": "Change font size",
"parameters": {
"type": "object",
"properties": {
"font_size": {
"type": "number",
"description": "Font size in points"
},
"pattern": {
"type": "string",
"description": "Regex pattern to match"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["font_size", "pattern"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.export_to_pdf",
"description": "Export document to PDF",
"parameters": {
"type": "object",
"properties": {
"output_path": {
"type": "string",
"description": "PDF save path"
},
"output_filename": {
"type": "string",
"description": "PDF filename"
},
"include_comments": {
"type": "boolean",
"description": "Include comments in PDF"
},
"quality": {
"type": "string",
"description": "Export quality ('standard', 'high', 'print')"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_paragraph_alignment",
"description": "Set paragraph alignment",
"parameters": {
"type": "object",
"properties": {
"alignment": {
"type": "string",
"description": "Alignment type ('left', 'center', 'right', 'justify')"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["alignment"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.capitalize_words",
"description": "Capitalize first letter of each word",
"parameters": {
"type": "object",
"properties": {
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.set_default_font",
"description": "Set default font for new text",
"parameters": {
"type": "object",
"properties": {
"font_name": {
"type": "string",
"description": "Default font name"
},
"font_size": {
"type": "number",
"description": "Default font size in points"
}
},
"required": ["font_name"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.add_page_numbers",
"description": "Add page numbers",
"parameters": {
"type": "object",
"properties": {
"position": {
"type": "string",
"description": "Position ('bottom_left', 'bottom_center', 'bottom_right', 'top_left', 'top_center', 'top_right')"
},
"start_number": {
"type": "integer",
"description": "Starting page number"
},
"format": {
"type": "string",
"description": "Number format (e.g., '1', 'Page 1', '1 of N')"
}
},
"required": ["position"]
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.insert_page_break",
"description": "Insert page break",
"parameters": {
"type": "object",
"properties": {
"position": {
"type": "string",
"description": "Insert location ('at_cursor', 'end_of_document')"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "WriterTools.change_text_case",
"description": "Change text case",
"parameters": {
"type": "object",
"properties": {
"case_type": {
"type": "string",
"description": "Case type ('lowercase', 'uppercase')"
},
"pattern": {
"type": "string",
"description": "Regex pattern to match"
},
"paragraph_indices": {
"type": "array",
"description": "Target paragraph indices (0-based). Applies to all if omitted"
}
},
"required": ["case_type", "pattern"]
}
}
}
]

View File

@ -0,0 +1,166 @@
[
{
"type": "function",
"function": {
"name": "VLCTools.get_playlist",
"description": "Get current playlist with track info",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.play",
"description": "Start playing current media",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.pause",
"description": "Pause current media",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.next",
"description": "Switch to next track",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.previous",
"description": "Switch to previous track",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.add_to_playlist",
"description": "Add media file to playlist",
"parameters": {
"type": "object",
"properties": {
"uri": {
"type": "string",
"description": "Media file URI (file:// or https://)"
}
},
"required": ["uri"]
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.get_current_time",
"description": "Get current playback position in seconds",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.get_media_duration",
"description": "Get media duration in seconds",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.toggle_fullscreen",
"description": "Toggle or set fullscreen mode",
"parameters": {
"type": "object",
"properties": {
"enable": {
"type": "boolean",
"description": "Force fullscreen on/off, omit to toggle"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.get_settings",
"description": "Get VLC settings",
"parameters": {
"type": "object",
"properties": {}
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.set_settings",
"description": "Set VLC settings",
"parameters": {
"type": "object",
"properties": {
"field": {
"type": "string",
"description": "Setting name (e.g. qt-max-volume, qt-minimal-view)"
},
"value": {
"type": "string",
"description": "Setting value (use 0/1 for booleans)"
}
},
"required": ["field", "value"]
}
}
},
{
"type": "function",
"function": {
"name": "VLCTools.get_media_files",
"description": "Get media files from path",
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Directory path"
},
"suffix": {
"type": "array",
"description": "File extensions, default: ['mp4','avi','mkv','mov','mp3','m4a','wav']"
}
},
"required": ["path"]
}
}
}
]

View File

@ -0,0 +1,260 @@
import json
import os
import subprocess
from pathlib import Path
class CodeTools:
    """Wrappers around the VS Code ``code`` command-line interface.

    Each action runs ``code`` with the matching flag, stores a human-readable
    outcome in the class attribute ``ret`` and returns that same string.
    """

    # Outcome of the most recent call; read back by print_result().
    ret = ""

    @classmethod
    def _run_code(cls, args, success, failure, capture=False):
        """Run ``code`` with ``args`` and record the outcome in ``cls.ret``.

        Args:
            args (list): Arguments placed after the ``code`` executable.
            success (str): Message stored when the command exits with status 0
                (ignored when ``capture`` is true).
            failure (str): Prefix of the message stored on a non-zero exit.
            capture (bool): When true, capture stdout as text and store it as
                the result instead of ``success``.

        Returns:
            str: The message stored in ``cls.ret``.
        """
        try:
            if capture:
                completed = subprocess.run(
                    ["code", *args], check=True, capture_output=True, text=True
                )
                cls.ret = completed.stdout
            else:
                # Output is deliberately not captured so it reaches the caller's stdout.
                subprocess.run(["code", *args], check=True)
                cls.ret = success
        except subprocess.CalledProcessError as e:
            cls.ret = f"{failure}: {e}"
        except Exception as e:
            # Anything else subprocess.run raises (e.g. the executable cannot be started).
            cls.ret = f"Unexpected error: {e}"
        return cls.ret

    @classmethod
    def print_result(cls):
        """Print the stored result of the most recent call."""
        print(cls.ret)

    @classmethod
    def launch_vscode(cls, path):
        """Open a file or directory in an existing VS Code window (``code -r``).

        Args:
            path (str): File or directory path.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["-r", path],
            "Successfully launched VS Code",
            "Error launching VS Code",
        )

    @classmethod
    def env_info(cls):
        """Store and return the environment summary, which is the literal "None".

        Returns:
            str: The stored value, for consistency with the other actions.
        """
        cls.ret = "None"
        return cls.ret

    @classmethod
    def compare_files(cls, file1, file2):
        """Open two files in VS Code's diff view (``code -d``).

        Args:
            file1 (str): Path of the first file.
            file2 (str): Path of the second file.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["-d", file1, file2],
            "The compared files are opened in VSCode",
            "Error comparing files",
        )

    @classmethod
    def add_folder(cls, folder):
        """Add a folder to the last active VS Code window (``code -a``).

        Args:
            folder (str): Folder path.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["-a", folder],
            "Successfully added folder",
            "Error adding folder",
        )

    @classmethod
    def goto_file(cls, file_path, line=1, character=1):
        """Open a file at a given line and character position (``code -g``).

        Args:
            file_path (str): File path.
            line (int): Line number.
            character (int): Character position within the line.

        Returns:
            str: Outcome message.
        """
        location = f"{file_path}:{line}:{character}"
        return cls._run_code(
            ["-g", location],
            f"Successfully opened file, line: {line}, character: {character}",
            "Error going to file",
        )

    @classmethod
    def perform_merge(cls, path1, path2, base, result):
        """Perform a three-way merge (``code -m``).

        Args:
            path1 (str): Path of the first version.
            path2 (str): Path of the second version.
            base (str): Path of the base version.
            result (str): Path where the merge result is saved.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["-m", path1, path2, base, result],
            "Successfully performed merge",
            "Error performing merge",
        )

    @classmethod
    def remove_folder(cls, folder):
        """Remove a folder from the last active VS Code window (``code --remove``).

        Args:
            folder (str): Folder path.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["--remove", folder],
            "Successfully removed folder",
            "Error removing folder",
        )

    @classmethod
    def install_extension(cls, extension_id, pre_release=False):
        """Install or update a VS Code extension.

        Args:
            extension_id (str): Extension identifier.
            pre_release (bool): Install the pre-release version when true.

        Returns:
            str: Outcome message.
        """
        args = ["--install-extension", extension_id]
        if pre_release:
            args.append("--pre-release")
        return cls._run_code(
            args,
            "Successfully installed extension",
            "Error installing extension",
        )

    @classmethod
    def uninstall_extension(cls, extension_id):
        """Uninstall a VS Code extension.

        Args:
            extension_id (str): Extension identifier.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["--uninstall-extension", extension_id],
            "Successfully uninstalled extension",
            "Error uninstalling extension",
        )

    @classmethod
    def list_extensions(cls, show_versions=False, category=None):
        """List the installed VS Code extensions.

        Args:
            show_versions (bool): Include extension versions when true.
            category (str, optional): Only list extensions of this category.

        Returns:
            str: The command's standard output, or an error message.
        """
        args = ["--list-extensions"]
        if show_versions:
            args.append("--show-versions")
        if category:
            args.extend(["--category", category])
        return cls._run_code(args, None, "Error listing extensions", capture=True)

    @classmethod
    def update_extensions(cls):
        """Update all installed VS Code extensions to their latest version.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["--update-extensions"],
            "Successfully updated extensions",
            "Error updating extensions",
        )

    @classmethod
    def disable_extension(cls, extension_id):
        """Disable an extension for the next VS Code instance.

        Args:
            extension_id (str): Extension identifier.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["--disable-extension", extension_id],
            "Successfully disabled extension",
            "Error disabling extension",
        )

    @classmethod
    def toggle_sync(cls, state):
        """Turn VS Code settings sync on or off (``code --sync``).

        Args:
            state (str): 'on' or 'off'.

        Returns:
            str: Outcome message.
        """
        return cls._run_code(
            ["--sync", state],
            "Successfully toggled sync",
            "Error toggling sync",
        )

View File

@ -0,0 +1,107 @@
class BrowserTools:
    """Chrome helper actions.

    Methods return either an ``OPEN_CHROME_TAB`` action dictionary or a
    one-line Python snippet (as a string) that presses a keyboard shortcut
    through ``pyautogui``. This class does not execute either of them itself.
    """

    # Result of the most recent call; read back by print_result().
    ret = ""

    @staticmethod
    def _open_tab(url):
        """Build the action dictionary that opens ``url`` in a Chrome tab."""
        return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": [url]}}

    @classmethod
    def print_result(cls):
        """Print the stored result of the most recent call."""
        print(cls.ret)

    @classmethod
    def env_info(cls):
        """Store and return the environment summary, which is the literal "None"."""
        cls.ret = "None"
        return cls.ret

    @classmethod
    def open_profile_settings(cls):
        """
        Open the profile settings page in the browser.
        """
        return cls._open_tab("chrome://settings/people")

    @classmethod
    def open_password_settings(cls):
        """
        Open the password settings page in the browser.
        """
        return cls._open_tab("chrome://settings/autofill")

    @classmethod
    def open_privacy_settings(cls):
        """
        Open the privacy settings page in the browser.
        """
        return cls._open_tab("chrome://settings/privacy")

    @classmethod
    def open_appearance_settings(cls):
        """
        Open the appearance settings page in the browser.
        """
        return cls._open_tab("chrome://settings/appearance")

    @classmethod
    def open_search_engine_settings(cls):
        """
        Open the search engine settings page in the browser.
        """
        return cls._open_tab("chrome://settings/search")

    @classmethod
    def bring_back_last_tab(cls):
        """
        Bring back the last tab in the browser.
        """
        return "import pyautogui; pyautogui.hotkey('ctrl', 'shift', 't'); print('Brought back last tab')"

    @classmethod
    def print(cls):
        """
        Open the print option in current page.
        """
        return "import pyautogui; pyautogui.hotkey('ctrl', 'p'); print('Opened print option')"

    @classmethod
    def delete_browsing_data(cls):
        """
        Delete browsing data in the browser.

        The returned snippet only presses Ctrl+Shift+Del.
        """
        return "import pyautogui; pyautogui.hotkey('ctrl', 'shift', 'del'); print('Deleted browsing data')"

    @classmethod
    def open_extensions(cls):
        """
        Open the extensions page in the browser.
        """
        return cls._open_tab("chrome://extensions")

    @classmethod
    def bookmark_page(cls):
        """
        Bookmark the current page in the browser.
        """
        return "import pyautogui; pyautogui.hotkey('ctrl', 'd'); print('Bookmarked page')"

    @classmethod
    def open_bookmarks(cls):
        """
        Open the bookmarks page in the browser.
        """
        return cls._open_tab("chrome://bookmarks")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,753 @@
import os
import re
import uno
from com.sun.star.awt.FontSlant import ITALIC, NONE, OBLIQUE
from com.sun.star.awt.FontWeight import BOLD, NORMAL
from com.sun.star.beans import PropertyValue
from com.sun.star.style.ParagraphAdjust import CENTER, LEFT, RIGHT
from com.sun.star.text.ControlCharacter import PARAGRAPH_BREAK
from com.sun.star.text.TextContentAnchorType import AS_CHARACTER
class WriterTools:
# Helpers that operate on the current Writer document through UNO.
# NOTE: the statements below are class-body statements, so they run once when
# the class is defined; the bridge connection is attempted at that point.
# Resolve a remote component context over the socket bridge on localhost:2002.
localContext = uno.getComponentContext()
resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
ctx = resolver.resolve("uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext")
# Desktop service of the remote context and the component it reports as current.
desktop = ctx.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
# Text body of that document and a text cursor created on it; the cursor is
# the insertion point used by write_text and the insert_* helpers.
text = doc.Text
cursor = text.createTextCursor()
# Result of the most recent tool call; printed by print_result().
ret = ""
@classmethod
def close_other_window(cls):
    """Close every open document whose URL differs from the current document's."""
    keep_url = cls.doc.getURL()
    open_docs = cls.desktop.getComponents().createEnumeration()
    while open_docs.hasMoreElements():
        candidate = open_docs.nextElement()
        if candidate.getURL() == keep_url:
            # Same URL as the current document: leave it open.
            continue
        candidate.close(True)
@classmethod
def save(cls):
    """Save the document to its current location.

    The outcome is also recorded in ``cls.ret`` so that ``print_result``
    reports why a save failed instead of showing a stale message.

    Returns:
        bool: True when the document was stored, False when it has no
        location yet or storing raised an error.
    """
    try:
        if not cls.doc.hasLocation():
            cls.ret = "Error: document has no save location; use 'Save As' instead"
            return False
        cls.doc.store()
        cls.ret = "Success"
        return True
    except Exception as e:
        cls.ret = f"Error: {str(e)}"
        return False
@classmethod
def maximize_window(cls):
    """Resize the document window to fill the toolkit's work area.

    The position and size come from ``getWorkArea()`` on the window's
    toolkit rather than from the full screen size.
    """
    window = cls.doc.getCurrentController().getFrame().getContainerWindow()
    workarea = window.getToolkit().getWorkArea()
    # 15 == com.sun.star.awt.PosSize.POSSIZE: apply X, Y, width and height together.
    window.setPosSize(workarea.X, workarea.Y, workarea.Width, workarea.Height, 15)
@classmethod
def print_result(cls):
    """Print the result stored by the most recent tool call."""
    latest = cls.ret
    print(latest)
@classmethod
def write_text(cls, text, bold=False, italic=False, size=None):
    """Insert ``text`` at the class cursor with the requested character format.

    Args:
        text (str): Text to insert.
        bold (bool): Use character weight 150 when true, otherwise 100.
        italic (bool): Use the ITALIC posture when true, otherwise NONE.
        size (number, optional): Character height; left unchanged when falsy.
    """
    cur = cls.cursor
    if bold:
        cur.CharWeight = 150
    else:
        cur.CharWeight = 100
    if italic:
        cur.CharPosture = ITALIC
    else:
        cur.CharPosture = NONE
    if size:
        cur.CharHeight = size
    # False: insert at the cursor instead of replacing its content.
    cls.text.insertString(cur, text, False)
    cls.ret = "Success"
@classmethod
def get_paragraphs(cls, start_index=0, count=None):
    """Return the text of the document's paragraphs as a list of strings.

    Only enumerated elements that support the
    ``com.sun.star.text.Paragraph`` service are included.

    Args:
        start_index (int): Index of the first paragraph to return (0-based).
            Negative values are treated as 0.
        count (int, optional): Maximum number of paragraphs to return.
            ``None`` returns everything from ``start_index`` on; negative
            values are treated as 0.

    Returns:
        list: The selected paragraph strings (also stored in ``cls.ret``).
        Empty when ``start_index`` is past the last paragraph.
    """
    enumeration = cls.doc.getText().createEnumeration()
    paragraph_list = []
    while enumeration.hasMoreElements():
        element = enumeration.nextElement()
        if element.supportsService("com.sun.star.text.Paragraph"):
            paragraph_list.append(element.getString())
    start_index = max(start_index, 0)
    if count is None:
        cls.ret = paragraph_list[start_index:]
    else:
        # Slicing clamps the upper bound, so no explicit min() against the length is needed.
        cls.ret = paragraph_list[start_index:start_index + max(count, 0)]
    return cls.ret
@classmethod
def env_info(cls):
    """Summarize the document as one ``Paragraph <index>: <text>`` line per paragraph.

    Paragraphs longer than 500 characters are cut to 500 characters plus
    "..."; each entry is stripped of surrounding whitespace.

    Returns:
        str: The summary, also stored in ``cls.ret``.
    """
    lines = []
    for position, content in enumerate(cls.get_paragraphs()):
        if len(content) > 500:
            content = content[:500] + "..."
        lines.append("Paragraph " + str(position) + ": " + content.strip() + "\n")
    cls.ret = "".join(lines)
    return cls.ret
@classmethod
def set_color(cls, pattern, color, paragraph_indices=None):
    """
    Changes the color of matched text in the document for specified paragraphs.

    Args:
        pattern (str): Regular expression pattern to match text
        color (int): Hex color code (e.g., 0x000000 for black)
        paragraph_indices (list, optional): List of paragraph indices to modify (0-based).
            If None or empty, applies to all paragraphs. Indices count only real
            paragraphs, matching the numbering produced by get_paragraphs().

    Returns:
        bool: True on success, False when an error occurred (details in ``cls.ret``).
    """
    try:
        enum = cls.doc.Text.createEnumeration()
        paragraphs = []
        while enum.hasMoreElements():
            element = enum.nextElement()
            # Skip non-paragraph content so it does not shift the indices.
            if element.supportsService("com.sun.star.text.Paragraph"):
                paragraphs.append(element)
        if not paragraph_indices:
            paragraphs_to_process = range(len(paragraphs))
        else:
            paragraphs_to_process = paragraph_indices
        regex = re.compile(pattern)
        for idx in paragraphs_to_process:
            if idx < 0 or idx >= len(paragraphs):
                continue
            paragraph = paragraphs[idx]
            for match in regex.finditer(paragraph.getString()):
                # Walk from the paragraph start to the match, then select the match.
                para_cursor = cls.text.createTextCursorByRange(paragraph.getStart())
                para_cursor.goRight(match.start(), False)
                para_cursor.goRight(match.end() - match.start(), True)
                para_cursor.CharColor = color
        cls.ret = "Success"
        return True
    except Exception as e:
        cls.ret = f"Error: {str(e)}"
        return False
@classmethod
def find_and_replace(cls, pattern, replacement, paragraph_indices=None):
    """
    Finds all occurrences of a specified text pattern and replaces them with another text in the document.

    Each changed paragraph is rewritten as a whole with ``setString``.

    Args:
        pattern (str): The pattern to match in the document, should be a regular expression
        replacement (str): The text to replace the found text with
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            If None or empty, applies to all paragraphs. Indices count only real
            paragraphs, matching the numbering produced by get_paragraphs().

    Returns:
        str: Success message with number of replacements made, or an error message
    """
    try:
        enum = cls.doc.Text.createEnumeration()
        paragraphs = []
        while enum.hasMoreElements():
            element = enum.nextElement()
            # Skip non-paragraph content so it does not shift the indices.
            if element.supportsService("com.sun.star.text.Paragraph"):
                paragraphs.append(element)
        if not paragraph_indices:
            paragraphs_to_process = range(len(paragraphs))
        else:
            paragraphs_to_process = [i for i in paragraph_indices if 0 <= i < len(paragraphs)]
        regex = re.compile(pattern)
        total_replacements = 0
        for idx in paragraphs_to_process:
            paragraph = paragraphs[idx]
            new_text, count = regex.subn(replacement, paragraph.getString())
            if count > 0:
                paragraph.setString(new_text)
                total_replacements += count
        cls.ret = f"Successfully made {total_replacements} replacements"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error during find and replace: {str(e)}"
        return cls.ret
@classmethod
def set_font(cls, font_name, paragraph_indices=None):
    """
    Changes the font of text in the document or specified paragraphs.

    Args:
        font_name (str): The name of the font to apply (e.g., 'Times New Roman', 'Arial', 'Calibri')
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            If not provided, applies to all paragraphs. Indices count only real
            paragraphs, matching the numbering produced by get_paragraphs().

    Returns:
        bool: True on success, False when an error occurred (details in ``cls.ret``).
    """
    try:
        text = cls.doc.getText()
        enum = text.createEnumeration()
        paragraphs = []
        while enum.hasMoreElements():
            element = enum.nextElement()
            # Skip non-paragraph content so it does not shift the indices.
            if element.supportsService("com.sun.star.text.Paragraph"):
                paragraphs.append(element)
        if not paragraph_indices:
            paragraph_indices = range(len(paragraphs))
        for idx in paragraph_indices:
            if 0 <= idx < len(paragraphs):
                cursor = text.createTextCursorByRange(paragraphs[idx])
                cursor.CharFontName = font_name
        cls.ret = "Success"
        return True
    except Exception as e:
        cls.ret = f"Error: {str(e)}"
        return False
@classmethod
def set_line_spacing(cls, spacing_value, paragraph_indices=None):
    """
    Sets the line spacing for specified paragraphs in the document.

    Args:
        spacing_value (float): The line spacing value to apply (1.0 for single spacing, 2.0 for double spacing, etc.)
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            If not provided, applies to all paragraphs. Every paragraph, empty
            or not, advances the index, so the numbering matches get_paragraphs().

    Returns:
        bool: True on success, False when an error occurred (details in ``cls.ret``).
    """
    try:
        paragraph_enum = cls.doc.getText().createEnumeration()
        # Proportional spacing is expressed in percent: 1.0 -> 100, 2.0 -> 200.
        line_spacing_value = int(spacing_value * 100)
        current_index = 0
        while paragraph_enum.hasMoreElements():
            element = paragraph_enum.nextElement()
            if not element.supportsService("com.sun.star.text.Paragraph"):
                # Non-paragraph content is neither modified nor counted.
                continue
            if not paragraph_indices or current_index in paragraph_indices:
                line_spacing = uno.createUnoStruct("com.sun.star.style.LineSpacing")
                line_spacing.Mode = 0  # com.sun.star.style.LineSpacingMode.PROP
                line_spacing.Height = line_spacing_value
                element.ParaLineSpacing = line_spacing
            current_index += 1
        cls.ret = "Success"
        return True
    except Exception as e:
        cls.ret = f"Error: {str(e)}"
        return False
@classmethod
def remove_highlighting(cls, paragraph_indices=None):
    """
    Removes ALL highlighting from text in the document for specified paragraphs.

    Args:
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            If not provided, applies to all paragraphs. Indices count only real
            paragraphs, matching the numbering produced by get_paragraphs().

    Returns:
        str: Success message or error message
    """
    try:
        text = cls.doc.getText()
        paragraphs = text.createEnumeration()
        target_indices = set(paragraph_indices) if paragraph_indices else None
        current_index = 0
        while paragraphs.hasMoreElements():
            paragraph = paragraphs.nextElement()
            if not paragraph.supportsService("com.sun.star.text.Paragraph"):
                # Non-paragraph content is neither modified nor counted.
                continue
            if target_indices is None or current_index in target_indices:
                # Clear the background color on the whole paragraph (-1 = no color).
                para_cursor = text.createTextCursorByRange(paragraph)
                para_cursor.CharBackColor = -1
                # Then clear it on each text portion individually as well.
                text_portions = paragraph.createEnumeration()
                while text_portions.hasMoreElements():
                    text_portion = text_portions.nextElement()
                    if hasattr(text_portion, "CharBackColor"):
                        portion_cursor = text.createTextCursorByRange(text_portion)
                        portion_cursor.CharBackColor = -1
            current_index += 1
        cls.ret = "Successfully removed all highlighting"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error removing highlighting: {str(e)}"
        return cls.ret
@classmethod
def find_highlighted_text(cls, highlight_color):
    """
    Collects every text portion whose background color equals the requested highlight color.

    Args:
        highlight_color (str): A supported color name (e.g., 'yellow', 'green';
            case-insensitive) or a '#RRGGBB' hex code.

    Returns:
        list: The matching non-blank text portions. Empty when the color is
        not recognized; ``cls.ret`` then holds the reason.
    """
    named_colors = {
        "yellow": 0xFFFF00,
        "green": 0x00FF00,
        "blue": 0x0000FF,
        "red": 0xFF0000,
        "cyan": 0x00FFFF,
        "magenta": 0xFF00FF,
        "black": 0x000000,
        "white": 0xFFFFFF,
        "gray": 0x808080,
        "lightgray": 0xC0C0C0,
    }
    key = highlight_color.lower()
    if key in named_colors:
        wanted = named_colors[key]
    elif highlight_color.startswith("#") and len(highlight_color) == 7:
        digits = highlight_color[1:]
        try:
            red, green, blue = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
        except ValueError:
            cls.ret = f"Invalid hex color format: {highlight_color}"
            return []
        wanted = (red << 16) + (green << 8) + blue
    else:
        cls.ret = f"Unsupported color format: {highlight_color}"
        return []

    found = []
    paragraph_enum = cls.doc.getText().createEnumeration()
    while paragraph_enum.hasMoreElements():
        element = paragraph_enum.nextElement()
        if not element.supportsService("com.sun.star.text.Paragraph"):
            continue
        portion_enum = element.createEnumeration()
        while portion_enum.hasMoreElements():
            portion = portion_enum.nextElement()
            if not hasattr(portion, "CharBackColor"):
                continue
            if portion.CharBackColor != wanted:
                continue
            # Whitespace-only portions are not reported.
            if portion.getString().strip():
                found.append(portion.getString())
    cls.ret = f"Found {len(found)} text segments with highlight color {highlight_color}"
    return found
@classmethod
def insert_formula_at_cursor(cls, formula):
    """
    Embeds a formula object at the class cursor and assigns ``formula`` to it.

    Args:
        formula (str): The formula to insert at the current cursor position.

    Returns:
        bool: True if successful, False otherwise (details in ``cls.ret``)
    """
    try:
        holder = cls.doc.createInstance("com.sun.star.text.TextEmbeddedObject")
        # CLSID of the embedded object type to create, anchored as a character.
        holder.setPropertyValue("CLSID", "078B7ABA-54FC-457F-8551-6147e776a997")
        holder.setPropertyValue("AnchorType", AS_CHARACTER)
        cls.text.insertTextContent(cls.cursor, holder, False)
        # The embedded model only exists after insertion, so set the formula last.
        holder.getEmbeddedObject().Formula = formula
    except Exception as e:
        cls.ret = f"Error inserting formula: {str(e)}"
        return False
    cls.ret = "Formula inserted successfully"
    return True
@classmethod
def insert_image_at_cursor(cls, image_path, width=None, height=None):
    """
    Inserts an image at the current cursor position in the document.

    Args:
        image_path (str): Full path to the image file to insert; a leading '~' is expanded
        width (int, optional): Width to display the image in pixels
        height (int, optional): Height to display the image in pixels

    Returns:
        str: Success message or error message
    """
    try:
        image_path = os.path.expanduser(image_path)
        if not os.path.exists(image_path):
            cls.ret = f"Error: Image file not found at {image_path}"
            return cls.ret
        # Let UNO build the file URL so spaces, '#', '%' and platform-specific
        # separators are encoded correctly.
        file_url = uno.systemPathToFileUrl(os.path.abspath(image_path))
        graphic = cls.doc.createInstance("com.sun.star.text.GraphicObject")
        graphic.GraphicURL = file_url
        graphic.AnchorType = AS_CHARACTER
        # NOTE(review): the requested size is scaled by 100 before being assigned;
        # confirm the intended unit conversion.
        if width is not None:
            graphic.Width = width * 100
        if height is not None:
            graphic.Height = height * 100
        cls.text.insertTextContent(cls.cursor, graphic, False)
        cls.ret = "Success: Image inserted"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error: {str(e)}"
        return cls.ret
@classmethod
def set_strikethrough(cls, pattern, paragraph_indices=None):
    """
    Applies strikethrough to every match of ``pattern`` in the selected paragraphs.

    Args:
        pattern (str): The regular expression pattern to match in the document
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing,
            counting only real paragraphs). If not provided, applies to all paragraphs.

    Returns:
        str: Success message with the number of matches, or error information
    """
    try:
        doc_text = cls.doc.getText()
        enumeration = doc_text.createEnumeration()
        position = 0
        hits = 0
        while enumeration.hasMoreElements():
            element = enumeration.nextElement()
            if not element.supportsService("com.sun.star.text.Paragraph"):
                # Non-paragraph content is not counted.
                continue
            selected = not paragraph_indices or position in paragraph_indices
            position += 1
            if not selected:
                continue
            content = element.getString()
            for hit in re.finditer(pattern, content):
                # Walk from the paragraph start to the match, then select the match.
                marker = doc_text.createTextCursorByRange(element.getStart())
                marker.goRight(hit.start(), False)
                marker.goRight(hit.end() - hit.start(), True)
                marker.CharStrikeout = 1
                hits += 1
        cls.ret = f"Successfully applied strikethrough to {hits} matches of pattern: {pattern}"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error applying strikethrough: {str(e)}"
        return cls.ret
@classmethod
def set_font_size(cls, font_size, pattern, paragraph_indices=None):
    """
    Change the font size of all text matching ``pattern`` in the document.

    Args:
        font_size (float): The font size to apply (in points).
        pattern (str): The pattern to match in the document, should be a regular expression.
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            Only real paragraphs are counted (other enumerated elements are
            skipped), matching the other paragraph-indexed methods of this class.
            If not provided, applies to all paragraphs.

    Returns:
        str: Result message indicating success or failure (also stored in ``cls.ret``).
    """
    try:
        regex = re.compile(pattern)
        paragraphs = cls.doc.getText().createEnumeration()
        current_index = -1  # becomes 0 for the first real paragraph
        while paragraphs.hasMoreElements():
            paragraph = paragraphs.nextElement()
            if not paragraph.supportsService("com.sun.star.text.Paragraph"):
                continue
            current_index += 1
            if paragraph_indices and current_index not in paragraph_indices:
                continue
            para_text = paragraph.getString()
            for match in regex.finditer(para_text):
                # Anchor each selection at the start of *this paragraph*.
                # (A text cursor's gotoStart() jumps to the start of the whole
                # text, which would apply the offsets to the wrong place for
                # every paragraph except the first.)
                match_cursor = cls.text.createTextCursorByRange(paragraph.getStart())
                match_cursor.goRight(match.start(), False)
                match_cursor.goRight(match.end() - match.start(), True)
                match_cursor.CharHeight = font_size
        cls.ret = f"Successfully changed font size to {font_size} for text matching '{pattern}'"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error changing font size: {str(e)}"
        return cls.ret
@classmethod
def export_to_pdf(cls, output_path=None, output_filename=None, include_comments=False, quality="standard"):
    """
    Export the current document to PDF format.

    Args:
        output_path (str, optional): Directory where the PDF should be saved.
            For convenience, a full file path ending in ``.pdf`` is also
            accepted when ``output_filename`` is not given.
            If not provided, uses the same location as the original document.
        output_filename (str, optional): The filename to use for the PDF.
            If not provided, uses the original document's filename with .pdf extension.
            A missing ``.pdf`` extension is appended.
        include_comments (bool, optional): Whether to include comments in the exported PDF.
            Defaults to False.
        quality (str, optional): The quality of the PDF export ('standard', 'high', 'print').
            Defaults to 'standard'.

    Returns:
        str: Path to the exported PDF file or error message. ``cls.ret`` always
            holds a human-readable status message.
    """
    try:
        doc_url = cls.doc.getURL()
        if not doc_url and not output_path:
            cls.ret = "Error: Document has not been saved and no output path provided"
            return cls.ret
        if doc_url:
            # Decode the URL *before* splitting so that escaped characters
            # (e.g. "%20") do not leak into the generated PDF file name.
            doc_path, doc_filename = os.path.split(uno.fileUrlToSystemPath(doc_url))
            doc_name = os.path.splitext(doc_filename)[0]
        else:
            doc_path = ""
            doc_name = "export"
        if (
            output_path
            and not output_filename
            and output_path.lower().endswith(".pdf")
            and not os.path.isdir(output_path)
        ):
            # Caller passed a complete file path; split it instead of
            # producing ".../name.pdf/<doc>.pdf".
            output_path, output_filename = os.path.split(output_path)
        final_path = output_path if output_path else doc_path
        final_filename = output_filename if output_filename else f"{doc_name}.pdf"
        if not final_filename.lower().endswith(".pdf"):
            final_filename += ".pdf"
        full_output_path = os.path.join(final_path, final_filename)
        output_url = uno.systemPathToFileUrl(full_output_path)
        # NOTE(review): SelectPdfVersion / ExportNotes are passed as top-level
        # store properties here; PDF export options are usually supplied via a
        # "FilterData" property -- confirm these values actually take effect.
        pdf_version = {"high": 1, "print": 2}.get(quality, 0)
        export_props = (
            PropertyValue(Name="SelectPdfVersion", Value=pdf_version),
            PropertyValue(Name="ExportNotes", Value=include_comments),
            PropertyValue(Name="FilterName", Value="writer_pdf_Export"),
            PropertyValue(Name="Overwrite", Value=True),
        )
        cls.doc.storeToURL(output_url, export_props)
        cls.ret = f"PDF exported to: {full_output_path}"
        return full_output_path
    except Exception as e:
        cls.ret = f"Error exporting to PDF: {str(e)}"
        return cls.ret
@classmethod
def set_paragraph_alignment(cls, alignment, paragraph_indices=None):
    """
    Set the text alignment for specified paragraphs in the document.

    Args:
        alignment (str): The alignment to apply ('left', 'center', 'right', 'justify').
            Matching is case-insensitive.
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            Out-of-range indices are ignored and reported in the result message.
            If not provided, applies to all paragraphs.

    Returns:
        str: Success message or error message (also stored in ``cls.ret``).
    """
    try:
        alignment_map = {
            "left": LEFT,
            "center": CENTER,
            "right": RIGHT,
            # Justified text is ParagraphAdjust.BLOCK. The previous literal 3
            # is the numeric value of CENTER, so "justify" centered the text.
            "justify": uno.Enum("com.sun.star.style.ParagraphAdjust", "BLOCK"),
        }
        alignment_key = alignment.lower()
        if alignment_key not in alignment_map:
            cls.ret = f"Error: Invalid alignment '{alignment}'. Use 'left', 'center', 'right', or 'justify'."
            return cls.ret
        alignment_value = alignment_map[alignment_key]
        paragraph_enum = cls.doc.getText().createEnumeration()
        paragraphs = []
        while paragraph_enum.hasMoreElements():
            paragraph = paragraph_enum.nextElement()
            if paragraph.supportsService("com.sun.star.text.Paragraph"):
                paragraphs.append(paragraph)
        warning = ""
        if paragraph_indices:
            valid_indices = [i for i in paragraph_indices if 0 <= i < len(paragraphs)]
            if len(valid_indices) != len(paragraph_indices):
                # Keep the warning so it is not lost when the success message is set.
                warning = f" (Warning: Some paragraph indices were out of range (0-{len(paragraphs) - 1}))"
            targets = [paragraphs[idx] for idx in valid_indices]
        else:
            targets = paragraphs
        for paragraph in targets:
            paragraph.ParaAdjust = alignment_value
        cls.ret = f"Successfully applied '{alignment}' alignment to paragraphs" + warning
        return cls.ret
    except Exception as e:
        cls.ret = f"Error setting paragraph alignment: {str(e)}"
        return cls.ret
@classmethod
def capitalize_words(cls, paragraph_indices=None):
    """
    Capitalize every space-separated word in the selected paragraphs.

    Each word is passed through ``str.capitalize`` and the paragraph text is
    replaced as a whole. Paragraphs containing only whitespace are left alone.

    Args:
        paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
            If not provided, applies to all paragraphs.

    Returns:
        str: Success message or error message (also stored in ``cls.ret``).
    """
    try:
        body = cls.doc.getText()
        para_enum = body.createEnumeration()
        all_paragraphs = []
        while para_enum.hasMoreElements():
            candidate = para_enum.nextElement()
            if candidate.supportsService("com.sun.star.text.Paragraph"):
                all_paragraphs.append(candidate)
        total = len(all_paragraphs)
        requested = paragraph_indices if paragraph_indices else list(range(total))
        valid_indices = [idx for idx in requested if 0 <= idx < total]
        for idx in valid_indices:
            para = all_paragraphs[idx]
            original_text = para.getString()
            if not original_text.strip():
                continue
            # Splitting on a single space keeps runs of spaces intact after re-joining.
            new_text = " ".join(map(str.capitalize, original_text.split(" ")))
            selection = body.createTextCursorByRange(para.getStart())
            selection.gotoRange(para.getEnd(), True)
            selection.setString(new_text)
        cls.ret = f"Successfully capitalized words in {len(valid_indices)} paragraphs"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error capitalizing words: {str(e)}"
        return cls.ret
@classmethod
def set_default_font(cls, font_name, font_size=None):
    """
    Set the font (and optionally size) on the default paragraph style and on ``cls.cursor``.

    The first existing style among "Default", "Standard" and "Normal" is used;
    if none exists, the first paragraph style of the document is used instead.

    Args:
        font_name (str): The name of the font to set as default (e.g., 'Times New Roman', 'Arial', 'Calibri')
        font_size (float, optional): The default font size in points.

    Returns:
        str: Success message or error message (also stored in ``cls.ret``).
    """
    name_props = ("CharFontName", "CharFontNameAsian", "CharFontNameComplex")
    height_props = ("CharHeight", "CharHeightAsian", "CharHeightComplex")

    def apply_font(target):
        # Western, Asian and complex-script variants are all kept in sync.
        for prop in name_props:
            target.setPropertyValue(prop, font_name)
        if font_size is not None:
            for prop in height_props:
                target.setPropertyValue(prop, float(font_size))

    try:
        paragraph_styles = cls.doc.getStyleFamilies().getByName("ParagraphStyles")
        chosen_name = next(
            (candidate for candidate in ("Default", "Standard", "Normal") if paragraph_styles.hasByName(candidate)),
            None,
        )
        if chosen_name is not None:
            standard_style = paragraph_styles.getByName(chosen_name)
        else:
            style_names = paragraph_styles.getElementNames()
            if not style_names:
                raise Exception("Could not find default paragraph style")
            standard_style = paragraph_styles.getByName(style_names[0])
        apply_font(standard_style)
        apply_font(cls.cursor)
        size_suffix = f" with size {font_size}pt" if font_size else ""
        cls.ret = f"Default font set to '{font_name}'" + size_suffix
        return cls.ret
    except Exception as e:
        cls.ret = f"Error setting default font: {str(e)}"
        return cls.ret
@classmethod
def add_page_numbers(cls, position, start_number=1, format=None):
    """
    Add page numbers to the header or footer of the "Standard" page style.

    Any existing content of the chosen header/footer is replaced.

    Args:
        position (str): Position of the page numbers ('bottom_left', 'bottom_center', 'bottom_right',
            'top_left', 'top_center', 'top_right'). Positions starting with
            'top' use the header; everything else uses the footer.
        start_number (int, optional): The starting page number. Defaults to 1.
            Applied on a best-effort basis; a failure is reported in the
            result message when a non-default start number was requested.
        format (str, optional): Format of the page numbers (e.g., '1', 'Page 1', '1 of N').
            A format containing "Page" gets a "Page " prefix, a format
            containing "of" gets a " of <page count>" suffix.
            Defaults to simple number format.

    Returns:
        str: Success message or error message (also stored in ``cls.ret``).
    """
    try:
        page_styles = cls.doc.StyleFamilies.getByName("PageStyles")
        default_style = page_styles.getByName("Standard")
        offset_error = None
        try:
            default_style.setPropertyValue("PageNumberOffset", start_number)
        except Exception as offset_exc:
            # Best effort: keep going, but remember the failure instead of
            # silently discarding it (and do not swallow KeyboardInterrupt).
            offset_error = offset_exc
        if position.startswith("top"):
            default_style.HeaderIsOn = True
            target = default_style.HeaderText
        else:
            default_style.FooterIsOn = True
            target = default_style.FooterText
        cursor = target.createTextCursor()
        # Clear whatever the header/footer currently contains.
        cursor.gotoStart(False)
        cursor.gotoEnd(True)
        cursor.setString("")
        cursor.gotoStart(False)
        if position.endswith("_left"):
            cursor.ParaAdjust = LEFT
        elif position.endswith("_center"):
            cursor.ParaAdjust = CENTER
        elif position.endswith("_right"):
            cursor.ParaAdjust = RIGHT

        def insert_field(service_name):
            # Insert a text field at the cursor; NumberingType 4 is what the
            # original implementation used for every field.
            field = cls.doc.createInstance(service_name)
            field.NumberingType = 4
            target.insertTextContent(cursor, field, False)

        fmt = format or ""
        if "Page" in fmt:
            target.insertString(cursor, "Page ", False)
        insert_field("com.sun.star.text.TextField.PageNumber")
        if "of" in fmt:
            target.insertString(cursor, " of ", False)
            insert_field("com.sun.star.text.TextField.PageCount")
        cls.ret = "Successfully added page numbers"
        if offset_error is not None and start_number != 1:
            cls.ret += f" (warning: start number {start_number} could not be applied: {offset_error})"
        return cls.ret
    except Exception as e:
        cls.ret = f"Error adding page numbers: {str(e)}"
        return cls.ret
@classmethod
def insert_page_break(cls, position="at_cursor"):
    """
    Insert a page break at the specified position.

    A paragraph break is inserted at ``cls.cursor`` and the new paragraph is
    marked to start on a new page.

    Args:
        position (str): Where to insert the page break: 'at_cursor' for current cursor position,
            'end_of_document' for end of document. Defaults to 'at_cursor'.

    Returns:
        bool: True if successful, False otherwise. A status message is stored
            in ``cls.ret`` in both cases.
    """
    jump_to_end = position == "end_of_document"
    try:
        writer_cursor = cls.cursor
        if jump_to_end:
            writer_cursor.gotoEnd(False)
        cls.text.insertControlCharacter(writer_cursor, PARAGRAPH_BREAK, False)
        writer_cursor.gotoStartOfParagraph(True)
        page_before = uno.Enum("com.sun.star.style.BreakType", "PAGE_BEFORE")
        writer_cursor.BreakType = page_before
    except Exception as e:
        cls.ret = f"Error inserting page break: {str(e)}"
        return False
    cls.ret = "Page break inserted successfully"
    return True

View File

@ -0,0 +1,233 @@
import json
import os
import re
import xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import quote
import requests
from requests.auth import HTTPBasicAuth
class VLCTools:
    """
    Helpers for controlling VLC through its HTTP interface and its config file.

    Every public method stores a human-readable result in ``ret``;
    ``print_result`` prints it. HTTP-based methods return ``None`` on failure.
    """

    host = "localhost"
    port = 8080
    base_url = f"http://{host}:{port}/requests"
    password = "password"
    auth = HTTPBasicAuth("", password)
    # Seconds to wait for the HTTP interface before giving up, so that a
    # stalled VLC instance cannot block the caller forever.
    timeout = 10
    ret = ""

    @classmethod
    def print_result(cls):
        """Print the result stored by the most recent tool call."""
        print(cls.ret)

    @classmethod
    def _make_request(cls, endpoint, params=None):
        """
        Send a GET request to ``<base_url>/<endpoint>``.

        Returns:
            requests.Response | None: The response, or None on any request
                error (connection failure, timeout, HTTP error status).
        """
        url = f"{cls.base_url}/{endpoint}"
        try:
            response = requests.get(url, params=params, auth=cls.auth, timeout=cls.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException:
            return None

    @classmethod
    def _get_status(cls):
        """Return the parsed ``status.xml`` root element, or None if unavailable."""
        response = cls._make_request("status.xml")
        if response:
            return ET.fromstring(response.content)
        return None

    @classmethod
    def env_info(cls):
        """Store the environment summary for this tool (there is none: "None")."""
        cls.ret = "None"

    @classmethod
    def get_playlist(cls):
        """
        Read the items of the "Playlist" node from ``playlist.xml``.

        Returns:
            str | None: "Playlist: [...]" with name, uri and duration of each
                item, or None on error.
        """
        response = cls._make_request("playlist.xml")
        if response:
            info = ET.fromstring(response.content)
            playlist_node = info.find('.//node[@name="Playlist"]')
            if playlist_node is not None:
                playlist_items = []
                for leaf in playlist_node.findall("leaf"):
                    duration = leaf.get("duration")
                    playlist_items.append(
                        {
                            "name": leaf.get("name"),
                            "uri": leaf.get("uri"),
                            # A leaf without a duration attribute must not
                            # abort the whole listing with a TypeError.
                            "duration": duration + "s" if duration is not None else None,
                        }
                    )
                cls.ret = f"Playlist: {playlist_items}"
                return cls.ret
        cls.ret = "Error getting playlist"
        return None

    @classmethod
    def _send_command(cls, params, success_message, error_message):
        """Send a ``status.xml`` command and store/return the matching message."""
        response = cls._make_request("status.xml", params)
        if response:
            cls.ret = success_message
            return cls.ret
        cls.ret = error_message
        return None

    @classmethod
    def play(cls):
        """Start playback (``pl_play``)."""
        return cls._send_command({"command": "pl_play"}, "Start playing the media", "Error playing the media")

    @classmethod
    def pause(cls):
        """Pause playback (``pl_pause``)."""
        return cls._send_command({"command": "pl_pause"}, "Pause the media", "Error pausing the media")

    @classmethod
    def next(cls):
        """Switch to the next playlist item (``pl_next``)."""
        return cls._send_command({"command": "pl_next"}, "Switch to next media", "Error switching to next media")

    @classmethod
    def previous(cls):
        """Switch to the previous playlist item (``pl_previous``)."""
        return cls._send_command(
            {"command": "pl_previous"}, "Switch to previous media", "Error switching to previous media"
        )

    @classmethod
    def add_to_playlist(cls, uri):
        """
        Add a media URI or local file path to VLC.

        URIs starting with "http" are passed through unchanged; anything else
        is treated as a local path and converted to a quoted ``file://`` URI.

        NOTE(review): this sends ``in_play``; VLC's ``in_enqueue`` command may
        be closer to what the method name promises -- confirm the intent.
        """
        if uri.startswith("http"):
            encoded_uri = uri
        else:
            encoded_uri = "file://" + quote(uri.replace("file://", ""))
        return cls._send_command(
            {"command": "in_play", "input": encoded_uri},
            f"Add {uri} to playlist",
            f"Error adding {uri} to playlist",
        )

    @classmethod
    def get_current_time(cls):
        """
        Return the ``time`` value from the status (an int), or None.

        ``ret`` holds the same value, or an error message when the status
        could not be fetched.
        """
        status = cls._get_status()
        if status is not None:
            time = status.find("time")
            cls.ret = int(time.text) if time is not None else None
            return cls.ret
        # Do not leave a stale result from a previous call in ``ret``.
        cls.ret = "Error getting current time"
        return None

    @classmethod
    def get_media_duration(cls):
        """Return "Media duration: <length> seconds" from the status, or None on error."""
        status = cls._get_status()
        if status is not None:
            length = status.find("length")
            if length is not None:
                cls.ret = f"Media duration: {length.text} seconds"
                return cls.ret
        cls.ret = "Error getting media duration"
        return None

    @classmethod
    def get_settings(cls):
        """
        Read the active (non-commented) ``key=value`` entries of ``~/.config/vlc/vlcrc``.

        Returns:
            str: The settings as an indented JSON object.

        Raises:
            OSError: If the configuration file cannot be opened.
        """
        settings = {}
        with open(Path.home() / ".config/vlc/vlcrc", "r") as f:
            for line in f:
                # Split on the first "=" only so values that themselves
                # contain "=" are kept instead of being dropped.
                key, separator, value = line.partition("=")
                if not separator:
                    continue
                key = key.strip()
                if key.startswith("#"):
                    continue
                settings[key] = value.strip()
        cls.ret = json.dumps(settings, indent=4, ensure_ascii=False)
        return cls.ret

    @classmethod
    def set_settings(cls, field, value):
        """
        Set ``field=value`` in ``~/.config/vlc/vlcrc``.

        An existing entry for ``field`` (active or commented out with ``#``)
        is replaced; otherwise the entry is appended to the file.

        Raises:
            OSError: If the configuration file cannot be read or written.
        """
        config_path = Path.home() / ".config/vlc/vlcrc"
        with open(config_path, "r") as rf:
            settings = rf.read()
        # Match the whole line holding this field. Anchoring at the line start
        # prevents e.g. "volume" from also rewriting "qt-max-volume=...".
        pattern = re.compile(r"^#? *" + re.escape(field) + r"=.*$", re.MULTILINE)
        new_line = f"{field}={value}"
        # Check whether the field already exists
        if pattern.search(settings):
            # A callable replacement keeps backslashes in ``value`` literal.
            settings = pattern.sub(lambda _match: new_line, settings)
        else:
            if settings and not settings.endswith("\n"):
                settings += "\n"
            settings += new_line + "\n"
        with open(config_path, "w") as wf:
            wf.write(settings)
        cls.ret = f"Set {field} to {value}"
        return cls.ret

    @classmethod
    def toggle_fullscreen(cls, enable=None):
        """
        Toggle fullscreen mode or set it explicitly based on the enable parameter.

        VLC's ``fullscreen`` HTTP command only toggles, so for an explicit
        request the current state is read first and the command is sent only
        when a change is needed.

        Args:
            enable (bool, optional): If provided, explicitly set fullscreen mode (True for fullscreen, False for windowed)

        Returns:
            str: Success message, or None on error (``ret`` holds the error message)
        """
        if enable is not None:
            status = cls._get_status()
            state_node = status.find("fullscreen") if status is not None else None
            if state_node is None or state_node.text is None:
                cls.ret = "Error changing fullscreen mode"
                return None
            currently_fullscreen = state_node.text.strip().lower() in ("true", "1")
            action = "enabled" if enable else "disabled"
            if currently_fullscreen == bool(enable):
                # Already in the requested state; toggling would undo it.
                cls.ret = f"Fullscreen mode {action}"
                return cls.ret
        else:
            action = "toggled"
        return cls._send_command(
            {"command": "fullscreen"}, f"Fullscreen mode {action}", "Error changing fullscreen mode"
        )

    @classmethod
    def get_media_files(cls, path, suffix=None):
        """
        Recursively collect the media files below ``path``.

        Args:
            path (str): The path to the media files
            suffix (List[str], optional): The suffix of the media files.
                Defaults to ['mp4', 'avi', 'mkv', 'mov', 'mp3', 'm4a', 'wav']

        Returns:
            list | None: Full paths of the matching files, or None on error
                (``ret`` then holds the error message).
        """
        # Set default suffix if not provided
        if suffix is None:
            suffix = ["mp4", "avi", "mkv", "mov", "mp3", "m4a", "wav"]
        # Validate path
        if not path:
            cls.ret = "Path cannot be empty"
            return None
        if not os.path.exists(path):
            cls.ret = f"Path not found: {path}"
            return None
        # Lower-cased extensions (with leading dot) for case-insensitive matching
        extensions = tuple(f".{s.lower()}" for s in suffix)
        media_files = []
        try:
            for root, _, files in os.walk(path):
                for file in files:
                    if file.lower().endswith(extensions):
                        media_files.append(os.path.join(root, file))
        except Exception as e:
            cls.ret = f"Error while scanning directory: {str(e)}"
            return None
        cls.ret = media_files
        return cls.ret

608
run_autoglm_v.py Normal file
View File

@ -0,0 +1,608 @@
"""Script to run end-to-end evaluation on the benchmark.
Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
"""
import argparse
import datetime
import json
import logging
import os
import sys
import math
import ast
import time
import backoff
import httpx
import requests
from openai import APIConnectionError, APIError, RateLimitError
from requests.exceptions import SSLError
from tqdm import tqdm
import lib_run_single
from desktop_env.desktop_env import MAX_RETRIES, DesktopEnv as DesktopEnvBase
from mm_agents.autoglm_v import AutoGLMAgent
from typing import Optional, Dict, Any
from openai import OpenAI
# Almost deprecated since it's not multi-env, use run_multienv_*.py instead

# Logger Configs {{{ #
# The root logger receives everything; the individual handlers decide what is kept.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

# FileHandler does not create missing directories, so make sure "logs" exists
# before any handler tries to open its file.
os.makedirs("logs", exist_ok=True)

file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")

file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)

# ANSI colour codes are embedded in the format string.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
)
for _handler in (file_handler, debug_handler, stdout_handler, sdebug_handler):
    _handler.setFormatter(formatter)

# stdout and the "sdebug" file only show records from the "desktopenv" logger tree.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))

for _handler in (file_handler, debug_handler, stdout_handler, sdebug_handler):
    logger.addHandler(_handler)
# }}} Logger Configs #

# From here on, ``logger`` is the experiment logger (a child of "desktopenv").
logger = logging.getLogger("desktopenv.experiment")
def config() -> argparse.Namespace:
    """Build the command-line parser for the evaluation run and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed environment, agent, model, example-selection,
            AWS and logging options.
    """
    parser = argparse.ArgumentParser(description="Run end-to-end evaluation on the benchmark")
    add = parser.add_argument

    # environment config
    add("--path_to_vm", type=str)
    add(
        "--provider_name",
        type=str,
        default="docker",
        help="Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)",
    )
    add("--headless", action="store_true", default=True, help="Run in headless machine")
    add("--action_space", type=str, default="autoglm_computer_use", help="Action type")
    add(
        "--observation_type",
        choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
        default="a11y_tree",
        help="Observation type",
    )
    add("--screen_width", type=int, default=1920)
    add("--screen_height", type=int, default=1080)
    add("--sleep_after_execution", type=float, default=1.0)
    add("--max_steps", type=int, default=50)

    # agent config
    add("--max_trajectory_length", type=int, default=3)
    add("--test_config_base_dir", type=str, default="evaluation_examples/examples")

    # lm config
    add("--model", type=str, default="autoglm-os")
    add("--temperature", type=float, default=0.4)
    add("--top_p", type=float, default=0.5)
    add("--max_tokens", type=int, default=4096)
    add("--stop_token", type=str, default=None)
    add("--image_width", type=int, default=1280)
    add("--image_height", type=int, default=720)

    # example config
    add("--domain", type=str, default="all")
    add("--test_all_meta_path", type=str, default="evaluation_examples/test_nogdrive.json")

    # aws config
    add("--region", type=str, default="us-east-1", help="AWS region for the VM")
    add("--client_password", type=str, default="", help="Client password")

    # logging related
    add("--result_dir", type=str, default="./results")

    return parser.parse_args()
class DesktopEnv(DesktopEnvBase):
    """DesktopEnv variant for the AutoGLM agent.

    Overrides ``step``, ``reset`` and ``_get_obs`` so that actions report an
    execution result, the AutoGLM tool scripts are uploaded on reset, and
    observations include the open windows and per-application information.
    """

    def step(self, action, pause=2):
        """Execute one agent action and return the resulting observation.

        Args:
            action: One of the strings 'WAIT', 'FAIL', 'DONE', a dict with an
                ``action_type`` of 'OPEN_APP' or 'OPEN_CHROME_TAB' (plus
                ``parameters``), or a string of Python code to run in the VM.
            pause: Seconds to sleep for 'WAIT' and after every action.

        Returns:
            tuple: ``(observation, reward, done, info)``; the observation
                additionally carries the action outcome under ``exe_result``.
        """
        self._step_no += 1
        self.action_history.append(action)
        # Mark environment as used when step is called
        self.is_environment_used = True
        reward = 0  # todo: Define reward calculation for each example
        done = False  # todo: Define episode termination condition for each example
        info = {}
        logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
        # handle the special actions
        if action in ['WAIT', 'FAIL', 'DONE']:
            if action == 'WAIT':
                time.sleep(pause)
                exe_result = 'Wait ' + str(pause) + ' seconds'
            elif action == 'FAIL':
                done = True
                info = {"fail": True}
                exe_result = 'Finish: fail'
            else:  # 'DONE'
                done = True
                info = {"done": True}
                exe_result = 'Finish: success'
        elif isinstance(action, dict):
            action_type = action.get('action_type')
            if action_type == 'OPEN_APP':
                self.setup_controller._launch_setup(action['parameters']['launch_app_command'], shell=True)
                exe_result = 'Open ' + action['parameters']['app_name']
            elif action_type == 'OPEN_CHROME_TAB':
                self.setup_controller._chrome_open_tabs_setup(action['parameters']['urls_to_open'])
                exe_result = 'Open ' + str(action['parameters']['urls_to_open']) + ' in Chrome successfully'
            else:
                # Unknown dict action: report it instead of failing later with
                # an unbound ``exe_result``.
                exe_result = 'Error Action: ' + str(action)
                logger.error(f"Unsupported action type: {action_type}")
        else:
            # the set of all possible python commands insides `pyautogui`
            result = self.controller.execute_python_command(action)
            try:
                if result['error']:
                    exe_result = result['error'].strip()
                else:
                    exe_result = result['output'].strip()
            except Exception as e:
                # ``result`` may be None or lack the expected keys.
                exe_result = 'Error Action: ' + str(action)
                logger.error(f"Error executing action: {e}")
        time.sleep(pause)
        observation = self._get_obs()
        observation['exe_result'] = exe_result
        return observation, reward, done, info

    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
        """Reset the environment for a task and return the first observation.

        Reverts to the snapshot if the environment was used, runs the task
        setup (retrying up to ``MAX_RETRIES`` times), uploads the AutoGLM tool
        scripts to the VM's home directory and starts a listening ``soffice``.
        """
        # Reset to certain task in OSWorld
        logger.info("Resetting environment...")
        logger.info("Switching task...")
        logger.info("Setting counters...")
        self._traj_no += 1
        self._step_no = 0
        self.action_history.clear()
        for attempt in range(MAX_RETRIES):
            # Only revert to snapshot if environment has been used (step/setup)
            # This optimization is especially important for cloud providers like AWS
            # where unnecessary snapshot operations are costly and time-consuming
            if task_config is not None:
                # Only consider task proxy requirement if proxy is enabled at system level
                task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
                if not self.enable_proxy and task_config.get("proxy", False):
                    logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
                if task_use_proxy != self.current_use_proxy:
                    # keep because get_info_from_website depend on this
                    self.current_use_proxy = task_use_proxy
            if self.is_environment_used:
                logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
                self._revert_to_snapshot()
                logger.info("Starting emulator...")
                self._start_emulator()
                logger.info("Emulator started.")
                # Reset the usage flag after reverting
                self.is_environment_used = False
            else:
                logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))
            if task_config is not None:
                if task_config.get("proxy", False) and self.enable_proxy:
                    # If using proxy and proxy is enabled, set up the proxy configuration
                    self.setup_controller._proxy_setup(self.client_password)
                self._set_task_info(task_config)
                self.setup_controller.reset_cache_dir(self.cache_dir)
                logger.info("Setting up environment...")
                success = self.setup_controller.setup(self.config, task_config.get("proxy", False) and self.enable_proxy)
                if success:
                    # Mark environment as used when setup is successfully executed
                    if self.config:  # Only mark as used if there were actual setup operations
                        self.is_environment_used = True
                    break
                else:
                    logger.error(
                        "Environment setup failed, retrying (%d/%d)...",
                        attempt + 1,
                        MAX_RETRIES,
                    )
                    time.sleep(5)
            else:
                # Nothing to set up without a task config.
                break
        logger.info("Environment setup complete.")
        # Upload tools from autoglm package
        import mm_agents.autoglm_v
        tool_dir = os.path.join(os.path.dirname(mm_agents.autoglm_v.__file__), 'tools', 'package')
        for file in os.listdir(tool_dir):
            if os.path.isdir(os.path.join(tool_dir, file)):
                continue
            self.setup_controller._upload_file_setup([{
                "local_path": os.path.join(tool_dir, file),
                "path": os.path.join('~', file)
            }])
        # start soffice service for office tools
        self.setup_controller._launch_setup('soffice --accept="socket,host=localhost,port=2002;urp;" --norestore --nologo --nodefault', shell=True)
        time.sleep(5)
        observation = self._get_obs()
        return observation

    def get_current_apps(self):
        """Return the windows on desktop 0 and the id reported for the active window.

        Returns:
            tuple: ``(app_list, cur_id)`` where ``app_list`` maps a window id
                to ``{'app_name': ..., 'title': ...}``.
        """
        apps_code = r"""import subprocess;
command = "wmctrl -xl";
apps = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.strip().split('\n');
print(apps);"""
        window_code = r"""import subprocess;
command = "wmctrl -a :ACTIVE: -v 2>&1 | grep 'Using window' | awk '{print $3}'";
window_id = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.strip();
print(window_id);"""
        apps = self.controller.execute_python_command(apps_code)['output'].strip()
        # The remote script prints a Python list literal of output lines.
        apps = ast.literal_eval(apps)
        app_list = {}
        for app in apps:
            parts = app.split(maxsplit=4)
            if len(parts) < 4:
                continue
            # Only windows whose second column is '0' are listed.
            if parts[1] != '0':
                continue
            window_id = parts[0]
            # Keep the trailing half (rounded up) of the dot-separated class field.
            app_name = '.'.join(parts[2].split('.')[-(math.ceil(parts[2].count('.') / 2)):])
            # NOTE(review): with `wmctrl -xl` the fourth column is presumably the
            # client host name and the window title would be parts[4] -- confirm
            # before changing, since the agent prompt consumes this value.
            title = parts[3]
            app_list[window_id] = {
                'app_name': app_name,
                'title': title
            }
        cur_id = self.controller.execute_python_command(window_code)['output'].strip()
        return app_list, cur_id

    def maximize_window(self):
        """Try (up to 5 times) to maximize the active window inside the VM."""
        window_state = r"""import subprocess;
command = "xprop -id $(xprop -root _NET_ACTIVE_WINDOW | awk -F' ' '{print $5}') _NET_WM_STATE"
output = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.strip();
print(output);"""
        for _ in range(5):
            try:
                self.setup_controller._launch_setup('wmctrl -r :ACTIVE: -b add,maximized_vert,maximized_horz', shell=True)
                time.sleep(2)
                output = self.controller.execute_python_command(window_state)['output'].strip()
                # Stop when there is no window, or it is a popup, a modal
                # window, or already maximized.
                if '_NET_WM_STATE_FOCUSED' not in output or '_NET_WM_STATE_SKIP_TASKBAR' in output or '_NET_WM_STATE_MODAL' in output or '_NET_WM_STATE_MAXIMIZED' in output:
                    return
            except Exception as e:
                logger.error(f"Failed to maximize window: {e}")
                time.sleep(1)

    def _get_obs(self):
        """Collect the observation: screenshot, accessibility tree, windows and app info.

        Raises:
            Exception: Re-raised from ``get_current_apps`` if it fails three times.
        """
        # Maps the normalized app name to the tool class of the uploaded
        # module with the same name.
        tool_list = {
            "libreoffice_calc": "CalcTools",
            "libreoffice_impress": "ImpressTools",
            "libreoffice_writer": "WriterTools",
            "code": "CodeTools",
            "vlc": "VLCTools",
            "google_chrome": "BrowserTools"
        }
        self.maximize_window()
        for attempt in range(3):
            try:
                app_list, cur_id = self.get_current_apps()
                # Success: stop retrying (previously the query always ran 3 times).
                break
            except Exception as e:
                if attempt == 2:
                    raise
                logger.error(f"Failed to get current apps: {e}")
                time.sleep(1)
        if cur_id in app_list:
            cur_app = app_list[cur_id]['app_name']
            tool_name = cur_app.strip().lower().replace('-', '_')
            if tool_name in tool_list:
                class_name = tool_list[tool_name]
                # Run the tool's env_info() inside the VM and capture what it prints.
                command = f"from {tool_name} import *; "
                command += f"{class_name}.env_info(); "
                command += f"{class_name}.print_result();"
                app_info = self.controller.execute_python_command(command)['output'].strip()
            else:
                app_info = None
        else:
            cur_app = None
            app_info = None
        tree = self.controller.get_accessibility_tree()
        screenshot = self.controller.get_screenshot()
        if screenshot is None:
            logger.error("Failed to get screenshot.")
            screenshot = b''
        return {
            "screenshot": screenshot,
            "accessibility_tree": tree,
            "instruction": self.instruction,
            "apps": app_list,
            "cur_window_id": cur_id,
            "cur_app": cur_app,
            "app_info": app_info,
        }
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
    """Run every pending example sequentially on a single desktop environment.

    Args:
        args: Parsed command-line options (environment, agent and sampling settings).
        test_all_meta: Mapping of domain name to the list of example ids to run.

    Scores are collected in a local list that is handed to
    ``lib_run_single.run_single_example_autoglm``; their average is logged once
    all examples have been attempted. The environment is always closed, even
    when the loop is interrupted by an unexpected error.
    """
    scores = []
    max_steps = args.max_steps

    logger.info("Args: %s", args)

    # NOTE(review): the request below goes through `requests`, while the retry
    # decorator only lists openai exception types -- confirm whether
    # requests' connection/timeout errors were meant to be retried as well.
    @backoff.on_exception(
        backoff.constant,
        (RateLimitError, APIConnectionError),
        interval=0.1,
    )
    def call_llm(messages):
        """POST ``messages`` to ``{OPENAI_BASE_URL}/chat/completions`` and return the reply text."""
        logger.info("Calling LLM...")

        # Prepare the request data
        data = {
            "model": args.model,
            "messages": messages,
            "max_tokens": args.max_tokens,
            "temperature": args.temperature,
            "top_p": args.top_p,
            "skip_special_tokens": False,
            "stream": False,
            "include_stop_str_in_output": True,
            "stop": ["<|user|>", "<|observation|>", "</answer>"],
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}",
        }

        # Get API base URL from environment or use default
        base_url = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
        url = f"{base_url}/chat/completions"

        response = requests.post(
            url,
            json=data,
            headers=headers,
            timeout=60.0,
        )
        response.raise_for_status()
        result = response.json()
        logger.info("LLM called successfully.")
        return result['choices'][0]['message']['content']

    env = DesktopEnv(
        provider_name=args.provider_name,
        region=args.region,
        client_password=args.client_password,
        path_to_vm=args.path_to_vm,
        action_space=args.action_space,
        screen_size=(args.screen_width, args.screen_height),
        headless=args.headless,
        os_type="Ubuntu",
        require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
    )

    # Everything after the environment exists runs under try/finally so the
    # environment is released even if an example cannot be loaded.
    try:
        agent = AutoGLMAgent(
            action_space=args.action_space,
            observation_type=args.observation_type,
            screen_size=(args.screen_width, args.screen_height),
            image_size=(args.image_width, args.image_height),
            max_trajectory_length=args.max_trajectory_length,
            client_password=args.client_password,
            gen_func=call_llm,
        )

        for domain in tqdm(test_all_meta, desc="Domain"):
            for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False):
                config_file = os.path.join(args.test_config_base_dir, f"{domain}/{example_id}.json")
                with open(config_file, "r", encoding="utf-8") as f:
                    example = json.load(f)

                logger.info(f"[Domain]: {domain}")
                logger.info(f"[Example ID]: {example_id}")

                instruction = example["instruction"]
                logger.info(f"[Instruction]: {instruction}")

                example_result_dir = os.path.join(
                    args.result_dir,
                    args.action_space,
                    args.observation_type,
                    args.model,
                    domain,
                    example_id,
                )
                os.makedirs(example_result_dir, exist_ok=True)

                # example start running
                try:
                    lib_run_single.run_single_example_autoglm(
                        agent,
                        env,
                        example,
                        max_steps,
                        instruction,
                        args,
                        example_result_dir,
                        scores,
                    )
                except Exception as e:
                    logger.error(f"Exception in {domain}/{example_id}: {e}")
                    # Only attempt to end recording if a controller exists.
                    if hasattr(env, "controller") and env.controller is not None:
                        # Best effort: a recording failure must not abort the
                        # remaining examples.
                        try:
                            env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
                        except Exception as rec_err:
                            logger.warning(f"Failed to end recording for {domain}/{example_id}: {rec_err}")
                    # Record the actual failure in the trajectory log.
                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                        f.write(json.dumps({"Error": f"Exception in {domain}/{example_id}: {str(e)}"}))
                        f.write("\n")
    finally:
        env.close()

    # Guard against an empty score list (nothing to run, or every example failed
    # before it was scored) instead of dividing by zero.
    if scores:
        logger.info(f"Average score: {sum(scores) / len(scores)}")
    else:
        logger.info("No scores obtained.")
def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
    """Remove already-finished examples from ``total_file_json`` and return it.

    Results are looked up under
    ``result_dir/action_space/observation_type/use_model/<domain>/<example_id>``.
    An example counts as finished when its directory contains ``result.txt``.
    Example directories without it have their entries removed so the example
    can be rerun from scratch. Directories named ``onboard`` are ignored.
    ``total_file_json`` is updated in place and also returned.
    """
    run_root = os.path.join(result_dir, action_space, observation_type, use_model)
    if not os.path.exists(run_root):
        return total_file_json

    completed = {}
    for domain_name in os.listdir(run_root):
        done_ids = completed.setdefault(domain_name, [])
        domain_dir = os.path.join(run_root, domain_name)
        if not os.path.isdir(domain_dir):
            continue

        for entry in os.listdir(domain_dir):
            if entry == "onboard":
                continue
            entry_dir = os.path.join(domain_dir, entry)
            if not os.path.isdir(entry_dir):
                continue

            leftovers = os.listdir(entry_dir)
            if "result.txt" in leftovers:
                done_ids.append(entry)
                continue
            # Unfinished run: clear out whatever it left behind.
            for leftover in leftovers:
                os.remove(os.path.join(entry_dir, leftover))

    for domain_name, done_ids in completed.items():
        if domain_name in total_file_json:
            total_file_json[domain_name] = [
                ex for ex in total_file_json[domain_name] if ex not in done_ids
            ]
    return total_file_json
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
    """Collect the scores recorded so far and print the current success rate.

    Every ``result.txt`` found under
    ``result_dir/action_space/observation_type/use_model/<domain>/<example_id>``
    contributes one score: ``"true"`` (any casing) counts as 1.0, anything else
    is parsed as a float, and an unreadable or unparsable file counts as 0.0.

    Returns:
        The list of scores, or ``None`` when no result exists yet.
        ``total_file_json`` is accepted but not used.
    """
    run_root = os.path.join(result_dir, action_space, observation_type, use_model)

    collected = []
    if os.path.exists(run_root):
        for domain_name in os.listdir(run_root):
            domain_dir = os.path.join(run_root, domain_name)
            if not os.path.isdir(domain_dir):
                continue
            for entry in os.listdir(domain_dir):
                entry_dir = os.path.join(domain_dir, entry)
                if not os.path.isdir(entry_dir):
                    continue
                if "result.txt" not in os.listdir(entry_dir):
                    continue
                try:
                    with open(os.path.join(entry_dir, "result.txt"), "r") as rf:
                        raw = rf.read().strip()
                    value = 1.0 if raw.lower() == "true" else float(raw)
                except Exception:
                    value = 0.0
                collected.append(value)

    if not collected:
        print("New experiment, no result yet.")
        return None

    print("Current Success Rate:", sum(collected) / len(collected) * 100, "%")
    return collected
if __name__ == "__main__":
    # Disable tokenizers parallelism through its environment variable.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # Fill in a provider-specific default when no client password was given.
    if args.client_password == "":
        args.client_password = (
            "osworld-public-evaluation" if args.provider_name == "aws" else "password"
        )

    # Save args to result_dir/action_space/observation_type/model/args.json.
    path_to_args = os.path.join(
        args.result_dir, args.action_space, args.observation_type, args.model, "args.json"
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

    # Restrict the run to a single domain when one was requested.
    if args.domain != "all":
        test_all_meta = {args.domain: test_all_meta[args.domain]}

    run_key = (args.action_space, args.model, args.observation_type, args.result_dir)

    # Skip examples that already have a result on disk.
    test_file_list = get_unfinished(*run_key, test_all_meta)
    left_info = "".join(
        f"{domain}: {len(test_file_list[domain])}\n" for domain in test_file_list
    )
    logger.info(f"Left tasks:\n{left_info}")

    # Report the success rate of the results collected so far.
    get_result(*run_key, test_all_meta)

    test(args, test_file_list)

294
run_multienv_autoglm_v.py Normal file
View File

@ -0,0 +1,294 @@
"""Script to run end-to-end evaluation on the benchmark.
Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
"""
import argparse
import datetime
import json
import logging
import os
import sys
import math
import ast
import time
import backoff
import httpx
import requests
from tqdm import tqdm
from typing import Optional, Dict, Any
from multiprocessing import Pool
from openai import APIConnectionError, APIError, RateLimitError
from types import SimpleNamespace
import lib_run_single
from run_autoglm_v import DesktopEnv, get_unfinished, get_result
from desktop_env.desktop_env import MAX_RETRIES, DesktopEnv as DesktopEnvBase
from mm_agents.autoglm_v import AutoGLMAgent
from openai import OpenAI
logger = logging.getLogger("desktopenv.experiment")
def config() -> argparse.Namespace:
    """Parse the command-line options of the parallel evaluation runner.

    Returns:
        The namespace produced by ``argparse`` from ``sys.argv``.
    """
    option_specs = [
        # environment config
        ("--path_to_vm", {"type": str}),
        (
            "--provider_name",
            {
                "type": str,
                "default": "docker",
                "help": "Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)",
            },
        ),
        # NOTE: store_true combined with default=True means this option is
        # True whether or not the flag is passed.
        ("--headless", {"action": "store_true", "default": True, "help": "Run in headless machine"}),
        ("--action_space", {"type": str, "default": "autoglm_computer_use", "help": "Action type"}),
        (
            "--observation_type",
            {
                "choices": ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
                "default": "a11y_tree",
                "help": "Observation type",
            },
        ),
        ("--screen_width", {"type": int, "default": 1920}),
        ("--screen_height", {"type": int, "default": 1080}),
        ("--sleep_after_execution", {"type": float, "default": 1.0}),
        ("--max_steps", {"type": int, "default": 30}),
        # agent config
        ("--max_trajectory_length", {"type": int, "default": 3}),
        ("--test_config_base_dir", {"type": str, "default": "evaluation_examples/examples"}),
        # lm config
        ("--model", {"type": str, "default": "autoglm-os"}),
        ("--temperature", {"type": float, "default": 0.4}),
        ("--top_p", {"type": float, "default": 0.5}),
        ("--max_tokens", {"type": int, "default": 2048}),
        ("--stop_token", {"type": str, "default": None}),
        ("--image_width", {"type": int, "default": 1280}),
        ("--image_height", {"type": int, "default": 720}),
        # example config
        ("--domain", {"type": str, "default": "all"}),
        ("--test_all_meta_path", {"type": str, "default": "evaluation_examples/test_nogdrive.json"}),
        # aws config
        ("--region", {"type": str, "default": "us-east-1", "help": "AWS region for the VM"}),
        ("--client_password", {"type": str, "default": "", "help": "Client password"}),
        # logging related
        ("--result_dir", {"type": str, "default": "./results"}),
        # parallel number
        ("--num_workers", {"type": int, "default": 20, "help": "Number of parallel workers"}),
    ]

    parser = argparse.ArgumentParser(description="Run end-to-end evaluation on the benchmark")
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
def _worker_run(task):
domain, example_id, args = task # args 为 argparse.Namespace
logger = logging.getLogger("desktopenv.experiment")
try:
config_file = os.path.join(args.test_config_base_dir, f"{domain}/{example_id}.json")
with open(config_file, "r", encoding="utf-8") as f:
example = json.load(f)
instruction = example["instruction"]
@backoff.on_exception(backoff.constant, (RateLimitError, APIConnectionError), interval=0.1)
def call_llm(messages):
logger.info("Calling LLM...")
# Prepare the request data
data = {
"model": args.model,
"messages": messages,
"max_tokens": args.max_tokens,
"temperature": args.temperature,
"top_p": args.top_p,
"skip_special_tokens": False,
"stream": False,
"include_stop_str_in_output": True,
"stop": ["<|user|>", "<|observation|>", "</answer>"]
}
# Set up proxy
# if os.environ.get('LAN_PROXY', None):
# proxies = {
# "http": os.environ.get('LAN_PROXY'),
# "https": os.environ.get('LAN_PROXY')
# }
# else:
# proxies = None
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}"
}
# Get API base URL from environment or use default
base_url = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
url = f"{base_url}/chat/completions"
response = requests.post(
url,
json=data,
headers=headers,
# proxies=proxies,
timeout=60.0
)
response.raise_for_status()
result = response.json()
logger.info("LLM called successfully.")
return result['choices'][0]['message']['content']
env = DesktopEnv(
provider_name=args.provider_name,
region=args.region,
client_password=args.client_password,
path_to_vm=args.path_to_vm,
action_space=args.action_space,
screen_size=(args.screen_width, args.screen_height),
headless=args.headless,
os_type="Ubuntu",
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
)
agent = AutoGLMAgent(
action_space=args.action_space,
observation_type=args.observation_type,
screen_size=(args.screen_width, args.screen_height),
image_size=(args.image_width, args.image_height),
max_trajectory_length=args.max_trajectory_length,
client_password=args.client_password,
gen_func=call_llm,
)
example_result_dir = os.path.join(
args.result_dir,
args.action_space,
args.observation_type,
args.model,
domain,
example_id,
)
os.makedirs(example_result_dir, exist_ok=True)
local_scores = []
try:
lib_run_single.run_single_example_autoglm(
agent,
env,
example,
args.max_steps,
instruction,
args,
example_result_dir,
local_scores,
)
except Exception as e:
logger.error(f"[并发任务异常] {domain}/{example_id}: {e}")
if hasattr(env, "controller") and env.controller is not None:
try:
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
except Exception:
pass
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(json.dumps({"Error": f"Exception in {domain}/{example_id}: {str(e)}"}) + "\n")
finally:
try:
env.close()
except Exception:
pass
score = None
result_path = os.path.join(example_result_dir, "result.txt")
if os.path.exists(result_path):
try:
with open(result_path, "r") as rf:
res = rf.read().strip()
if res.lower() == "true":
score = 1.0
else:
score = float(res)
except Exception:
score = 0.0
else:
score = 0.0
logger.info(f"[Finish] {domain}/{example_id} score={score}")
return (domain, example_id, score)
except Exception as e:
logger = logging.getLogger("desktopenv.experiment")
logger.error(f"[Initializing Fail] {domain}/{example_id}: {e}")
return (domain, example_id, 0.0)
def test_parallel(args: argparse.Namespace, test_all_meta: dict):
    """Run all pending examples across a pool of worker processes.

    Args:
        args: Parsed command-line options; ``args.num_workers`` sets the pool size.
        test_all_meta: Mapping of domain name to the example ids to run.

    One ``(domain, example_id, args)`` task is built per example and handed to
    ``_worker_run``; the average of the returned scores is logged at the end.
    """
    tasks = [
        (domain, example_id, args)
        for domain in test_all_meta
        for example_id in test_all_meta[domain]
    ]
    if not tasks:
        logger.info("No pending tasks")
        return

    logger.info(f"Starting parallel execution: {args.num_workers} processes, {len(tasks)} tasks total")

    with Pool(processes=args.num_workers) as pool:
        # Results arrive in completion order, not submission order.
        outcomes = list(
            tqdm(pool.imap_unordered(_worker_run, tasks), total=len(tasks), desc="Parallel execution")
        )

    scores = [outcome[2] for outcome in outcomes if outcome[2] is not None]
    if not scores:
        logger.info("No scores obtained.")
        return

    avg = sum(scores) / len(scores)
    logger.info(f"Parallel execution completed. Average score: {avg}")
if __name__ == "__main__":
    # Disable tokenizers parallelism through its environment variable.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = config()

    # Fill in a provider-specific default when no client password was given.
    if args.client_password == "":
        args.client_password = (
            "osworld-public-evaluation" if args.provider_name == "aws" else "password"
        )

    # Save args to result_dir/action_space/observation_type/model/args.json.
    path_to_args = os.path.join(
        args.result_dir, args.action_space, args.observation_type, args.model, "args.json"
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

    # Restrict the run to a single domain when one was requested.
    if args.domain != "all":
        test_all_meta = {args.domain: test_all_meta[args.domain]}

    run_key = (args.action_space, args.model, args.observation_type, args.result_dir)

    # Skip examples that already have a result on disk.
    test_file_list = get_unfinished(*run_key, test_all_meta)
    left_info = "".join(
        f"{domain}: {len(test_file_list[domain])}\n" for domain in test_file_list
    )
    logger.info(f"Left tasks:\n{left_info}")

    # Report the success rate of the results collected so far.
    get_result(*run_key, test_all_meta)

    test_parallel(args, test_file_list)