working tests, changed code to find the first open tag

This commit is contained in:
Grace Guo 2025-10-03 11:38:30 -07:00
parent fc55584580
commit fc3222c99f
3 changed files with 771 additions and 2 deletions

291
model/parsers/qwen3vl.go Normal file
View File

@ -0,0 +1,291 @@
package parsers
import (
"context"
"fmt"
"log/slog"
"strings"
"encoding/json"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/logutil"
)
// Parser states for Qwen3VLParser. The state type, qwenParserState, is
// declared elsewhere in this package. CollectingContent is the zero value, so
// a zero-valued Qwen3VLParser starts out collecting plain content.
const (
// CollectingContent: outside any tag, watching for a thinking or tool-call open tag.
CollectingContent qwenParserState = iota
// CollectingThinkingContent: inside a thinking section, waiting for thinkingCloseTag.
// NOTE(review): the original note says qwen3vl output starts with a thinking
// section; nothing here forces the parser to start in this state — confirm.
CollectingThinkingContent
// CollectingToolContent: inside a tool call, waiting for the tool-call close tag.
CollectingToolContent
)
// Literal tags that open and close a thinking section in the model output.
// NOTE(review): the renderer added in the same change writes and splits on
// "<think>"/"</think>" — confirm which spelling the model actually emits.
const (
thinkingOpenTag = "<thinking>"
thinkingCloseTag = "</thinking>"
)
// Qwen3VLParser incrementally splits streamed model output into plain
// content, thinking content, and raw tool calls.
type Qwen3VLParser struct {
// state is the current position in the tag state machine.
state qwenParserState
// buffer holds text that has been received but not yet emitted as events.
buffer strings.Builder
// tools is the tool list recorded by Init and handed to the tool-call parser.
tools []api.Tool
}
// HasToolSupport reports that this parser extracts tool calls; it always
// returns true.
func (p *Qwen3VLParser) HasToolSupport() bool {
return true
}
// HasThinkingSupport reports that this parser recognizes thinking sections;
// it always returns true.
func (p *Qwen3VLParser) HasThinkingSupport() bool {
return true
}
// Init records the tools available for this request on the parser and returns
// the same slice unchanged. lastMessage is currently unused.
func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
p.tools = tools
return tools // returned as-is
// Open question from the author: does qwen3vl ever need to modify the tools?
}
// qwenEventThinkingContent is the parser event carrying the text found between
// a thinking open tag and its close tag.
type qwenEventThinkingContent struct {
	content string
}

// isQwenEvent lets qwenEventThinkingContent be used wherever a qwenEvent is
// expected.
func (qwenEventThinkingContent) isQwenEvent() {}

// Add appends the chunk s to the parser's buffer and returns everything that
// can be emitted unambiguously so far.
//
// It works as follows:
//  1. s is appended to the internal buffer.
//  2. parseEvents extracts every complete event from the buffer.
//  3. Tool-call events are parsed into api.ToolCall values, content events are
//     concatenated into content, and thinking events are concatenated into
//     thinking.
//
// A thinking event is only produced once its close tag has arrived, so
// thinking text is returned in whole sections rather than streamed. If a tool
// call fails to parse, the failure is logged and returned, and the other
// results are zero values. done is currently unused.
//
// NOTE(review): tool calls are parsed with parseToolCall, which is defined
// elsewhere in the package; confirm it accepts the format this model emits.
func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
	p.buffer.WriteString(s)
	events := p.parseEvents()

	var toolCalls []api.ToolCall
	var contentSb, thinkingSb strings.Builder
	for _, event := range events {
		switch event := event.(type) {
		case qwenEventRawToolCall:
			toolCall, err := parseToolCall(event, p.tools)
			if err != nil {
				slog.Warn("qwen tool call parsing failed", "error", err)
				return "", "", nil, err
			}
			toolCalls = append(toolCalls, toolCall)
		case qwenEventThinkingContent:
			thinkingSb.WriteString(event.content)
		case qwenEventContent:
			// TODO(drifkin): content events that are interleaved with other
			// events in the same turn are naively appended together here.
			contentSb.WriteString(event.content)
		}
	}

	return contentSb.String(), thinkingSb.String(), toolCalls, nil
}
// parseEvents repeatedly calls eat until it reports that no further progress
// can be made, and returns every event produced along the way in order. When
// at least one event was produced, the events, parser state and remaining
// buffer are logged at trace level.
func (p *Qwen3VLParser) parseEvents() []qwenEvent {
	var parsed []qwenEvent
	for {
		batch, more := p.eat()
		// Appending an empty batch leaves parsed (including nil) untouched.
		parsed = append(parsed, batch...)
		if !more {
			break
		}
	}

	if len(parsed) == 0 {
		return parsed
	}
	slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", parsed, "state", p.state, "buffer", p.buffer.String())
	return parsed
}
// type qwenEventRawToolCall struct {
// raw string
// }
// type qwenEventContent struct {
// content string
// }
// emitContentBeforeTag splits the parser's buffer at the first occurrence of
// tag. Any non-empty text before the tag is appended to events as a
// qwenEventContent, the tag itself is dropped, and the buffer is replaced with
// the text that followed the tag. The (possibly extended) events slice is
// returned.
//
// If tag does not occur in the buffer, the buffer is left untouched and events
// is returned unchanged.
//
// TODO: think of a better name.
func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {
	before, after, found := strings.Cut(p.buffer.String(), tag)
	if !found {
		return events
	}

	if len(before) > 0 {
		events = append(events, qwenEventContent{content: before})
	}

	p.buffer.Reset()
	p.buffer.WriteString(after)
	return events
}
// findFirstTag returns whichever of tags occurs earliest in the parser's
// buffer. When two tags start at the same position, the one listed first in
// tags wins. It returns the empty string when none of the tags is present,
// i.e. when the buffer holds nothing but content.
func findFirstTag(p *Qwen3VLParser, tags []string) string {
	buffered := p.buffer.String()

	// Start past the last possible match position so any real hit is earlier.
	earliest, winner := len(buffered)+1, ""
	for i := range tags {
		pos := strings.Index(buffered, tags[i])
		if pos < 0 || pos >= earliest {
			continue
		}
		earliest, winner = pos, tags[i]
	}
	return winner
}
// eat performs one step of the tag state machine over the parser's buffer. It
// returns the events produced by that step and whether the caller should call
// eat again (true when a complete tag was consumed and the state changed).
//
// In CollectingContent it looks for whichever of the thinking and tool-call
// open tags comes first, emits the text before it as content, and switches to
// the matching collecting state. If no complete open tag is present, it emits
// all buffered text except a trailing run that could still turn out to be the
// start of an open tag; that run stays in the buffer.
//
// In CollectingToolContent and CollectingThinkingContent it waits for the
// corresponding close tag, emits everything before it as a single raw
// tool-call or thinking event, and returns to CollectingContent. Open tags
// seen while inside a section are not interpreted; they are part of the
// section's text.
//
// Empty content events are never emitted.
func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
	var events []qwenEvent

	switch p.state {
	case CollectingContent:
		// Several tags may be present; the one that appears first decides
		// which section we enter.
		firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag})
		if firstTag == thinkingOpenTag {
			events = emitContentBeforeTag(p, events, thinkingOpenTag)
			p.state = CollectingThinkingContent
			return events, true
		}
		if firstTag == toolOpenTag {
			events = emitContentBeforeTag(p, events, toolOpenTag)
			p.state = CollectingToolContent
			return events, true
		}

		// No complete open tag. Hold back a trailing partial tag (if any),
		// checking the thinking tag first and then the tool-call tag, and
		// emit everything in front of it.
		buffered := p.buffer.String()
		held := overlap(buffered, thinkingOpenTag)
		if held == 0 {
			held = overlap(buffered, toolOpenTag)
		}
		unambiguous := buffered[:len(buffered)-held]
		ambiguous := buffered[len(buffered)-held:]
		p.buffer.Reset()
		p.buffer.WriteString(ambiguous)
		if len(unambiguous) > 0 {
			events = append(events, qwenEventContent{content: unambiguous})
		}
		return events, false

	case CollectingToolContent:
		// Split at the first close tag only; anything after it is handled on
		// the next step.
		raw, after, found := strings.Cut(p.buffer.String(), toolCloseTag)
		if !found {
			return events, false
		}
		if len(raw) == 0 {
			slog.Warn("qwen tool call closing tag found but no content before it")
		}
		events = append(events, qwenEventRawToolCall{raw: raw})
		p.buffer.Reset()
		p.buffer.WriteString(after)
		p.state = CollectingContent
		return events, true

	case CollectingThinkingContent:
		thought, after, found := strings.Cut(p.buffer.String(), thinkingCloseTag)
		if !found {
			return events, false
		}
		if len(thought) == 0 {
			slog.Warn("qwen thinking closing tag found but no content before it")
		}
		events = append(events, qwenEventThinkingContent{content: thought})
		p.buffer.Reset()
		p.buffer.WriteString(after)
		p.state = CollectingContent
		return events, true

	default:
		// Every qwenParserState value this parser assigns is handled above.
		panic("unreachable")
	}
}
// parseJSONToolCall decodes the raw text of a tool call into an api.ToolCall.
//
// The expected JSON shape is {"name": "...", "arguments": { ... }}, i.e. the
// fields of the function itself rather than an object wrapping them, so the
// payload is decoded into the call's Function and then wrapped.
//
// It returns a zero api.ToolCall and a wrapped decoding error when raw is not
// valid JSON of that shape. tools is currently unused.
func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
	var fn api.ToolCallFunction
	if err := json.Unmarshal([]byte(raw.raw), &fn); err != nil {
		return api.ToolCall{}, fmt.Errorf("parsing qwen3vl tool call JSON: %w", err)
	}
	return api.ToolCall{Function: fn}, nil
}
// do we need to parse values

View File

@ -26,35 +26,162 @@ func TestQwen3VLParserStreaming(t *testing.T) {
steps []step
only bool
}{
// all of this is just thinking tests
{
desc: "with thinking",
desc: "simple thinking",
steps: []step{
{input: "<thinking>abc</thinking>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
},
},
{
desc: "thinking with split tags",
steps: []step{
{input: "<thinking>abc", wantEvents: []qwenEvent{}},
{input: "</thinking>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
},
},
{
desc: "thinking and tool call",
steps: []step{
{
input: "<thinking>I'm thinking</thinking><tool_call>I'm tool calling</tool_call>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking"},
qwenEventRawToolCall{raw: "I'm tool calling"},
},
},
},
},
{
desc: "thinking and content",
steps: []step{
{
input: "<thinking>I'm thinking</thinking>I'm content",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking"},
qwenEventContent{content: "I'm content"},
},
},
},
},
{
desc: "thinking and tool call and content",
},
{
desc: "nested thinking (outside thinking, inside thinking)",
steps: []step{
{
input: "<thinking>I'm thinking<thinking>I'm nested thinking</thinking></thinking>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking<thinking>I'm nested thinking"},
qwenEventContent{content: "</thinking>"},
},
},
},
},
{
desc: "interleaved thinking",
steps: []step{
{
input: "<thinking>I'm thinking<thinking></thinking>I'm actually content</thinking>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking<thinking>"},
qwenEventContent{content: "I'm actually content</thinking>"},
},
},
},
},
{
desc: "nested thinking and tool call (outside thinking, inside tool call)",
steps: []step{
{
input: "<thinking>I'm thinking<tool_call>I'm nested tool call</tool_call></thinking>",
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm nested tool call</tool_call>"}},
},
},
},
{
desc: "nested thinking and tool call (inside tool call, outside thinking)",
desc: "nested thinking and tool call (outside tool call, inside thinking)",
steps: []step{
{
input: "<tool_call>I'm nested tool call<thinking>I'm thinking</thinking></tool_call>",
wantEvents: []qwenEvent{qwenEventRawToolCall{raw: "I'm nested tool call<thinking>I'm thinking</thinking>"}},
},
},
},
{
desc: "interleaved thinking and tool call",
steps: []step{
{
input: "<thinking>I'm thinking<tool_call>I'm NOT a nested tool call</thinking></tool_call><tool_call>I'm nested tool call 2<thinking></tool_call></thinking>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm NOT a nested tool call"},
qwenEventContent{content: "</tool_call>"},
qwenEventRawToolCall{raw: "I'm nested tool call 2<thinking>"},
qwenEventContent{content: "</thinking>"},
},
},
},
},
{
desc: "partial thinking tag fakeout",
steps: []step{
{
input: "abc<thinking",
wantEvents: []qwenEvent{
qwenEventContent{content: "abc"},
},
},
{
input: " fakeout",
wantEvents: []qwenEvent{
qwenEventContent{content: "<thinking fakeout"},
},
},
},
},
{
desc: "partial thinking incomplete",
steps: []step{
{
input: "abc<thinking>unfinished</thinking", // when something is ambiguious, we dont emit anything
wantEvents: []qwenEvent{
qwenEventContent{content: "abc"},
},
},
},
},
}
anyOnlies := false
for _, tc := range cases {
if tc.only {
anyOnlies = true
}
}
for _, tc := range cases {
if anyOnlies && !tc.only {
continue
}
t.Run(tc.desc, func(t *testing.T) {
parser := Qwen3VLParser{}
for i, step := range tc.steps {
parser.buffer.WriteString(step.input)
gotEvents := parser.parseEvents()
if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
// avoid deep equal on empty vs. nil slices
continue
}
if !reflect.DeepEqual(gotEvents, step.wantEvents) {
t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
}
}
})
}
}
// TODO: Devin mentioned something about JSON not being able to figure out types — follow up.

351
model/renderers/qwen3vl.go Normal file
View File

@ -0,0 +1,351 @@
package renderers
import (
"encoding/json"
"fmt"
"strings"
"github.com/ollama/ollama/api"
)
// imageCount and videoCount are package-level counters that renderContent
// increments for each image / video item when it is asked to count vision
// items.
// NOTE(review): nothing in this file reads or resets them, and they are shared
// by every call in the process — open question from the author: where should
// they be set?
var imageCount int
var videoCount int
// so i think from the renders, do vision is false
// basic
// [-] with tools
// [] with multiple tools
// [-] with tool calling
// [ ] with multiple tool calling
// with images and videos
// marshalWithSpaces encodes v with json.Marshal and then inserts a single
// space after every ':' and ',' that sits outside a JSON string, producing
// output such as {"a": 1, "b": [2, 3]}. Characters inside string literals,
// including escaped quotes, are copied through untouched. Any json.Marshal
// error is returned as-is with a nil slice.
//
// TODO: is there a way to avoid hand-writing this? The tool dictionary list
// needs this slightly different spacing.
func marshalWithSpaces(v any) ([]byte, error) {
	compact, err := json.Marshal(v)
	if err != nil {
		return nil, err
	}

	spaced := make([]byte, 0, len(compact)+len(compact)/8)
	var insideString, escaped bool
	for i := range compact {
		ch := compact[i]
		spaced = append(spaced, ch)

		switch {
		case !insideString:
			// Structural position: only here do separators get padded.
			switch ch {
			case '"':
				insideString = true
			case ':', ',':
				spaced = append(spaced, ' ')
			}
		case escaped:
			// The character after a backslash never ends the string.
			escaped = false
		case ch == '\\':
			escaped = true
		case ch == '"':
			insideString = false
		}
	}
	return spaced, nil
}
// func pruneEmpty(v any) any {
// switch x := v.(type) {
// case map[string]any:
// out := make(map[string]any, len(x))
// for k, vv := range x {
// p := pruneEmpty(vv)
// switch pp := p.(type) {
// case nil:
// continue
// case string:
// if pp == "" {
// continue
// }
// case []any:
// if len(pp) == 0 {
// continue
// }
// case map[string]any:
// if len(pp) == 0 {
// continue
// }
// }
// out[k] = p
// }
// return out
// case []any:
// out := make([]any, 0, len(x))
// for _, vv := range x {
// p := pruneEmpty(vv)
// switch pp := p.(type) {
// case nil:
// continue
// case string:
// if pp == "" {
// continue
// }
// case []any:
// if len(pp) == 0 {
// continue
// }
// case map[string]any:
// if len(pp) == 0 {
// continue
// }
// }
// out = append(out, p)
// }
// return out
// default:
// return v
// }
// }
// func marshalWithSpaces(v any) ([]byte, error) {
// // 1) normalize to interface{} and prune empty fields
// var iv any
// b0, err := json.Marshal(v)
// if err != nil {
// return nil, err
// }
// if err := json.Unmarshal(b0, &iv); err != nil {
// return nil, err
// }
// iv = pruneEmpty(iv)
// // 2) compact marshal
// b, err := json.Marshal(iv)
// if err != nil {
// return nil, err
// }
// // 3) inject spaces after ':' and ',' outside strings
// out := make([]byte, 0, len(b)+len(b)/8)
// inStr, esc := false, false
// for _, c := range b {
// if inStr {
// out = append(out, c)
// if esc {
// esc = false
// continue
// }
// if c == '\\' {
// esc = true
// continue
// }
// if c == '"' {
// inStr = false
// }
// continue
// }
// switch c {
// case '"':
// inStr = true
// out = append(out, c)
// case ':':
// out = append(out, ':', ' ')
// case ',':
// out = append(out, ',', ' ')
// default:
// out = append(out, c)
// }
// }
// return out, nil
// }
// renderContent turns a message's content into prompt text.
//
// A string is returned unchanged. A []any is treated as a list of content
// parts, each expected to be a map[string]any, and rendered in order:
//   - a part with an "image" or "image_url" key, or with "type" == "image",
//     becomes the image placeholder "<|vision_start|><|image_pad|><|vision_end|>";
//   - otherwise a part with a "video" key, or with "type" == "video", becomes
//     the video placeholder "<|vision_start|><|video_pad|><|vision_end|>";
//   - otherwise a part with a string "text" value contributes that text.
//
// Parts that are not maps, or match none of the above, contribute nothing.
// Content of any other type renders as the empty string.
//
// When doVisionCount is true, the package-level imageCount / videoCount
// counters are incremented for each image / video part.
//
// NOTE(review): the counters are not yet used to emit "Picture N: " /
// "Video N: " prefixes; the author left that as an open question.
func renderContent(content any, doVisionCount bool) string {
	switch c := content.(type) {
	case string:
		return c
	case []any:
		var sb strings.Builder
		for _, item := range c {
			part, ok := item.(map[string]any)
			if !ok {
				continue
			}

			_, hasImage := part["image"]
			_, hasImageURL := part["image_url"]
			_, hasVideo := part["video"]

			switch {
			case hasImage || hasImageURL || part["type"] == "image":
				if doVisionCount {
					imageCount++
				}
				sb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
			case hasVideo || part["type"] == "video":
				if doVisionCount {
					videoCount++
				}
				sb.WriteString("<|vision_start|><|video_pad|><|vision_end|>")
			default:
				if text, ok := part["text"].(string); ok {
					sb.WriteString(text)
				}
			}
		}
		return sb.String()
	default:
		return ""
	}
}
// Qwen3VLRenderer renders a conversation and its available tools into the
// prompt text for Qwen3-VL, with turns delimited by <|im_start|> / <|im_end|>.
//
// The output is built in this order:
//  1. If tools are given, a system block containing the leading system
//     message (if any), the tool definitions as JSON inside <tools></tools>,
//     and the instructions for emitting <tool_call> blocks. Without tools, a
//     leading system message is rendered as an ordinary system block.
//  2. Every message: user and non-leading system messages verbatim; assistant
//     messages with an optional <think> section and any tool calls; runs of
//     consecutive tool messages merged into one user turn of <tool_response>
//     blocks.
//  3. An assistant turn opener followed by an opening <think> tag, so that
//     generation starts inside the thinking section.
//
// The third parameter is ignored. An error is returned only when a tool
// definition or a tool call's arguments cannot be encoded as JSON.
func Qwen3VLRenderer(messages []api.Message, tools []api.Tool, _ *api.ThinkValue) (string, error) {
	var sb strings.Builder

	// Tools section (also absorbs a leading system message).
	if len(tools) > 0 {
		sb.WriteString(imStartTag + "system\n")
		if len(messages) > 0 && messages[0].Role == "system" {
			sb.WriteString(messages[0].Content + "\n\n")
		}
		sb.WriteString("# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>")
		for _, tool := range tools {
			sb.WriteString("\n")
			b, err := marshalWithSpaces(tool)
			if err != nil {
				return "", fmt.Errorf("rendering tool definition: %w", err)
			}
			sb.Write(b) // JSON like {"a": 1, "b": 2}
		}
		sb.WriteString("\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n")
	} else if len(messages) > 0 && messages[0].Role == "system" {
		sb.WriteString("<|im_start|>system\n" + messages[0].Content + "<|im_end|>\n")
	}

	// Walk backwards to find the last genuine user query: the latest user
	// message that is not purely a <tool_response>...</tool_response> wrapper.
	// Assistant turns after it get an explicit <think> section below.
	multiStepTool := true
	lastQueryIndex := len(messages) - 1
	for i := len(messages) - 1; i >= 0; i-- {
		message := messages[i]
		if multiStepTool && message.Role == "user" {
			content := message.Content
			if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) {
				multiStepTool = false
				lastQueryIndex = i
			}
		}
	}

	for i, message := range messages {
		content := renderContent(message.Content, true)

		switch {
		case message.Role == "user" || (message.Role == "system" && i != 0):
			// A system message at index 0 was already rendered above.
			sb.WriteString("<|im_start|>" + message.Role + "\n" + content + "<|im_end|>\n")

		case message.Role == "assistant":
			// Reasoning comes from the Thinking field when set; otherwise it
			// is recovered from an inline <think>...</think> section, which is
			// then stripped from content. Mirrors the chat template:
			//   reasoning = content.split("</think>")[0].rstrip("\n").split("<think>")[-1].lstrip("\n")
			//   content   = content.split("</think>")[-1].lstrip("\n")
			contentReasoning := ""
			if message.Thinking != "" {
				contentReasoning = message.Thinking
			} else if strings.Contains(content, "</think>") {
				contentReasoning = strings.Split(content, "</think>")[0]
				contentReasoning = strings.TrimRight(contentReasoning, "\n")
				contentReasoningSplit := strings.Split(contentReasoning, "<think>")
				contentReasoning = contentReasoningSplit[len(contentReasoningSplit)-1]
				contentReasoning = strings.TrimLeft(contentReasoning, "\n")

				contentSplit := strings.Split(content, "</think>")
				content = contentSplit[len(contentSplit)-1]
				content = strings.TrimLeft(content, "\n")
			}

			// Only turns after the last user query carry a <think> section,
			// and only if they are the final message or have reasoning.
			if i > lastQueryIndex && (i == len(messages)-1 || contentReasoning != "") {
				sb.WriteString("<|im_start|>" + message.Role + "\n<think>\n" + strings.Trim(contentReasoning, "\n") + "\n</think>\n\n" + strings.TrimLeft(content, "\n"))
			} else {
				sb.WriteString("<|im_start|>" + message.Role + "\n" + content)
			}

			for j, toolCall := range message.ToolCalls {
				// Separate a tool call from preceding content or from the
				// previous tool call with a newline.
				if j > 0 || content != "" {
					sb.WriteString("\n")
				}
				sb.WriteString("<tool_call>\n{\"name\": \"" + toolCall.Function.Name + "\", \"arguments\": ")
				b, err := marshalWithSpaces(toolCall.Function.Arguments)
				if err != nil {
					return "", fmt.Errorf("rendering arguments of tool call %q: %w", toolCall.Function.Name, err)
				}
				sb.Write(b) // JSON like {"a": 1, "b": 2}
				sb.WriteString("}\n</tool_call>")
			}
			sb.WriteString("<|im_end|>\n")

		case message.Role == "tool":
			// Consecutive tool messages share a single user turn: open it on
			// the first of the run and close it on the last.
			if i == 0 || messages[i-1].Role != "tool" {
				sb.WriteString("<|im_start|>user")
			}
			sb.WriteString("\n<tool_response>\n" + message.Content + "\n</tool_response>")
			if i == len(messages)-1 || messages[i+1].Role != "tool" {
				sb.WriteString("<|im_end|>\n")
			}
		}
	}

	// Generation prompt.
	// NOTE(review): this is appended unconditionally, even when the last
	// message is an assistant turn — confirm whether prefill needs handling.
	sb.WriteString("<|im_start|>assistant\n<think>\n")

	return sb.String(), nil
}