ollama/model/parsers/qwen3vl.go

292 lines
9.5 KiB
Go

package parsers
import (
"context"
"fmt"
"log/slog"
"strings"
"encoding/json"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/logutil"
)
// type parserState int
// Parsing states for Qwen3VLParser. The values are of type qwenParserState,
// which is declared outside this section of the file.
const (
	// CollectingContent is the zero-valued state: the parser emits plain
	// content while watching for a thinking or tool-call opening tag.
	CollectingContent qwenParserState = iota

	// CollectingThinkingContent is the state after thinkingOpenTag has been
	// seen and before thinkingCloseTag arrives.
	// NOTE(review): the original note here said qwen3vl output starts with
	// <thinking>, yet the parser's zero state is CollectingContent — confirm
	// which starting state is intended.
	CollectingThinkingContent

	// CollectingToolContent is the state after the tool-call opening tag has
	// been seen and before its closing tag arrives.
	CollectingToolContent
)
// Delimiters that wrap the model's thinking section in the raw output stream.
// NOTE(review): Qwen3 chat templates commonly use <think>/</think>; confirm
// these literals match what the model actually emits.
const (
	// thinkingOpenTag marks the start of a thinking section.
	thinkingOpenTag = "<thinking>"
	// thinkingCloseTag marks the end of a thinking section.
	thinkingCloseTag = "</thinking>"
)
// Qwen3VLParser incrementally splits streamed model output into plain
// content, thinking content and raw tool calls. Its zero value starts in the
// CollectingContent state with an empty buffer.
type Qwen3VLParser struct {
	// state is the current parsing mode; it decides which tags eat looks for.
	state qwenParserState
	// buffer holds output that has been received but not yet emitted as events.
	buffer strings.Builder
	// tools is the tool list handed to Init and forwarded to tool-call parsing.
	tools []api.Tool
}
// HasToolSupport reports whether this parser extracts tool calls from model
// output. It always returns true.
func (p *Qwen3VLParser) HasToolSupport() bool {
	return true
}
// HasThinkingSupport reports whether this parser recognizes thinking sections
// in model output. It always returns true.
func (p *Qwen3VLParser) HasThinkingSupport() bool {
	return true
}
// Init stores the tools available for the upcoming turn on the parser and
// returns the same slice unchanged. lastMessage is accepted but not used.
//
// TODO: confirm whether Qwen3-VL ever needs to rewrite the tool list; today it
// is passed through as is.
func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
	p.tools = tools
	return p.tools
}
// qwenEventThinkingContent is a parser event carrying the text collected
// between thinkingOpenTag and thinkingCloseTag.
type qwenEventThinkingContent struct {
	content string
}

// isQwenEvent marks qwenEventThinkingContent as a qwenEvent so it can travel
// in the same event slice as content and tool-call events.
func (qwenEventThinkingContent) isQwenEvent() {}

// Add feeds one chunk of model output into the parser and returns whatever
// can be emitted so far.
//
// It works as follows:
//  1. Appends the chunk s to the internal buffer.
//  2. Calls parseEvents to extract every event the buffer can currently yield.
//  3. Walks the events: raw tool calls are parsed into api.ToolCall values,
//     content events are concatenated into content, and thinking events are
//     concatenated into thinking.
//
// The done flag is accepted but not used. If a tool call fails to parse, the
// failure is logged and returned, and all other results of this call are
// discarded.
func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
	p.buffer.WriteString(s)
	events := p.parseEvents()

	var (
		toolCalls  []api.ToolCall
		contentSb  strings.Builder
		thinkingSb strings.Builder
	)
	for _, event := range events {
		switch event := event.(type) {
		case qwenEventRawToolCall:
			// NOTE(review): this file also defines parseJSONToolCall; confirm
			// which of the two parsers matches the tool-call format this model emits.
			toolCall, parseErr := parseToolCall(event, p.tools)
			if parseErr != nil {
				slog.Warn("qwen tool call parsing failed", "error", parseErr)
				return "", "", nil, parseErr
			}
			toolCalls = append(toolCalls, toolCall)
		case qwenEventThinkingContent:
			// Several thinking events in one call are joined in arrival order.
			thinkingSb.WriteString(event.content)
		case qwenEventContent:
			// TODO(drifkin): if the same turn contains multiple interleaved content
			// events, we naively append them together here.
			contentSb.WriteString(event.content)
		}
	}

	return contentSb.String(), thinkingSb.String(), toolCalls, nil
}
// parseEvents drains the buffer by calling eat repeatedly until eat reports
// that no further progress is possible, and returns every event produced along
// the way in order. When at least one event was produced, the events, the
// resulting state and the remaining buffer are logged at trace level.
func (p *Qwen3VLParser) parseEvents() []qwenEvent {
	var collected []qwenEvent

	for {
		batch, more := p.eat()
		// Appending an empty batch leaves collected untouched (nil stays nil).
		collected = append(collected, batch...)
		if !more {
			break
		}
	}

	if len(collected) == 0 {
		return collected
	}

	slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", collected, "state", p.state, "buffer", p.buffer.String())
	return collected
}
// type qwenEventRawToolCall struct {
// raw string
// }
// type qwenEventContent struct {
// content string
// }
// emitContentBeforeTag splits the parser's buffer at the first occurrence of
// tag. Non-empty text before the tag is appended to events as a
// qwenEventContent, the tag itself is dropped, and the buffer is reset to hold
// only the text that followed the tag. Whitespace before the tag is not
// trimmed.
//
// If tag does not occur in the buffer, events is returned unchanged and the
// buffer is left untouched instead of panicking on a missing split half.
//
// TODO: consider a better name for this helper.
func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {
	before, after, found := strings.Cut(p.buffer.String(), tag)
	if !found {
		return events
	}

	if len(before) > 0 {
		events = append(events, qwenEventContent{content: before})
	}

	p.buffer.Reset()
	p.buffer.WriteString(after)
	return events
}
// overlap = ambiguous
// findFirstTag reports which of the given tags occurs earliest in the parser's
// buffer. When two tags start at the same position, the one listed first in
// tags wins. It returns the empty string when none of the tags is present.
func findFirstTag(p *Qwen3VLParser, tags []string) string {
	buffered := p.buffer.String()

	earliest, winner := -1, ""
	for _, candidate := range tags {
		pos := strings.Index(buffered, candidate)
		if pos == -1 {
			continue
		}
		if earliest == -1 || pos < earliest {
			earliest, winner = pos, candidate
		}
	}

	return winner
}
// eat performs a single parsing step on the buffered output, based on the
// parser's current state, and returns the events that step produced together
// with a flag telling the caller whether another step may make progress.
//
//   - CollectingContent: if a complete thinking or tool-call opening tag is
//     present, the text before the earliest one is emitted as content, the
//     state switches accordingly and true is returned. Otherwise everything
//     except a trailing partial opening tag is emitted as content, the partial
//     tag (if any) stays buffered, and false is returned.
//   - CollectingToolContent: once the tool-call closing tag is buffered, the
//     text before it is emitted as a raw tool call and the state returns to
//     CollectingContent.
//   - CollectingThinkingContent: once thinkingCloseTag is buffered, the text
//     before it is emitted as thinking content and the state returns to
//     CollectingContent.
//
// In the two collecting states nothing is emitted until the closing tag has
// fully arrived. An unknown state panics, as it indicates a programming error.
func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
	var events []qwenEvent

	switch p.state {
	case CollectingContent:
		// Several opening tags may be buffered at once; handle whichever
		// appears first so content and sections keep their original order.
		switch findFirstTag(p, []string{thinkingOpenTag, toolOpenTag}) {
		case thinkingOpenTag:
			events = emitContentBeforeTag(p, events, thinkingOpenTag)
			p.state = CollectingThinkingContent
			return events, true
		case toolOpenTag:
			events = emitContentBeforeTag(p, events, toolOpenTag)
			p.state = CollectingToolContent
			return events, true
		}

		// No complete tag. A suffix of the buffer may still be the beginning of
		// one (e.g. "hello <thin"), so hold back the longest such suffix and
		// emit only the part that is unambiguously content.
		// TODO: trailing whitespace before a partial tag is not held back yet.
		buffered := p.buffer.String()
		held := max(overlap(buffered, thinkingOpenTag), overlap(buffered, toolOpenTag))
		ambiguousStart := len(buffered) - held
		unambiguous := buffered[:ambiguousStart]
		ambiguous := buffered[ambiguousStart:]

		p.buffer.Reset()
		p.buffer.WriteString(ambiguous)
		if len(unambiguous) > 0 {
			events = append(events, qwenEventContent{content: unambiguous})
		}
		return events, false

	case CollectingToolContent:
		raw, after, found := strings.Cut(p.buffer.String(), toolCloseTag)
		if !found {
			// Keep buffering until the closing tag has fully arrived.
			return events, false
		}
		if len(raw) == 0 {
			slog.Warn("qwen tool call closing tag found but no content before it")
		}
		events = append(events, qwenEventRawToolCall{raw: raw})
		p.buffer.Reset()
		p.buffer.WriteString(after)
		p.state = CollectingContent
		return events, true

	case CollectingThinkingContent:
		thought, after, found := strings.Cut(p.buffer.String(), thinkingCloseTag)
		if !found {
			// Keep buffering until the closing tag has fully arrived.
			return events, false
		}
		if len(thought) == 0 {
			slog.Warn("qwen thinking closing tag found but no content before it")
		}
		events = append(events, qwenEventThinkingContent{content: thought})
		p.buffer.Reset()
		p.buffer.WriteString(after)
		p.state = CollectingContent
		return events, true

	default:
		panic("unreachable")
	}
}
// parseJSONToolCall decodes the raw text of a tool call into an api.ToolCall.
//
// The expected payload is a JSON object of the form
//
//	{"name": "...", "arguments": { ... }}
//
// Those keys describe the function being called, so the payload is decoded
// into an api.ToolCallFunction and then placed in the Function field of the
// returned call. Decoding straight into api.ToolCall would look for a nested
// "function" key and leave the result empty for this payload shape.
//
// The tools parameter is accepted but not used. On malformed JSON a wrapped
// error is returned together with a zero api.ToolCall.
func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
	var fn api.ToolCallFunction
	if err := json.Unmarshal([]byte(raw.raw), &fn); err != nil {
		return api.ToolCall{}, fmt.Errorf("parsing qwen tool call JSON: %w", err)
	}

	return api.ToolCall{Function: fn}, nil
}
// do we need to parse values