ollama/model/parsers/qwen3vl.go

package parsers

import (
	"context"
	"fmt"
	"log/slog"
	"strings"

	"encoding/json"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/logutil"
)

// type parserState int

const (
	CollectingContent         qwenParserState = iota
	CollectingThinkingContent                 // this is because qwen3vl starts with <thinking>
	// parserState_CompletedThinkingContent
	CollectingToolContent
	// parserState_CompletedToolContent
)

const (
	thinkingOpenTag  = "<thinking>"
	thinkingCloseTag = "</thinking>"
)

type Qwen3VLParser struct {
	state  qwenParserState
	buffer strings.Builder
	tools  []api.Tool
}

func (p *Qwen3VLParser) HasToolSupport() bool {
	return true
}

func (p *Qwen3VLParser) HasThinkingSupport() bool {
	return true
}

func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
	p.tools = tools
	return tools // Qwen doesn't modify tools
	// does qwenvl modify tools?
}

// Add processes a chunk of string output from the model, accumulating it in the parser's buffer,
// and then parses any complete events (such as tool calls or content) that can be extracted from the buffer.
// It returns the parsed content (as a string), an empty string for "thinking" (since this parser does not support it),
// a slice of parsed tool calls, and an error if any occurred during parsing.
//
// Specifically, it works as follows:
//   1. Appends the new string chunk 's' to the internal accumulator.
//   2. Calls parseEvents() to extract any complete events (tool calls or content) from the buffer.
//   3. Iterates over the events:
//        - For tool call events, attempts to parse them into api.ToolCall objects and collects them.
//        - For content events, appends their content to a string builder.
//   4. Returns the accumulated content, an empty string for thinking, the collected tool calls, and any error encountered.

type qwenEventThinkingContent struct {
	content string
}

func (qwenEventThinkingContent) isQwenEvent() {}

func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
	// is s the complete content (aka the for sure unambiguous content)
	p.buffer.WriteString(s)
	// why do we write the entire string?

	events := p.parseEvents()
	// parse events:
	// - parses the entire content
	// parses an entire tool call
	// parses an entire thinking content

	var toolCalls []api.ToolCall
	var sb strings.Builder
	for _, event := range events {
		switch event := event.(type) {
		case qwenEventRawToolCall:
			toolCall, err := parseToolCall(event, p.tools)
			if err != nil {
				slog.Warn("qwen tool call parsing failed", "error", err)
				return "", "", nil, err
			}
			toolCalls = append(toolCalls, toolCall)
		case qwenEventThinkingContent: // maybe we only need one?
			print("unimplemented")
			// how exactly does thinking work?
		case qwenEventContent:
			// TODO(drifkin): if the same turn contains multiple interleaved content
			// events, we naively append them together here. See the note below about
			// `qwenEvent`s for more details
			sb.WriteString(event.content)
		}
	}

	return sb.String(), "", toolCalls, nil
}

func (p *Qwen3VLParser) parseEvents() []qwenEvent {
	var all []qwenEvent

	keepLooping := true
	for keepLooping {
		var events []qwenEvent
		events, keepLooping = p.eat()
		if len(events) > 0 {
			all = append(all, events...)
		}
	}

	if len(all) > 0 {
		slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "buffer", p.buffer.String())
	}

	return all
}

// type qwenEventRawToolCall struct {
// 	raw string
// }

// type qwenEventContent struct {
// 	content string
// }

// think if a better name
func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {
	split := strings.SplitN(p.buffer.String(), tag, 2) // what is his 2 for?
	before := split[0]                                 // before the tag
	// before = strings.TrimRightFunc(before, unicode.IsSpace) // trim all the space after the bfire
	if len(before) > 0 {
		events = append(events, qwenEventContent{content: before})
	}
	after := split[1]
	p.buffer.Reset()
	p.buffer.WriteString(after)
	return events
}

// overlap = ambiguous

// findFirstTag returns the tag that appears first in the buffer among the provided tags.
// If no tag is found, it returns an empty string.
func findFirstTag(p *Qwen3VLParser, tags []string) string {
	minIdx := -1
	var firstTag string
	for _, tag := range tags {
		idx := strings.Index(p.buffer.String(), tag)
		if idx != -1 && (minIdx == -1 || idx < minIdx) {
			minIdx = idx
			firstTag = tag
		}
	}
	if minIdx == -1 { // just content
		return ""
	}
	return firstTag // there is a possibility that there is no tag, can you return nil for that?
}

func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
	var events []qwenEvent

	// certain events:
	// - thinking opening tag
	// - tool opening tag

	// since there is multiple tags, we need to think about which tag comes first
	// we also need to create a list for
	firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag})

	switch p.state {
	case CollectingContent: // we  can only look for thinking content if we're collecting content

		// if strings.Contains(p.buffer.String(), thinkingOpenTag) { // found thinking
		if firstTag == thinkingOpenTag {
			// string contains the openThinkingTag, we move it to the CollectingThinkingContent state
			events = emitContentBeforeTag(p, events, thinkingOpenTag)
			p.state = CollectingThinkingContent // <found a thinking>
			return events, true
			// } else if strings.Contains(p.buffer.String(), toolOpenTag) { // found tool call
		} else if firstTag == toolOpenTag {
			events = emitContentBeforeTag(p, events, toolOpenTag)
			p.state = CollectingToolContent // found a <tool_call>
			return events, true
		} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 { // found a partial thinking tag
			// it is only possible that they find 1
			// found a partial think tag, emit the unambiguous before the partial tool call
			// hello </think -> hello, so ambiguous start includes all the whitespace before the tag
			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
			ambiguousStart := len(beforePartialTag)
			// HAVENT ADDED TRAILING WHITESPACE YET...
			unambiguous := p.buffer.String()[:ambiguousStart]
			ambiguous := p.buffer.String()[ambiguousStart:]
			p.buffer.Reset()
			p.buffer.WriteString(ambiguous)
			events = append(events, qwenEventContent{content: unambiguous})
			return events, false
		} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { // found a partial tool call tag
			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
			ambiguousStart := len(beforePartialTag)

			unambiguous := p.buffer.String()[:ambiguousStart]
			ambiguous := p.buffer.String()[ambiguousStart:]
			p.buffer.Reset()
			p.buffer.WriteString(ambiguous)
			events = append(events, qwenEventContent{content: unambiguous})
			return events, false
		} else { // no partial or full thinking or tool call tag found
			// whitespaceLen := trailingWhitespaceLen(p.buffer.String()) <- all the trailing space we consider ambiguous
			ambiguousStart := len(p.buffer.String()) // - whitespaceLen
			unambiguous := p.buffer.String()[:ambiguousStart]
			ambiguous := p.buffer.String()[ambiguousStart:]
			p.buffer.Reset()
			p.buffer.WriteString(ambiguous)
			if len(unambiguous) > 0 {
				events = append(events, qwenEventContent{content: unambiguous})
			}
			return events, false
		}
	case CollectingToolContent: // we only move towards the CollectingContent state
		if strings.Contains(p.buffer.String(), toolCloseTag) {
			split := strings.SplitN(p.buffer.String(), toolCloseTag, 2) // this one splits by the first one
			before := split[0]
			if len(before) == 0 {
				slog.Warn("qwen tool call closing tag found but no content before it")
			}
			after := split[1]                                          // no whit space yet
			events = append(events, qwenEventRawToolCall{raw: before}) // do these need to be "seperated"?
			p.buffer.Reset()
			p.buffer.WriteString(after)
			p.state = CollectingContent
			return events, true
		} else {
			return events, false
		}
	case CollectingThinkingContent:
		if strings.Contains(p.buffer.String(), thinkingCloseTag) {
			split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2)
			// so it looks like before contains the open tag
			fmt.Println("split", split)
			before := split[0]
			if len(before) == 0 {
				slog.Warn("qwen tool call closing tag found but no content before it")
			}
			after := split[1] // no whit space yet
			events = append(events, qwenEventThinkingContent{content: before})
			p.buffer.Reset()
			p.buffer.WriteString(after)
			p.state = CollectingContent
			return events, true
		} else {
			return events, false
		}
	default:
		panic("unreachable")
	}
}

func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
	// Expected JSON shape: {"name": "...", "arguments": { ... }}
	// var in struct {
	// 	Name      string          `json:"name"`
	// 	Arguments json.RawMessage `json:"arguments"`
	// }
	fmt.Println(raw.raw)

	var toolCall api.ToolCall
	if err := json.Unmarshal([]byte(raw.raw), &toolCall); err != nil {
		return api.ToolCall{}, err
	}

	// args := make(api.ToolCallFunctionArguments)
	// 	if len(in.Arguments) > 0 && string(in.Arguments) != "null" {
	// 	var obj map[string]any
	// 	if err := json.Unmarshal(in.Arguments, &obj); err == nil {
	// 		for k, v := range obj {
	// 			args[k] = v
	// 		}
	// 	}
	// }
	fmt.Println(toolCall)
	return toolCall, nil
}

// do we need to parse values
working tests, changed code to find the first open tag 2025-10-04 02:38:30 +08:00			`package parsers`

			`import (`
			`"context"`
			`"fmt"`
			`"log/slog"`
			`"strings"`

			`"encoding/json"`

			`"github.com/ollama/ollama/api"`
			`"github.com/ollama/ollama/logutil"`
			`)`

			`// type parserState int`

			`const (`
			`CollectingContent qwenParserState = iota`
			`CollectingThinkingContent // this is because qwen3vl starts with <thinking>`
			`// parserState_CompletedThinkingContent`
			`CollectingToolContent`
			`// parserState_CompletedToolContent`
			`)`

			`const (`
			`thinkingOpenTag = "<thinking>"`
			`thinkingCloseTag = "</thinking>"`
			`)`

			`type Qwen3VLParser struct {`
			`state qwenParserState`
			`buffer strings.Builder`
			`tools []api.Tool`
			`}`

			`func (p *Qwen3VLParser) HasToolSupport() bool {`
			`return true`
			`}`

			`func (p *Qwen3VLParser) HasThinkingSupport() bool {`
			`return true`
			`}`

			`func (p Qwen3VLParser) Init(tools []api.Tool, lastMessage api.Message) []api.Tool {`
			`p.tools = tools`
			`return tools // Qwen doesn't modify tools`
			`// does qwenvl modify tools?`
			`}`

			`// Add processes a chunk of string output from the model, accumulating it in the parser's buffer,`
			`// and then parses any complete events (such as tool calls or content) that can be extracted from the buffer.`
			`// It returns the parsed content (as a string), an empty string for "thinking" (since this parser does not support it),`
			`// a slice of parsed tool calls, and an error if any occurred during parsing.`
			`//`
			`// Specifically, it works as follows:`
			`// 1. Appends the new string chunk 's' to the internal accumulator.`
			`// 2. Calls parseEvents() to extract any complete events (tool calls or content) from the buffer.`
			`// 3. Iterates over the events:`
			`// - For tool call events, attempts to parse them into api.ToolCall objects and collects them.`
			`// - For content events, appends their content to a string builder.`
			`// 4. Returns the accumulated content, an empty string for thinking, the collected tool calls, and any error encountered.`

			`type qwenEventThinkingContent struct {`
			`content string`
			`}`

			`func (qwenEventThinkingContent) isQwenEvent() {}`

			`func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {`
			`// is s the complete content (aka the for sure unambiguous content)`
			`p.buffer.WriteString(s)`
			`// why do we write the entire string?`

			`events := p.parseEvents()`
			`// parse events:`
			`// - parses the entire content`
			`// parses an entire tool call`
			`// parses an entire thinking content`

			`var toolCalls []api.ToolCall`
			`var sb strings.Builder`
			`for _, event := range events {`
			`switch event := event.(type) {`
			`case qwenEventRawToolCall:`
			`toolCall, err := parseToolCall(event, p.tools)`
			`if err != nil {`
			`slog.Warn("qwen tool call parsing failed", "error", err)`
			`return "", "", nil, err`
			`}`
			`toolCalls = append(toolCalls, toolCall)`
			`case qwenEventThinkingContent: // maybe we only need one?`
			`print("unimplemented")`
			`// how exactly does thinking work?`
			`case qwenEventContent:`
			`// TODO(drifkin): if the same turn contains multiple interleaved content`
			`// events, we naively append them together here. See the note below about`
			// `qwenEvent`s for more details
			`sb.WriteString(event.content)`
			`}`
			`}`

			`return sb.String(), "", toolCalls, nil`
			`}`

			`func (p *Qwen3VLParser) parseEvents() []qwenEvent {`
			`var all []qwenEvent`

			`keepLooping := true`
			`for keepLooping {`
			`var events []qwenEvent`
			`events, keepLooping = p.eat()`
			`if len(events) > 0 {`
			`all = append(all, events...)`
			`}`
			`}`

			`if len(all) > 0 {`
			`slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "buffer", p.buffer.String())`
			`}`

			`return all`
			`}`

			`// type qwenEventRawToolCall struct {`
			`// raw string`
			`// }`

			`// type qwenEventContent struct {`
			`// content string`
			`// }`

			`// think if a better name`
			`func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {`
			`split := strings.SplitN(p.buffer.String(), tag, 2) // what is his 2 for?`
			`before := split[0] // before the tag`
			`// before = strings.TrimRightFunc(before, unicode.IsSpace) // trim all the space after the bfire`
			`if len(before) > 0 {`
			`events = append(events, qwenEventContent{content: before})`
			`}`
			`after := split[1]`
			`p.buffer.Reset()`
			`p.buffer.WriteString(after)`
			`return events`
			`}`

			`// overlap = ambiguous`

			`// findFirstTag returns the tag that appears first in the buffer among the provided tags.`
			`// If no tag is found, it returns an empty string.`
			`func findFirstTag(p *Qwen3VLParser, tags []string) string {`
			`minIdx := -1`
			`var firstTag string`
			`for _, tag := range tags {`
			`idx := strings.Index(p.buffer.String(), tag)`
			`if idx != -1 && (minIdx == -1 \|\| idx < minIdx) {`
			`minIdx = idx`
			`firstTag = tag`
			`}`
			`}`
			`if minIdx == -1 { // just content`
			`return ""`
			`}`
			`return firstTag // there is a possibility that there is no tag, can you return nil for that?`
			`}`

			`func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {`
			`var events []qwenEvent`

			`// certain events:`
			`// - thinking opening tag`
			`// - tool opening tag`

			`// since there is multiple tags, we need to think about which tag comes first`
			`// we also need to create a list for`
			`firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag})`

			`switch p.state {`
			`case CollectingContent: // we can only look for thinking content if we're collecting content`

			`// if strings.Contains(p.buffer.String(), thinkingOpenTag) { // found thinking`
			`if firstTag == thinkingOpenTag {`
			`// string contains the openThinkingTag, we move it to the CollectingThinkingContent state`
			`events = emitContentBeforeTag(p, events, thinkingOpenTag)`
			`p.state = CollectingThinkingContent // <found a thinking>`
			`return events, true`
			`// } else if strings.Contains(p.buffer.String(), toolOpenTag) { // found tool call`
			`} else if firstTag == toolOpenTag {`
			`events = emitContentBeforeTag(p, events, toolOpenTag)`
			`p.state = CollectingToolContent // found a <tool_call>`
			`return events, true`
			`} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 { // found a partial thinking tag`
			`// it is only possible that they find 1`
			`// found a partial think tag, emit the unambiguous before the partial tool call`
			`// hello </think -> hello, so ambiguous start includes all the whitespace before the tag`
			`beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]`
			`ambiguousStart := len(beforePartialTag)`
			`// HAVENT ADDED TRAILING WHITESPACE YET...`
			`unambiguous := p.buffer.String()[:ambiguousStart]`
			`ambiguous := p.buffer.String()[ambiguousStart:]`
			`p.buffer.Reset()`
			`p.buffer.WriteString(ambiguous)`
			`events = append(events, qwenEventContent{content: unambiguous})`
			`return events, false`
			`} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { // found a partial tool call tag`
			`beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]`
			`ambiguousStart := len(beforePartialTag)`

			`unambiguous := p.buffer.String()[:ambiguousStart]`
			`ambiguous := p.buffer.String()[ambiguousStart:]`
			`p.buffer.Reset()`
			`p.buffer.WriteString(ambiguous)`
			`events = append(events, qwenEventContent{content: unambiguous})`
			`return events, false`
			`} else { // no partial or full thinking or tool call tag found`
			`// whitespaceLen := trailingWhitespaceLen(p.buffer.String()) <- all the trailing space we consider ambiguous`
			`ambiguousStart := len(p.buffer.String()) // - whitespaceLen`
			`unambiguous := p.buffer.String()[:ambiguousStart]`
			`ambiguous := p.buffer.String()[ambiguousStart:]`
			`p.buffer.Reset()`
			`p.buffer.WriteString(ambiguous)`
			`if len(unambiguous) > 0 {`
			`events = append(events, qwenEventContent{content: unambiguous})`
			`}`
			`return events, false`
			`}`
			`case CollectingToolContent: // we only move towards the CollectingContent state`
			`if strings.Contains(p.buffer.String(), toolCloseTag) {`
			`split := strings.SplitN(p.buffer.String(), toolCloseTag, 2) // this one splits by the first one`
			`before := split[0]`
			`if len(before) == 0 {`
			`slog.Warn("qwen tool call closing tag found but no content before it")`
			`}`
			`after := split[1] // no whit space yet`
			`events = append(events, qwenEventRawToolCall{raw: before}) // do these need to be "seperated"?`
			`p.buffer.Reset()`
			`p.buffer.WriteString(after)`
			`p.state = CollectingContent`
			`return events, true`
			`} else {`
			`return events, false`
			`}`
			`case CollectingThinkingContent:`
			`if strings.Contains(p.buffer.String(), thinkingCloseTag) {`
			`split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2)`
			`// so it looks like before contains the open tag`
			`fmt.Println("split", split)`
			`before := split[0]`
			`if len(before) == 0 {`
			`slog.Warn("qwen tool call closing tag found but no content before it")`
			`}`
			`after := split[1] // no whit space yet`
			`events = append(events, qwenEventThinkingContent{content: before})`
			`p.buffer.Reset()`
			`p.buffer.WriteString(after)`
			`p.state = CollectingContent`
			`return events, true`
			`} else {`
			`return events, false`
			`}`
			`default:`
			`panic("unreachable")`
			`}`
			`}`

			`func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {`
			`// Expected JSON shape: {"name": "...", "arguments": { ... }}`
			`// var in struct {`
			// Name string `json:"name"`
			// Arguments json.RawMessage `json:"arguments"`
			`// }`
			`fmt.Println(raw.raw)`

			`var toolCall api.ToolCall`
			`if err := json.Unmarshal([]byte(raw.raw), &toolCall); err != nil {`
			`return api.ToolCall{}, err`
			`}`

			`// args := make(api.ToolCallFunctionArguments)`
			`// if len(in.Arguments) > 0 && string(in.Arguments) != "null" {`
			`// var obj map[string]any`
			`// if err := json.Unmarshal(in.Arguments, &obj); err == nil {`
			`// for k, v := range obj {`
			`// args[k] = v`
			`// }`
			`// }`
			`// }`
			`fmt.Println(toolCall)`
			`return toolCall, nil`
			`}`

			`// do we need to parse values`