mirror of https://github.com/ollama/ollama.git
				
				
				
			
		
			
	
	
		
			292 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Go
		
	
	
	
		
		
			
		
	
	
			292 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Go
		
	
	
	
|  | package parsers | ||
|  | 
 | ||
|  | import ( | ||
|  | 	"context" | ||
|  | 	"fmt" | ||
|  | 	"log/slog" | ||
|  | 	"strings" | ||
|  | 
 | ||
|  | 	"encoding/json" | ||
|  | 
 | ||
|  | 	"github.com/ollama/ollama/api" | ||
|  | 	"github.com/ollama/ollama/logutil" | ||
|  | ) | ||
|  | 
 | ||
|  | // type parserState int
 | ||
|  | 
 | ||
|  | const ( | ||
|  | 	CollectingContent         qwenParserState = iota | ||
|  | 	CollectingThinkingContent                 // this is because qwen3vl starts with <thinking>
 | ||
|  | 	// parserState_CompletedThinkingContent
 | ||
|  | 	CollectingToolContent | ||
|  | 	// parserState_CompletedToolContent
 | ||
|  | ) | ||
|  | 
 | ||
// Delimiters that bracket the model's reasoning text in the output stream.
const (
	// thinkingOpenTag starts a thinking section.
	thinkingOpenTag = "<thinking>"
	// thinkingCloseTag ends a thinking section.
	thinkingCloseTag = "</thinking>"
)
|  | 
 | ||
|  | type Qwen3VLParser struct { | ||
|  | 	state  qwenParserState | ||
|  | 	buffer strings.Builder | ||
|  | 	tools  []api.Tool | ||
|  | } | ||
|  | 
 | ||
|  | func (p *Qwen3VLParser) HasToolSupport() bool { | ||
|  | 	return true | ||
|  | } | ||
|  | 
 | ||
|  | func (p *Qwen3VLParser) HasThinkingSupport() bool { | ||
|  | 	return true | ||
|  | } | ||
|  | 
 | ||
|  | func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool { | ||
|  | 	p.tools = tools | ||
|  | 	return tools // Qwen doesn't modify tools
 | ||
|  | 	// does qwenvl modify tools?
 | ||
|  | } | ||
|  | 
 | ||
// Add processes a chunk of string output from the model, accumulating it in the parser's buffer,
// and then parses any complete events (tool calls, thinking, or content) that can be extracted from the buffer.
// It returns the parsed content (as a string), the thinking text (empty when no thinking content is forwarded),
// a slice of parsed tool calls, and an error if any occurred during parsing.
//
// Specifically, it works as follows:
//   1. Appends the new string chunk 's' to the internal buffer.
//   2. Calls parseEvents() to extract any complete events (tool calls, thinking, or content) from the buffer.
//   3. Iterates over the events:
//        - For tool call events, attempts to parse them into api.ToolCall objects and collects them.
//        - For content events, appends their content to a string builder.
//   4. Returns the accumulated content, the thinking text, the collected tool calls, and any error encountered.
//
// NOTE: this comment documents the Add method, which is declared after qwenEventThinkingContent.
 | ||
|  | 
 | ||
// qwenEventThinkingContent is the parser event carrying the text found
// between a thinking open tag and its matching close tag.
type qwenEventThinkingContent struct {
	// content is the thinking text with the surrounding tags removed.
	content string
}
|  | 
 | ||
|  | func (qwenEventThinkingContent) isQwenEvent() {} | ||
|  | 
 | ||
|  | func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) { | ||
|  | 	// is s the complete content (aka the for sure unambiguous content)
 | ||
|  | 	p.buffer.WriteString(s) | ||
|  | 	// why do we write the entire string?
 | ||
|  | 
 | ||
|  | 	events := p.parseEvents() | ||
|  | 	// parse events:
 | ||
|  | 	// - parses the entire content
 | ||
|  | 	// parses an entire tool call
 | ||
|  | 	// parses an entire thinking content
 | ||
|  | 
 | ||
|  | 	var toolCalls []api.ToolCall | ||
|  | 	var sb strings.Builder | ||
|  | 	for _, event := range events { | ||
|  | 		switch event := event.(type) { | ||
|  | 		case qwenEventRawToolCall: | ||
|  | 			toolCall, err := parseToolCall(event, p.tools) | ||
|  | 			if err != nil { | ||
|  | 				slog.Warn("qwen tool call parsing failed", "error", err) | ||
|  | 				return "", "", nil, err | ||
|  | 			} | ||
|  | 			toolCalls = append(toolCalls, toolCall) | ||
|  | 		case qwenEventThinkingContent: // maybe we only need one?
 | ||
|  | 			print("unimplemented") | ||
|  | 			// how exactly does thinking work?
 | ||
|  | 		case qwenEventContent: | ||
|  | 			// TODO(drifkin): if the same turn contains multiple interleaved content
 | ||
|  | 			// events, we naively append them together here. See the note below about
 | ||
|  | 			// `qwenEvent`s for more details
 | ||
|  | 			sb.WriteString(event.content) | ||
|  | 		} | ||
|  | 	} | ||
|  | 
 | ||
|  | 	return sb.String(), "", toolCalls, nil | ||
|  | } | ||
|  | 
 | ||
|  | func (p *Qwen3VLParser) parseEvents() []qwenEvent { | ||
|  | 	var all []qwenEvent | ||
|  | 
 | ||
|  | 	keepLooping := true | ||
|  | 	for keepLooping { | ||
|  | 		var events []qwenEvent | ||
|  | 		events, keepLooping = p.eat() | ||
|  | 		if len(events) > 0 { | ||
|  | 			all = append(all, events...) | ||
|  | 		} | ||
|  | 	} | ||
|  | 
 | ||
|  | 	if len(all) > 0 { | ||
|  | 		slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "buffer", p.buffer.String()) | ||
|  | 	} | ||
|  | 
 | ||
|  | 	return all | ||
|  | } | ||
|  | 
 | ||
|  | // type qwenEventRawToolCall struct {
 | ||
|  | // 	raw string
 | ||
|  | // }
 | ||
|  | 
 | ||
|  | // type qwenEventContent struct {
 | ||
|  | // 	content string
 | ||
|  | // }
 | ||
|  | 
 | ||
// TODO: think of a better name
 | ||
|  | func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent { | ||
|  | 	split := strings.SplitN(p.buffer.String(), tag, 2) // what is his 2 for?
 | ||
|  | 	before := split[0]                                 // before the tag
 | ||
|  | 	// before = strings.TrimRightFunc(before, unicode.IsSpace) // trim all the space after the bfire
 | ||
|  | 	if len(before) > 0 { | ||
|  | 		events = append(events, qwenEventContent{content: before}) | ||
|  | 	} | ||
|  | 	after := split[1] | ||
|  | 	p.buffer.Reset() | ||
|  | 	p.buffer.WriteString(after) | ||
|  | 	return events | ||
|  | } | ||
|  | 
 | ||
|  | // overlap = ambiguous
 | ||
|  | 
 | ||
|  | // findFirstTag returns the tag that appears first in the buffer among the provided tags.
 | ||
|  | // If no tag is found, it returns an empty string.
 | ||
|  | func findFirstTag(p *Qwen3VLParser, tags []string) string { | ||
|  | 	minIdx := -1 | ||
|  | 	var firstTag string | ||
|  | 	for _, tag := range tags { | ||
|  | 		idx := strings.Index(p.buffer.String(), tag) | ||
|  | 		if idx != -1 && (minIdx == -1 || idx < minIdx) { | ||
|  | 			minIdx = idx | ||
|  | 			firstTag = tag | ||
|  | 		} | ||
|  | 	} | ||
|  | 	if minIdx == -1 { // just content
 | ||
|  | 		return "" | ||
|  | 	} | ||
|  | 	return firstTag // there is a possibility that there is no tag, can you return nil for that?
 | ||
|  | } | ||
|  | 
 | ||
|  | func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { | ||
|  | 	var events []qwenEvent | ||
|  | 
 | ||
|  | 	// certain events:
 | ||
|  | 	// - thinking opening tag
 | ||
|  | 	// - tool opening tag
 | ||
|  | 
 | ||
|  | 	// since there is multiple tags, we need to think about which tag comes first
 | ||
|  | 	// we also need to create a list for
 | ||
|  | 	firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag}) | ||
|  | 
 | ||
|  | 	switch p.state { | ||
|  | 	case CollectingContent: // we  can only look for thinking content if we're collecting content
 | ||
|  | 
 | ||
|  | 		// if strings.Contains(p.buffer.String(), thinkingOpenTag) { // found thinking
 | ||
|  | 		if firstTag == thinkingOpenTag { | ||
|  | 			// string contains the openThinkingTag, we move it to the CollectingThinkingContent state
 | ||
|  | 			events = emitContentBeforeTag(p, events, thinkingOpenTag) | ||
|  | 			p.state = CollectingThinkingContent // <found a thinking>
 | ||
|  | 			return events, true | ||
|  | 			// } else if strings.Contains(p.buffer.String(), toolOpenTag) { // found tool call
 | ||
|  | 		} else if firstTag == toolOpenTag { | ||
|  | 			events = emitContentBeforeTag(p, events, toolOpenTag) | ||
|  | 			p.state = CollectingToolContent // found a <tool_call>
 | ||
|  | 			return events, true | ||
|  | 		} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 { // found a partial thinking tag
 | ||
|  | 			// it is only possible that they find 1
 | ||
|  | 			// found a partial think tag, emit the unambiguous before the partial tool call
 | ||
|  | 			// hello </think -> hello, so ambiguous start includes all the whitespace before the tag
 | ||
|  | 			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen] | ||
|  | 			ambiguousStart := len(beforePartialTag) | ||
|  | 			// HAVENT ADDED TRAILING WHITESPACE YET...
 | ||
|  | 			unambiguous := p.buffer.String()[:ambiguousStart] | ||
|  | 			ambiguous := p.buffer.String()[ambiguousStart:] | ||
|  | 			p.buffer.Reset() | ||
|  | 			p.buffer.WriteString(ambiguous) | ||
|  | 			events = append(events, qwenEventContent{content: unambiguous}) | ||
|  | 			return events, false | ||
|  | 		} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { // found a partial tool call tag
 | ||
|  | 			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen] | ||
|  | 			ambiguousStart := len(beforePartialTag) | ||
|  | 
 | ||
|  | 			unambiguous := p.buffer.String()[:ambiguousStart] | ||
|  | 			ambiguous := p.buffer.String()[ambiguousStart:] | ||
|  | 			p.buffer.Reset() | ||
|  | 			p.buffer.WriteString(ambiguous) | ||
|  | 			events = append(events, qwenEventContent{content: unambiguous}) | ||
|  | 			return events, false | ||
|  | 		} else { // no partial or full thinking or tool call tag found
 | ||
|  | 			// whitespaceLen := trailingWhitespaceLen(p.buffer.String()) <- all the trailing space we consider ambiguous
 | ||
|  | 			ambiguousStart := len(p.buffer.String()) // - whitespaceLen
 | ||
|  | 			unambiguous := p.buffer.String()[:ambiguousStart] | ||
|  | 			ambiguous := p.buffer.String()[ambiguousStart:] | ||
|  | 			p.buffer.Reset() | ||
|  | 			p.buffer.WriteString(ambiguous) | ||
|  | 			if len(unambiguous) > 0 { | ||
|  | 				events = append(events, qwenEventContent{content: unambiguous}) | ||
|  | 			} | ||
|  | 			return events, false | ||
|  | 		} | ||
|  | 	case CollectingToolContent: // we only move towards the CollectingContent state
 | ||
|  | 		if strings.Contains(p.buffer.String(), toolCloseTag) { | ||
|  | 			split := strings.SplitN(p.buffer.String(), toolCloseTag, 2) // this one splits by the first one
 | ||
|  | 			before := split[0] | ||
|  | 			if len(before) == 0 { | ||
|  | 				slog.Warn("qwen tool call closing tag found but no content before it") | ||
|  | 			} | ||
|  | 			after := split[1]                                          // no whit space yet
 | ||
|  | 			events = append(events, qwenEventRawToolCall{raw: before}) // do these need to be "seperated"?
 | ||
|  | 			p.buffer.Reset() | ||
|  | 			p.buffer.WriteString(after) | ||
|  | 			p.state = CollectingContent | ||
|  | 			return events, true | ||
|  | 		} else { | ||
|  | 			return events, false | ||
|  | 		} | ||
|  | 	case CollectingThinkingContent: | ||
|  | 		if strings.Contains(p.buffer.String(), thinkingCloseTag) { | ||
|  | 			split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2) | ||
|  | 			// so it looks like before contains the open tag
 | ||
|  | 			fmt.Println("split", split) | ||
|  | 			before := split[0] | ||
|  | 			if len(before) == 0 { | ||
|  | 				slog.Warn("qwen tool call closing tag found but no content before it") | ||
|  | 			} | ||
|  | 			after := split[1] // no whit space yet
 | ||
|  | 			events = append(events, qwenEventThinkingContent{content: before}) | ||
|  | 			p.buffer.Reset() | ||
|  | 			p.buffer.WriteString(after) | ||
|  | 			p.state = CollectingContent | ||
|  | 			return events, true | ||
|  | 		} else { | ||
|  | 			return events, false | ||
|  | 		} | ||
|  | 	default: | ||
|  | 		panic("unreachable") | ||
|  | 	} | ||
|  | } | ||
|  | 
 | ||
|  | func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) { | ||
|  | 	// Expected JSON shape: {"name": "...", "arguments": { ... }}
 | ||
|  | 	// var in struct {
 | ||
|  | 	// 	Name      string          `json:"name"`
 | ||
|  | 	// 	Arguments json.RawMessage `json:"arguments"`
 | ||
|  | 	// }
 | ||
|  | 	fmt.Println(raw.raw) | ||
|  | 
 | ||
|  | 	var toolCall api.ToolCall | ||
|  | 	if err := json.Unmarshal([]byte(raw.raw), &toolCall); err != nil { | ||
|  | 		return api.ToolCall{}, err | ||
|  | 	} | ||
|  | 
 | ||
|  | 	// args := make(api.ToolCallFunctionArguments)
 | ||
|  | 	// 	if len(in.Arguments) > 0 && string(in.Arguments) != "null" {
 | ||
|  | 	// 	var obj map[string]any
 | ||
|  | 	// 	if err := json.Unmarshal(in.Arguments, &obj); err == nil {
 | ||
|  | 	// 		for k, v := range obj {
 | ||
|  | 	// 			args[k] = v
 | ||
|  | 	// 		}
 | ||
|  | 	// 	}
 | ||
|  | 	// }
 | ||
|  | 	fmt.Println(toolCall) | ||
|  | 	return toolCall, nil | ||
|  | } | ||
|  | 
 | ||
|  | // do we need to parse values
 |