mirror of https://github.com/ollama/ollama.git
working tests, changed code to find the first open tag
This commit is contained in:
parent
fc55584580
commit
fc3222c99f
|
@ -0,0 +1,291 @@
|
|||
package parsers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
|
||||
"encoding/json"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
// type parserState int
|
||||
|
||||
const (
|
||||
CollectingContent qwenParserState = iota
|
||||
CollectingThinkingContent // this is because qwen3vl starts with <thinking>
|
||||
// parserState_CompletedThinkingContent
|
||||
CollectingToolContent
|
||||
// parserState_CompletedToolContent
|
||||
)
|
||||
|
||||
// Delimiters that mark a reasoning ("thinking") section in the model output.
//
// NOTE(review): the Qwen3VL renderer writes "<think>" / "</think>" into the
// prompt, while the parser matches "<thinking>" / "</thinking>" — confirm
// which spelling the model actually emits.
const (
	thinkingOpenTag  = "<thinking>"
	thinkingCloseTag = "</thinking>"
)
|
||||
|
||||
type Qwen3VLParser struct {
|
||||
state qwenParserState
|
||||
buffer strings.Builder
|
||||
tools []api.Tool
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) HasToolSupport() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) HasThinkingSupport() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
|
||||
p.tools = tools
|
||||
return tools // Qwen doesn't modify tools
|
||||
// does qwenvl modify tools?
|
||||
}
|
||||
|
||||
// Add processes a chunk of string output from the model, accumulating it in the parser's buffer,
|
||||
// and then parses any complete events (such as tool calls or content) that can be extracted from the buffer.
|
||||
// It returns the parsed content (as a string), an empty string for "thinking" (since this parser does not support it),
|
||||
// a slice of parsed tool calls, and an error if any occurred during parsing.
|
||||
//
|
||||
// Specifically, it works as follows:
|
||||
// 1. Appends the new string chunk 's' to the internal accumulator.
|
||||
// 2. Calls parseEvents() to extract any complete events (tool calls or content) from the buffer.
|
||||
// 3. Iterates over the events:
|
||||
// - For tool call events, attempts to parse them into api.ToolCall objects and collects them.
|
||||
// - For content events, appends their content to a string builder.
|
||||
// 4. Returns the accumulated content, an empty string for thinking, the collected tool calls, and any error encountered.
|
||||
|
||||
// qwenEventThinkingContent is the event emitted for the text found between a
// thinking open tag and its closing tag.
type qwenEventThinkingContent struct {
	// content is the text between the tags, with the tags themselves removed.
	content string
}

// isQwenEvent marks qwenEventThinkingContent as a qwenEvent.
func (qwenEventThinkingContent) isQwenEvent() {}
|
||||
|
||||
func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
|
||||
// is s the complete content (aka the for sure unambiguous content)
|
||||
p.buffer.WriteString(s)
|
||||
// why do we write the entire string?
|
||||
|
||||
events := p.parseEvents()
|
||||
// parse events:
|
||||
// - parses the entire content
|
||||
// parses an entire tool call
|
||||
// parses an entire thinking content
|
||||
|
||||
var toolCalls []api.ToolCall
|
||||
var sb strings.Builder
|
||||
for _, event := range events {
|
||||
switch event := event.(type) {
|
||||
case qwenEventRawToolCall:
|
||||
toolCall, err := parseToolCall(event, p.tools)
|
||||
if err != nil {
|
||||
slog.Warn("qwen tool call parsing failed", "error", err)
|
||||
return "", "", nil, err
|
||||
}
|
||||
toolCalls = append(toolCalls, toolCall)
|
||||
case qwenEventThinkingContent: // maybe we only need one?
|
||||
print("unimplemented")
|
||||
// how exactly does thinking work?
|
||||
case qwenEventContent:
|
||||
// TODO(drifkin): if the same turn contains multiple interleaved content
|
||||
// events, we naively append them together here. See the note below about
|
||||
// `qwenEvent`s for more details
|
||||
sb.WriteString(event.content)
|
||||
}
|
||||
}
|
||||
|
||||
return sb.String(), "", toolCalls, nil
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) parseEvents() []qwenEvent {
|
||||
var all []qwenEvent
|
||||
|
||||
keepLooping := true
|
||||
for keepLooping {
|
||||
var events []qwenEvent
|
||||
events, keepLooping = p.eat()
|
||||
if len(events) > 0 {
|
||||
all = append(all, events...)
|
||||
}
|
||||
}
|
||||
|
||||
if len(all) > 0 {
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "buffer", p.buffer.String())
|
||||
}
|
||||
|
||||
return all
|
||||
}
|
||||
|
||||
// type qwenEventRawToolCall struct {
|
||||
// raw string
|
||||
// }
|
||||
|
||||
// type qwenEventContent struct {
|
||||
// content string
|
||||
// }
|
||||
|
||||
// TODO: think of a better name
|
||||
func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {
|
||||
split := strings.SplitN(p.buffer.String(), tag, 2) // what is his 2 for?
|
||||
before := split[0] // before the tag
|
||||
// before = strings.TrimRightFunc(before, unicode.IsSpace) // trim all the space after the bfire
|
||||
if len(before) > 0 {
|
||||
events = append(events, qwenEventContent{content: before})
|
||||
}
|
||||
after := split[1]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
return events
|
||||
}
|
||||
|
||||
// overlap = ambiguous
|
||||
|
||||
// findFirstTag returns the tag that appears first in the buffer among the provided tags.
|
||||
// If no tag is found, it returns an empty string.
|
||||
func findFirstTag(p *Qwen3VLParser, tags []string) string {
|
||||
minIdx := -1
|
||||
var firstTag string
|
||||
for _, tag := range tags {
|
||||
idx := strings.Index(p.buffer.String(), tag)
|
||||
if idx != -1 && (minIdx == -1 || idx < minIdx) {
|
||||
minIdx = idx
|
||||
firstTag = tag
|
||||
}
|
||||
}
|
||||
if minIdx == -1 { // just content
|
||||
return ""
|
||||
}
|
||||
return firstTag // there is a possibility that there is no tag, can you return nil for that?
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
||||
var events []qwenEvent
|
||||
|
||||
// certain events:
|
||||
// - thinking opening tag
|
||||
// - tool opening tag
|
||||
|
||||
// since there is multiple tags, we need to think about which tag comes first
|
||||
// we also need to create a list for
|
||||
firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag})
|
||||
|
||||
switch p.state {
|
||||
case CollectingContent: // we can only look for thinking content if we're collecting content
|
||||
|
||||
// if strings.Contains(p.buffer.String(), thinkingOpenTag) { // found thinking
|
||||
if firstTag == thinkingOpenTag {
|
||||
// string contains the openThinkingTag, we move it to the CollectingThinkingContent state
|
||||
events = emitContentBeforeTag(p, events, thinkingOpenTag)
|
||||
p.state = CollectingThinkingContent // <found a thinking>
|
||||
return events, true
|
||||
// } else if strings.Contains(p.buffer.String(), toolOpenTag) { // found tool call
|
||||
} else if firstTag == toolOpenTag {
|
||||
events = emitContentBeforeTag(p, events, toolOpenTag)
|
||||
p.state = CollectingToolContent // found a <tool_call>
|
||||
return events, true
|
||||
} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 { // found a partial thinking tag
|
||||
// it is only possible that they find 1
|
||||
// found a partial think tag, emit the unambiguous before the partial tool call
|
||||
// hello </think -> hello, so ambiguous start includes all the whitespace before the tag
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
ambiguousStart := len(beforePartialTag)
|
||||
// HAVENT ADDED TRAILING WHITESPACE YET...
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
return events, false
|
||||
} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { // found a partial tool call tag
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
ambiguousStart := len(beforePartialTag)
|
||||
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
return events, false
|
||||
} else { // no partial or full thinking or tool call tag found
|
||||
// whitespaceLen := trailingWhitespaceLen(p.buffer.String()) <- all the trailing space we consider ambiguous
|
||||
ambiguousStart := len(p.buffer.String()) // - whitespaceLen
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
if len(unambiguous) > 0 {
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
}
|
||||
case CollectingToolContent: // we only move towards the CollectingContent state
|
||||
if strings.Contains(p.buffer.String(), toolCloseTag) {
|
||||
split := strings.SplitN(p.buffer.String(), toolCloseTag, 2) // this one splits by the first one
|
||||
before := split[0]
|
||||
if len(before) == 0 {
|
||||
slog.Warn("qwen tool call closing tag found but no content before it")
|
||||
}
|
||||
after := split[1] // no whit space yet
|
||||
events = append(events, qwenEventRawToolCall{raw: before}) // do these need to be "seperated"?
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = CollectingContent
|
||||
return events, true
|
||||
} else {
|
||||
return events, false
|
||||
}
|
||||
case CollectingThinkingContent:
|
||||
if strings.Contains(p.buffer.String(), thinkingCloseTag) {
|
||||
split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2)
|
||||
// so it looks like before contains the open tag
|
||||
fmt.Println("split", split)
|
||||
before := split[0]
|
||||
if len(before) == 0 {
|
||||
slog.Warn("qwen tool call closing tag found but no content before it")
|
||||
}
|
||||
after := split[1] // no whit space yet
|
||||
events = append(events, qwenEventThinkingContent{content: before})
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = CollectingContent
|
||||
return events, true
|
||||
} else {
|
||||
return events, false
|
||||
}
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
|
||||
|
||||
func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
|
||||
// Expected JSON shape: {"name": "...", "arguments": { ... }}
|
||||
// var in struct {
|
||||
// Name string `json:"name"`
|
||||
// Arguments json.RawMessage `json:"arguments"`
|
||||
// }
|
||||
fmt.Println(raw.raw)
|
||||
|
||||
var toolCall api.ToolCall
|
||||
if err := json.Unmarshal([]byte(raw.raw), &toolCall); err != nil {
|
||||
return api.ToolCall{}, err
|
||||
}
|
||||
|
||||
// args := make(api.ToolCallFunctionArguments)
|
||||
// if len(in.Arguments) > 0 && string(in.Arguments) != "null" {
|
||||
// var obj map[string]any
|
||||
// if err := json.Unmarshal(in.Arguments, &obj); err == nil {
|
||||
// for k, v := range obj {
|
||||
// args[k] = v
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
fmt.Println(toolCall)
|
||||
return toolCall, nil
|
||||
}
|
||||
|
||||
// do we need to parse values
|
|
@ -26,35 +26,162 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
steps []step
|
||||
only bool
|
||||
}{
|
||||
// all of this is just thinking tests
|
||||
{
|
||||
desc: "with thinking",
|
||||
desc: "simple thinking",
|
||||
steps: []step{
|
||||
{input: "<thinking>abc</thinking>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking with split tags",
|
||||
steps: []step{
|
||||
{input: "<thinking>abc", wantEvents: []qwenEvent{}},
|
||||
{input: "</thinking>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking and tool call",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking</thinking><tool_call>I'm tool calling</tool_call>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking"},
|
||||
qwenEventRawToolCall{raw: "I'm tool calling"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking and content",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking</thinking>I'm content",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking"},
|
||||
qwenEventContent{content: "I'm content"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking and tool call and content",
|
||||
},
|
||||
{
|
||||
desc: "nested thinking (outside thinking, inside thinking)",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<thinking>I'm nested thinking</thinking></thinking>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking<thinking>I'm nested thinking"},
|
||||
qwenEventContent{content: "</thinking>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "interleaved thinking",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<thinking></thinking>I'm actually content</thinking>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking<thinking>"},
|
||||
qwenEventContent{content: "I'm actually content</thinking>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "nested thinking and tool call (outside thinking, inside tool call)",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<tool_call>I'm nested tool call</tool_call></thinking>",
|
||||
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm nested tool call</tool_call>"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "nested thinking and tool call (inside tool call, outside thinking)",
|
||||
desc: "nested thinking and tool call (outside tool call, inside thinking)",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<tool_call>I'm nested tool call<thinking>I'm thinking</thinking></tool_call>",
|
||||
wantEvents: []qwenEvent{qwenEventRawToolCall{raw: "I'm nested tool call<thinking>I'm thinking</thinking>"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "interleaved thinking and tool call",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<tool_call>I'm NOT a nested tool call</thinking></tool_call><tool_call>I'm nested tool call 2<thinking></tool_call></thinking>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm NOT a nested tool call"},
|
||||
qwenEventContent{content: "</tool_call>"},
|
||||
qwenEventRawToolCall{raw: "I'm nested tool call 2<thinking>"},
|
||||
qwenEventContent{content: "</thinking>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "partial thinking tag fakeout",
|
||||
steps: []step{
|
||||
{
|
||||
input: "abc<thinking",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventContent{content: "abc"},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: " fakeout",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventContent{content: "<thinking fakeout"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "partial thinking incomplete",
|
||||
steps: []step{
|
||||
{
|
||||
input: "abc<thinking>unfinished</thinking", // when something is ambiguious, we dont emit anything
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventContent{content: "abc"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
anyOnlies := false
|
||||
for _, tc := range cases {
|
||||
if tc.only {
|
||||
anyOnlies = true
|
||||
}
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
if anyOnlies && !tc.only {
|
||||
continue
|
||||
}
|
||||
|
||||
t.Run(tc.desc, func(t *testing.T) {
|
||||
parser := Qwen3VLParser{}
|
||||
|
||||
for i, step := range tc.steps {
|
||||
parser.buffer.WriteString(step.input)
|
||||
gotEvents := parser.parseEvents()
|
||||
|
||||
if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
|
||||
// avoid deep equal on empty vs. nil slices
|
||||
continue
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(gotEvents, step.wantEvents) {
|
||||
t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: devin was saying something about json cant figure out types?
|
||||
|
|
|
@ -0,0 +1,351 @@
|
|||
package renderers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
// where should we set the image count?
|
||||
// Package-level tallies of vision items encountered while rendering content.
// renderContent increments them when called with doVisionCount set.
//
// NOTE(review): these are shared mutable globals that are never reset here —
// confirm where the counts should live and be reset.
var (
	imageCount int
	videoCount int
)
|
||||
|
||||
// so i think from the renders, do vision is false
|
||||
|
||||
// basic
|
||||
// [-] with tools
|
||||
// [] with multiple tools
|
||||
// [-] with tool calling
|
||||
// [ ] with multiple tool calling
|
||||
// with images and videos
|
||||
|
||||
// TODO: is there a way not to have to writ my own code for marshalWithSpaces
|
||||
// the tool dictionaery list is slightly different
|
||||
|
||||
// marshalWithSpaces encodes v as JSON and inserts a single space after every
// ':' and ',' that sits outside a string literal, producing output such as
// {"a": 1, "b": [2, 3]} instead of Go's compact {"a":1,"b":[2,3]}.
//
// String contents, including escaped quotes and backslashes, are copied
// through untouched. Any error from json.Marshal is returned as-is.
func marshalWithSpaces(v any) ([]byte, error) {
	compact, err := json.Marshal(v)
	if err != nil {
		return nil, err
	}

	// Reserve a little headroom for the inserted spaces.
	spaced := make([]byte, 0, len(compact)+len(compact)/8)

	var inString, escaped bool
	for i := 0; i < len(compact); i++ {
		ch := compact[i]
		spaced = append(spaced, ch)

		switch {
		case escaped:
			// Byte following a backslash inside a string: never structural.
			escaped = false
		case inString && ch == '\\':
			escaped = true
		case ch == '"':
			// An unescaped quote opens or closes a string literal.
			inString = !inString
		case !inString && (ch == ':' || ch == ','):
			spaced = append(spaced, ' ')
		}
	}

	return spaced, nil
}
|
||||
|
||||
// func pruneEmpty(v any) any {
|
||||
// switch x := v.(type) {
|
||||
// case map[string]any:
|
||||
// out := make(map[string]any, len(x))
|
||||
// for k, vv := range x {
|
||||
// p := pruneEmpty(vv)
|
||||
// switch pp := p.(type) {
|
||||
// case nil:
|
||||
// continue
|
||||
// case string:
|
||||
// if pp == "" {
|
||||
// continue
|
||||
// }
|
||||
// case []any:
|
||||
// if len(pp) == 0 {
|
||||
// continue
|
||||
// }
|
||||
// case map[string]any:
|
||||
// if len(pp) == 0 {
|
||||
// continue
|
||||
// }
|
||||
// }
|
||||
// out[k] = p
|
||||
// }
|
||||
// return out
|
||||
// case []any:
|
||||
// out := make([]any, 0, len(x))
|
||||
// for _, vv := range x {
|
||||
// p := pruneEmpty(vv)
|
||||
// switch pp := p.(type) {
|
||||
// case nil:
|
||||
// continue
|
||||
// case string:
|
||||
// if pp == "" {
|
||||
// continue
|
||||
// }
|
||||
// case []any:
|
||||
// if len(pp) == 0 {
|
||||
// continue
|
||||
// }
|
||||
// case map[string]any:
|
||||
// if len(pp) == 0 {
|
||||
// continue
|
||||
// }
|
||||
// }
|
||||
// out = append(out, p)
|
||||
// }
|
||||
// return out
|
||||
// default:
|
||||
// return v
|
||||
// }
|
||||
// }
|
||||
|
||||
// func marshalWithSpaces(v any) ([]byte, error) {
|
||||
// // 1) normalize to interface{} and prune empty fields
|
||||
// var iv any
|
||||
// b0, err := json.Marshal(v)
|
||||
// if err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
// if err := json.Unmarshal(b0, &iv); err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
// iv = pruneEmpty(iv)
|
||||
|
||||
// // 2) compact marshal
|
||||
// b, err := json.Marshal(iv)
|
||||
// if err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
|
||||
// // 3) inject spaces after ':' and ',' outside strings
|
||||
// out := make([]byte, 0, len(b)+len(b)/8)
|
||||
// inStr, esc := false, false
|
||||
// for _, c := range b {
|
||||
// if inStr {
|
||||
// out = append(out, c)
|
||||
// if esc {
|
||||
// esc = false
|
||||
// continue
|
||||
// }
|
||||
// if c == '\\' {
|
||||
// esc = true
|
||||
// continue
|
||||
// }
|
||||
// if c == '"' {
|
||||
// inStr = false
|
||||
// }
|
||||
// continue
|
||||
// }
|
||||
// switch c {
|
||||
// case '"':
|
||||
// inStr = true
|
||||
// out = append(out, c)
|
||||
// case ':':
|
||||
// out = append(out, ':', ' ')
|
||||
// case ',':
|
||||
// out = append(out, ',', ' ')
|
||||
// default:
|
||||
// out = append(out, c)
|
||||
// }
|
||||
// }
|
||||
// return out, nil
|
||||
// }
|
||||
|
||||
// this is soooooooo ugly
|
||||
// what exactly is the type of content?
|
||||
func renderContent(content any, doVisionCount bool) string {
|
||||
print(content)
|
||||
switch content.(type) {
|
||||
case string:
|
||||
return content.(string)
|
||||
default:
|
||||
var subSb strings.Builder
|
||||
for _, item := range content.([]any) {
|
||||
if strings.Contains(item.(string), "image") || strings.Contains(item.(string), "image_url") || item.(map[string]any)["type"] == "image" {
|
||||
if doVisionCount {
|
||||
imageCount++
|
||||
}
|
||||
// if addVisionID {
|
||||
// sb.WriteString("Picture " + strconv.Itoa(imageCount) + ": ") // do we need the itoa thing?
|
||||
// }
|
||||
subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
|
||||
} else if strings.Contains(item.(string), "video") || item.(map[string]any)["type"] == "video" {
|
||||
if doVisionCount {
|
||||
videoCount++
|
||||
}
|
||||
// if addVisionID {
|
||||
// sb.WriteString("Video " + strconv.Itoa(videoCount) + ": ") // do we need the itoa thing?
|
||||
// }
|
||||
subSb.WriteString("<|vision_start|><|video_pad|><|vision_end|>")
|
||||
} else if strings.Contains(item.(string), "text") {
|
||||
subSb.WriteString(item.(map[string]any)["text"].(string))
|
||||
}
|
||||
}
|
||||
return subSb.String()
|
||||
}
|
||||
}
|
||||
|
||||
func Qwen3VLRenderer(messages []api.Message, tools []api.Tool, _ *api.ThinkValue) (string, error) {
|
||||
var sb strings.Builder
|
||||
// this is the tools section
|
||||
|
||||
fmt.Println("Number of tools (A):", len(tools))
|
||||
|
||||
if len(tools) > 0 {
|
||||
sb.WriteString(imStartTag + "system\n")
|
||||
if len(messages) > 0 && messages[0].Role == "system" {
|
||||
sb.WriteString(messages[0].Content + "\n\n")
|
||||
}
|
||||
sb.WriteString("# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>")
|
||||
for _, tool := range tools {
|
||||
sb.WriteString("\n")
|
||||
// if b, err := json.Marshal(tool); err == nil { // {{- tool_call.arguments | tojson -}}
|
||||
// sb.Write(b)
|
||||
// // so huggingface adds a space before every json object?
|
||||
// }
|
||||
// if b, err := json.MarshalIndent(tool, "", ""); err == nil { // {{- tool_call.arguments | tojson -}}
|
||||
// sb.Write(b)
|
||||
// // so huggingface adds a space before every json object?
|
||||
// }
|
||||
if b, err := marshalWithSpaces(tool); err == nil {
|
||||
sb.Write(b) // JSON like {"a": 1, "b": 2}
|
||||
}
|
||||
}
|
||||
sb.WriteString("\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n")
|
||||
// sb.WriteString("<|im_end|>\n")
|
||||
} else if len(messages) > 0 && messages[0].Role == "system" {
|
||||
sb.WriteString("<|im_start|>system\n" + messages[0].Content + "<|im_end|>\n")
|
||||
}
|
||||
|
||||
// what does the namespace do?
|
||||
|
||||
// Iterate through messages in reverse order to find the last query index
|
||||
|
||||
// how do we get these parameters?
|
||||
multiStepTool := true
|
||||
lastQueryIndex := len(messages) - 1
|
||||
|
||||
for i := len(messages) - 1; i >= 0; i-- { // go in reverse
|
||||
message := messages[i]
|
||||
if multiStepTool && message.Role == "user" {
|
||||
// Check if content starts with <tool_response> and ends with </tool_response>
|
||||
content := message.Content // use this with renderContent
|
||||
if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) {
|
||||
multiStepTool = false
|
||||
lastQueryIndex = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// this is the start of the messages
|
||||
|
||||
fmt.Println("Number of messages:", len(messages))
|
||||
|
||||
fmt.Println(messages)
|
||||
|
||||
for i, message := range messages {
|
||||
// lastMessage := i == len(messages)-1
|
||||
content := renderContent(message.Content, true)
|
||||
// prefill := lastMessage && message.Role == "assistant"
|
||||
|
||||
fmt.Println(message) // print a message?
|
||||
|
||||
if message.Role == "user" || message.Role == "system" && i != 0 {
|
||||
sb.WriteString("<|im_start|>" + message.Role + "\n" + content + "<|im_end|>\n")
|
||||
} else if message.Role == "assistant" {
|
||||
contentReasoning := ""
|
||||
if message.Thinking != "" { // if message.reasoning_content is a string
|
||||
contentReasoning = message.Thinking
|
||||
} else if strings.Contains(content, "</think>") {
|
||||
contentReasoning = strings.Split(content, "</think>")[0]
|
||||
contentReasoning = strings.TrimRight(contentReasoning, "\n")
|
||||
|
||||
contentReasoningSplit := strings.Split(contentReasoning, "<think>") // how the fuck does this work?
|
||||
contentReasoning = contentReasoningSplit[len(contentReasoningSplit)-1]
|
||||
|
||||
contentReasoning = strings.TrimLeft(contentReasoning, "\n")
|
||||
|
||||
// TODO: should be {%- set reasoning_content = content.split("</think>")[0].rstrip("\n").split("<think>")[-1].lstrip("\n") -%}
|
||||
contentSplit := strings.Split(content, "</think>") // TODO: should be {%- set content = content.split("</think>")[-1].lstrip("\n") -%}
|
||||
content = contentSplit[len(contentSplit)-1]
|
||||
content = strings.TrimLeft(content, "\n")
|
||||
}
|
||||
|
||||
if i > lastQueryIndex {
|
||||
if i == len(messages)-1 || contentReasoning != "" {
|
||||
sb.WriteString("<|im_start|>" + message.Role + "\n<think>\n" + strings.Trim(contentReasoning, "\n") + "\n</think>\n\n" + strings.TrimLeft(content, "\n"))
|
||||
} else {
|
||||
sb.WriteString("<|im_start|>" + message.Role + "\n" + content)
|
||||
}
|
||||
} else {
|
||||
sb.WriteString("<|im_start|>" + message.Role + "\n" + content)
|
||||
}
|
||||
|
||||
// if message.tool_calls
|
||||
// if message.ToolCalls != nil {
|
||||
if len(message.ToolCalls) > 0 {
|
||||
for j, toolCall := range message.ToolCalls {
|
||||
// what the fuck is this for?
|
||||
if j > 0 || content != "" {
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
|
||||
// if toolCall.Function != nil {
|
||||
// toolCall = toolCall.Function
|
||||
// }
|
||||
// if there any way that toolcall does not have a function?
|
||||
// toolCall = toolCall.Function
|
||||
|
||||
// {{- "<tool_call>\n{\"name\": \"" -}}
|
||||
// {{- tool_call.name -}}
|
||||
// {{- "\", \"arguments\": " -}}
|
||||
// sb.WriteString("\n<tool_call>\n{\"name\": \"" + toolCall.Function.Name + "\", \"arguments\": ")
|
||||
sb.WriteString("<tool_call>\n{\"name\": \"" + toolCall.Function.Name + "\", \"arguments\": ")
|
||||
if b, err := marshalWithSpaces(toolCall.Function.Arguments); err == nil {
|
||||
sb.Write(b) // JSON like {"a": 1, "b": 2}
|
||||
}
|
||||
sb.WriteString("}\n</tool_call>")
|
||||
}
|
||||
}
|
||||
sb.WriteString("<|im_end|>\n")
|
||||
} else if message.Role == "tool" {
|
||||
if i == 0 || messages[i-1].Role != "tool" {
|
||||
sb.WriteString("<|im_start|>user")
|
||||
}
|
||||
sb.WriteString("\n<tool_response>\n" + message.Content + "\n</tool_response>")
|
||||
if i == len(messages)-1 || messages[i+1].Role != "tool" {
|
||||
sb.WriteString("<|im_end|>\n")
|
||||
}
|
||||
}
|
||||
|
||||
// if lastMessage {
|
||||
// sb.WriteString("<|im_start|>assistant\n<think>\n")
|
||||
// }
|
||||
}
|
||||
|
||||
// we might need to wrap this in something?
|
||||
sb.WriteString("<|im_start|>assistant\n<think>\n")
|
||||
|
||||
// if addGenerationPrompt {
|
||||
// sb.WriteString("<|im_start|>assistant\n<think>\n")
|
||||
// }
|
||||
return sb.String(), nil
|
||||
|
||||
}
|
Loading…
Reference in New Issue