mirror of https://github.com/ollama/ollama.git
Working parser for thinking models: assumes the parser starts in the thinking state, emits unambiguous content while thinking, and does not parse tool calls while thinking
This commit is contained in:
parent
ef84ad9440
commit
d1f3145a74
|
@ -13,15 +13,23 @@ import (
|
|||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
func (p *Qwen3VLParser) initialState() qwenParserState {
|
||||
if p.HasThinkingSupport() { // has thinking, start from collecting thinking content
|
||||
return CollectingThinkingContent
|
||||
}
|
||||
return CollectingContent
|
||||
}
|
||||
|
||||
// It's because we don't call the Init function.
|
||||
const (
|
||||
CollectingContent qwenParserState = iota
|
||||
CollectingThinkingContent
|
||||
CollectingThinkingContent qwenParserState = iota
|
||||
CollectingContent
|
||||
CollectingToolContent
|
||||
)
|
||||
|
||||
const (
|
||||
thinkingOpenTag = "<thinking>"
|
||||
thinkingCloseTag = "</thinking>"
|
||||
// thinkingOpenTag = "<think>"
|
||||
thinkingCloseTag = "</think>"
|
||||
)
|
||||
|
||||
type Qwen3VLParser struct {
|
||||
|
@ -40,6 +48,8 @@ func (p *Qwen3VLParser) HasThinkingSupport() bool {
|
|||
|
||||
func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
|
||||
p.tools = tools
|
||||
p.state = p.initialState()
|
||||
fmt.Println("[qwen3vl parser] initial state", p.state)
|
||||
return tools
|
||||
}
|
||||
|
||||
|
@ -112,49 +122,16 @@ func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qw
|
|||
return events
|
||||
}
|
||||
|
||||
// findFirstTag returns the tag that appears first in the buffer among the provided tags.
|
||||
// If no tag is found, it returns an empty string.
|
||||
func findFirstTag(p *Qwen3VLParser, tags []string) string {
|
||||
minIdx := -1
|
||||
var firstTag string
|
||||
for _, tag := range tags {
|
||||
idx := strings.Index(p.buffer.String(), tag)
|
||||
if idx != -1 && (minIdx == -1 || idx < minIdx) {
|
||||
minIdx = idx
|
||||
firstTag = tag
|
||||
}
|
||||
}
|
||||
if minIdx == -1 {
|
||||
return ""
|
||||
}
|
||||
return firstTag
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
||||
var events []qwenEvent
|
||||
|
||||
firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag})
|
||||
// fmt.Println("[qwen3vl parser] eat", p.state)
|
||||
|
||||
switch p.state {
|
||||
case CollectingContent:
|
||||
if firstTag == thinkingOpenTag {
|
||||
events = emitContentBeforeTag(p, events, thinkingOpenTag)
|
||||
p.state = CollectingThinkingContent
|
||||
return events, true
|
||||
} else if firstTag == toolOpenTag {
|
||||
if strings.Contains(p.buffer.String(), toolOpenTag) {
|
||||
events = emitContentBeforeTag(p, events, toolOpenTag)
|
||||
p.state = CollectingToolContent
|
||||
return events, true
|
||||
} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 {
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
||||
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
return events, false
|
||||
} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 {
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
||||
|
@ -164,11 +141,14 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
|||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
if len(unambiguous) > 0 { // why does qwen3coder not have this here
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
} else {
|
||||
whitespaceLen := trailingWhitespaceLen(p.buffer.String())
|
||||
ambiguousStart := len(p.buffer.String()) - whitespaceLen
|
||||
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
|
@ -195,21 +175,46 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
|||
} else {
|
||||
return events, false
|
||||
}
|
||||
case CollectingThinkingContent:
|
||||
case CollectingThinkingContent: // so we want to hip the unambiguous stuff
|
||||
if strings.Contains(p.buffer.String(), thinkingCloseTag) {
|
||||
split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2)
|
||||
fmt.Println("split", split)
|
||||
// fmt.Println("split", split)
|
||||
before := split[0]
|
||||
if len(before) == 0 {
|
||||
slog.Warn("qwen tool call closing tag found but no content before it")
|
||||
}
|
||||
after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
|
||||
events = append(events, qwenEventThinkingContent{content: before})
|
||||
if len(before) > 0 {
|
||||
events = append(events, qwenEventThinkingContent{content: before})
|
||||
}
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = CollectingContent
|
||||
return events, true
|
||||
} else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 { // we see part of a close thinking tag
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
||||
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
||||
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
if len(unambiguous) > 0 {
|
||||
events = append(events, qwenEventThinkingContent{content: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
} else {
|
||||
whitespaceLen := trailingWhitespaceLen(p.buffer.String())
|
||||
ambiguousStart := len(p.buffer.String()) - whitespaceLen
|
||||
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
if len(unambiguous) > 0 {
|
||||
events = append(events, qwenEventThinkingContent{content: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
}
|
||||
default:
|
||||
|
|
|
@ -15,7 +15,7 @@ import (
|
|||
// return t
|
||||
// }
|
||||
|
||||
func TestQwen3VLParserStreaming(t *testing.T) {
|
||||
func TestQwen3VLThinkingParserStreaming(t *testing.T) {
|
||||
type step struct {
|
||||
input string
|
||||
wantEvents []qwenEvent
|
||||
|
@ -30,21 +30,33 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
{
|
||||
desc: "simple thinking",
|
||||
steps: []step{
|
||||
{input: "<thinking>abc</thinking>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
|
||||
{input: "abc</think>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "simple trip thinking",
|
||||
steps: []step{
|
||||
{input: "<think>abc</think>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "<think>abc"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking with split tags",
|
||||
steps: []step{
|
||||
{input: "<thinking>abc", wantEvents: []qwenEvent{}},
|
||||
{input: "</thinking>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
|
||||
{input: "abc", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}},
|
||||
{input: "</think>", wantEvents: []qwenEvent{}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "multiple think tags",
|
||||
steps: []step{
|
||||
{input: "abc<think>actually, is not thinking</think>", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc<think>actually, is not thinking"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking and tool call",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking</thinking><tool_call>I'm tool calling</tool_call>",
|
||||
input: "I'm thinking</think><tool_call>I'm tool calling</tool_call>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking"},
|
||||
qwenEventRawToolCall{raw: "I'm tool calling"},
|
||||
|
@ -56,7 +68,7 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "thinking and content",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking</thinking>I'm content",
|
||||
input: "I'm thinking</think>I'm content",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking"},
|
||||
qwenEventContent{content: "I'm content"},
|
||||
|
@ -71,10 +83,10 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "nested thinking (outside thinking, inside thinking)",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<thinking>I'm nested thinking</thinking></thinking>",
|
||||
input: "I'm thinking<think>I'm nested thinking</think></think>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking<thinking>I'm nested thinking"},
|
||||
qwenEventContent{content: "</thinking>"},
|
||||
qwenEventThinkingContent{content: "I'm thinking<think>I'm nested thinking"},
|
||||
qwenEventContent{content: "</think>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -83,10 +95,10 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "interleaved thinking",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<thinking></thinking>I'm actually content</thinking>",
|
||||
input: "<think>I'm thinking</think>I'm actually content</think>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking<thinking>"},
|
||||
qwenEventContent{content: "I'm actually content</thinking>"},
|
||||
qwenEventThinkingContent{content: "<think>I'm thinking"},
|
||||
qwenEventContent{content: "I'm actually content</think>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -95,7 +107,7 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "nested thinking and tool call (outside thinking, inside tool call)",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<tool_call>I'm nested tool call</tool_call></thinking>",
|
||||
input: "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
|
||||
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm nested tool call</tool_call>"}},
|
||||
},
|
||||
},
|
||||
|
@ -104,8 +116,11 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "nested thinking and tool call (outside tool call, inside thinking)",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<tool_call>I'm nested tool call<thinking>I'm thinking</thinking></tool_call>",
|
||||
wantEvents: []qwenEvent{qwenEventRawToolCall{raw: "I'm nested tool call<thinking>I'm thinking</thinking>"}},
|
||||
input: "<tool_call>I'm nested tool call<think>I'm thinking</think></tool_call>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "<tool_call>I'm nested tool call<think>I'm thinking"},
|
||||
qwenEventContent{content: "</tool_call>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -113,12 +128,12 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "interleaved thinking and tool call",
|
||||
steps: []step{
|
||||
{
|
||||
input: "<thinking>I'm thinking<tool_call>I'm NOT a nested tool call</thinking></tool_call><tool_call>I'm nested tool call 2<thinking></tool_call></thinking>",
|
||||
input: "I'm thinking<tool_call>I'm NOT a nested tool call</think></tool_call><tool_call>I'm nested tool call 2<think></tool_call></think>",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm NOT a nested tool call"},
|
||||
qwenEventContent{content: "</tool_call>"},
|
||||
qwenEventRawToolCall{raw: "I'm nested tool call 2<thinking>"},
|
||||
qwenEventContent{content: "</thinking>"},
|
||||
qwenEventRawToolCall{raw: "I'm nested tool call 2<think>"},
|
||||
qwenEventContent{content: "</think>"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -127,16 +142,12 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "partial thinking tag fakeout",
|
||||
steps: []step{
|
||||
{
|
||||
input: "abc<thinking",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventContent{content: "abc"},
|
||||
},
|
||||
input: "abc</think",
|
||||
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}},
|
||||
},
|
||||
{
|
||||
input: " fakeout",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventContent{content: "<thinking fakeout"},
|
||||
},
|
||||
input: " fakeout",
|
||||
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "</think fakeout"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -144,9 +155,46 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
desc: "partial thinking incomplete",
|
||||
steps: []step{
|
||||
{
|
||||
input: "abc<thinking>unfinished</thinking", // when something is ambiguious, we dont emit anything
|
||||
input: "abc<think>unfinished</think", // when something is ambiguious, we dont emit anything
|
||||
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc<think>unfinished"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "test with split thinking and content",
|
||||
steps: []step{
|
||||
{
|
||||
input: "abc<think>unfinished</th", // when something is ambiguious, we dont emit anything
|
||||
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc<think>unfinished"}},
|
||||
},
|
||||
{
|
||||
input: "ink> def",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventContent{content: "abc"},
|
||||
qwenEventContent{content: "def"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking with no tags",
|
||||
steps: []step{
|
||||
{
|
||||
input: "Hello I am thinking",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "Hello I am thinking"},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "Hello I am thinking some more",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "Hello I am thinking some more"},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "Hello I am think</think> NOT",
|
||||
wantEvents: []qwenEvent{
|
||||
qwenEventThinkingContent{content: "Hello I am think"},
|
||||
qwenEventContent{content: "NOT"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -184,42 +232,9 @@ func TestQwen3VLParserStreaming(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestQwen3VLComplex(t *testing.T) {
|
||||
type step struct {
|
||||
input string
|
||||
wantEvents []qwenEvent
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
desc string
|
||||
steps []step
|
||||
only bool
|
||||
}{
|
||||
{
|
||||
desc: "simple tool call",
|
||||
steps: []step{
|
||||
{
|
||||
input: "Here are 30 distinct and popular emojis for you! 😊\n\n1. 😂 \n2. ❤️ \n3. 🌟 \n4. 🐶 \n5. 🍕 \n6. ✨ \n7. 🌈 \n8. 🎉 \n9. 🌎 \n10. 🦁 \n11. 💯 \n12. 🥰 \n13. 🌸 \n14. 🚀 \n15. 🌊 \n16. 🍦 \n17. 🌙 \n18. 🌞 \n19. 🌻 \n20. 🦋 \n21. 🍃 \n22. 🏆 \n23. 🌮 \n24. 🧸 \n25. 🎮 \n26. 📚 \n27. ✈️ \n28. 🌟 (sparkles) \n29. 🌈 (rainbow) \n30. 🥳 \n\n*Bonus fun fact:* The 😂 (Face with Tears of Joy) was Oxford Dictionaries' Word of the Year in 2015! 🎉 \nLet me know if you'd like themed emojis (e.g., animals, food, or emotions)! 🐱🍕📚",
|
||||
wantEvents: []qwenEvent{qwenEventContent{content: "bruh"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
for i, step := range tc.steps {
|
||||
parser := Qwen3VLParser{}
|
||||
parser.buffer.WriteString(step.input)
|
||||
gotEvents := parser.parseEvents()
|
||||
if !reflect.DeepEqual(gotEvents, step.wantEvents) {
|
||||
t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: devin was saying something about json cant figure out types?
|
||||
// do we need to test for
|
||||
func TestQwen3VLToolParser(t *testing.T) {
|
||||
func TestQwen3VLThinkingToolParser(t *testing.T) {
|
||||
type step struct {
|
||||
name string
|
||||
rawToolCall string
|
|
@ -17,7 +17,7 @@ var IMAGE2_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAIAAADTED8xAAADMElEQVR4nOz
|
|||
// - [ ] set descriptions to omitempty?
|
||||
// - [] images add the auto tag
|
||||
|
||||
func TestQwen3VLRenderer(t *testing.T) {
|
||||
func TestQwen3VLThinkingRenderer(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
msgs []api.Message
|
||||
|
@ -327,7 +327,8 @@ Thanks! What are the results?<|im_end|>
|
|||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
rendered, err := Qwen3VLRenderer(tt.msgs, tt.tools, nil)
|
||||
// rendered, err := Qwen3VLRenderer(tt.msgs, tt.tools, nil)
|
||||
rendered, err := (&Qwen3VLRenderer{true}).Render(tt.msgs, tt.tools, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
|
@ -1,380 +0,0 @@
|
|||
package renderers
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"encoding/base64"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
var IMAGE1_BASE64 = base64.StdEncoding.EncodeToString([]byte("image1"))
|
||||
var IMAGE2_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAIAAADTED8xAAADMElEQVR4nOzVwQnAIBQFQYXff81RUkQCOyDj1YOPnbXWPmeTRef+/3O/OyBjzh3CD95BfqICMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMO0TAAD//2Anhf4QtqobAAAAAElFTkSuQmCC"
|
||||
|
||||
// TODO:
|
||||
// - [ ] test videos?
|
||||
// - [ ] set descriptions to omitempty?
|
||||
// - [] images add the auto tag
|
||||
|
||||
// TestQwen3VLThinkingRenderer renders each conversation in the table through
// (&Qwen3VLRenderer{true}).Render and compares the produced prompt with the
// expected text using cmp.Diff. Every expected prompt ends with an opened
// "<think>" line after the final assistant header.
// NOTE(review): the positional `true` presumably enables thinking mode on the
// renderer — confirm against the Qwen3VLRenderer definition.
func TestQwen3VLThinkingRenderer(t *testing.T) {
	tests := []struct {
		name     string
		msgs     []api.Message
		images   []api.ImageData // never set or read in this test
		tools    []api.Tool
		expected string
	}{
		{
			name: "basic",
			msgs: []api.Message{
				{Role: "system", Content: "You are a helpful assistant."},
				{Role: "user", Content: "Hello, how are you?"},
			},
			expected: `<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
<think>
`,
		},
		{
			// The expected prompt keeps only the text inside <think>...</think>;
			// the leading "abc" of the assistant message is not present.
			name: "With thinking, end assistant.",
			msgs: []api.Message{
				// {Role: "system", Content: "You are a helpful assistant."},
				{Role: "user", Content: "Tell me a story in two sentences."},
				{Role: "assistant", Content: "abc<think>To make this story interesting, I will speak in poetry.</think>"}, // does the thinking even work?
			},
			expected: `<|im_start|>user
Tell me a story in two sentences.<|im_end|>
<|im_start|>assistant
<think>
To make this story interesting, I will speak in poetry.
</think>

<|im_end|>
<|im_start|>assistant
<think>
`,
		},
		{
			name: "Multiple thinking",
			msgs: []api.Message{
				{Role: "user", Content: "Tell me a story in two sentences."},
				{Role: "assistant", Content: "abc<think>To make this story interesting, I will speak in poetry.</think><think>And I will speak in poetry after the first sentence.</think>"},
			},
			expected: `<|im_start|>user
Tell me a story in two sentences.<|im_end|>
<|im_start|>assistant
<think>
To make this story interesting, I will speak in poetry.
</think>

<|im_end|>
<|im_start|>assistant
<think>
`, // the second thinking tag is not captured
		},
		{
			// The expected prompt renders the first assistant turn with an empty
			// body and only the last assistant turn with a thinking section;
			// <think> tags inside the user message are left verbatim.
			name: "Multiple thinking, multiple messages.",
			msgs: []api.Message{
				{Role: "user", Content: "Tell me a story in two sentences."},
				{Role: "assistant", Content: "abc<think>To make this story interesting, I will speak in poetry.</think><think>And I will speak in poetry after the first sentence.</think>"},
				{Role: "user", Content: "What is the weather like in San Francisco? <think>I will check the weather in San Francisco for you.</think>"},
				{Role: "assistant", Content: "I'll check the weather in San Francisco for you.<think>Speak poetry after the first sentence.</think><think>Speak poetry after the second sentence.</think>"},
			},
			expected: `<|im_start|>user
Tell me a story in two sentences.<|im_end|>
<|im_start|>assistant
<|im_end|>
<|im_start|>user
What is the weather like in San Francisco? <think>I will check the weather in San Francisco for you.</think><|im_end|>
<|im_start|>assistant
<think>
Speak poetry after the first sentence.
</think>

<|im_end|>
<|im_start|>assistant
<think>
`,
		},
		{
			// Images are expected to appear as [img-N] placeholders in front of
			// the message text.
			name: "Image",
			msgs: []api.Message{ // i think this is because it does not go through the renderer?
				{Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData(IMAGE2_BASE64)}}, // does this work?
			}, // this is actually a local test, remote model may need to be different
			expected: `<|im_start|>user
[img-0]Describe this image.<|im_end|>
<|im_start|>assistant
<think>
`,
		}, // there's no way to do videos?
		{
			name: "Multiple images",
			msgs: []api.Message{
				{Role: "user", Content: "Describe these images.", Images: []api.ImageData{api.ImageData(IMAGE1_BASE64), api.ImageData(IMAGE2_BASE64)}},
			},
			expected: `<|im_start|>user
[img-0][img-1]Describe these images.<|im_end|>
<|im_start|>assistant
<think>
`,
		},
		{
			// Tool definitions are expected inside the system turn, the tool
			// call inside the assistant turn, and the tool result wrapped in
			// <tool_response> inside a user turn.
			name: "with tools and response",
			msgs: []api.Message{
				{Role: "system", Content: "You are a helpful assistant with access to tools."},
				{Role: "user", Content: "What's the weather like in New York?"},
				{
					Role:    "assistant",
					Content: "I'll check the weather in New York for you.",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "get-current-weather",
								Arguments: map[string]any{
									"location": "New York",
									"unit":     "fahrenheit",
								},
							},
						},
					},
				},
				{Role: "tool", Content: "80", ToolName: "get-current-weather"},
				{Role: "user", Content: "That sounds nice! What about San Francisco?"},
			},
			tools: []api.Tool{
				{
					Type: "function",
					Function: api.ToolFunction{
						Name:        "get-current-weather",
						Description: "Get the current weather for a location",
						Parameters: api.ToolFunctionParameters{
							Type:     "object",
							Required: []string{"location"},
							Properties: map[string]api.ToolProperty{
								"location": {
									Type:        api.PropertyType{"string"},
									Description: "The city and state, e.g. San Francisco, CA",
								},
								"unit": {
									Type:        api.PropertyType{"string"},
									Enum:        []any{"celsius", "fahrenheit"},
									Description: "The temperature unit",
								},
							},
						},
					},
				},
			},
			expected: `<|im_start|>system
You are a helpful assistant with access to tools.

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "get-current-weather", "description": "Get the current weather for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "The temperature unit"}}, "required": ["location"]}}}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call><|im_end|>
<|im_start|>user
What's the weather like in New York?<|im_end|>
<|im_start|>assistant
I'll check the weather in New York for you.
<tool_call>
{"name": "get-current-weather", "arguments": {"location": "New York", "unit": "fahrenheit"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
80
</tool_response><|im_end|>
<|im_start|>user
That sounds nice! What about San Francisco?<|im_end|>
<|im_start|>assistant
<think>
`,
		},
		{
			// Two tool calls are expected in one assistant turn, and the two
			// consecutive tool results in a single user turn.
			// NOTE(review): the parameter descriptions declared below ("First
			// number", ...) do not appear in the expected <tools> JSON, unlike
			// the weather case above — confirm this is intended.
			name: "With tools and response, multiple tool calls",
			msgs: []api.Message{
				{
					Role:    "system",
					Content: "You are a helpful assistant with access to tools.",
				},
				{
					Role:    "user",
					Content: "Call two tools for me: add and multiply.",
				},
				{
					Role:    "assistant",
					Content: "Sure, I'll call both tools for you.",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "add",
								Arguments: map[string]any{
									"a": 2,
									"b": 3,
								},
							},
						},
						{
							Function: api.ToolCallFunction{
								Name: "multiply",
								Arguments: map[string]any{
									"x": 4,
									"y": 5,
								},
							},
						},
					},
				},
				{
					Role:     "tool",
					Content:  "5",
					ToolName: "add",
				},
				{
					Role:     "tool",
					Content:  "20",
					ToolName: "multiply",
				},
				{
					Role:    "user",
					Content: "Thanks! What are the results?",
				},
			},
			tools: []api.Tool{
				{
					Type: "function",
					Function: api.ToolFunction{
						Name:        "add",
						Description: "Add two numbers",
						Parameters: api.ToolFunctionParameters{
							Type:     "object",
							Required: []string{"a", "b"},
							Properties: map[string]api.ToolProperty{
								"a": {Type: api.PropertyType{"integer"}, Description: "First number"},
								"b": {Type: api.PropertyType{"integer"}, Description: "Second number"},
							},
						},
					},
				},
				{
					Type: "function",
					Function: api.ToolFunction{
						Name:        "multiply",
						Description: "Multiply two numbers",
						Parameters: api.ToolFunctionParameters{
							Type:     "object",
							Required: []string{"x", "y"},
							Properties: map[string]api.ToolProperty{
								"x": {Type: api.PropertyType{"integer"}, Description: "First factor"},
								"y": {Type: api.PropertyType{"integer"}, Description: "Second factor"},
							},
						},
					},
				},
			},
			expected: `<|im_start|>system
You are a helpful assistant with access to tools.

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "add", "description": "Add two numbers", "parameters": {"type": "object", "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}}, "required": ["a", "b"]}}}
{"type": "function", "function": {"name": "multiply", "description": "Multiply two numbers", "parameters": {"type": "object", "properties": {"x": {"type": "integer"}, "y": {"type": "integer"}}, "required": ["x", "y"]}}}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call><|im_end|>
<|im_start|>user
Call two tools for me: add and multiply.<|im_end|>
<|im_start|>assistant
Sure, I'll call both tools for you.
<tool_call>
{"name": "add", "arguments": {"a": 2, "b": 3}}
</tool_call>
<tool_call>
{"name": "multiply", "arguments": {"x": 4, "y": 5}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
5
</tool_response>
<tool_response>
20
</tool_response><|im_end|>
<|im_start|>user
Thanks! What are the results?<|im_end|>
<|im_start|>assistant
<think>
`,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Earlier function-style call, kept for reference:
			// rendered, err := Qwen3VLRenderer(tt.msgs, tt.tools, nil)
			rendered, err := (&Qwen3VLRenderer{true}).Render(tt.msgs, tt.tools, nil)
			if err != nil {
				t.Fatal(err)
			}
			// rendered is the first argument, so "-" lines in the diff are what
			// the renderer produced and "+" lines are what the test wanted.
			if diff := cmp.Diff(rendered, tt.expected); diff != "" {
				t.Errorf("mismatch (-got +want):\n%s", diff)
			}
		})
	}
}
|
||||
|
||||
// what is this function for?
|
||||
|
||||
func TestFormatToolCallArgumentVL(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
arg any
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "string",
|
||||
arg: "foo",
|
||||
// notice no quotes around the string
|
||||
expected: "foo",
|
||||
},
|
||||
{
|
||||
name: "map",
|
||||
arg: map[string]any{"foo": "bar"},
|
||||
expected: "{\"foo\":\"bar\"}",
|
||||
},
|
||||
{
|
||||
name: "number",
|
||||
arg: 1,
|
||||
expected: "1",
|
||||
},
|
||||
{
|
||||
name: "boolean",
|
||||
arg: true,
|
||||
expected: "true",
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := formatToolCallArgument(tt.arg)
|
||||
if got != tt.expected {
|
||||
t.Errorf("formatToolCallArgument(%v) = %v, want %v", tt.arg, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue