mirror of https://github.com/ollama/ollama.git
Merge pull request #12417 from ollama/drifkin/qwen3-coder-unicode
parsers: fix unicode handling for qwen3-coder
This commit is contained in:
commit
34efbbd3f0
|
@ -11,6 +11,7 @@ import (
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode"
|
"unicode"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/logutil"
|
"github.com/ollama/ollama/logutil"
|
||||||
|
@ -204,12 +205,21 @@ func overlap(s, delim string) int {
|
||||||
}
|
}
|
||||||
|
|
||||||
// trailingWhitespaceLen returns the number of bytes of whitespace, as
// classified by unicode.IsSpace, at the end of s.
//
// The string is walked backwards one rune at a time rather than one byte at a
// time. Multi-byte whitespace such as U+00A0 (2 bytes) or U+3000 (3 bytes) is
// therefore counted in full, and a trailing byte of a non-space rune (for
// example the 0xA0 that ends the UTF-8 encoding of "à") is never mistaken for
// a space. It returns 0 for an empty string and len(s) when s is entirely
// whitespace.
func trailingWhitespaceLen(s string) int {
	end := len(s)
	for end > 0 {
		r, size := utf8.DecodeLastRuneInString(s[:end])
		// An invalid encoding decodes as (RuneError, 1); assume it isn't
		// whitespace and stop scanning.
		if r == utf8.RuneError && size == 1 {
			break
		}
		if !unicode.IsSpace(r) {
			break
		}
		end -= size
	}
	return len(s) - end
}
|
||||||
|
|
||||||
type XMLFunctionCall struct {
|
type XMLFunctionCall struct {
|
||||||
|
|
|
@ -166,6 +166,137 @@ func TestQwenParserStreaming(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
desc: "unicode content",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "你好 🌍<tool_call>test</tool_call>مرحبا",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "你好 🌍"},
|
||||||
|
qwenEventRawToolCall{raw: "test"},
|
||||||
|
qwenEventContent{content: "مرحبا"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "arabic text handling",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "مرحبا بالعالم",
|
||||||
|
wantEvents: []qwenEvent{qwenEventContent{content: "مرحبا بالعالم"}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "emoji passthrough",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "✅",
|
||||||
|
wantEvents: []qwenEvent{qwenEventContent{content: "✅"}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "emoji after tool call",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "<tool_call>test</tool_call>完成 ✅",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventRawToolCall{raw: "test"},
|
||||||
|
qwenEventContent{content: "完成 ✅"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "unicode streaming with whitespace handling",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "مرحبا",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "مرحبا"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: " \n",
|
||||||
|
wantEvents: []qwenEvent{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "世界",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: " \n世界"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "non-breaking space withheld across chunks",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "Hello\u00a0",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "Hello"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "world",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "\u00a0world"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "ideographic space before partial tool",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "Hello\u3000<tool",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "Hello"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "_call>abc",
|
||||||
|
wantEvents: []qwenEvent{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "</tool_call>def",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventRawToolCall{raw: "abc"},
|
||||||
|
qwenEventContent{content: "def"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "ideographic space before partial tool fakeout",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "Hello\u3000<tool",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "Hello"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "fakeout>abc",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "\u3000<toolfakeout>abc"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "unicode with partial tool tag",
|
||||||
|
steps: []step{
|
||||||
|
{
|
||||||
|
input: "测试🎯 <to",
|
||||||
|
wantEvents: []qwenEvent{
|
||||||
|
qwenEventContent{content: "测试🎯"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
anyOnlies := false
|
anyOnlies := false
|
||||||
|
@ -347,6 +478,27 @@ ls && echo "a > b and a < b"
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "unicode in function names and parameters",
|
||||||
|
tools: []api.Tool{},
|
||||||
|
rawToolCall: `<function=获取天气>
|
||||||
|
<parameter=城市>
|
||||||
|
北京
|
||||||
|
</parameter>
|
||||||
|
<parameter=message>
|
||||||
|
Hello! 你好! 🌟 مرحبا
|
||||||
|
</parameter>
|
||||||
|
</function>`,
|
||||||
|
wantToolCall: api.ToolCall{
|
||||||
|
Function: api.ToolCallFunction{
|
||||||
|
Name: "获取天气",
|
||||||
|
Arguments: map[string]any{
|
||||||
|
"城市": "北京",
|
||||||
|
"message": "Hello! 你好! 🌟 مرحبا",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, step := range steps {
|
for i, step := range steps {
|
||||||
|
@ -360,6 +512,42 @@ ls && echo "a > b and a < b"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTrailingWhitespaceLenUnicode(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
input string
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "ascii space",
|
||||||
|
input: "Hello ",
|
||||||
|
want: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "non-breaking space",
|
||||||
|
input: "Hello\u00a0",
|
||||||
|
want: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ideographic space",
|
||||||
|
input: "Hello\u3000",
|
||||||
|
want: 3,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multiple runes of whitespace",
|
||||||
|
input: "Hi\u00a0\u3000",
|
||||||
|
want: 5,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
got := trailingWhitespaceLen(tc.input)
|
||||||
|
if got != tc.want {
|
||||||
|
t.Errorf("%s: trailingWhitespaceLen(%q) = %d, want %d", tc.name, tc.input, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestQwenToolCallValueParsing(t *testing.T) {
|
func TestQwenToolCallValueParsing(t *testing.T) {
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
desc string
|
desc string
|
||||||
|
@ -867,6 +1055,8 @@ func TestTrailingWhitespaceLen(t *testing.T) {
|
||||||
{desc: "trailing whitespace with newlines", s: "abc \n", want: 2},
|
{desc: "trailing whitespace with newlines", s: "abc \n", want: 2},
|
||||||
{desc: "only whitespace", s: " \n ", want: 4},
|
{desc: "only whitespace", s: " \n ", want: 4},
|
||||||
{desc: "leading whitespace doesn't count", s: " \n abc", want: 0},
|
{desc: "leading whitespace doesn't count", s: " \n abc", want: 0},
|
||||||
|
{desc: "unicode with trailing space", s: "测试🎯 ", want: 1},
|
||||||
|
{desc: "unicode with trailing tab and newline", s: "مرحبا\t\n", want: 2},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tc := range cases {
|
for _, tc := range cases {
|
||||||
|
@ -876,3 +1066,30 @@ func TestTrailingWhitespaceLen(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestOverlapFunction(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
desc string
|
||||||
|
s string
|
||||||
|
delim string
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{desc: "no overlap", s: "hello", delim: "<tool", want: 0},
|
||||||
|
{desc: "full overlap", s: "hello<tool", delim: "<tool>", want: 5},
|
||||||
|
{desc: "partial overlap", s: "hello<to", delim: "<tool>", want: 3},
|
||||||
|
{desc: "unicode with partial overlap", s: "测试🎯<to", delim: "<tool>", want: 3},
|
||||||
|
{desc: "unicode string with no overlap", s: "مرحبا", delim: "<tool>", want: 0},
|
||||||
|
{desc: "unicode at boundary", s: "世界<", delim: "<tool>", want: 1},
|
||||||
|
{desc: "unicode delimiter single rune", s: "hello🔧", delim: "🔧工具", want: len("🔧")},
|
||||||
|
{desc: "unicode delimiter multiple runes", s: "hello🔧工", delim: "🔧工具", want: len("🔧工")},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.desc, func(t *testing.T) {
|
||||||
|
got := overlap(tc.s, tc.delim)
|
||||||
|
if got != tc.want {
|
||||||
|
t.Errorf("overlap(%q, %q) = %d, want %d", tc.s, tc.delim, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue