Merge pull request #12417 from ollama/drifkin/qwen3-coder-unicode

parsers: fix unicode handling for qwen3-coder
This commit is contained in:
Devon Rifkin 2025-09-25 15:56:34 -07:00 committed by GitHub
commit 34efbbd3f0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 231 additions and 4 deletions

View File

@ -11,6 +11,7 @@ import (
"strconv" "strconv"
"strings" "strings"
"unicode" "unicode"
"unicode/utf8"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/logutil" "github.com/ollama/ollama/logutil"
@ -204,12 +205,21 @@ func overlap(s, delim string) int {
} }
func trailingWhitespaceLen(s string) int { func trailingWhitespaceLen(s string) int {
for i := len(s) - 1; i >= 0; i-- { remaining := s
if !unicode.IsSpace(rune(s[i])) { total := 0
return len(s) - i - 1 for len(remaining) > 0 {
r, size := utf8.DecodeLastRuneInString(remaining)
// if it's an invalid utf8 rune, assume it isn't whitespace
if r == utf8.RuneError && size == 1 {
break
} }
if !unicode.IsSpace(r) {
break
}
total += size
remaining = remaining[:len(remaining)-size]
} }
return len(s) return total
} }
type XMLFunctionCall struct { type XMLFunctionCall struct {

View File

@ -166,6 +166,137 @@ func TestQwenParserStreaming(t *testing.T) {
}, },
}, },
}, },
{
desc: "unicode content",
steps: []step{
{
input: "你好 🌍<tool_call>test</tool_call>مرحبا",
wantEvents: []qwenEvent{
qwenEventContent{content: "你好 🌍"},
qwenEventRawToolCall{raw: "test"},
qwenEventContent{content: "مرحبا"},
},
},
},
},
{
desc: "arabic text handling",
steps: []step{
{
input: "مرحبا بالعالم",
wantEvents: []qwenEvent{qwenEventContent{content: "مرحبا بالعالم"}},
},
},
},
{
desc: "emoji passthrough",
steps: []step{
{
input: "✅",
wantEvents: []qwenEvent{qwenEventContent{content: "✅"}},
},
},
},
{
desc: "emoji after tool call",
steps: []step{
{
input: "<tool_call>test</tool_call>完成 ✅",
wantEvents: []qwenEvent{
qwenEventRawToolCall{raw: "test"},
qwenEventContent{content: "完成 ✅"},
},
},
},
},
{
desc: "unicode streaming with whitespace handling",
steps: []step{
{
input: "مرحبا",
wantEvents: []qwenEvent{
qwenEventContent{content: "مرحبا"},
},
},
{
input: " \n",
wantEvents: []qwenEvent{},
},
{
input: "世界",
wantEvents: []qwenEvent{
qwenEventContent{content: " \n世界"},
},
},
},
},
{
desc: "non-breaking space withheld across chunks",
steps: []step{
{
input: "Hello\u00a0",
wantEvents: []qwenEvent{
qwenEventContent{content: "Hello"},
},
},
{
input: "world",
wantEvents: []qwenEvent{
qwenEventContent{content: "\u00a0world"},
},
},
},
},
{
desc: "ideographic space before partial tool",
steps: []step{
{
input: "Hello\u3000<tool",
wantEvents: []qwenEvent{
qwenEventContent{content: "Hello"},
},
},
{
input: "_call>abc",
wantEvents: []qwenEvent{},
},
{
input: "</tool_call>def",
wantEvents: []qwenEvent{
qwenEventRawToolCall{raw: "abc"},
qwenEventContent{content: "def"},
},
},
},
},
{
desc: "ideographic space before partial tool fakeout",
steps: []step{
{
input: "Hello\u3000<tool",
wantEvents: []qwenEvent{
qwenEventContent{content: "Hello"},
},
},
{
input: "fakeout>abc",
wantEvents: []qwenEvent{
qwenEventContent{content: "\u3000<toolfakeout>abc"},
},
},
},
},
{
desc: "unicode with partial tool tag",
steps: []step{
{
input: "测试🎯 <to",
wantEvents: []qwenEvent{
qwenEventContent{content: "测试🎯"},
},
},
},
},
} }
anyOnlies := false anyOnlies := false
@ -347,6 +478,27 @@ ls && echo "a > b and a < b"
}, },
}, },
}, },
{
name: "unicode in function names and parameters",
tools: []api.Tool{},
rawToolCall: `<function=获取天气>
<parameter=城市>
北京
</parameter>
<parameter=message>
Hello! 你好! 🌟 مرحبا
</parameter>
</function>`,
wantToolCall: api.ToolCall{
Function: api.ToolCallFunction{
Name: "获取天气",
Arguments: map[string]any{
"城市": "北京",
"message": "Hello! 你好! 🌟 مرحبا",
},
},
},
},
} }
for i, step := range steps { for i, step := range steps {
@ -360,6 +512,42 @@ ls && echo "a > b and a < b"
} }
} }
func TestTrailingWhitespaceLenUnicode(t *testing.T) {
cases := []struct {
name string
input string
want int
}{
{
name: "ascii space",
input: "Hello ",
want: 1,
},
{
name: "non-breaking space",
input: "Hello\u00a0",
want: 2,
},
{
name: "ideographic space",
input: "Hello\u3000",
want: 3,
},
{
name: "multiple runes of whitespace",
input: "Hi\u00a0\u3000",
want: 5,
},
}
for _, tc := range cases {
got := trailingWhitespaceLen(tc.input)
if got != tc.want {
t.Errorf("%s: trailingWhitespaceLen(%q) = %d, want %d", tc.name, tc.input, got, tc.want)
}
}
}
func TestQwenToolCallValueParsing(t *testing.T) { func TestQwenToolCallValueParsing(t *testing.T) {
cases := []struct { cases := []struct {
desc string desc string
@ -867,6 +1055,8 @@ func TestTrailingWhitespaceLen(t *testing.T) {
{desc: "trailing whitespace with newlines", s: "abc \n", want: 2}, {desc: "trailing whitespace with newlines", s: "abc \n", want: 2},
{desc: "only whitespace", s: " \n ", want: 4}, {desc: "only whitespace", s: " \n ", want: 4},
{desc: "leading whitespace doesn't count", s: " \n abc", want: 0}, {desc: "leading whitespace doesn't count", s: " \n abc", want: 0},
{desc: "unicode with trailing space", s: "测试🎯 ", want: 1},
{desc: "unicode with trailing tab and newline", s: "مرحبا\t\n", want: 2},
} }
for _, tc := range cases { for _, tc := range cases {
@ -876,3 +1066,30 @@ func TestTrailingWhitespaceLen(t *testing.T) {
} }
} }
} }
func TestOverlapFunction(t *testing.T) {
cases := []struct {
desc string
s string
delim string
want int
}{
{desc: "no overlap", s: "hello", delim: "<tool", want: 0},
{desc: "full overlap", s: "hello<tool", delim: "<tool>", want: 5},
{desc: "partial overlap", s: "hello<to", delim: "<tool>", want: 3},
{desc: "unicode with partial overlap", s: "测试🎯<to", delim: "<tool>", want: 3},
{desc: "unicode string with no overlap", s: "مرحبا", delim: "<tool>", want: 0},
{desc: "unicode at boundary", s: "世界<", delim: "<tool>", want: 1},
{desc: "unicode delimiter single rune", s: "hello🔧", delim: "🔧工具", want: len("🔧")},
{desc: "unicode delimiter multiple runes", s: "hello🔧工", delim: "🔧工具", want: len("🔧工")},
}
for _, tc := range cases {
t.Run(tc.desc, func(t *testing.T) {
got := overlap(tc.s, tc.delim)
if got != tc.want {
t.Errorf("overlap(%q, %q) = %d, want %d", tc.s, tc.delim, got, tc.want)
}
})
}
}