parsers: fix unicode handling for qwen3-coder

When trimming whitespace at the end of every chunk, we were iterating
backwards over the string byte-by-byte instead of rune-by-rune.

As an example of how this can cause corruption, suppose we have the
multi-byte character ✅ (`"\u2705"`), which is represented in UTF-8 as
the three bytes `0xE2 0x9C 0x85`. It happens that `0x85`, when read as a
rune on its own, is NEL (U+0085), which passes `unicode.IsSpace()`.
Because we were iterating byte-by-byte, this caused us to mistakenly
slice in the middle of the rune, removing `0x85` and leaving
`0xE2 0x9C`, which, beyond being the incorrect place to slice, is not
even a valid UTF-8 sequence.

`trailingWhitespaceLen()` was modified to count from the end in a
rune-aware way. Tests with various multibyte unicode characters were
also added.


Fixes: #12414
This commit is contained in:
Devon Rifkin 2025-09-25 15:47:46 -07:00
parent fbd82ba5bb
commit 05ba4ca1f4
2 changed files with 231 additions and 4 deletions

View File

@ -11,6 +11,7 @@ import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/logutil"
@ -204,12 +205,21 @@ func overlap(s, delim string) int {
}
// trailingWhitespaceLen returns the number of bytes at the end of s that are
// made up entirely of whitespace, as defined by unicode.IsSpace.
//
// The string is scanned backwards one rune at a time rather than one byte at a
// time. A byte-wise scan can mistake the last byte of a multi-byte rune for
// whitespace (for example, the final 0x85 byte of "\u2705" looks like U+0085
// NEL when converted to a rune on its own), which would lead callers to slice
// through the middle of a character.
func trailingWhitespaceLen(s string) int {
	end := len(s)
	for end > 0 {
		r, size := utf8.DecodeLastRuneInString(s[:end])
		// An invalid encoding is reported as RuneError with a width of 1;
		// treat such bytes as content, never as whitespace.
		if r == utf8.RuneError && size == 1 {
			break
		}
		if !unicode.IsSpace(r) {
			break
		}
		end -= size
	}
	return len(s) - end
}
type XMLFunctionCall struct {

View File

@ -166,6 +166,137 @@ func TestQwenParserStreaming(t *testing.T) {
},
},
},
{
desc: "unicode content",
steps: []step{
{
input: "你好 🌍<tool_call>test</tool_call>مرحبا",
wantEvents: []qwenEvent{
qwenEventContent{content: "你好 🌍"},
qwenEventRawToolCall{raw: "test"},
qwenEventContent{content: "مرحبا"},
},
},
},
},
{
desc: "arabic text handling",
steps: []step{
{
input: "مرحبا بالعالم",
wantEvents: []qwenEvent{qwenEventContent{content: "مرحبا بالعالم"}},
},
},
},
{
desc: "emoji passthrough",
steps: []step{
{
input: "✅",
wantEvents: []qwenEvent{qwenEventContent{content: "✅"}},
},
},
},
{
desc: "emoji after tool call",
steps: []step{
{
input: "<tool_call>test</tool_call>完成 ✅",
wantEvents: []qwenEvent{
qwenEventRawToolCall{raw: "test"},
qwenEventContent{content: "完成 ✅"},
},
},
},
},
{
desc: "unicode streaming with whitespace handling",
steps: []step{
{
input: "مرحبا",
wantEvents: []qwenEvent{
qwenEventContent{content: "مرحبا"},
},
},
{
input: " \n",
wantEvents: []qwenEvent{},
},
{
input: "世界",
wantEvents: []qwenEvent{
qwenEventContent{content: " \n世界"},
},
},
},
},
{
desc: "non-breaking space withheld across chunks",
steps: []step{
{
input: "Hello\u00a0",
wantEvents: []qwenEvent{
qwenEventContent{content: "Hello"},
},
},
{
input: "world",
wantEvents: []qwenEvent{
qwenEventContent{content: "\u00a0world"},
},
},
},
},
{
desc: "ideographic space before partial tool",
steps: []step{
{
input: "Hello\u3000<tool",
wantEvents: []qwenEvent{
qwenEventContent{content: "Hello"},
},
},
{
input: "_call>abc",
wantEvents: []qwenEvent{},
},
{
input: "</tool_call>def",
wantEvents: []qwenEvent{
qwenEventRawToolCall{raw: "abc"},
qwenEventContent{content: "def"},
},
},
},
},
{
desc: "ideographic space before partial tool fakeout",
steps: []step{
{
input: "Hello\u3000<tool",
wantEvents: []qwenEvent{
qwenEventContent{content: "Hello"},
},
},
{
input: "fakeout>abc",
wantEvents: []qwenEvent{
qwenEventContent{content: "\u3000<toolfakeout>abc"},
},
},
},
},
{
desc: "unicode with partial tool tag",
steps: []step{
{
input: "测试🎯 <to",
wantEvents: []qwenEvent{
qwenEventContent{content: "测试🎯"},
},
},
},
},
}
anyOnlies := false
@ -347,6 +478,27 @@ ls && echo "a > b and a < b"
},
},
},
{
name: "unicode in function names and parameters",
tools: []api.Tool{},
rawToolCall: `<function=获取天气>
<parameter=城市>
北京
</parameter>
<parameter=message>
Hello! 你好! 🌟 مرحبا
</parameter>
</function>`,
wantToolCall: api.ToolCall{
Function: api.ToolCallFunction{
Name: "获取天气",
Arguments: map[string]any{
"城市": "北京",
"message": "Hello! 你好! 🌟 مرحبا",
},
},
},
},
}
for i, step := range steps {
@ -360,6 +512,42 @@ ls && echo "a > b and a < b"
}
}
// TestTrailingWhitespaceLenUnicode checks that trailingWhitespaceLen reports
// the trailing whitespace length in bytes when the input contains multi-byte
// runes, including runes whose final byte would look like whitespace if it
// were inspected on its own.
func TestTrailingWhitespaceLenUnicode(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  int
	}{
		{
			name:  "ascii space",
			input: "Hello ",
			want:  1,
		},
		{
			name:  "non-breaking space",
			input: "Hello\u00a0",
			want:  2,
		},
		{
			name:  "ideographic space",
			input: "Hello\u3000",
			want:  3,
		},
		{
			name:  "multiple runes of whitespace",
			input: "Hi\u00a0\u3000",
			want:  5,
		},
		{
			// U+2705 is encoded as 0xE2 0x9C 0x85; the last byte must not be
			// mistaken for U+0085 (NEL).
			name:  "rune ending in 0x85 is not whitespace",
			input: "done \u2705",
			want:  0,
		},
		{
			name:  "truncated multi-byte sequence is not whitespace",
			input: "Hello\xe2\x9c",
			want:  0,
		},
		{
			name:  "whitespace after invalid byte",
			input: "a\xff ",
			want:  1,
		},
	}

	for _, tc := range cases {
		// Run each case as a subtest so failures are reported individually.
		t.Run(tc.name, func(t *testing.T) {
			got := trailingWhitespaceLen(tc.input)
			if got != tc.want {
				t.Errorf("%s: trailingWhitespaceLen(%q) = %d, want %d", tc.name, tc.input, got, tc.want)
			}
		})
	}
}
func TestQwenToolCallValueParsing(t *testing.T) {
cases := []struct {
desc string
@ -867,6 +1055,8 @@ func TestTrailingWhitespaceLen(t *testing.T) {
{desc: "trailing whitespace with newlines", s: "abc \n", want: 2},
{desc: "only whitespace", s: " \n ", want: 4},
{desc: "leading whitespace doesn't count", s: " \n abc", want: 0},
{desc: "unicode with trailing space", s: "测试🎯 ", want: 1},
{desc: "unicode with trailing tab and newline", s: "مرحبا\t\n", want: 2},
}
for _, tc := range cases {
@ -876,3 +1066,30 @@ func TestTrailingWhitespaceLen(t *testing.T) {
}
}
}
// TestOverlapFunction runs overlap against a table of inputs and expected
// results. The table includes strings and delimiters containing multi-byte
// runes; expected values are byte counts.
func TestOverlapFunction(t *testing.T) {
	type overlapCase struct {
		desc  string
		s     string
		delim string
		want  int
	}

	table := []overlapCase{
		{desc: "no overlap", s: "hello", delim: "<tool", want: 0},
		{desc: "full overlap", s: "hello<tool", delim: "<tool>", want: 5},
		{desc: "partial overlap", s: "hello<to", delim: "<tool>", want: 3},
		{desc: "unicode with partial overlap", s: "测试🎯<to", delim: "<tool>", want: 3},
		{desc: "unicode string with no overlap", s: "مرحبا", delim: "<tool>", want: 0},
		{desc: "unicode at boundary", s: "世界<", delim: "<tool>", want: 1},
		{desc: "unicode delimiter single rune", s: "hello🔧", delim: "🔧工具", want: len("🔧")},
		{desc: "unicode delimiter multiple runes", s: "hello🔧工", delim: "🔧工具", want: len("🔧工")},
	}

	for i := range table {
		entry := table[i]
		t.Run(entry.desc, func(t *testing.T) {
			if got := overlap(entry.s, entry.delim); got != entry.want {
				t.Errorf("overlap(%q, %q) = %d, want %d", entry.s, entry.delim, got, entry.want)
			}
		})
	}
}