mirror of https://github.com/ollama/ollama.git
comment cleanup
This commit is contained in:
parent
7a57a469e7
commit
6976917864
|
@ -314,7 +314,6 @@ func (t *ToolFunctionParameters) String() string {
|
|||
return string(bts)
|
||||
}
|
||||
|
||||
// check what works with toolfunction
|
||||
type ToolFunction struct {
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description,omitempty"`
|
||||
|
|
|
@ -13,11 +13,9 @@ import (
|
|||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
// parsers shouldn't need to do images
|
||||
|
||||
const (
|
||||
CollectingContent qwenParserState = iota
|
||||
CollectingThinkingContent // this is because qwen3vl starts with <thinking>
|
||||
CollectingContent qwenParserState = iota
|
||||
CollectingThinkingContent
|
||||
CollectingToolContent
|
||||
)
|
||||
|
||||
|
@ -43,7 +41,6 @@ func (p *Qwen3VLParser) HasThinkingSupport() bool {
|
|||
func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
|
||||
p.tools = tools
|
||||
return tools
|
||||
// does qwenvl modify tools? what does this mean?
|
||||
}
|
||||
|
||||
type qwenEventThinkingContent struct {
|
||||
|
@ -103,9 +100,9 @@ func (p *Qwen3VLParser) parseEvents() []qwenEvent {
|
|||
|
||||
// think if a better name
|
||||
func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {
|
||||
split := strings.SplitN(p.buffer.String(), tag, 2) // what is his 2 for?
|
||||
before := split[0] // before the tag
|
||||
before = strings.TrimRightFunc(before, unicode.IsSpace) // trim all the space after the bfire
|
||||
split := strings.SplitN(p.buffer.String(), tag, 2)
|
||||
before := split[0]
|
||||
before = strings.TrimRightFunc(before, unicode.IsSpace)
|
||||
if len(before) > 0 {
|
||||
events = append(events, qwenEventContent{content: before})
|
||||
}
|
||||
|
@ -127,52 +124,38 @@ func findFirstTag(p *Qwen3VLParser, tags []string) string {
|
|||
firstTag = tag
|
||||
}
|
||||
}
|
||||
if minIdx == -1 { // just content
|
||||
if minIdx == -1 {
|
||||
return ""
|
||||
}
|
||||
return firstTag // there is a possibility that there is no tag, can you return nil for that?
|
||||
return firstTag
|
||||
}
|
||||
|
||||
func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
||||
var events []qwenEvent
|
||||
|
||||
// certain events:
|
||||
// - thinking opening tag
|
||||
// - tool opening tag
|
||||
|
||||
// since there is multiple tags, we need to think about which tag comes first
|
||||
// we also need to create a list for
|
||||
firstTag := findFirstTag(p, []string{thinkingOpenTag, toolOpenTag})
|
||||
|
||||
switch p.state {
|
||||
case CollectingContent: // we can only look for thinking content if we're collecting content
|
||||
|
||||
// if strings.Contains(p.buffer.String(), thinkingOpenTag) { // found thinking
|
||||
case CollectingContent:
|
||||
if firstTag == thinkingOpenTag {
|
||||
// string contains the openThinkingTag, we move it to the CollectingThinkingContent state
|
||||
events = emitContentBeforeTag(p, events, thinkingOpenTag)
|
||||
p.state = CollectingThinkingContent // <found a thinking>
|
||||
p.state = CollectingThinkingContent
|
||||
return events, true
|
||||
// } else if strings.Contains(p.buffer.String(), toolOpenTag) { // found tool call
|
||||
} else if firstTag == toolOpenTag {
|
||||
events = emitContentBeforeTag(p, events, toolOpenTag)
|
||||
p.state = CollectingToolContent // found a <tool_call>
|
||||
p.state = CollectingToolContent
|
||||
return events, true
|
||||
} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 { // found a partial thinking tag
|
||||
// it is only possible that they find 1
|
||||
// found a partial think tag, emit the unambiguous before the partial tool call
|
||||
// hello </think -> hello, so ambiguous start includes all the whitespace before the tag
|
||||
} else if overlapLen := overlap(p.buffer.String(), thinkingOpenTag); overlapLen > 0 {
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
||||
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
||||
// HAVENT ADDED TRAILING WHITESPACE YET...
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
return events, false
|
||||
} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { // found a partial tool call tag
|
||||
} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 {
|
||||
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
||||
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
||||
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
||||
|
@ -183,9 +166,9 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
|||
p.buffer.WriteString(ambiguous)
|
||||
events = append(events, qwenEventContent{content: unambiguous})
|
||||
return events, false
|
||||
} else { // no partial or full thinking or tool call tag found
|
||||
whitespaceLen := trailingWhitespaceLen(p.buffer.String()) // <- all the trailing space we consider ambiguous
|
||||
ambiguousStart := len(p.buffer.String()) - whitespaceLen // - whitespaceLen
|
||||
} else {
|
||||
whitespaceLen := trailingWhitespaceLen(p.buffer.String())
|
||||
ambiguousStart := len(p.buffer.String()) - whitespaceLen
|
||||
unambiguous := p.buffer.String()[:ambiguousStart]
|
||||
ambiguous := p.buffer.String()[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
|
@ -195,16 +178,16 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
|||
}
|
||||
return events, false
|
||||
}
|
||||
case CollectingToolContent: // we only move towards the CollectingContent state
|
||||
case CollectingToolContent:
|
||||
if strings.Contains(p.buffer.String(), toolCloseTag) {
|
||||
split := strings.SplitN(p.buffer.String(), toolCloseTag, 2) // this one splits by the first one
|
||||
split := strings.SplitN(p.buffer.String(), toolCloseTag, 2)
|
||||
before := split[0]
|
||||
if len(before) == 0 {
|
||||
slog.Warn("qwen tool call closing tag found but no content before it")
|
||||
}
|
||||
// after := split[1]
|
||||
after := strings.TrimLeftFunc(split[1], unicode.IsSpace) // no whit space yet
|
||||
events = append(events, qwenEventRawToolCall{raw: before}) // do these need to be "seperated"?
|
||||
|
||||
after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
|
||||
events = append(events, qwenEventRawToolCall{raw: before})
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = CollectingContent
|
||||
|
@ -215,13 +198,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
|||
case CollectingThinkingContent:
|
||||
if strings.Contains(p.buffer.String(), thinkingCloseTag) {
|
||||
split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2)
|
||||
// so it looks like before contains the open tag
|
||||
fmt.Println("split", split)
|
||||
before := split[0]
|
||||
if len(before) == 0 {
|
||||
slog.Warn("qwen tool call closing tag found but no content before it")
|
||||
}
|
||||
// after := split[1] // no whit space yet
|
||||
after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
|
||||
events = append(events, qwenEventThinkingContent{content: before})
|
||||
p.buffer.Reset()
|
||||
|
|
|
@ -10,18 +10,6 @@ import (
|
|||
var imageCount int
|
||||
var videoCount int
|
||||
|
||||
// so i think from the renders, do vision is false
|
||||
|
||||
// basic
|
||||
// [-] with tools
|
||||
// [] with multiple tools
|
||||
// [-] with tool calling
|
||||
// [ ] with multiple tool calling
|
||||
// with images and videos
|
||||
|
||||
// TODO: is there a way not to have to writ my own code for marshalWithSpaces
|
||||
// the tool dictionaery list is slightly different
|
||||
|
||||
func marshalWithSpaces(v any) ([]byte, error) {
|
||||
b, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
|
@ -61,41 +49,7 @@ func marshalWithSpaces(v any) ([]byte, error) {
|
|||
return out, nil
|
||||
}
|
||||
|
||||
// func renderContent(content any, doVisionCount bool) string {
|
||||
// print(content)
|
||||
// switch content.(type) {
|
||||
// case string:
|
||||
// return content.(string)
|
||||
// default:
|
||||
// var subSb strings.Builder
|
||||
// for _, item := range content.([]any) {
|
||||
// if strings.Contains(item.(string), "image") || strings.Contains(item.(string), "image_url") || item.(map[string]any)["type"] == "image" {
|
||||
// if doVisionCount {
|
||||
// imageCount++
|
||||
// }
|
||||
// // if addVisionID {
|
||||
// // sb.WriteString("Picture " + strconv.Itoa(imageCount) + ": ") // do we need the itoa thing?
|
||||
// // }
|
||||
// subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
|
||||
// } else if strings.Contains(item.(string), "video") || item.(map[string]any)["type"] == "video" {
|
||||
// if doVisionCount {
|
||||
// videoCount++
|
||||
// }
|
||||
// // if addVisionID {
|
||||
// // sb.WriteString("Video " + strconv.Itoa(videoCount) + ": ") // do we need the itoa thing?
|
||||
// // }
|
||||
// subSb.WriteString("<|vision_start|><|video_pad|><|vision_end|>")
|
||||
// } else if strings.Contains(item.(string), "text") {
|
||||
// subSb.WriteString(item.(map[string]any)["text"].(string))
|
||||
// }
|
||||
// }
|
||||
// return subSb.String()
|
||||
// }
|
||||
// }
|
||||
|
||||
func renderContent(content api.Message, doVisionCount bool) string {
|
||||
// print(content)
|
||||
|
||||
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
|
||||
var subSb strings.Builder
|
||||
for _ = range content.Images {
|
||||
|
@ -104,12 +58,9 @@ func renderContent(content api.Message, doVisionCount bool) string {
|
|||
}
|
||||
subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
|
||||
}
|
||||
// we dont support videos yet so idk what to do exactly
|
||||
|
||||
// subSb.WriteString("<|vision_start|><|video_pad|><|vision_end|>")
|
||||
// TODO: support videos
|
||||
|
||||
subSb.WriteString(content.Content)
|
||||
|
||||
return subSb.String()
|
||||
}
|
||||
|
||||
|
@ -129,7 +80,6 @@ func Qwen3VLRenderer(messages []api.Message, tools []api.Tool, _ *api.ThinkValue
|
|||
}
|
||||
}
|
||||
sb.WriteString("\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n")
|
||||
// sb.WriteString("<|im_end|>\n")
|
||||
} else if len(messages) > 0 && messages[0].Role == "system" {
|
||||
sb.WriteString("<|im_start|>system\n" + messages[0].Content + "<|im_end|>\n")
|
||||
}
|
||||
|
@ -149,11 +99,7 @@ func Qwen3VLRenderer(messages []api.Message, tools []api.Tool, _ *api.ThinkValue
|
|||
}
|
||||
|
||||
for i, message := range messages {
|
||||
// fmt.Println("This is the content that we are rendering: ", message.Content)
|
||||
|
||||
// content := renderContent(message.Content, true)
|
||||
|
||||
content := renderContent(message, true) // we want to render the entire message, because it may have images in them
|
||||
content := renderContent(message, true)
|
||||
|
||||
if message.Role == "user" || message.Role == "system" && i != 0 {
|
||||
sb.WriteString("<|im_start|>" + message.Role + "\n" + content + "<|im_end|>\n")
|
||||
|
@ -211,7 +157,6 @@ func Qwen3VLRenderer(messages []api.Message, tools []api.Tool, _ *api.ThinkValue
|
|||
|
||||
}
|
||||
|
||||
// sb.WriteString("<|im_start|>assistant\n<think>\n")
|
||||
sb.WriteString("<|im_start|>assistant\n")
|
||||
return sb.String(), nil
|
||||
|
||||
|
|
Loading…
Reference in New Issue