2023-07-04 03:22:44 +08:00
package server
import (
2024-06-18 01:38:55 +08:00
"bytes"
2024-05-01 01:55:19 +08:00
"cmp"
2023-07-26 05:08:51 +08:00
"context"
2023-07-07 01:40:11 +08:00
"encoding/json"
2023-10-07 04:06:20 +08:00
"errors"
2023-07-22 14:02:12 +08:00
"fmt"
2025-05-13 11:41:42 +08:00
"image"
2023-07-04 03:22:44 +08:00
"io"
2024-12-12 07:29:59 +08:00
"io/fs"
2024-01-19 02:52:01 +08:00
"log/slog"
2024-07-16 03:14:24 +08:00
"math"
2023-07-04 03:22:44 +08:00
"net"
"net/http"
2024-03-09 14:23:47 +08:00
"net/netip"
2025-09-18 05:40:53 +08:00
"net/url"
2023-07-08 03:27:43 +08:00
"os"
2023-08-31 04:35:03 +08:00
"os/signal"
2024-05-22 12:30:52 +08:00
"slices"
2023-07-07 01:40:11 +08:00
"strings"
2023-08-31 04:35:03 +08:00
"syscall"
2023-07-13 09:18:06 +08:00
"time"
2023-07-04 03:22:44 +08:00
2023-07-22 09:01:24 +08:00
"github.com/gin-contrib/cors"
2023-07-04 03:22:44 +08:00
"github.com/gin-gonic/gin"
2025-05-13 11:41:42 +08:00
"golang.org/x/image/webp"
2024-08-12 02:57:10 +08:00
"golang.org/x/sync/errgroup"
2023-07-04 03:22:44 +08:00
2024-03-27 04:04:17 +08:00
"github.com/ollama/ollama/api"
2025-09-18 05:40:53 +08:00
"github.com/ollama/ollama/auth"
2024-10-17 08:45:00 +08:00
"github.com/ollama/ollama/discover"
2024-05-25 05:57:15 +08:00
"github.com/ollama/ollama/envconfig"
2025-08-08 04:49:26 +08:00
"github.com/ollama/ollama/format"
2025-02-14 08:31:21 +08:00
"github.com/ollama/ollama/fs/ggml"
2025-08-22 04:56:22 +08:00
"github.com/ollama/ollama/harmony"
2024-03-27 04:04:17 +08:00
"github.com/ollama/ollama/llm"
2025-05-13 02:43:00 +08:00
"github.com/ollama/ollama/logutil"
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
"github.com/ollama/ollama/model/parsers"
2024-03-27 04:04:17 +08:00
"github.com/ollama/ollama/openai"
2025-02-28 04:04:53 +08:00
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry"
2024-06-11 05:54:42 +08:00
"github.com/ollama/ollama/template"
2025-06-07 03:02:20 +08:00
"github.com/ollama/ollama/thinking"
2025-05-24 05:19:31 +08:00
"github.com/ollama/ollama/tools"
2024-05-14 09:48:28 +08:00
"github.com/ollama/ollama/types/errtypes"
2024-04-17 07:22:38 +08:00
"github.com/ollama/ollama/types/model"
2024-03-27 04:04:17 +08:00
"github.com/ollama/ollama/version"
2023-07-04 03:22:44 +08:00
)
2025-09-13 04:32:30 +08:00
// shouldUseHarmony reports whether the model's responses should be parsed
// with the harmony message format. It requires both a gpt-oss model family
// and a template that carries the characteristic harmony tags.
func shouldUseHarmony(model *Model) bool {
	family := model.Config.ModelFamily
	if family != "gptoss" && family != "gpt-oss" {
		return false
	}
	// heuristic to check whether the template expects to be parsed via harmony:
	// search for harmony tags that are nearly always used
	return model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>")
}
server/internal/registry: take over pulls from server package (#9485)
This commit replaces the old pull implementation in the server package
with the new, faster, more robust pull implementation in the registry
package.
The new endpoint, and now the remove endpoint too, are behind the
feature gate "client2" enabled only by setting the OLLAMA_EXPERIMENT
environment variable include "client2".
Currently, the progress indication is wired to perform the same as the
previous implementation to avoid making changes to the CLI, and because
the status reports happen at the start of the download, and the end of
the write to disk, the progress indication is not as smooth as it could
be. This is a known issue and will be addressed in a future change.
This implementation may be ~0.5-1.0% slower in rare cases, depending on
network and disk speed, but is generally MUCH faster and more robust
than the its predecessor in all other cases.
2025-03-06 06:48:18 +08:00
// experimentEnabled reports whether the named experiment appears in the
// comma-separated OLLAMA_EXPERIMENT environment variable.
func experimentEnabled(name string) bool {
	experiments := strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ",")
	return slices.Contains(experiments, name)
}

// useClient2 gates the experimental registry-backed pull and remove endpoints.
var useClient2 = experimentEnabled("client2")
2025-08-08 04:49:26 +08:00
// lowVRAMThreshold is the total-VRAM cutoff below which the server enables
// low VRAM mode. Low VRAM mode is based on the sum of total VRAM (not free)
// and triggers reduced context length on some models.
var lowVRAMThreshold uint64 = 20 * format.GibiByte
2023-08-23 00:48:35 +08:00
var mode string = gin . DebugMode
2023-12-15 08:47:40 +08:00
// Server is the Ollama HTTP API server. It dispatches requests to handler
// methods and delegates model loading/unloading to its scheduler.
type Server struct {
	addr    net.Addr   // network address the server listens on
	sched   *Scheduler // schedules model runners for incoming requests
	lowVRAM bool       // true when total VRAM is below lowVRAMThreshold; reduces context length on some models
}
2023-08-23 00:48:35 +08:00
// init validates the package-level gin mode and applies it, defaulting to
// debug mode when the configured value is not one of gin's known modes.
func init() {
	switch mode {
	case gin.DebugMode, gin.ReleaseMode, gin.TestMode:
		// keep the configured mode as-is
	default:
		mode = gin.DebugMode
	}

	gin.SetMode(mode)
}
2024-08-02 05:52:15 +08:00
var (
	// errRequired indicates a required request field (e.g. the model name) was missing.
	errRequired = errors.New("is required")
	// errBadTemplate indicates a prompt template error.
	errBadTemplate = errors.New("template error")
)
2024-06-21 02:00:08 +08:00
2025-04-03 00:44:27 +08:00
func modelOptions ( model * Model , requestOpts map [ string ] any ) ( api . Options , error ) {
2024-01-04 01:01:42 +08:00
opts := api . DefaultOptions ( )
if err := opts . FromMap ( model . Options ) ; err != nil {
return api . Options { } , err
}
if err := opts . FromMap ( requestOpts ) ; err != nil {
return api . Options { } , err
}
return opts , nil
2023-08-09 03:13:22 +08:00
}
2024-07-04 00:00:07 +08:00
// scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
// It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
	if name == "" {
		return nil, nil, nil, fmt.Errorf("model %w", errRequired)
	}

	model, err := GetModel(name)
	if err != nil {
		return nil, nil, nil, err
	}

	// mllama models with projector files predate the current vision support
	// and must be re-pulled.
	if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
		return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
	}

	if err := model.CheckCapabilities(caps...); err != nil {
		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
	}

	opts, err := modelOptions(model, requestOpts)
	if err != nil {
		return nil, nil, nil, err
	}

	// This model is much more capable with a larger context, so set that
	// unless it would penalize performance too much
	if !s.lowVRAM && slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
		opts.NumCtx = max(opts.NumCtx, 8192)
	}

	// Block until the scheduler either allocates a runner or reports a failure.
	runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive)
	var runner *runnerRef
	select {
	case runner = <-runnerCh:
	case err = <-errCh:
		return nil, nil, nil, err
	}

	return runner.llama, model, &opts, nil
}
// GenerateHandler implements POST /api/generate. It renders a prompt from the
// request (unless raw mode is requested), schedules a runner for the model,
// and streams completion responses back to the client. Requests for remote
// models are proxied to the configured remote host instead.
func (s *Server) GenerateHandler(c *gin.Context) {
	checkpointStart := time.Now()

	var req api.GenerateRequest
	if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
		return
	} else if err != nil {
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}

	name := model.ParseName(req.Model)
	if !name.IsValid() {
		// Ideally this is "invalid model name" but we're keeping with
		// what the API currently returns until we can change it.
		c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
		return
	}

	// We cannot currently consolidate this into GetModel because doing so
	// would induce infinite recursion given the current code structure.
	name, err := getExistingName(name)
	if err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
		return
	}

	m, err := GetModel(name.String())
	if err != nil {
		switch {
		case errors.Is(err, fs.ErrNotExist):
			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
		case err.Error() == errtypes.InvalidModelNameErrMsg:
			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		default:
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		}
		return
	}

	// Remote models are proxied to their configured host; the local server
	// only fills in defaults (template, options, system prompt) first.
	if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
		origModel := req.Model

		remoteURL, err := url.Parse(m.Config.RemoteHost)
		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
			return
		}

		// Only hosts on the allow-list may be proxied to.
		if !slices.Contains(envconfig.Remotes(), remoteURL.Hostname()) {
			slog.Info("remote model", "remotes", envconfig.Remotes(), "remoteURL", m.Config.RemoteHost, "hostname", remoteURL.Hostname())
			c.JSON(http.StatusBadRequest, gin.H{"error": "this server cannot run this remote model"})
			return
		}

		req.Model = m.Config.RemoteModel
		if req.Template == "" && m.Template.String() != "" {
			req.Template = m.Template.String()
		}

		if req.Options == nil {
			req.Options = map[string]any{}
		}

		// Model options act as defaults; request options win on conflict.
		for k, v := range m.Options {
			if _, ok := req.Options[k]; !ok {
				req.Options[k] = v
			}
		}

		// update the system prompt from the model if one isn't already specified
		if req.System == "" && m.System != "" {
			req.System = m.System
		}

		if len(m.Messages) > 0 {
			slog.Warn("embedded messages in the model not supported with '/api/generate'; try '/api/chat' instead")
		}

		// fn relays each remote response chunk to the client, rewriting the
		// model name back to what the caller originally asked for.
		fn := func(resp api.GenerateResponse) error {
			resp.Model = origModel
			resp.RemoteModel = m.Config.RemoteModel
			resp.RemoteHost = m.Config.RemoteHost

			data, err := json.Marshal(resp)
			if err != nil {
				return err
			}

			if _, err = c.Writer.Write(append(data, '\n')); err != nil {
				return err
			}
			c.Writer.Flush()
			return nil
		}

		client := api.NewClient(remoteURL, http.DefaultClient)
		err = client.Generate(c, &req, fn)
		if err != nil {
			var sErr api.AuthorizationError
			// On 401, hand the client our public key so it can register/authorize.
			if errors.As(err, &sErr) && sErr.StatusCode == http.StatusUnauthorized {
				pk, pkErr := auth.GetPublicKey()
				if pkErr != nil {
					slog.Error("couldn't get public key", "error", pkErr)
					c.JSON(http.StatusUnauthorized, gin.H{"error": "error getting public key"})
					return
				}
				c.JSON(http.StatusUnauthorized, gin.H{"public_key": pk})
				return
			}
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
			return
		}
		return
	}

	// expire the runner: an empty prompt with keep_alive=0 unloads the model.
	if req.Prompt == "" && req.KeepAlive != nil && req.KeepAlive.Duration == 0 {
		s.sched.expireRunner(m)

		c.JSON(http.StatusOK, api.GenerateResponse{
			Model:      req.Model,
			CreatedAt:  time.Now().UTC(),
			Response:   "",
			Done:       true,
			DoneReason: "unload",
		})
		return
	}

	if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
		return
	}

	useHarmony := shouldUseHarmony(m) && !req.Raw

	var harmonyMessageHandler *harmony.HarmonyMessageHandler
	var harmonyToolParser *harmony.HarmonyToolCallAccumulator
	if useHarmony {
		harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
		harmonyMessageHandler.HarmonyParser.AddImplicitStart()
		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
	}

	// Validate Think value: string values currently only allowed for gptoss models
	if req.Think != nil && req.Think.IsString() && !useHarmony {
		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
		return
	}

	caps := []model.Capability{model.CapabilityCompletion}
	if req.Suffix != "" {
		caps = append(caps, model.CapabilityInsert)
	}
	if req.Think != nil && req.Think.Bool() {
		caps = append(caps, model.CapabilityThinking)
		// TODO(drifkin): consider adding a warning if it's false and the model
		// doesn't support thinking. It's not strictly required, but it can be a
		// hint that the user is on an older qwen3/r1 model that doesn't have an
		// updated template supporting thinking
	}

	r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
	if errors.Is(err, errCapabilityCompletion) {
		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
		return
	} else if err != nil {
		handleScheduleError(c, req.Model, err)
		return
	}

	checkpointLoaded := time.Now()

	// load the model: an empty prompt (without the unload case above) just
	// warms the runner and returns immediately.
	if req.Prompt == "" {
		c.JSON(http.StatusOK, api.GenerateResponse{
			Model:      req.Model,
			CreatedAt:  time.Now().UTC(),
			Done:       true,
			DoneReason: "load",
		})
		return
	}

	if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
		return
	}

	images := make([]llm.ImageData, len(req.Images))
	for i := range req.Images {
		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
	}

	prompt := req.Prompt
	if !req.Raw {
		tmpl := m.Template
		if req.Template != "" {
			tmpl, err = template.Parse(req.Template)
			if err != nil {
				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
				return
			}
		}

		var values template.Values
		if req.Suffix != "" {
			// Insert (fill-in-the-middle) rendering uses prompt+suffix directly.
			values.Prompt = prompt
			values.Suffix = req.Suffix
		} else {
			// Otherwise build a message list: system prompt, embedded model
			// messages (unless a context is resuming a conversation), image
			// placeholders, then the user prompt.
			var msgs []api.Message
			if req.System != "" {
				msgs = append(msgs, api.Message{Role: "system", Content: req.System})
			} else if m.System != "" {
				msgs = append(msgs, api.Message{Role: "system", Content: m.System})
			}

			if req.Context == nil {
				msgs = append(msgs, m.Messages...)
			}

			for _, i := range images {
				imgPrompt := ""
				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
			}

			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
		}

		values.Think = req.Think != nil && req.Think.Bool()
		values.ThinkLevel = ""
		if req.Think != nil {
			values.ThinkLevel = req.Think.String()
		}
		values.IsThinkSet = req.Think != nil

		var b bytes.Buffer
		if req.Context != nil {
			slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")

			// Prepend the detokenized previous context to the rendered prompt.
			s, err := r.Detokenize(c.Request.Context(), req.Context)
			if err != nil {
				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
				return
			}
			b.WriteString(s)
		}

		if err := tmpl.Execute(&b, values); err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
			return
		}

		prompt = b.String()
	}

	// If debug mode is enabled, return the rendered template instead of calling the model
	if req.DebugRenderOnly {
		c.JSON(http.StatusOK, api.GenerateResponse{
			Model:     req.Model,
			CreatedAt: time.Now().UTC(),
			DebugInfo: &api.DebugInfo{
				RenderedTemplate: prompt,
				ImageCount:       len(images),
			},
		})
		return
	}

	var thinkingState *thinking.Parser
	if !useHarmony {
		openingTag, closingTag := thinking.InferTags(m.Template.Template)
		if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
			thinkingState = &thinking.Parser{
				OpeningTag: openingTag,
				ClosingTag: closingTag,
			}
			// If the rendered prompt already ends with the opening tag, treat
			// it as consumed so the parser starts inside the thinking block.
			if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
				thinkingState.AddContent(openingTag)
			}
		}
	}

	ch := make(chan any)
	go func() {
		// TODO (jmorganca): avoid building the response twice both here and below
		var sb strings.Builder
		defer close(ch)
		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
			Prompt:  prompt,
			Images:  images,
			Format:  req.Format,
			Options: opts,
		}, func(cr llm.CompletionResponse) {
			res := api.GenerateResponse{
				Model:     req.Model,
				CreatedAt: time.Now().UTC(),
				Response:  cr.Content,
				Done:      cr.Done,
				Metrics: api.Metrics{
					PromptEvalCount:    cr.PromptEvalCount,
					PromptEvalDuration: cr.PromptEvalDuration,
					EvalCount:          cr.EvalCount,
					EvalDuration:       cr.EvalDuration,
				},
			}

			if useHarmony {
				// NOTE(review): the local `thinking` below shadows the
				// imported thinking package within this closure.
				content, thinking, toolContent := harmonyMessageHandler.AddContent(cr.Content, harmonyToolParser)
				res.Response = content
				res.Thinking = thinking
				harmonyToolParser.Add(toolContent)
			} else if thinkingState != nil {
				thinking, content := thinkingState.AddContent(cr.Content)
				res.Thinking = thinking
				res.Response = content
			}

			// Accumulate raw content so the final context can be re-tokenized below.
			if _, err := sb.WriteString(cr.Content); err != nil {
				ch <- gin.H{"error": err.Error()}
			}

			if cr.Done {
				if useHarmony {
					toolName, toolContent := harmonyToolParser.Drain()
					if toolName != nil {
						*toolName = strings.TrimPrefix(*toolName, "functions.")
						var args api.ToolCallFunctionArguments
						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
							errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
							ch <- gin.H{"error": errStr}
							return
						}

						res.ToolCalls = append(res.ToolCalls, api.ToolCall{
							Function: api.ToolCallFunction{
								Name:      *toolName,
								Arguments: args,
							},
						})
					}
				}

				res.DoneReason = cr.DoneReason.String()
				res.TotalDuration = time.Since(checkpointStart)
				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)

				if !req.Raw {
					tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String())
					if err != nil {
						ch <- gin.H{"error": err.Error()}
						return
					}
					res.Context = tokens
				}
			}

			if useHarmony {
				// only send messages with meaningful content (empty messages confuse clients)
				if res.Response != "" || res.Thinking != "" || res.Done || len(res.ToolCalls) > 0 {
					ch <- res
				}
				return
			}

			ch <- res
		}); err != nil {
			ch <- gin.H{"error": err.Error()}
		}
	}()

	// Non-streaming: drain the channel, concatenating chunks into one response.
	if req.Stream != nil && !*req.Stream {
		var r api.GenerateResponse
		var sbThinking strings.Builder
		var sbContent strings.Builder
		for rr := range ch {
			switch t := rr.(type) {
			case api.GenerateResponse:
				sbThinking.WriteString(t.Thinking)
				sbContent.WriteString(t.Response)
				r = t
			case gin.H:
				msg, ok := t["error"].(string)
				if !ok {
					msg = "unexpected error format in response"
				}

				c.JSON(http.StatusInternalServerError, gin.H{"error": msg})
				return
			default:
				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
				return
			}
		}

		r.Thinking = sbThinking.String()
		r.Response = sbContent.String()

		c.JSON(http.StatusOK, r)
		return
	}

	streamResponse(c, ch)
}
2024-07-16 03:14:24 +08:00
func ( s * Server ) EmbedHandler ( c * gin . Context ) {
2024-07-31 04:12:21 +08:00
checkpointStart := time . Now ( )
2024-07-16 03:14:24 +08:00
var req api . EmbedRequest
err := c . ShouldBindJSON ( & req )
switch {
case errors . Is ( err , io . EOF ) :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
case err != nil :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
truncate := true
if req . Truncate != nil && ! * req . Truncate {
truncate = false
}
var input [ ] string
switch i := req . Input . ( type ) {
case string :
if len ( i ) > 0 {
input = append ( input , i )
}
case [ ] any :
for _ , v := range i {
if _ , ok := v . ( string ) ; ! ok {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "invalid input type" } )
return
}
input = append ( input , v . ( string ) )
}
default :
2024-08-14 01:19:56 +08:00
if req . Input != nil {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "invalid input type" } )
return
}
2024-07-16 03:14:24 +08:00
}
2024-12-12 07:29:59 +08:00
name , err := getExistingName ( model . ParseName ( req . Model ) )
if err != nil {
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model '%s' not found" , req . Model ) } )
return
}
2025-04-02 06:21:46 +08:00
r , m , opts , err := s . scheduleRunner ( c . Request . Context ( ) , name . String ( ) , [ ] model . Capability { } , req . Options , req . KeepAlive )
2024-07-16 03:14:24 +08:00
if err != nil {
handleScheduleError ( c , req . Model , err )
return
}
2024-07-31 04:12:21 +08:00
checkpointLoaded := time . Now ( )
2024-08-14 01:19:56 +08:00
if len ( input ) == 0 {
c . JSON ( http . StatusOK , api . EmbedResponse { Model : req . Model , Embeddings : [ ] [ ] float32 { } } )
return
}
2025-03-14 05:24:27 +08:00
kvData , _ , err := getModelData ( m . ModelPath , false )
2024-07-16 03:14:24 +08:00
if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
2024-08-12 02:57:10 +08:00
var count int
2024-07-16 03:14:24 +08:00
for i , s := range input {
tokens , err := r . Tokenize ( c . Request . Context ( ) , s )
if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
ctxLen := min ( opts . NumCtx , int ( kvData . ContextLength ( ) ) )
if len ( tokens ) > ctxLen {
if ! truncate {
2025-09-19 05:00:21 +08:00
c . JSON ( http . StatusBadRequest , gin . H { "error" : "input exceeds maximum context length" } )
2024-07-16 03:14:24 +08:00
return
}
2025-09-16 06:35:59 +08:00
if bos := kvData . Uint ( "tokenizer.ggml.bos_token_id" ) ; tokens [ 0 ] != int ( bos ) && kvData . Bool ( "add_bos_token" , true ) {
ctxLen --
}
if eos := kvData . Uint ( "tokenizer.ggml.eos_token_id" ) ; tokens [ len ( tokens ) - 1 ] != int ( eos ) && kvData . Bool ( "add_eos_token" , true ) {
ctxLen --
}
2025-09-19 05:00:21 +08:00
slog . Info ( "" , "ctxLen" , ctxLen , "tokenCount" , len ( tokens ) )
if ctxLen <= 0 {
// return error if the truncated input would be empty or just special tokens
c . JSON ( http . StatusBadRequest , gin . H { "error" : "input after truncation exceeds maximum context length" } )
return
}
2024-07-16 03:14:24 +08:00
tokens = tokens [ : ctxLen ]
2025-09-16 06:35:59 +08:00
2024-07-16 03:14:24 +08:00
s , err = r . Detokenize ( c . Request . Context ( ) , tokens )
if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
}
2024-08-12 02:57:10 +08:00
count += len ( tokens )
2024-07-16 03:14:24 +08:00
input [ i ] = s
}
2024-08-12 02:57:10 +08:00
var g errgroup . Group
embeddings := make ( [ ] [ ] float32 , len ( input ) )
for i , text := range input {
g . Go ( func ( ) error {
embedding , err := r . Embedding ( c . Request . Context ( ) , text )
if err != nil {
return err
}
2025-09-12 01:36:10 +08:00
// TODO: this first normalization should be done by the model
embedding = normalize ( embedding )
if req . Dimensions > 0 && req . Dimensions < len ( embedding ) {
embedding = normalize ( embedding [ : req . Dimensions ] )
}
embeddings [ i ] = embedding
2024-08-12 02:57:10 +08:00
return nil
} )
2024-07-16 03:14:24 +08:00
}
2024-08-12 02:57:10 +08:00
if err := g . Wait ( ) ; err != nil {
2025-03-14 02:22:19 +08:00
c . AbortWithStatusJSON ( http . StatusInternalServerError , gin . H { "error" : strings . TrimSpace ( err . Error ( ) ) } )
2024-08-12 02:57:10 +08:00
return
2024-07-16 03:14:24 +08:00
}
resp := api . EmbedResponse {
2024-07-31 04:12:21 +08:00
Model : req . Model ,
2024-08-12 02:57:10 +08:00
Embeddings : embeddings ,
2024-07-31 04:12:21 +08:00
TotalDuration : time . Since ( checkpointStart ) ,
LoadDuration : checkpointLoaded . Sub ( checkpointStart ) ,
2024-08-12 02:57:10 +08:00
PromptEvalCount : count ,
2024-07-16 03:14:24 +08:00
}
c . JSON ( http . StatusOK , resp )
}
func normalize ( vec [ ] float32 ) [ ] float32 {
var sum float32
for _ , v := range vec {
sum += v * v
}
2025-09-12 01:36:10 +08:00
norm := float32 ( 1.0 / max ( math . Sqrt ( float64 ( sum ) ) , 1e-12 ) )
2024-07-16 03:14:24 +08:00
for i := range vec {
vec [ i ] *= norm
}
return vec
}
2024-03-31 00:50:05 +08:00
func ( s * Server ) EmbeddingsHandler ( c * gin . Context ) {
2023-12-05 07:01:06 +08:00
var req api . EmbeddingRequest
2024-06-18 01:38:55 +08:00
if err := c . ShouldBindJSON ( & req ) ; errors . Is ( err , io . EOF ) {
2023-12-05 07:01:06 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
2024-06-18 01:38:55 +08:00
} else if err != nil {
2023-12-05 07:01:06 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
2024-12-12 07:29:59 +08:00
name := model . ParseName ( req . Model )
if ! name . IsValid ( ) {
c . JSON ( http . StatusBadRequest , gin . H { "error" : "model is required" } )
return
}
2025-04-02 06:21:46 +08:00
r , _ , _ , err := s . scheduleRunner ( c . Request . Context ( ) , name . String ( ) , [ ] model . Capability { } , req . Options , req . KeepAlive )
2023-12-05 07:01:06 +08:00
if err != nil {
2024-06-21 02:00:08 +08:00
handleScheduleError ( c , req . Model , err )
2023-12-05 07:01:06 +08:00
return
}
2024-03-01 09:40:56 +08:00
// an empty request loads the model
if req . Prompt == "" {
c . JSON ( http . StatusOK , api . EmbeddingResponse { Embedding : [ ] float64 { } } )
2023-08-09 03:13:22 +08:00
return
}
2024-08-12 02:57:10 +08:00
embedding , err := r . Embedding ( c . Request . Context ( ) , req . Prompt )
2023-08-09 03:13:22 +08:00
if err != nil {
2025-03-14 02:22:19 +08:00
c . AbortWithStatusJSON ( http . StatusInternalServerError , gin . H { "error" : strings . TrimSpace ( err . Error ( ) ) } )
2023-08-09 03:13:22 +08:00
return
}
2024-08-12 02:57:10 +08:00
var e [ ] float64
for _ , v := range embedding {
e = append ( e , float64 ( v ) )
2024-07-16 03:14:24 +08:00
}
resp := api . EmbeddingResponse {
2024-08-12 02:57:10 +08:00
Embedding : e ,
2024-07-16 03:14:24 +08:00
}
c . JSON ( http . StatusOK , resp )
2023-08-09 03:13:22 +08:00
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) PullHandler ( c * gin . Context ) {
2023-07-12 02:54:22 +08:00
var req api . PullRequest
2023-10-19 07:08:42 +08:00
err := c . ShouldBindJSON ( & req )
switch {
case errors . Is ( err , io . EOF ) :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
case err != nil :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2023-07-12 02:54:22 +08:00
return
}
2024-05-14 06:27:51 +08:00
name := model . ParseName ( cmp . Or ( req . Model , req . Name ) )
if ! name . IsValid ( ) {
2024-12-23 23:38:34 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : errtypes . InvalidModelNameErrMsg } )
2024-05-14 06:27:51 +08:00
return
}
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
name , err = getExistingName ( name )
if err != nil {
2024-05-14 06:27:51 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2023-10-19 06:56:34 +08:00
return
}
2023-07-17 08:02:22 +08:00
ch := make ( chan any )
go func ( ) {
defer close ( ch )
2023-07-19 09:51:30 +08:00
fn := func ( r api . ProgressResponse ) {
ch <- r
2023-07-17 08:02:22 +08:00
}
2023-07-19 09:51:30 +08:00
2024-02-15 03:29:49 +08:00
regOpts := & registryOptions {
2023-07-22 06:42:19 +08:00
Insecure : req . Insecure ,
}
2023-07-26 05:08:51 +08:00
ctx , cancel := context . WithCancel ( c . Request . Context ( ) )
defer cancel ( )
2024-05-14 06:27:51 +08:00
if err := PullModel ( ctx , name . DisplayShortest ( ) , regOpts , fn ) ; err != nil {
2023-07-21 03:12:08 +08:00
ch <- gin . H { "error" : err . Error ( ) }
2023-07-17 08:02:22 +08:00
}
} ( )
2023-10-12 00:54:27 +08:00
if req . Stream != nil && ! * req . Stream {
waitForStream ( c , ch )
return
}
2023-07-17 08:02:22 +08:00
streamResponse ( c , ch )
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) PushHandler ( c * gin . Context ) {
2023-07-17 08:02:22 +08:00
var req api . PushRequest
2023-10-19 07:08:42 +08:00
err := c . ShouldBindJSON ( & req )
switch {
case errors . Is ( err , io . EOF ) :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
case err != nil :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2023-07-12 02:54:22 +08:00
return
}
2023-07-07 01:40:11 +08:00
2024-12-12 07:29:59 +08:00
var mname string
2024-01-12 06:07:54 +08:00
if req . Model != "" {
2024-12-12 07:29:59 +08:00
mname = req . Model
2024-01-12 06:07:54 +08:00
} else if req . Name != "" {
2024-12-12 07:29:59 +08:00
mname = req . Name
2024-01-12 06:07:54 +08:00
} else {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "model is required" } )
2023-10-19 06:56:34 +08:00
return
}
2023-07-17 08:02:22 +08:00
ch := make ( chan any )
go func ( ) {
defer close ( ch )
2023-07-19 09:51:30 +08:00
fn := func ( r api . ProgressResponse ) {
ch <- r
2023-07-17 08:02:22 +08:00
}
2023-07-19 09:51:30 +08:00
2024-02-15 03:29:49 +08:00
regOpts := & registryOptions {
2023-07-22 06:42:19 +08:00
Insecure : req . Insecure ,
}
2023-10-10 01:24:27 +08:00
ctx , cancel := context . WithCancel ( c . Request . Context ( ) )
defer cancel ( )
2024-12-12 07:29:59 +08:00
name , err := getExistingName ( model . ParseName ( mname ) )
if err != nil {
ch <- gin . H { "error" : err . Error ( ) }
return
}
if err := PushModel ( ctx , name . DisplayShortest ( ) , regOpts , fn ) ; err != nil {
2023-07-21 03:12:08 +08:00
ch <- gin . H { "error" : err . Error ( ) }
2023-07-17 08:02:22 +08:00
}
} ( )
2023-10-12 00:54:27 +08:00
if req . Stream != nil && ! * req . Stream {
waitForStream ( c , ch )
return
}
2023-07-17 08:02:22 +08:00
streamResponse ( c , ch )
}
2024-12-12 07:29:59 +08:00
// getExistingName searches the models directory for the longest prefix match of
// the input name and returns the input name with all existing parts replaced
// with each part found. If no parts are found, the input name is returned as
// is.
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
func getExistingName ( n model . Name ) ( model . Name , error ) {
var zero model . Name
existing , err := Manifests ( true )
2024-05-14 06:27:51 +08:00
if err != nil {
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
return zero , err
2024-05-14 06:27:51 +08:00
}
2024-12-12 07:29:59 +08:00
var set model . Name // tracks parts already canonicalized
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
for e := range existing {
2024-12-12 07:29:59 +08:00
if set . Host == "" && strings . EqualFold ( e . Host , n . Host ) {
n . Host = e . Host
}
if set . Namespace == "" && strings . EqualFold ( e . Namespace , n . Namespace ) {
n . Namespace = e . Namespace
}
if set . Model == "" && strings . EqualFold ( e . Model , n . Model ) {
n . Model = e . Model
}
if set . Tag == "" && strings . EqualFold ( e . Tag , n . Tag ) {
n . Tag = e . Tag
2024-05-14 06:27:51 +08:00
}
}
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
return n , nil
2024-05-14 06:27:51 +08:00
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) DeleteHandler ( c * gin . Context ) {
2024-04-18 08:23:19 +08:00
var r api . DeleteRequest
if err := c . ShouldBindJSON ( & r ) ; errors . Is ( err , io . EOF ) {
2023-10-19 07:08:42 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
2024-04-18 08:23:19 +08:00
} else if err != nil {
2023-10-19 07:08:42 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2023-07-21 07:09:23 +08:00
return
}
2024-04-18 08:23:19 +08:00
n := model . ParseName ( cmp . Or ( r . Model , r . Name ) )
if ! n . IsValid ( ) {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : fmt . Sprintf ( "name %q is invalid" , cmp . Or ( r . Model , r . Name ) ) } )
2023-07-22 14:02:12 +08:00
return
}
2023-09-27 08:28:14 +08:00
2024-12-12 07:29:59 +08:00
n , err := getExistingName ( n )
if err != nil {
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model '%s' not found" , cmp . Or ( r . Model , r . Name ) ) } )
return
}
2024-04-18 08:23:19 +08:00
m , err := ParseNamedManifest ( n )
2023-09-27 08:28:14 +08:00
if err != nil {
2024-10-02 06:45:43 +08:00
switch {
case os . IsNotExist ( err ) :
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model '%s' not found" , cmp . Or ( r . Model , r . Name ) ) } )
default :
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
}
2023-09-27 08:28:14 +08:00
return
}
2024-04-18 08:23:19 +08:00
if err := m . Remove ( ) ; err != nil {
2023-09-27 08:28:14 +08:00
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
2024-05-09 05:36:08 +08:00
if err := m . RemoveLayers ( ) ; err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
2023-07-21 07:09:23 +08:00
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) ShowHandler ( c * gin . Context ) {
2023-09-07 02:04:17 +08:00
var req api . ShowRequest
2023-10-19 07:08:42 +08:00
err := c . ShouldBindJSON ( & req )
switch {
case errors . Is ( err , io . EOF ) :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
case err != nil :
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2023-09-07 02:04:17 +08:00
return
}
2024-01-12 06:07:54 +08:00
if req . Model != "" {
2024-01-19 07:36:50 +08:00
// noop
2024-01-12 06:07:54 +08:00
} else if req . Name != "" {
2024-01-19 07:36:50 +08:00
req . Model = req . Name
2024-01-12 06:07:54 +08:00
} else {
2024-01-05 09:23:11 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "model is required" } )
2023-10-19 06:56:34 +08:00
return
}
2024-01-05 09:23:11 +08:00
resp , err := GetModelInfo ( req )
2023-09-07 02:04:17 +08:00
if err != nil {
2024-06-16 11:53:56 +08:00
switch {
case os . IsNotExist ( err ) :
2024-01-19 07:36:50 +08:00
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model '%s' not found" , req . Model ) } )
2024-12-23 23:38:34 +08:00
case err . Error ( ) == errtypes . InvalidModelNameErrMsg :
2024-06-16 11:53:56 +08:00
c . JSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
default :
2023-09-07 02:04:17 +08:00
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
}
return
}
c . JSON ( http . StatusOK , resp )
}
2024-01-05 09:23:11 +08:00
// GetModelInfo assembles the api.ShowResponse for a model: details,
// license, template, messages, capabilities, the rendered Modelfile,
// and — for local models only — key/value and tensor metadata read
// from the model file.
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
	name := model.ParseName(req.Model)
	if !name.IsValid() {
		return nil, ErrModelPathInvalid
	}
	// Canonicalize casing against the installed manifests.
	name, err := getExistingName(name)
	if err != nil {
		return nil, err
	}
	m, err := GetModel(name.String())
	if err != nil {
		return nil, err
	}

	modelDetails := api.ModelDetails{
		ParentModel:       m.ParentModel,
		Format:            m.Config.ModelFormat,
		Family:            m.Config.ModelFamily,
		Families:          m.Config.ModelFamilies,
		ParameterSize:     m.Config.ModelType,
		QuantizationLevel: m.Config.FileType,
	}

	// A request-supplied system prompt overrides the model's own for the
	// remainder of this function (it also appears in the Modelfile below).
	if req.System != "" {
		m.System = req.System
	}

	msgs := make([]api.Message, len(m.Messages))
	for i, msg := range m.Messages {
		msgs[i] = api.Message{Role: msg.Role, Content: msg.Content}
	}

	manifest, err := ParseNamedManifest(name)
	if err != nil {
		return nil, err
	}

	resp := &api.ShowResponse{
		License:      strings.Join(m.License, "\n"),
		System:       m.System,
		Template:     m.Template.String(),
		Details:      modelDetails,
		Messages:     msgs,
		Capabilities: m.Capabilities(),
		ModifiedAt:   manifest.fi.ModTime(),
	}

	// For remote models, surface the remote location plus whatever
	// architecture metadata the local config carries.
	if m.Config.RemoteHost != "" {
		resp.RemoteHost = m.Config.RemoteHost
		resp.RemoteModel = m.Config.RemoteModel
		if m.Config.ModelFamily != "" {
			resp.ModelInfo = make(map[string]any)
			resp.ModelInfo["general.architecture"] = m.Config.ModelFamily
			if m.Config.BaseName != "" {
				resp.ModelInfo["general.basename"] = m.Config.BaseName
			}
			if m.Config.ContextLen > 0 {
				resp.ModelInfo[fmt.Sprintf("%s.context_length", m.Config.ModelFamily)] = m.Config.ContextLen
			}
			if m.Config.EmbedLen > 0 {
				resp.ModelInfo[fmt.Sprintf("%s.embedding_length", m.Config.ModelFamily)] = m.Config.EmbedLen
			}
		}
	}

	// Render options as "name  value" lines; slice-valued options get
	// one line per element.
	var params []string
	cs := 30 // column width for the option name
	for k, v := range m.Options {
		switch val := v.(type) {
		case []any:
			for _, nv := range val {
				params = append(params, fmt.Sprintf("%-*s %#v", cs, k, nv))
			}
		default:
			params = append(params, fmt.Sprintf("%-*s %#v", cs, k, v))
		}
	}
	resp.Parameters = strings.Join(params, "\n")

	// NOTE(review): request options are merged in after Parameters is
	// built, so they influence the Modelfile rendered below but not
	// resp.Parameters — presumably intentional; confirm before changing.
	if len(req.Options) > 0 {
		if m.Options == nil {
			m.Options = make(map[string]any)
		}
		for k, v := range req.Options {
			m.Options[k] = v
		}
	}

	// Reconstruct a Modelfile equivalent to this model's configuration.
	var sb strings.Builder
	fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
	fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
	fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName)
	fmt.Fprint(&sb, m.String())
	resp.Modelfile = sb.String()

	// skip loading tensor information if this is a remote model
	if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
		return resp, nil
	}

	kvData, tensors, err := getModelData(m.ModelPath, req.Verbose)
	if err != nil {
		return nil, err
	}

	// Drop fields that are reported elsewhere in the response.
	delete(kvData, "general.name")
	delete(kvData, "tokenizer.chat_template")
	resp.ModelInfo = kvData

	tensorData := make([]api.Tensor, len(tensors.Items()))
	for cnt, t := range tensors.Items() {
		tensorData[cnt] = api.Tensor{Name: t.Name, Type: t.Type(), Shape: t.Shape}
	}
	resp.Tensors = tensorData

	// A multimodal model's first projector file contributes its own info.
	if len(m.ProjectorPaths) > 0 {
		projectorData, _, err := getModelData(m.ProjectorPaths[0], req.Verbose)
		if err != nil {
			return nil, err
		}
		resp.ProjectorInfo = projectorData
	}

	return resp, nil
}
2025-03-14 05:24:27 +08:00
func getModelData ( digest string , verbose bool ) ( ggml . KV , ggml . Tensors , error ) {
2024-06-25 12:47:52 +08:00
maxArraySize := 0
if verbose {
maxArraySize = - 1
}
2025-03-14 05:24:27 +08:00
data , err := llm . LoadModel ( digest , maxArraySize )
2024-06-20 05:19:02 +08:00
if err != nil {
2025-03-14 05:24:27 +08:00
return nil , ggml . Tensors { } , err
2024-06-20 05:19:02 +08:00
}
2025-03-14 05:24:27 +08:00
kv := data . KV ( )
2024-06-20 05:19:02 +08:00
if ! verbose {
for k := range kv {
if t , ok := kv [ k ] . ( [ ] any ) ; len ( t ) > 5 && ok {
kv [ k ] = [ ] any { }
}
}
}
2025-03-14 05:24:27 +08:00
return kv , data . Tensors ( ) , nil
2024-06-20 05:19:02 +08:00
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) ListHandler ( c * gin . Context ) {
2024-11-06 06:21:45 +08:00
ms , err := Manifests ( true )
2023-07-19 00:09:45 +08:00
if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
2023-08-31 02:14:12 +08:00
2024-06-07 01:11:45 +08:00
models := [ ] api . ListModelResponse { }
2024-05-07 07:34:13 +08:00
for n , m := range ms {
var cf ConfigV2
2024-08-06 08:13:52 +08:00
if m . Config . Digest != "" {
f , err := m . Config . Open ( )
if err != nil {
slog . Warn ( "bad manifest filepath" , "name" , n , "error" , err )
continue
}
defer f . Close ( )
if err := json . NewDecoder ( f ) . Decode ( & cf ) ; err != nil {
slog . Warn ( "bad manifest config" , "name" , n , "error" , err )
continue
}
2023-07-19 00:09:45 +08:00
}
2023-08-31 02:14:12 +08:00
2025-06-07 11:29:14 +08:00
// tag should never be masked
models = append ( models , api . ListModelResponse {
2025-09-18 05:40:53 +08:00
Model : n . DisplayShortest ( ) ,
Name : n . DisplayShortest ( ) ,
RemoteModel : cf . RemoteModel ,
RemoteHost : cf . RemoteHost ,
Size : m . Size ( ) ,
Digest : m . digest ,
ModifiedAt : m . fi . ModTime ( ) ,
2024-05-07 07:34:13 +08:00
Details : api . ModelDetails {
Format : cf . ModelFormat ,
Family : cf . ModelFamily ,
Families : cf . ModelFamilies ,
ParameterSize : cf . ModelType ,
QuantizationLevel : cf . FileType ,
} ,
2025-06-07 11:29:14 +08:00
} )
2023-07-19 00:09:45 +08:00
}
2024-06-07 01:11:45 +08:00
slices . SortStableFunc ( models , func ( i , j api . ListModelResponse ) int {
2024-04-18 05:54:14 +08:00
// most recently modified first
return cmp . Compare ( j . ModifiedAt . Unix ( ) , i . ModifiedAt . Unix ( ) )
} )
2023-07-20 06:00:28 +08:00
c . JSON ( http . StatusOK , api . ListResponse { Models : models } )
2023-07-19 00:09:45 +08:00
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) CopyHandler ( c * gin . Context ) {
2024-04-17 07:22:38 +08:00
var r api . CopyRequest
if err := c . ShouldBindJSON ( & r ) ; errors . Is ( err , io . EOF ) {
2023-10-19 07:08:42 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
2024-04-17 07:22:38 +08:00
} else if err != nil {
2023-10-19 07:08:42 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2023-07-24 23:27:28 +08:00
return
}
2024-04-17 07:22:38 +08:00
src := model . ParseName ( r . Source )
if ! src . IsValid ( ) {
2024-05-02 03:39:05 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : fmt . Sprintf ( "source %q is invalid" , r . Source ) } )
return
2023-10-19 06:56:34 +08:00
}
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
src , err := getExistingName ( src )
if err != nil {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
2023-10-19 06:56:34 +08:00
2024-04-17 07:22:38 +08:00
dst := model . ParseName ( r . Destination )
if ! dst . IsValid ( ) {
2024-05-08 08:35:52 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : fmt . Sprintf ( "destination %q is invalid" , r . Destination ) } )
2023-07-24 23:27:28 +08:00
return
}
server: allow mixed-case model names on push, pull, cp, and create (#7676)
This change allows for mixed-case model names to be pushed, pulled,
copied, and created, which was previously disallowed because the Ollama
registry was backed by a Docker registry that enforced a naming
convention that disallowed mixed-case names, which is no longer the
case.
This does not break existing, intended, behaviors.
Also, make TestCase test a story of creating, updating, pulling, and
copying a model with case variations, ensuring the model's manifest is
updated correctly, and not duplicated across different files with
different case variations.
2024-11-20 07:05:57 +08:00
dst , err = getExistingName ( dst )
if err != nil {
2024-05-14 06:27:51 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
2024-04-17 07:22:38 +08:00
if err := CopyModel ( src , dst ) ; errors . Is ( err , os . ErrNotExist ) {
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model %q not found" , r . Source ) } )
} else if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
}
2023-07-24 23:27:28 +08:00
}
2024-03-31 00:50:05 +08:00
func ( s * Server ) HeadBlobHandler ( c * gin . Context ) {
2023-11-15 06:07:40 +08:00
path , err := GetBlobsPath ( c . Param ( "digest" ) )
if err != nil {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
if _ , err := os . Stat ( path ) ; err != nil {
c . AbortWithStatusJSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "blob %q not found" , c . Param ( "digest" ) ) } )
return
}
2023-11-16 05:55:37 +08:00
c . Status ( http . StatusOK )
2023-11-15 06:07:40 +08:00
}
2024-03-31 00:50:05 +08:00
func ( s * Server ) CreateBlobHandler ( c * gin . Context ) {
2024-05-21 05:58:27 +08:00
if ib , ok := intermediateBlobs [ c . Param ( "digest" ) ] ; ok {
p , err := GetBlobsPath ( ib )
2024-05-11 06:48:41 +08:00
if err != nil {
c . AbortWithStatusJSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
if _ , err := os . Stat ( p ) ; errors . Is ( err , os . ErrNotExist ) {
2024-05-21 05:58:27 +08:00
slog . Info ( "evicting intermediate blob which no longer exists" , "digest" , ib )
delete ( intermediateBlobs , c . Param ( "digest" ) )
2024-05-11 06:48:41 +08:00
} else if err != nil {
c . AbortWithStatusJSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
} else {
c . Status ( http . StatusOK )
return
}
}
2024-04-06 00:30:09 +08:00
path , err := GetBlobsPath ( c . Param ( "digest" ) )
if err != nil {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
_ , err = os . Stat ( path )
switch {
case errors . Is ( err , os . ErrNotExist ) :
// noop
case err != nil :
c . AbortWithStatusJSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
default :
c . Status ( http . StatusOK )
return
}
2023-11-25 04:01:23 +08:00
layer , err := NewLayer ( c . Request . Body , "" )
2023-11-18 07:21:57 +08:00
if err != nil {
c . AbortWithStatusJSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
2023-11-25 04:01:23 +08:00
if layer . Digest != c . Param ( "digest" ) {
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : fmt . Sprintf ( "digest mismatch, expected %q, got %q" , c . Param ( "digest" ) , layer . Digest ) } )
2023-11-15 06:07:40 +08:00
return
}
2023-11-16 05:55:37 +08:00
c . Status ( http . StatusCreated )
2023-11-15 06:07:40 +08:00
}
2024-03-09 16:22:08 +08:00
// isLocalIP reports whether ip is assigned to one of this machine's
// network interfaces.
func isLocalIP(ip netip.Addr) bool {
	interfaces, err := net.Interfaces()
	if err != nil {
		return false
	}

	want := ip.String()
	for _, iface := range interfaces {
		addrs, err := iface.Addrs()
		if err != nil {
			continue
		}

		for _, a := range addrs {
			// Interface addresses are typically in CIDR form
			// ("192.168.0.2/24"); compare only the address portion.
			parsed, _, err := net.ParseCIDR(a.String())
			if err != nil {
				continue
			}
			if parsed.String() == want {
				return true
			}
		}
	}

	return false
}
2024-03-09 14:23:47 +08:00
// allowedHost reports whether an HTTP Host header value refers to this
// machine: empty, "localhost", the machine's own hostname, or any name
// under a local-only TLD (.localhost, .local, .internal). Matching is
// case-insensitive.
func allowedHost(host string) bool {
	host = strings.ToLower(host)

	switch host {
	case "", "localhost":
		return true
	}

	if hostname, err := os.Hostname(); err == nil && host == strings.ToLower(hostname) {
		return true
	}

	// check if the host is a local TLD
	for _, tld := range []string{"localhost", "local", "internal"} {
		if strings.HasSuffix(host, "."+tld) {
			return true
		}
	}

	return false
}
2024-03-09 14:23:47 +08:00
2024-03-09 15:23:59 +08:00
func allowedHostsMiddleware ( addr net . Addr ) gin . HandlerFunc {
return func ( c * gin . Context ) {
if addr == nil {
2024-03-09 14:23:47 +08:00
c . Next ( )
return
}
2024-03-09 16:22:08 +08:00
if addr , err := netip . ParseAddrPort ( addr . String ( ) ) ; err == nil && ! addr . Addr ( ) . IsLoopback ( ) {
2024-03-09 14:23:47 +08:00
c . Next ( )
return
}
host , _ , err := net . SplitHostPort ( c . Request . Host )
if err != nil {
host = c . Request . Host
}
2024-03-09 15:23:59 +08:00
if addr , err := netip . ParseAddr ( host ) ; err == nil {
2024-03-09 16:22:08 +08:00
if addr . IsLoopback ( ) || addr . IsPrivate ( ) || addr . IsUnspecified ( ) || isLocalIP ( addr ) {
2024-03-09 15:23:59 +08:00
c . Next ( )
return
}
}
2024-03-09 14:23:47 +08:00
if allowedHost ( host ) {
2024-05-22 13:21:04 +08:00
if c . Request . Method == http . MethodOptions {
2024-05-09 04:14:00 +08:00
c . AbortWithStatus ( http . StatusNoContent )
return
}
2024-03-09 14:23:47 +08:00
c . Next ( )
return
}
c . AbortWithStatus ( http . StatusForbidden )
}
2023-12-15 08:47:40 +08:00
}
2023-10-30 23:10:18 +08:00
2025-03-03 12:55:44 +08:00
// GenerateRoutes builds the HTTP handler for the server: CORS and
// allowed-host middleware, the native /api endpoints, and the OpenAI
// compatibility endpoints. When rc is non-nil, the router is wrapped by
// the experimental registry implementation (feature gate "client2"),
// which handles the new pull/delete paths itself and falls back to the
// gin router for everything else.
func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
	corsConfig := cors.DefaultConfig()
	corsConfig.AllowWildcard = true
	corsConfig.AllowBrowserExtensions = true
	corsConfig.AllowHeaders = []string{
		"Authorization",
		"Content-Type",
		"User-Agent",
		"Accept",
		"X-Requested-With",

		// OpenAI compatibility headers
		"OpenAI-Beta",
		"x-stainless-arch",
		"x-stainless-async",
		"x-stainless-custom-poll-interval",
		"x-stainless-helper-method",
		"x-stainless-lang",
		"x-stainless-os",
		"x-stainless-package-version",
		"x-stainless-poll-helper",
		"x-stainless-retry-count",
		"x-stainless-runtime",
		"x-stainless-runtime-version",
		"x-stainless-timeout",
	}
	corsConfig.AllowOrigins = envconfig.AllowedOrigins()

	r := gin.Default()
	// Report 405 for known paths with the wrong method instead of 404.
	r.HandleMethodNotAllowed = true
	r.Use(
		cors.New(corsConfig),
		allowedHostsMiddleware(s.addr),
	)

	// General
	r.HEAD("/", func(c *gin.Context) { c.String(http.StatusOK, "Ollama is running") })
	r.GET("/", func(c *gin.Context) { c.String(http.StatusOK, "Ollama is running") })
	r.HEAD("/api/version", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"version": version.Version}) })
	r.GET("/api/version", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"version": version.Version}) })

	// Local model cache management (new implementation is at end of function)
	r.POST("/api/pull", s.PullHandler)
	r.POST("/api/push", s.PushHandler)
	r.HEAD("/api/tags", s.ListHandler)
	r.GET("/api/tags", s.ListHandler)
	r.POST("/api/show", s.ShowHandler)
	r.DELETE("/api/delete", s.DeleteHandler)

	// Account/key management (proxied to ollama.com by the handlers)
	r.DELETE("/api/user/keys/:encodedKey", s.SignoutHandler)
	r.POST("/api/me", s.WhoamiHandler)

	// Create
	r.POST("/api/create", s.CreateHandler)
	r.POST("/api/blobs/:digest", s.CreateBlobHandler)
	r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
	r.POST("/api/copy", s.CopyHandler)

	// Inference
	r.GET("/api/ps", s.PsHandler)
	r.POST("/api/generate", s.GenerateHandler)
	r.POST("/api/chat", s.ChatHandler)
	r.POST("/api/embed", s.EmbedHandler)
	r.POST("/api/embeddings", s.EmbeddingsHandler)

	// Inference (OpenAI compatibility)
	r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler)
	r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler)
	r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler)
	r.GET("/v1/models", openai.ListMiddleware(), s.ListHandler)
	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowHandler)

	if rc != nil {
		// wrap old with new
		rs := &registry.Local{
			Client:   rc,
			Logger:   slog.Default(), // TODO(bmizerany): Take a logger, do not use slog.Default()
			Fallback: r,

			Prune: PruneLayers,
		}
		return rs, nil
	}

	return r, nil
}
// Serve runs the Ollama HTTP server on ln until SIGINT/SIGTERM is
// received. Startup order matters: logging is configured first, the
// local blob/manifest store is repaired and optionally pruned, routes
// are installed, the scheduler is started, and only then does the HTTP
// server begin accepting connections. The signal handler coordinates a
// graceful shutdown (close server, stop scheduler, unload runners).
func Serve(ln net.Listener) error {
	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
	slog.Info("server config", "env", envconfig.Values())

	blobsDir, err := GetBlobsPath("")
	if err != nil {
		return err
	}
	// Repair any blob files with malformed names before anything reads them.
	if err := fixBlobs(blobsDir); err != nil {
		return err
	}

	if !envconfig.NoPrune() {
		// Pruning with corrupt manifests could delete layers that are
		// still referenced, so skip pruning entirely in that case.
		if _, err := Manifests(false); err != nil {
			slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
		} else {
			// clean up unused layers and manifests
			if err := PruneLayers(); err != nil {
				return err
			}

			manifestsPath, err := GetManifestPath()
			if err != nil {
				return err
			}

			if err := PruneDirectory(manifestsPath); err != nil {
				return err
			}
		}
	}

	s := &Server{addr: ln.Addr()}

	// The experimental registry client is only constructed when the
	// "client2" feature gate is enabled.
	var rc *ollama.Registry
	if useClient2 {
		var err error
		rc, err = ollama.DefaultRegistry()
		if err != nil {
			return err
		}
	}

	h, err := s.GenerateRoutes(rc)
	if err != nil {
		return err
	}

	http.Handle("/", h)

	ctx, done := context.WithCancel(context.Background())
	schedCtx, schedDone := context.WithCancel(ctx)
	sched := InitScheduler(schedCtx)
	s.sched = sched

	slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))

	srvr := &http.Server{
		// Use http.DefaultServeMux so we get net/http/pprof for
		// free.
		//
		// TODO(bmizerany): Decide if we want to make this
		// configurable so it is not exposed by default, or allow
		// users to bind it to a different port. This was a quick
		// and easy way to get pprof, but it may not be the best
		// way.
		Handler: nil,
	}

	// listen for a ctrl+c and stop any loaded llm
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-signals
		// Stop accepting new requests, then tear down the scheduler and
		// its runners, and finally cancel ctx to release Serve below.
		srvr.Close()
		schedDone()
		sched.unloadAllRunners()
		done()
	}()

	s.sched.Run(schedCtx)

	// register the experimental webp decoder
	// so webp images can be used in multimodal inputs
	image.RegisterFormat("webp", "RIFF????WEBP", webp.Decode, webp.DecodeConfig)

	// At startup we retrieve GPU information so we can get log messages before loading a model
	// This will log warnings to the log in case we have problems with detected GPUs
	gpus := discover.GetGPUInfo()
	gpus.LogDetails()

	// Sum usable VRAM across GPUs (minus the configured per-GPU overhead)
	// to decide whether to run in low-VRAM mode.
	var totalVRAM uint64
	for _, gpu := range gpus {
		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
	}
	if totalVRAM < lowVRAMThreshold {
		s.lowVRAM = true
		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
	}

	err = srvr.Serve(ln)
	// If server is closed from the signal handler, wait for the ctx to be done
	// otherwise error out quickly
	if !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	<-ctx.Done()

	return nil
}
2023-07-07 01:40:11 +08:00
2025-04-03 00:44:27 +08:00
func waitForStream ( c * gin . Context , ch chan any ) {
2023-10-12 00:54:27 +08:00
c . Header ( "Content-Type" , "application/json" )
2025-05-09 04:17:30 +08:00
var latest api . ProgressResponse
2023-10-12 00:54:27 +08:00
for resp := range ch {
switch r := resp . ( type ) {
case api . ProgressResponse :
2025-05-09 04:17:30 +08:00
latest = r
2023-10-12 00:54:27 +08:00
case gin . H :
2024-07-20 06:24:29 +08:00
status , ok := r [ "status" ] . ( int )
if ! ok {
status = http . StatusInternalServerError
}
2025-05-09 04:17:30 +08:00
errorMsg , ok := r [ "error" ] . ( string )
if ! ok {
errorMsg = "unknown error"
2023-10-12 00:54:27 +08:00
}
2025-05-09 04:17:30 +08:00
c . JSON ( status , gin . H { "error" : errorMsg } )
return
2023-10-12 00:54:27 +08:00
default :
2025-05-09 04:17:30 +08:00
c . JSON ( http . StatusInternalServerError , gin . H { "error" : "unknown message type" } )
2023-10-12 00:54:27 +08:00
return
}
}
2025-05-09 04:17:30 +08:00
c . JSON ( http . StatusOK , latest )
2023-10-12 00:54:27 +08:00
}
2023-07-15 05:15:53 +08:00
func streamResponse ( c * gin . Context , ch chan any ) {
2023-08-09 12:38:10 +08:00
c . Header ( "Content-Type" , "application/x-ndjson" )
2023-07-12 02:54:22 +08:00
c . Stream ( func ( w io . Writer ) bool {
val , ok := <- ch
if ! ok {
return false
}
bts , err := json . Marshal ( val )
if err != nil {
2024-01-19 02:52:01 +08:00
slog . Info ( fmt . Sprintf ( "streamResponse: json.Marshal failed with %s" , err ) )
2023-07-12 02:54:22 +08:00
return false
}
2023-09-30 12:45:52 +08:00
// Delineate chunks with new-line delimiter
2023-07-12 02:54:22 +08:00
bts = append ( bts , '\n' )
if _ , err := w . Write ( bts ) ; err != nil {
2024-01-19 02:52:01 +08:00
slog . Info ( fmt . Sprintf ( "streamResponse: w.Write failed with %s" , err ) )
2023-07-12 02:54:22 +08:00
return false
}
return true
} )
}
2023-12-06 03:57:33 +08:00
2025-09-18 05:40:53 +08:00
// WhoamiHandler reports the identity associated with this machine's key
// by querying ollama.com's whoami endpoint and relaying the result.
func (s *Server) WhoamiHandler(c *gin.Context) {
	// todo allow other hosts
	u, err := url.Parse("https://ollama.com")
	if err != nil {
		slog.Error(err.Error())
		c.JSON(http.StatusInternalServerError, gin.H{"error": "URL parse error"})
		return
	}

	client := api.NewClient(u, http.DefaultClient)
	user, err := client.Whoami(c)
	if err != nil {
		// Previously the error was only logged and a zero-value user was
		// still returned with 200; surface the failure to the caller.
		slog.Error(err.Error())
		c.JSON(http.StatusInternalServerError, gin.H{"error": "error getting user info"})
		return
	}
	c.JSON(http.StatusOK, user)
}
// SignoutHandler revokes the API key named by the :encodedKey path
// parameter via ollama.com's signout endpoint.
func (s *Server) SignoutHandler(c *gin.Context) {
	encodedKey := c.Param("encodedKey")

	// todo allow other hosts
	remote, err := url.Parse("https://ollama.com")
	if err != nil {
		slog.Error(err.Error())
		c.JSON(http.StatusInternalServerError, gin.H{"error": "URL parse error"})
		return
	}

	if err := api.NewClient(remote, http.DefaultClient).Signout(c, encodedKey); err != nil {
		slog.Error(err.Error())
		// The remote reports an unknown/expired key via these messages;
		// translate them into a 404 for the client.
		msg := err.Error()
		if strings.Contains(msg, "page not found") || strings.Contains(msg, "invalid credentials") {
			c.JSON(http.StatusNotFound, gin.H{"error": "you are not currently signed in"})
			return
		}
		c.JSON(http.StatusInternalServerError, gin.H{"error": "there was an error signing out"})
		return
	}

	c.JSON(http.StatusOK, nil)
}
2024-08-27 10:36:11 +08:00
func ( s * Server ) PsHandler ( c * gin . Context ) {
2024-06-07 01:11:45 +08:00
models := [ ] api . ProcessModelResponse { }
2024-05-14 08:17:36 +08:00
for _ , v := range s . sched . loaded {
model := v . model
modelDetails := api . ModelDetails {
Format : model . Config . ModelFormat ,
Family : model . Config . ModelFamily ,
Families : model . Config . ModelFamilies ,
ParameterSize : model . Config . ModelType ,
QuantizationLevel : model . Config . FileType ,
}
2024-06-07 01:11:45 +08:00
mr := api . ProcessModelResponse {
2024-05-14 08:17:36 +08:00
Model : model . ShortName ,
Name : model . ShortName ,
2025-05-30 03:21:48 +08:00
Size : int64 ( v . totalSize ) ,
SizeVRAM : int64 ( v . vramSize ) ,
2024-05-14 08:17:36 +08:00
Digest : model . Digest ,
Details : modelDetails ,
ExpiresAt : v . expiresAt ,
}
2025-07-09 02:59:06 +08:00
if v . Options != nil {
2025-05-30 03:21:48 +08:00
mr . ContextLength = v . Options . NumCtx
2025-07-09 02:59:06 +08:00
}
2024-05-16 06:43:16 +08:00
// The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just
// calculate the time w/ the sessionDuration instead.
var epoch time . Time
if v . expiresAt == epoch {
mr . ExpiresAt = time . Now ( ) . Add ( v . sessionDuration )
}
2024-05-14 08:17:36 +08:00
models = append ( models , mr )
}
2024-06-22 06:59:41 +08:00
slices . SortStableFunc ( models , func ( i , j api . ProcessModelResponse ) int {
// longest duration remaining listed first
return cmp . Compare ( j . ExpiresAt . Unix ( ) , i . ExpiresAt . Unix ( ) )
} )
2024-06-07 01:11:45 +08:00
c . JSON ( http . StatusOK , api . ProcessResponse { Models : models } )
2024-05-14 08:17:36 +08:00
}
2024-03-31 00:50:05 +08:00
func ( s * Server ) ChatHandler ( c * gin . Context ) {
2024-07-14 00:25:31 +08:00
checkpointStart := time . Now ( )
2023-12-06 03:57:33 +08:00
var req api . ChatRequest
2024-06-18 01:38:55 +08:00
if err := c . ShouldBindJSON ( & req ) ; errors . Is ( err , io . EOF ) {
2023-12-06 03:57:33 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : "missing request body" } )
return
2024-06-18 01:38:55 +08:00
} else if err != nil {
2023-12-06 03:57:33 +08:00
c . AbortWithStatusJSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
return
}
2025-09-18 05:40:53 +08:00
name := model . ParseName ( req . Model )
if ! name . IsValid ( ) {
c . JSON ( http . StatusBadRequest , gin . H { "error" : "model is required" } )
return
}
name , err := getExistingName ( name )
if err != nil {
c . JSON ( http . StatusBadRequest , gin . H { "error" : "model is required" } )
return
}
m , err := GetModel ( req . Model )
if err != nil {
switch {
case os . IsNotExist ( err ) :
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model '%s' not found" , req . Model ) } )
case err . Error ( ) == errtypes . InvalidModelNameErrMsg :
c . JSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
default :
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
2024-09-12 07:36:21 +08:00
}
2025-09-18 05:40:53 +08:00
return
}
// expire the runner
if len ( req . Messages ) == 0 && req . KeepAlive != nil && int ( req . KeepAlive . Seconds ( ) ) == 0 {
s . sched . expireRunner ( m )
2024-09-12 07:36:21 +08:00
c . JSON ( http . StatusOK , api . ChatResponse {
Model : req . Model ,
CreatedAt : time . Now ( ) . UTC ( ) ,
Message : api . Message { Role : "assistant" } ,
Done : true ,
DoneReason : "unload" ,
} )
return
}
2025-09-18 05:40:53 +08:00
if m . Config . RemoteHost != "" && m . Config . RemoteModel != "" {
origModel := req . Model
remoteURL , err := url . Parse ( m . Config . RemoteHost )
if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
if ! slices . Contains ( envconfig . Remotes ( ) , remoteURL . Hostname ( ) ) {
slog . Info ( "remote model" , "remotes" , envconfig . Remotes ( ) , "remoteURL" , m . Config . RemoteHost , "hostname" , remoteURL . Hostname ( ) )
c . JSON ( http . StatusBadRequest , gin . H { "error" : "this server cannot run this remote model" } )
return
}
req . Model = m . Config . RemoteModel
if req . Options == nil {
req . Options = map [ string ] any { }
}
msgs := append ( m . Messages , req . Messages ... )
if req . Messages [ 0 ] . Role != "system" && m . System != "" {
msgs = append ( [ ] api . Message { { Role : "system" , Content : m . System } } , msgs ... )
}
msgs = filterThinkTags ( msgs , m )
req . Messages = msgs
for k , v := range m . Options {
if _ , ok := req . Options [ k ] ; ! ok {
req . Options [ k ] = v
}
}
fn := func ( resp api . ChatResponse ) error {
resp . Model = origModel
resp . RemoteModel = m . Config . RemoteModel
resp . RemoteHost = m . Config . RemoteHost
data , err := json . Marshal ( resp )
if err != nil {
return err
}
if _ , err = c . Writer . Write ( append ( data , '\n' ) ) ; err != nil {
return err
}
c . Writer . Flush ( )
return nil
}
client := api . NewClient ( remoteURL , http . DefaultClient )
err = client . Chat ( c , & req , fn )
if err != nil {
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
}
return
}
2025-04-02 06:21:46 +08:00
caps := [ ] model . Capability { model . CapabilityCompletion }
2024-07-19 02:44:57 +08:00
if len ( req . Tools ) > 0 {
2025-04-02 06:21:46 +08:00
caps = append ( caps , model . CapabilityTools )
2024-06-21 04:45:47 +08:00
}
2025-08-13 02:02:01 +08:00
if req . Think != nil && req . Think . Bool ( ) {
2025-05-29 10:38:52 +08:00
caps = append ( caps , model . CapabilityThinking )
}
2024-06-21 04:45:47 +08:00
2024-12-12 07:29:59 +08:00
r , m , opts , err := s . scheduleRunner ( c . Request . Context ( ) , name . String ( ) , caps , req . Options , req . KeepAlive )
2024-06-18 01:38:55 +08:00
if errors . Is ( err , errCapabilityCompletion ) {
c . JSON ( http . StatusBadRequest , gin . H { "error" : fmt . Sprintf ( "%q does not support chat" , req . Model ) } )
2023-12-06 03:57:33 +08:00
return
2024-06-18 01:38:55 +08:00
} else if err != nil {
2024-06-21 02:00:08 +08:00
handleScheduleError ( c , req . Model , err )
2023-12-06 03:57:33 +08:00
return
}
2024-02-01 09:39:38 +08:00
2024-07-14 00:25:31 +08:00
checkpointLoaded := time . Now ( )
2024-06-18 01:38:55 +08:00
if len ( req . Messages ) == 0 {
c . JSON ( http . StatusOK , api . ChatResponse {
2024-05-10 04:30:14 +08:00
Model : req . Model ,
2024-06-18 01:38:55 +08:00
CreatedAt : time . Now ( ) . UTC ( ) ,
Message : api . Message { Role : "assistant" } ,
2024-05-10 04:30:14 +08:00
Done : true ,
DoneReason : "load" ,
2024-06-18 01:38:55 +08:00
} )
2024-02-08 08:30:33 +08:00
return
}
2024-06-20 05:14:28 +08:00
msgs := append ( m . Messages , req . Messages ... )
2024-07-17 02:09:00 +08:00
if req . Messages [ 0 ] . Role != "system" && m . System != "" {
2024-06-20 05:14:28 +08:00
msgs = append ( [ ] api . Message { { Role : "system" , Content : m . System } } , msgs ... )
2024-07-14 06:08:00 +08:00
}
2025-05-01 04:57:45 +08:00
msgs = filterThinkTags ( msgs , m )
2024-07-14 06:08:00 +08:00
2025-09-16 02:46:25 +08:00
var builtinParser parsers . Parser
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
if m . Config . Parser != "" {
builtinParser = parsers . ParserForName ( m . Config . Parser )
}
2025-09-13 04:32:30 +08:00
var harmonyMessageHandler * harmony . HarmonyMessageHandler
var harmonyToolParser * harmony . HarmonyToolCallAccumulator
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
useHarmony := shouldUseHarmony ( m ) || m . Config . Parser == "harmony"
2025-08-15 08:17:25 +08:00
processedTools := req . Tools
if useHarmony {
2025-09-13 04:32:30 +08:00
harmonyMessageHandler = harmony . NewHarmonyMessageHandler ( )
var lastMessage * api . Message
if len ( msgs ) > 0 {
lastMessage = & msgs [ len ( msgs ) - 1 ]
}
harmonyMessageHandler . HarmonyParser . AddImplicitStartOrPrefill ( lastMessage )
harmonyToolParser = harmonyMessageHandler . CreateToolParser ( )
2025-08-15 08:17:25 +08:00
// make a copy of tools to pass to the chat prompt. Function names may be
// renamed to be valid Harmony function names.
processedTools = make ( [ ] api . Tool , len ( req . Tools ) )
copy ( processedTools , req . Tools )
for i , tool := range processedTools {
2025-09-13 04:32:30 +08:00
processedTools [ i ] . Function . Name = harmonyMessageHandler . FunctionNameMap . ConvertAndAdd ( tool . Function . Name )
2025-08-15 08:17:25 +08:00
}
}
prompt , images , err := chatPrompt ( c . Request . Context ( ) , m , r . Tokenize , opts , msgs , processedTools , req . Think )
2024-06-18 01:38:55 +08:00
if err != nil {
2024-11-28 05:40:57 +08:00
slog . Error ( "chat prompt error" , "error" , err )
2024-06-18 01:38:55 +08:00
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
return
2024-02-13 07:06:57 +08:00
}
2025-08-16 04:52:50 +08:00
// If debug mode is enabled, return the rendered template instead of calling the model
if req . DebugRenderOnly {
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
c . JSON ( http . StatusOK , api . ChatResponse {
2025-08-16 04:52:50 +08:00
Model : req . Model ,
CreatedAt : time . Now ( ) . UTC ( ) ,
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
DebugInfo : & api . DebugInfo {
2025-08-16 04:52:50 +08:00
RenderedTemplate : prompt ,
ImageCount : len ( images ) ,
} ,
} )
return
}
2025-08-06 03:21:16 +08:00
// Validate Think value: string values currently only allowed for gptoss models
if req . Think != nil && req . Think . IsString ( ) && ! useHarmony {
2025-08-13 02:02:01 +08:00
c . JSON ( http . StatusBadRequest , gin . H { "error" : fmt . Sprintf ( "think value %q is not supported for this model" , req . Think . String ( ) ) } )
2025-08-06 03:21:16 +08:00
return
}
2025-06-07 03:02:20 +08:00
var thinkingState * thinking . Parser
openingTag , closingTag := thinking . InferTags ( m . Template . Template )
2025-08-13 02:02:01 +08:00
if req . Think != nil && req . Think . Bool ( ) && openingTag != "" && closingTag != "" {
2025-06-07 03:02:20 +08:00
thinkingState = & thinking . Parser {
2025-06-06 01:22:32 +08:00
OpeningTag : openingTag ,
ClosingTag : closingTag ,
2025-05-29 10:38:52 +08:00
}
2025-08-23 03:00:16 +08:00
if strings . HasSuffix ( strings . TrimSpace ( prompt ) , openingTag ) {
thinkingState . AddContent ( openingTag )
}
2025-05-29 10:38:52 +08:00
}
2025-05-24 05:19:31 +08:00
var toolParser * tools . Parser
2025-08-06 03:21:16 +08:00
if len ( req . Tools ) > 0 && ! useHarmony {
2025-06-13 05:18:54 +08:00
toolParser = tools . NewParser ( m . Template . Template , req . Tools )
2025-05-24 05:19:31 +08:00
}
2023-12-06 03:57:33 +08:00
ch := make ( chan any )
go func ( ) {
defer close ( ch )
2025-05-24 05:19:31 +08:00
2024-07-04 00:00:07 +08:00
if err := r . Completion ( c . Request . Context ( ) , llm . CompletionRequest {
2025-09-13 04:32:30 +08:00
Prompt : prompt ,
Images : images ,
Format : req . Format ,
Options : opts ,
2024-06-18 01:38:55 +08:00
} , func ( r llm . CompletionResponse ) {
2024-07-14 00:25:31 +08:00
res := api . ChatResponse {
2025-04-04 01:19:24 +08:00
Model : req . Model ,
CreatedAt : time . Now ( ) . UTC ( ) ,
2025-09-13 04:32:30 +08:00
Message : api . Message { Role : "assistant" , Content : r . Content } ,
2025-04-04 01:19:24 +08:00
Done : r . Done ,
2023-12-06 03:57:33 +08:00
Metrics : api . Metrics {
PromptEvalCount : r . PromptEvalCount ,
PromptEvalDuration : r . PromptEvalDuration ,
EvalCount : r . EvalCount ,
EvalDuration : r . EvalDuration ,
} ,
}
2025-08-06 03:21:16 +08:00
if r . Done {
res . DoneReason = r . DoneReason . String ( )
res . TotalDuration = time . Since ( checkpointStart )
res . LoadDuration = checkpointLoaded . Sub ( checkpointStart )
}
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
// TODO(drifkin): fold this as much as possible into the generic m.Config.Parser logic
2025-08-06 03:21:16 +08:00
if useHarmony {
2025-09-13 04:32:30 +08:00
content , thinking , toolContent := harmonyMessageHandler . AddContent ( r . Content , harmonyToolParser )
res . Message . Content = content
res . Message . Thinking = thinking
harmonyToolParser . Add ( toolContent )
if r . Done {
toolName , toolContent := harmonyToolParser . Drain ( )
if toolName != nil {
* toolName = strings . TrimPrefix ( * toolName , "functions." )
* toolName = harmonyMessageHandler . FunctionNameMap . OriginalFromConverted ( * toolName )
var args api . ToolCallFunctionArguments
if err := json . Unmarshal ( [ ] byte ( toolContent ) , & args ) ; err != nil {
errStr := fmt . Sprintf ( "error parsing tool call: raw='%s', err=%s" , toolContent , err . Error ( ) )
ch <- gin . H { "error" : errStr }
return
}
res . Message . ToolCalls = [ ] api . ToolCall { { Function : api . ToolCallFunction { Name : * toolName , Arguments : args } } }
}
2025-08-06 03:21:16 +08:00
}
2025-09-13 04:32:30 +08:00
2025-08-06 03:21:16 +08:00
// only send messages with meaningful content (empty messages confuse clients)
if res . Message . Content != "" || res . Message . Thinking != "" || len ( res . Message . ToolCalls ) > 0 || res . Done {
ch <- res
}
2025-09-13 04:32:30 +08:00
add qwen3-coder tool support
The format qwen3-coder uses is relatively unique, both in rendering and
in parsing. To implement parsing, I wrote a custom parser in similar
style to harmony. For the rendering, I found that the logic would be
much more difficult to follow in a template, so I introduced the concept
of a built-in renderer that uses go code, rather than a template to
generate prompts.
I set us up for future built-in parsers and renderers by making it so
they can be specified in a Modelfile like so:
```
RENDERER "qwen3-coder"
PARSER "qwen3-coder"
```
These need to be provided explicitly because the architecture alone is
not enough to understand what format the model expects to receive, and
what format we expect it to output (e.g., qwen3-coder is `qwen3moe`,
which includes other qwen3-family models as well)
I haven't converted harmony to be one of these "built-ins" yet, since
some of it is in flux with the changes @ParthSareen has been making to
move harmony to the runner. It is likely that many other built-ins will
need to move to the runner as well, but I'm able to slightly defer that
decision since qwen3-coder doesn't have thinking (and therefore doesn't
need to be in the runner to make structured outputs work). I expect to
unify harmony with this approach very soon.
Whether a particular model supports tools or thinking was previously
inferred from templates, but without a template we now also use the
parser itself to declare what it supports. If we have future models that
re-use the same parsing format, but have different capabilities, we'll
want to parameterize them and give them different names to be specified
as a `PARSER`.
Misc changes:
- I worked on the renderer by diffing outputs from the reference
implementation and ours. To make it easier to do this, I extended
<https://github.com/ollama/ollama/pull/11875> to also support
returning the prompt via the openai compat layer
2025-09-12 04:40:35 +08:00
return
} else if builtinParser != nil {
slog . Log ( context . TODO ( ) , logutil . LevelTrace , "builtin parser input" , "parser" , m . Config . Parser , "content" , r . Content )
content , thinking , toolCalls , err := builtinParser . Add ( r . Content , req . Tools )
if err != nil {
ch <- gin . H { "error" : err . Error ( ) }
return
}
res . Message . Content = content
res . Message . Thinking = thinking
res . Message . ToolCalls = toolCalls
if res . Message . Content != "" || res . Message . Thinking != "" || len ( res . Message . ToolCalls ) > 0 || r . Done {
slog . Log ( context . TODO ( ) , logutil . LevelTrace , "builtin parser output" , "parser" , m . Config . Parser , "content" , content , "thinking" , thinking , "toolCalls" , toolCalls , "done" , r . Done )
ch <- res
} else {
slog . Log ( context . TODO ( ) , logutil . LevelTrace , "builtin parser empty output" , "parser" , m . Config . Parser )
}
2025-08-06 03:21:16 +08:00
return
}
2024-07-14 00:25:31 +08:00
2025-05-29 10:38:52 +08:00
if thinkingState != nil {
2025-06-06 01:22:32 +08:00
thinkingContent , remainingContent := thinkingState . AddContent ( res . Message . Content )
2025-05-29 10:38:52 +08:00
if thinkingContent == "" && remainingContent == "" && ! r . Done {
// need to accumulate more to decide what to send
return
}
res . Message . Content = remainingContent
res . Message . Thinking = thinkingContent
}
2025-05-24 05:19:31 +08:00
if len ( req . Tools ) > 0 {
2025-05-29 10:38:52 +08:00
toolCalls , content := toolParser . Add ( res . Message . Content )
2025-05-24 05:19:31 +08:00
if len ( content ) > 0 {
res . Message . Content = content
} else if len ( toolCalls ) > 0 {
res . Message . ToolCalls = toolCalls
res . Message . Content = ""
2025-05-29 10:38:52 +08:00
} else if res . Message . Thinking != "" {
// don't return
2025-05-24 05:19:31 +08:00
} else {
if r . Done {
2025-06-13 05:18:54 +08:00
res . Message . Content = toolParser . Content ( )
2025-05-24 05:19:31 +08:00
ch <- res
}
return
2024-11-28 05:40:57 +08:00
}
}
2025-05-29 10:38:52 +08:00
2025-05-24 05:19:31 +08:00
ch <- res
2024-06-18 01:38:55 +08:00
} ) ; err != nil {
2023-12-06 03:57:33 +08:00
ch <- gin . H { "error" : err . Error ( ) }
}
} ( )
if req . Stream != nil && ! * req . Stream {
2024-06-21 04:45:47 +08:00
var resp api . ChatResponse
2025-05-24 05:19:31 +08:00
var toolCalls [ ] api . ToolCall
2025-05-29 10:38:52 +08:00
var sbThinking strings . Builder
var sbContent strings . Builder
2024-06-18 01:38:55 +08:00
for rr := range ch {
switch t := rr . ( type ) {
2023-12-10 23:53:38 +08:00
case api . ChatResponse :
2025-05-29 10:38:52 +08:00
sbThinking . WriteString ( t . Message . Thinking )
sbContent . WriteString ( t . Message . Content )
2024-06-21 04:45:47 +08:00
resp = t
2025-05-24 05:19:31 +08:00
if len ( req . Tools ) > 0 {
toolCalls = append ( toolCalls , t . Message . ToolCalls ... )
}
2023-12-10 23:53:38 +08:00
case gin . H :
2024-06-18 01:38:55 +08:00
msg , ok := t [ "error" ] . ( string )
if ! ok {
msg = "unexpected error format in response"
2023-12-10 23:53:38 +08:00
}
2024-06-18 01:38:55 +08:00
c . JSON ( http . StatusInternalServerError , gin . H { "error" : msg } )
return
2023-12-10 23:53:38 +08:00
default :
2024-06-18 01:38:55 +08:00
c . JSON ( http . StatusInternalServerError , gin . H { "error" : "unexpected response" } )
2023-12-10 23:53:38 +08:00
return
2023-12-06 03:57:33 +08:00
}
}
2023-12-10 23:53:38 +08:00
2025-05-29 10:38:52 +08:00
resp . Message . Content = sbContent . String ( )
resp . Message . Thinking = sbThinking . String ( )
2025-05-24 05:19:31 +08:00
if len ( toolCalls ) > 0 {
resp . Message . ToolCalls = toolCalls
2024-06-21 04:45:47 +08:00
}
c . JSON ( http . StatusOK , resp )
2023-12-06 03:57:33 +08:00
return
}
streamResponse ( c , ch )
}
2024-05-04 07:25:57 +08:00
2024-06-21 02:00:08 +08:00
func handleScheduleError ( c * gin . Context , name string , err error ) {
2024-06-18 01:38:55 +08:00
switch {
2024-06-21 10:13:36 +08:00
case errors . Is ( err , errCapabilities ) , errors . Is ( err , errRequired ) :
2024-06-21 02:00:08 +08:00
c . JSON ( http . StatusBadRequest , gin . H { "error" : err . Error ( ) } )
2024-06-18 01:38:55 +08:00
case errors . Is ( err , context . Canceled ) :
2024-05-04 07:25:57 +08:00
c . JSON ( 499 , gin . H { "error" : "request canceled" } )
2024-06-18 01:38:55 +08:00
case errors . Is ( err , ErrMaxQueue ) :
2024-05-04 07:25:57 +08:00
c . JSON ( http . StatusServiceUnavailable , gin . H { "error" : err . Error ( ) } )
2024-06-21 02:00:08 +08:00
case errors . Is ( err , os . ErrNotExist ) :
c . JSON ( http . StatusNotFound , gin . H { "error" : fmt . Sprintf ( "model %q not found, try pulling it first" , name ) } )
2024-06-18 01:38:55 +08:00
default :
c . JSON ( http . StatusInternalServerError , gin . H { "error" : err . Error ( ) } )
2024-05-04 07:25:57 +08:00
}
}
2025-05-01 04:57:45 +08:00
func filterThinkTags ( msgs [ ] api . Message , m * Model ) [ ] api . Message {
if m . Config . ModelFamily == "qwen3" || model . ParseName ( m . Name ) . Model == "deepseek-r1" {
finalUserIndex := - 1
for i , msg := range msgs {
if msg . Role == "user" {
finalUserIndex = i
}
}
for i , msg := range msgs {
if msg . Role == "assistant" && i < finalUserIndex {
2025-05-29 10:38:52 +08:00
// TODO(drifkin): this is from before we added proper thinking support.
// However, even if thinking is not enabled (and therefore we shouldn't
// change the user output), we should probably perform this filtering
// for all thinking models (not just qwen3 & deepseek-r1) since it tends
// to save tokens and improve quality.
2025-06-07 03:02:20 +08:00
thinkingState := & thinking . Parser {
2025-06-06 01:22:32 +08:00
OpeningTag : "<think>" ,
ClosingTag : "</think>" ,
2025-05-29 10:38:52 +08:00
}
2025-06-06 01:22:32 +08:00
_ , content := thinkingState . AddContent ( msg . Content )
2025-05-29 10:38:52 +08:00
msgs [ i ] . Content = content
2025-05-01 04:57:45 +08:00
}
}
}
return msgs
}