ollama/server/routes.go

package server
import (
"bytes"
"cmp"
"context"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"image"
"io"
"io/fs"
"log/slog"
"math"
"math/rand"
"net"
"net/http"
"net/netip"
"net/url"
"os"
"os/signal"
"slices"
"strings"
"syscall"
"time"
"github.com/gin-contrib/cors"
"github.com/gin-gonic/gin"
"golang.org/x/image/webp"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/middleware"
"github.com/ollama/ollama/model/parsers"
"github.com/ollama/ollama/model/renderers"
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/thinking"
"github.com/ollama/ollama/tools"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
const signinURLStr = "https://ollama.com/connect?name=%s&key=%s"
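// shouldUseHarmony reports whether the model's output should be parsed with the
// harmony parser: the model family is gpt-oss and its template contains the
// harmony control tags.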
func shouldUseHarmony(model *Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
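// experimentEnabled reports whether the named experiment appears in the
// comma-separated OLLAMA_EXPERIMENT environment variable.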
func experimentEnabled(name string) bool {
return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
}
var useClient2 = experimentEnabled("client2")
// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
// reduced context length on some models
var lowVRAMThreshold uint64 = 20 * format.GibiByte
var mode string = gin.DebugMode
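// Server ties the HTTP API to the scheduler. lowVRAM indicates that the sum of
// total VRAM is below lowVRAMThreshold, which keeps default context lengths
// smaller for some models.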
type Server struct {
addr net.Addr
sched *Scheduler
lowVRAM bool
}
func init() {
switch mode {
case gin.DebugMode:
case gin.ReleaseMode:
case gin.TestMode:
default:
mode = gin.DebugMode
}
gin.SetMode(mode)
// Tell renderers to use [img] tags
renderers.RenderImgTags = true
}
var (
errRequired = errors.New("is required")
errBadTemplate = errors.New("template error")
)
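// modelOptions layers the model's default options, then the per-request
// options, on top of api.DefaultOptions.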
func modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
opts := api.DefaultOptions()
if err := opts.FromMap(model.Options); err != nil {
return api.Options{}, err
}
if err := opts.FromMap(requestOpts); err != nil {
return api.Options{}, err
}
return opts, nil
}
// scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
// It returns the allocated runner, the model instance, and the consolidated options on success, or an error otherwise.
func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
if name == "" {
return nil, nil, nil, fmt.Errorf("model %w", errRequired)
}
model, err := GetModel(name)
if err != nil {
return nil, nil, nil, err
}
if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
}
if err := model.CheckCapabilities(caps...); err != nil {
return nil, nil, nil, fmt.Errorf("%s %w", name, err)
}
opts, err := modelOptions(model, requestOpts)
if err != nil {
return nil, nil, nil, err
}
// These models are much more capable with a larger context, so raise the
// default unless low VRAM mode would penalize performance too much
if !s.lowVRAM && slices.Contains([]string{
"gptoss", "gpt-oss",
"qwen3vl", "qwen3vlmoe",
}, model.Config.ModelFamily) {
opts.NumCtx = max(opts.NumCtx, 8192)
}
runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive)
var runner *runnerRef
select {
case runner = <-runnerCh:
case err = <-errCh:
return nil, nil, nil, err
}
return runner.llama, model, &opts, nil
}
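// signinURL builds the ollama.com sign-in URL from this machine's hostname and
// the server's base64-encoded public key.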
func signinURL() (string, error) {
pubKey, err := auth.GetPublicKey()
if err != nil {
return "", err
}
encKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
h, _ := os.Hostname()
return fmt.Sprintf(signinURLStr, url.PathEscape(h), encKey), nil
}
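// GenerateHandler services completion requests for the generate endpoint: it
// validates the request, resolves the model (proxying to a configured remote
// host when necessary), renders the prompt, and streams responses to the client.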
func (s *Server) GenerateHandler(c *gin.Context) {
checkpointStart := time.Now()
var req api.GenerateRequest
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if req.TopLogprobs < 0 || req.TopLogprobs > 20 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "top_logprobs must be between 0 and 20"})
return
}
name := model.ParseName(req.Model)
if !name.IsValid() {
// Ideally this is "invalid model name" but we're keeping with
// what the API currently returns until we can change it.
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
return
}
// We cannot currently consolidate this into GetModel because we would
// induce infinite recursion given the current code structure.
name, err := getExistingName(name)
if err != nil {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
return
}
m, err := GetModel(name.String())
if err != nil {
switch {
case errors.Is(err, fs.ErrNotExist):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
case err.Error() == errtypes.InvalidModelNameErrMsg:
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
origModel := req.Model
remoteURL, err := url.Parse(m.Config.RemoteHost)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if !slices.Contains(envconfig.Remotes(), remoteURL.Hostname()) {
slog.Info("remote model", "remotes", envconfig.Remotes(), "remoteURL", m.Config.RemoteHost, "hostname", remoteURL.Hostname())
c.JSON(http.StatusBadRequest, gin.H{"error": "this server cannot run this remote model"})
return
}
req.Model = m.Config.RemoteModel
if req.Template == "" && m.Template.String() != "" {
req.Template = m.Template.String()
}
if req.Options == nil {
req.Options = map[string]any{}
}
for k, v := range m.Options {
if _, ok := req.Options[k]; !ok {
req.Options[k] = v
}
}
// update the system prompt from the model if one isn't already specified
if req.System == "" && m.System != "" {
req.System = m.System
}
if len(m.Messages) > 0 {
slog.Warn("embedded messages in the model not supported with '/api/generate'; try '/api/chat' instead")
}
fn := func(resp api.GenerateResponse) error {
resp.Model = origModel
resp.RemoteModel = m.Config.RemoteModel
resp.RemoteHost = m.Config.RemoteHost
data, err := json.Marshal(resp)
if err != nil {
return err
}
if _, err = c.Writer.Write(append(data, '\n')); err != nil {
return err
}
c.Writer.Flush()
return nil
}
client := api.NewClient(remoteURL, http.DefaultClient)
err = client.Generate(c, &req, fn)
if err != nil {
var authError api.AuthorizationError
if errors.As(err, &authError) {
sURL, sErr := signinURL()
if sErr != nil {
slog.Error(sErr.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": "error getting authorization details"})
return
}
c.JSON(authError.StatusCode, gin.H{"error": "unauthorized", "signin_url": sURL})
return
}
var apiError api.StatusError
if errors.As(err, &apiError) {
c.JSON(apiError.StatusCode, apiError)
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
contentType := "application/json; charset=utf-8"
if req.Stream != nil && *req.Stream {
contentType = "application/x-ndjson"
}
c.Header("Content-Type", contentType)
return
}
// expire the runner
if req.Prompt == "" && req.KeepAlive != nil && req.KeepAlive.Duration == 0 {
s.sched.expireRunner(m)
c.JSON(http.StatusOK, api.GenerateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Response: "",
Done: true,
DoneReason: "unload",
})
return
}
if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
return
}
var builtinParser parsers.Parser
if shouldUseHarmony(m) && m.Config.Parser == "" {
m.Config.Parser = "harmony"
}
if !req.Raw && m.Config.Parser != "" {
builtinParser = parsers.ParserForName(m.Config.Parser)
if builtinParser != nil {
// no tools or last message for generate endpoint
builtinParser.Init(nil, nil, req.Think)
}
}
// Validate Think value: string values currently only allowed for harmony/gptoss models
if req.Think != nil && req.Think.IsString() && m.Config.Parser != "harmony" {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
caps := []model.Capability{model.CapabilityCompletion}
if req.Suffix != "" {
caps = append(caps, model.CapabilityInsert)
}
modelCaps := m.Capabilities()
if slices.Contains(modelCaps, model.CapabilityThinking) {
caps = append(caps, model.CapabilityThinking)
if req.Think == nil {
req.Think = &api.ThinkValue{Value: true}
}
} else {
if req.Think != nil && req.Think.Bool() {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
return
}
}
r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
if errors.Is(err, errCapabilityCompletion) {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
return
} else if err != nil {
handleScheduleError(c, req.Model, err)
return
}
checkpointLoaded := time.Now()
// load the model
if req.Prompt == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Done: true,
DoneReason: "load",
})
return
}
if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
return
}
images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
prompt := req.Prompt
if !req.Raw {
tmpl := m.Template
if req.Template != "" {
tmpl, err = template.Parse(req.Template)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
}
var values template.Values
if req.Suffix != "" {
values.Prompt = prompt
values.Suffix = req.Suffix
} else {
var msgs []api.Message
if req.System != "" {
msgs = append(msgs, api.Message{Role: "system", Content: req.System})
} else if m.System != "" {
msgs = append(msgs, api.Message{Role: "system", Content: m.System})
}
if req.Context == nil {
msgs = append(msgs, m.Messages...)
}
userMsg := api.Message{Role: "user", Content: req.Prompt}
for _, i := range images {
userMsg.Images = append(userMsg.Images, i.Data)
}
values.Messages = append(msgs, userMsg)
}
values.Think = req.Think != nil && req.Think.Bool()
values.ThinkLevel = ""
if req.Think != nil {
values.ThinkLevel = req.Think.String()
}
values.IsThinkSet = req.Think != nil
var b bytes.Buffer
if req.Context != nil {
slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")
s, err := r.Detokenize(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
b.WriteString(s)
}
// check that we're in the `api/chat`-like flow, and if so, generate the
// prompt the same way
// TEMP(drifkin): we should really just detect the chat-like flow and call
// the real chat handler, but doing this as a stopgap to get renderer
// support for generate
if values.Messages != nil && values.Suffix == "" && req.Template == "" {
prompt, images, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, values.Messages, []api.Tool{}, req.Think, req.Truncate == nil || *req.Truncate)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// TEMP(drifkin): req.Context will be removed very soon, but we're temporarily supporting it in this flow here
if req.Context != nil {
b.WriteString(prompt)
prompt = b.String()
}
} else {
// legacy flow
if err := tmpl.Execute(&b, values); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
prompt = b.String()
}
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.GenerateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: &api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
var thinkingState *thinking.Parser
if builtinParser == nil {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
thinkingState = &thinking.Parser{
OpeningTag: openingTag,
ClosingTag: closingTag,
}
if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
thinkingState.AddContent(openingTag)
}
}
}
ch := make(chan any)
go func() {
// TODO (jmorganca): avoid building the response twice both here and below
var sb strings.Builder
defer close(ch)
if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: req.Format,
Options: opts,
Shift: req.Shift == nil || *req.Shift,
Truncate: req.Truncate == nil || *req.Truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
}, func(cr llm.CompletionResponse) {
res := api.GenerateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Response: cr.Content,
Done: cr.Done,
Metrics: api.Metrics{
PromptEvalCount: cr.PromptEvalCount,
PromptEvalDuration: cr.PromptEvalDuration,
EvalCount: cr.EvalCount,
EvalDuration: cr.EvalDuration,
},
Logprobs: toAPILogprobs(cr.Logprobs),
}
if builtinParser != nil {
content, thinking, toolCalls, err := builtinParser.Add(cr.Content, cr.Done)
if err != nil {
ch <- gin.H{"error": err.Error()}
return
}
res.Response = content
res.Thinking = thinking
if cr.Done && len(toolCalls) > 0 {
res.ToolCalls = toolCalls
}
} else if thinkingState != nil {
thinking, content := thinkingState.AddContent(cr.Content)
res.Thinking = thinking
res.Response = content
}
if _, err := sb.WriteString(cr.Content); err != nil {
ch <- gin.H{"error": err.Error()}
}
if cr.Done {
res.DoneReason = cr.DoneReason.String()
res.TotalDuration = time.Since(checkpointStart)
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
if !req.Raw {
tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String())
if err != nil {
ch <- gin.H{"error": err.Error()}
return
}
res.Context = tokens
}
}
if builtinParser != nil {
// only send messages with meaningful content (empty messages confuse clients)
if res.Response != "" || res.Thinking != "" || res.Done || len(res.ToolCalls) > 0 {
ch <- res
}
return
}
ch <- res
}); err != nil {
var serr api.StatusError
if errors.As(err, &serr) {
ch <- gin.H{"error": serr.ErrorMessage, "status": serr.StatusCode}
} else {
ch <- gin.H{"error": err.Error()}
}
}
}()
if req.Stream != nil && !*req.Stream {
var r api.GenerateResponse
var allLogprobs []api.Logprob
var sbThinking strings.Builder
var sbContent strings.Builder
for rr := range ch {
switch t := rr.(type) {
case api.GenerateResponse:
sbThinking.WriteString(t.Thinking)
sbContent.WriteString(t.Response)
r = t
// Accumulate logprobs from all chunks for non-streaming response
if len(t.Logprobs) > 0 {
allLogprobs = append(allLogprobs, t.Logprobs...)
}
case gin.H:
msg, ok := t["error"].(string)
if !ok {
msg = "unexpected error format in response"
}
status, ok := t["status"].(int)
if !ok {
status = http.StatusInternalServerError
}
c.JSON(status, gin.H{"error": msg})
return
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
return
}
}
r.Thinking = sbThinking.String()
r.Response = sbContent.String()
r.Logprobs = allLogprobs
c.JSON(http.StatusOK, r)
return
}
streamResponse(c, ch)
}
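// EmbedHandler services embedding requests: each input is tokenized, truncated
// to the model's context window when allowed, embedded concurrently, and
// normalized before the response is returned.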
func (s *Server) EmbedHandler(c *gin.Context) {
checkpointStart := time.Now()
var req api.EmbedRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
truncate := true
if req.Truncate != nil && !*req.Truncate {
truncate = false
}
var input []string
switch i := req.Input.(type) {
case string:
if len(i) > 0 {
input = append(input, i)
}
case []any:
for _, v := range i {
if _, ok := v.(string); !ok {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"})
return
}
input = append(input, v.(string))
}
default:
if req.Input != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"})
return
}
}
name, err := getExistingName(model.ParseName(req.Model))
if err != nil {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
return
}
r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
if err != nil {
handleScheduleError(c, req.Model, err)
return
}
checkpointLoaded := time.Now()
if len(input) == 0 {
c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}})
return
}
kvData, _, err := getModelData(m.ModelPath, false)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
var count int
for i, s := range input {
tokens, err := r.Tokenize(c.Request.Context(), s)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
ctxLen := min(opts.NumCtx, int(kvData.ContextLength()))
if len(tokens) > ctxLen {
if !truncate {
c.JSON(http.StatusBadRequest, gin.H{"error": "input exceeds maximum context length"})
return
}
if bos := kvData.Uint("tokenizer.ggml.bos_token_id"); tokens[0] != int(bos) && kvData.Bool("add_bos_token", true) {
ctxLen--
}
if eos := kvData.Uint("tokenizer.ggml.eos_token_id"); tokens[len(tokens)-1] != int(eos) && kvData.Bool("add_eos_token", true) {
ctxLen--
}
slog.Info("", "ctxLen", ctxLen, "tokenCount", len(tokens))
if ctxLen <= 0 {
// return error if the truncated input would be empty or just special tokens
c.JSON(http.StatusBadRequest, gin.H{"error": "input after truncation exceeds maximum context length"})
return
}
tokens = tokens[:ctxLen]
s, err = r.Detokenize(c.Request.Context(), tokens)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
}
count += len(tokens)
input[i] = s
}
var g errgroup.Group
embeddings := make([][]float32, len(input))
for i, text := range input {
g.Go(func() error {
embedding, err := r.Embedding(c.Request.Context(), text)
if err != nil {
return err
}
// TODO: this first normalization should be done by the model
embedding = normalize(embedding)
if req.Dimensions > 0 && req.Dimensions < len(embedding) {
embedding = normalize(embedding[:req.Dimensions])
}
embeddings[i] = embedding
return nil
})
}
if err := g.Wait(); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
return
}
resp := api.EmbedResponse{
Model: req.Model,
Embeddings: embeddings,
TotalDuration: time.Since(checkpointStart),
LoadDuration: checkpointLoaded.Sub(checkpointStart),
PromptEvalCount: count,
}
c.JSON(http.StatusOK, resp)
}
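// normalize scales vec to unit length in place; the small floor on the norm
// guards against division by zero for all-zero vectors.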
func normalize(vec []float32) []float32 {
var sum float32
for _, v := range vec {
sum += v * v
}
norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
for i := range vec {
vec[i] *= norm
}
return vec
}
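// EmbeddingsHandler services the legacy embeddings endpoint, returning a single
// float64 embedding for the prompt; an empty prompt only loads the model.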
func (s *Server) EmbeddingsHandler(c *gin.Context) {
var req api.EmbeddingRequest
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
name := model.ParseName(req.Model)
if !name.IsValid() {
c.JSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
r, _, _, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
if err != nil {
handleScheduleError(c, req.Model, err)
return
}
// an empty request loads the model
if req.Prompt == "" {
c.JSON(http.StatusOK, api.EmbeddingResponse{Embedding: []float64{}})
return
}
embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
return
}
var e []float64
for _, v := range embedding {
e = append(e, float64(v))
}
resp := api.EmbeddingResponse{
Embedding: e,
}
c.JSON(http.StatusOK, resp)
}
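// PullHandler downloads a model from the registry, streaming progress updates
// to the client unless streaming is disabled.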
func (s *Server) PullHandler(c *gin.Context) {
var req api.PullRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
name := model.ParseName(cmp.Or(req.Model, req.Name))
if !name.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": errtypes.InvalidModelNameErrMsg})
return
}
name, err = getExistingName(name)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
ch := make(chan any)
go func() {
defer close(ch)
fn := func(r api.ProgressResponse) {
ch <- r
}
regOpts := &registryOptions{
Insecure: req.Insecure,
}
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
if err := PullModel(ctx, name.DisplayShortest(), regOpts, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
if req.Stream != nil && !*req.Stream {
waitForStream(c, ch)
return
}
streamResponse(c, ch)
}
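// PushHandler uploads a model to the registry, streaming progress updates to
// the client unless streaming is disabled.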
func (s *Server) PushHandler(c *gin.Context) {
var req api.PushRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
var mname string
if req.Model != "" {
mname = req.Model
} else if req.Name != "" {
mname = req.Name
} else {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
ch := make(chan any)
go func() {
defer close(ch)
fn := func(r api.ProgressResponse) {
ch <- r
}
regOpts := &registryOptions{
Insecure: req.Insecure,
}
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
name, err := getExistingName(model.ParseName(mname))
if err != nil {
ch <- gin.H{"error": err.Error()}
return
}
if err := PushModel(ctx, name.DisplayShortest(), regOpts, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
if req.Stream != nil && !*req.Stream {
waitForStream(c, ch)
return
}
streamResponse(c, ch)
}
// getExistingName searches the models directory for an existing name that
// matches the input case-insensitively and returns the input name with each
// matching part replaced by its existing, canonical form. If no parts match,
// the input name is returned as is.
func getExistingName(n model.Name) (model.Name, error) {
var zero model.Name
existing, err := Manifests(true)
if err != nil {
return zero, err
}
var set model.Name // tracks parts already canonicalized
for e := range existing {
if set.Host == "" && strings.EqualFold(e.Host, n.Host) {
n.Host = e.Host
}
if set.Namespace == "" && strings.EqualFold(e.Namespace, n.Namespace) {
n.Namespace = e.Namespace
}
if set.Model == "" && strings.EqualFold(e.Model, n.Model) {
n.Model = e.Model
}
if set.Tag == "" && strings.EqualFold(e.Tag, n.Tag) {
n.Tag = e.Tag
}
}
return n, nil
}
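// DeleteHandler deletes a model by removing its manifest and then its layers.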
func (s *Server) DeleteHandler(c *gin.Context) {
var r api.DeleteRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
n := model.ParseName(cmp.Or(r.Model, r.Name))
if !n.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("name %q is invalid", cmp.Or(r.Model, r.Name))})
return
}
n, err := getExistingName(n)
if err != nil {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", cmp.Or(r.Model, r.Name))})
return
}
m, err := ParseNamedManifest(n)
if err != nil {
switch {
case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", cmp.Or(r.Model, r.Name))})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
if err := m.Remove(); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if err := m.RemoveLayers(); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
}
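// ShowHandler returns details about a model, including its license, system
// prompt, template, parameters, and capabilities.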
func (s *Server) ShowHandler(c *gin.Context) {
var req api.ShowRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if req.Model != "" {
// noop
} else if req.Name != "" {
req.Model = req.Name
} else {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
resp, err := GetModelInfo(req)
if err != nil {
switch {
case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
case err.Error() == errtypes.InvalidModelNameErrMsg:
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
c.JSON(http.StatusOK, resp)
}
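// GetModelInfo assembles the ShowResponse for a model: details, messages, a
// reconstructed Modelfile, and, for local models, GGUF key/value metadata and
// tensor information.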
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
name := model.ParseName(req.Model)
if !name.IsValid() {
return nil, ErrModelPathInvalid
}
name, err := getExistingName(name)
if err != nil {
return nil, err
}
m, err := GetModel(name.String())
if err != nil {
return nil, err
}
modelDetails := api.ModelDetails{
ParentModel: m.ParentModel,
Format: m.Config.ModelFormat,
Family: m.Config.ModelFamily,
Families: m.Config.ModelFamilies,
ParameterSize: m.Config.ModelType,
QuantizationLevel: m.Config.FileType,
}
if req.System != "" {
m.System = req.System
}
msgs := make([]api.Message, len(m.Messages))
for i, msg := range m.Messages {
msgs[i] = api.Message{Role: msg.Role, Content: msg.Content}
}
manifest, err := ParseNamedManifest(name)
if err != nil {
return nil, err
}
resp := &api.ShowResponse{
License: strings.Join(m.License, "\n"),
System: m.System,
Template: m.Template.String(),
Details: modelDetails,
Messages: msgs,
Capabilities: m.Capabilities(),
ModifiedAt: manifest.fi.ModTime(),
}
if m.Config.RemoteHost != "" {
resp.RemoteHost = m.Config.RemoteHost
resp.RemoteModel = m.Config.RemoteModel
if m.Config.ModelFamily != "" {
resp.ModelInfo = make(map[string]any)
resp.ModelInfo["general.architecture"] = m.Config.ModelFamily
if m.Config.BaseName != "" {
resp.ModelInfo["general.basename"] = m.Config.BaseName
}
if m.Config.ContextLen > 0 {
resp.ModelInfo[fmt.Sprintf("%s.context_length", m.Config.ModelFamily)] = m.Config.ContextLen
}
if m.Config.EmbedLen > 0 {
resp.ModelInfo[fmt.Sprintf("%s.embedding_length", m.Config.ModelFamily)] = m.Config.EmbedLen
}
}
}
var params []string
cs := 30
for k, v := range m.Options {
switch val := v.(type) {
case []any:
for _, nv := range val {
params = append(params, fmt.Sprintf("%-*s %#v", cs, k, nv))
}
default:
params = append(params, fmt.Sprintf("%-*s %#v", cs, k, v))
}
}
resp.Parameters = strings.Join(params, "\n")
if len(req.Options) > 0 {
if m.Options == nil {
m.Options = make(map[string]any)
}
for k, v := range req.Options {
m.Options[k] = v
}
}
var sb strings.Builder
fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName)
fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
// skip loading tensor information if this is a remote model
if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
return resp, nil
}
kvData, tensors, err := getModelData(m.ModelPath, req.Verbose)
if err != nil {
return nil, err
}
delete(kvData, "general.name")
delete(kvData, "tokenizer.chat_template")
resp.ModelInfo = kvData
tensorData := make([]api.Tensor, len(tensors.Items()))
for cnt, t := range tensors.Items() {
tensorData[cnt] = api.Tensor{Name: t.Name, Type: t.Type(), Shape: t.Shape}
}
resp.Tensors = tensorData
if len(m.ProjectorPaths) > 0 {
projectorData, _, err := getModelData(m.ProjectorPaths[0], req.Verbose)
if err != nil {
return nil, err
}
resp.ProjectorInfo = projectorData
}
return resp, nil
}
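// getModelData loads the GGUF key/value metadata and tensor listing for a model
// file; unless verbose, long array values are elided to keep the response small.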
func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
maxArraySize := 0
if verbose {
maxArraySize = -1
}
data, err := llm.LoadModel(digest, maxArraySize)
if err != nil {
return nil, ggml.Tensors{}, err
}
kv := data.KV()
if !verbose {
for k := range kv {
if t, ok := kv[k].([]any); len(t) > 5 && ok {
kv[k] = []any{}
}
}
}
return kv, data.Tensors(), nil
}
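// ListHandler lists local models from their manifests, most recently modified
// first.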
func (s *Server) ListHandler(c *gin.Context) {
ms, err := Manifests(true)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
models := []api.ListModelResponse{}
for n, m := range ms {
var cf ConfigV2
if m.Config.Digest != "" {
f, err := m.Config.Open()
if err != nil {
slog.Warn("bad manifest filepath", "name", n, "error", err)
continue
}
defer f.Close()
if err := json.NewDecoder(f).Decode(&cf); err != nil {
slog.Warn("bad manifest config", "name", n, "error", err)
continue
}
}
// tag should never be masked
models = append(models, api.ListModelResponse{
Model: n.DisplayShortest(),
Name: n.DisplayShortest(),
RemoteModel: cf.RemoteModel,
RemoteHost: cf.RemoteHost,
Size: m.Size(),
Digest: m.digest,
ModifiedAt: m.fi.ModTime(),
Details: api.ModelDetails{
Format: cf.ModelFormat,
Family: cf.ModelFamily,
Families: cf.ModelFamilies,
ParameterSize: cf.ModelType,
QuantizationLevel: cf.FileType,
},
})
}
slices.SortStableFunc(models, func(i, j api.ListModelResponse) int {
// most recently modified first
return cmp.Compare(j.ModifiedAt.Unix(), i.ModifiedAt.Unix())
})
c.JSON(http.StatusOK, api.ListResponse{Models: models})
}
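// CopyHandler copies an existing model to a new name.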
func (s *Server) CopyHandler(c *gin.Context) {
var r api.CopyRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
src := model.ParseName(r.Source)
if !src.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
return
}
src, err := getExistingName(src)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
dst := model.ParseName(r.Destination)
if !dst.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Destination)})
return
}
dst, err = getExistingName(dst)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if err := CopyModel(src, dst); errors.Is(err, os.ErrNotExist) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found", r.Source)})
} else if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
}
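// HeadBlobHandler reports whether a blob with the requested digest exists in
// the local blob store.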
func (s *Server) HeadBlobHandler(c *gin.Context) {
path, err := GetBlobsPath(c.Param("digest"))
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if _, err := os.Stat(path); err != nil {
c.AbortWithStatusJSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("blob %q not found", c.Param("digest"))})
return
}
c.Status(http.StatusOK)
}
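// CreateBlobHandler stores the request body as a layer, verifying that its
// digest matches the one supplied in the URL.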
func (s *Server) CreateBlobHandler(c *gin.Context) {
if ib, ok := intermediateBlobs[c.Param("digest")]; ok {
p, err := GetBlobsPath(ib)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if _, err := os.Stat(p); errors.Is(err, os.ErrNotExist) {
slog.Info("evicting intermediate blob which no longer exists", "digest", ib)
delete(intermediateBlobs, c.Param("digest"))
} else if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
} else {
c.Status(http.StatusOK)
return
}
}
path, err := GetBlobsPath(c.Param("digest"))
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
_, err = os.Stat(path)
switch {
case errors.Is(err, os.ErrNotExist):
// noop
case err != nil:
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
default:
c.Status(http.StatusOK)
return
}
layer, err := NewLayer(c.Request.Body, "")
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if layer.Digest != c.Param("digest") {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("digest mismatch, expected %q, got %q", c.Param("digest"), layer.Digest)})
return
}
c.Status(http.StatusCreated)
}
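// isLocalIP reports whether ip is assigned to one of this machine's network
// interfaces.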
func isLocalIP(ip netip.Addr) bool {
if interfaces, err := net.Interfaces(); err == nil {
for _, iface := range interfaces {
addrs, err := iface.Addrs()
if err != nil {
continue
}
for _, a := range addrs {
if parsed, _, err := net.ParseCIDR(a.String()); err == nil {
if parsed.String() == ip.String() {
return true
}
}
}
}
}
return false
}
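// allowedHost reports whether the request's Host header refers to this
// machine: empty, "localhost", the machine's hostname, or a name under a
// local-only TLD (.localhost, .local, .internal).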
func allowedHost(host string) bool {
host = strings.ToLower(host)
if host == "" || host == "localhost" {
return true
}
if hostname, err := os.Hostname(); err == nil && host == strings.ToLower(hostname) {
return true
}
tlds := []string{
"localhost",
"local",
"internal",
}
// check if the host is a local TLD
for _, tld := range tlds {
if strings.HasSuffix(host, "."+tld) {
return true
}
}
return false
}
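// allowedHostsMiddleware rejects requests whose Host header does not look
// local when the server is bound to a loopback address, a common guard
// against DNS rebinding. Requests to non-loopback binds pass through
// unchanged.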
func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
return func(c *gin.Context) {
if addr == nil {
c.Next()
return
}
if addr, err := netip.ParseAddrPort(addr.String()); err == nil && !addr.Addr().IsLoopback() {
c.Next()
return
}
host, _, err := net.SplitHostPort(c.Request.Host)
if err != nil {
host = c.Request.Host
}
if addr, err := netip.ParseAddr(host); err == nil {
if addr.IsLoopback() || addr.IsPrivate() || addr.IsUnspecified() || isLocalIP(addr) {
c.Next()
return
}
}
if allowedHost(host) {
if c.Request.Method == http.MethodOptions {
c.AbortWithStatus(http.StatusNoContent)
return
}
c.Next()
return
}
c.AbortWithStatus(http.StatusForbidden)
}
}
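// GenerateRoutes builds the Gin router with CORS and allowed-host middleware
// and registers the native API routes alongside the OpenAI-compatible ones.
// When rc is non-nil, the router is wrapped by the new local registry
// implementation, which falls back to it for routes it does not handle.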
func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
corsConfig := cors.DefaultConfig()
corsConfig.AllowWildcard = true
corsConfig.AllowBrowserExtensions = true
corsConfig.AllowHeaders = []string{
"Authorization",
"Content-Type",
"User-Agent",
"Accept",
"X-Requested-With",
// OpenAI compatibility headers
"OpenAI-Beta",
"x-stainless-arch",
"x-stainless-async",
"x-stainless-custom-poll-interval",
"x-stainless-helper-method",
"x-stainless-lang",
"x-stainless-os",
"x-stainless-package-version",
"x-stainless-poll-helper",
"x-stainless-retry-count",
"x-stainless-runtime",
"x-stainless-runtime-version",
"x-stainless-timeout",
}
corsConfig.AllowOrigins = envconfig.AllowedOrigins()
r := gin.Default()
r.HandleMethodNotAllowed = true
r.Use(
cors.New(corsConfig),
allowedHostsMiddleware(s.addr),
)
// General
r.HEAD("/", func(c *gin.Context) { c.String(http.StatusOK, "Ollama is running") })
r.GET("/", func(c *gin.Context) { c.String(http.StatusOK, "Ollama is running") })
r.HEAD("/api/version", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"version": version.Version}) })
r.GET("/api/version", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"version": version.Version}) })
// Local model cache management (new implementation is at end of function)
r.POST("/api/pull", s.PullHandler)
r.POST("/api/push", s.PushHandler)
r.HEAD("/api/tags", s.ListHandler)
r.GET("/api/tags", s.ListHandler)
r.POST("/api/show", s.ShowHandler)
r.DELETE("/api/delete", s.DeleteHandler)
r.POST("/api/me", s.WhoamiHandler)
r.POST("/api/signout", s.SignoutHandler)
// deprecated
r.DELETE("/api/user/keys/:encodedKey", s.SignoutHandler)
// Create
r.POST("/api/create", s.CreateHandler)
r.POST("/api/blobs/:digest", s.CreateBlobHandler)
r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
r.POST("/api/copy", s.CopyHandler)
// Inference
r.GET("/api/ps", s.PsHandler)
r.POST("/api/generate", s.GenerateHandler)
r.POST("/api/chat", s.ChatHandler)
r.POST("/api/embed", s.EmbedHandler)
r.POST("/api/embeddings", s.EmbeddingsHandler)
// Inference (OpenAI compatibility)
r.POST("/v1/chat/completions", middleware.ChatMiddleware(), s.ChatHandler)
r.POST("/v1/completions", middleware.CompletionsMiddleware(), s.GenerateHandler)
r.POST("/v1/embeddings", middleware.EmbeddingsMiddleware(), s.EmbedHandler)
r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
if rc != nil {
// wrap old with new
rs := &registry.Local{
Client: rc,
Logger: slog.Default(), // TODO(bmizerany): Take a logger, do not use slog.Default()
Fallback: r,
Prune: PruneLayers,
}
return rs, nil
}
return r, nil
}
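// Serve is the main entry point for the HTTP server. It prunes unused layers
// and manifests, builds the routes, starts the scheduler, installs signal
// handlers for graceful shutdown, logs detected GPUs, and then serves on ln
// until the listener is closed.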
func Serve(ln net.Listener) error {
slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
slog.Info("server config", "env", envconfig.Values())
blobsDir, err := GetBlobsPath("")
if err != nil {
return err
}
if err := fixBlobs(blobsDir); err != nil {
return err
}
if !envconfig.NoPrune() {
if _, err := Manifests(false); err != nil {
slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
} else {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
}
manifestsPath, err := GetManifestPath()
if err != nil {
return err
}
if err := PruneDirectory(manifestsPath); err != nil {
return err
}
}
}
s := &Server{addr: ln.Addr()}
var rc *ollama.Registry
if useClient2 {
var err error
rc, err = ollama.DefaultRegistry()
if err != nil {
return err
}
}
h, err := s.GenerateRoutes(rc)
if err != nil {
return err
}
http.Handle("/", h)
ctx, done := context.WithCancel(context.Background())
schedCtx, schedDone := context.WithCancel(ctx)
sched := InitScheduler(schedCtx)
s.sched = sched
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
srvr := &http.Server{
// Use http.DefaultServeMux so we get net/http/pprof for
// free.
//
// TODO(bmizerany): Decide if we want to make this
// configurable so it is not exposed by default, or allow
// users to bind it to a different port. This was a quick
// and easy way to get pprof, but it may not be the best
// way.
Handler: nil,
}
// listen for a ctrl+c and stop any loaded llm
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-signals
srvr.Close()
schedDone()
sched.unloadAllRunners()
done()
}()
s.sched.Run(schedCtx)
// register the experimental webp decoder
// so webp images can be used in multimodal inputs
image.RegisterFormat("webp", "RIFF????WEBP", webp.Decode, webp.DecodeConfig)
// At startup, retrieve GPU information so that any problems with detected
// GPUs are logged before a model is loaded.
gpus := discover.GPUDevices(ctx, nil)
discover.LogDetails(gpus)
var totalVRAM uint64
for _, gpu := range gpus {
totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
}
if totalVRAM < lowVRAMThreshold {
s.lowVRAM = true
slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
}
err = srvr.Serve(ln)
// If the server was closed from the signal handler, wait for ctx to be done;
// otherwise error out quickly.
if !errors.Is(err, http.ErrServerClosed) {
return err
}
<-ctx.Done()
return nil
}
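// waitForStream drains a streaming channel and replies with a single JSON
// object: the last progress update on success, or the first error received.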
func waitForStream(c *gin.Context, ch chan any) {
c.Header("Content-Type", "application/json")
var latest api.ProgressResponse
for resp := range ch {
switch r := resp.(type) {
case api.ProgressResponse:
latest = r
case gin.H:
status, ok := r["status"].(int)
if !ok {
status = http.StatusInternalServerError
}
errorMsg, ok := r["error"].(string)
if !ok {
errorMsg = "unknown error"
}
c.JSON(status, gin.H{"error": errorMsg})
return
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unknown message type"})
return
}
}
c.JSON(http.StatusOK, latest)
}
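// streamResponse writes each value received from ch to the client as a
// newline-delimited JSON chunk, converting gin.H error values into an error
// response (with a status code when nothing has been written yet).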
func streamResponse(c *gin.Context, ch chan any) {
c.Header("Content-Type", "application/x-ndjson")
c.Stream(func(w io.Writer) bool {
val, ok := <-ch
if !ok {
return false
}
// errors are provided as a gin.H with an "error" field and
// an optional "status" field. For errors that are streamed
// before any content, we need to set the status code and
// content type for the error.
if h, ok := val.(gin.H); ok {
if e, ok := h["error"].(string); ok {
status, ok := h["status"].(int)
if !ok {
status = http.StatusInternalServerError
}
if !c.Writer.Written() {
c.Header("Content-Type", "application/json")
c.JSON(status, gin.H{"error": e})
} else {
if err := json.NewEncoder(c.Writer).Encode(gin.H{"error": e}); err != nil {
slog.Error("streamResponse failed to encode json error", "error", err)
}
}
return false
}
}
bts, err := json.Marshal(val)
if err != nil {
slog.Info(fmt.Sprintf("streamResponse: json.Marshal failed with %s", err))
return false
}
// Delineate chunks with new-line delimiter
bts = append(bts, '\n')
if _, err := w.Write(bts); err != nil {
slog.Info(fmt.Sprintf("streamResponse: w.Write failed with %s", err))
return false
}
return true
})
}
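// WhoamiHandler handles POST /api/me. It asks ollama.com which account the
// local key is signed in as and, if the user is not signed in, responds with
// a sign-in URL.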
func (s *Server) WhoamiHandler(c *gin.Context) {
// TODO: allow other hosts
u, err := url.Parse("https://ollama.com")
if err != nil {
slog.Error(err.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": "URL parse error"})
return
}
client := api.NewClient(u, http.DefaultClient)
user, err := client.Whoami(c)
if err != nil {
slog.Error(err.Error())
}
// user isn't signed in
if user != nil && user.Name == "" {
sURL, sErr := signinURL()
if sErr != nil {
slog.Error(sErr.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": "error getting authorization details"})
return
}
c.JSON(http.StatusUnauthorized, gin.H{"error": "unauthorized", "signin_url": sURL})
return
}
c.JSON(http.StatusOK, user)
}
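// SignoutHandler handles POST /api/signout. It disconnects this machine's
// public key from the ollama.com account it is associated with.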
func (s *Server) SignoutHandler(c *gin.Context) {
pubKey, err := auth.GetPublicKey()
if err != nil {
slog.Error("couldn't get public key", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "there was an error signing out"})
return
}
encKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
// TODO: allow other hosts
u, err := url.Parse("https://ollama.com")
if err != nil {
slog.Error(err.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": "URL parse error"})
return
}
client := api.NewClient(u, http.DefaultClient)
err = client.Disconnect(c, encKey)
if err != nil {
var authError api.AuthorizationError
if errors.As(err, &authError) {
c.JSON(http.StatusUnauthorized, gin.H{"error": "you are not currently signed in"})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": "there was an error signing out"})
return
}
c.JSON(http.StatusOK, nil)
}
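// PsHandler handles GET /api/ps. It lists the models currently loaded by the
// scheduler along with their memory usage and expiration times, sorted so the
// longest-lived entries come first.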
func (s *Server) PsHandler(c *gin.Context) {
models := []api.ProcessModelResponse{}
for _, v := range s.sched.loaded {
model := v.model
modelDetails := api.ModelDetails{
Format: model.Config.ModelFormat,
Family: model.Config.ModelFamily,
Families: model.Config.ModelFamilies,
ParameterSize: model.Config.ModelType,
QuantizationLevel: model.Config.FileType,
}
mr := api.ProcessModelResponse{
Model: model.ShortName,
Name: model.ShortName,
Size: int64(v.totalSize),
SizeVRAM: int64(v.vramSize),
Digest: model.Digest,
Details: modelDetails,
ExpiresAt: v.expiresAt,
}
if v.Options != nil {
mr.ContextLength = v.Options.NumCtx
}
// The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just
// calculate the time w/ the sessionDuration instead.
var epoch time.Time
if v.expiresAt == epoch {
mr.ExpiresAt = time.Now().Add(v.sessionDuration)
}
models = append(models, mr)
}
slices.SortStableFunc(models, func(i, j api.ProcessModelResponse) int {
// longest duration remaining listed first
return cmp.Compare(j.ExpiresAt.Unix(), i.ExpiresAt.Unix())
})
c.JSON(http.StatusOK, api.ProcessResponse{Models: models})
}
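// toolCallId returns a pseudo-random identifier of the form "call_xxxxxxxx"
// used to tag tool calls in chat responses.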
func toolCallId() string {
const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
b := make([]byte, 8)
for i := range b {
b[i] = letterBytes[rand.Intn(len(letterBytes))]
}
return "call_" + strings.ToLower(string(b))
}
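// ChatHandler handles POST /api/chat (and backs the OpenAI-compatible
// /v1/chat/completions route). It validates the request, forwards remote
// models to their configured host, schedules a local runner, renders the chat
// prompt, and streams completion chunks back to the client while running
// thinking and tool-call parsing.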
func (s *Server) ChatHandler(c *gin.Context) {
checkpointStart := time.Now()
var req api.ChatRequest
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if req.TopLogprobs < 0 || req.TopLogprobs > 20 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "top_logprobs must be between 0 and 20"})
return
}
name := model.ParseName(req.Model)
if !name.IsValid() {
c.JSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
name, err := getExistingName(name)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
m, err := GetModel(req.Model)
if err != nil {
switch {
case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
case err.Error() == errtypes.InvalidModelNameErrMsg:
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
// expire the runner
if len(req.Messages) == 0 && req.KeepAlive != nil && req.KeepAlive.Duration == 0 {
s.sched.expireRunner(m)
c.JSON(http.StatusOK, api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Message: api.Message{Role: "assistant"},
Done: true,
DoneReason: "unload",
})
return
}
if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
origModel := req.Model
remoteURL, err := url.Parse(m.Config.RemoteHost)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if !slices.Contains(envconfig.Remotes(), remoteURL.Hostname()) {
slog.Info("remote model", "remotes", envconfig.Remotes(), "remoteURL", m.Config.RemoteHost, "hostname", remoteURL.Hostname())
c.JSON(http.StatusBadRequest, gin.H{"error": "this server cannot run this remote model"})
return
}
req.Model = m.Config.RemoteModel
if req.Options == nil {
req.Options = map[string]any{}
}
var msgs []api.Message
if len(req.Messages) > 0 {
msgs = append(m.Messages, req.Messages...)
if req.Messages[0].Role != "system" && m.System != "" {
msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
}
}
msgs = filterThinkTags(msgs, m)
req.Messages = msgs
for k, v := range m.Options {
if _, ok := req.Options[k]; !ok {
req.Options[k] = v
}
}
fn := func(resp api.ChatResponse) error {
resp.Model = origModel
resp.RemoteModel = m.Config.RemoteModel
resp.RemoteHost = m.Config.RemoteHost
data, err := json.Marshal(resp)
if err != nil {
return err
}
if _, err = c.Writer.Write(append(data, '\n')); err != nil {
return err
}
c.Writer.Flush()
return nil
}
client := api.NewClient(remoteURL, http.DefaultClient)
err = client.Chat(c, &req, fn)
if err != nil {
var authError api.AuthorizationError
if errors.As(err, &authError) {
sURL, sErr := signinURL()
if sErr != nil {
slog.Error(sErr.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": "error getting authorization details"})
return
}
c.JSON(authError.StatusCode, gin.H{"error": "unauthorized", "signin_url": sURL})
return
}
var apiError api.StatusError
if errors.As(err, &apiError) {
c.JSON(apiError.StatusCode, apiError)
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
contentType := "application/json; charset=utf-8"
if req.Stream != nil && *req.Stream {
contentType = "application/x-ndjson"
}
c.Header("Content-Type", contentType)
return
}
caps := []model.Capability{model.CapabilityCompletion}
if len(req.Tools) > 0 {
caps = append(caps, model.CapabilityTools)
}
modelCaps := m.Capabilities()
if slices.Contains(modelCaps, model.CapabilityThinking) {
caps = append(caps, model.CapabilityThinking)
if req.Think == nil {
req.Think = &api.ThinkValue{Value: true}
}
} else {
if req.Think != nil && req.Think.Bool() {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
return
}
}
r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
if errors.Is(err, errCapabilityCompletion) {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support chat", req.Model)})
return
} else if err != nil {
handleScheduleError(c, req.Model, err)
return
}
checkpointLoaded := time.Now()
if len(req.Messages) == 0 {
c.JSON(http.StatusOK, api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Message: api.Message{Role: "assistant"},
Done: true,
DoneReason: "load",
})
return
}
msgs := append(m.Messages, req.Messages...)
if req.Messages[0].Role != "system" && m.System != "" {
msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
}
msgs = filterThinkTags(msgs, m)
if shouldUseHarmony(m) && m.Config.Parser == "" {
m.Config.Parser = "harmony"
}
var builtinParser parsers.Parser
processedTools := req.Tools
if m.Config.Parser != "" {
builtinParser = parsers.ParserForName(m.Config.Parser)
if builtinParser != nil {
// Determine last message for chat prefill
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
// Initialize parser and get processed tools
processedTools = builtinParser.Init(req.Tools, lastMessage, req.Think)
}
}
truncate := req.Truncate == nil || *req.Truncate
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think, truncate)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: &api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
// Validate Think value: string values currently only allowed for harmony/gptoss models
if req.Think != nil && req.Think.IsString() && m.Config.Parser != "harmony" {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var thinkingState *thinking.Parser
openingTag, closingTag := thinking.InferTags(m.Template.Template)
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
thinkingState = &thinking.Parser{
OpeningTag: openingTag,
ClosingTag: closingTag,
}
if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
thinkingState.AddContent(openingTag)
}
}
var toolParser *tools.Parser
if len(req.Tools) > 0 && (builtinParser == nil || !builtinParser.HasToolSupport()) {
toolParser = tools.NewParser(m.Template.Template, req.Tools)
}
type structuredOutputsState int
const (
structuredOutputsState_None structuredOutputsState = iota
structuredOutputsState_ReadyToApply
structuredOutputsState_Applying
)
ch := make(chan any)
go func() {
defer close(ch)
structuredOutputsState := structuredOutputsState_None
for {
var tb strings.Builder
currentFormat := req.Format
// structured outputs via double request is enabled when:
// 1. the model supports the thinking capability and
// 2. it uses a built-in parser or our generic thinking parser
// Note that the current approach does not work for (potential future)
// non-thinking models that emit anything before actual content. This
// current approach uses the transition from parsed thinking content to
// parsed non-thinking content as the signal to turn constraining on
if req.Format != nil && structuredOutputsState == structuredOutputsState_None && ((builtinParser != nil || thinkingState != nil) && slices.Contains(m.Capabilities(), model.CapabilityThinking)) {
currentFormat = nil
}
// set up a new context for this completion attempt, derived from the incoming request's context
ctx, cancel := context.WithCancel(c.Request.Context())
err := r.Completion(ctx, llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: currentFormat,
Options: opts,
Shift: req.Shift == nil || *req.Shift,
Truncate: truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
}, func(r llm.CompletionResponse) {
res := api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Message: api.Message{Role: "assistant", Content: r.Content},
Done: r.Done,
Metrics: api.Metrics{
PromptEvalCount: r.PromptEvalCount,
PromptEvalDuration: r.PromptEvalDuration,
EvalCount: r.EvalCount,
EvalDuration: r.EvalDuration,
},
Logprobs: toAPILogprobs(r.Logprobs),
}
if r.Done {
res.DoneReason = r.DoneReason.String()
res.TotalDuration = time.Since(checkpointStart)
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
}
if builtinParser != nil {
slog.Log(context.TODO(), logutil.LevelTrace, "builtin parser input", "parser", m.Config.Parser, "content", r.Content)
content, thinking, toolCalls, err := builtinParser.Add(r.Content, r.Done)
if err != nil {
ch <- gin.H{"error": err.Error()}
return
}
res.Message.Content = content
res.Message.Thinking = thinking
for i := range toolCalls {
toolCalls[i].ID = toolCallId()
}
res.Message.ToolCalls = toolCalls
tb.WriteString(thinking)
// we are now receiving content from the model - we should start applying structured outputs
if structuredOutputsState == structuredOutputsState_None && req.Format != nil && tb.String() != "" && res.Message.Content != "" {
structuredOutputsState = structuredOutputsState_ReadyToApply
cancel()
return
}
if res.Message.Content != "" || res.Message.Thinking != "" || len(res.Message.ToolCalls) > 0 || r.Done {
slog.Log(context.TODO(), logutil.LevelTrace, "builtin parser output", "parser", m.Config.Parser, "content", content, "thinking", thinking, "toolCalls", toolCalls, "done", r.Done)
ch <- res
} else {
slog.Log(context.TODO(), logutil.LevelTrace, "builtin parser empty output", "parser", m.Config.Parser)
}
return
}
if thinkingState != nil {
thinkingContent, remainingContent := thinkingState.AddContent(res.Message.Content)
if thinkingContent == "" && remainingContent == "" && !r.Done {
// need to accumulate more to decide what to send
return
}
res.Message.Thinking = thinkingContent
tb.WriteString(thinkingContent)
// emit the collected thinking text before restarting with structured outputs and clear unstructured content
// to avoid leaking mixed tokens like "</think>Hello"
if structuredOutputsState == structuredOutputsState_None && req.Format != nil && tb.String() != "" && remainingContent != "" {
structuredOutputsState = structuredOutputsState_ReadyToApply
res.Message.Content = ""
ch <- res
cancel()
return
}
res.Message.Content = remainingContent
}
if len(req.Tools) > 0 {
toolCalls, content := toolParser.Add(res.Message.Content)
if len(content) > 0 {
res.Message.Content = content
} else if len(toolCalls) > 0 {
for i := range toolCalls {
toolCalls[i].ID = toolCallId()
}
res.Message.ToolCalls = toolCalls
res.Message.Content = ""
} else if res.Message.Thinking != "" {
// don't return
} else {
if r.Done {
res.Message.Content = toolParser.Content()
ch <- res
}
return
}
}
ch <- res
})
if err != nil {
if structuredOutputsState == structuredOutputsState_ReadyToApply && strings.Contains(err.Error(), "context canceled") && c.Request.Context().Err() == nil {
// only ignore the error when it is a context cancellation caused by switching to structured outputs
} else {
var serr api.StatusError
if errors.As(err, &serr) {
ch <- gin.H{"error": serr.ErrorMessage, "status": serr.StatusCode}
} else {
ch <- gin.H{"error": err.Error()}
}
return
}
}
// an ignored structured-outputs cancellation falls through to here; start a new request with the structured outputs format and the updated prompt
if structuredOutputsState == structuredOutputsState_ReadyToApply {
structuredOutputsState = structuredOutputsState_Applying
msg := api.Message{
Role: "assistant",
Thinking: tb.String(),
}
msgs = append(msgs, msg)
prompt, _, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think, truncate)
if err != nil {
slog.Error("chat prompt error applying structured outputs", "error", err)
ch <- gin.H{"error": err.Error()}
return
}
// force constraining by terminating the thinking header; the parser is already in this state.
// When the last message is thinking, the renderer for gpt-oss cannot disambiguate between having the
// model continue thinking or end thinking and output the final message.
// TODO(parthsareen): consider adding prefill disambiguation logic to the renderer for structured outputs.
if shouldUseHarmony(m) || (builtinParser != nil && m.Config.Parser == "harmony") {
prompt += "<|end|><|start|>assistant<|channel|>final<|message|>"
}
continue
}
break
}
}()
if req.Stream != nil && !*req.Stream {
var resp api.ChatResponse
var toolCalls []api.ToolCall
var allLogprobs []api.Logprob
var sbThinking strings.Builder
var sbContent strings.Builder
for rr := range ch {
switch t := rr.(type) {
case api.ChatResponse:
sbThinking.WriteString(t.Message.Thinking)
sbContent.WriteString(t.Message.Content)
resp = t
if len(req.Tools) > 0 {
toolCalls = append(toolCalls, t.Message.ToolCalls...)
}
// Accumulate logprobs from all chunks for non-streaming response
if len(t.Logprobs) > 0 {
allLogprobs = append(allLogprobs, t.Logprobs...)
}
case gin.H:
msg, ok := t["error"].(string)
if !ok {
msg = "unexpected error format in response"
}
status, ok := t["status"].(int)
if !ok {
status = http.StatusInternalServerError
}
c.JSON(status, gin.H{"error": msg})
return
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
return
}
}
resp.Message.Content = sbContent.String()
resp.Message.Thinking = sbThinking.String()
resp.Logprobs = allLogprobs
if len(toolCalls) > 0 {
resp.Message.ToolCalls = toolCalls
}
c.JSON(http.StatusOK, resp)
return
}
streamResponse(c, ch)
}
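// handleScheduleError maps scheduler errors for the named model to the
// appropriate HTTP status code.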
func handleScheduleError(c *gin.Context, name string, err error) {
switch {
case errors.Is(err, errCapabilities), errors.Is(err, errRequired):
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
case errors.Is(err, context.Canceled):
c.JSON(499, gin.H{"error": "request canceled"})
case errors.Is(err, ErrMaxQueue):
c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()})
case errors.Is(err, os.ErrNotExist):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found, try pulling it first", name)})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
}
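// filterThinkTags strips <think>...</think> content from assistant messages
// that precede the most recent user message for qwen3 and deepseek-r1 models.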
func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
finalUserIndex := -1
for i, msg := range msgs {
if msg.Role == "user" {
finalUserIndex = i
}
}
for i, msg := range msgs {
if msg.Role == "assistant" && i < finalUserIndex {
// TODO(drifkin): this is from before we added proper thinking support.
// However, even if thinking is not enabled (and therefore we shouldn't
// change the user output), we should probably perform this filtering
// for all thinking models (not just qwen3 & deepseek-r1) since it tends
// to save tokens and improve quality.
thinkingState := &thinking.Parser{
OpeningTag: "<think>",
ClosingTag: "</think>",
}
_, content := thinkingState.AddContent(msg.Content)
msgs[i].Content = content
}
}
}
return msgs
}