diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 12f361408..f423106e7 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -432,6 +432,22 @@ jobs:
         docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
       working-directory: ${{ runner.temp }}
 
+  # Trigger downstream release process
+  trigger:
+    runs-on: ubuntu-latest
+    environment: release
+    needs: [darwin-build, windows-build, windows-depends]
+    steps:
+      - name: Trigger downstream release process
+        run: |
+          curl -L \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
+            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"
+
   # Aggregate all the assets and ship a release
   release:
     needs: [darwin-sign, windows-sign, linux-build]
diff --git a/.golangci.yaml b/.golangci.yaml
index 9bb9786a8..9d6705bd3 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -19,8 +19,8 @@ linters:
     - nolintlint
     - nosprintfhostport
     - staticcheck
-    - tenv
     - unconvert
+    - usetesting
     - wastedassign
     - whitespace
   disable:
diff --git a/Makefile.sync b/Makefile.sync
index 949ade809..711667c98 100644
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac
+FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618
 
 .PHONY: help
 help:
@@ -15,11 +15,13 @@ help:
	@echo "  make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 
 .PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml
+sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
 
-.PHONY: llama/build-info.cpp
-llama/build-info.cpp: llama/build-info.cpp.in
-	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
+llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
+
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
+	go generate ./$(@D)
 
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/
@@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
 
 PATCHES=$(wildcard llama/patches/*.patch)
+PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
 
 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(addsuffix ed, $(PATCHES))
+apply-patches: $(PATCHED)
 
-%.patched: %.patch
+llama/patches/.%.patched: llama/patches/%.patch
	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
 
 .PHONY: checkout
@@ -57,4 +60,4 @@ format-patches: llama/patches
 
 .PHONE: clean
 clean: checkout
-	$(RM) $(addsuffix ed, $(PATCHES))
+	$(RM) llama/patches/.*.patched
diff --git a/README.md b/README.md
index b88c9e74c..6a4815c1e 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,8 @@ Here are some example models that can be downloaded:
 | QwQ | 32B | 20GB | `ollama run qwq` |
 | DeepSeek-R1 | 7B | 4.7GB | `ollama run deepseek-r1` |
 | DeepSeek-R1 | 671B | 404GB | `ollama run deepseek-r1:671b` |
+| Llama 4 | 109B | 67GB | `ollama run llama4:scout` |
+| Llama 4 | 400B | 245GB | `ollama run llama4:maverick` |
 | Llama 3.3 | 70B | 43GB | `ollama run llama3.3` |
 | Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
 | Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
@@ -77,7 +79,7 @@ Here are some example models that can be downloaded:
 | Code Llama | 7B | 3.8GB | `ollama run codellama` |
 | Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
 | LLaVA | 7B | 4.5GB | `ollama run llava` |
-| Granite-3.2 | 8B | 4.9GB | `ollama run granite3.2` |
+| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` |
 
 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -312,6 +314,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
+- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira integration to generate issues, tasks, and epics)
+- [ojira](https://github.com/AliAhmedNada/ojira) (Jira Chrome plugin to easily generate descriptions for tasks)
 - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -394,12 +398,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool)
 - [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
+- [Flufy](https://github.com/Aharon-Bensadoun/Flufy) (A beautiful chat interface for interacting with Ollama's API. Built with React, TypeScript, and Material-UI.)
 - [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance)
 - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
 - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
+- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama, available on PyPI)
 
 ### Cloud
 
@@ -469,7 +475,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Libraries
 
-- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
+- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
 - [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration)
@@ -522,6 +528,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
+- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
 
 ### Mobile
 
@@ -578,6 +585,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
+- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
 
 ### Supported backends
 
diff --git a/api/client_test.go b/api/client_test.go
index fe9a15899..2ceeec9cf 100644
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -1,7 +1,6 @@
 package api
 
 import (
-	"context"
	"encoding/json"
	"fmt"
	"net/http"
@@ -137,7 +136,7 @@ func TestClientStream(t *testing.T) {
			client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)
 
			var receivedChunks []ChatResponse
-			err := client.stream(context.Background(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
+			err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
				var resp ChatResponse
				if err := json.Unmarshal(chunk, &resp); err != nil {
					return fmt.Errorf("failed to unmarshal chunk: %w", err)
				}
@@ -223,7 +222,7 @@ func TestClientDo(t *testing.T) {
				ID      string `json:"id"`
				Success bool   `json:"success"`
			}
-			err := client.do(context.Background(), http.MethodPost, "/v1/messages", nil, &resp)
+			err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp)
 
			if tc.wantErr != "" {
				if err == nil {
diff --git a/api/types.go b/api/types.go
index 7d8b6e532..602f93da8 100644
--- a/api/types.go
+++ b/api/types.go
@@ -271,9 +271,6 @@ type Options struct {
	RepeatPenalty    float32  `json:"repeat_penalty,omitempty"`
	PresencePenalty  float32  `json:"presence_penalty,omitempty"`
	FrequencyPenalty float32  `json:"frequency_penalty,omitempty"`
-	Mirostat         int      `json:"mirostat,omitempty"`
-	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
-	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
	Stop             []string `json:"stop,omitempty"`
 }
 
@@ -283,12 +280,7 @@ type Runner struct {
	NumBatch  int   `json:"num_batch,omitempty"`
	NumGPU    int   `json:"num_gpu,omitempty"`
	MainGPU   int   `json:"main_gpu,omitempty"`
-	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-	LogitsAll bool  `json:"logits_all,omitempty"`
-	VocabOnly bool  `json:"vocab_only,omitempty"`
	UseMMap   *bool `json:"use_mmap,omitempty"`
-	UseMLock  bool  `json:"use_mlock,omitempty"`
	NumThread int   `json:"num_thread,omitempty"`
 }
 
@@ -471,13 +463,6 @@ type ProcessModelResponse struct {
	SizeVRAM  int64  `json:"size_vram"`
 }
 
-type RetrieveModelResponse struct {
-	Id      string `json:"id"`
-	Object  string `json:"object"`
-	Created int64  `json:"created"`
-	OwnedBy string `json:"owned_by"`
-}
-
 type TokenResponse struct {
	Token string `json:"token"`
 }
@@ -660,9 +645,6 @@ func DefaultOptions() Options {
		RepeatPenalty:    1.1,
		PresencePenalty:  0.0,
		FrequencyPenalty: 0.0,
-		Mirostat:         0,
-		MirostatTau:      5.0,
-		MirostatEta:      0.1,
		Seed:             -1,
 
		Runner: Runner{
@@ -671,8 +653,6 @@ func DefaultOptions() Options {
			NumBatch:  512,
			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
			NumThread: 0,  // let the runtime decide
-			LowVRAM:   false,
-			UseMLock:  false,
			UseMMap:   nil,
		},
	}
 }
diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go
index 9985fc3f8..22e3de194 100644
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -4,20 +4,14 @@ import (
	"fmt"
	"log/slog"
	"os"
-	"path/filepath"
	"strconv"
	"strings"
 
	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/logutil"
 )
 
 func InitLogging() {
-	level := slog.LevelInfo
-
-	if envconfig.Debug() {
-		level = slog.LevelDebug
-	}
-
	var logFile *os.File
	var err error
	// Detect if we're a GUI app on windows, and if not, send logs to console
@@ -33,20 +27,8 @@ func InitLogging() {
			return
		}
	}
-	handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-			return attr
-		},
-	})
-
-	slog.SetDefault(slog.New(handler))
+	slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 
	slog.Info("ollama app started")
 }
diff --git a/benchmark/server_benchmark_test.go b/benchmark/server_benchmark_test.go
index 672b8b173..4a3c46cda 100644
--- a/benchmark/server_benchmark_test.go
+++ b/benchmark/server_benchmark_test.go
@@ -78,7 +78,7 @@ func BenchmarkColdStart(b *testing.B) {
 
	for _, tt := range tests {
		b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
-			ctx := context.Background()
+			ctx := b.Context()
 
			// Set number of tokens as our throughput metric
			b.SetBytes(int64(tt.maxTokens))
@@ -113,7 +113,7 @@ func BenchmarkWarmStart(b *testing.B) {
 
	for _, tt := range tests {
		b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
-			ctx := context.Background()
+			ctx := b.Context()
 
			// Pre-warm the model
			warmup(client, m, tt.prompt, b)
@@ -140,7 +140,7 @@ func setup(b *testing.B) *api.Client {
	if err != nil {
		b.Fatal(err)
	}
-	if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
+	if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
		b.Fatalf("Model unavailable: %v", err)
	}
 
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 79ff87ac8..ad4be7f91 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -31,6 +31,7 @@ import (
	"github.com/olekukonko/tablewriter"
	"github.com/spf13/cobra"
	"golang.org/x/crypto/ssh"
+	"golang.org/x/sync/errgroup"
	"golang.org/x/term"
 
	"github.com/ollama/ollama/api"
@@ -41,6 +42,7 @@ import (
	"github.com/ollama/ollama/runner"
	"github.com/ollama/ollama/server"
	"github.com/ollama/ollama/types/model"
+	"github.com/ollama/ollama/types/syncmap"
	"github.com/ollama/ollama/version"
 )
 
@@ -106,7 +108,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	}
	spinner.Stop()
 
-	req.Name = args[0]
+	req.Model = args[0]
	quantize, _ := cmd.Flags().GetString("quantize")
	if quantize != "" {
		req.Quantize = quantize
@@ -117,34 +119,54 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
		return err
	}
 
-	if len(req.Files) > 0 {
-		fileMap := map[string]string{}
-		for f, digest := range req.Files {
+	var g errgroup.Group
+	g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))
+
+	files := syncmap.NewSyncMap[string, string]()
+	for f, digest := range req.Files {
+		g.Go(func() error {
			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
				return err
			}
-			fileMap[filepath.Base(f)] = digest
-		}
-		req.Files = fileMap
+
+			// TODO: this is incorrect since the file might be in a subdirectory
+			// instead this should take the path relative to the model directory
+			// but the current implementation does not allow this
+			files.Store(filepath.Base(f), digest)
+			return nil
+		})
	}
 
-	if len(req.Adapters) > 0 {
-		fileMap := map[string]string{}
-		for f, digest := range req.Adapters {
+	adapters := syncmap.NewSyncMap[string, string]()
+	for f, digest := range req.Adapters {
+		g.Go(func() error {
			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
				return err
			}
-			fileMap[filepath.Base(f)] = digest
-		}
-		req.Adapters = fileMap
+
+			// TODO: same here
+			adapters.Store(filepath.Base(f), digest)
+			return nil
+		})
	}
 
+	if err := g.Wait(); err != nil {
+		return err
+	}
+
+	req.Files = files.Items()
+	req.Adapters = adapters.Items()
+
	bars := make(map[string]*progress.Bar)
	fn := func(resp api.ProgressResponse) error {
		if resp.Digest != "" {
			bar, ok := bars[resp.Digest]
			if !ok {
-				bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
+				msg := resp.Status
+				if msg == "" {
+					msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19])
+				}
+				bar = progress.NewBar(msg, resp.Total, resp.Completed)
				bars[resp.Digest] = bar
				p.Add(resp.Digest, bar)
			}
@@ -213,7 +235,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, digest stri
		}
	}()
 
-	if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
+	if err := client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
		return "", err
	}
	return digest, nil
@@ -725,11 +747,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
		case float64:
			v = fmt.Sprintf("%g", vData)
		case []any:
-			n := 3
-			if len(vData) < n {
-				n = len(vData)
+			targetWidth := 10 // Small width where we are displaying the data in a column
+
+			var itemsToShow int
+			totalWidth := 1 // Start with 1 for opening bracket
+
+			// Find how many we can fit
+			for i := range vData {
+				itemStr := fmt.Sprintf("%v", vData[i])
+				width := runewidth.StringWidth(itemStr)
+
+				// Add separator width (", ") for all items except the first
+				if i > 0 {
+					width += 2
+				}
+
+				// Check if adding this item would exceed our width limit
+				if totalWidth+width > targetWidth && i > 0 {
+					break
+				}
+
+				totalWidth += width
+				itemsToShow++
+			}
+
+			// Format the output
+			if itemsToShow < len(vData) {
+				v = fmt.Sprintf("%v", vData[:itemsToShow])
+				v = strings.TrimSuffix(v, "]")
+				v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
+			} else {
+				v = fmt.Sprintf("%v", vData)
			}
-			v = fmt.Sprintf("%v", vData[:n])
		default:
			v = fmt.Sprintf("%T", vData)
		}
@@ -750,10 +799,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 
	head := func(s string, n int) (rows [][]string) {
		scanner := bufio.NewScanner(strings.NewReader(s))
-		for scanner.Scan() && (len(rows) < n || n < 0) {
-			if text := scanner.Text(); text != "" {
-				rows = append(rows, []string{"", strings.TrimSpace(text)})
+		count := 0
+		for scanner.Scan() {
+			text := strings.TrimSpace(scanner.Text())
+			if text == "" {
+				continue
			}
+			count++
+			if n < 0 || count <= n {
+				rows = append(rows, []string{"", text})
+			}
+		}
+		if n >= 0 && count > n {
+			rows = append(rows, []string{"", "..."})
		}
		return
	}
@@ -1260,7 +1318,7 @@ func NewCLI() *cobra.Command {
	}
 
	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
-	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
+	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
 
	showCmd := &cobra.Command{
		Use:   "show MODEL",
diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go
index 367a35b6b..cf5fe7caa 100644
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -2,7 +2,6 @@ package cmd
 
 import (
	"bytes"
-	"context"
	"encoding/json"
	"io"
	"net/http"
@@ -226,6 +225,7 @@ Weigh anchor!
   System
     You are a pirate!
 
   Ahoy, matey!
+  ...
 
 `
	if diff := cmp.Diff(expect, b.String()); diff != "" {
@@ -337,7 +337,7 @@ func TestDeleteHandler(t *testing.T) {
	t.Cleanup(mockServer.Close)
 
	cmd := &cobra.Command{}
-	cmd.SetContext(context.TODO())
+	cmd.SetContext(t.Context())
	if err := DeleteHandler(cmd, []string{"test-model"}); err != nil {
		t.Fatalf("DeleteHandler failed: %v", err)
	}
@@ -399,11 +399,6 @@ func TestGetModelfileName(t *testing.T) {
 
			var expectedFilename string
			if tt.fileExists {
-				tempDir, err := os.MkdirTemp("", "modelfiledir")
-				defer os.RemoveAll(tempDir)
-				if err != nil {
-					t.Fatalf("temp modelfile dir creation failed: %v", err)
-				}
				var fn string
				if tt.modelfileName != "" {
					fn = tt.modelfileName
@@ -411,7 +406,7 @@ func TestGetModelfileName(t *testing.T) {
					fn = "Modelfile"
				}
 
-				tempFile, err := os.CreateTemp(tempDir, fn)
+				tempFile, err := os.CreateTemp(t.TempDir(), fn)
				if err != nil {
					t.Fatalf("temp modelfile creation failed: %v", err)
				}
@@ -530,7 +525,7 @@ func TestPushHandler(t *testing.T) {
			cmd := &cobra.Command{}
			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(context.TODO())
+			cmd.SetContext(t.Context())
 
			// Redirect stderr to capture progress output
			oldStderr := os.Stderr
@@ -635,7 +630,7 @@ func TestListHandler(t *testing.T) {
			t.Setenv("OLLAMA_HOST", mockServer.URL)
 
			cmd := &cobra.Command{}
-			cmd.SetContext(context.TODO())
+			cmd.SetContext(t.Context())
 
			// Capture stdout
			oldStdout := os.Stdout
@@ -690,7 +685,7 @@ func TestCreateHandler(t *testing.T) {
			return
		}
 
-		if req.Name != "test-model" {
+		if req.Model != "test-model" {
			t.Errorf("expected model name 'test-model', got %s", req.Name)
		}
 
@@ -730,7 +725,7 @@ func TestCreateHandler(t *testing.T) {
	}))
	t.Setenv("OLLAMA_HOST", mockServer.URL)
	t.Cleanup(mockServer.Close)
-	tempFile, err := os.CreateTemp("", "modelfile")
+	tempFile, err := os.CreateTemp(t.TempDir(), "modelfile")
	if err != nil {
		t.Fatal(err)
	}
@@ -750,7 +745,7 @@ func TestCreateHandler(t *testing.T) {
	}
 
	cmd.Flags().Bool("insecure", false, "")
-	cmd.SetContext(context.TODO())
+	cmd.SetContext(t.Context())
 
	// Redirect stderr to capture progress output
	oldStderr := os.Stderr
diff --git a/cmd/interactive.go b/cmd/interactive.go
index 82a3bfcbe..d7e6fbcfb 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
	fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
 
	if opts.MultiModal {
-		fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
+		fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
	}
 
	fmt.Fprintln(os.Stderr, "")
@@ -511,7 +511,7 @@ func extractFileNames(input string) []string {
	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
	// and followed by more characters and a file extension
	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
	re := regexp.MustCompile(regexPattern)
 
	return re.FindAllString(input, -1)
@@ -531,6 +531,8 @@ func extractFileData(input string) (string, []api.ImageData, error) {
			return "", imgs, err
		}
		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
+		input = strings.ReplaceAll(input, "'"+nfp+"'", "")
+		input = strings.ReplaceAll(input, "'"+fp+"'", "")
		input = strings.ReplaceAll(input, fp, "")
		imgs = append(imgs, data)
	}
@@ -551,7 +553,7 @@ func getImageData(filePath string) ([]byte, error) {
	}
 
	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
	if !slices.Contains(allowedTypes, contentType) {
		return nil, fmt.Errorf("invalid image type: %s", contentType)
	}
diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go
index 3f60448b6..809f53ff9 100644
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -1,6 +1,8 @@
 package cmd
 
 import (
+	"os"
+	"path/filepath"
	"testing"
 
	"github.com/stretchr/testify/assert"
@@ -10,14 +12,17 @@ func TestExtractFilenames(t *testing.T) {
	// Unix style paths
	input := ` some preamble ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
+/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
	res := extractFileNames(input)
-	assert.Len(t, res, 5)
+	assert.Len(t, res, 7)
	assert.Contains(t, res[0], "one.png")
	assert.Contains(t, res[1], "two.jpg")
	assert.Contains(t, res[2], "three.jpeg")
	assert.Contains(t, res[3], "four.png")
	assert.Contains(t, res[4], "five.JPG")
+	assert.Contains(t, res[5], "six.webp")
+	assert.Contains(t, res[6], "seven.WEBP")
	assert.NotContains(t, res[4], '"')
	assert.NotContains(t, res, "inbetween1")
	assert.NotContains(t, res, "./1.svg")
@@ -28,10 +33,12 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
-d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
+d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
+c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
+d:\path with\spaces\thirteen.WEBP some ending
 `
	res = extractFileNames(input)
-	assert.Len(t, res, 10)
+	assert.Len(t, res, 13)
	assert.NotContains(t, res, "inbetween2")
	assert.Contains(t, res[0], "one.png")
	assert.Contains(t, res[0], "c:")
@@ -49,4 +56,31 @@ d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
	assert.Contains(t, res[8], "d:")
	assert.Contains(t, res[9], "ten.PNG")
	assert.Contains(t, res[9], "E:")
+	assert.Contains(t, res[10], "eleven.webp")
+	assert.Contains(t, res[10], "c:")
+	assert.Contains(t, res[11], "twelve.WebP")
+	assert.Contains(t, res[11], "c:")
+	assert.Contains(t, res[12], "thirteen.WEBP")
+	assert.Contains(t, res[12], "d:")
+}
+
+// Ensure that file paths wrapped in single quotes are removed with the quotes.
+func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
+	dir := t.TempDir()
+	fp := filepath.Join(dir, "img.jpg")
+	data := make([]byte, 600)
+	copy(data, []byte{
+		0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
+		0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0xd9,
+	})
+	if err := os.WriteFile(fp, data, 0o600); err != nil {
+		t.Fatalf("failed to write test image: %v", err)
+	}
+
+	input := "before '" + fp + "' after"
+	cleaned, imgs, err := extractFileData(input)
+	assert.NoError(t, err)
+	assert.Len(t, imgs, 1)
+	assert.Equal(t, cleaned, "before after")
+}
diff --git a/convert/convert.go b/convert/convert.go
index ffcc2b8ab..4a6df66c7 100644
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -1,12 +1,13 @@
 package convert
 
 import (
+	"cmp"
	"encoding/json"
	"errors"
	"fmt"
-	"io"
	"io/fs"
	"log/slog"
+	"os"
	"slices"
	"strings"
 
@@ -14,13 +15,12 @@ import (
 )
 
 type ModelParameters struct {
-	Architectures []string       `json:"architectures"`
-	VocabSize     uint32         `json:"vocab_size"`
-	TextModel     TextParameters `json:"text_config"`
-}
+	Architectures []string `json:"architectures"`
+	VocabSize     uint32   `json:"vocab_size"`
 
-type TextParameters struct {
-	VocabSize uint32 `json:"vocab_size"`
+	TextModel struct {
+		VocabSize uint32 `json:"vocab_size"`
+	} `json:"text_config"`
 }
 
 type AdapterParameters struct {
@@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
	}
 
	for _, sv := range t.SpecialVocabulary {
-		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
+		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
+		if len(sv.IDs) > 0 {
+			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
+		}
	}
 
	return kv
@@ -89,7 +92,7 @@ type ModelConverter interface {
	// KV maps parameters to LLM key-values
	KV(*Tokenizer) ggml.KV
	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []ggml.Tensor
+	Tensors([]Tensor) []*ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string
@@ -106,13 +109,13 @@ type AdapterConverter interface {
	// KV maps parameters to LLM key-values
	KV(ggml.KV) ggml.KV
	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []ggml.Tensor
+	Tensors([]Tensor) []*ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string
 }
 
-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
	bts, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
	}
@@ -147,14 +150,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
		return err
	}
 
-	return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
+	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
 }
 
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, f *os.File) error {
	bts, err := fs.ReadFile(fsys, "config.json")
	if err != nil {
		return err
	}
@@ -173,6 +176,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
	switch p.Architectures[0] {
	case "LlamaForCausalLM":
		conv = &llamaModel{}
+	case "MllamaForConditionalGeneration":
+		conv = &mllamaModel{}
	case "Llama4ForConditionalGeneration":
		conv = &llama4Model{}
	case "Mistral3ForConditionalGeneration":
@@ -189,6 +194,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
		conv = &phi3Model{}
	case "Qwen2ForCausalLM":
		conv = &qwen2Model{}
+	case "Qwen2_5_VLForConditionalGeneration":
+		conv = &qwen25VLModel{}
	case "BertModel":
		conv = &bertModel{}
	case "CohereForCausalLM":
@@ -212,24 +219,22 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
		return err
	}
 
-	vocabSize := int(p.VocabSize)
-	if vocabSize == 0 {
-		tVocabSize := int(p.TextModel.VocabSize)
-		vocabSize = tVocabSize
-	}
+	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
 
	switch {
	case vocabSize == 0:
-		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
		for i := range vocabSize - len(t.Vocabulary.Tokens) {
			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
		}
	case vocabSize < len(t.Vocabulary.Tokens):
-		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
+		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
+		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
	default:
		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
	}
@@ -239,13 +244,13 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
		return err
	}
 
-	return writeFile(ws, conv.KV(t), conv.Tensors(ts))
+	return writeFile(f, conv.KV(t), conv.Tensors(ts))
 }
 
-func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
	for i := range ts {
		ts[i].Shape = slices.Clone(ts[i].Shape)
		slices.Reverse(ts[i].Shape)
	}
 
-	return ggml.WriteGGUF(ws, kv, ts)
+	return ggml.WriteGGUF(f, kv, ts)
 }
diff --git a/convert/convert_bert.go b/convert/convert_bert.go
index 8575652aa..a9f4b8a77 100644
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
	for _, t := range ts {
		if slices.Contains([]string{
			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
			continue
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_commandr.go b/convert/convert_commandr.go
index 738a2cf3b..a909515bd 100644
--- a/convert/convert_commandr.go
+++ b/convert/convert_commandr.go
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
	for _, t := range ts {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go
index 2f329943e..26698d6a6 100644
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
	for _, t := range ts {
		if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
			t.SetRepacker(p.addOne)
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go
index 3494aa3f9..6299cd9e0 100644
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
	return kv
 }
 
-func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
	for _, t := range ts {
		shape := t.Shape()
		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
			t.SetRepacker(p.repack)
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_llama.go b/convert/convert_llama.go
index 0caaa1949..43969749c 100644
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -126,11 +126,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 
	if p.RopeScaling.factors != nil {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     "rope_freqs.weight",
			Kind:     0,
			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -139,13 +139,14 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
	}
 
	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
+			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
			if !p.skipRepack {
				t.SetRepacker(p.repack)
			}
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
@@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
	}
 
	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
	} else {
		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
diff --git a/convert/convert_llama4.go b/convert/convert_llama4.go
index 26a230b33..3e3792339 100644
--- a/convert/convert_llama4.go
+++ b/convert/convert_llama4.go
@@ -88,13 +88,13 @@ func (p *llama4Model) Replacements() []string {
 }
 
 // Tensors implements ModelConverter.
-func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 
	var textTensors []Tensor
	for _, t := range ts {
		if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, ggml.Tensor{
+			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
@@ -112,7 +112,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
			// clone tensor since we need separate repackers
			tt := t.Clone()
			tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
-			out = append(out, ggml.Tensor{
+			out = append(out, &ggml.Tensor{
				Name:     strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
				Kind:     tt.Kind(),
				Shape:    newShape,
@@ -125,7 +125,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
		t.SetRepacker(p.repack())
		newShape := slices.Clone(t.Shape())
		newShape[1], newShape[2] = newShape[2], newShape[1]
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    newShape,
diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go
index 718ef047e..4cc451153 100644
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
	return kv
 }
 
-func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
	for _, t := range ts {
		shape := t.Shape()
		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
			t.SetRepacker(p.repack)
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    shape,
diff --git a/convert/convert_mistral.go b/convert/convert_mistral.go
index 6c224ae4f..a6fd4c41a 100644
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 
	for _, t := range ts {
		if !strings.HasPrefix(t.Name(), "v.") {
@@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
			}
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go
index 95a289f76..17580ff8f 100644
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
	oldnew := []string{
		"model.layers", "blk",
		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
		return true
	})
 
-	var out []ggml.Tensor
+	var out []*ggml.Tensor
	for n, e := range experts {
		// TODO(mxyng): sanity check experts
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:  n,
			Kind:  e[0].Kind(),
			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
diff --git a/convert/convert_mllama.go b/convert/convert_mllama.go
new file mode 100644
index 000000000..12478be71
--- /dev/null
+++ b/convert/convert_mllama.go
@@ -0,0 +1,160 @@
+package convert
+
+import (
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+type mllamaModel struct {
+	ModelParameters
+	TextModel struct {
+		llamaModel
+
+		CrossAttentionLayers []int32 `json:"cross_attention_layers"`
+	} `json:"text_config"`
+	VisionModel struct {
+		NumHiddenLayers           uint32  `json:"num_hidden_layers"`
+		NumGlobalLayers           uint32  `json:"num_global_layers"`
+		IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
+
+		HiddenSize       uint32 `json:"hidden_size"`
+		IntermediateSize uint32 `json:"intermediate_size"`
+
+		AttentionHeads uint32 `json:"attention_heads"`
+
+		ImageSize   uint32 `json:"image_size"`
+		PatchSize   uint32 `json:"patch_size"`
+		NumChannels uint32 `json:"num_channels"`
+		MaxNumTiles uint32 `json:"max_num_tiles"`
+		NormEpsilon float32 `json:"norm_eps"`
+		RopeTheta   float32 `json:"rope.freq_base"`
+	} `json:"vision_config"`
+}
+
+func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
+	kv := m.ModelParameters.KV(t)
+	kv["general.architecture"] = "mllama"
+
+	for k, v := range m.TextModel.KV(t) {
+		if strings.HasPrefix(k, "llama.") {
+			kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
+		}
+	}
+
+	kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
+
+	kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
+	kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
+	kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices
+
+	kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
+	kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize
+
+	kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
+	kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon
+
+	kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
+	kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
+	kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
+	kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels
+
+	return kv
+}
+
+func (m *mllamaModel) Replacements() []string {
+	return append(
+		m.TextModel.Replacements(),
+		"language_model.", "",
+		"gate_attn", "attn_gate",
+		"gate_ffn", "ffn_gate",
+		"cross_attn.", "cross_attn_",
+		"vision_model", "v",
+		"class_embedding", "class_embd",
+		"patch_embedding", "patch_embd",
+		"gated_positional_embedding.tile_embedding", "tile_position_embd",
+		"gated_positional_embedding.embedding", "position_embd.weight",
+		"gated_positional_embedding", "position_embd",
+		"embedding.weight", "weight",
+		"pre_tile_positional_embedding", "pre_tile_position_embd",
+		"post_tile_positional_embedding", "post_tile_position_embd",
+		"layernorm_pre", "pre_ln",
+		"layernorm_post", "post_ln",
+		"global_transformer.layers", "global.blk",
+		"transformer.layers", "blk",
+		"mlp.fc1", "ffn_up",
+		"mlp.fc2", "ffn_down",
+		"multi_modal_projector", "mm.0",
+	)
+}
+
+func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+	var text []Tensor
+	for _, t := range ts {
+		if t.Name() == "v.position_embd.gate" {
+			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
+				tt := t.Clone()
+				tt.SetRepacker(m.repack(name))
+				out = append(out, &ggml.Tensor{
+					Name:     name,
+					Kind:     t.Kind(),
+					Shape:    t.Shape(),
+					WriterTo: tt,
+				})
+			}
+		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
+			t.SetRepacker(m.repack(t.Name()))
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		} else {
+			text = append(text, t)
+		}
+	}
+
+	return append(out, m.TextModel.Tensors(text)...)
+}
+
+func (m *mllamaModel) repack(name string) Repacker {
+	return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
+		dims := make([]int, len(shape))
+		for i, dim := range shape {
+			dims[i] = int(dim)
+		}
+
+		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+		t, err = tensor.Tanh(t)
+		if err != nil {
+			return nil, err
+		}
+
+		if name == "v.position_embd.gate" {
+			t, err = tensor.Sub(float32(1), t)
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		t = tensor.Materialize(t)
+		// flatten tensor so it can be returned as a vector
+		if err := t.Reshape(t.Shape().TotalSize()); err != nil {
+			return nil, err
+		}
+
+		return native.VectorF32(t.(*tensor.Dense))
+	}
+}
diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go
index d1c13795a..5a6756053 100644
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
	return kv
 }
 
-func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
	var addRopeFactors sync.Once
 
-	out := make([]ggml.Tensor, 0, len(ts)+2)
+	out := make([]*ggml.Tensor, 0, len(ts)+2)
	for _, t := range ts {
		if strings.HasPrefix(t.Name(), "blk.0.") {
			addRopeFactors.Do(func() {
-				out = append(out, ggml.Tensor{
+				out = append(out, &ggml.Tensor{
					Name:     "rope_factors_long.weight",
					Kind:     0,
					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
					WriterTo: p.RopeScaling.LongFactor,
-				}, ggml.Tensor{
+				}, &ggml.Tensor{
					Name:     "rope_factors_short.weight",
					Kind:     0,
					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
			})
		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_qwen2.go b/convert/convert_qwen2.go
index 18278802e..3647c4e54 100644
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -15,6 +15,7 @@ type qwen2Model struct {
		Type                          string     `json:"type"`
		Factor                        ropeFactor `json:"factor"`
		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
+		MropeSection                  []int32    `json:"mrope_section"`
	} `json:"rope_scaling"`
	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -39,16 +40,18 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
	case "yarn":
		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
+	case "mrope", "default":
+		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
	default:
		panic("unknown rope scaling type")
	}
	return kv
 }
 
-func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
	for _, t := range ts {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go
new file mode 100644
index 000000000..c2d5a633b
--- /dev/null
+++ b/convert/convert_qwen25vl.go
@@ -0,0 +1,102 @@
+package convert
+
+import (
+	"cmp"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type qwen25VLModel struct {
+	qwen2Model
+
+	VisionModel struct {
+		Depth               uint32  `json:"depth"`
+		HiddenSize          uint32  `json:"hidden_size"`
+		NumHeads            uint32  `json:"num_heads"`
+		InChannels          uint32  `json:"in_chans"`
+		PatchSize           uint32  `json:"patch_size"`
+		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
+		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
+		WindowSize          uint32  `json:"window_size"`
+		RMSNormEps          float32 `json:"layer_norm_epsilon"`
+		RopeTheta           float32 `json:"rope_theta"`
+		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
+		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
+	} `json:"vision_config"`
+}
+
+var _ ModelConverter = (*qwen25VLModel)(nil)
+
+func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
+	kv := q.ModelParameters.KV(t)
+	kv["general.architecture"] = "qwen25vl"
+
+	for k, v := range q.qwen2Model.KV(t) {
+		if strings.HasPrefix(k, "qwen2.") {
+			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
+		}
+	}
+
+	if q.VisionModel.FullAttentionBlocks == nil {
+		kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
+	}
+
+	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
+	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
+	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
+	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
+	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
+	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
+	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
+	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
+	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
+	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
+	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
+	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
+
+	return kv
+}
+
+func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+
+	for _, t := range ts {
+		if strings.Contains(t.Name(), "patch_embed.proj") {
+			for t := range splitDim(t, 2,
+				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
+				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
+			) {
+				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
+				out = append(out, t)
+			}
+		} else if strings.Contains(t.Name(), "attn.qkv") {
+			out = append(out, slices.Collect(splitDim(t, 0,
+				strings.NewReplacer("attn.qkv", "attn_q"),
+				strings.NewReplacer("attn.qkv", "attn_k"),
+				strings.NewReplacer("attn.qkv", "attn_v"),
+			))...)
+		} else {
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		}
+	}
+
+	return out
+}
+
+func (p *qwen25VLModel) Replacements() []string {
+	return append(
+		p.qwen2Model.Replacements(),
+		"visual", "v",
+		"blocks", "blk",
+		"attn.proj", "attn_out",
+		"norm1", "ln1",
+		"norm2", "ln2",
+	)
+}
diff --git a/convert/convert_test.go b/convert/convert_test.go
index 1cdc26c41..b9db6fa15 100644
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -130,6 +130,7 @@ func TestConvertModel(t *testing.T) {
	if err != nil {
		t.Fatal(err)
	}
+	defer expectFile.Close()
 
	var expect map[string]string
	if err := json.NewDecoder(expectFile).Decode(&expect); err != nil {
diff --git a/convert/fs.go b/convert/fs.go
deleted file mode 100644
index 31132dbe7..000000000
--- a/convert/fs.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package convert
-
-import (
-	"archive/zip"
-	"errors"
-	"io"
-	"io/fs"
-	"os"
-	"path/filepath"
-)
-
-type ZipReader struct {
-	r *zip.Reader
-	p string
-
-	// limit is the maximum size of a file that can be read directly
-	// from the zip archive. Files larger than this size will be extracted
-	limit int64
-}
-
-func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
-	return &ZipReader{r, p, limit}
-}
-
-func (z *ZipReader) Open(name string) (fs.File, error) {
-	r, err := z.r.Open(name)
-	if err != nil {
-		return nil, err
-	}
-	defer r.Close()
-
-	if fi, err := r.Stat(); err != nil {
-		return nil, err
-	} else if fi.Size() < z.limit {
-		return r, nil
-	}
-
-	if !filepath.IsLocal(name) {
-		return nil, zip.ErrInsecurePath
-	}
-
-	n := filepath.Join(z.p, name)
-	if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
-		w, err := os.Create(n)
-		if err != nil {
-			return nil, err
-		}
-		defer w.Close()
-
-		if _, err := io.Copy(w, r); err != nil {
-			return nil, err
-		}
-	} else if err != nil {
-		return nil, err
-	}
-
-	return os.Open(n)
-}
diff --git a/convert/reader.go b/convert/reader.go
index ab81d5c0b..07d12f0dd 100644
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -38,7 +38,10 @@ const (
 func (t tensorBase) Kind() uint32 {
	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
		t.name == "token_types.weight" ||
-		t.name == "v.positional_embedding_vlm" {
+		t.name == "v.positional_embedding_vlm" ||
+		t.name == "v.tile_position_embd.weight" ||
+		t.name == "v.pre_tile_position_embd.weight" ||
+		t.name == "v.post_tile_position_embd.weight" {
		// these tensors are always F32
		return 0
	}
diff --git a/convert/tensor.go b/convert/tensor.go
new file mode 100644
index 000000000..ffb22ead9
--- /dev/null
+++ b/convert/tensor.go
@@ -0,0 +1,56 @@
+package convert
+
+import (
+	"iter"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
+// is split evenly based on the number of replacers provided.
+func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
+	return func(yield func(*ggml.Tensor) bool) {
+		for i, replacer := range replacers {
+			shape := slices.Clone(t.Shape())
+			shape[dim] = shape[dim] / uint64(len(replacers))
+
+			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
+			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
+
+			tt := t.Clone()
+			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+				dims := make([]int, len(shape))
+				for i := range shape {
+					dims[i] = int(shape[i])
+				}
+
+				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+				t, err := t.Slice(slice...)
+				if err != nil {
+					return nil, err
+				}
+
+				t = tensor.Materialize(t)
+				// flatten tensor so it can be written as a vector
+				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
+					return nil, err
+				}
+
+				return native.VectorF32(t.(*tensor.Dense))
+			})
+
+			if !yield(&ggml.Tensor{
+				Name:     replacer.Replace(t.Name()),
+				Kind:     t.Kind(),
+				Shape:    shape,
+				WriterTo: tt,
+			}) {
+				break
+			}
+		}
+	}
+}
diff --git a/convert/tokenizer.go b/convert/tokenizer.go
index 74e2efed0..bedcd4f80 100644
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
	}
 
	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
+		// noop
	} else if err != nil {
		return nil, err
	} else {
@@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
		}
	}
 
+	if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
+	} else if err != nil {
+		return nil, err
+	} else {
+		defer f.Close()
+
+		var p map[string]json.RawMessage
+		if err := json.NewDecoder(f).Decode(&p); err != nil {
+			return nil, err
+		}
+
+		for _, st := range specialTokenTypes {
+			if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
+				var ids []int32
+				if err := json.Unmarshal(bts, &ids); err != nil {
+					// value is not a list so the existing ID is used
+					continue
+				}
+
+				if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
+					return sv.Type == st
+				}); i >= 0 {
+					t.SpecialVocabulary[i].IDs = ids
+				}
+			}
+		}
+	}
+
	return t, nil
 }
@@ -280,6 +309,9 @@ type SpecialVocabulary struct {
	ID       int
	Content  string
	AddToken bool
+
+	// IDs is populated by generation_config.json
+	IDs []int32
 }
 
 func (sv SpecialVocabulary) Key() string {
diff --git a/convert/tokenizer_test.go b/convert/tokenizer_test.go
index c6ef9732f..813096fd9 100644
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
				Pre: "default",
			},
		},
+		{
+			name: "generation config eos token ids",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"": 0,
+							"": 1,
+							"": 2,
+							"": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "",
+					"eos_token": ""
+				}`),
+				"generation_config.json": strings.NewReader(`{
+					"bos_token_id": 0,
+					"eos_token_id": [1, 2, 3]
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"", "", "", ""},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "eos", Content: "", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
+					{Type: "bos", Content: "", ID: 0, AddToken: true},
+				},
+				Pre: "default",
+			},
+		},
	}
 
	for _, tt := range cases {
diff --git a/discover/gpu.go b/discover/gpu.go
index ba906a184..15bad4466 100644
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
 }
 
 func getVerboseState() C.uint16_t {
-	if envconfig.Debug() {
+	if envconfig.LogLevel() < slog.LevelInfo {
		return C.uint16_t(1)
	}
	return C.uint16_t(0)
 }
diff --git a/discover/gpu_info.h b/discover/gpu_info.h
index 094b791a8..ee7ff4c33 100644
--- a/discover/gpu_info.h
+++ b/discover/gpu_info.h
@@ -27,12 +27,14 @@
 #endif
 
+#ifndef LOG
 #define LOG(verbose, ...) \
   do { \
     if (verbose) { \
       fprintf(stderr, __VA_ARGS__); \
     } \
   } while (0)
+#endif
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/discover/gpu_info_cudart.c b/discover/gpu_info_cudart.c
index 03f15a2c3..bc5115bfd 100644
--- a/discover/gpu_info_cudart.c
+++ b/discover/gpu_info_cudart.c
@@ -1,6 +1,7 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 #include <string.h>
+#include <inttypes.h>
 #include "gpu_info_cudart.h"
 
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
@@ -58,7 +59,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
       LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
       UNLOAD_LIBRARY(resp->ch.handle);
       resp->ch.handle = NULL;
-      if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
+      if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
        resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
        return;
      }
@@ -168,9 +169,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
   resp->free = memInfo.free;
   resp->used = memInfo.used;
 
-  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
-  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
-  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
+  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
+  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
   LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
 
@@ -180,4 +181,4 @@ void cudart_release(cudart_handle_t h) {
   h.handle = NULL;
 }
 
-#endif  // __APPLE__
\ No newline at end of file
+#endif  // __APPLE__
diff --git a/discover/gpu_info_nvcuda.c b/discover/gpu_info_nvcuda.c
index 466e1ac24..d2d0b683b 100644
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@@ -1,6 +1,7 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
#include +#include #include "gpu_info_nvcuda.h" void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { @@ -193,8 +194,8 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { resp->total = memInfo.total; resp->free = memInfo.free; - LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024); - LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024); + LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024); + LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024); LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor); @@ -247,4 +248,4 @@ void nvcuda_release(nvcuda_handle_t h) { h.handle = NULL; } -#endif // __APPLE__ \ No newline at end of file +#endif // __APPLE__ diff --git a/docs/api.md b/docs/api.md index 7f3e5e2d2..abd276150 100644 --- a/docs/api.md +++ b/docs/api.md @@ -19,7 +19,7 @@ ### Model names -Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. +Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. ### Durations @@ -394,9 +394,6 @@ curl http://localhost:11434/api/generate -d '{ "repeat_penalty": 1.2, "presence_penalty": 1.5, "frequency_penalty": 1.0, - "mirostat": 1, - "mirostat_tau": 0.8, - "mirostat_eta": 0.6, "penalize_newline": true, "stop": ["\n", "user:"], "numa": false, @@ -404,10 +401,7 @@ curl http://localhost:11434/api/generate -d '{ "num_batch": 2, "num_gpu": 1, "main_gpu": 0, - "low_vram": false, - "vocab_only": false, "use_mmap": true, - "use_mlock": false, "num_thread": 8 } }' @@ -958,19 +952,8 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo | Type | Recommended | | --- | :-: | -| q2_K | | -| q3_K_L | | -| q3_K_M | | -| q3_K_S | | -| q4_0 | | -| q4_1 | | | q4_K_M | * | | q4_K_S | | -| q5_0 | | -| q5_1 | | -| q5_K_M | | -| q5_K_S | | -| q6_K | | | q8_0 | * | ### Examples @@ -1015,8 +998,8 @@ Quantize a non-quantized model. 
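The same create-with-quantize request can also be issued from Go using the repository's own `api` client, exercising the same `Model`/`From`/`Quantization` fields as the new `integration/quantization_test.go` later in this diff. A minimal sketch, assuming a locally reachable server configured through the usual environment variables; the curl equivalent follows below:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Quantize an fp16 base model to Q4_K_M; progress updates (including
	// the new digest/total/completed fields) arrive via the callback.
	req := &api.CreateRequest{
		Model:        "llama3.2:quantized",
		From:         "llama3.2:3b-instruct-fp16",
		Quantization: "q4_K_M",
	}
	err = client.Create(context.Background(), req, func(resp api.ProgressResponse) error {
		fmt.Println(resp.Status)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```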
```shell curl http://localhost:11434/api/create -d '{ - "model": "llama3.1:quantized", - "from": "llama3.1:8b-instruct-fp16", + "model": "llama3.2:quantized", + "from": "llama3.2:3b-instruct-fp16", "quantize": "q4_K_M" }' ``` @@ -1026,12 +1009,14 @@ curl http://localhost:11434/api/create -d '{ A stream of JSON objects is returned: ```json -{"status":"quantizing F16 model to Q4_K_M"} -{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"} -{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"} -{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"} +{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302} +{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552} +{"status":"verifying conversion"} +{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"} +{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"} +{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"} +{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"} {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"} -{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"} {"status":"writing manifest"} {"status":"success"} ``` @@ -1169,29 +1154,37 @@ A single JSON object will be returned. { "models": [ { - "name": "codellama:13b", - "modified_at": "2023-11-04T14:56:49.277302595-07:00", - "size": 7365960935, - "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697", + "name": "deepseek-r1:latest", + "model": "deepseek-r1:latest", + "modified_at": "2025-05-10T08:06:48.639712648-07:00", + "size": 4683075271, + "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163", "details": { + "parent_model": "", "format": "gguf", - "family": "llama", - "families": null, - "parameter_size": "13B", - "quantization_level": "Q4_0" + "family": "qwen2", + "families": [ + "qwen2" + ], + "parameter_size": "7.6B", + "quantization_level": "Q4_K_M" } }, { - "name": "llama3:latest", - "modified_at": "2023-12-07T09:32:18.757212583-08:00", - "size": 3825819519, - "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e", + "name": "llama3.2:latest", + "model": "llama3.2:latest", + "modified_at": "2025-05-04T17:37:44.706015396-07:00", + "size": 2019393189, + "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", "details": { + "parent_model": "", "format": "gguf", "family": "llama", - "families": null, - "parameter_size": "7B", - "quantization_level": "Q4_0" + "families": [ + "llama" + ], + "parameter_size": "3.2B", + "quantization_level": "Q4_K_M" } } ] diff --git a/docs/modelfile.md b/docs/modelfile.md index a71183f40..6513873ce 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -150,9 +150,6 @@ PARAMETER | Parameter | Description | Value Type | Example Usage | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | 
-------------------- | -| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 | -| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 | -| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 | | num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 | | repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 | | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 | diff --git a/envconfig/config.go b/envconfig/config.go index b18e93f89..9d7c2e218 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -149,9 +149,22 @@ func Bool(k string) func() bool { } } +// LogLevel returns the log level for the application. +// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE +func LogLevel() slog.Level { + level := slog.LevelInfo + if s := Var("OLLAMA_DEBUG"); s != "" { + if b, _ := strconv.ParseBool(s); b { + level = slog.LevelDebug + } else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 { + level = slog.Level(i * -4) + } + } + + return level +} + var ( - // Debug enabled additional debug information. - Debug = Bool("OLLAMA_DEBUG") // FlashAttention enables the experimental flash attention feature. FlashAttention = Bool("OLLAMA_FLASH_ATTENTION") // KvCacheType is the quantization type for the K/V cache. @@ -209,8 +222,6 @@ var ( MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0) // MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable. MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512) - // MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable. - MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0) ) func Uint64(key string, defaultValue uint64) func() uint64 { @@ -238,7 +249,7 @@ type EnvVar struct { func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ - "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, + "OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. 
OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, "OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"}, "OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"}, diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 9e80645c7..f232f1cdc 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -1,11 +1,13 @@ package envconfig import ( + "log/slog" "math" "testing" "time" "github.com/google/go-cmp/cmp" + "github.com/ollama/ollama/logutil" ) func TestHost(t *testing.T) { @@ -292,3 +294,34 @@ func TestContextLength(t *testing.T) { }) } } + +func TestLogLevel(t *testing.T) { + cases := map[string]slog.Level{ + // Default to INFO + "": slog.LevelInfo, + "false": slog.LevelInfo, + "f": slog.LevelInfo, + "0": slog.LevelInfo, + + // True values enable Debug + "true": slog.LevelDebug, + "t": slog.LevelDebug, + + // Positive values increase verbosity + "1": slog.LevelDebug, + "2": logutil.LevelTrace, + + // Negative values decrease verbosity + "-1": slog.LevelWarn, + "-2": slog.LevelError, + } + + for k, v := range cases { + t.Run(k, func(t *testing.T) { + t.Setenv("OLLAMA_DEBUG", k) + if i := LogLevel(); i != v { + t.Errorf("%s: expected %d, got %d", k, v, i) + } + }) + } +} diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 427a43aec..8c0a2ae56 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -36,12 +36,12 @@ func (kv KV) ParameterCount() uint64 { return keyValue(kv, "general.parameter_count", uint64(0)) } -func (kv KV) FileType() fileType { +func (kv KV) FileType() FileType { if t := kv.Uint("general.file_type"); t > 0 { - return fileType(t) + return FileType(t) } - return fileTypeUnknown + return FileTypeUnknown } func (kv KV) BlockCount() uint64 { @@ -125,6 +125,8 @@ func (kv KV) OllamaEngineRequired() bool { "gemma3", "mistral3", "llama4", + "mllama", + "qwen25vl", }, kv.Architecture()) } @@ -149,7 +151,7 @@ func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue .. 
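The `LogLevel` change above is easiest to read with the numbers written out: `slog` levels step by 4, so the `i * -4` scaling maps each positive `OLLAMA_DEBUG` value to one more step of verbosity (Info=0, Debug=-4, Trace=-8), while negative values reduce it (Warn=4, Error=8). A small sketch of the mapping using the same `envconfig` package, mirroring the cases in the new test:

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// "" and "false" stay at Info; "true" and "1" give Debug; "2" gives
	// Trace; negative values move toward Warn and Error.
	for _, v := range []string{"", "true", "1", "2", "-1", "-2"} {
		os.Setenv("OLLAMA_DEBUG", v)
		fmt.Printf("OLLAMA_DEBUG=%q -> %v\n", v, envconfig.LogLevel())
	}
}
```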
return val.(T) } - slog.Warn("key not found", "key", key, "default", defaultValue[0]) + slog.Debug("key not found", "key", key, "default", defaultValue[0]) return defaultValue[0] } @@ -226,7 +228,11 @@ func (t Tensor) block() (n int) { } func (t Tensor) blockSize() uint64 { - switch t.Kind { + return (TensorType)(t.Kind).BlockSize() +} + +func (t TensorType) BlockSize() uint64 { + switch t { case 0, // F32 1, // F16 @@ -252,73 +258,77 @@ func (t Tensor) blockSize() uint64 { } func (t Tensor) typeSize() uint64 { - blockSize := t.blockSize() + return TensorType(t.Kind).TypeSize() +} - switch t.Kind { - case 0: // FP32 +func (t TensorType) TypeSize() uint64 { + blockSize := t.BlockSize() + + switch t { + case TensorTypeF32: return 4 - case 1: // FP16 + case TensorTypeF16: return 2 - case 2: // Q4_0 + case TensorTypeQ4_0: return 2 + blockSize/2 - case 3: // Q4_1 + case TensorTypeQ4_1: return 2 + 2 + blockSize/2 - case 6: // Q5_0 + case TensorTypeQ5_0: return 2 + 4 + blockSize/2 - case 7: // Q5_1 + case TensorTypeQ5_1: return 2 + 2 + 4 + blockSize/2 - case 8: // Q8_0 + case TensorTypeQ8_0: return 2 + blockSize - case 9: // Q8_1 + case TensorTypeQ8_1: return 2 + 2 + blockSize - case 10: // Q2_K + case TensorTypeQ2_K: return blockSize/16 + blockSize/4 + 2 + 2 - case 11: // Q3_K + case TensorTypeQ3_K: return blockSize/8 + blockSize/4 + 12 + 2 - case 12: // Q4_K + case TensorTypeQ4_K: return 2 + 2 + 12 + blockSize/2 - case 13: // Q5_K + case TensorTypeQ5_K: return 2 + 2 + 12 + blockSize/8 + blockSize/2 - case 14: // Q6_K + case TensorTypeQ6_K: return blockSize/2 + blockSize/4 + blockSize/16 + 2 - case 15: // Q8_K + case TensorTypeQ8_K: return 4 + blockSize + 2*blockSize/16 - case 16: // IQ2_XXS + case tensorTypeIQ2_XXS: return 2 + 2*blockSize/8 - case 17: // IQ2_XS + case tensorTypeIQ2_XS: return 2 + 2*blockSize/8 + blockSize/32 - case 18: // IQ3_XXS + case tensorTypeIQ3_XXS: return 2 + blockSize/4 + blockSize/8 - case 19: // IQ1_S + case tensorTypeIQ1_S: return 2 + blockSize/8 + blockSize/16 - case 20: // IQ4_NL + case tensorTypeIQ4_NL: return 2 + blockSize/2 - case 21: // IQ3_S + case tensorTypeIQ3_S: return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4 - case 22: // IQ2_S + case tensorTypeIQ2_S: return 2 + blockSize/4 + blockSize/16 - case 23: // IQ4_XS + case tensorTypeIQ4_XS: return 2 + 2 + blockSize/2 + blockSize/64 - case 24: // I8 + case TensorTypeI8: return 1 - case 25: // I16 + case TensorTypeI16: return 2 - case 26: // I32 + case TensorTypeI32: return 4 - case 27: // I64 + case TensorTypeI64: return 8 - case 28: // F64 + case TensorTypeF64: return 8 - case 29: // IQ1_M + case tensorTypeIQ1_M: return blockSize/8 + blockSize/16 + blockSize/32 - case 30: // BF16 + case TensorTypeBF16: return 2 default: return 0 } } -func (t Tensor) parameters() uint64 { +func (t Tensor) Elements() uint64 { var count uint64 = 1 for _, n := range t.Shape { count *= n @@ -327,11 +337,11 @@ func (t Tensor) parameters() uint64 { } func (t Tensor) Size() uint64 { - return t.parameters() * t.typeSize() / t.blockSize() + return t.Elements() * t.typeSize() / t.blockSize() } func (t Tensor) Type() string { - return fileType(t.Kind).String() + return TensorType(t.Kind).String() } type container interface { @@ -480,7 +490,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri var ropeFreqsCount uint64 if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok { if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { - ropeFreqsCount = ropeFreqsWeights.parameters() + ropeFreqsCount = 
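A worked example of the byte accounting behind the `TypeSize`/`BlockSize` refactor above, using the formula `Size() = Elements() * TypeSize() / BlockSize()` with the block size of 32 that applies to the quantized types shown (the 4096x4096 tensor is an arbitrary illustration):

```go
package main

import "fmt"

func main() {
	const elements = 4096 * 4096 // 16,777,216 weights

	// Q4_0: 2 bytes of scale + 32 four-bit weights = 18 bytes per block,
	// i.e. 18*8/32 = 4.5 bits per weight.
	fmt.Println(elements * 18 / 32) // 9437184 bytes

	// Q8_0: 2 bytes of scale + 32 one-byte weights = 34 bytes per block,
	// i.e. 8.5 bits per weight.
	fmt.Println(elements * 34 / 32) // 17825792 bytes
}
```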
ropeFreqsWeights.Elements() } } @@ -640,6 +650,20 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { graphSize = 4 * (imageSize*imageSize*numChannels + embeddingLength*patchSize + numPatches*numPatches*headCount) + case "qwen25vl": + maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280)) + + numPatches := maxPixels / (patchSize * patchSize) + + graphSize = 4 * (maxPixels*numChannels + // Original image storage + // Normalized pixels + maxPixels*numChannels + + // Patches storage (numPatches * channels * patchSize^2) + numPatches*numChannels*patchSize*patchSize + + // Self-attention calculations + numPatches*numPatches*headCount + + // Additional buffer for processing + embeddingLength*numPatches) case "llama4": // vision graph is computed independently in the same schedule // and is negligible compared to the worst case text graph diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index b7029bc38..8e75625e0 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -9,8 +9,12 @@ import ( "io" "log/slog" "maps" + "os" + "runtime" "slices" "strings" + + "golang.org/x/sync/errgroup" ) type containerGGUF struct { @@ -225,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { } llm.tensors = append(llm.tensors, &tensor) - llm.parameters += tensor.parameters() + llm.parameters += tensor.Elements() } // patch KV with parameter count @@ -488,25 +492,38 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { return err } + if t == ggufTypeString { + for _, e := range any(s).([]string) { + if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil { + return err + } + + if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil { + return err + } + } + return nil + } + return binary.Write(w, binary.LittleEndian, s) } -func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { +func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error { alignment := kv.Uint("general.alignment", 32) - if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil { + if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil { return err } @@ -514,12 +531,12 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { slices.Sort(keys) for _, key := range keys { - if err := ggufWriteKV(ws, key, kv[key]); err != nil { + if err := ggufWriteKV(f, key, kv[key]); err != nil { return err } } - slices.SortStableFunc(ts, func(a, b Tensor) int { + slices.SortStableFunc(ts, func(a, b *Tensor) int { if i, j := a.block(), b.block(); i < 0 && j > 0 { return 1 } else if i > 0 && j < 0 { @@ -530,22 +547,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { }) var s uint64 - for _, t := range ts { - t.Offset = s - if err := ggufWriteTensorInfo(ws, t); err != nil { + for i := range ts { + ts[i].Offset = s + if err := ggufWriteTensorInfo(f, ts[i]); err != nil { return err } - s += t.Size() + s += ts[i].Size() s += uint64(ggufPadding(int64(s), int64(alignment))) } + offset, err := f.Seek(0, io.SeekCurrent) + if err != 
nil { + return err + } + offset += ggufPadding(offset, int64(alignment)) + + var g errgroup.Group + g.SetLimit(runtime.GOMAXPROCS(0)) + // TODO consider reducing if tensors size * gomaxprocs is larger than free memory for _, t := range ts { - if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil { + t := t + w := io.NewOffsetWriter(f, offset+int64(t.Offset)) + g.Go(func() error { + _, err := t.WriteTo(w) return err - } + }) } - return nil + return g.Wait() } func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { @@ -560,8 +589,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { var err error switch v := v.(type) { - case uint32: + case uint32, FileType: err = writeGGUF(ws, ggufTypeUint32, v) + case uint64: + err = writeGGUF(ws, ggufTypeUint64, v) case float32: err = writeGGUF(ws, ggufTypeFloat32, v) case bool: @@ -570,32 +601,20 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { err = writeGGUFString(ws, v) case []int32: err = writeGGUFArray(ws, ggufTypeInt32, v) + case *array[int32]: + err = writeGGUFArray(ws, ggufTypeInt32, v.values) case []uint32: err = writeGGUFArray(ws, ggufTypeUint32, v) + case *array[uint32]: + err = writeGGUFArray(ws, ggufTypeUint32, v.values) case []float32: err = writeGGUFArray(ws, ggufTypeFloat32, v) + case *array[float32]: + err = writeGGUFArray(ws, ggufTypeFloat32, v.values) case []string: - if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil { - return err - } - - for _, e := range v { - if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil { - return err - } - } + err = writeGGUFArray(ws, ggufTypeString, v) + case *array[string]: + err = writeGGUFArray(ws, ggufTypeString, v.values) default: return fmt.Errorf("improper type for '%s'", k) } @@ -603,7 +622,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { return err } -func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { +func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error { slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset) if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil { return err @@ -630,20 +649,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { return binary.Write(ws, binary.LittleEndian, t.Offset) } -func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error { - offset, err := ws.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil { - return err - } - - _, err = t.WriteTo(ws) - return err -} - func ggufPadding(offset, align int64) int64 { return (align - offset%align) % align } diff --git a/fs/ggml/gguf_test.go b/fs/ggml/gguf_test.go index 22e7a5514..10d3b6849 100644 --- a/fs/ggml/gguf_test.go +++ b/fs/ggml/gguf_test.go @@ -18,7 +18,7 @@ func TestWriteGGUF(t *testing.T) { if err := WriteGGUF(w, KV{ "general.alignment": uint32(16), - }, []Tensor{ + }, []*Tensor{ {Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, {Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, {Name: "test.2", Shape: 
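The rewritten `WriteGGUF` above precomputes every tensor's offset up front and then writes the payloads concurrently: `io.NewOffsetWriter` gives each goroutine an independent write position backed by `WriteAt` on the shared file, and `errgroup` caps the fan-out at `GOMAXPROCS`. A reduced sketch of that pattern with plain byte chunks standing in for tensors:

```go
package main

import (
	"io"
	"log"
	"os"
	"runtime"

	"golang.org/x/sync/errgroup"
)

func main() {
	f, err := os.Create("out.bin")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	chunks := [][]byte{[]byte("aaaa"), []byte("bbbb"), []byte("cccc")}

	var g errgroup.Group
	g.SetLimit(runtime.GOMAXPROCS(0))

	var offset int64
	for _, c := range chunks {
		c := c                             // shadow for the goroutine, as the patch does with t := t
		w := io.NewOffsetWriter(f, offset) // fixed start position for this chunk
		offset += int64(len(c))
		g.Go(func() error {
			_, err := w.Write(c)
			return err
		})
	}

	if err := g.Wait(); err != nil {
		log.Fatal(err)
	}
}
```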
[]uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, diff --git a/fs/ggml/type.go b/fs/ggml/type.go index 7265afbcd..4d3d5bcad 100644 --- a/fs/ggml/type.go +++ b/fs/ggml/type.go @@ -1,26 +1,31 @@ package ggml -import "fmt" +import ( + "fmt" + "log/slog" + "strings" +) -type fileType uint32 +// FileType is the Go equivalent to llama_ftype used for gguf file typing +type FileType uint32 const ( - fileTypeF32 fileType = iota - fileTypeF16 + FileTypeF32 FileType = iota + FileTypeF16 fileTypeQ4_0 fileTypeQ4_1 - fileTypeQ4_1_F16 - fileTypeQ4_2 // unused - fileTypeQ4_3 // unused - fileTypeQ8_0 + fileTypeQ4_1_F16 // unused by GGML + fileTypeQ4_2 // unused by GGML + fileTypeQ4_3 // unused by GGML + FileTypeQ8_0 fileTypeQ5_0 fileTypeQ5_1 fileTypeQ2_K fileTypeQ3_K_S fileTypeQ3_K_M fileTypeQ3_K_L - fileTypeQ4_K_S - fileTypeQ4_K_M + FileTypeQ4_K_S + FileTypeQ4_K_M fileTypeQ5_K_S fileTypeQ5_K_M fileTypeQ6_K @@ -37,93 +42,62 @@ const ( fileTypeIQ2_M fileTypeIQ4_XS fileTypeIQ1_M - fileTypeBF16 + FileTypeBF16 + fileTypeQ4_0_4_4 // unused by GGML + fileTypeQ4_0_4_8 // unused by GGML + fileTypeQ4_0_8_8 // unused by GGML + fileTypeTQ1_0 + fileTypeTQ2_0 - fileTypeUnknown + FileTypeUnknown = 1024 ) -func ParseFileType(s string) (fileType, error) { +// ParseFileType parses the provided GGUF file type +// Only Ollama supported types are considered valid +func ParseFileType(s string) (FileType, error) { switch s { case "F32": - return fileTypeF32, nil + return FileTypeF32, nil case "F16": - return fileTypeF16, nil - case "Q4_0": - return fileTypeQ4_0, nil - case "Q4_1": - return fileTypeQ4_1, nil - case "Q4_1_F16": - return fileTypeQ4_1_F16, nil + return FileTypeF16, nil case "Q8_0": - return fileTypeQ8_0, nil - case "Q5_0": - return fileTypeQ5_0, nil - case "Q5_1": - return fileTypeQ5_1, nil - case "Q2_K": - return fileTypeQ2_K, nil - case "Q3_K_S": - return fileTypeQ3_K_S, nil - case "Q3_K_M": - return fileTypeQ3_K_M, nil - case "Q3_K_L": - return fileTypeQ3_K_L, nil + return FileTypeQ8_0, nil case "Q4_K_S": - return fileTypeQ4_K_S, nil - case "Q4_K_M": - return fileTypeQ4_K_M, nil - case "Q5_K_S": - return fileTypeQ5_K_S, nil - case "Q5_K_M": - return fileTypeQ5_K_M, nil - case "Q6_K": - return fileTypeQ6_K, nil - case "IQ2_XXS": - return fileTypeIQ2_XXS, nil - case "IQ2_XS": - return fileTypeIQ2_XS, nil - case "Q2_K_S": - return fileTypeQ2_K_S, nil - case "IQ3_XS": - return fileTypeIQ3_XS, nil - case "IQ3_XXS": - return fileTypeIQ3_XXS, nil - case "IQ1_S": - return fileTypeIQ1_S, nil - case "IQ4_NL": - return fileTypeIQ4_NL, nil - case "IQ3_S": - return fileTypeIQ3_S, nil - case "IQ3_M": - return fileTypeIQ3_M, nil - case "IQ2_S": - return fileTypeIQ2_S, nil - case "IQ2_M": - return fileTypeIQ2_M, nil - case "IQ4_XS": - return fileTypeIQ4_XS, nil - case "IQ1_M": - return fileTypeIQ1_M, nil + return FileTypeQ4_K_S, nil + case "Q4_K_M", "Q4_K": + return FileTypeQ4_K_M, nil case "BF16": - return fileTypeBF16, nil + return FileTypeBF16, nil default: - return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s) + supportedFileTypes := []FileType{ + FileTypeF32, + FileTypeF16, + FileTypeQ4_K_S, + FileTypeQ4_K_M, + FileTypeQ8_0, + // fsggml.FileTypeBF16, // TODO + } + strs := make([]string, len(supportedFileTypes)) + for i := range supportedFileTypes { + strs[i] = supportedFileTypes[i].String() + } + + return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", ")) } } -func (t fileType) String() string { +func (t FileType) 
String() string { + // Note: this routine will return a broader set of file types for existing models switch t { - case fileTypeF32: + case FileTypeF32: return "F32" - case fileTypeF16: + case FileTypeF16: return "F16" case fileTypeQ4_0: return "Q4_0" case fileTypeQ4_1: return "Q4_1" - case fileTypeQ4_1_F16: - return "Q4_1_F16" - case fileTypeQ8_0: + case FileTypeQ8_0: return "Q8_0" case fileTypeQ5_0: return "Q5_0" @@ -137,9 +111,9 @@ func (t fileType) String() string { return "Q3_K_M" case fileTypeQ3_K_L: return "Q3_K_L" - case fileTypeQ4_K_S: + case FileTypeQ4_K_S: return "Q4_K_S" - case fileTypeQ4_K_M: + case FileTypeQ4_K_M: return "Q4_K_M" case fileTypeQ5_K_S: return "Q5_K_S" @@ -147,39 +121,198 @@ func (t fileType) String() string { return "Q5_K_M" case fileTypeQ6_K: return "Q6_K" - case fileTypeIQ2_XXS: - return "IQ2_XXS" - case fileTypeIQ2_XS: - return "IQ2_XS" case fileTypeQ2_K_S: return "Q2_K_S" - case fileTypeIQ3_XS: - return "IQ3_XS" - case fileTypeIQ3_XXS: - return "IQ3_XXS" - case fileTypeIQ1_S: - return "IQ1_S" - case fileTypeIQ4_NL: - return "IQ4_NL" - case fileTypeIQ3_S: - return "IQ3_S" - case fileTypeIQ3_M: - return "IQ3_M" - case fileTypeIQ2_S: - return "IQ2_S" - case fileTypeIQ4_XS: - return "IQ4_XS" - case fileTypeIQ2_M: - return "IQ2_M" - case fileTypeIQ1_M: - return "IQ1_M" - case fileTypeBF16: + case FileTypeBF16: return "BF16" default: return "unknown" } } -func (t fileType) Value() uint32 { +func (t FileType) Value() uint32 { return uint32(t) } + +func (ftype FileType) ToTensorType() TensorType { + switch ftype { + case FileTypeF32: + return TensorTypeF32 + case FileTypeF16: + return TensorTypeF16 + case fileTypeQ4_0: + return TensorTypeQ4_0 + case fileTypeQ4_1: + return TensorTypeQ4_1 + case FileTypeQ8_0: + return TensorTypeQ8_0 + case fileTypeQ5_0: + return TensorTypeQ5_0 + case fileTypeQ5_1: + return TensorTypeQ5_1 + case fileTypeQ2_K: + return TensorTypeQ2_K + case fileTypeQ3_K_S: + return TensorTypeQ3_K + case fileTypeQ3_K_M: + return TensorTypeQ3_K + case fileTypeQ3_K_L: + return TensorTypeQ3_K + case FileTypeQ4_K_S: + return TensorTypeQ4_K + case FileTypeQ4_K_M: + return TensorTypeQ4_K + case fileTypeQ5_K_S: + return TensorTypeQ5_K + case fileTypeQ5_K_M: + return TensorTypeQ5_K + case fileTypeQ6_K: + return TensorTypeQ6_K + case fileTypeQ2_K_S: + return TensorTypeQ2_K + case FileTypeBF16: + return TensorTypeBF16 + default: + slog.Warn("unsupported file type", "type", ftype) + return 0 // F32 + } +} + +// TensorType is equivalent to ggml_type for individual tensor types +// Note: these are not the same as FileType +type TensorType uint32 + +const ( + TensorTypeF32 TensorType = iota + TensorTypeF16 + TensorTypeQ4_0 + TensorTypeQ4_1 + tensorTypeQ4_2 // unused by GGML + tensorTypeQ4_3 // unused by GGML + TensorTypeQ5_0 + TensorTypeQ5_1 + TensorTypeQ8_0 + TensorTypeQ8_1 + TensorTypeQ2_K + TensorTypeQ3_K + TensorTypeQ4_K + TensorTypeQ5_K + TensorTypeQ6_K + TensorTypeQ8_K + tensorTypeIQ2_XXS // not supported by ollama + tensorTypeIQ2_XS // not supported by ollama + tensorTypeIQ3_XXS // not supported by ollama + tensorTypeIQ1_S // not supported by ollama + tensorTypeIQ4_NL // not supported by ollama + tensorTypeIQ3_S // not supported by ollama + tensorTypeIQ2_S // not supported by ollama + tensorTypeIQ4_XS // not supported by ollama + TensorTypeI8 + TensorTypeI16 + TensorTypeI32 + TensorTypeI64 + TensorTypeF64 + tensorTypeIQ1_M // not supported by ollama + TensorTypeBF16 + tensorTypeQ4_0_4_4 // unused by GGML + tensorTypeQ4_0_4_8 // unused by GGML + 
tensorTypeQ4_0_8_8 // unused by GGML + tensorTypeTQ1_0 // not supported by ollama + tensorTypeTQ2_0 // not supported by ollama + tensorTypeIQ4_NL_4_4 // unused by GGML + tensorTypeIQ4_NL_4_8 // unused by GGML + tensorTypeIQ4_NL_8_8 // unused by GGML +) + +// ParseFileType parses the provided GGUF file type +// Only Ollama supported types are considered valid +func ParseTensorType(s string) (TensorType, error) { + switch s { + case "F32": + return TensorTypeF32, nil + case "F16": + return TensorTypeF16, nil + case "Q4_0": + return TensorTypeQ4_0, nil + case "Q4_1": + return TensorTypeQ4_1, nil + case "Q5_0": + return TensorTypeQ5_0, nil + case "Q5_1": + return TensorTypeQ5_1, nil + case "Q8_0": + return TensorTypeQ8_0, nil + case "Q8_1": + return TensorTypeQ8_1, nil + case "Q2_K": + return TensorTypeQ2_K, nil + case "Q3_K": + return TensorTypeQ3_K, nil + case "Q4_K": + return TensorTypeQ4_K, nil + case "Q5_K": + return TensorTypeQ5_K, nil + case "Q6_K": + return TensorTypeQ6_K, nil + case "Q8_K": + return TensorTypeQ8_K, nil + case "F64": + return TensorTypeF64, nil + case "BF16": + return TensorTypeBF16, nil + default: + return 0, fmt.Errorf("unsupported quantization type %s", s) + } +} + +func (t TensorType) IsQuantized() bool { + switch t { + case TensorTypeF32, TensorTypeF16, TensorTypeBF16: + return false + default: + return true + } +} + +func (t TensorType) RowSize(ne uint64) uint64 { + return t.TypeSize() * ne / t.BlockSize() +} + +func (t TensorType) String() string { + switch t { + case TensorTypeF32: + return "F32" + case TensorTypeF16: + return "F16" + case TensorTypeQ4_0: + return "Q4_0" + case TensorTypeQ4_1: + return "Q4_1" + case TensorTypeQ5_0: + return "Q5_0" + case TensorTypeQ5_1: + return "Q5_1" + case TensorTypeQ8_0: + return "Q8_0" + case TensorTypeQ8_1: + return "Q8_1" + case TensorTypeQ2_K: + return "Q2_K" + case TensorTypeQ3_K: + return "Q3_K" + case TensorTypeQ4_K: + return "Q4_K" + case TensorTypeQ5_K: + return "Q5_K" + case TensorTypeQ6_K: + return "Q6_K" + case TensorTypeQ8_K: + return "Q8_K" + case TensorTypeF64: + return "F64" + case TensorTypeBF16: + return "BF16" + default: + return "unknown" + } +} diff --git a/integration/model_arch_test.go b/integration/model_arch_test.go index e094d3cea..6ce183d79 100644 --- a/integration/model_arch_test.go +++ b/integration/model_arch_test.go @@ -48,17 +48,6 @@ var ( } ) -func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) { - deadline, hasDeadline := t.Deadline() - if !hasDeadline { - return 8 * time.Minute, 10 * time.Minute - } else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 { - t.Skip("too little time") - return time.Duration(0), time.Duration(0) - } - return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second)) -} - func TestModelsGenerate(t *testing.T) { softTimeout, hardTimeout := getTimeouts(t) slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) diff --git a/integration/quantization_test.go b/integration/quantization_test.go new file mode 100644 index 000000000..af9da0b62 --- /dev/null +++ b/integration/quantization_test.go @@ -0,0 +1,130 @@ +//go:build integration && models + +package integration + +import ( + "bytes" + "context" + "fmt" + "log/slog" + "strings" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestQuantization(t *testing.T) { + sourceModels := []string{ + "qwen2.5:0.5b-instruct-fp16", + } + quantizations := []string{ + "Q8_0", + "Q4_K_S", + "Q4_K_M", + "Q4_K", + } + softTimeout, hardTimeout 
:= getTimeouts(t) + started := time.Now() + slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) + ctx, cancel := context.WithTimeout(context.Background(), hardTimeout) + defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + for _, base := range sourceModels { + if err := PullIfMissing(ctx, client, base); err != nil { + t.Fatalf("pull failed %s", err) + } + for _, quant := range quantizations { + newName := fmt.Sprintf("%s__%s", base, quant) + t.Run(newName, func(t *testing.T) { + if time.Now().Sub(started) > softTimeout { + t.Skip("skipping remaining tests to avoid excessive runtime") + } + req := &api.CreateRequest{ + Model: newName, + Quantization: quant, + From: base, + } + fn := func(resp api.ProgressResponse) error { + // fmt.Print(".") + return nil + } + t.Logf("quantizing: %s -> %s", base, quant) + if err := client.Create(ctx, req, fn); err != nil { + t.Fatalf("create failed %s", err) + } + defer func() { + req := &api.DeleteRequest{ + Model: newName, + } + t.Logf("deleting: %s -> %s", base, quant) + if err := client.Delete(ctx, req); err != nil { + t.Logf("failed to clean up %s: %s", req.Model, err) + } + }() + // Check metadata on the model + resp, err := client.Show(ctx, &api.ShowRequest{Name: newName}) + if err != nil { + t.Fatalf("unable to show model: %s", err) + } + if !strings.Contains(resp.Details.QuantizationLevel, quant) { + t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel) + } + + stream := true + genReq := api.GenerateRequest{ + Model: newName, + Prompt: "why is the sky blue?", + KeepAlive: &api.Duration{Duration: 3 * time.Second}, + Options: map[string]any{ + "seed": 42, + "temperature": 0.0, + }, + Stream: &stream, + } + t.Logf("verifying: %s -> %s", base, quant) + + // Some smaller quantizations can cause models to have poor quality + // or get stuck in repetition loops, so we stop as soon as we have any matches + anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"} + reqCtx, reqCancel := context.WithCancel(ctx) + atLeastOne := false + var buf bytes.Buffer + genfn := func(response api.GenerateResponse) error { + buf.Write([]byte(response.Response)) + fullResp := strings.ToLower(buf.String()) + for _, resp := range anyResp { + if strings.Contains(fullResp, resp) { + atLeastOne = true + t.Log(fullResp) + reqCancel() + break + } + } + return nil + } + + done := make(chan int) + var genErr error + go func() { + genErr = client.Generate(reqCtx, &genReq, genfn) + done <- 0 + }() + + select { + case <-done: + if genErr != nil && !atLeastOne { + t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for generate") + } + + t.Logf("passed") + + }) + } + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index e08806fca..19f4d1bf8 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -217,6 +217,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err) return } + defer fp.Close() data, err := io.ReadAll(fp) if err != nil { slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err) @@ -358,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) { } } } + +func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) { + deadline, hasDeadline 
:= t.Deadline() + if !hasDeadline { + return 8 * time.Minute, 10 * time.Minute + } else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 { + t.Skip("too little time") + return time.Duration(0), time.Duration(0) + } + return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second)) +} diff --git a/kvcache/causal.go b/kvcache/causal.go index ea07932cd..9bc1d5da2 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) { } } - return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells)) + return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize) } func (c *Causal) updateSlidingWindow() { diff --git a/llama/build-info.cpp b/llama/build-info.cpp index be908c364..afef6b85c 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,4 +1,4 @@ int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac"; +char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618"; char const *LLAMA_COMPILER = ""; char const *LLAMA_BUILD_TARGET = ""; diff --git a/llama/llama.cpp/.rsync-filter b/llama/llama.cpp/.rsync-filter index 9bc06684b..1f81b0075 100644 --- a/llama/llama.cpp/.rsync-filter +++ b/llama/llama.cpp/.rsync-filter @@ -10,11 +10,11 @@ include common/stb_image.* include include/ include include/llama.* include include/llama-*.* -include examples/ -include examples/llava/ -include examples/llava/clip.* -include examples/llava/clip-impl.* -include examples/llava/llava.* +include tools/ +include tools/mtmd/ +include tools/mtmd/clip.* +include tools/mtmd/clip-impl.* +include tools/mtmd/llava.* include src/ include src/llama.* include src/llama-*.* diff --git a/llama/llama.cpp/common/common.cpp b/llama/llama.cpp/common/common.cpp index 94f545f81..2b1d8da59 100644 --- a/llama/llama.cpp/common/common.cpp +++ b/llama/llama.cpp/common/common.cpp @@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_threads = params.cpuparams.n_threads; cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
params.cpuparams.n_threads : params.cpuparams_batch.n_threads; - cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_freq_base = params.rope_freq_base; @@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; + cparams.op_offload = !params.no_op_offload; if (params.reranking) { cparams.embeddings = true; @@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector & tokens, int64_t stride) { + const int64_t ne_datapoint = llama_n_ctx(ctx); + const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride; + ggml_opt_dataset_t result = ggml_opt_dataset_init( + GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1); + + llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data; + llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data; + + for (int64_t idata = 0; idata < ndata; ++idata) { + memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token)); + memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token)); + } + + return result; +} diff --git a/llama/llama.cpp/common/common.h b/llama/llama.cpp/common/common.h index e6eaa8e80..dea34267c 100644 --- a/llama/llama.cpp/common/common.h +++ b/llama/llama.cpp/common/common.h @@ -66,7 +66,6 @@ enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL, @@ -96,6 +95,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_PENALTIES = 10, + COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, }; // dimensionality reduction methods, used by cvector-generator @@ -161,6 +161,7 @@ struct common_params_sampling { std::vector samplers = { COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, + COMMON_SAMPLER_TYPE_TOP_N_SIGMA, COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TOP_P, @@ -323,7 +324,6 @@ struct common_params { bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation @@ -332,6 +332,7 @@ struct common_params { bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data + bool no_op_offload = false; // globally disable offload host tensor operations to device bool single_turn = false; // single turn chat conversation @@ -340,8 +341,10 @@ struct common_params { common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO; - // multimodal models (see examples/llava) + // multimodal models (see tools/mtmd) struct common_params_model mmproj; + bool mmproj_use_gpu = true; // use GPU for multimodal model + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding @@ -407,13 +410,14 @@ struct common_params { bool 
process_output = false; // collect data for the output tensor bool compute_ppl = true; // whether to compute perplexity + bool parse_special = false; // whether to parse special tokens during imatrix tokenization // cvector-generator params int n_pca_batch = 100; int n_pca_iterations = 1000; dimre_method cvector_dimre_method = DIMRE_METHOD_PCA; - std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; - std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + std::string cvector_positive_file = "tools/cvector-generator/positive.txt"; + std::string cvector_negative_file = "tools/cvector-generator/negative.txt"; bool spm_infill = false; // suffix/prefix/middle pattern for infill @@ -662,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } + +// +// training utils +// + +ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector & tokens, int64_t stride); diff --git a/llama/llama.cpp/common/json-schema-to-grammar.cpp b/llama/llama.cpp/common/json-schema-to-grammar.cpp index 56043678c..656b3ecaa 100644 --- a/llama/llama.cpp/common/json-schema-to-grammar.cpp +++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp @@ -16,6 +16,9 @@ using json = nlohmann::ordered_json; static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); + if (max_items == 0) { + return ""; + } if (min_items == 0 && max_items == 1) { return item_rule + "?"; } diff --git a/llama/llama.cpp/common/sampling.cpp b/llama/llama.cpp/common/sampling.cpp index 1735b6501..28705e24c 100644 --- a/llama/llama.cpp/common/sampling.cpp +++ b/llama/llama.cpp/common/sampling.cpp @@ -1,6 +1,7 @@ #include "sampling.h" #include "common.h" +#include "log.h" #include #include @@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.logit_bias.data())); if (params.mirostat == 0) { - if (params.top_n_sigma >= 0) { - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); - } else { - for (const auto & cnstr : params.samplers) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: - { - std::vector c_breakers; - c_breakers.reserve(params.dry_sequence_breakers.size()); - for (const auto & str : params.dry_sequence_breakers) { - c_breakers.push_back(str.c_str()); - } - - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + for (const auto & cnstr : params.samplers) { + switch (cnstr) { + case COMMON_SAMPLER_TYPE_DRY: + { + std::vector c_breakers; + c_breakers.reserve(params.dry_sequence_breakers.size()); + for (const auto & str : params.dry_sequence_breakers) { + c_breakers.push_back(str.c_str()); } - break; - case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - break; - case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_MIN_P: - 
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); - break; - case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); - break; - case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); - break; - case COMMON_SAMPLER_TYPE_PENALTIES: - llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); - break; - default: - GGML_ASSERT(false && "unknown sampler type"); - } + + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + } + break; + case COMMON_SAMPLER_TYPE_TOP_K: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + break; + case COMMON_SAMPLER_TYPE_TOP_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); + break; + case COMMON_SAMPLER_TYPE_MIN_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; + case COMMON_SAMPLER_TYPE_TYPICAL_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TEMPERATURE: + llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); + break; + case COMMON_SAMPLER_TYPE_PENALTIES: + llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + break; + default: + GGML_ASSERT(false && "unknown sampler type"); } } llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); @@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; + case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's'; case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_XTC: return 'x'; @@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; + case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma"; case 
COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_XTC: return "xtc"; @@ -504,6 +504,7 @@ std::vector common_sampler_types_from_names(const std::vect { "dry", COMMON_SAMPLER_TYPE_DRY }, { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, + { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA }, { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, @@ -517,6 +518,7 @@ std::vector common_sampler_types_from_names(const std::vect std::unordered_map sampler_alt_name_map { { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, { "top-p", COMMON_SAMPLER_TYPE_TOP_P }, + { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA }, { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P }, @@ -533,14 +535,16 @@ std::vector common_sampler_types_from_names(const std::vect auto sampler = sampler_canonical_name_map.find(name); if (sampler != sampler_canonical_name_map.end()) { samplers.push_back(sampler->second); - } else { - if (allow_alt_names) { - sampler = sampler_alt_name_map.find(name); - if (sampler != sampler_alt_name_map.end()) { - samplers.push_back(sampler->second); - } + continue; + } + if (allow_alt_names) { + sampler = sampler_alt_name_map.find(name); + if (sampler != sampler_alt_name_map.end()) { + samplers.push_back(sampler->second); + continue; } } + LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str()); } return samplers; @@ -552,6 +556,7 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, @@ -566,6 +571,8 @@ std::vector common_sampler_types_from_chars(const std::stri const auto sampler = sampler_name_map.find(c); if (sampler != sampler_name_map.end()) { samplers.push_back(sampler->second); + } else { + LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c); } } diff --git a/llama/llama.cpp/examples/llava/clip.cpp b/llama/llama.cpp/examples/llava/clip.cpp deleted file mode 100644 index d57b4bd6e..000000000 --- a/llama/llama.cpp/examples/llava/clip.cpp +++ /dev/null @@ -1,2927 +0,0 @@ -// NOTE: This is modified from clip.cpp only for LLaVA, -// so there might be still unnecessary artifacts hanging around -// I'll gradually clean and extend it -// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch -#include "clip.h" -#include "clip-impl.h" -#include "ggml.h" -#include "ggml-cpp.h" -#include "ggml-cpu.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "gguf.h" - -#define STB_IMAGE_IMPLEMENTATION -#include "stb_image.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - 
-#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include -#if __GLIBCXX__ -#include -#include -#include -#endif -#endif - -struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; - -//#define CLIP_DEBUG_FUNCTIONS - -#ifdef CLIP_DEBUG_FUNCTIONS -static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; - - // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { - // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); - } - - file.close(); -} - -static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data - int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; - int paddingAmount = (4 - (widthInBytes % 4)) % 4; - int stride = widthInBytes + paddingAmount; - - // Bitmap file header - unsigned char fileHeader[14] = { - 'B','M', // Signature - 0,0,0,0, // Image file size in bytes - 0,0,0,0, // Reserved - 54,0,0,0 // Start of pixel array - }; - - // Total file size - fileSize = 54 + (stride * img.ny); - fileHeader[2] = (unsigned char)(fileSize); - fileHeader[3] = (unsigned char)(fileSize >> 8); - fileHeader[4] = (unsigned char)(fileSize >> 16); - fileHeader[5] = (unsigned char)(fileSize >> 24); - - // Bitmap information header (BITMAPINFOHEADER) - unsigned char infoHeader[40] = { - 40,0,0,0, // Size of this header (40 bytes) - 0,0,0,0, // Image width - 0,0,0,0, // Image height - 1,0, // Number of color planes - 24,0, // Bits per pixel - 0,0,0,0, // No compression - 0,0,0,0, // Image size (can be 0 for no compression) - 0,0,0,0, // X pixels per meter (not specified) - 0,0,0,0, // Y pixels per meter (not specified) - 0,0,0,0, // Total colors (color table not used) - 0,0,0,0 // Important colors (all are important) - }; - - // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); - - // Write file headers - file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); - file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); - - // Pixel data - std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { - // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; - unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] - }; - file.write(reinterpret_cast(pixel), 3); - } - // Write padding for the row - file.write(reinterpret_cast(padding.data()), 
paddingAmount); - } - - file.close(); -} - -// debug function to convert f32 to u8 -static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); - } -} -#endif - - -// -// clip layers -// - -enum patch_merge_type { - PATCH_MERGE_FLAT, - PATCH_MERGE_SPATIAL_UNPAD, -}; - -struct clip_hparams { - int32_t image_size; - int32_t patch_size; - int32_t hidden_size; - int32_t n_intermediate; - int32_t projection_dim; - int32_t n_head; - int32_t n_layer; - - patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; - - float eps; - - std::vector image_grid_pinpoints; - int32_t image_crop_resolution; - std::unordered_set vision_feature_layer; -}; - -struct clip_layer { - // attention - struct ggml_tensor * k_w = nullptr; - struct ggml_tensor * k_b = nullptr; - struct ggml_tensor * q_w = nullptr; - struct ggml_tensor * q_b = nullptr; - struct ggml_tensor * v_w = nullptr; - struct ggml_tensor * v_b = nullptr; - - struct ggml_tensor * o_w = nullptr; - struct ggml_tensor * o_b = nullptr; - - // layernorm 1 - struct ggml_tensor * ln_1_w = nullptr; - struct ggml_tensor * ln_1_b = nullptr; - - // ff - struct ggml_tensor * ff_i_w = nullptr; - struct ggml_tensor * ff_i_b = nullptr; - - struct ggml_tensor * ff_o_w = nullptr; - struct ggml_tensor * ff_o_b = nullptr; - - // layernorm 2 - struct ggml_tensor * ln_2_w = nullptr; - struct ggml_tensor * ln_2_b = nullptr; -}; - -struct clip_vision_model { - struct clip_hparams hparams; - - // embeddings - struct ggml_tensor * class_embedding = nullptr; - struct ggml_tensor * patch_embeddings_0 = nullptr; - struct ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) - struct ggml_tensor * patch_bias = nullptr; - struct ggml_tensor * position_embeddings = nullptr; - - struct ggml_tensor * pre_ln_w = nullptr; - struct ggml_tensor * pre_ln_b = nullptr; - - std::vector layers; - - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; - - struct ggml_tensor * projection; - - // LLaVA projection - struct ggml_tensor * mm_0_w = nullptr; - struct ggml_tensor * mm_0_b = nullptr; - struct ggml_tensor * mm_2_w = nullptr; - struct ggml_tensor * mm_2_b = nullptr; - - struct ggml_tensor * image_newline = nullptr; - - // Yi type models with mlp+normalization projection - struct ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 - struct ggml_tensor * mm_1_b = nullptr; - struct ggml_tensor * mm_3_w = nullptr; - struct ggml_tensor * mm_3_b = nullptr; - struct ggml_tensor * mm_4_w = nullptr; - struct ggml_tensor * mm_4_b = nullptr; - - //GLMV-Edge projection - struct ggml_tensor * mm_model_adapter_conv_w = nullptr; - struct ggml_tensor * mm_model_adapter_conv_b = nullptr; - struct ggml_tensor * boi_w = nullptr; - struct ggml_tensor * eoi_w = nullptr; - - // MobileVLM projection - struct ggml_tensor * mm_model_mlp_1_w = nullptr; - struct ggml_tensor * mm_model_mlp_1_b = nullptr; - struct ggml_tensor * mm_model_mlp_3_w = nullptr; - struct ggml_tensor * mm_model_mlp_3_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; - struct ggml_tensor * 
- struct ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; - - // MobileVLM_V2 projection - struct ggml_tensor * mm_model_mlp_0_w = nullptr; - struct ggml_tensor * mm_model_mlp_0_b = nullptr; - struct ggml_tensor * mm_model_mlp_2_w = nullptr; - struct ggml_tensor * mm_model_mlp_2_b = nullptr; - struct ggml_tensor * mm_model_peg_0_w = nullptr; - struct ggml_tensor * mm_model_peg_0_b = nullptr; - - // MINICPMV projection - struct ggml_tensor * mm_model_pos_embed_k = nullptr; - struct ggml_tensor * mm_model_query = nullptr; - struct ggml_tensor * mm_model_proj = nullptr; - struct ggml_tensor * mm_model_kv_proj = nullptr; - struct ggml_tensor * mm_model_attn_q_w = nullptr; - struct ggml_tensor * mm_model_attn_q_b = nullptr; - struct ggml_tensor * mm_model_attn_k_w = nullptr; - struct ggml_tensor * mm_model_attn_k_b = nullptr; - struct ggml_tensor * mm_model_attn_v_w = nullptr; - struct ggml_tensor * mm_model_attn_v_b = nullptr; - struct ggml_tensor * mm_model_attn_o_w = nullptr; - struct ggml_tensor * mm_model_attn_o_b = nullptr; - struct ggml_tensor * mm_model_ln_q_w = nullptr; - struct ggml_tensor * mm_model_ln_q_b = nullptr; - struct ggml_tensor * mm_model_ln_kv_w = nullptr; - struct ggml_tensor * mm_model_ln_kv_b = nullptr; - struct ggml_tensor * mm_model_ln_post_w = nullptr; - struct ggml_tensor * mm_model_ln_post_b = nullptr; - - // gemma3 - struct ggml_tensor * mm_input_proj_w = nullptr; - struct ggml_tensor * mm_soft_emb_norm_w = nullptr; -}; - -struct clip_ctx { - bool has_text_encoder = false; - bool has_vision_encoder = false; - bool has_llava_projector = false; - bool has_minicpmv_projector = false; - bool has_glm_projector = false; - bool has_qwen2vl_merger = false; - int minicpmv_version = 2; - - struct clip_vision_model vision_model; - projector_type proj_type = PROJECTOR_TYPE_MLP; - - int32_t max_feature_layer; // unused in newer models like gemma3 - float image_mean[3]; - float image_std[3]; - bool use_gelu = false; - bool use_silu = false; - - gguf_context_ptr ctx_gguf; - ggml_context_ptr ctx_data; - - std::vector<uint8_t> buf_compute_meta; - - std::vector<ggml_backend_t> backend_ptrs; - std::vector<ggml_backend_buffer_type_t> backend_buft; - - ggml_backend_t backend; - ggml_backend_t backend_cpu; - ggml_backend_buffer_ptr buf; - - ggml_backend_sched_ptr sched; - - clip_image_size load_image_size; - - clip_ctx(clip_context_params & ctx_params) { - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - backend = ctx_params.use_gpu - ? 
ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) - : nullptr; - - if (backend) { - LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); - backend_ptrs.push_back(backend); - backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); - } else { - backend = backend_cpu; - LOG_INF("%s: CLIP using CPU backend\n", __func__); - } - - backend_ptrs.push_back(backend_cpu); - backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); - - sched.reset( - ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false) - ); - } - - ~clip_ctx() { - ggml_backend_free(backend); - if (backend != backend_cpu) { - ggml_backend_free(backend_cpu); - } - } -}; - -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 - - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx0_ptr(ggml_init(params)); - auto ctx0 = ctx0_ptr.get(); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - // input raw - struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - inp = ggml_add(ctx0, inp, model.patch_bias); - - // position embeddings - struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings); - - // loop over layers - for (int il = 0; il < n_layer; il++) { - struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states - - // layernorm1 - { - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b); - } - - // self-attention - { - - struct ggml_tensor * Q = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - - Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - - struct ggml_tensor * K = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); - - K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - - struct ggml_tensor * V = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); - - V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - - struct ggml_tensor 
* KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); - } - - // attention output - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 - { - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); - - // siglip uses gelu - cur = ggml_gelu(ctx0, cur); - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // post-layernorm - if (model.post_ln_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); - } - - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - const int batch_size = 1; - const int mm_tokens_per_image = 256; // default value for gemma3 - const int tokens_per_side = sqrt(mm_tokens_per_image); - const int patches_per_image = sqrt(num_patches); - const int kernel_size = patches_per_image / tokens_per_side; - - embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); - embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size); - - // doing a pool2d to reduce the number of output tokens to 256 - embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); - embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size); - embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); - - // apply norm before projection - embeddings = ggml_rms_norm(ctx0, embeddings, eps); - embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w); - - // apply projection - embeddings = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - embeddings); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; -} - -static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); - return nullptr; - } - - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { - LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); - image_size_width = load_image_size.width; - image_size_height = load_image_size.height; - if (is_inf) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } - } - else if (ctx->has_qwen2vl_merger) { - // use the image's native resolution when image is avaible - if (is_inf) { - // if (imgs->data->nx && imgs->data->ny) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } 
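// Rough sizing sketch for the merger path (numbers illustrative): the
// asserts below require width and height to be multiples of patch_size * 2,
// and the 2x2 merge later packs four neighbouring patch embeddings into one
// token (hidden_size * 4 wide, num_positions / 4 tokens). E.g. a 448x448
// input with patch_size = 14 gives 32 * 32 = 1024 patches and 256 merged
// tokens.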
- } - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int patches_w = image_size_width / patch_size; - const int patches_h = image_size_height / patch_size; - const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const float eps = hparams.eps; - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - const int batch_size = imgs.entries.size(); - - if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { - GGML_ASSERT(batch_size == 1); - } - - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx0_ptr(ggml_init(params)); - auto ctx0 = ctx0_ptr.get(); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - if (ctx->has_qwen2vl_merger) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); - GGML_ASSERT(image_size_height % (patch_size * 2) == 0); - - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, patches_h, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); - inp = ggml_reshape_3d( - ctx0, inp, - hidden_size, patches_w * patches_h, batch_size); - } - else { - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - } - - if (model.patch_bias) { - // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); - inp = ggml_add(ctx0, inp, model.patch_bias); - } - struct ggml_tensor * embeddings = inp; - struct ggml_tensor * pos_embed = nullptr; - - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - } - - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding - embeddings = - ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); - } - - if (ctx->has_minicpmv_projector) { - int pos_w = 
image_size_width/patch_size; - int pos_h = image_size_height/patch_size; - if (ctx->minicpmv_version == 2) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 3) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 4) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - ggml_set_name(pos_embed, "pos_embed"); - ggml_set_input(pos_embed); - } - - // pre-layernorm - if (model.pre_ln_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "pre_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); - } - - std::vector<struct ggml_tensor *> embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; - - // loop over layers - for (int il = 0; il < ctx->max_feature_layer; il++) { - struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states - - // If this is an embedding feature layer, save the output. - // NOTE: 0 index here refers to the input to the encoder. - if (vision_feature_layer.find(il) != vision_feature_layer.end()) { - embedding_stack.push_back(embeddings); - } - - //const size_t nb_q_w = model.layers[il].q_w->nb[0]; - - // layernorm1 - { - cur = ggml_norm(ctx0, cur, eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), - model.layers[il].ln_1_b); - } - - // self-attention - { - - struct ggml_tensor * Q = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { - Q = ggml_rope_multi( - ctx0, Q, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - } - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * K = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { - K = ggml_rope_multi( - ctx0, K, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - } - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * V = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); - } - - // attention output - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 - { - cur = ggml_norm(ctx0, cur, eps); - - cur = 
ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); - - if (ctx->use_gelu) { - cur = ggml_gelu_inplace(ctx0, cur); - } else if (ctx->use_silu) { - cur = ggml_silu_inplace(ctx0, cur); - } else { - cur = ggml_gelu_quick_inplace(ctx0, cur); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // post-layernorm - if (model.post_ln_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); - } - - // final layer is a vision feature layer - if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) { - embedding_stack.push_back(embeddings); - } - - // If feature layers are explicitly set, stack them (if we have multiple) - if (!embedding_stack.empty()) { - embeddings = embedding_stack[0]; - for (size_t i = 1; i < embedding_stack.size(); i++) { - embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); - } - } - - // llava projector - if (ctx->has_llava_projector) { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - // print_tensor_info(embeddings, "embeddings"); - - // llava projector - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - embeddings = ggml_gelu(ctx0, embeddings); - if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } - } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); - } - else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - // MobileVLM projector - int n_patch = 24; - struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); - mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); - mlp_1 = ggml_gelu(ctx0, mlp_1); - struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); - mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] - - // block 1 - struct ggml_tensor * block_1 = nullptr; - { - // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_cont(ctx0, 
ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); - // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); - - // layer norm - // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // hardswish - struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // residual - block_1 = ggml_add(ctx0, mlp_3, block_1); - } - - // block_2 - { - // stride = 2 - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); - - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // layer norm - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // hardswish - struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, 
block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - - // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); - block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); - // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } - embeddings = block_1; - } - else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) - { - int n_patch = 24; - struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; - } - else { - GGML_ABORT("fatal error"); - } - } - // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, 
model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); - } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); - } - - { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - } - else { - GGML_ASSERT(false); - } - } - // glm projector - else if (ctx->has_glm_projector) { - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - //GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); 
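// The three ops that follow are effectively a SwiGLU-style gate: with
// G = mm_model_mlp_2_w (TN_GLM_ADAPTER_GATE), U = mm_model_mlp_1_w
// (TN_GLM_ADAPTER_D_H_2_4H) and D = mm_model_mlp_3_w (TN_GLM_ADAPTER_D_4H_2_H),
// they compute D * (silu(G * e) * (U * e)), i.e. the gate branch goes
// through SiLU and is multiplied elementwise into the up-projection
// before the down-projection.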
- embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } else { - GGML_ABORT("fatal error"); - } - } - else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); - - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; -} - -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return clip_image_build_graph_siglip(ctx, imgs); - } else { - // TODO: we should have one build_* function per model - return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); - } -} - -struct clip_model_loader { - ggml_context_ptr ctx_meta; - gguf_context_ptr ctx_gguf; - - clip_ctx & ctx_clip; - std::string fname; - - size_t model_size; // in bytes - - // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model - clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { - struct ggml_context * meta = nullptr; - - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; - - ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params)); - if (!ctx_gguf.get()) { - throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. 
Does this file exist?\n", __func__, fname)); - } - - ctx_meta.reset(meta); - - const int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); - - // print gguf info - { - std::string name; - get_string(KEY_NAME, name, false); - std::string description; - get_string(KEY_DESCRIPTION, description, false); - LOG_INF("%s: model name: %s\n", __func__, name.c_str()); - LOG_INF("%s: description: %s\n", __func__, description.c_str()); - LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get())); - LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get())); - LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get())); - LOG_INF("\n"); - } - - // tensors - { - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); - const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i); - enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i); - struct ggml_tensor * cur = ggml_get_tensor(meta, name); - size_t tensor_size = ggml_nbytes(cur); - model_size += tensor_size; - LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", - __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); - } - } - } - - void load_hparams() { - // projector type - { - std::string proj_type; - get_string(KEY_PROJ_TYPE, proj_type, false); - if (!proj_type.empty()) { - ctx_clip.proj_type = clip_projector_type_from_string(proj_type); - } - if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) { - throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str())); - } - } - - // other hparams - { - get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false); - get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false); - GGML_ASSERT(ctx_clip.has_vision_encoder); - GGML_ASSERT(!ctx_clip.has_text_encoder); - - // legacy keys, use KEY_PROJ_TYPE instead - get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false); - get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false); - get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); - get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false); - get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false); - // !!! 
do NOT extend the list above, use KEY_PROJ_TYPE instead - - get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); - get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - - auto & hparams = ctx_clip.vision_model.hparams; - get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); - get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); - get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); - get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer); - get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim); - get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps); - get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); - get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); - - { - std::string mm_patch_merge_type; - get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); - if (mm_patch_merge_type == "spatial_unpad") { - hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD; - } - } - - { - int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN); - int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD); - GGML_ASSERT(idx_mean >= 0 && "image_mean not found"); - GGML_ASSERT(idx_std >= 0 && "image_std not found"); - const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean); - const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std); - for (int i = 0; i < 3; ++i) { - ctx_clip.image_mean[i] = mean_data[i]; - ctx_clip.image_std[i] = std_data[i]; - } - } - - // Load the vision feature layer indices if they are explicitly provided; - // if multiple vision feature layers are present, the values will be concatenated - // to form the final visual features. - // NOTE: gguf conversions should standardize the values of the vision feature layer to - // be non-negative, since we use -1 to mark values as unset here. 
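// Worked example (values illustrative): with vision_feature_layer = [3, 7, 15, 23],
// the loop below copies the indices into an unordered_set,
// get_deepest_feature_layer() then resolves the deepest encoder layer that
// actually has to run, and the graph builder concatenates the hidden state
// saved at each requested index (index 0 being the encoder input) to form
// the final visual features.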
- std::vector<int> vision_feature_layer; - get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false); - // convert std::vector to std::unordered_set - for (auto & layer : vision_feature_layer) { - hparams.vision_feature_layer.insert(layer); - } - // Calculate the deepest feature layer based on hparams and projector type - ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip); - - LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder); - LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder); - LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector); - LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector); - LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); - LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector); - LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); - } - } - - void load_tensors() { - std::map<std::string, size_t> tensor_offset; - std::vector<ggml_tensor *> tensors_to_load; - - // get offsets - for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); - tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); - } - - // create data context - struct ggml_init_params params = { - /*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ctx_clip.ctx_data.reset(ggml_init(params)); - if (!ctx_clip.ctx_data) { - throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__)); - } - - // helper function - auto get_tensor = [&](const std::string & name, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str()); - if (!cur && required) { - throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str())); - } - if (cur) { - tensors_to_load.push_back(cur); - // add tensors to context - struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); - ggml_set_name(data_tensor, cur->name); - cur = data_tensor; - } - return cur; - }; - - auto & vision_model = ctx_clip.vision_model; - - vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false); - - vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false); - vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false); - - vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false); - vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false); - - vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); - vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); - vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - if (vision_model.patch_embeddings_1 == nullptr) { - ctx_clip.has_qwen2vl_merger = false; - } - - vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); - - // layers - vision_model.layers.resize(vision_model.hparams.n_layer); - for (int il = 0; il < vision_model.hparams.n_layer; ++il) { - auto & layer = vision_model.layers[il]; - layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight")); - layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight")); - layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", 
il, "weight")); - layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); - layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); - layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); - layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); - layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); - layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); - layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); - layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); - layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false); - layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); - layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); - layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); - layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); - } - - switch (ctx_clip.proj_type) { - case PROJECTOR_TYPE_MLP: - case PROJECTOR_TYPE_MLP_NORM: - { - // LLaVA projection - vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false); - vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); - // Yi-type llava - vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); - // missing in Yi-type llava - vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false); - vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); - // Yi-type llava - vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false); - vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false); - vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false); - vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false); - if (vision_model.mm_3_w) { - // TODO: this is a hack to support Yi-type llava - ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM; - } - vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); - } break; - case PROJECTOR_TYPE_LDP: - { - // MobileVLM projection - vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); - vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); - vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); - vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); - vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); - vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); - vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); - vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); - vision_model.mm_model_block_1_block_2_0_w = 
get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); - vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); - vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); - vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); - vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); - vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); - vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); - vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); - vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); - vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); - vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); - vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); - } break; - case PROJECTOR_TYPE_LDPV2: - { - // MobilVLM_V2 projection - vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); - vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); - vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); - vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); - vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); - vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); - } break; - case PROJECTOR_TYPE_RESAMPLER: - { - // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); - vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); - vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); - vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); - vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); - vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); - vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); - vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); - vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); - vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias")); - vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); - vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); - vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); - vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); - vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); - vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); - vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); - 
vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); - vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); - } break; - case PROJECTOR_TYPE_GLM_EDGE: - { - vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); - vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); - vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight")); - vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight")); - vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias")); - vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); - vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); - vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); - vision_model.boi_w = get_tensor(TN_GLM_BOI_W); - vision_model.eoi_w = get_tensor(TN_GLM_EOI_W); - } break; - case PROJECTOR_TYPE_MERGER: - { - vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); - vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); - } break; - case PROJECTOR_TYPE_GEMMA3: - { - vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); - vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); - } break; - default: - GGML_ASSERT(false && "unknown projector type"); - } - - // load data - { - std::vector<uint8_t> read_buf; - -#ifdef _WIN32 - int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); - if (!wlen) { - throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__)); - } - wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); - wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen); - if (!wlen) { - free(wbuf); - throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__)); - } -#if __GLIBCXX__ - int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); - __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in); - std::istream fin(&buffer); -#else // MSVC - // unused in our current build - auto fin = std::ifstream(wbuf, std::ios::binary); -#endif - free(wbuf); -#else - auto fin = std::ifstream(fname, std::ios::binary); -#endif - if (!fin) { - throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); - } - - // alloc memory and offload data - ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); - ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); - ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - for (auto & t : tensors_to_load) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); - const size_t offset = tensor_offset[t->name]; - fin.seekg(offset, std::ios::beg); - if (!fin) { - throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); - } - size_t num_bytes = ggml_nbytes(cur); - if (ggml_backend_buft_is_host(buft)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast<char *>(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory
- read_buf.resize(num_bytes); - fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } -#if defined(_WIN32) && defined(__GLIBCXX__) - close(fd); -#else - fin.close(); -#endif - - LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); - } - } - - void alloc_compute_meta() { - ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - - // create a fake batch - clip_image_f32_batch batch; - clip_image_f32_ptr img(clip_image_f32_init()); - clip_image_size image_size; - image_size.width = clip_get_image_size(&ctx_clip); - image_size.height = clip_get_image_size(&ctx_clip); - int n_patches = clip_get_image_size(&ctx_clip) / image_size.width; - img->nx = n_patches; - img->ny = n_patches; - img->buf.resize(n_patches * image_size.width * image_size.height * 3); - batch.entries.push_back(std::move(img)); - - ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); - ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); - for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { - ggml_backend_t backend = ctx_clip.backend_ptrs[i]; - ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend); - if (size > 1) { - LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - } - - void get_bool(const std::string & key, bool & output, bool required = true) { - const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (i < 0) { - if (required) throw std::runtime_error("Key not found: " + key); - return; - } - output = gguf_get_val_bool(ctx_gguf.get(), i); - } - - void get_i32(const std::string & key, int & output, bool required = true) { - const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (i < 0) { - if (required) throw std::runtime_error("Key not found: " + key); - return; - } - output = gguf_get_val_i32(ctx_gguf.get(), i); - } - - void get_u32(const std::string & key, int & output, bool required = true) { - const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (i < 0) { - if (required) throw std::runtime_error("Key not found: " + key); - return; - } - output = gguf_get_val_u32(ctx_gguf.get(), i); - } - - void get_f32(const std::string & key, float & output, bool required = true) { - const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (i < 0) { - if (required) throw std::runtime_error("Key not found: " + key); - return; - } - output = gguf_get_val_f32(ctx_gguf.get(), i); - } - - void get_string(const std::string & key, std::string & output, bool required = true) { - const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (i < 0) { - if (required) throw std::runtime_error("Key not found: " + key); - return; - } - output = std::string(gguf_get_val_str(ctx_gguf.get(), i)); - } - - void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) { - const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (i < 0) { - if (required) throw std::runtime_error("Key not found: " + key); - return; - } - int n = gguf_get_arr_n(ctx_gguf.get(), i); - output.resize(n); - const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i); - for (int i = 0; i < n; ++i) { - output[i] = values[i]; - } - } -}; - -// read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity) { - return clip_init(fname, clip_context_params{ - /* use_gpu */ true, - /* verbosity */ static_cast<ggml_log_level>(verbosity), - }); -} - -struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) { - g_logger_state.verbosity_thold = ctx_params.verbosity; - clip_ctx * ctx_clip = new clip_ctx(ctx_params); - - try { - clip_model_loader loader(fname, *ctx_clip); - loader.load_hparams(); - loader.load_tensors(); - loader.alloc_compute_meta(); - } catch (const std::exception & e) { - LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what()); - delete ctx_clip; - return nullptr; - } - - return ctx_clip; -} - -void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { - ctx_clip->load_image_size = *load_image_size; // copy -} - -struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { - return &ctx_clip->load_image_size; -} - -struct clip_image_size * clip_image_size_init() { - struct clip_image_size * load_image_size = new struct clip_image_size(); - load_image_size->width = 448; - load_image_size->height = 448; - return load_image_size; -} - -struct clip_image_u8 * clip_image_u8_init() { - return new clip_image_u8(); -} - -struct clip_image_f32 * clip_image_f32_init() { - return new clip_image_f32(); -} - -struct clip_image_f32_batch * clip_image_f32_batch_init() { - return new clip_image_f32_batch(); -} - -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { - if (nx) *nx = img->nx; - if (ny) *ny = img->ny; - return img->buf.data(); -} - -void clip_image_size_free(struct clip_image_size * load_image_size) { - if (load_image_size == nullptr) { - return; - } - delete load_image_size; -} -void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; } -void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; } -void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; } - -size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { - return batch->entries.size(); -} - -size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) { - if (idx < 0 || idx >= (int)batch->entries.size()) { - LOG_ERR("%s: invalid index %d\n", __func__, idx); - return 0; - } - return batch->entries[idx]->nx; -} - -size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { - if (idx < 0 || idx >= (int)batch->entries.size()) { - LOG_ERR("%s: invalid index %d\n", __func__, idx); - return 0; - } - return batch->entries[idx]->ny; -} - -clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { - if (idx < 0 || idx >= (int)batch->entries.size()) { - LOG_ERR("%s: invalid index %d\n", __func__, idx); - return nullptr; - } - return batch->entries[idx].get(); -} - -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { - img->nx = nx; - img->ny = ny; - img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), rgb_pixels, img->buf.size()); -} - -bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load(fname, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); - return false; - }
} - clip_build_img_from_pixels(data, nx, ny, img); - stbi_image_free(data); - return true; -} - -bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to decode image bytes\n", __func__); - return false; - } - clip_build_img_from_pixels(data, nx, ny, img); - stbi_image_free(data); - return true; -} - -// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not -static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - // TODO @ngxson : seems like this could be done more efficiently on cgraph - for (size_t i = 0; i < src.buf.size(); ++i) { - int c = i % 3; // rgb - dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c]; - } -} - -// set of tools to manupulate images -// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv -struct image_manipulation { - // Bilinear resize function - static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float x_ratio = static_cast<float>(src.nx - 1) / target_width; - float y_ratio = static_cast<float>(src.ny - 1) / target_height; - - for (int y = 0; y < target_height; y++) { - for (int x = 0; x < target_width; x++) { - float px = x_ratio * x; - float py = y_ratio * y; - int x_floor = static_cast<int>(px); - int y_floor = static_cast<int>(py); - float x_lerp = px - x_floor; - float y_lerp = py - y_floor; - - for (int c = 0; c < 3; c++) { - float top = lerp( - static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), - static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), - x_lerp - ); - float bottom = lerp( - static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), - static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), - x_lerp - ); - dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp)); - } - } - } - } - - // Bicubic resize function - // part of image will be cropped if the aspect ratio is different - static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { - const int nx = img.nx; - const int ny = img.ny; - - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float Cc; - float C[5]; - float d0, d2, d3, a0, a1, a2, a3; - int i, j, k, jj; - int x, y; - float dx, dy; - float tx, ty; - - tx = (float)nx / (float)target_width; - ty = (float)ny / (float)target_height; - - // Bicubic interpolation; adapted from ViT.cpp, inspired from : - // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 - // -> https://en.wikipedia.org/wiki/Bicubic_interpolation - - for (i = 0; i < target_height; i++) { - for (j = 0; j < target_width; j++) { - x = (int)(tx * j); - y = (int)(ty * i); - - dx = tx * j - x; - dy = ty * i - y; - - for (k = 0; k < 3; k++) { - for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0,
nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - - C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; - - d0 = C[0] - C[1]; - d2 = C[2] - C[1]; - d3 = C[3] - C[1]; - a0 = C[1]; - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; - - const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); - } - } - } - } - - return true; - } - - // llava-1.6 type of resize_and_pad - // if the ratio is not 1:1, padding with pad_color will be applied - // pad_color is single channel, default is 0 (black) - static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) { - int target_width = target_resolution.width; - int target_height = target_resolution.height; - - float scale_w = static_cast<float>(target_width) / image.nx; - float scale_h = static_cast<float>(target_height) / image.ny; - - int new_width, new_height; - - if (scale_w < scale_h) { - new_width = target_width; - new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height); - } else { - new_height = target_height; - new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width); - } - - clip_image_u8 resized_image; - bicubic_resize(image, resized_image, new_width, new_height); - - clip_image_u8 padded_image; - padded_image.nx = target_width; - padded_image.ny = target_height; - padded_image.buf.resize(3 * target_width * target_height); - - // Fill the padded image with the fill color - for (size_t i = 0; i < padded_image.buf.size(); i += 3) { - padded_image.buf[i] = pad_color[0]; - padded_image.buf[i + 1] = pad_color[1]; - padded_image.buf[i + 2] = pad_color[2]; - } - - // Calculate padding offsets - int pad_x = (target_width - new_width) / 2; - int pad_y = (target_height - new_height) / 2; - - // Copy the resized image into the center of the padded buffer - for (int y = 0; y < new_height; ++y) { - for (int x = 0; x < new_width; ++x) { - for (int c = 0; c < 3; ++c) { - padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; - } - } - } - dst = std::move(padded_image); - } - - static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; - } - } - } - -private: - static inline int clip(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); - } - - // Linear interpolation between two points - static inline float lerp(float s, float e, float t) { - return s + (e - s) *
t; - } -}; - -/** - * implementation of LLaVA-UHD: - * - https://arxiv.org/pdf/2403.11703 - * - https://github.com/thunlp/LLaVA-UHD - * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 - * - * overview: - * - an image always have a single overview (downscaled image) - * - an image can have 0 or multiple slices, depending on the image size - * - each slice can then be considered as a separate image - * - * for example: - * - * [overview] --> [slice 1] --> [slice 2] - * | | - * +--> [slice 3] --> [slice 4] - */ -struct llava_uhd { - struct slice_coordinates { - int x; - int y; - clip_image_size size; - }; - - struct slice_instructions { - clip_image_size overview_size; // size of downscaled image - clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) - clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices - std::vector slices; - bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6) - }; - - static int get_max_slices(struct clip_ctx * ctx) { - if (clip_is_minicpmv(ctx)) { - return 9; - } - return 0; - } - - static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { - slice_instructions res; - const int patch_size = clip_get_patch_size(ctx); - const int slice_size = clip_get_image_size(ctx); - const int max_slice_nums = get_max_slices(ctx); - const int original_width = original_size.width; - const int original_height = original_size.height; - const float log_ratio = log((float)original_width / original_height); - const float ratio = (float)original_width * original_height / (slice_size * slice_size); - const int multiple = fmin(ceil(ratio), max_slice_nums); - const bool has_slices = (multiple > 1); - const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty(); - - if (has_pinpoints) { - // has pinpoints, use them to calculate the grid size (e.g. llava-1.6) - auto refine_size = llava_uhd::select_best_resolution( - ctx->vision_model.hparams.image_grid_pinpoints, - original_size); - res.overview_size = clip_image_size{slice_size, slice_size}; - res.refined_size = refine_size; - res.grid_size = clip_image_size{0, 0}; - res.padding_refined = true; - - for (int y = 0; y < refine_size.height; y += slice_size) { - for (int x = 0; x < refine_size.width; x += slice_size) { - slice_coordinates slice; - slice.x = x; - slice.y = y; - slice.size.width = std::min(slice_size, refine_size.width - x); - slice.size.height = std::min(slice_size, refine_size.height - y); - res.slices.push_back(slice); - if (x == 0) { - res.grid_size.width++; - } - } - res.grid_size.height++; - } - - return res; - } - - // no pinpoints, dynamically calculate the grid size (e.g. 
minicpmv) - - auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices); - res.overview_size = best_size; - - if (!has_slices) { - // skip slicing logic - res.refined_size = clip_image_size{0, 0}; - res.grid_size = clip_image_size{0, 0}; - - } else { - auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); - auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); - res.grid_size = best_grid; - res.refined_size = refine_size; - - int width = refine_size.width; - int height = refine_size.height; - int grid_x = int(width / best_grid.width); - int grid_y = int(height / best_grid.height); - for (int patches_y = 0, ic = 0; - patches_y < refine_size.height && ic < best_grid.height; - patches_y += grid_y, ic += 1) { - for (int patches_x = 0, jc = 0; - patches_x < refine_size.width && jc < best_grid.width; - patches_x += grid_x, jc += 1) { - slice_coordinates slice; - slice.x = patches_x; - slice.y = patches_y; - slice.size.width = grid_x; - slice.size.height = grid_y; - res.slices.push_back(slice); - // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y); - } - } - } - - return res; - } - - static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { - std::vector output; - - // resize to overview size - clip_image_u8_ptr resized_img(clip_image_u8_init()); - image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height); - output.push_back(std::move(resized_img)); - if (inst.slices.empty()) { - // no slices, just return the resized image - return output; - } - - // resize to refined size - clip_image_u8_ptr refined_img(clip_image_u8_init()); - if (inst.padding_refined) { - image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size); - } else { - image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height); - } - - // create slices - for (const auto & slice : inst.slices) { - int x = slice.x; - int y = slice.y; - int w = slice.size.width; - int h = slice.size.height; - - clip_image_u8_ptr img_slice(clip_image_u8_init()); - image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h); - output.push_back(std::move(img_slice)); - } - - return output; - } - -private: - static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.width; - int height = original_size.height; - if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { - float r = static_cast(width) / height; - height = static_cast(scale_resolution / std::sqrt(r)); - width = static_cast(height * r); - } - clip_image_size res; - res.width = ensure_divide(width, patch_size); - res.height = ensure_divide(height, patch_size); - return res; - } - - /** - * Selects the best resolution from a list of possible resolutions based on the original size. 
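To make the selection rule below concrete (numbers invented for illustration): for an 800x600 input and candidate resolutions 672x672 and 1344x336, the fitting scales are min(672/800, 672/600) = 0.84 and min(1344/800, 336/600) = 0.56, giving downscaled areas of 672x504 = 338,688 and 448x336 = 150,528 pixels. Both are below the original 480,000, so they are also the effective resolutions, and 672x672 wins on higher effective resolution; the wasted-pixel count only breaks ties.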
- * - * @param original_size The original size of the image - * @param possible_resolutions A list of possible resolutions - * @return The best fit resolution - */ - static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) { - int original_width = original_size.width; - int original_height = original_size.height; - clip_image_size best_fit; - int max_effective_resolution = 0; - int min_wasted_resolution = std::numeric_limits<int>::max(); - - for (const auto & resolution : possible_resolutions) { - int width = resolution.width; - int height = resolution.height; - float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); - int downscaled_width = static_cast<int>(original_width * scale); - int downscaled_height = static_cast<int>(original_height * scale); - int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); - int wasted_resolution = (width * height) - effective_resolution; - // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); - if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { - max_effective_resolution = effective_resolution; - min_wasted_resolution = wasted_resolution; - best_fit = resolution; - } - } - - return best_fit; - } - - // used by llava 1.6 with custom list of pinpoints - static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) { - std::vector<clip_image_size> possible_resolutions; - for (size_t i = 0; i < pinpoints.size(); i += 2) { - possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]}); - } - return select_best_resolution(original_size, possible_resolutions); - } - - static int ensure_divide(int length, int patch_size) { - return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size); - } - - static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.width; - int height = original_size.height; - int grid_x = grid.width; - int grid_y = grid.height; - - int refine_width = ensure_divide(width, grid_x); - int refine_height = ensure_divide(height, grid_y); - - clip_image_size grid_size; - grid_size.width = refine_width / grid_x; - grid_size.height = refine_height / grid_y; - - auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale); - int best_grid_width = best_grid_size.width; - int best_grid_height = best_grid_size.height; - - clip_image_size refine_size; - refine_size.width = best_grid_width * grid_x; - refine_size.height = best_grid_height * grid_y; - return refine_size; - } - - static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { - std::vector<int> candidate_split_grids_nums; - for (int i : {multiple - 1, multiple, multiple + 1}) { - if (i == 1 || i > max_slice_nums) { - continue; - } - candidate_split_grids_nums.push_back(i); - } - - std::vector<clip_image_size> candidate_grids; - for (int split_grids_nums : candidate_split_grids_nums) { - int m = 1; - while (m <= split_grids_nums) { - if (split_grids_nums % m == 0) { - candidate_grids.push_back(clip_image_size{m,
split_grids_nums / m}); - } - ++m; - } - } - - clip_image_size best_grid{1, 1}; - float min_error = std::numeric_limits::infinity(); - for (const auto& grid : candidate_grids) { - float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); - if (error < min_error) { - best_grid = grid; - min_error = error; - } - } - return best_grid; - } -}; - -// TODO @ngxson : decprecate the load_image_size singleton pattern -int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { - const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size); - return inst.grid_size.width; -} - -// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector -// res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - - clip_image_size original_size{img->nx, img->ny}; - bool pad_to_square = true; - auto & params = ctx->vision_model.hparams; - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { - pad_to_square = false; - } - - if (clip_is_minicpmv(ctx)) { - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); - res_imgs->entries.push_back(std::move(res)); - } - return true; - } - else if (ctx->has_qwen2vl_merger) { - clip_image_u8 resized; - auto patch_size = clip_get_patch_size(ctx) * 2; - int nx = ceil((float)img->nx / patch_size) * patch_size; - int ny = ceil((float)img->ny / patch_size) * patch_size; - image_manipulation::bicubic_resize(*img, resized, nx, ny); - - clip_image_f32_ptr img_f32(clip_image_f32_init()); - // clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std); - // res_imgs->data[0] = *res; - res_imgs->entries.push_back(std::move(img_f32)); - return true; - } - - if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - clip_image_u8 resized_image; - int sz = params.image_size; - image_manipulation::bicubic_resize(*img, resized_image, sz, sz); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - } - - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - - clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily - - if (pad_to_square) { - // for llava-1.5, we resize image to a square, and pad the shorter side with a background color - // see 
https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - const int longer_side = std::max(img->nx, img->ny); - temp->nx = longer_side; - temp->ny = longer_side; - temp->buf.resize(3 * longer_side * longer_side); - - // background color in RGB from LLaVA (this is the mean rgb color * 255) - const std::array pad_color = {122, 116, 104}; - - // resize the image to the target_size - image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color); - - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); - res_imgs->entries.push_back(std::move(res)); - return true; - - } else if (!params.image_grid_pinpoints.empty()) { - // "spatial_unpad" with "anyres" processing for llava-1.6 - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); - res_imgs->entries.push_back(std::move(res)); - } - - return true; - - } - - GGML_ASSERT(false && "Unknown image preprocessing type"); -} - -ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->vision_model.image_newline; -} - -void clip_free(clip_ctx * ctx) { - if (ctx == nullptr) { - return; - } - delete ctx; -} - -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - int extra_tokens = ctx->has_glm_projector ? 2 : 0; - return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { - clip_image_f32 img; - img.nx = img_w; - img.ny = img_h; - return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - -int32_t clip_get_image_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_size; -} - -int32_t clip_get_patch_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.patch_size; -} - -int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.hidden_size; -} - -const char * clip_patch_merge_type(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? 
"spatial_unpad" : "flat"; -} - -const int32_t * clip_image_grid(const struct clip_ctx * ctx) { - if (ctx->vision_model.hparams.image_grid_pinpoints.size()) { - return &ctx->vision_model.hparams.image_grid_pinpoints.front(); - } - return nullptr; -} - -size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_grid_pinpoints.size(); -} - -int clip_n_patches(const struct clip_ctx * ctx) { - clip_image_f32 img; - img.nx = ctx->vision_model.hparams.image_size; - img.ny = ctx->vision_model.hparams.image_size; - return clip_n_patches_by_img(ctx, &img); -} - -int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { - const auto & params = ctx->vision_model.hparams; - - int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - - if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - n_patches = 96; - } - else if (ctx->minicpmv_version == 3) { - n_patches = 64; - } - else if (ctx->minicpmv_version == 4) { - n_patches = 64; - } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - int patch_size = params.patch_size * 2; - int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); - int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); - n_patches = x_patch * y_patch; - } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - n_patches = 256; - } - - return n_patches; -} - -static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { - assert(embed_dim % 2 == 0); - int H = pos.size(); - int W = pos[0].size(); - - std::vector omega(embed_dim / 2); - for (int i = 0; i < embed_dim / 2; ++i) { - omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); - } - - std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int d = 0; d < embed_dim / 2; ++d) { - float out_value = pos[h][w] * omega[d]; - emb[h][w][d] = sin(out_value); - emb[h][w][d + embed_dim / 2] = cos(out_value); - } - } - } - - return emb; -} - -static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>> & grid) { - assert(embed_dim % 2 == 0); - std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) - std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) - - int H = emb_h.size(); - int W = emb_h[0].size(); - std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); - - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int d = 0; d < embed_dim / 2; ++d) { - emb[h][w][d] = emb_h[h][w][d]; - emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; - } - } - } - return emb; -} - -static std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { - int grid_h_size = image_size.first; - int grid_w_size = image_size.second; - - std::vector grid_h(grid_h_size); - std::vector grid_w(grid_w_size); - - for (int i = 0; i < grid_h_size; ++i) { - grid_h[i] = static_cast(i); - } - for (int i = 0; i < grid_w_size; ++i) { - grid_w[i] = static_cast(i); - } - - std::vector> grid(grid_h_size, std::vector(grid_w_size)); - for (int h = 0; h < grid_h_size; ++h) { - for (int w = 0; w < grid_w_size; ++w) { - grid[h][w] = grid_w[w]; - } - } - std::vector>> 
grid_2d = {grid, grid}; - for (int h = 0; h < grid_h_size; ++h) { - for (int w = 0; w < grid_w_size; ++w) { - grid_2d[0][h][w] = grid_h[h]; - grid_2d[1][h][w] = grid_w[w]; - } - } - - std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); - - int H = image_size.first; - int W = image_size.second; - std::vector> pos_embed_2d(H * W, std::vector(embed_dim)); - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; - } - } - - return pos_embed_2d; -} - -bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - - clip_image_f32_batch imgs; - clip_image_f32_ptr img_copy(clip_image_f32_init()); - *img_copy = *img; - imgs.entries.push_back(std::move(img_copy)); - - return clip_image_batch_encode(ctx, n_threads, &imgs, vec); -} - -bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { - const clip_image_f32_batch & imgs = *imgs_c_ptr; - - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - - int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector) { - GGML_ASSERT(batch_size == 1); // TODO: support multiple images - } - if (ctx->has_minicpmv_projector) { - GGML_ASSERT(batch_size == 1); - } - if (ctx->has_glm_projector) { - GGML_ASSERT(batch_size == 1); - ggml_tensor * boi = ctx->vision_model.boi_w; - ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi)); - vec = (float*)(vec+ggml_nelements(boi)); //offset for boi - } - - // build the inference graph - ggml_backend_sched_reset(ctx->sched.get()); - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); - ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); - - // set inputs - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding ? 
1 : 0); - const int pos_w = ctx->load_image_size.width / patch_size; - const int pos_h = ctx->load_image_size.height / patch_size; - - { - struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - float * data = (float *)malloc(ggml_nbytes(inp_raw)); - - for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx; - const int ny = imgs.entries[i]->ny; - if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { - GGML_ASSERT(nx == image_size && ny == image_size); - } - - const int n = nx * ny; - - for (int b = 0; b < batch_size; b++) { - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k]; - } - } - } - } - } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); - } - if (ctx->has_minicpmv_projector) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - int bucket_coords_h[1024]; - int bucket_coords_w[1024]; - for (int i = 0; i < pos_h; i++){ - bucket_coords_h[i] = std::floor(70.0*i/pos_h); - } - for (int i = 0; i < pos_w; i++){ - bucket_coords_w[i] = std::floor(70.0*i/pos_w); - } - for (int i = 0, id = 0; i < pos_h; i++){ - for (int j = 0; j < pos_w; j++){ - positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; - } - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - int embed_dim = 4096; - if (ctx->minicpmv_version == 2) { - embed_dim = 4096; - } - else if (ctx->minicpmv_version == 3) { - embed_dim = 3584; - } - else if (ctx->minicpmv_version == 4) { - embed_dim = 3584; - } - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;i < pos_w * pos_h; ++i){ - for(int j=0; j < embed_dim; ++j){ - pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; - } - } - - ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); - free(pos_embed_data); - } - } - else { - if (model.class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } - - if (ctx->has_qwen2vl_merger) { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - const int pw = image_size_width / patch_size; - const int ph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - - int ptr = 0; - for (int y = 0; y < ph; y+=2) - { - for (int x = 0; x < pw; x+=2) - { - for (int dy = 0; dy < 2; dy++) { - for (int dx = 0; dx < 2; dx++) { - positions_data[ptr] = y + dy; - positions_data[num_patches + ptr] = x + dx; - 
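As a concrete reading of the bucketed positions filled in above (grid size invented for illustration): with pos_w = pos_h = 32, row 16 maps to bucket floor(70*16/32) = 35 and row 31 to floor(70*31/32) = 67, so the patch at (16, 16) gets position id 35*70 + 35 = 2485. Every grid, whatever its size, is interpolated onto the fixed 70x70 position table (ids 0..4899) that the resampler weights appear to expect.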
positions_data[num_patches * 2 + ptr] = y + dy; - positions_data[num_patches * 3 + ptr] = x + dx; - ptr++; - } - } - } - } - - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - // do nothing - } - else { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - - if (!ctx->has_glm_projector) { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - // The patches vector is used to get rows to index into the embeds with; - // we should skip dim 0 only if we have CLS to avoid going out of bounds - // when retrieving the rows. - int patch_offset = model.class_embedding ? 1 : 0; - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + patch_offset; - } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } - } - } - - ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); - - auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); - return false; - } - - // the last node is the embedding tensor - struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); - - // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - - if (ctx->has_glm_projector) { - //eoi - ggml_tensor * eoi = ctx->vision_model.eoi_w; - int offset = ggml_nelements(embeddings); - ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi)); - } - - return true; -} - -bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { - assert(itype < GGML_TYPE_COUNT); - ggml_type type = static_cast(itype); - - auto * ctx_clip = clip_init(fname_inp, clip_context_params{ - /* use_gpu */ false, - /* verbosity */ GGML_LOG_LEVEL_ERROR, - }); - - const auto & ctx_src = ctx_clip->ctx_gguf.get(); - const auto & ctx_data = ctx_clip->ctx_data.get(); - - auto * ctx_out = gguf_init_empty(); - gguf_set_kv(ctx_out, ctx_src); - gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); - gguf_set_val_u32(ctx_out, "general.file_type", itype); - - auto fout = std::ofstream(fname_out, std::ios::binary); - - const int n_tensors = gguf_get_n_tensors(ctx_src); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_src, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - gguf_add_tensor(ctx_out, cur); - } - - const size_t meta_size = gguf_get_meta_size(ctx_out); - for (size_t i = 0; i < meta_size; ++i) { - fout.put(0); - } - - // regexes of tensor names to be quantized - const std::vector k_names = { - ".*weight", - }; - - std::vector work(512); - std::vector conv_buf(512); - size_t total_size_org = 0; - size_t total_size_new = 0; - - for (int i = 0; i < n_tensors; ++i) { - const std::string name = gguf_get_tensor_name(ctx_src, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); - - enum ggml_type new_type; - void * new_data; - size_t new_size; - - bool quantize = false; - for (const auto & s : k_names) { - if 
(std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // quantize only 2D tensors and bigger than block size - quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type); - - if (quantize) { - new_type = type; - if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { - new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); - } - const size_t n_elms = ggml_nelements(cur); - float * f32_data; - - switch (cur->type) { - case GGML_TYPE_F32: - f32_data = (float *)cur->data; - break; - case GGML_TYPE_F16: - if (conv_buf.size() < n_elms) { - conv_buf.resize(n_elms); - } - for (size_t j = 0; j < n_elms; ++j) { - conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); - } - f32_data = (float *)conv_buf.data(); - break; - default: - LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__); - gguf_free(ctx_out); - return false; - } - - if (work.size() < n_elms * 4) { - work.resize(n_elms * 4); - } - new_data = work.data(); - - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); - } else { - new_type = cur->type; - new_data = cur->data; - new_size = ggml_nbytes(cur); - } - const size_t orig_size = ggml_nbytes(cur); - total_size_org += orig_size; - total_size_new += new_size; - gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data); - fout.write((const char *)new_data, new_size); - size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; - for (size_t j = 0; j < pad; ++j) { - fout.put(0); - } - - LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, - orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); - } - - // go back to beginning of file and write the updated metadata - fout.seekp(0, std::ios::beg); - std::vector meta(meta_size); - gguf_get_meta_data(ctx_out, meta.data()); - fout.write((const char *)meta.data(), meta_size); - - fout.close(); - - clip_free(ctx_clip); - gguf_free(ctx_out); - - { - LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); - } - - return true; -} - -int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return ctx->vision_model.mm_input_proj_w->ne[0]; - } - - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - 
throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); -} - -int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->has_minicpmv_projector) { - return ctx->minicpmv_version; - } - return 0; -} - -bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->has_glm_projector; -} - -bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->has_qwen2vl_merger; -} - -bool clip_is_llava(const struct clip_ctx * ctx) { - return ctx->has_llava_projector; -} - -bool clip_is_gemma3(const struct clip_ctx * ctx) { - return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; -} - -// Determine the number of encoder layers to iterate over -int get_deepest_feature_layer(const struct clip_ctx * ctx) { - // Get the index of the second to last layer; this is the - // default for models that have a llava projector - const auto & hparams = ctx->vision_model.hparams; - int n_layer = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - // Handle other projectors; incrementing here indicates that we - // should use the last encoder layer for the vision features. - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; -} - -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { - clip_image_f32 clip_img; - clip_img.buf.resize(h * w * 3); - for (int i = 0; i < h*w*3; i++) - { - clip_img.buf[i] = img[i]; - } - clip_img.nx = w; - clip_img.ny = h; - clip_image_encode(ctx, n_threads, &clip_img, vec); - return true; -} - -// -// API used internally with mtmd -// - -projector_type clip_get_projector_type(const struct clip_ctx * ctx) { - return ctx->proj_type; -} diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index f91896e48..abedebdb7 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -4,6 +4,7 @@ #include "ggml.h" #include "ggml-cpu.h" #include "ggml-backend.h" +#include "ggml-opt.h" #include #include @@ -111,6 +112,8 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, }; enum llama_rope_type { @@ -255,7 +258,6 @@ extern "C" { llama_token * token; float * embd; - int32_t n_embd; llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; @@ -351,20 +353,18 @@ extern "C" { enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] - // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. 
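The layout comment above is an ABI point: keeping the bool flags in a single trailing block means the pointer-sized members in front keep stable offsets as flags are added, so a copy-by-value between differently-built callers stays well aligned. A self-contained sketch of the idea (illustrative structs of my own, not from this header):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // two revisions of a params struct; the only change is one extra trailing flag
    struct params_v1 { void * abort_cb; int32_t n_ctx; bool embeddings; bool no_perf; };
    struct params_v2 { void * abort_cb; int32_t n_ctx; bool embeddings; bool no_perf; bool op_offload; };

    int main() {
        // the leading members keep identical offsets across revisions, so code
        // compiled against v1 still finds them where it expects in a v2 copy
        std::printf("%zu %zu\n", offsetof(params_v1, n_ctx),      offsetof(params_v2, n_ctx));
        std::printf("%zu %zu\n", offsetof(params_v1, embeddings), offsetof(params_v2, embeddings));
        return 0;
    }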
- // TODO: move at the end of the struct - bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - bool embeddings; // if true, extract embeddings (together with logits) - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - bool cross_attn; // whether to use cross attention - // Abort callback // if it returns true, execution of llama_decode() will be aborted // currently works only with CPU execution ggml_abort_callback abort_callback; void * abort_callback_data; + + // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. + bool embeddings; // if true, extract embeddings (together with logits) + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + bool no_perf; // whether to measure performance timings + bool op_offload; // whether to offload host tensor operations to device }; // model quantization parameters @@ -446,6 +446,10 @@ extern "C" { size_t n_paths, struct llama_model_params params); + LLAMA_API void llama_model_save_to_file( + const struct llama_model * model, + const char * path_model); + DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model), "use llama_model_free instead"); @@ -460,10 +464,6 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); - // TODO (jmorganca): this should most likely be passed in as part of a batch - // and not set on the context for all batches. - LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); - // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); @@ -929,14 +929,19 @@ extern "C" { // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); - // Processes a batch of tokens with the ecoder part of the encoder-decoder model. - // Stores the encoder output internally for later use by the decoder cross-attention layers. + // Process a batch of tokens. + // In contrast to llama_decode() - this call does not use KV cache. + // For encode-decoder contexts, processes the batch using the encoder. + // Can store the encoder output internally for later use by the decoder's cross-attention layers. // 0 - success // < 0 - error. the KV cache state is restored to the state before this call LLAMA_API int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch); + // Process a batch of tokens. + // Requires KV cache. + // For encode-decoder contexts, processes the batch using the decoder. // Positive return values does not mean a fatal error, but rather a warning. 
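A minimal sketch of the encode/decode contract described in these comments, using only declarations from this header (context setup and the token buffer are assumed to exist):

    // encoder-decoder flow: the prompt goes through the encoder once
    // (no KV cache involved; the output is kept for cross-attention)
    llama_batch prompt = llama_batch_get_one(prompt_tokens, n_prompt);
    if (llama_encode(ctx, prompt) < 0) {
        // error: the KV cache state is restored to before the call
    }
    // generation then proceeds through llama_decode(), which does use the
    // KV cache and may return 1 (a warning, not fatal) when no slot is found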
// 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) @@ -1237,6 +1242,7 @@ extern "C" { "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + /// Setting k <= 0 makes this a noop LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -1432,6 +1438,37 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); + // + // training + // + + // function that returns whether or not a given tensor contains trainable parameters + typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); + + // always returns true + LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); + + struct llama_opt_params { + uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + + llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters + void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + + ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters + void * get_opt_pars_ud; // userdata for calculating optimizer parameters + }; + + LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); + + LLAMA_API void llama_opt_epoch( + struct llama_context * lctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval); + #ifdef __cplusplus } #endif diff --git a/llama/llama.cpp/src/llama-adapter.cpp b/llama/llama.cpp/src/llama-adapter.cpp index 7ac54d239..8d94034ae 100644 --- a/llama/llama.cpp/src/llama-adapter.cpp +++ b/llama/llama.cpp/src/llama-adapter.cpp @@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ std::vector buft_extra; { auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) @@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft)); auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } buft = ggml_backend_dev_buffer_type(cpu_dev); break; diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index dd01df60a..5ab3f5722 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -6,7 +6,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_MLLAMA, "mllama" }, { 
LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, @@ -20,6 +19,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, @@ -73,7 +73,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, - { LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -109,6 +108,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, @@ -144,7 +144,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, - { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, @@ -274,40 +273,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MLLAMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, - { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, - { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, - { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, - { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, - { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, - { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, - { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, - }, - }, { LLM_ARCH_DECI, { @@ -511,6 +476,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_NOMIC_BERT_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { 
LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_JINA_BERT_V2, { @@ -1587,22 +1570,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MISTRAL3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - } - }, { LLM_ARCH_UNKNOWN, { @@ -1734,14 +1701,6 @@ static const std::map LLM_TENSOR_INFOS = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index b6227eebf..525c1b7d4 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -11,7 +11,6 @@ enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_LLAMA4, - LLM_ARCH_MLLAMA, LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, @@ -24,6 +23,7 @@ enum llm_arch { LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, @@ -75,7 +75,6 @@ enum llm_arch { LLM_ARCH_CHAMELEON, LLM_ARCH_SOLAR, LLM_ARCH_WAVTOKENIZER_DEC, - LLM_ARCH_MISTRAL3, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, LLM_ARCH_UNKNOWN, @@ -113,6 +112,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, @@ -148,7 +148,6 @@ enum llm_kv { LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, - LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, @@ -350,14 +349,6 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_BSKCN_TV, - 
LLM_TENSOR_CROSS_ATTN_K_NORM, - LLM_TENSOR_CROSS_ATTN_K_PROJ, - LLM_TENSOR_CROSS_ATTN_O_PROJ, - LLM_TENSOR_CROSS_ATTN_Q_NORM, - LLM_TENSOR_CROSS_ATTN_Q_PROJ, - LLM_TENSOR_CROSS_ATTN_V_PROJ, - LLM_TENSOR_CROSS_ATTN_ATTN_GATE, - LLM_TENSOR_CROSS_ATTN_MLP_GATE, LLM_TENSOR_CONV1D, LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_NORM, diff --git a/llama/llama.cpp/src/llama-batch.cpp b/llama/llama.cpp/src/llama-batch.cpp index 8682b0e68..a88b2fe30 100644 --- a/llama/llama.cpp/src/llama-batch.cpp +++ b/llama/llama.cpp/src/llama-batch.cpp @@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { return ubatch; } -void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) { +llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) { GGML_ASSERT(batch.n_tokens >= 0); this->batch = &batch; this->n_embd = n_embd; @@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim for (size_t i = 0; i < n_tokens; ++i) { ids[i] = i; } + if (simple_split) { seq.resize(1); llama_sbatch_seq & s = seq[0]; @@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim s.length = n_tokens; return; } + std::sort(ids.begin(), ids.end(), [&batch](size_t a, size_t b) { int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1; @@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim return n_seq_a > n_seq_b; } ); + // init seq llama_sbatch_seq * last_seq = nullptr; @@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim seq.push_back(new_seq); last_seq = &seq.back(); } + // keep shared prompts first at the end, then sort by length descending. 
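Since from_batch() is folded into a constructor here (with a defaulted llama_sbatch() kept for member declarations), call sites change shape accordingly; a sketch, assuming a batch and n_embd already in scope:

    // before: llama_sbatch sbatch; sbatch.from_batch(batch, n_embd, /*simple_split=*/true);
    // after:
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true);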
std::sort(seq.begin(), seq.end(), [](llama_sbatch_seq & a, llama_sbatch_seq & b) { @@ -316,7 +320,6 @@ struct llama_batch llama_batch_get_one( /*n_tokens =*/ n_tokens, /*tokens =*/ tokens, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -329,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ /*n_tokens =*/ 0, /*tokens =*/ nullptr, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -338,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); - batch.n_embd = embd; } else { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/llama/llama.cpp/src/llama-batch.h b/llama/llama.cpp/src/llama-batch.h index f1df40d27..6305051b6 100644 --- a/llama/llama.cpp/src/llama-batch.h +++ b/llama/llama.cpp/src/llama-batch.h @@ -70,7 +70,8 @@ struct llama_sbatch { // sequence-wise split llama_ubatch split_seq(size_t n_ubatch); - void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false); + llama_sbatch() = default; + llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false); }; // temporary allocate memory for the input batch if needed diff --git a/llama/llama.cpp/src/llama-chat.cpp b/llama/llama.cpp/src/llama-chat.cpp index 721faa4e8..d12743e6b 100644 --- a/llama/llama.cpp/src/llama-chat.cpp +++ b/llama/llama.cpp/src/llama-chat.cpp @@ -35,6 +35,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 }, { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN }, { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 }, + { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN }, { "phi3", LLM_CHAT_TEMPLATE_PHI_3 }, { "phi4", LLM_CHAT_TEMPLATE_PHI_4 }, { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 }, @@ -50,8 +51,8 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, - { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, - { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 }, { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, @@ -62,6 +63,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, + { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -81,7 +83,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { if (tmpl_contains("<|im_start|>")) { return tmpl_contains("<|im_sep|>") ? LLM_CHAT_TEMPLATE_PHI_4 - : LLM_CHAT_TEMPLATE_CHATML; + : tmpl_contains("") + ? 
LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml + : LLM_CHAT_TEMPLATE_CHATML; } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) { return LLM_CHAT_TEMPLATE_MISTRAL_V7; @@ -119,8 +123,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("[gMASK]")) { + return LLM_CHAT_TEMPLATE_CHATGLM_4; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; + } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { + return LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { @@ -149,9 +157,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA_3; } else if (tmpl_contains("[gMASK]sop")) { // chatglm3-6b - return LLM_CHAT_TEMPLATE_CHATGML_3; - } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; + return LLM_CHAT_TEMPLATE_CHATGLM_3; } else if (tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF return LLM_CHAT_TEMPLATE_MINICPM; @@ -197,19 +203,20 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) { + } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) { // Official mistral 'v7' template // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7 + // https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken + const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? 
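// (v7 emits a single space after "[SYSTEM_PROMPT]" and "[INST]"; the tekken
// variant omits it, so the space is spliced in via trailing_space below)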
" " : ""; for (auto message : chat) { std::string role(message->role); std::string content(message->content); if (role == "system") { - ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; + ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]"; } else if (role == "user") { - ss << "[INST] " << content << "[/INST]"; - } - else { - ss << " " << content << ""; + ss << "[INST]" << trailing_space << content << "[/INST]"; + } else { + ss << trailing_space << content << ""; } } } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 @@ -432,7 +439,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) { // chatglm3-6b ss << "[gMASK]" << "sop"; for (auto message : chat) { @@ -442,14 +449,14 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); ss << "<|" << role << "|>" << "\n" << message->content; } if (add_ass) { - ss << "<|assistant|>"; + ss << "<|assistant|>\n"; } } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { for (auto message : chat) { @@ -620,7 +627,23 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|header_start|>assistant<|header_end|>\n\n"; } - } else { + } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) { + // SmolVLM + ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n"; + } else { + ss << "Assistant: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else { // template not supported return -1; } diff --git a/llama/llama.cpp/src/llama-chat.h b/llama/llama.cpp/src/llama-chat.h index 34537ca21..db24ade21 100644 --- a/llama/llama.cpp/src/llama-chat.h +++ b/llama/llama.cpp/src/llama-chat.h @@ -14,6 +14,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_MISTRAL_V3, LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, LLM_CHAT_TEMPLATE_MISTRAL_V7, + LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN, LLM_CHAT_TEMPLATE_PHI_3, LLM_CHAT_TEMPLATE_PHI_4, LLM_CHAT_TEMPLATE_FALCON_3, @@ -29,8 +30,8 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK_3, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, - LLM_CHAT_TEMPLATE_CHATGML_3, - LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_CHATGLM_3, + LLM_CHAT_TEMPLATE_CHATGLM_4, LLM_CHAT_TEMPLATE_GLMEDGE, LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_EXAONE_3, @@ -41,6 +42,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_LLAMA4, + LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index 4b3e6a83e..1f3a39564 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -6,11 +6,9 @@ #include "llama-model.h" #include "llama-kv-cache.h" -#include #include #include #include -#include // // llama_context @@ -95,6 +93,7 @@ llama_context::llama_context( } cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); + cparams.op_offload = params.op_offload; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -114,12 +113,10 @@ llama_context::llama_context( } if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); } - logits_all = params.logits_all; - if (!hparams.vocab_only) { // GPU backends for (auto * dev : model.devices) { @@ -177,44 +174,13 @@ llama_context::llama_context( } // init the memory module - // TODO: for now, always create a unified KV cache if (!hparams.vocab_only) { - kv_self.reset(static_cast(model.create_memory())); + llama_memory_params params_mem = { + /*.type_k =*/ params.type_k, + /*.type_v =*/ params.type_v, + }; - LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams)); - - LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); - - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; - - if (llama_model_is_recurrent(&model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } - - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - - if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - throw std::runtime_error("failed to initialize self-attention cache"); - } - - { - const size_t memory_size_k = kv_self->size_k_bytes(); - const size_t memory_size_v = kv_self->size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } + memory.reset(model.create_memory(params_mem, cparams)); } // init backends @@ -278,7 +244,7 @@ llama_context::llama_context( } } - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload)); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); @@ -286,7 +252,7 @@ llama_context::llama_context( } // reserve worst-case graph - if (!hparams.vocab_only) { + if (!hparams.vocab_only && memory) { const uint32_t n_seqs = 1; // TODO: worst-case number of sequences const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -305,7 +271,9 @@ llama_context::llama_context( int n_nodes_tg = -1; // simulate full KV cache - kv_self->n = kv_self->size; + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->set_full(); cross.v_embd.clear(); @@ -391,7 +359,9 @@ llama_context::llama_context( } } 
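A minimal, self-contained sketch of the ownership shape introduced above; this is an editor's illustration with stand-in class bodies, not code from the patch. The context now holds its KV state behind a generic memory handle and downcasts to the KV-cache interface wherever cache-specific calls such as set_full() or update() are needed:

#include <memory>

// reduced stand-ins for the real llama_memory_i / llama_kv_cache types
struct llama_memory_i {
    virtual ~llama_memory_i() = default;
};

struct llama_kv_cache : llama_memory_i {
    // pretend the cache is fully occupied (used when reserving the worst-case graph)
    void set_full() {}

    // apply any pending shift/defrag; returns true if graphs must be re-reserved
    bool update() { return false; }
};

struct toy_context {
    std::unique_ptr<llama_memory_i> memory;

    llama_kv_cache * get_kv_self() {
        // the same downcast pattern the patch uses throughout llama_context
        return static_cast<llama_kv_cache *>(memory.get());
    }
};

int main() {
    toy_context ctx;
    ctx.memory = std::make_unique<llama_kv_cache>();
    ctx.get_kv_self()->set_full();
    return 0;
}

Keeping the stored handle as the base type lets a context without a KV cache (the encoder-only path) leave memory empty, which the patch checks before reserving graphs or serializing state.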
-llama_context::~llama_context() = default; +llama_context::~llama_context() { + ggml_opt_free(opt_ctx); +} void llama_context::synchronize() { ggml_backend_sched_synchronize(sched.get()); @@ -427,6 +397,18 @@ const llama_model & llama_context::get_model() const { return model; } +const llama_cparams & llama_context::get_cparams() const { + return cparams; +} + +ggml_backend_sched_t llama_context::get_sched() const { + return sched.get(); +} + +ggml_context * llama_context::get_ctx_compute() const { + return ctx_compute.get(); +} + uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } @@ -456,329 +438,44 @@ uint32_t llama_context::n_threads_batch() const { } llama_kv_cache * llama_context::get_kv_self() { - return kv_self.get(); + llama_kv_cache * kv_self = static_cast(memory.get()); + return kv_self; } const llama_kv_cache * llama_context::get_kv_self() const { - return kv_self.get(); -} - -ggml_tensor * llama_context::build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const { - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & hparams = model.hparams; - - const auto & n_rot = hparams.n_rot; - const auto & rope_type = hparams.rope_type; - - // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. - // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; - - ggml_tensor * tmp; - - if (ggml_is_quantized(cur->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - - tmp = ggml_cpy(ctx0, tmp, cur); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, cur, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - - return tmp; -} - -class llm_graph_input_k_shift : public llm_graph_input_i { -public: - llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} - virtual ~llm_graph_input_k_shift() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * k_shift; // I32 [kv_size] - - const llama_kv_cache_unified * kv_self; -}; - -void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { - GGML_UNUSED(ubatch); - - if (k_shift) { - assert(ggml_backend_buffer_is_host(k_shift->buffer)); - - int32_t * data = (int32_t *) k_shift->data; - - for (uint32_t i = 0; i < kv_self->size; ++i) { - data[i] = kv_self->cells[i].delta; - } - } -} - -llm_graph_result_ptr llama_context::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) const { - auto res = 
std::make_unique(); - - const auto & hparams = model.hparams; - - const auto & n_layer = hparams.n_layer; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - //GGML_ASSERT(kv_self->size == n_ctx); - - auto inp = std::make_unique(kv_self.get()); - - inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_ctx); - ggml_set_input(inp->k_shift); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - const bool is_swa = hparams.is_swa(il); - - // note: the swa rope params could become part of the cparams in the future - // if we decide to make them configurable, like the non-sliding ones - const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; - const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; - - ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il); - - ggml_tensor * k = - ggml_view_3d(ctx0, kv_self->k_l[il], - n_embd_head_k, n_head_kv, kv_self->size, - ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - 0); - - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer); - - ggml_build_forward_expand(gf, cur); - } - - res->add_input(std::move(inp)); - - return res; -} - -llm_graph_result_ptr llama_context::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - const std::vector & moves) const { - auto res = std::make_unique(); - - const auto & hparams = model.hparams; - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(v_l[il]->type); - const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - 
for (const auto & move : moves) { - for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], - n_embd_k_gqa, move.len, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], - n_embd_k_gqa, move.len, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], - n_embd_v_gqa, move.len, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src)); - - view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], - n_embd_v_gqa, move.len, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], - move.len, n_embd_v_gqa, - ggml_row_size(kv_self->v_l[il]->type, kv_self->size), - ggml_row_size(kv_self->v_l[il]->type, move.src)); - - view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], - move.len, n_embd_v_gqa, - ggml_row_size(kv_self->v_l[il]->type, kv_self->size), - ggml_row_size(kv_self->v_l[il]->type, move.dst)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - } -#endif - - return res; + llama_kv_cache * kv_self = static_cast(memory.get()); + return kv_self; } void llama_context::kv_self_update() { - auto & kv = kv_self; + bool need_reserve = false; - if (kv->has_shift) { - if (!kv->get_can_shift()) { - GGML_ABORT("The current context does not support K-shift"); + llama_kv_cache * kv_self = static_cast(memory.get()); + + need_reserve = kv_self->update(*this); + + // reserve a worst case graph if needed + if (need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + // simulate full KV cache + kv_self->set_full(); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } - - LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); - - // apply K-shift if needed - if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - ggml_backend_sched_reset(sched.get()); - - auto * gf = graph_init(); - - auto res = build_kv_self_shift(ctx_compute.get(), gf); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - res->set_inputs(nullptr); - - graph_compute(gf, false); - } - - { - kv->has_shift = false; - - for (uint32_t i = 0; i < kv->size; 
++i) { - kv->cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv->do_defrag) { - LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); - const uint32_t n_max_nodes = graph_max_nodes(); - const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer); - if (!kv->defrag_prepare(n_max_nodes)) { - LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__); - return; - } - - for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) { - std::vector chunk; - auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size()); - chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end); - - ggml_backend_sched_reset(sched.get()); - auto * gf = graph_init(); - auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk); - ggml_backend_sched_alloc_graph(sched.get(), gf); - res->set_inputs(nullptr); - graph_compute(gf, false); - } - - kv->do_defrag = false; } } @@ -787,9 +484,6 @@ enum llama_pooling_type llama_context::pooling_type() const { } float * llama_context::get_logits() { - // reorder logits for backward compatibility - output_reorder(); - return logits; } @@ -820,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } - return logits + j*model.hparams.n_vocab; + return logits + j*model.vocab.n_tokens(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -832,9 +526,6 @@ float * llama_context::get_logits_ith(int32_t i) { } float * llama_context::get_embeddings() { - // reorder embeddings for backward compatibility - output_reorder(); - return embd; } @@ -941,10 +632,6 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } -void llama_context::set_cross_attn(bool value) { - cparams.cross_attn = value; -} - void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { @@ -990,8 +677,8 @@ int llama_context::encode(llama_batch & inp_batch) { } // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1); + // note: during encode, we always pass the full sequence starting from pos = 0 + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
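// (no positions supplied: number them from 0, since an encoder pass always
// receives the full sequence from the start, as noted above)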
-1 : 0); const llama_batch & batch = batch_allocr.batch; const int32_t n_tokens = batch.n_tokens; @@ -1016,11 +703,13 @@ int llama_context::encode(llama_batch & inp_batch) { t_compute_start_us = ggml_time_us(); } + embd_seq.clear(); + n_queued_tokens += n_tokens; const int64_t n_embd = hparams.n_embd; - sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); @@ -1077,12 +766,12 @@ int llama_context::encode(llama_batch & inp_batch) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); - GGML_ASSERT(embd != nullptr); - switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings + GGML_ASSERT(embd != nullptr); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float)); } break; @@ -1107,11 +796,18 @@ int llama_context::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); @@ -1149,25 +845,33 @@ int llama_context::encode(llama_batch & inp_batch) { } int llama_context::decode(llama_batch & inp_batch) { + if (!memory) { + LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__); + return encode(inp_batch); + } + if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } + llama_kv_cache * kv_self = static_cast(memory.get()); + // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1); + // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
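// (no positions supplied: continue after the highest position currently in
// the KV cache; per the TODO above this maximum is taken across sequences)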
-1 : kv_self->get_pos_max() + 1); const llama_batch & batch = batch_allocr.batch; + const auto & vocab = model.vocab; const auto & hparams = model.hparams; - const int32_t n_vocab = hparams.n_vocab; + const int32_t n_vocab = vocab.n_tokens(); const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; - llama_kv_cache_guard kv_guard(kv_self.get()); + llama_kv_cache_guard kv_guard(kv_self); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -1201,18 +905,14 @@ int llama_context::decode(llama_batch & inp_batch) { for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } - } else if (logits_all || embd_pooled) { + } else if (embd_pooled) { n_outputs_all = n_tokens_all; } else { // keep last output only n_outputs_all = 1; } - const bool logits_all = n_outputs_all == n_tokens_all; - - sbatch.from_batch(batch, batch.n_embd, - /* simple_split */ !kv_self->recurrent, - /* logits_all */ logits_all); + llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all); // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { @@ -1226,22 +926,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_prev = 0; while (sbatch.n_tokens > 0) { - llama_ubatch ubatch = llama_ubatch(); - - const auto & n_ubatch = cparams.n_ubatch; - - if (kv_self->recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = sbatch.split_seq(cparams.n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = sbatch.split_equal(cparams.n_ubatch); - } - } else { - ubatch = sbatch.split_simple(n_ubatch); - } + llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); // count the outputs in this u_batch { @@ -1261,27 +946,15 @@ int llama_context::decode(llama_batch & inp_batch) { } // find KV slot - { + if (!kv_self->find_slot(ubatch)) { + kv_self->defrag_sched(-1.0f); + kv_self->update(*this); if (!kv_self->find_slot(ubatch)) { - kv_self->defrag(); - kv_self_update(); - if (!kv_self->find_slot(ubatch)) { - LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - return 1; - } - } - - if (!kv_self->recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self->get_padding(cparams); - kv_self->n = std::min(kv_self->size, std::max(pad, GGML_PAD(kv_self->cell_max(), pad))); + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); + return 1; } } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head); - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); @@ -1395,43 +1068,68 @@ int llama_context::decode(llama_batch & inp_batch) { // finalize the batch processing kv_guard.commit(); + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + // set output mappings { bool sorted_output = true; - GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + auto & out_ids = sbatch.out_ids; + + GGML_ASSERT(out_ids.size() == (size_t) 
n_outputs_all); for (int64_t i = 0; i < n_outputs_all; ++i) { - int64_t out_id = sbatch.out_ids[i]; + int64_t out_id = out_ids[i]; output_ids[out_id] = i; if (out_id != i) { sorted_output = false; } } - if (sorted_output) { - sbatch.out_ids.clear(); + // make the outputs have the same order they had in the user-provided batch + // note: this is mostly relevant for recurrent models atm + if (!sorted_output) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } } } - // set to total number of outputs in the batch, for use in llama_get_logits_ith - n_outputs = n_outputs_all; - // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = kv_self->n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - - kv_self->defrag(); - } + if (cparams.defrag_thold > 0.0f) { + kv_self->defrag_sched(cparams.defrag_thold); } // Reset state for the next token before backend sync, to allow the CPU activities in the reset to @@ -1447,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; + const auto & vocab = model.vocab; const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); const auto n_batch = cparams.n_batch; - const auto n_vocab = hparams.n_vocab; + const auto n_vocab = vocab.n_tokens(); const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead @@ -1510,52 +1209,12 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_buffer_clear(buf_output.get(), 0); - this->n_outputs = 0; this->n_outputs_max = n_outputs_max; return n_outputs_max; } -void llama_context::output_reorder() { - auto & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
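// (this reordering now runs inline in decode(), above: selection sort makes
// at most n-1 swaps, and each swap moves an entire n_vocab-sized logits row
// or n_embd-sized embeddings row, so the swap count dominates the cost)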
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // graph // @@ -1592,7 +1251,7 @@ llm_graph_result_ptr llama_context::graph_build( /*.backend_cpu =*/ backend_cpu, /*.cvec =*/ &cvec, /*.loras =*/ &loras, - /*.memory =*/ kv_self.get(), + /*.memory =*/ memory.get(), /*.cross =*/ &cross, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), @@ -1996,8 +1655,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__); - output_reorder(); - const auto n_outputs = this->n_outputs; const auto & output_ids = this->output_ids; @@ -2028,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); io.write(&logits_size, sizeof(logits_size)); @@ -2051,6 +1708,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { } LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); + llama_kv_cache * kv_self = static_cast(memory.get()); + kv_self->state_write(io); return io.n_bytes(); @@ -2134,8 +1793,13 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { } } - LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); - kv_self->state_read(io); + if (memory) { + LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); + + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->state_read(io); + } return io.n_bytes(); } @@ -2143,7 +1807,11 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); - kv_self->state_write(io, seq_id); + if (memory) { + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->state_write(io, seq_id); + } return io.n_bytes(); } @@ -2151,7 +1819,11 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); - kv_self->state_read(io, seq_id); + if (memory) { + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->state_read(io, seq_id); + } return io.n_bytes(); } @@ -2179,6 +1851,218 @@ void llama_context::perf_reset() { t_p_eval_us = n_p_eval = 0; } +// +// training +// + +static void llama_set_param(struct ggml_tensor * tensor, llama_opt_param_filter param_filter, void * userdata) { + if (!tensor || tensor->type != GGML_TYPE_F32) { + return; + } + if (!param_filter(tensor, userdata)) { + return; + } + if (strcmp(tensor->name, "token_embd.weight") == 0) { + return; // FIXME + } + if (strcmp(tensor->name, 
"rope_freqs.weight") == 0) { + return; // FIXME + } + ggml_set_param(tensor); +} + +void llama_context::opt_init(struct llama_model * model, struct llama_opt_params lopt_params) { + GGML_ASSERT(!opt_ctx); + model->hparams.n_ctx_train = lopt_params.n_ctx_train > 0 ? lopt_params.n_ctx_train : n_ctx(); + const uint32_t n_batch = std::min(this->n_batch(), model->hparams.n_ctx_train); + const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch); + GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0); + GGML_ASSERT(n_batch % n_ubatch == 0); + + ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY); + opt_params.opt_period = n_batch / n_ubatch; + opt_params.get_opt_pars = lopt_params.get_opt_pars; + opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; + + opt_ctx = ggml_opt_init(opt_params); + + llama_opt_param_filter param_filter = lopt_params.param_filter; + void * param_filter_ud = lopt_params.param_filter_ud; + + //llama_set_param(model->tok_embd, param_filter, param_filter_ud); // FIXME + llama_set_param(model->type_embd, param_filter, param_filter_ud); + llama_set_param(model->pos_embd, param_filter, param_filter_ud); + llama_set_param(model->tok_norm, param_filter, param_filter_ud); + llama_set_param(model->tok_norm_b, param_filter, param_filter_ud); + llama_set_param(model->output_norm, param_filter, param_filter_ud); + llama_set_param(model->output_norm_b, param_filter, param_filter_ud); + llama_set_param(model->output, param_filter, param_filter_ud); + llama_set_param(model->output_b, param_filter, param_filter_ud); + llama_set_param(model->output_norm_enc, param_filter, param_filter_ud); + llama_set_param(model->cls, param_filter, param_filter_ud); + llama_set_param(model->cls_b, param_filter, param_filter_ud); + llama_set_param(model->cls_out, param_filter, param_filter_ud); + llama_set_param(model->cls_out_b, param_filter, param_filter_ud); + + for (struct llama_layer & layer : model->layers) { + for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) { + llama_set_param(reinterpret_cast(&layer)[i], param_filter, param_filter_ud); + } + } +} + +void llama_context::opt_epoch_iter( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + const std::vector & tokens, + const std::vector & labels_sparse, + llama_batch & batch, + ggml_opt_epoch_callback callback, + bool train, + int64_t idata_in_loop, + int64_t ndata_in_loop, + int64_t t_loop_start) { + GGML_ASSERT(opt_ctx); + const uint32_t n_ctx = llama_model_n_ctx_train(&model); + const uint32_t n_batch = std::min(this->n_batch(), n_ctx); + const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch); + + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->clear(); + llama_kv_cache_guard kv_guard(kv_self); + + for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) { + batch.n_tokens = n_batch; + for (uint32_t pos_batch = 0; pos_batch < n_batch; ++pos_batch) { + batch.token [pos_batch] = tokens[pos_ctx + pos_batch]; + batch.pos [pos_batch] = pos_ctx + pos_batch; + batch.n_seq_id[pos_batch] = 1; + batch.seq_id [pos_batch][0] = 0; + batch.logits [pos_batch] = true; + } + + const auto n_tokens_all = batch.n_tokens; + + n_queued_tokens += n_tokens_all; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + embd_seq.clear(); + + int64_t n_outputs_all = n_tokens_all; + + llama_sbatch sbatch = 
kv_self->sbatch_init(batch, /*logits_all =*/ true); + + // reserve output buffer + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + GGML_ABORT("TODO: handle this error"); + }; + + for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) { + llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); + + n_outputs = ubatch.n_tokens; + + // TODO: not sure if this is needed + if (!kv_self->find_slot(ubatch)) { + kv_self->defrag_sched(-1.0f); + kv_self->update(*this); + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); + GGML_ABORT("TODO: handle this error"); + } + } + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); + + struct ggml_context * ctx_compute_opt; + { + const size_t size_gf = ggml_graph_size(gf); + const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true); + struct ggml_init_params params = { + /*.mem_size =*/ size_meta, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_compute_opt = ggml_init(params); + } + ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits()); + ggml_opt_alloc(opt_ctx, train); + res->set_inputs(&ubatch); + { + struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); + GGML_ASSERT(labels->ne[1] == n_ubatch); + ggml_set_zero(labels); + const float onef = 1.0f; + for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) { + const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch; + GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]); + ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); + } + } + ggml_opt_eval(opt_ctx, result); + if (callback) { + callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start); + } + ggml_free(ctx_compute_opt); + } + } + + kv_guard.commit(); +} + +void llama_context::opt_epoch( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval) { + const uint32_t n_ctx = this->n_ctx(); + const uint32_t n_batch = std::min(cparams.n_batch, n_ctx); + const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch); + const int64_t ndata = ggml_opt_dataset_ndata(dataset); + + GGML_ASSERT(idata_split >= 0); + GGML_ASSERT(idata_split <= ndata); + + const uint32_t ubatch_per_ctx = n_ctx / n_ubatch; + + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + std::vector tokens(n_ctx); + std::vector labels_sparse(n_ctx); + + int64_t idata = 0; + + int64_t t_loop_start = ggml_time_us(); + int64_t ndata_in_loop = idata_split*ubatch_per_ctx; + for (; idata < idata_split; ++idata) { + constexpr bool train = true; + const int64_t idata_in_loop = idata*ubatch_per_ctx; + + ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); + opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, + callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start); + } + + t_loop_start = ggml_time_us(); + ndata_in_loop = (ndata - idata_split)*ubatch_per_ctx; + for (; idata < ndata; ++idata) { + constexpr bool train 
= false; + const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx; + + ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); + opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, + callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start); + } + + llama_batch_free(batch); +} + // // interface implementation // @@ -2206,14 +2090,13 @@ llama_context_params llama_context_default_params() { /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, - /*.logits_all =*/ false, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, - /*.cross_attn =*/ false, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, + /*.op_offload =*/ true, }; return result; @@ -2339,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } -void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { - ctx->set_cross_attn(cross_attention); -} - void llama_synchronize(llama_context * ctx) { ctx->synchronize(); } @@ -2511,7 +2390,7 @@ void llama_kv_cache_seq_cp( llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); + llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); } void llama_kv_self_seq_cp( @@ -2525,14 +2404,14 @@ void llama_kv_self_seq_cp( return; } - return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } // deprecated void llama_kv_cache_seq_keep( llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); + llama_kv_self_seq_keep(ctx, seq_id); } void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { @@ -2541,7 +2420,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { return; } - return kv->seq_keep(seq_id); + kv->seq_keep(seq_id); } // deprecated @@ -2551,7 +2430,7 @@ void llama_kv_cache_seq_add( llama_pos p0, llama_pos p1, llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); + llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); } void llama_kv_self_seq_add( @@ -2565,7 +2444,7 @@ void llama_kv_self_seq_add( return; } - return kv->seq_add(seq_id, p0, p1, delta); + kv->seq_add(seq_id, p0, p1, delta); } // deprecated @@ -2575,7 +2454,7 @@ void llama_kv_cache_seq_div( llama_pos p0, llama_pos p1, int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); + llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); } void llama_kv_self_seq_div( @@ -2589,7 +2468,7 @@ void llama_kv_self_seq_div( return; } - return kv->seq_div(seq_id, p0, p1, d); + kv->seq_div(seq_id, p0, p1, d); } // deprecated @@ -2608,7 +2487,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { // deprecated void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); + llama_kv_self_defrag(ctx); } void llama_kv_self_defrag(llama_context * ctx) { @@ -2617,7 +2496,8 @@ void llama_kv_self_defrag(llama_context * ctx) { return; } - return kv->defrag(); + // force defrag + kv->defrag_sched(-1.0f); } // deprecated @@ -2801,3 +2681,34 @@ void llama_perf_context_print(const llama_context * ctx) { void llama_perf_context_reset(llama_context * ctx) { ctx->perf_reset(); } + +// +// training +// + +bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata) { + 
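// default filter: accept every tensor; llama_set_param() above still skips
// non-F32 tensors plus the token_embd/rope_freqs exceptions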
GGML_UNUSED(tensor); + GGML_UNUSED(userdata); + return true; +} + +void llama_opt_init(struct llama_context * ctx, struct llama_model * model, struct llama_opt_params lopt_params) { + ctx->opt_init(model, lopt_params); +} + +void llama_opt_epoch( + struct llama_context * ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval) { + ctx->opt_epoch( + dataset, + result_train, + result_eval, + idata_split, + callback_train, + callback_eval); +} diff --git a/llama/llama.cpp/src/llama-context.h b/llama/llama.cpp/src/llama-context.h index a59ff8fd4..0264e9371 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -8,6 +8,7 @@ #include "llama-kv-cache.h" #include "ggml-cpp.h" +#include "ggml-opt.h" #include #include @@ -28,7 +29,12 @@ struct llama_context { void synchronize(); - const llama_model & get_model() const; + const llama_model & get_model() const; + const llama_cparams & get_cparams() const; + + ggml_backend_sched_t get_sched() const; + + ggml_context * get_ctx_compute() const; uint32_t n_ctx() const; uint32_t n_ctx_per_seq() const; @@ -66,7 +72,6 @@ struct llama_context { void set_embeddings (bool value); void set_causal_attn(bool value); void set_warmup(bool value); - void set_cross_attn(bool value); void set_adapter_lora( llama_adapter_lora * adapter, @@ -130,6 +135,32 @@ struct llama_context { llama_perf_context_data perf_get_data() const; void perf_reset(); + // + // training + // + + void opt_init(struct llama_model * model, struct llama_opt_params lopt_params); + + void opt_epoch( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval); + + void opt_epoch_iter( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + const std::vector & tokens, + const std::vector & labels_sparse, + llama_batch & batch, + ggml_opt_epoch_callback callback, + bool train, + int64_t idata_in_loop, + int64_t ndata_in_loop, + int64_t t_loop_start); + private: // // output @@ -139,51 +170,30 @@ private: // Returns max number of outputs for which space was reserved. 
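// (it also sizes the logits/embeddings output buffers and marks all entries
// of output_ids as invalid)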
int32_t output_reserve(int32_t n_outputs); - // make the outputs have the same order they had in the user-provided batch - // TODO: maybe remove this - void output_reorder(); - // // graph // +public: int32_t graph_max_nodes() const; // zero-out inputs and create the ctx_compute for the compute graph ggml_cgraph * graph_init(); - llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype); - // returns the result of ggml_backend_sched_graph_compute_async execution ggml_status graph_compute( ggml_cgraph * gf, bool batched); +private: + llm_graph_result_ptr graph_build( + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + llm_graph_type gtype); + llm_graph_cb graph_get_cb() const; - // used by kv_self_update() - ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const; - - llm_graph_result_ptr build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) const; - - llm_graph_result_ptr build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - const std::vector & moves) const; - // TODO: read/write lora adapters and cvec size_t state_write_data(llama_io_write_i & io); size_t state_read_data (llama_io_read_i & io); @@ -200,14 +210,10 @@ private: llama_cparams cparams; llama_adapter_cvec cvec; llama_adapter_loras loras; - llama_sbatch sbatch; llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably - std::unique_ptr kv_self; - - // TODO: remove - bool logits_all = false; + std::unique_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits @@ -234,6 +240,9 @@ private: ggml_context_ptr ctx_compute; + // training + ggml_opt_context_t opt_ctx = nullptr; + ggml_threadpool_t threadpool = nullptr; ggml_threadpool_t threadpool_batch = nullptr; diff --git a/llama/llama.cpp/src/llama-cparams.h b/llama/llama.cpp/src/llama-cparams.h index 85ad91b9b..246fa5777 100644 --- a/llama/llama.cpp/src/llama-cparams.h +++ b/llama/llama.cpp/src/llama-cparams.h @@ -29,8 +29,8 @@ struct llama_cparams { bool offload_kqv; bool flash_attn; bool no_perf; - bool cross_attn; bool warmup; + bool op_offload; enum llama_pooling_type pooling_type; diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp index d740c1200..b0e3f6359 100644 --- a/llama/llama.cpp/src/llama-graph.cpp +++ b/llama/llama.cpp/src/llama-graph.cpp @@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd == 4) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the 3 first dims are the same, and 4th dim is all 0 + std::vector pos_data(n_tokens*n_pos_per_embd); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[ i] = ubatch->pos[i]; + pos_data[ n_tokens + i] = ubatch->pos[i]; + pos_data[2 * n_tokens + i] = ubatch->pos[i]; + pos_data[3 * n_tokens + i] = 0; // 4th dim is 0 + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ 
-71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -270,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - - ////////////////////////////////////////////// - // TODO: this should not mutate the KV cache ! - llama_kv_cell & kv_cell = const_cast(kv_self)->cells[i]; - - // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { - kv_cell.src = cell_id; - } - - data[i] = kv_cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (kv_cell.src != (int32_t) cell_id) { - kv_cell.src = cell_id; - } + data[i] = kv_self->s_copy(i); } } } @@ -303,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { // clear unused states for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - - ////////////////////////////////////////////// - // TODO: this should not mutate the KV cache ! - llama_kv_cell & kv_cell = const_cast(kv_self)->cells[i]; - - data[i] = (float) (kv_cell.src >= 0); - - // only clear once - if (kv_cell.src < 0) { - kv_cell.src = cell_id; - } + data[i] = kv_self->s_mask(i); } } } @@ -546,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } -void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { - if (ubatch->embd) { - ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); - } -} - // // llm_graph_context // @@ -598,7 +578,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 
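// (Qwen2-VL's M-RoPE uses 4 position values per embedding; for plain text
// tokens the position is replicated into the first three and the fourth is
// zeroed, see llm_graph_input_pos::set_input above)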
4 : 1; } @@ -802,13 +782,17 @@ ggml_tensor * llm_graph_context::build_ffn( } break; } - if (type_gate == LLM_FFN_PAR) { + if (gate && type_gate == LLM_FFN_PAR) { cur = ggml_mul(ctx0, cur, tmp); cb(cur, "ffn_gate_par", il); } if (down) { cur = build_lora_mm(down, cur); + if (arch == LLM_ARCH_GLM4) { + // GLM4 seems to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } } if (down_b) { @@ -916,28 +900,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); + ggml_tensor * experts = nullptr; + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } switch (type_op) { case LLM_FFN_SILU: { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_moe_gelu", il); } break; default: GGML_ABORT("fatal error"); } - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); + if (gate_exps) { + cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate_par", il); + } - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); if (!weight_before_ffn) { @@ -980,6 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); //cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); + res->t_tokens = inp->tokens; cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); @@ -1020,11 +1012,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token()); + auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1033,11 +1025,12 @@ } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this needs to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1085,7 +1078,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const { } ggml_tensor * llm_graph_context::build_inp_s_copy() const { - const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory); + const llama_kv_cache_recurrent * kv_self 
= static_cast<const llama_kv_cache_recurrent *>(memory); auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self); @@ -1102,7 +1095,7 @@ } ggml_tensor * llm_graph_context::build_inp_s_mask() const { - const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory); + const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self); @@ -1235,8 +1228,19 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (v_mla) { +#if 0 + // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens. + // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient. cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens); cur = ggml_mul_mat(ctx0, v_mla, cur); +#else + // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1. + // The permutations are noops and only change how the tensor data is interpreted. + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_mul_mat(ctx0, v_mla, cur); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs. +#endif } cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); @@ -1416,8 +1420,6 @@ ggml_tensor * llm_graph_context::build_attn( // store to KV cache { - GGML_ASSERT(!kv_self->recurrent); - const auto kv_head = kv_self->head; GGML_ASSERT(kv_self->size == n_ctx); @@ -1512,25 +1514,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } -ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { - const int64_t n_embd = hparams.n_embd; - - auto inp = std::make_unique<llm_graph_input_cross_attn_state>(); - - ggml_tensor * cur = nullptr; - - inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); - ggml_set_input(inp->cross_attn_state); - - cur = inp->cross_attn_state; - - cb(cur, "inp_cross_attn_state", -1); - - res->add_input(std::move(inp)); - - return cur; -} - ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, ggml_cgraph * gf, @@ -1586,7 +1569,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state( ggml_tensor * state_mask, int32_t n_state, int32_t n_seqs) const { - const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory); + const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); const auto n_kv = kv_self->n; const auto kv_head = kv_self->head; @@ -1618,7 +1601,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory); + const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); const auto token_shift_count = hparams.token_shift_count; @@ -1639,7 +1622,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store( ggml_tensor * token_shift, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory); + const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); const auto token_shift_count = hparams.token_shift_count; const auto n_embd = hparams.n_embd; diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h index 260a2af21..832a8c09f 100644 --- a/llama/llama.cpp/src/llama-graph.h +++ b/llama/llama.cpp/src/llama-graph.h @@ -19,6 +19,7 @@ struct llama_cparams; class llama_memory_i; 
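An aside on the build_attn_mha change above: both branches apply the same MLA decompression matrix, but they hand ggml_mul_mat different shapes. The sketch below is editorial shape bookkeeping, not part of the patch; d_in and d_out are shorthand for v_mla->ne[0] and v_mla->ne[1].

// cur   : [d_in, n_head, n_tokens]  attention output, one column per (head, token)
// v_mla : [d_in, d_out]             shared decompression matrix
//
// broadcast form (#if 0): reshaping cur to [d_in, 1, n_head, n_tokens] makes the
// multiply decay into n_head*n_tokens separate matrix-vector products
//
// permuted form: viewing cur as [d_in, n_tokens, n_head] gives one matrix-matrix
// product per head, with n_tokens in dimension 1 where ggml is fastest
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // [d_in, n_tokens, n_head] view, no copy
cur = ggml_mul_mat(ctx0, v_mla, cur);      // [d_out, n_tokens, n_head]
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // back to [d_out, n_head, n_tokens]
cur = ggml_cont(ctx0, cur);                // materialize for ggml_reshape_2d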
class llama_kv_cache_unified; +class llama_kv_cache_recurrent; // certain models (typically multi-modal) can produce different types of graphs enum llm_graph_type { @@ -86,34 +87,31 @@ public: ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] }; class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -189,26 +187,26 @@ public: class llm_graph_input_s_copy : public llm_graph_input_i { public: - llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} virtual ~llm_graph_input_s_copy() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * s_copy; // I32 [kv_size] - const llama_kv_cache_unified * kv_self; + const llama_kv_cache_recurrent * kv_self; }; class llm_graph_input_s_mask : public llm_graph_input_i { public: - llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} virtual ~llm_graph_input_s_mask() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * s_mask; // F32 [1, n_kv] - const llama_kv_cache_unified * kv_self; + const llama_kv_cache_recurrent * kv_self; }; class llm_graph_input_cross_embd : public llm_graph_input_i { @@ -286,16 +284,6 @@ public: const llama_cross * cross = nullptr; }; -class llm_graph_input_cross_attn_state : public llm_graph_input_i { -public: - llm_graph_input_cross_attn_state() = default; - virtual ~llm_graph_input_cross_attn_state() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -}; - // // llm_graph_result // @@ -310,6 +298,7 @@ class llm_graph_result_i { public: virtual ~llm_graph_result_i() = default; + virtual ggml_tensor * get_tokens() = 0; virtual ggml_tensor * get_logits() = 0; virtual ggml_tensor * get_embd() = 0; virtual ggml_tensor * get_embd_pooled() = 0; @@ -324,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i { public: virtual ~llm_graph_result() = default; + ggml_tensor * get_tokens() override { return t_tokens; } ggml_tensor * get_logits() override { return t_logits; } ggml_tensor * 
get_embd() override { return t_embd; } ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } @@ -340,6 +330,7 @@ public: } // important graph nodes + ggml_tensor * t_tokens = nullptr; ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; @@ -363,8 +354,8 @@ struct llm_graph_params { const llama_cparams & cparams; const llama_ubatch & ubatch; - ggml_backend_sched * sched; - ggml_backend * backend_cpu; + ggml_backend_sched_t sched; + ggml_backend_t backend_cpu; const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; @@ -415,9 +406,9 @@ struct llm_graph_context { ggml_context * ctx0 = nullptr; - ggml_backend_sched * sched; + ggml_backend_sched_t sched; - ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? + ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; @@ -430,7 +421,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const; @@ -504,7 +495,6 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; - ggml_tensor * build_inp_cross_attn_state() const; ggml_tensor * build_inp_cross_embd() const; ggml_tensor * build_inp_pos_bucket_enc() const; diff --git a/llama/llama.cpp/src/llama-hparams.cpp b/llama/llama.cpp/src/llama-hparams.cpp index 6a02de036..8a6679601 100644 --- a/llama/llama.cpp/src/llama-hparams.cpp +++ b/llama/llama.cpp/src/llama-hparams.cpp @@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } - -bool llama_hparams::cross_attention_layers(uint32_t il) const { - return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -} diff --git a/llama/llama.cpp/src/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h index c8a34d521..48dce4071 100644 --- a/llama/llama.cpp/src/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -2,8 +2,6 @@ #include "llama.h" -#include - #include // bump if necessary @@ -44,7 +42,6 @@ struct llama_hparams { uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; - uint32_t n_vocab = 0; // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA uint32_t n_embd_head_k_mla = 0; @@ -59,7 +56,6 @@ struct llama_hparams { std::array n_ff_arr; std::array, 4> n_bskcn_arr = {}; - std::array cross_attn_layers; uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; @@ -72,6 +68,7 @@ struct llama_hparams { float expert_weights_scale = 0.0; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; + uint32_t moe_every_n_layers = 0; float f_norm_eps; float f_norm_rms_eps; @@ -162,9 +159,6 @@ struct llama_hparams { // Block skip connection bool n_bskcn(uint32_t n, uint32_t il) const; - // cross attention layers - bool cross_attention_layers(uint32_t il) const; - bool is_swa(uint32_t il) const; }; diff --git a/llama/llama.cpp/src/llama-kv-cache.cpp b/llama/llama.cpp/src/llama-kv-cache.cpp index 35a750d39..60e67b036 100644 --- a/llama/llama.cpp/src/llama-kv-cache.cpp +++ b/llama/llama.cpp/src/llama-kv-cache.cpp @@ -4,33 +4,41 @@ #include "llama-batch.h" #include "llama-cparams.h" #include "llama-model.h" +#include 
"llama-context.h" #include #include +#include #include #include #include -llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) { +// +// llama_kv_cache_unified +// + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 256u : 32u; } -bool llama_kv_cache_unified::init( +llama_kv_cache_unified::llama_kv_cache_unified( const llama_model & model, - const llama_cparams & cparams, ggml_type type_k, ggml_type type_v, + bool v_trans, + bool offload, uint32_t kv_size, - bool offload) { + uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) { const int32_t n_layer = hparams.n_layer; has_shift = false; + can_shift = true; - recurrent = llama_model_is_recurrent(&model); - v_trans = !recurrent && !cparams.flash_attn; - can_shift = !recurrent; + LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n", + __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding); - LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", - __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); + GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding"); head = 0; size = kv_size; @@ -76,35 +84,24 @@ bool llama_kv_cache_unified::init( const char * dev_name = "CPU"; - ggml_backend_buffer_type_t buft; + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + if (offload) { auto * dev = model.dev_layer(i); buft = ggml_backend_dev_buffer_type(dev); dev_name = ggml_backend_dev_name(dev); - } else { - buft = ggml_backend_cpu_buffer_type(); } - LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, - i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name); ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); - return false; + throw std::runtime_error("failed to create ggml context for kv cache"); } - ggml_tensor * k, *v; - - // for cross attention layers - if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { - k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); - v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); - } else { - k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - } + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); k_l.push_back(k); @@ -118,55 +115,28 @@ bool llama_kv_cache_unified::init( ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); - return false; + throw std::runtime_error("failed to allocate buffer for kv cache"); } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); 
bufs.emplace_back(buf); } - return true; -} + { + const size_t memory_size_k = size_k_bytes(); + const size_t memory_size_v = size_v_bytes(); -int32_t llama_kv_cache_unified::get_n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - - return result; -} - -int32_t llama_kv_cache_unified::get_used_cells() const { - return used; -} - -size_t llama_kv_cache_unified::total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; -} - -llama_pos llama_kv_cache_unified::pos_max() const { - llama_pos pos_max = -1; - for (const auto & cell : cells) { - pos_max = std::max(pos_max, cell.pos); - } - - return pos_max; } void llama_kv_cache_unified::clear() { for (int32_t i = 0; i < (int32_t) size; ++i) { cells[i].pos = -1; cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; } head = 0; used = 0; @@ -187,35 +157,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1 = std::numeric_limits::max(); } - // models like Mamba or RWKV can't have a state partially erased - if (recurrent) { - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - - return true; - } - for (uint32_t i = 0; i < size; ++i) { if (cells[i].pos >= p0 && cells[i].pos < p1) { if (seq_id < 0) { @@ -232,7 +173,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } cells[i].pos = -1; - cells[i].src = -1; if (new_head == size) { new_head = i; @@ -262,34 +202,6 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id p1 = std::numeric_limits::max(); } - if (recurrent) { - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - llama_kv_cell & tail_src = cells[seq_id_src]; - llama_kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - // otherwise, this is the KV of a Transformer-like model head = 0; @@ -304,17 +216,12 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { uint32_t new_head = size; for (uint32_t i = 0; i < size; ++i) { - if (recurrent && (llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - if (!cells[i].has_seq_id(seq_id)) { if 
(cells[i].pos >= 0) { used--; } cells[i].pos = -1; - cells[i].src = -1; cells[i].seq_id.clear(); if (new_head == size){ @@ -352,20 +259,6 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po return; } - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - for (uint32_t i = 0; i < size; ++i) { if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { has_shift = true; @@ -408,21 +301,6 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po return; } - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - - return; - } - for (uint32_t i = 0; i < size; ++i) { if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { has_shift = true; @@ -448,23 +326,11 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { return result; } -void llama_kv_cache_unified::defrag() { - if (!recurrent) { - do_defrag = true; - } -} - void llama_kv_cache_unified::restore() { if (pending.ranges.empty()) { return; } - // TODO: tmp - move to llama_kv_cache_recurrent - if (recurrent) { - seq_rm(-1, -1, -1); - return; - } - uint32_t new_head = size; for (auto & range : pending.ranges) { @@ -477,7 +343,6 @@ void llama_kv_cache_unified::restore() { } cells[i].pos = -1; - cells[i].src = -1; } new_head = std::min(new_head, range.c0); @@ -489,11 +354,6 @@ void llama_kv_cache_unified::restore() { } void llama_kv_cache_unified::commit() { - // TODO: tmp - move to llama_kv_cache_recurrent - if (recurrent) { - return; - } - if (pending.ranges.empty()) { LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n", __func__, "https://github.com/ggml-org/llama.cpp/pull/12695"); @@ -503,8 +363,103 @@ void llama_kv_cache_unified::commit() { pending.ranges.clear(); } -bool llama_kv_cache_unified::get_can_shift() const { - return can_shift; +bool llama_kv_cache_unified::update(llama_context & lctx) { + auto * sched = lctx.get_sched(); + + if (has_shift) { + if (!get_can_shift()) { + GGML_ABORT("The current KV cache / model configuration does not support K-shift"); + } + + LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); + + // apply K-shift if needed + if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + ggml_backend_sched_reset(sched); + + auto * gf = lctx.graph_init(); + + auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf); + + ggml_backend_sched_alloc_graph(sched, gf); + + res->set_inputs(nullptr); + + lctx.graph_compute(gf, false); + } + + { + has_shift = false; + + for (uint32_t i = 0; i < size; ++i) { + cells[i].delta = 0; + } + } + } + + if (do_defrag) { + LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); + const uint32_t n_max_nodes = lctx.graph_max_nodes(); + const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer); + if (!defrag_prepare(n_max_nodes)) { + LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", 
__func__); + return false; + } + + for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) { + std::vector chunk; + auto end = std::min(i + max_moves, defrag_info.moves.size()); + chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end); + + ggml_backend_sched_reset(sched); + + auto * gf = lctx.graph_init(); + + auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk); + + ggml_backend_sched_alloc_graph(sched, gf); + + res->set_inputs(nullptr); + + lctx.graph_compute(gf, false); + } + + do_defrag = false; + } + + // we never need to reserve a worst case graph + return false; +} + +void llama_kv_cache_unified::defrag_sched(float thold) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + + do_defrag = true; + } +} + +void llama_kv_cache_unified::set_full() { + n = size; +} + +llama_sbatch llama_kv_cache_unified::sbatch_init( + const llama_batch & batch, + bool logits_all) { + return llama_sbatch(batch, hparams.n_embd, true, logits_all); +} + +llama_ubatch llama_kv_cache_unified::ubatch_next( + llama_sbatch & sbatch, + uint32_t n_ubatch, + bool embd_pooled) const { + GGML_UNUSED(embd_pooled); + return sbatch.split_simple(n_ubatch); } bool llama_kv_cache_unified::find_slot( @@ -519,169 +474,6 @@ bool llama_kv_cache_unified::find_slot( head = 0; } - if (recurrent) { - // For recurrent state architectures (like Mamba or RWKV), - // each cache cell can store the state for a whole sequence. - // A slot should be always be contiguous. - - // can only process batches with an equal number of new tokens in each sequence - GGML_ASSERT(ubatch.equal_seqs); - - int32_t min = size - 1; - int32_t max = 0; - - // everything should fit if all seq_ids are smaller than the max - for (uint32_t s = 0; s < n_seqs; ++s) { - const uint32_t n_seq_id = ubatch.n_seq_id[s]; - for (uint32_t j = 0; j < n_seq_id; ++j) { - const llama_seq_id seq_id = ubatch.seq_id[s][j]; - - if (seq_id < 0 || (uint32_t) seq_id >= size) { - // too big seq_id - // TODO: would it be possible to resize the cache instead? 
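Stepping back to the defragmentation path added above: defrag_sched uses a plain occupancy ratio, and update slices the move list so each submitted graph stays under the node budget (a move costs six nodes per layer: two views plus a copy for K, and the same for V). A worked example in comments, with illustrative numbers only; the 8192-node budget and the 0.1 threshold are assumptions for this sketch, not values fixed by the patch:

// fragmentation = 1 - (used + padding)/n, evaluated only once n >= 2048
//   n = 4096, used = 2048, padding = 32  ->  1 - 2080/4096, about 0.49
//   a threshold of 0.1 would queue a defrag for the next update()
// max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer)
//   n_max_nodes = 8192, n_layer = 32  ->  (8192 - 64)/192 = 42 moves per graph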
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); - return false; - } - if (j > 0) { - llama_kv_cell & seq = cells[seq_id]; - if (seq.tail >= 0) { - llama_kv_cell & cell = cells[seq.tail]; - // clear cells from seq_ids that become shared - // (should not normally happen, but let's handle it anyway) - cell.seq_id.erase(seq_id); - seq.tail = -1; - if (cell.seq_id.empty()) { - cell.pos = -1; - cell.src = -1; - used -= 1; - } - } - } - } - } - -#ifndef NDEBUG - { - std::vector tails_verif; - tails_verif.assign(size, -1); - for (uint32_t i = 0; i < size; ++i) { - llama_kv_cell & cell = cells[i]; - for (llama_seq_id seq_id : cell.seq_id) { - if (tails_verif[seq_id] != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); - } - tails_verif[seq_id] = i; - } - } - for (uint32_t i = 0; i < size; ++i) { - if (tails_verif[i] != cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); - } - } - } -#endif - - // find next empty cell - uint32_t next_empty_cell = head; - - for (uint32_t i = 0; i < size; ++i) { - if (next_empty_cell >= size) { next_empty_cell -= size; } - llama_kv_cell & cell = cells[next_empty_cell]; - if (cell.is_empty()) { break; } - next_empty_cell += 1; - } - - // find usable cell range - for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - llama_kv_cell & seq_meta = cells[seq_id]; - bool has_cell = false; - if (seq_meta.tail >= 0) { - llama_kv_cell & cell = cells[seq_meta.tail]; - GGML_ASSERT(cell.has_seq_id(seq_id)); - // does this seq_id "own" the cell? - if (cell.seq_id.size() == 1) { has_cell = true; } - } - if (!has_cell) { - llama_kv_cell & empty_cell = cells[next_empty_cell]; - GGML_ASSERT(empty_cell.is_empty()); - // copy old tail into the empty cell - if (seq_meta.tail >= 0) { - llama_kv_cell & orig_cell = cells[seq_meta.tail]; - empty_cell.pos = orig_cell.pos; - empty_cell.src = orig_cell.src; - orig_cell.seq_id.erase(seq_id); - empty_cell.seq_id.insert(seq_id); // will be overwritten - } - seq_meta.tail = next_empty_cell; - // find next empty cell - if (s + 1 < n_seqs) { - next_empty_cell += 1; - for (uint32_t i = 0; i < size; ++i) { - if (next_empty_cell >= size) { next_empty_cell -= size; } - llama_kv_cell & cell = cells[next_empty_cell]; - if (cell.is_empty()) { break; } - next_empty_cell += 1; - } - } - } - if (min > seq_meta.tail) { min = seq_meta.tail; } - if (max < seq_meta.tail) { max = seq_meta.tail; } - } - - // gather and re-order - for (uint32_t s = 0; s < n_seqs; ++s) { - int32_t dst_id = s + min; - int32_t src_id = cells[ubatch.seq_id[s][0]].tail; - if (dst_id != src_id) { - llama_kv_cell & dst_cell = cells[dst_id]; - llama_kv_cell & src_cell = cells[src_id]; - - std::swap(dst_cell.pos, src_cell.pos); - std::swap(dst_cell.src, src_cell.src); - std::swap(dst_cell.seq_id, src_cell.seq_id); - - // swap tails (assuming they NEVER overlap) - for (const llama_seq_id seq_id : src_cell.seq_id) { - cells[seq_id].tail = src_id; - } - for (const llama_seq_id seq_id : dst_cell.seq_id) { - cells[seq_id].tail = dst_id; - } - } - } - - // update the pos of the used seqs - for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; - int32_t cell_id = s + min; - llama_kv_cell & cell = cells[cell_id]; - - if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) 
{ - // What should happen when the pos backtracks or skips a value? - // Clearing the state mid-batch would require special-casing which isn't done. - LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", - __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); - } - cell.pos = last_pos; - cell.seq_id.clear(); - for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { - const llama_seq_id seq_id = ubatch.seq_id[s][j]; - cell.seq_id.insert(seq_id); - cells[seq_id].tail = cell_id; - } - } - - // allow getting the range of used cells, from head to head + n - head = min; - n = max - min + 1; - used = std::count_if(cells.begin(), cells.end(), - [](const llama_kv_cell& cell){ return !cell.is_empty(); }); - - // sanity check - return n >= n_seqs; - } - // otherwise, one cell per token. if (n_tokens > size) { @@ -733,24 +525,50 @@ bool llama_kv_cache_unified::find_slot( pending.ranges.push_back({head, head + n_tokens}); + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding))); + + //printf("n = %5d, used = %5d, head = %5d\n", n, used, head); + return true; } -uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 256u : 32u; -} +int32_t llama_kv_cache_unified::get_n_tokens() const { + int32_t result = 0; -uint32_t llama_kv_cache_unified::cell_max() const { - for (uint32_t i = size; i > 0; --i) { - const llama_kv_cell & cell = cells[i - 1]; - - if (cell.pos >= 0 && !cell.is_empty()) { - return i; - } + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); } - return 0; + return result; +} + +int32_t llama_kv_cache_unified::get_used_cells() const { + return used; +} + +bool llama_kv_cache_unified::get_can_shift() const { + return can_shift; +} + +llama_pos llama_kv_cache_unified::get_pos_max() const { + llama_pos pos_max = -1; + for (const auto & cell : cells) { + pos_max = std::max(pos_max, cell.pos); + } + + return pos_max; +} + +size_t llama_kv_cache_unified::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; } size_t llama_kv_cache_unified::size_k_bytes() const { @@ -773,6 +591,254 @@ size_t llama_kv_cache_unified::size_v_bytes() const { return size_v_bytes; } +ggml_tensor * llama_kv_cache_unified::build_rope_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + float freq_base, + float freq_scale) const { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = hparams.n_rot; + const auto & rope_type = hparams.rope_type; + + // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. + // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; + + ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx, cur, GGML_TYPE_F32); + + tmp = ggml_rope_ext(ctx, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + +class llm_graph_input_k_shift : public llm_graph_input_i { +public: + llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + virtual ~llm_graph_input_k_shift() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * k_shift; // I32 [kv_size] + + const llama_kv_cache_unified * kv_self; +}; + +void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (k_shift) { + assert(ggml_backend_buffer_is_host(k_shift->buffer)); + + int32_t * data = (int32_t *) k_shift->data; + + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; + } + } +} + +llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf) const { + auto res = std::make_unique(); + + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + //GGML_ASSERT(kv_self->size == n_ctx); + + auto inp = std::make_unique(this); + + inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx); + ggml_set_input(inp->k_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + const bool is_swa = hparams.is_swa(il); + + // note: the swa rope params could become part of the cparams in the future + // if we decide to make them configurable, like the non-sliding ones + const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; + const float freq_scale_l = is_swa ? 
hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; + + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + ggml_tensor * k = + ggml_view_3d(ctx, k_l[il], + n_embd_head_k, n_head_kv, size, + ggml_row_size(k_l[il]->type, n_embd_head_k), + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); + + ggml_build_forward_expand(gf, cur); + } + + res->add_input(std::move(inp)); + + return res; +} + +llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf, + const std::vector & moves) const { + auto res = std::make_unique(); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (const auto & move : moves) { + for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il], + n_embd_k_gqa, move.len, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il], + n_embd_k_gqa, move.len, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx, v_l[il], + n_embd_v_gqa, move.len, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst)); + + view_v_dst = ggml_view_2d(ctx, v_l[il], + move.len, 
n_embd_v_gqa, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, move.src)); + } else { + view_v_src = ggml_view_2d(ctx, v_l[il], + move.len, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, move.src)); + + view_v_dst = ggml_view_2d(ctx, v_l[il], + move.len, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, move.dst)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst)); + } + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif + + return res; +} + bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { const uint32_t n_layer = hparams.n_layer; @@ -854,7 +920,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { cells[i0 + nf] = cell1; // clear the old cell and move the head there - cell1 = llama_kv_cell(); + cell1 = kv_cell(); head = n_used; if (!cont) { @@ -885,6 +951,18 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { return true; } +uint32_t llama_kv_cache_unified::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -1093,7 +1171,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell clear(); for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = cells[i]; + kv_cell & cell = cells[i]; llama_pos pos; uint32_t n_seq_id; @@ -1116,15 +1194,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell } cell.seq_id.insert(seq_id); - - if (recurrent) { - int32_t & tail = cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } } } @@ -1132,14 +1201,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell used = cell_count; } - if (recurrent) { - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = head + i; - // make sure the recurrent states will keep their restored state - cells[cell_id].src = cell_id; - } - } - return true; } @@ -1157,7 +1218,1034 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); return false; } - if (v_trans != (bool) v_trans) { + if (this->v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = 
ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!this->v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// llama_kv_cache_recurrent +// + +llama_kv_cache_recurrent::llama_kv_cache_recurrent( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool offload, + uint32_t kv_size) : hparams(model.hparams) { + const int32_t n_layer = hparams.n_layer; + + LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", + __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); + + head = 0; + size = kv_size; + used = 0; + + this->type_k = type_k; + this->type_v = type_v; + + cells.clear(); + 
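+ // unlike the unified cache, a recurrent cell is not one token: each cell
+ // holds the whole state for one sequence, cells[seq_id].tail points at the
+ // cell that currently owns that sequence's state, and kv_size is therefore
+ // a cap on the number of parallel sequences rather than on context length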
cells.resize(kv_size); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + ggml_init_params params = { + /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + k_l.reserve(n_layer); + v_l.reserve(n_layer); + + for (int i = 0; i < n_layer; i++) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + + const char * dev_name = "CPU"; + + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + + if (offload) { + auto * dev = model.dev_layer(i); + buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); + } + + LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for kv cache"); + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + k_l.push_back(k); + v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + auto * buft = it.first; + auto * ctx = it.second; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + bufs.emplace_back(buf); + } + + { + const size_t memory_size_k = size_k_bytes(); + const size_t memory_size_v = size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } +} + +void llama_kv_cache_recurrent::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the 
range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) { + return false; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + cells[i].pos = -1; + cells[i].src = -1; + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits<llama_pos>::max(); + } + + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + kv_cell & tail_src = cells[seq_id_src]; + kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } +} + +void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if ((llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits<llama_pos>::max(); + } + + // If there is no range then return early to avoid looping over the cache. + if (p0 == p1) { + return; + } + + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } +} + +void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits<llama_pos>::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } +} + +llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache_recurrent::restore() { + if (pending.ranges.empty()) { + return; + } + + seq_rm(-1, -1, -1); +} + +void llama_kv_cache_recurrent::commit() { + pending.ranges.clear(); +} + +bool llama_kv_cache_recurrent::update(llama_context & lctx) { + GGML_UNUSED(lctx); + return false; +} + +void llama_kv_cache_recurrent::defrag_sched(float thold) { + GGML_UNUSED(thold); + // noop +} + +void llama_kv_cache_recurrent::set_full() { + n = size; +} + +llama_sbatch llama_kv_cache_recurrent::sbatch_init( + const llama_batch & batch, + bool logits_all) { + return llama_sbatch(batch, hparams.n_embd, false, logits_all); +} + +llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + return sbatch.split_seq(n_ubatch); + } + + return sbatch.split_equal(n_ubatch); +} + +bool llama_kv_cache_recurrent::find_slot( + const llama_ubatch & ubatch) { + const uint32_t n_tokens = ubatch.n_tokens; + const uint32_t n_seqs = ubatch.n_seqs; + + const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head > used + 2*n_tokens) { + head = 0; + } + + // For recurrent state architectures (like Mamba or RWKV), + // each cache cell can store the state for a whole sequence. + // A slot should always be contiguous. + + // can only process batches with an equal number of new tokens in each sequence + GGML_ASSERT(ubatch.equal_seqs); + + int32_t min = size - 1; + int32_t max = 0; + + // everything should fit if all seq_ids are smaller than the max + for (uint32_t s = 0; s < n_seqs; ++s) { + const uint32_t n_seq_id = ubatch.n_seq_id[s]; + for (uint32_t j = 0; j < n_seq_id; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + + if (seq_id < 0 || (uint32_t) seq_id >= size) { + // too big seq_id + // TODO: would it be possible to resize the cache instead? 
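+ // note: seq_id indexes cells[] directly in the recurrent cache, so an id
+ // >= size has no cell to live in; size doubles as n_seq_max below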
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + return false; + } + if (j > 0) { + kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + kv_cell & dst_cell = cells[dst_id]; + kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { + // What should happen when the pos backtracks or skips a value? 
+ // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const kv_cell & cell){ return !cell.is_empty(); }); + + // sanity check + return n >= n_seqs; +} + +int32_t llama_kv_cache_recurrent::get_n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; +} + +int32_t llama_kv_cache_recurrent::get_used_cells() const { + return used; +} + +llama_pos llama_kv_cache_recurrent::get_pos_max() const { + llama_pos pos_max = -1; + for (const auto & cell : cells) { + pos_max = std::max(pos_max, cell.pos); + } + + return pos_max; +} + +bool llama_kv_cache_recurrent::get_can_shift() const { + return false; +} + +int32_t llama_kv_cache_recurrent::s_copy(int i) const { + const uint32_t cell_id = i + head; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]); + + // prevent out-of-bound sources + if (cell.src < 0 || (uint32_t) cell.src >= size) { + cell.src = cell_id; + } + + int32_t res = cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (cell.src != (int32_t) cell_id) { + cell.src = cell_id; + } + + return res; +} + +float llama_kv_cache_recurrent::s_mask(int i) const { + const uint32_t cell_id = i + head; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache !
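// [Editor's note, not part of the patch.] s_copy() and s_mask() are const in
// signature but lazily mutate cell.src through the const_cast seen here. The
// first s_copy(i) call may return an older cell index (the state to copy
// from); src is then pinned to the cell itself so a second graph build over
// the same ubatch copies nothing. s_mask(i) plays the same trick: it reports
// whether the cell held valid state (src >= 0) and then marks the state as
// initialized, so stale state is cleared exactly once. The TODOs flag this
// mutation for a future cleanup.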
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]); + + float res = (float) (cell.src >= 0); + + // only clear once + if (cell.src < 0) { + cell.src = cell_id; + } + + return res; +} + +uint32_t llama_kv_cache_recurrent::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + +size_t llama_kv_cache_recurrent::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +size_t llama_kv_cache_recurrent::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_recurrent::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges); +} + +void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const { + const uint32_t v_trans = 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector<uint8_t> tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t
n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + commit(); + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache_recurrent should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + + head = 0; + used = cell_count; + } + + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + + return true; +} + +bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (false != (bool) v_trans) { LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); return false; } @@ -1309,7 +2397,7 @@ void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache view->cells_sequences = (llama_seq_id *)p; } - const std::vector & kv_cells = kvu->cells; + const std::vector & kv_cells = kvu->cells; llama_kv_cache_view_cell * c_curr = view->cells; llama_seq_id * cs_curr = view->cells_sequences; int32_t used_cells = 0; diff --git a/llama/llama.cpp/src/llama-kv-cache.h b/llama/llama.cpp/src/llama-kv-cache.h index 25cbcb562..928b97125 100644 --- a/llama/llama.cpp/src/llama-kv-cache.h +++ b/llama/llama.cpp/src/llama-kv-cache.h @@ -2,32 +2,72 @@ #include "llama.h" #include "llama-io.h" +#include 
"llama-graph.h" #include "llama-memory.h" #include "ggml-cpp.h" -#include #include #include struct llama_cparams; struct llama_hparams; struct llama_ubatch; +struct llama_sbatch; +struct llama_model; +struct llama_context; struct llama_kv_cache : public llama_memory_i { - using llama_memory_i::llama_memory_i; + virtual ~llama_kv_cache() = default; - virtual void restore() = 0; // call if batch processing fails - restores the cache state - virtual void commit() = 0; // call after successful batch processing - clears any pending state + // call if batch processing fails - restores the cache state + virtual void restore() = 0; - virtual int32_t get_n_tokens() const = 0; - virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache + // call after successful batch processing - clears any pending state + virtual void commit() = 0; - virtual bool get_can_shift() const = 0; + // process any pending defrag/shift/etc. operations + // optionally call once before processing a new batch + virtual bool update(llama_context & lctx) = 0; + + // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing + virtual void defrag_sched(float thold) = 0; + + // simulate full cache, used for allocating worst-case compute buffers + virtual void set_full() = 0; + + // + // batch processing + // + + virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0; + + // different KV caches require different batch splitting strategies + virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0; + + // find an empty slot of size "n_tokens" in the cache + virtual bool find_slot(const llama_ubatch & batch) = 0; + + // getters + virtual int32_t get_n_tokens() const = 0; + virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache + virtual llama_pos get_pos_max() const = 0; + virtual bool get_can_shift() const = 0; bool get_can_edit() const override { return get_can_shift(); } + + // + // state write/read + // + + virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; + virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; }; +// +// llama_kv_cache_guard +// + struct llama_kv_cache_guard { llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {} @@ -42,7 +82,7 @@ struct llama_kv_cache_guard { private: llama_kv_cache * kv; }; - + // block of KV slots to move when defragging struct llama_kv_defrag_move { uint32_t src; @@ -50,65 +90,50 @@ struct llama_kv_defrag_move { uint32_t len; }; -struct llama_kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - int32_t src = -1; // used by recurrent state models to copy states - int32_t tail = -1; +// +// llama_kv_cache_unified +// - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } - - bool is_empty() const { - return seq_id.empty(); - } - - bool is_same_seq(const llama_kv_cell & other) const { - return seq_id == other.seq_id; - } -}; - -// ring-buffer of cached KV data -// TODO: pimpl // TODO: add notion of max sequences class llama_kv_cache_unified : public llama_kv_cache { public: - // can be used to query data from the model if needed - struct callbacks { - std::function get_rope_factors; + struct kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } + + bool is_empty() 
const { + return seq_id.empty(); + } + + bool is_same_seq(const kv_cell & other) const { + return seq_id == other.seq_id; + } }; + static uint32_t get_padding(const llama_cparams & cparams); + llama_kv_cache_unified( - const llama_hparams & hparams, - callbacks cbs); - - virtual ~llama_kv_cache_unified() = default; - - // TODO: become constructor - bool init( - const llama_model & model, // TODO: do not reference the model - const llama_cparams & cparams, + const llama_model & model, ggml_type type_k, ggml_type type_v, + bool v_trans, + bool offload, uint32_t kv_size, - bool offload); + uint32_t padding); - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; + ~llama_kv_cache_unified() = default; - size_t total_size() const; - - // TODO: better data structures to reduce the cost of this operation - llama_pos pos_max() const; + // + // llama_memory_i + // void clear() override; - void defrag() override; - - virtual void restore() override; - virtual void commit() override; bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; @@ -118,63 +143,40 @@ public: llama_pos seq_pos_max(llama_seq_id seq_id) const override; - bool get_can_shift() const override; + // + // llama_kv_cache + // + + void restore() override; + void commit() override; + + bool update(llama_context & ctx) override; + + void defrag_sched(float thold) override; + + void set_full() override; + + llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; + + llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; - // find an empty slot of size "n_tokens" in the cache // updates the cache head // Note: On success, it's important that cache.head points // to the first cell of the slot. 
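(Annotation, not part of the patch, placed ahead of the find_slot declaration below.) The virtuals collected above, restore/commit, update, defrag_sched, set_full, and the sbatch_init/ubatch_next/find_slot trio, imply a specific calling order during decoding. A hedged sketch of that lifecycle, assuming llama_kv_cache_guard restores on early exit and commits on success (its body is elided by the hunk above), with graph building and error handling omitted:

```cpp
// Hedged sketch, not the actual llama_decode implementation: the intended
// order of the new llama_kv_cache virtuals during one decode call.
void decode_sketch(llama_context & lctx, llama_kv_cache & kv,
                   const llama_batch & batch, uint32_t n_ubatch) {
    llama_kv_cache_guard guard(&kv);   // restores the cache if we bail out early

    kv.update(lctx);                   // apply any pending shift/defrag work

    llama_sbatch sbatch = kv.sbatch_init(batch, /*logits_all=*/false);

    while (sbatch.n_tokens > 0) {
        llama_ubatch ubatch = kv.ubatch_next(sbatch, n_ubatch, /*embd_pooled=*/false);

        if (!kv.find_slot(ubatch)) {
            return;                    // guard rolls back the pending cell updates
        }
        // ... build and compute the graph for this ubatch ...
    }

    guard.commit();                    // accept the pending cell updates
}
```

The split between ubatch_next strategies is the reason this trio is virtual at all: the unified cache splits batches into equal-sized chunks, while the recurrent cache must split per sequence so each ubatch maps onto whole state cells.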
- bool find_slot(const llama_ubatch & batch); + bool find_slot(const llama_ubatch & batch) override; - // TODO: maybe not needed - uint32_t get_padding(const llama_cparams & cparams) const; + int32_t get_n_tokens() const override; + int32_t get_used_cells() const override; - // find how many cells are currently in use - uint32_t cell_max() const; + // TODO: better data structures to reduce the cost of this operation + llama_pos get_pos_max() const override; - size_t size_k_bytes() const; - size_t size_v_bytes() const; - - // defrag - - struct { - std::vector moves; - } defrag_info; - - // return true if cells have been moved - bool defrag_prepare(int32_t n_max_nodes); - - // commit/restore cache - - struct slot_range { - uint32_t c0 = 0; // note: these are cell indices, not sequence positions - uint32_t c1 = 0; - }; - - // pending cell updates that are not yet committed - struct { - std::vector ranges; - } pending; + bool get_can_shift() const override; // state write/load - void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; - void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); - - // members - - const llama_hparams & hparams; - - callbacks cbs; - - bool has_shift = false; - bool do_defrag = false; - - // TODO: remove this and implement llama_kv_cache_recurrent instead - bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token - - bool v_trans = true; // the value tensor is transposed - bool can_shift = false; + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_impl also uses it, so it @@ -186,18 +188,214 @@ public: // computed before each graph build uint32_t n = 0; - std::vector cells; + std::vector cells; std::vector k_l; // per layer std::vector v_l; private: + const llama_model & model; + const llama_hparams & hparams; + + bool has_shift = false; + bool do_defrag = false; + + bool v_trans = true; // the value tensor is transposed + bool can_shift = false; + + // required padding + uint32_t padding = 1; + ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; std::vector ctxs; std::vector bufs; + // defrag + struct { + std::vector moves; + } defrag_info; + + // return true if cells have been moved + bool defrag_prepare(int32_t n_max_nodes); + + // commit/restore cache + struct slot_range { + uint32_t c0 = 0; // note: these are cell indices, not sequence positions + uint32_t c1 = 0; + }; + + // pending cell updates that are not yet committed + struct { + std::vector ranges; + } pending; + + // find how many cells are currently in use + uint32_t cell_max() const; + + size_t total_size() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + + ggml_tensor * build_rope_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + float freq_base, + float freq_scale) const; + + llm_graph_result_ptr build_graph_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf) const; + + llm_graph_result_ptr build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf, + const std::vector & moves) const; + + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void 
state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; + + bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(llama_io_read_i & io, uint32_t cell_count); +}; + +// +// llama_kv_cache_recurrent +// + +class llama_kv_cache_recurrent : public llama_kv_cache { +public: + struct kv_cell { + llama_pos pos = -1; + int32_t src = -1; // used to copy states + int32_t tail = -1; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const kv_cell & other) const { + return seq_id == other.seq_id; + } + }; + + llama_kv_cache_recurrent( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool offload, + uint32_t kv_size); + + ~llama_kv_cache_recurrent() = default; + + // + // llama_memory_i + // + + void clear() override; + + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + + // + // llama_kv_cache + // + + void restore() override; + void commit() override; + + bool update(llama_context & lctx) override; + + void defrag_sched(float thold) override; + + void set_full() override; + + llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; + + llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; + + bool find_slot(const llama_ubatch & batch) override; + + int32_t get_n_tokens() const override; + int32_t get_used_cells() const override; + + // TODO: better data structures to reduce the cost of this operation + llama_pos get_pos_max() const override; + + bool get_can_shift() const override; + + // TODO: temporary methods - they are not really const as they do const_cast<>, fix this + int32_t s_copy(int i) const; + float s_mask(int i) const; + + // state write/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_impl also uses it, so it + // cannot be freely changed after a slot has been allocated. + uint32_t head = 0; + uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + std::vector k_l; // per layer + std::vector v_l; + +private: + //const llama_model & model; + const llama_hparams & hparams; + + // commit/restore cache + // TODO: rework for recurrent cache + struct slot_range { + uint32_t c0 = 0; // note: these are cell indices, not sequence positions + uint32_t c1 = 0; + }; + + // pending cell updates that are not yet committed + struct { + std::vector ranges; + } pending; + + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + std::vector ctxs; + std::vector bufs; + + // find how many cells are currently in use + uint32_t cell_max() const; + + size_t total_size() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; @@ -205,11 +403,6 @@ private: bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; -// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified -//class llama_kv_cache_recurrent : public llama_kv_cache_unified { -//public: -// using llama_kv_cache_unified::llama_kv_cache_unified; -//}; // // kv cache view diff --git a/llama/llama.cpp/src/llama-memory.h b/llama/llama.cpp/src/llama-memory.h index dfa8c4e90..c7412d591 100644 --- a/llama/llama.cpp/src/llama-memory.h +++ b/llama/llama.cpp/src/llama-memory.h @@ -2,12 +2,22 @@ #include "llama.h" +struct llama_memory_params { + // kv cache + ggml_type type_k; + ggml_type type_v; + + // parameters for other types of memory + // ... +}; + // general concept of LLM memory // the KV cache is a type of LLM memory, but there can be other types class llama_memory_i { public: + virtual ~llama_memory_i() = default; + virtual void clear() = 0; - virtual void defrag() = 0; virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; diff --git a/llama/llama.cpp/src/llama-model-loader.cpp b/llama/llama.cpp/src/llama-model-loader.cpp index 2e11507d9..7f6617fac 100644 --- a/llama/llama.cpp/src/llama-model-loader.cpp +++ b/llama/llama.cpp/src/llama-model-loader.cpp @@ -301,12 +301,12 @@ namespace GGUFMeta { GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same::value) || - (std::is_same::value)); break; + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || + (std::is_same::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); } result.resize(arr_info.length); @@ -315,8 +315,6 @@ namespace GGUFMeta { return true; } - template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); - template bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); @@ -332,12 +330,12 @@ namespace GGUFMeta { GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: 
GGML_ASSERT((std::is_same<T, float>::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same<T, int32_t>::value) || - (std::is_same<T, uint32_t>::value)); break; + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || + (std::is_same<T, uint32_t>::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; default: - throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); } result.resize(arr_info.length); @@ -315,8 +315,6 @@ namespace GGUFMeta { return true; } - template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required); - template<typename T, size_t N_MAX> bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); @@ -332,12 +330,12 @@ namespace GGUFMeta { GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid); switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same<T, int32_t>::value) || - (std::is_same<T, uint32_t>::value)); break; + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) || + (std::is_same<T, uint32_t>::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break; default: - throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); } if (arr_info.length > N_MAX) { @@ -826,6 +824,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps mmaps_used.reserve(files.size()); for (const auto & file : files) { auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + if (!reg) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn()); mmaps_used.emplace_back(mapping->size(), 0); diff --git a/llama/llama.cpp/src/llama-model-saver.cpp b/llama/llama.cpp/src/llama-model-saver.cpp new file mode 100644 index 000000000..a70b98923 --- /dev/null +++ b/llama/llama.cpp/src/llama-model-saver.cpp @@ -0,0 +1,281 @@ +#include "llama-model-saver.h" + +#include "gguf.h" + +#include "llama.h" +#include "llama-hparams.h" +#include "llama-model.h" +#include "llama-vocab.h" + +#include <string> + +llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) { + gguf_ctx = gguf_init_empty(); +} + +llama_model_saver::~llama_model_saver() { + gguf_free(gguf_ctx); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) { + gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) { + gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const float value) { + gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const bool value) { + gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const char * value) { + gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value); +} + +[[noreturn]] +void llama_model_saver::add_kv(const enum llm_kv key, const char value) { + GGML_UNUSED(key); + GGML_UNUSED(value); + GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile +} + +template <typename Container> +void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) { + const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size(); + GGML_ASSERT(n_values <= value.size()); + + if (n_values == 0) { + return; + } + + if (per_layer) { + bool all_values_the_same = true; + for (size_t i = 1; i < n_values; ++i) { + if (value[i] != value[0]) { + all_values_the_same = false; + break; + } + } + if (all_values_the_same) { + add_kv(key, value[0]); + return; + } + } + + if (std::is_same<typename Container::value_type, uint8_t>::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values); + } else if (std::is_same<typename Container::value_type, int8_t>::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values); + } else if (std::is_same<typename Container::value_type, uint32_t>::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values); + } else if (std::is_same<typename Container::value_type, int32_t>::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values); + } else if (std::is_same<typename Container::value_type, float>::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values); + } else if (std::is_same<Container, std::string>::value) { + gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data())); + } else { + GGML_ABORT("fatal error"); + } +} + +void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) { + std::vector<const char *> tmp(value.size()); + for (size_t i = 0; i < value.size(); ++i) { + tmp[i] = value[i].c_str(); + } + gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size()); +} + +void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) { + if (!tensor) { + return; + } + if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) { + GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME + return; + } + gguf_add_tensor(gguf_ctx, tensor); +} + +void llama_model_saver::add_kv_from_model() { + const llama_hparams & hparams = model.hparams; + const llama_vocab & vocab = model.vocab; + + const int32_t n_vocab = vocab.n_tokens(); + std::vector<std::string> tokens(n_vocab); + std::vector<float> scores(n_vocab); + std::vector<int32_t> token_types(n_vocab); + + for (int32_t id = 0; id < n_vocab; ++id) { + const llama_vocab::token_data & token_data = vocab.get_token_data(id); + + tokens[id] = token_data.text; + scores[id] = token_data.score; + + switch(token_data.attr) { + case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break; + case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break; + case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break; + case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break; + case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break; + case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break; + case LLAMA_TOKEN_ATTR_UNDEFINED: + default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break; + } + } + + // add_kv(LLM_KV_GENERAL_TYPE, ???); + add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name()); + // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???); + // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???); + add_kv(LLM_KV_GENERAL_NAME, model.name); + // add_kv(LLM_KV_GENERAL_AUTHOR, ???); + // add_kv(LLM_KV_GENERAL_VERSION, ???); + // add_kv(LLM_KV_GENERAL_URL, ???); + // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???); + // add_kv(LLM_KV_GENERAL_LICENSE, ???); + // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???); + // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???); + + add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens()); + add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + add_kv(LLM_KV_EMBEDDING_LENGTH,
hparams.n_embd); + add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); + add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); + add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); + // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???); + add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert); + add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); + add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); + add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); + add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping); + add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping); + add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm); + add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers); + add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim); + add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim); + add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + + add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true); + add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); + add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); + add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v); + add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + + const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 
0.0f : 1.0f/hparams.rope_freq_scale_train; + + add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); + add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); + // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name + add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); + add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor); + add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor); + add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn); + add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned); + add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); + + // TODO: implement split file support + // add_kv(LLM_KV_SPLIT_NO, ???); + // add_kv(LLM_KV_SPLIT_COUNT, ???); + // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???); + + add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms); + + add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); + + add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model()); + add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre()); + add_kv(LLM_KV_TOKENIZER_LIST, tokens); + add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types); + add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types()); + add_kv(LLM_KV_TOKENIZER_SCORES, scores); + add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges()); + // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though + add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos())); + add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos())); + add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot())); + add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom())); + add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk())); + add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep())); + add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad())); + // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated + // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???); + add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos()); + add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos()); + add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix()); + add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces()); + add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap()); + // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???); + // add_kv(LLM_KV_TOKENIZER_RWKV, ???); + add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre())); + add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf())); + add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid())); + add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad())); + add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep())); + add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep())); + + // TODO: implement LoRA support + // add_kv(LLM_KV_ADAPTER_TYPE, ???); + // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???); + + // deprecated + // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???); + // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???); + // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???); +} + +void llama_model_saver::add_tensors_from_model() { + if (std::string(model.output->name) != std::string(model.tok_embd->name)) { + add_tensor(model.tok_embd); // some 
models use the same tensor for tok_embd and output + } + add_tensor(model.type_embd); + add_tensor(model.pos_embd); + add_tensor(model.tok_norm); + add_tensor(model.tok_norm_b); + add_tensor(model.output_norm); + add_tensor(model.output_norm_b); + add_tensor(model.output); + add_tensor(model.output_b); + add_tensor(model.output_norm_enc); + add_tensor(model.cls); + add_tensor(model.cls_b); + add_tensor(model.cls_out); + add_tensor(model.cls_out_b); + + for (const struct llama_layer & layer : model.layers) { + for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) { + add_tensor(reinterpret_cast(&layer)[i]); + } + } +} + +void llama_model_saver::save(const std::string & path_model) { + gguf_write_to_file(gguf_ctx, path_model.c_str(), false); +} + diff --git a/llama/llama.cpp/src/llama-model-saver.h b/llama/llama.cpp/src/llama-model-saver.h new file mode 100644 index 000000000..a5a434c30 --- /dev/null +++ b/llama/llama.cpp/src/llama-model-saver.h @@ -0,0 +1,37 @@ +#pragma once + +#include "llama.h" +#include "llama-arch.h" + +#include + +struct llama_model_saver { + struct gguf_context * gguf_ctx = nullptr; + const struct llama_model & model; + const struct LLM_KV llm_kv; + + llama_model_saver(const struct llama_model & model); + ~llama_model_saver(); + + void add_kv(enum llm_kv key, uint32_t value); + void add_kv(enum llm_kv key, int32_t value); + void add_kv(enum llm_kv key, float value); + void add_kv(enum llm_kv key, bool value); + void add_kv(enum llm_kv key, const char * value); + + [[noreturn]] + void add_kv(enum llm_kv key, char value); // needed to make the template below compile + + template + void add_kv(enum llm_kv key, const Container & value, bool per_layer = false); + + void add_kv(enum llm_kv key, const std::vector & value); + + void add_tensor(const struct ggml_tensor * tensor); + + void add_kv_from_model(); + + void add_tensors_from_model(); + + void save(const std::string & path_model); +}; diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index c8374159f..db62973fa 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_335M: return "335M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; + case LLM_TYPE_475M: return "475M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; + case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_6B: return "1.6B"; + case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2_8B: return "2.8B"; @@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_15B: return "15B"; case LLM_TYPE_16B: return "16B"; case LLM_TYPE_20B: return "20B"; + case LLM_TYPE_27B: return "27B"; case LLM_TYPE_30B: return "30B"; case LLM_TYPE_32B: return "32B"; case LLM_TYPE_34B: return "34B"; @@ -74,7 +78,9 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_65B: return "65B"; case LLM_TYPE_70B: return "70B"; case LLM_TYPE_236B: return "236B"; + case LLM_TYPE_290B: return "290B"; case LLM_TYPE_314B: return "314B"; + case LLM_TYPE_405B: return "405B"; case LLM_TYPE_671B: return "671B"; case LLM_TYPE_SMALL: return "0.1B"; case LLM_TYPE_MEDIUM: return "0.4B"; @@ -88,10 +94,10 @@ const char * llm_type_name(llm_type 
type) { case LLM_TYPE_16x3_8B: return "16x3.8B"; case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; case LLM_TYPE_57B_A14B: return "57B.A14B"; - case LLM_TYPE_27B: return "27B"; - case LLM_TYPE_290B: return "290B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_235B_A22B: return "235B.A22B"; default: return "?B"; } } @@ -111,6 +117,10 @@ static const std::map LLAMA_ROPE_SCALING_ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" }, }; +std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) { + return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type); +} + static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { if (kv.second == name) { @@ -293,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector & de // add extra buffer types, only if no GPU device is present // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (cpu_dev == nullptr) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts"); @@ -419,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, name, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -431,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); @@ -455,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); - ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -512,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -575,22 +585,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { 
hparams.use_kq_norm = false; } } break; - case LLM_ARCH_MLLAMA: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - switch (hparams.n_layer) { - case 40: type = LLM_TYPE_11B; break; - case 100: type = LLM_TYPE_90B; break; - default: type = LLM_TYPE_UNKNOWN; - } - } break; case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; + case 162: type = LLM_TYPE_405B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -709,13 +710,19 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { - type = LLM_TYPE_137M; + if (arch == LLM_ARCH_NOMIC_BERT) { + type = LLM_TYPE_137M; + } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { + type = LLM_TYPE_475M; + } } } break; case LLM_ARCH_BLOOM: @@ -776,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // fall through case LLM_ARCH_QWEN2: { + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; @@ -805,6 +813,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; + case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; + case 40: type = LLM_TYPE_14B; break; + case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -814,6 +826,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -1425,7 +1439,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; - case LLM_ARCH_MISTRAL3: break; default: throw std::runtime_error("unsupported model architecture"); } @@ -1494,6 +1507,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (cpu_dev == nullptr) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); const int act_gpu_layers = devices.empty() ? 
0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { @@ -1565,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_vocab = hparams.n_vocab; + const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; @@ -1661,8 +1677,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { std::regex pattern(overrides->pattern); if (std::regex_search(tensor_name, pattern)) { - LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); buft = overrides->buft; + LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", + tensor_name.c_str(), + ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type), + ggml_backend_buft_name(buft)); break; } } @@ -1679,6 +1698,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto * buft_dev = ggml_backend_buft_get_device(buft); if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error("no CPU backend found"); + } buft = ggml_backend_dev_buffer_type(cpu_dev); } @@ -1818,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; - case LLM_ARCH_MLLAMA: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); - - // output - { - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); - - // if output is NULL, init from the input tok embed - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); - } - } - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - if (hparams.cross_attention_layers(i)) { - layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); - layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); - layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); - layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_norm = 
create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - } else { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - } - } - } break; case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -1906,7 +1882,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + if (n_ff > 0) { + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + } if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); @@ -1916,9 +1894,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + if (n_ff > 0) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } // optional MLP bias layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); @@ -2133,6 +2113,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); @@ -2166,20 +2147,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); } + if (arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + } else { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } } layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); @@ -3550,7 +3542,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + 
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -4183,6 +4179,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!dev) { // FIXME: workaround for CPU backend buft having a NULL device dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } } ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); @@ -4312,7 +4311,7 @@ uint64_t llama_model::n_elements() const { } void llama_model::print_info() const { - const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train); auto print_f = [](const std::function & f, uint32_t n) { bool is_var = false; @@ -4373,7 +4372,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn); LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); - LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); @@ -4520,6 +4519,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const { return it->second; } +ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const { + // choose long/short freq factors based on the context size + if (layers[il].rope_freqs != nullptr) { + return layers[il].rope_freqs; + } + + if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { + return layers[il].rope_long; + } + + return layers[il].rope_short; +} + struct llm_build_llama : public llm_graph_context { llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -4560,7 +4572,7 @@ struct llm_build_llama : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4744,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context { } }; -struct llm_build_mllama: public llm_graph_context { - llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - ggml_tensor * cur; - ggml_tensor * inpL; - ggml_tensor * inpCAS; - - inpL = 
build_inp_embd(model.tok_embd); - inpCAS = build_inp_cross_attn_state(); - - // inp_pos - contains the positions - ggml_tensor * inp_pos = build_inp_pos(); - - auto * inp_attn = build_attn_inp_kv_unified(); - const llama_kv_cache_unified * kv_self = static_cast(memory); - - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - if (hparams.cross_attention_layers(il)) { - if (!ubatch.embd && !cparams.cross_attn) { - continue; - } - - // cross attention layer - ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); - cb(Qcur, "Qcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); - cb(Qcur, "Qcur", il); - - Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); - cb(Qcur, "Qcur", il); - - ggml_tensor * Kcur, * Vcur; - if (ubatch.embd) { - Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); - cb(Kcur, "Kcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); - cb(Kcur, "Kcur", il); - - Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - cb(Kcur, "Kcur", il); - - Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); - cb(Kcur, "Kcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); - - Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); - cb(Vcur, "Vcur", il); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); - cb(Vcur, "Vcur", il); - - Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - cb(Vcur, "Vcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); - } else { - Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); - cb(Kcur, "Kcur (view)", il); - - Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); - cb(Vcur, "Vcur (view)", il); - } - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); - cb(kq, "kq", il); - - // TODO: apply causal masks - struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - cb(kq_soft_max, "kq_soft_max", il); - - Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); - cb(cur, "cur", il); - - // TODO: do this in place once? - cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - // TODO: do this inplace once? 
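For context on the deletion in progress here: each mllama cross-attention layer scales both its attention branch and its feed-forward branch by a learned per-layer gate squashed through tanh before the residual add (cross_attn_attn_gate above, cross_attn_mlp_gate in the deleted line below, where ggml_mul_inplace and ggml_add_inplace fuse the multiply and the add). A minimal standalone sketch of that gating pattern; gated_residual is a hypothetical illustrative helper, not a ggml API:

#include <cmath>
#include <cstdio>

// out = residual + tanh(gate) * branch. With a zero-initialized gate,
// tanh(0) == 0 and the block starts out as an identity mapping, leaving
// the pretrained text path untouched until the gate is trained open
// (that reading of the zero-init intent is an assumption, not something
// the diff states).
static float gated_residual(float residual, float branch, float gate) {
    return residual + std::tanh(gate) * branch;
}

int main() {
    std::printf("closed gate: %.2f\n", gated_residual(1.0f, 5.0f, 0.0f));  // 1.00
    std::printf("open gate:   %.2f\n", gated_residual(1.0f, 5.0f, 10.0f)); // ~6.00
    return 0;
}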
- cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } else { - // self attention layer - - // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(inp_attn, gf, - model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - res->t_embd = cur; - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); - } -}; - struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -5006,6 +4778,7 @@ struct llm_build_deci : public llm_graph_context { ggml_tensor * inpSA = inpL; const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_head = hparams.n_head(il); + const 
int64_t n_ff = hparams.n_ff(il); if (n_head == 0) { // attention-free layer of Llama-3_1-Nemotron-51B @@ -5025,7 +4798,7 @@ struct llm_build_deci : public llm_graph_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5081,6 +4854,11 @@ struct llm_build_deci : public llm_graph_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } + // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B + if (n_ff == 0) { + continue; + } + // For Granite architecture if (hparams.f_residual_scale) { cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); @@ -6074,6 +5852,11 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); @@ -6126,13 +5909,29 @@ struct llm_build_bert : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - if (model.arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_GELU, + false, false, + 0.0f, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(cur, "ffn_moe_out", il); + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -6140,6 +5939,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -6147,8 +5947,8 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } - cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); @@ -7485,7 +7285,7 @@ struct llm_build_phi3 : public llm_graph_context { // self-attention { // rope freq factors for 128k context - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -8237,7 +8037,7 @@ struct llm_build_minicpm3 : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; - 
ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // norm cur = build_norm(inpL, @@ -9004,7 +8804,7 @@ struct llm_build_mamba : public llm_graph_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const llama_kv_cache_recurrent * kv_self = static_cast(memory); const auto kv_head = kv_self->head; @@ -9305,7 +9105,7 @@ struct llm_build_cohere2 : public llm_graph_context { // self-attention { // rope freq factors for 128k context - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -10243,7 +10043,7 @@ struct llm_build_deepseek : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -11607,7 +11407,7 @@ struct llm_build_exaone : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -11752,7 +11552,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const llama_kv_cache_recurrent * kv_self = static_cast(memory); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -12148,7 +11948,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { ggml_tensor *& first_layer_value, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const llama_kv_cache_recurrent * kv_self = static_cast(memory); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -12696,7 +12496,7 @@ struct llm_build_solar : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -13147,7 +12947,7 @@ struct llm_build_bailingmoe : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -13267,36 +13067,46 @@ struct llm_build_bailingmoe : public llm_graph_context { } }; -llama_memory_i * llama_model::create_memory() const { +llama_memory_i * 
llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; switch (arch) { + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: + { + res = nullptr; + } break; case LLM_ARCH_MAMBA: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: { - res = new llama_kv_cache_unified(hparams, { - /*.get_rope_factors =*/ nullptr - }); + res = new llama_kv_cache_recurrent( + *this, + GGML_TYPE_F32, + GGML_TYPE_F32, + cparams.offload_kqv, + std::max((uint32_t) 1, cparams.n_seq_max)); } break; default: { - res = new llama_kv_cache_unified(hparams, { - /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) { - // choose long/short freq factors based on the context size - if (layers[il].rope_freqs != nullptr) { - return layers[il].rope_freqs; - } + const auto padding = llama_kv_cache_unified::get_padding(cparams); - if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { - return layers[il].rope_long; - } + cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); - return layers[il].rope_short; - } - }); + LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); + + res = new llama_kv_cache_unified( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + cparams.n_ctx, + padding); } } @@ -13318,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; - case LLM_ARCH_MLLAMA: - { - llm = std::make_unique(*this, params, gf); - } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); @@ -13349,6 +13155,7 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { llm = std::make_unique(*this, params, gf); } break; @@ -13682,12 +13489,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: - case LLM_ARCH_MLLAMA: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: - case LLM_ARCH_PLAMO: - case LLM_ARCH_ORION: case LLM_ARCH_INTERNLM2: case LLM_ARCH_MINICPM: case LLM_ARCH_XVERSE: @@ -13705,7 +13509,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_CHAMELEON: case LLM_ARCH_SOLAR: case LLM_ARCH_BAILINGMOE: - case LLM_ARCH_MISTRAL3: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -13714,6 +13517,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: @@ -13726,6 +13530,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: + case LLM_ARCH_PLAMO: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: @@ -13733,6 +13538,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: case LLM_ARCH_CODESHELL: + case LLM_ARCH_ORION: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: case LLM_ARCH_MINICPM3: @@ -13805,6 +13611,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); const auto & it = model->gguf_kv.find(key); if (it == 
model->gguf_kv.end()) { + // one-off fix for very popular models (so we are not flooded with issues) + // do not extend this list unless absolutely necessary + // Mistral-Small-2503 does not have built-in chat template + llama_vocab_pre_type pre_type = model->vocab.get_pre_type(); + if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { + return "mistral-v7-tekken"; + } + return nullptr; } diff --git a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 72bab5bee..43746c7dd 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -11,7 +11,6 @@ #include #include #include -#include struct llama_cparams; struct llama_ubatch; @@ -37,14 +36,17 @@ enum llm_type { LLM_TYPE_335M, LLM_TYPE_410M, LLM_TYPE_450M, + LLM_TYPE_475M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, + LLM_TYPE_0_6B, LLM_TYPE_1B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, LLM_TYPE_1_6B, + LLM_TYPE_1_7B, LLM_TYPE_1_8B, LLM_TYPE_2B, LLM_TYPE_2_8B, @@ -64,6 +66,7 @@ enum llm_type { LLM_TYPE_16B, LLM_TYPE_20B, LLM_TYPE_22B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, LLM_TYPE_34B, @@ -71,9 +74,10 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, - LLM_TYPE_90B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, + LLM_TYPE_405B, LLM_TYPE_671B, LLM_TYPE_SMALL, LLM_TYPE_MEDIUM, @@ -87,12 +91,14 @@ enum llm_type { LLM_TYPE_16x3_8B, LLM_TYPE_10B_128x3_66B, LLM_TYPE_57B_A14B, - LLM_TYPE_27B, - LLM_TYPE_290B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_30B_A3B, + LLM_TYPE_235B_A22B, }; +std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type); + struct llama_layer_posnet { // resnet struct ggml_tensor * norm1 = nullptr; @@ -312,16 +318,6 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; - // cross attention - struct ggml_tensor * cross_attn_k_norm = nullptr; - struct ggml_tensor * cross_attn_k_proj = nullptr; - struct ggml_tensor * cross_attn_o_proj = nullptr; - struct ggml_tensor * cross_attn_q_norm = nullptr; - struct ggml_tensor * cross_attn_q_proj = nullptr; - struct ggml_tensor * cross_attn_v_proj = nullptr; - struct ggml_tensor * cross_attn_attn_gate = nullptr; - struct ggml_tensor * cross_attn_mlp_gate = nullptr; - struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; @@ -405,8 +401,11 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; + ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const; + + // note: can mutate `cparams` // TODO: move this to new llm_arch_model_i interface - llama_memory_i * create_memory() const; // TODO: params + llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; // TODO: move this to new llm_arch_model_i interface llm_graph_result_ptr build_graph( diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 8ae6dde87..820d5128e 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: nthread = std::thread::hardware_concurrency(); } - // mmap consistently increases speed Linux, and also increases speed on Windows with + // mmap consistently increases speed on Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. 
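The comment above motivates the compile-time constant that follows: mmap stays enabled only on the platforms where it reliably helps. A minimal sketch of the same pattern, assuming the usual #else arm that this hunk does not show; illustrative only, not part of the diff:

#include <cstdio>

// The preprocessor picks the value at compile time, so any branch on
// use_mmap below can be folded away entirely on either platform.
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;   // hot-cache wins on Linux and Windows
#else
constexpr bool use_mmap = false;  // e.g. macOS, where it may cause a slowdown
#endif

int main() {
    std::printf("quantization uses mmap: %s\n", use_mmap ? "yes" : "no");
    return 0;
}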
#if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
@@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
-        if (qs.n_attention_wv != n_attn_layer) {
-            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
-        }
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
     }

     size_t total_size_org = 0;
@@ -744,10 +742,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

-        // don't quantize vision stuff
-        quantize &= name.find("v.") == std::string::npos;
-        quantize &= name.find("mm.") == std::string::npos;
-
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
diff --git a/llama/llama.cpp/src/llama-sampling.cpp b/llama/llama.cpp/src/llama-sampling.cpp
index b1a9dca3c..15a10ca8c 100644
--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }

     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
@@ -1749,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;

+    if (ctx->n <= 0.0f || cur_p->size <= 1) {
+        return;
+    }
+
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
+    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].logit > max) {
-            max = cur_p->data[i].logit;
+        // only count values that are not -infinity
+        if (cur_p->data[i].logit != -INFINITY) {
+            if (cur_p->data[i].logit > max) {
+                max = cur_p->data[i].logit;
+            }
+            logits_sum += cur_p->data[i].logit;
+            valid_count++;
         }
-        logits_sum += cur_p->data[i].logit;
     }
-    float mean = logits_sum/cur_p->size;
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;

     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        acc += pow(cur_p->data[i].logit - mean, 2);
+        // skip -infinity in the std calculation
+        if (cur_p->data[i].logit != -INFINITY) {
+            acc += pow(cur_p->data[i].logit - mean, 2);
+        }
     }
-    float std = sqrt(acc/cur_p->size);
+    float std = valid_count > 0 ?
sqrt(acc/valid_count) : 0; //apply mask for (size_t i = 0; i < cur_p->size; ++i) { diff --git a/llama/llama.cpp/src/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp index ba37df355..9f5fd57b8 100644 --- a/llama/llama.cpp/src/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1,5 +1,7 @@ #include "llama-vocab.h" +#include "ggml.h" +#include "gguf.h" #include "llama-impl.h" #include "llama-model-loader.h" @@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_SEED_CODER: + regex_exprs = { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1227,6 +1236,9 @@ struct fragment_buffer_variant { struct llama_vocab::impl { uint32_t n_token_types = 0; // for BERT-style token types + std::string tokenizer_model; + std::string tokenizer_pre; + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; @@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // determine vocab type { - std::string tokenizer_model; - std::string tokenizer_pre; - ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); @@ -1459,7 +1468,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { - size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); + const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); + const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); #ifdef IS_BIG_ENDIAN @@ -1497,7 +1507,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| - tokenizer_pre == "falcon3") { + tokenizer_pre == "falcon3" || + tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; @@ -1624,6 +1635,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "bailingmoe") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; + } else if ( + tokenizer_pre == "seed-coder") { + pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; + clean_spaces = false; } else { LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; @@ -2769,6 +2784,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) { pimpl->load(ml, kv); } +std::string llama_vocab::get_tokenizer_model() const { + return pimpl->tokenizer_model; +} + +std::string llama_vocab::get_tokenizer_pre() const { + return pimpl->tokenizer_pre; +} + enum 
llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -2991,6 +3014,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }

+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+    std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+    for (const auto & pair : pimpl->bpe_ranks) {
+        result[pair.second] = pair.first.first + " " + pair.first.second;
+    }
+
+    return result;
+}
+
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+    return pimpl->precompiled_charsmap;
+}
+
 int32_t llama_vocab::tokenize(
         const char * text,
         int32_t text_len,
diff --git a/llama/llama.cpp/src/llama-vocab.h b/llama/llama.cpp/src/llama-vocab.h
index 5ce355214..daa6cf308 100644
--- a/llama/llama.cpp/src/llama-vocab.h
+++ b/llama/llama.cpp/src/llama-vocab.h
@@ -21,6 +21,9 @@ struct llama_vocab {

     void load(llama_model_loader & ml, const LLM_KV & kv);

+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
+
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;

@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;

     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+
+    std::vector<char> get_precompiled_charsmap() const;

     int32_t tokenize(
             const char * text,
diff --git a/llama/llama.cpp/src/llama.cpp b/llama/llama.cpp/src/llama.cpp
index d5164720b..9fdddf7b0 100644
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
+#include "llama-model-saver.h"
 #include "llama-model.h"

 #include "ggml.h"
@@ -253,6 +254,13 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }

+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+    llama_model_saver ms(*model);
+    ms.add_kv_from_model();
+    ms.add_tensors_from_model();
+    ms.save(path_model);
+}
+
 //
 // chat templates
 //
@@ -338,3 +346,4 @@ const char * llama_print_system_info(void) {

     return s.c_str();
 }
+
diff --git a/llama/llama.cpp/examples/llava/clip-impl.h b/llama/llama.cpp/tools/mtmd/clip-impl.h
similarity index 81%
rename from llama/llama.cpp/examples/llava/clip-impl.h
rename to llama/llama.cpp/tools/mtmd/clip-impl.h
index 180ae9880..23036ba72 100644
--- a/llama/llama.cpp/examples/llava/clip-impl.h
+++ b/llama/llama.cpp/tools/mtmd/clip-impl.h
@@ -2,8 +2,6 @@
 #include "gguf.h"
 #include "clip.h"

-#include "clip.h"
-
 #include
 #include
 #include
@@ -17,33 +15,29 @@
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
-#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
-#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
-#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
-#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
-#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
-#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
 #define KEY_USE_SILU            "clip.use_silu"
-#define KEY_N_EMBD              "clip.%s.embedding_length"
-#define KEY_N_FF                "clip.%s.feed_forward_length"
-#define KEY_N_BLOCK             "clip.%s.block_count"
-#define KEY_N_HEAD              "clip.%s.attention.head_count"
-#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM            "clip.%s.projection_dim"
-#define KEY_TOKENS              "tokenizer.ggml.tokens"
-#define KEY_N_POSITIONS         "clip.text.context_length"
+#define KEY_N_EMBD              "clip.vision.embedding_length"
+#define KEY_N_FF                "clip.vision.feed_forward_length"
+#define KEY_N_BLOCK             "clip.vision.block_count"
+#define KEY_N_HEAD              "clip.vision.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.vision.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.vision.projection_dim"
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
 #define KEY_IMAGE_STD           "clip.vision.image_std"
-#define KEY_PROJ_TYPE           "clip.projector_type"
 #define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
+#define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
+#define KEY_PROJ_TYPE           "clip.projector_type"
+#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"

 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
+#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"

 //
@@ -59,10 +53,16 @@
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
+#define TN_ATTN_K_NORM     "%s.blk.%d.attn_k_norm.%s"
+#define TN_ATTN_Q_NORM     "%s.blk.%d.attn_q_norm.%s"
 #define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
-#define TN_LN_1            "%s.blk.%d.ln1.%s"
-#define TN_LN_2            "%s.blk.%d.ln2.%s"
+#define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
+#define TN_LN_1            "%s.blk.%d.ln1.%s" // layer norm
+#define TN_LN_2            "%s.blk.%d.ln2.%s" // layer norm
+#define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale
+#define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale
 #define TN_LN_PRE          "%s.pre_ln.%s"
 #define TN_LN_POST         "%s.post_ln.%s"
 #define TN_LLAVA_PROJ      "mm.%d.%s"
@@ -70,8 +70,14 @@
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
+#define TN_MM_INP_NORM     "mm.input_norm.weight"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
+#define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
+#define TN_MM_PATCH_MERGER "mm.patch_merger.weight"     // mistral small 3.1
+#define TN_TOK_IMG_BREAK   "v.token_embd.img_break"     // pixtral
+#define TN_TOK_GLM_BOI     "adapter.boi"                // glm-edge (these embeddings are not in text model)
+#define TN_TOK_GLM_EOI     "adapter.eoi"                // glm-edge (these embeddings are not in text model)

 // minicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -87,18 +93,23 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W            "adapter.boi"
-#define TN_GLM_EOI_W            "adapter.eoi"
+
+// align x to upper multiple of n
+#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
-    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_MINICPMV,
     PROJECTOR_TYPE_GLM_EDGE,
-    PROJECTOR_TYPE_MERGER,
+    PROJECTOR_TYPE_QWEN2VL,
     PROJECTOR_TYPE_GEMMA3,
+    PROJECTOR_TYPE_IDEFICS3,
+    PROJECTOR_TYPE_PIXTRAL,
+    PROJECTOR_TYPE_QWEN25VL,
+    PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -106,10 +117,14 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP,       "mlp" },
     { PROJECTOR_TYPE_LDP,       "ldp" },
     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
-    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_MINICPMV,  "resampler"},
     { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
-    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
+    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
+    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
     { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
+    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
+    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
+    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -224,6 +239,15 @@ struct clip_image_u8_batch {

 struct clip_image_f32_batch {
     std::vector<clip_image_f32_ptr> entries;
+
+    clip_image_f32_batch clone() const {
+        clip_image_f32_batch new_batch;
+        new_batch.entries.reserve(entries.size());
+        for (const auto & entry : entries) {
+            new_batch.entries.emplace_back(new clip_image_f32(*entry));
+        }
+        return new_batch;
+    }
 };

 //
diff --git a/llama/llama.cpp/tools/mtmd/clip.cpp b/llama/llama.cpp/tools/mtmd/clip.cpp
new file mode 100644
index 000000000..cdd8ca44e
--- /dev/null
+++ b/llama/llama.cpp/tools/mtmd/clip.cpp
@@ -0,0 +1,3841 @@
+// NOTE: This is modified from clip.cpp only for LLaVA,
+// so there might still be unnecessary artifacts hanging around
+// I'll gradually clean and extend it
+// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
+#include "clip.h"
+#include "clip-impl.h"
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "ggml-cpu.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "gguf.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include
+#if __GLIBCXX__
+#include
+#include
+#include
+#endif
+#endif
+
+struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
+
+enum ffn_op_type {
+    FFN_GELU,
+    FFN_SILU,
+    FFN_GELU_QUICK,
+};
+
+enum norm_type {
+    NORM_TYPE_NORMAL,
+    NORM_TYPE_RMS,
+};
+
+//#define CLIP_DEBUG_FUNCTIONS
+
+#ifdef CLIP_DEBUG_FUNCTIONS
+static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        return;
+    }
+
+    // PPM header: P6 format, width, height, and max color value
+    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
+
+    // Write pixel data
+    for (size_t i = 0; i < img.buf.size(); i += 3) {
+        // PPM expects binary data in RGB format, which matches our image buffer
+        file.write(reinterpret_cast<const char *>(&img.buf[i]), 3);
+    }
+
+    file.close();
+}
+
+static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        return;
+    }
+
+    int fileSize = 54 + 3 * img.nx * img.ny;  // File header + info header + pixel data
+    int bytesPerPixel = 3;
+    int widthInBytes = img.nx * bytesPerPixel;
+    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
+    int stride = widthInBytes + paddingAmount;
+
+    // Bitmap file header
+    unsigned char fileHeader[14] = {
+        'B','M',    // Signature
+        0,0,0,0,    // Image file size in bytes
+        0,0,0,0,    // Reserved
+        54,0,0,0    // Start of pixel array
+    };
+
+    // Total file size
+    fileSize = 54 + (stride * img.ny);
+    fileHeader[2] = (unsigned char)(fileSize);
+    fileHeader[3] = (unsigned char)(fileSize >> 8);
+    fileHeader[4] = (unsigned char)(fileSize >> 16);
+    fileHeader[5] = (unsigned char)(fileSize >> 24);
+
+    // Bitmap information header (BITMAPINFOHEADER)
+    unsigned char infoHeader[40] = {
+        40,0,0,0,   // Size of this header (40 bytes)
+        0,0,0,0,    // Image width
+        0,0,0,0,    // Image height
+        1,0,        // Number of color planes
+        24,0,       // Bits per pixel
+        0,0,0,0,    // No compression
+        0,0,0,0,    // Image size (can be 0 for no compression)
+        0,0,0,0,    // X pixels per meter (not specified)
+        0,0,0,0,    // Y pixels per meter (not specified)
+        0,0,0,0,    // Total colors (color table not used)
+        0,0,0,0     // Important colors (all are important)
+    };
+
+    // Width and height in the information header
+    infoHeader[4] = (unsigned char)(img.nx);
+    infoHeader[5] = (unsigned char)(img.nx >> 8);
+    infoHeader[6] = (unsigned char)(img.nx >> 16);
+    infoHeader[7] = (unsigned char)(img.nx >> 24);
+    infoHeader[8] = (unsigned char)(img.ny);
+    infoHeader[9] = (unsigned char)(img.ny >> 8);
+    infoHeader[10] = (unsigned char)(img.ny >> 16);
+    infoHeader[11] = (unsigned char)(img.ny >> 24);
+
+    // Write file headers
+    file.write(reinterpret_cast<const char *>(fileHeader), sizeof(fileHeader));
+    file.write(reinterpret_cast<const char *>(infoHeader), sizeof(infoHeader));
+
+    // Pixel data
+    std::vector<unsigned char> padding(3, 0);  // Max padding size to be added to each row
+    for (int y = img.ny - 1; y >= 0; --y) {  // BMP files are stored bottom-to-top
+        for (int x = 0; x < img.nx; ++x) {
+            // Each pixel
+            size_t pixelIndex = (y * img.nx + x) * 3;
+            unsigned char pixel[3] = {
+                img.buf[pixelIndex + 2],  // BMP stores pixels in BGR format
+                img.buf[pixelIndex + 1],
+                img.buf[pixelIndex]
+            };
+            file.write(reinterpret_cast<const char *>(pixel), 3);
+        }
+        // Write padding for the row
+        file.write(reinterpret_cast<const char *>(padding.data()), paddingAmount);
+    }
+
+    file.close();
+}
+
+// debug function to convert f32 to u8
+static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(3 * src.nx * src.ny);
+    for (size_t i = 0; i < src.buf.size(); ++i) {
+        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
+    }
+}
+#endif
+
+
+//
+// clip layers
+//
+
+enum patch_merge_type {
+    PATCH_MERGE_FLAT,
+    PATCH_MERGE_SPATIAL_UNPAD,
+};
+
+struct clip_hparams {
+    int32_t image_size;
+    int32_t patch_size;
+    int32_t n_embd;
+    int32_t n_ff;
+    int32_t projection_dim;
+    int32_t n_head;
+    int32_t n_layer;
+    int32_t proj_scale_factor = 0; // idefics3
+
+    // for models using dynamic image size, we need to have a smaller image size to warm up
+    // otherwise, the user will get OOM every time they load the model
+    int32_t warmup_image_size = 0;
+
+    ffn_op_type ffn_op = FFN_GELU;
+
+    patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
+
+    float eps = 1e-6;
+    float rope_theta = 0.0;
+
+    std::vector<int32_t> image_grid_pinpoints;
+    int32_t image_crop_resolution;
+    std::unordered_set<int32_t> vision_feature_layer;
+    int32_t attn_window_size = 0;
+    int32_t n_wa_pattern = 0;
+    int32_t spatial_merge_size = 0;
+};
+
+struct clip_layer {
+    // attention
+    ggml_tensor * k_w = nullptr;
+    ggml_tensor * k_b = nullptr;
+    ggml_tensor * q_w = nullptr;
+    ggml_tensor * q_b = nullptr;
+    ggml_tensor * v_w = nullptr;
+    ggml_tensor * v_b = nullptr;
+
+    ggml_tensor * o_w = nullptr;
+    ggml_tensor * o_b = nullptr;
+
+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
+
+    // layernorm 1
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
+
+    ggml_tensor * ff_up_w = nullptr;
+    ggml_tensor * ff_up_b = nullptr;
+    ggml_tensor * ff_gate_w = nullptr;
+    ggml_tensor * ff_gate_b = nullptr;
+    ggml_tensor * ff_down_w = nullptr;
+    ggml_tensor * ff_down_b = nullptr;
+
+    // layernorm 2
+    ggml_tensor * ln_2_w = nullptr;
+    ggml_tensor * ln_2_b = nullptr;
+
+    // layer scale (no bias)
+    ggml_tensor * ls_1_w = nullptr;
+    ggml_tensor * ls_2_w = nullptr;
+};
+
+struct clip_vision_model {
+    struct clip_hparams hparams;
+
+    // embeddings
+    ggml_tensor * class_embedding = nullptr;
+    ggml_tensor * patch_embeddings_0 = nullptr;
+    ggml_tensor * patch_embeddings_1 = nullptr;  // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
+    ggml_tensor * patch_bias = nullptr;
+    ggml_tensor * position_embeddings = nullptr;
+
+    ggml_tensor * pre_ln_w = nullptr;
+    ggml_tensor * pre_ln_b = nullptr;
+
+    std::vector<clip_layer> layers;
+
+    ggml_tensor * post_ln_w;
+    ggml_tensor * post_ln_b;
+
+    ggml_tensor * projection;
+
+    // LLaVA projection
+    ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_0_w = nullptr;
+    ggml_tensor * mm_0_b = nullptr;
+    ggml_tensor * mm_2_w = nullptr;
+    ggml_tensor * mm_2_b = nullptr;
+
+    ggml_tensor * image_newline = nullptr;
+
+    // Yi type models with mlp+normalization projection
+    ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+    ggml_tensor * mm_1_b = nullptr;
+    ggml_tensor * mm_3_w = nullptr;
+    ggml_tensor * mm_3_b = nullptr;
+    ggml_tensor * mm_4_w = nullptr;
+    ggml_tensor * mm_4_b = nullptr;
+
+    // GLMV-Edge projection
+    ggml_tensor * mm_model_adapter_conv_w = nullptr;
+    ggml_tensor * mm_model_adapter_conv_b = nullptr;
+    ggml_tensor * mm_glm_tok_boi = nullptr;
+    ggml_tensor * mm_glm_tok_eoi = nullptr;
+
+    // MobileVLM projection
+    ggml_tensor * mm_model_mlp_1_w = nullptr;
+    ggml_tensor * mm_model_mlp_1_b = nullptr;
+    ggml_tensor * mm_model_mlp_3_w = nullptr;
+    ggml_tensor * mm_model_mlp_3_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
+
+    // MobileVLM_V2 projection
+    ggml_tensor * mm_model_mlp_0_w = nullptr;
+    ggml_tensor * mm_model_mlp_0_b = nullptr;
+    ggml_tensor * mm_model_mlp_2_w = nullptr;
+    ggml_tensor * mm_model_mlp_2_b = nullptr;
+    ggml_tensor * mm_model_peg_0_w = nullptr;
+
ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; +}; + +struct clip_ctx { + bool has_llava_projector = false; + int minicpmv_version = 0; + + struct clip_vision_model vision_model; + projector_type proj_type = PROJECTOR_TYPE_MLP; + + float image_mean[3]; + float image_std[3]; + + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + + std::vector buf_compute_meta; + + std::vector backend_ptrs; + std::vector backend_buft; + + ggml_backend_t backend; + ggml_backend_t backend_cpu; + ggml_backend_buffer_ptr buf; + + int max_nodes = 8192; + ggml_backend_sched_ptr sched; + + clip_image_size load_image_size; + + clip_ctx(clip_context_params & ctx_params) { + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (!backend_cpu) { + throw std::runtime_error("failed to initialize CPU backend"); + } + backend = ctx_params.use_gpu + ? 
ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) + : nullptr; + + if (backend) { + LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); + backend_ptrs.push_back(backend); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } else { + backend = backend_cpu; + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + backend_ptrs.push_back(backend_cpu); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); + + sched.reset( + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) + ); + } + + ~clip_ctx() { + ggml_backend_free(backend); + if (backend != backend_cpu) { + ggml_backend_free(backend_cpu); + } + } +}; + +struct clip_graph { + clip_ctx * ctx; + const clip_vision_model & model; + const clip_hparams & hparams; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const float eps; + const float kq_scale; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + ctx(ctx), + model(ctx->vision_model), + hparams(model.hparams), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph(ctx0); + } + + ggml_cgraph * build_siglip() { + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.proj_scale_factor; + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = 
std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; + } + + ggml_cgraph * build_pixtral() { + const int n_merge = hparams.spatial_merge_size; + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); + + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.spatial_merge_size > 0); + + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + } + + // LlavaMultiModalProjector (always using GELU activation) + { + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + if (model.mm_1_b) { + cur = ggml_add(ctx0, cur, model.mm_1_b); + } + + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + if (model.mm_2_b) { + cur = ggml_add(ctx0, cur, model.mm_2_b); + } + } + + // arrangement of the [IMG_BREAK] token + { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] + + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? 
+    // Qwen2VL and Qwen2.5VL use M-RoPE
+    ggml_cgraph * build_qwen2vl() {
+        GGML_ASSERT(model.patch_bias == nullptr);
+        GGML_ASSERT(model.class_embedding == nullptr);
+
+        const int batch_size       = 1;
+        const bool use_window_attn = hparams.n_wa_pattern > 0;
+        const int n_wa_pattern     = hparams.n_wa_pattern;
+        const int n_pos            = n_patches;
+        const int num_position_ids = n_pos * 4; // m-rope requires 4 dims per position
+
+        norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
+            ? NORM_TYPE_RMS     // qwen 2.5 vl
+            : NORM_TYPE_NORMAL; // qwen 2 vl
+
+        int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+        ggml_tensor * inp_raw = build_inp_raw();
+        ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+        GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+        GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+        // second conv dimension
+        {
+            auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+            inp = ggml_add(ctx0, inp, inp_1);
+
+            inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
+            inp = ggml_reshape_4d(
+                ctx0, inp,
+                n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+            inp = ggml_reshape_4d(
+                ctx0, inp,
+                n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+            inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+            inp = ggml_reshape_3d(
+                ctx0, inp,
+                n_embd, n_patches_x * n_patches_y, batch_size);
+        }
+
+        ggml_tensor * inpL           = inp;
+        ggml_tensor * window_mask    = nullptr;
+        ggml_tensor * window_idx     = nullptr;
+        ggml_tensor * inv_window_idx = nullptr;
+
+        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+        }
+
+        if (use_window_attn) {
+            // handle window attention inputs
+            inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+            ggml_set_name(inv_window_idx, "inv_window_idx");
+            ggml_set_input(inv_window_idx);
+            // mask for window attention
+            window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+            ggml_set_name(window_mask, "window_mask");
+            ggml_set_input(window_mask);
+
+            // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+            GGML_ASSERT(batch_size == 1);
+            inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+            inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+            inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+        }
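
The inv_window_idx / window_idx pair used above and at the end of this graph is just a permutation and its inverse: rows are gathered into window-contiguous order before the layers and gathered back after the projector. A standalone sketch with toy values (the real indices are computed by the caller):

    #include <cstdio>
    #include <vector>

    int main() {
        // toy gather order standing in for inv_window_idx (grouped by window)
        std::vector<int> inv_window_idx = {2, 0, 3, 1};
        std::vector<int> window_idx(inv_window_idx.size());
        for (size_t i = 0; i < inv_window_idx.size(); ++i) {
            window_idx[inv_window_idx[i]] = (int) i; // invert the permutation
        }
        std::vector<int> tokens = {10, 11, 12, 13};
        std::vector<int> grouped(4), restored(4);
        for (int i = 0; i < 4; ++i) grouped[i]  = tokens[inv_window_idx[i]]; // before the layers
        for (int i = 0; i < 4; ++i) restored[i] = grouped[window_idx[i]];    // after the projector
        for (int i = 0; i < 4; ++i) std::printf("%d ", restored[i]);         // 10 11 12 13
        std::printf("\n");
        return 0;
    }
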
+        // loop over layers
+        for (int il = 0; il < n_layer; il++) {
+            auto & layer = model.layers[il];
+            const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
+
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+            // layernorm1
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+            cb(cur, "ln1", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_add(ctx0,
+                    ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+                ggml_tensor * Kcur = ggml_add(ctx0,
+                    ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+                ggml_tensor * Vcur = ggml_add(ctx0,
+                    ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                // apply M-RoPE
+                Qcur = ggml_rope_multi(
+                    ctx0, Qcur, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+                Kcur = ggml_rope_multi(
+                    ctx0, Kcur, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+                cb(Qcur, "Qcur_rope", il);
+                cb(Kcur, "Kcur_rope", il);
+
+                ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+                cur = build_attn(layer.o_w, layer.o_b,
+                    Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            // re-add the layer input, i.e., the residual
+            cur = ggml_add(ctx0, cur, inpL);
+
+            inpL = cur; // inpL = residual, cur = hidden_states
+
+            cb(cur, "ffn_inp", il);
+
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+            cb(cur, "ffn_inp_normed", il);
+
+            // ffn
+            cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                hparams.ffn_op, il);
+
+            cb(cur, "ffn_out", il);
+
+            // residual 2
+            cur = ggml_add(ctx0, inpL, cur);
+            cb(cur, "layer_out", il);
+
+            inpL = cur;
+        }
+
+        // post-layernorm
+        if (model.post_ln_w) {
+            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+        }
+
+        // multimodal projection
+        ggml_tensor * embeddings = inpL;
+        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+
+        if (use_window_attn) {
+            window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+            ggml_set_name(window_idx, "window_idx");
+            ggml_set_input(window_idx);
+
+            // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+            GGML_ASSERT(batch_size == 1);
+            embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
+            embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+            embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
+        }
+
+        // build the graph
+        ggml_build_forward_expand(gf, embeddings);
+
+        return gf;
+    }
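
For intuition on num_position_ids = n_pos * 4 and mrope_sections: each token carries four position coordinates, and M-RoPE splits the rotary dimensions into four equal sections, each driven by one coordinate stream. A standalone sketch; the head size and the stream names (following the Qwen2-VL convention) are assumptions here:

    #include <cstdio>

    int main() {
        const int d_head = 80; // qwen2-vl head size (assumption for illustration)
        const int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
        const char * streams[4] = {"time", "height", "width", "extra"};
        int begin = 0;
        for (int s = 0; s < 4; ++s) {
            std::printf("rotary dims [%2d, %2d) <- %s positions\n",
                        begin, begin + mrope_sections[s], streams[s]);
            begin += mrope_sections[s];
        }
        // and the "positions" input holds 4 coordinates per token: n_pos * 4 entries
        return 0;
    }
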
"pos_embed"); + ggml_set_input(pos_embed); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + int n_embd = clip_n_mmproj_embd(ctx); + const int d_head = 128; + int n_head = n_embd/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + num_query = 96; + } else if (ctx->minicpmv_version == 3) { + num_query = 64; + } else if (ctx->minicpmv_version == 4) { + num_query = 64; + } + + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; + } + + ggml_cgraph * build_internvl() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? 
+    ggml_cgraph * build_internvl() {
+        GGML_ASSERT(model.class_embedding != nullptr);
+        GGML_ASSERT(model.position_embeddings != nullptr);
+
+        const int n_pos = n_patches + 1;
+        ggml_tensor * inp = build_inp();
+
+        // add CLS token
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+        // The larger models use a different ViT, which uses RMS norm instead of layer norm
+        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+            ? NORM_TYPE_RMS     // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
+            : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                norm_t,
+                                hparams.ffn_op,
+                                model.position_embeddings,
+                                nullptr);
+
+        // remove CLS token
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_patches,
+            ggml_row_size(cur->type, n_embd), 0);
+
+        // pixel shuffle
+        {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int bsz    = 1; // batch size, always 1 for now since we don't support batching
+            const int height = n_patches_y;
+            const int width  = n_patches_x;
+            GGML_ASSERT(scale_factor > 0);
+            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                height / scale_factor,
+                width / scale_factor,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            // flatten to 2D
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                cur->ne[1] * cur->ne[2]);
+        }
+
+        // projector (always using GELU activation)
+        {
+            // projector LayerNorm uses pytorch's default eps = 1e-5
+            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
+            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_3_b);
+        }
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
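
The pixel shuffle above (and the IDEFICS3 variant in build_siglip) trades spatial resolution for channel depth: every scale_factor x scale_factor block of patches becomes a single token of n_embd * scale_factor^2 dims. A standalone sketch of the grouping; the exact interleaving order inside each block depends on the permutes and is not modeled here:

    #include <cstdio>

    int main() {
        const int s = 2, width = 4, height = 4, n_embd = 8; // toy sizes
        // output token (X, Y) gathers input patches (x, y) with
        // x in [X*s, X*s + s) and y in [Y*s, Y*s + s)
        for (int Y = 0; Y < height / s; ++Y)
        for (int X = 0; X < width  / s; ++X) {
            std::printf("out(%d,%d) <- in:", X, Y);
            for (int dy = 0; dy < s; ++dy)
            for (int dx = 0; dx < s; ++dx)
                std::printf(" (%d,%d)", X*s + dx, Y*s + dy);
            std::printf("  [%d dims]\n", n_embd * s * s);
        }
        return 0; // 16 tokens of 8 dims -> 4 tokens of 32 dims
    }
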
+    // this graph is used by llava, granite and glm
+    // due to having embedding_stack (used by granite), we cannot reuse build_vit
+    ggml_cgraph * build_llava() {
+        const int batch_size = 1;
+        const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
+
+        GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
+
+        // Calculate the deepest feature layer based on hparams and projector type
+        int max_feature_layer = n_layer;
+        {
+            // Get the index of the second to last layer; this is the default for models that have a llava projector
+            int il_last = hparams.n_layer - 1;
+            int deepest_feature_layer = -1;
+
+            if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+                il_last += 1;
+            }
+
+            // If we set explicit vision feature layers, only go up to the deepest one
+            // NOTE: only used by granite-vision models for now
+            for (const auto & feature_layer : hparams.vision_feature_layer) {
+                if (feature_layer > deepest_feature_layer) {
+                    deepest_feature_layer = feature_layer;
+                }
+            }
+            max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
+        }
+
+        ggml_tensor * inp = build_inp();
+
+        // concat class_embeddings and patch_embeddings
+        if (model.class_embedding) {
+            inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+        }
+
+        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+        ggml_tensor * inpL = inp;
+
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+            cb(inpL, "pre_ln", -1);
+        }
+
+        std::vector<ggml_tensor *> embedding_stack;
+        const auto & vision_feature_layer = hparams.vision_feature_layer;
+
+        // loop over layers
+        for (int il = 0; il < max_feature_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+            // If this is an embedding feature layer, save the output.
+            // NOTE: 0 index here refers to the input to the encoder.
+            if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+                embedding_stack.push_back(cur);
+            }
+
+            // layernorm1
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "layer_inp_normed", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(layer.o_w, layer.o_b,
+                    Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            // re-add the layer input, i.e., the residual
+            cur = ggml_add(ctx0, cur, inpL);
+
+            inpL = cur; // inpL = residual, cur = hidden_states
+
+            cb(cur, "ffn_inp", il);
+
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "ffn_inp_normed", il);
+
+            // ffn
+            cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                hparams.ffn_op, il);
+
+            cb(cur, "ffn_out", il);
+
+            // residual 2
+            cur = ggml_add(ctx0, inpL, cur);
+            cb(cur, "layer_out", il);
+
+            inpL = cur;
+        }
+
+        // post-layernorm
+        if (model.post_ln_w) {
+            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
+        }
+
+        ggml_tensor * embeddings = inpL;
+
+        // process vision feature layers (used by granite)
+        {
+            // final layer is a vision feature layer
+            if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
+                embedding_stack.push_back(inpL);
+            }
+
+            // If feature layers are explicitly set, stack them (if we have multiple)
+            if (!embedding_stack.empty()) {
+                embeddings = embedding_stack[0];
+                for (size_t i = 1; i < embedding_stack.size(); i++) {
+                    embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+                }
+            }
+        }
+        // llava projector (also used by granite)
+        if (ctx->has_llava_projector) {
+            embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
+
+            ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+            ggml_set_name(patches, "patches");
+            ggml_set_input(patches);
+
+            // shape [1, 576, 1024]
+            // ne is whcn, ne = [1024, 576, 1, 1]
+            embeddings = ggml_get_rows(ctx0, embeddings, patches);
+
+            // print_tensor_info(embeddings, "embeddings");
+
+            // llava projector
+            if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+                embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+                embeddings = ggml_gelu(ctx0, embeddings);
+                if (model.mm_2_w) {
+                    embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                    embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+                }
+            }
+            else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+                embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+                // ggml_tensor_printf(embeddings, "mm_0_w", 0, true, false);
+                // First LayerNorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                    model.mm_1_b);
+
+                // GELU activation
+                embeddings = ggml_gelu(ctx0, embeddings);
+
+                // Second linear layer
+                embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+                // Second LayerNorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                    model.mm_4_b);
+            }
+            else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+                // MobileVLM projector
+                int n_patch = 24;
+                ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+                mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+                mlp_1 = ggml_gelu(ctx0, mlp_1);
+                ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+                mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+                // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+                // block 1
+                ggml_tensor * block_1 = nullptr;
+                {
+                    // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                    mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
+                    mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                    // stride = 1, padding = 1, bias is nullptr
+                    block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+                    // layer norm
+                    // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                    // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                    // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                    // hardswish
+                    ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                    block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                    // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                    // pointwise conv
+                    block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                    block_1 = ggml_relu(ctx0, block_1);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                    block_1 = ggml_hardsigmoid(ctx0, block_1);
+                    // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                    block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                    block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                    int w = block_1->ne[0], h = block_1->ne[1];
+                    block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                    // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                    block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                    // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                    // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                    // residual
+                    block_1 = ggml_add(ctx0, mlp_3, block_1);
+                }
+
+                // block_2
+                {
+                    // stride = 2
+                    block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+                    // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                    // layer norm
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                    // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                    // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                    // hardswish
+                    ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                    // not sure the parameters are right for globalAvgPooling
+                    block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                    // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                    // pointwise conv
+                    block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                    block_1 = ggml_relu(ctx0, block_1);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                    block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                    // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                    block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                    block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                    int w = block_1->ne[0], h = block_1->ne[1];
+                    block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                    // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                    block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                    // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                    block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                    // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+                }
+                embeddings = block_1;
+            }
+            else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+            {
+                int n_patch = 24;
+                ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+                mlp_0 = ggml_gelu(ctx0, mlp_0);
+                ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+                mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+                // mlp_2 ne = [2048, 576, 1, 1]
+                // AVG Pool Layer 2*2, strides = 2
+                mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+                // mlp_2 ne = [576, 2048, 1, 1]
+                mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+                // mlp_2 ne = [24, 24, 2048, 1]
+                mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+                // weight ne = [3, 3, 2048, 1]
+                ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+                peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+                peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+                mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+                peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+                peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+                embeddings = peg_0;
+            }
+            else {
+                GGML_ABORT("fatal error");
+            }
+        }
+
+        // glm projector
+        else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0, embeddings, 1, 0, 2, 3));
+            embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+            embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+            embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[1], embeddings->ne[2], batch_size);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0, embeddings, 1, 0, 2, 3));
+            embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+            // GLU
+            {
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+                embeddings = ggml_gelu_inplace(ctx0, embeddings);
+                ggml_tensor * x = embeddings;
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+                x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, x);
+                embeddings = ggml_silu_inplace(ctx0, embeddings);
+                embeddings = ggml_mul(ctx0, embeddings, x);
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+            }
+            // arrangement of BOI/EOI token embeddings
+            // note: these embeddings are not present in the text model, hence we cannot process them as text tokens
+            // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+            {
+                embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
+                embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
+            }
+        }
+
+        else {
+            GGML_ABORT("llava: unknown projector type");
+        }
+
+        // build the graph
+        ggml_build_forward_expand(gf, embeddings);
+
+        return gf;
+    }
+
+private:
+    //
+    // utility functions
+    //
+
+    void cb(ggml_tensor * cur, const char * name, int il) const {
+        // TODO: implement this
+        GGML_UNUSED(cur);
+        GGML_UNUSED(name);
+        GGML_UNUSED(il);
+    }
+    // build vision transformer (ViT) cgraph
+    // this function should cover most of the models
+    // if your model has specific features, you should probably duplicate this function
+    ggml_tensor * build_vit(
+                ggml_tensor * inp,
+                int64_t n_pos,
+                norm_type norm_t,
+                ffn_op_type ffn_t,
+                ggml_tensor * learned_pos_embd,
+                std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
+            ) {
+        if (learned_pos_embd) {
+            inp = ggml_add(ctx0, inp, learned_pos_embd);
+            cb(inp, "pos_embed", -1);
+        }
+
+        ggml_tensor * inpL = inp;
+
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+            cb(inpL, "pre_ln", -1);
+        }
+
+        // loop over layers
+        for (int il = 0; il < n_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+            // layernorm1
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+            cb(cur, "layer_inp_normed", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
+
+                if (layer.q_norm) {
+                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                    cb(Qcur, "Qcur_norm", il);
+                }
+
+                if (layer.k_norm) {
+                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                    cb(Kcur, "Kcur_norm", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                if (add_pos) {
+                    Qcur = add_pos(Qcur, layer);
+                    Kcur = add_pos(Kcur, layer);
+                    cb(Qcur, "Qcur_pos", il);
+                    cb(Kcur, "Kcur_pos", il);
+                }
+
+                cur = build_attn(layer.o_w, layer.o_b,
+                    Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (layer.ls_1_w) {
+                cur = ggml_mul(ctx0, cur, layer.ls_1_w);
+                cb(cur, "attn_out_scaled", il);
+            }
+
+            // re-add the layer input, i.e., the residual
+            cur = ggml_add(ctx0, cur, inpL);
+
+            inpL = cur; // inpL = residual, cur = hidden_states
+
+            cb(cur, "ffn_inp", il);
+
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+            cb(cur, "ffn_inp_normed", il);
+
+            // ffn
+            cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                ffn_t, il);
+
+            cb(cur, "ffn_out", il);
+
+            if (layer.ls_2_w) {
+                cur = ggml_mul(ctx0, cur, layer.ls_2_w);
+                cb(cur, "ffn_out_scaled", il);
+            }
+
+            // residual 2
+            cur = ggml_add(ctx0, inpL, cur);
+            cb(cur, "layer_out", il);
+
+            inpL = cur;
+        }
+
+        // post-layernorm
+        if (model.post_ln_w) {
+            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
+        }
+        return inpL;
+    }
+
+    // build the input after conv2d (inp_raw --> patches)
+    // returns tensor with shape [n_embd, n_patches]
+    ggml_tensor * build_inp() {
+        ggml_tensor * inp_raw = build_inp_raw();
+        ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+        inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+        if (model.patch_bias) {
+            inp = ggml_add(ctx0, inp, model.patch_bias);
+            cb(inp, "patch_bias", -1);
+        }
+        return inp;
+    }
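
Worth spelling out for build_inp(): a convolution whose kernel and stride both equal patch_size is exactly "cut into non-overlapping patches and project each one", so the patch count is the image size divided by the patch size per axis. A standalone sketch with LLaVA-1.5-style sizes (336 px, 14 px patches, assumed here for illustration), which also explains the 576 in the shape comments above:

    #include <cstdio>

    int main() {
        const int nx = 336, ny = 336;            // image size (assumption)
        const int patch_size = 14;
        const int n_patches_x = nx / patch_size; // 24
        const int n_patches_y = ny / patch_size; // 24
        std::printf("%dx%d image -> %d patches\n",
                    nx, ny, n_patches_x * n_patches_y); // 576
        return 0;
    }
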
+    ggml_tensor * build_inp_raw() {
+        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
+        ggml_set_name(inp_raw, "inp_raw");
+        ggml_set_input(inp_raw);
+        return inp_raw;
+    }
+
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            norm_type type,
+            float norm_eps,
+            int il) const {
+
+        cur = type == NORM_TYPE_RMS
+            ? ggml_rms_norm(ctx0, cur, norm_eps)
+            : ggml_norm(ctx0, cur, norm_eps);
+
+        if (mw || mb) {
+            cb(cur, "norm", il);
+        }
+
+        if (mw) {
+            cur = ggml_mul(ctx0, cur, mw);
+            if (mb) {
+                cb(cur, "norm_w", il);
+            }
+        }
+
+        if (mb) {
+            cur = ggml_add(ctx0, cur, mb);
+        }
+
+        return cur;
+    }
+
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ffn_op_type type_op,
+            int il) const {
+
+        ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
+        cb(tmp, "ffn_up", il);
+
+        if (up_b) {
+            tmp = ggml_add(ctx0, tmp, up_b);
+            cb(tmp, "ffn_up_b", il);
+        }
+
+        if (gate) {
+            cur = ggml_mul_mat(ctx0, gate, cur);
+            cb(cur, "ffn_gate", il);
+
+            if (gate_b) {
+                cur = ggml_add(ctx0, cur, gate_b);
+                cb(cur, "ffn_gate_b", il);
+            }
+        } else {
+            cur = tmp;
+        }
+
+        switch (type_op) {
+            case FFN_SILU:
+                {
+                    cur = ggml_silu(ctx0, cur);
+                    cb(cur, "ffn_silu", il);
+                } break;
+            case FFN_GELU:
+                {
+                    cur = ggml_gelu(ctx0, cur);
+                    cb(cur, "ffn_gelu", il);
+                } break;
+            case FFN_GELU_QUICK:
+                {
+                    cur = ggml_gelu_quick(ctx0, cur);
+                    cb(cur, "ffn_gelu_quick", il);
+                } break;
+        }
+
+        // we only support parallel ffn for now
+        if (gate) {
+            cur = ggml_mul(ctx0, cur, tmp);
+            cb(cur, "ffn_gate_par", il);
+        }
+
+        if (down) {
+            cur = ggml_mul_mat(ctx0, down, cur);
+        }
+
+        if (down_b) {
+            cb(cur, "ffn_down", il);
+            cur = ggml_add(ctx0, cur, down_b);
+        }
+
+        return cur;
+    }
+
+    ggml_tensor * build_attn(
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur,
+            ggml_tensor * k_cur,
+            ggml_tensor * v_cur,
+            ggml_tensor * kq_mask,
+            float kq_scale,
+            int il) const {
+        // these nodes are added to the graph together so that they are not reordered
+        // by doing so, the number of splits in the graph is reduced
+        ggml_build_forward_expand(gf, q_cur);
+        ggml_build_forward_expand(gf, k_cur);
+        ggml_build_forward_expand(gf, v_cur);
+
+        ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
+        //cb(q, "q", il);
+
+        ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
+        //cb(k, "k", il);
+
+        ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
+        v = ggml_cont(ctx0, v);
+        //cb(v, "v", il);
+
+        ggml_tensor * cur;
+
+        // TODO @ngxson : support flash attention
+        {
+            const auto n_tokens = q->ne[1];
+            const auto n_head   = q->ne[2];
+            // const auto n_kv  = k->ne[1]; // for flash attention
+
+            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            // F32 may not be needed for vision encoders?
+            // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+            kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
+
+            ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+            cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+            cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+        }
+
+        cb(cur, "kqv_out", il);
+
+        if (wo) {
+            cur = ggml_mul_mat(ctx0, wo, cur);
+        }
+
+        if (wo_b) {
+            cur = ggml_add(ctx0, cur, wo_b);
+        }
+
+        return cur;
+    }
+
+    // implementation of the 2D RoPE without adding a new op in ggml
+    // this is not efficient (it uses double the memory), but it works on all backends
+    // TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but the inplace rope does not work well with non-contiguous tensors; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    static ggml_tensor * build_rope_2d(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * pos_h,
+        ggml_tensor * pos_w,
+        const float freq_base
+    ) {
+        const int64_t n_dim  = cur->ne[0];
+        const int64_t n_head = cur->ne[1];
+        const int64_t n_pos  = cur->ne[2];
+
+        // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
+        // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
+        // first half of cur will use 1e-0, 1e-2 (even)
+        // second half of cur will use 1e-1, 1e-3 (odd)
+        // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
+        //   ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
+        // then for the second half, we use freq_scale to shift the inv_freq
+        //   ^ why? replace (2i) with (2i+1) in the above equation
+        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
+
+        // first half
+        ggml_tensor * first;
+        {
+            first = ggml_view_3d(ctx0, cur,
+                n_dim/2, n_head, n_pos,
+                ggml_row_size(cur->type, n_dim),
+                ggml_row_size(cur->type, n_dim*n_head),
+                0);
+            first = ggml_rope_ext(
+                ctx0,
+                first,
+                pos_h,      // positions
+                nullptr,    // freq factors
+                n_dim/2,    // n_dims
+                0, 0, freq_base,
+                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
+            );
+        }
+
+        // second half
+        ggml_tensor * second;
+        {
+            second = ggml_view_3d(ctx0, cur,
+                n_dim/2, n_head, n_pos,
+                ggml_row_size(cur->type, n_dim),
+                ggml_row_size(cur->type, n_dim*n_head),
+                n_dim/2 * ggml_element_size(cur));
+            second = ggml_cont(ctx0, second); // copy, because ggml_rope doesn't play well with non-contiguous tensors
+            second = ggml_rope_ext(
+                ctx0,
+                second,
+                pos_w,      // positions
+                nullptr,    // freq factors
+                n_dim/2,    // n_dims
+                0, 0, freq_base,
+                freq_scale_odd,
+                0.0f, 1.0f, 0.0f, 0.0f
+            );
+        }
+
+        cur = ggml_concat(ctx0, first, second, 0);
+        return cur;
+    }
+
+};
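
The frequency comment inside build_rope_2d can be reproduced numerically (standalone sketch, not part of the patch): each half rotates n_dim/2 dims, so its inv_freq step is freq_base^(-4j/n_dim), and freq_scale_odd = freq_base^(-2/n_dim) shifts the second half from the even to the odd frequencies:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_dim = 8;        // the example in the comment above
        const float base  = 10000.0f;
        const float freq_scale_odd = std::pow(base, -2.0f / n_dim);
        for (int j = 0; j < n_dim / 4; ++j) {
            float even = std::pow(base, -4.0f * j / n_dim); // first half, driven by pos_h
            float odd  = even * freq_scale_odd;             // second half, driven by pos_w
            std::printf("pair %d: even inv_freq %.4g, odd inv_freq %.4g\n", j, even, odd);
        }
        // prints 1 / 0.1 and 0.01 / 0.001 -- the 1e-0..1e-3 list from the comment
        return 0;
    }
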
+
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
+    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
+    clip_graph graph(ctx, *imgs.entries[0]);
+
+    ggml_cgraph * res;
+
+    switch (ctx->proj_type) {
+        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_IDEFICS3:
+            {
+                res = graph.build_siglip();
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+            {
+                res = graph.build_pixtral();
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+            {
+                res = graph.build_qwen2vl();
+            } break;
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                res = graph.build_minicpmv();
+            } break;
+        case PROJECTOR_TYPE_INTERNVL:
+            {
+                res = graph.build_internvl();
+            } break;
+        default:
+            {
+                res = graph.build_llava();
+            } break;
+    }
+    return res;
+}
+
+struct clip_model_loader {
+    ggml_context_ptr ctx_meta;
+    gguf_context_ptr ctx_gguf;
+
+    clip_ctx & ctx_clip;
+    std::string fname;
+
+    size_t model_size = 0; // in bytes
+
+    // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
+    clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
+        struct ggml_context * meta = nullptr;
+
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &meta,
+        };
+
+        ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
+        if (!ctx_gguf.get()) {
+            throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
+        }
+
+        ctx_meta.reset(meta);
+
+        const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+
+        // print gguf info
+        {
+            std::string name;
+            get_string(KEY_NAME, name, false);
+            std::string description;
+            get_string(KEY_DESCRIPTION, description, false);
+            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+            LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
+            LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
+            LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
+            LOG_INF("%s: n_kv:         %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
+            LOG_INF("\n");
+        }
+
+        // tensors
+        {
+            for (int i = 0; i < n_tensors; ++i) {
+                const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+                const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
+                enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
+                ggml_tensor * cur = ggml_get_tensor(meta, name);
+                size_t tensor_size = ggml_nbytes(cur);
+                model_size += tensor_size;
+                LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                    __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
+            }
+        }
+    }
+
+    void load_hparams() {
+        auto & hparams = ctx_clip.vision_model.hparams;
+        std::string log_ffn_op; // for logging
+
+        // projector type
+        std::string proj_type;
+        {
+            get_string(KEY_PROJ_TYPE, proj_type, false);
+            if (!proj_type.empty()) {
+                ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
+            }
+            if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) {
+                throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
+            }
+        }
+
+        // other hparams
+        {
+            get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
+
+            get_u32(KEY_N_EMBD, hparams.n_embd);
+            get_u32(KEY_N_HEAD, hparams.n_head);
+            get_u32(KEY_N_FF,   hparams.n_ff);
+            get_u32(KEY_N_BLOCK, hparams.n_layer);
+            get_u32(KEY_PROJ_DIM, hparams.projection_dim);
+            get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
+            get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+            get_u32(KEY_PATCH_SIZE, hparams.patch_size);
+            get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
+            get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
+
+            // default warmup value
+            hparams.warmup_image_size = hparams.image_size;
+
+            ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
+                                        || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
+                                        || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
+                                        || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
+
+            {
+                bool use_gelu = false;
+                bool use_silu = false;
+                get_bool(KEY_USE_GELU, use_gelu, false);
+                get_bool(KEY_USE_SILU, use_silu, false);
+                if (use_gelu && use_silu) {
+                    throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
+                }
+                if (use_gelu) {
+                    hparams.ffn_op = FFN_GELU;
+                    log_ffn_op = "gelu";
+                } else if (use_silu) {
+                    hparams.ffn_op = FFN_SILU;
+                    log_ffn_op = "silu";
+                } else {
+                    hparams.ffn_op = FFN_GELU_QUICK;
+                    log_ffn_op = "gelu_quick";
+                }
+            }
+
+            {
+                std::string mm_patch_merge_type;
+                get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
+                if (mm_patch_merge_type == "spatial_unpad") {
+                    hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
+                }
+            }
+
+            {
+                int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
+                int idx_std  = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
+                GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
+                GGML_ASSERT(idx_std >= 0  && "image_std not found");
+                const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
+                const float * std_data  = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
+                for (int i = 0; i < 3; ++i) {
+                    ctx_clip.image_mean[i] = mean_data[i];
+                    ctx_clip.image_std[i]  = std_data[i];
+                }
+            }
+
+            // Load the vision feature layer indices if they are explicitly provided;
+            // if multiple vision feature layers are present, the values will be concatenated
+            // to form the final visual features.
+            // NOTE: gguf conversions should standardize the values of the vision feature layer to
+            // be non-negative, since we use -1 to mark values as unset here.
+            std::vector<int> vision_feature_layer;
+            get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
+            // convert std::vector to std::unordered_set
+            for (auto & layer : vision_feature_layer) {
+                hparams.vision_feature_layer.insert(layer);
+            }
+
+            // model-specific params
+            switch (ctx_clip.proj_type) {
+                case PROJECTOR_TYPE_MINICPMV:
+                    {
+                        if (ctx_clip.minicpmv_version == 0) {
+                            ctx_clip.minicpmv_version = 2; // default to 2 if not set
+                        }
+                    } break;
+                case PROJECTOR_TYPE_IDEFICS3:
+                case PROJECTOR_TYPE_INTERNVL:
+                    {
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    } break;
+                case PROJECTOR_TYPE_PIXTRAL:
+                    {
+                        hparams.rope_theta = 10000.0f;
+                        hparams.warmup_image_size = hparams.patch_size * 8;
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
+                    } break;
+                case PROJECTOR_TYPE_GEMMA3:
+                    {
+                        // default value (used by all model sizes in the gemma 3 family)
+                        // the number of patches for each **side** is reduced by a factor of 4
+                        hparams.proj_scale_factor = 4;
+                        // the test model (tinygemma3) has a different value, so we optionally read it
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    } break;
+                case PROJECTOR_TYPE_QWEN2VL:
+                    {
+                        // max image size = sqrt(max_pixels) = 3584
+                        // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
+                        // however, the model uses an unreasonable amount of memory past 1024, so we force it to 1024; otherwise it's unusable
+                        // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
+                        hparams.image_size = 1024;
+                        hparams.warmup_image_size = hparams.patch_size * 8;
+                    } break;
+                case PROJECTOR_TYPE_QWEN25VL:
+                    {
+                        // max image size = sqrt(max_pixels)
+                        // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
+                        // however, the model uses an unreasonable amount of memory past 1024, so we force it to 1024; otherwise it's unusable
+                        // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
+                        hparams.image_size = 1024;
+                        hparams.warmup_image_size = hparams.patch_size * 8;
+                        get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
+                    } break;
+                default:
+                    break;
+            }
+
+            LOG_INF("%s: projector:          %s\n", __func__, proj_type.c_str());
+            LOG_INF("%s: n_embd:             %d\n", __func__, hparams.n_embd);
+            LOG_INF("%s: n_head:             %d\n", __func__, hparams.n_head);
+            LOG_INF("%s: n_ff:               %d\n", __func__, hparams.n_ff);
+            LOG_INF("%s: n_layer:            %d\n", __func__, hparams.n_layer);
+            LOG_INF("%s: projection_dim:     %d\n", __func__, hparams.projection_dim);
+            LOG_INF("%s: image_size:         %d\n", __func__, hparams.image_size);
+            LOG_INF("%s: patch_size:         %d\n", __func__, hparams.patch_size);
+            LOG_INF("\n");
+            LOG_INF("%s: has_llava_proj:     %d\n", __func__, ctx_clip.has_llava_projector);
+            LOG_INF("%s: minicpmv_version:   %d\n", __func__, ctx_clip.minicpmv_version);
+            LOG_INF("%s: proj_scale_factor:  %d\n", __func__, hparams.proj_scale_factor);
+            LOG_INF("%s: n_wa_pattern:       %d\n", __func__, hparams.n_wa_pattern);
+            LOG_INF("%s: ffn_op:             %s\n", __func__, log_ffn_op.c_str());
+            LOG_INF("%s: model size:         %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_INF("%s: metadata size:      %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
+        }
+    }
+
+    void load_tensors() {
+        auto & hparams = ctx_clip.vision_model.hparams;
+        std::map<std::string, size_t> tensor_offset;
+        std::vector<ggml_tensor *> tensors_to_load;
+
+        // get offsets
+        for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
+            const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+            tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
+        }
+
+        // create data context
+        struct ggml_init_params params = {
+            /*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
+        };
+        ctx_clip.ctx_data.reset(ggml_init(params));
+        if (!ctx_clip.ctx_data) {
+            throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
+        }
+
+        // helper function
+        auto get_tensor = [&](const std::string & name, bool required = true) {
+            ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
+            if (!cur && required) {
+                throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
+            }
+            if (cur) {
+                tensors_to_load.push_back(cur);
+                // add tensors to context
+                ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
+                ggml_set_name(data_tensor, cur->name);
+                cur = data_tensor;
+            }
+            return cur;
+        };
+
+        auto & vision_model = ctx_clip.vision_model;
+
+        vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
+
+        vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false);
+        vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"),   false);
+
+        vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false);
+        vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"),   false);
+
+        vision_model.patch_bias         = get_tensor(TN_PATCH_BIAS, false);
+        vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
+        vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
+
+        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
+
+        // layers
+        vision_model.layers.resize(hparams.n_layer);
+        for (int il = 0; il < hparams.n_layer; ++il) {
+            auto & layer = vision_model.layers[il];
+            layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
+            layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
+            layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
+            layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
+            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
+            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
+            layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
+            layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
+            layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
+            layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
+
+            layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
+            layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
+            layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
+            layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
+            layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
+            layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
+
+            // ffn
+            layer.ff_up_w   = get_tensor(string_format(TN_FFN_UP,   "v", il, "weight"));
+            layer.ff_up_b   = get_tensor(string_format(TN_FFN_UP,   "v", il, "bias"),   false);
+            layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
+            layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"),   false);
+            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
+            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"),   false);
+
+            // some models were exported with legacy (incorrect) naming, which is quite messy; fix it here
+            // note: a Qwen model converted by the old surgery script has n_ff = 0, so we cannot use n_ff to check!
+            if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
+                // swap up and down weights
+                ggml_tensor * tmp = layer.ff_up_w;
+                layer.ff_up_w = layer.ff_down_w;
+                layer.ff_down_w = tmp;
+                // swap up and down biases
+                tmp = layer.ff_up_b;
+                layer.ff_up_b = layer.ff_down_b;
+                layer.ff_down_b = tmp;
+            }
+        }
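
The swap heuristic above in plain terms: for a ggml matmul weight, ne[0] is the input dimension, so a correct ffn_down is [n_ff, n_embd]; finding n_embd in ne[0] means up/down were exported under each other's names. A standalone sketch with toy sizes:

    #include <cstdio>
    #include <utility>

    int main() {
        const int n_embd = 1024, n_ff = 4096; // toy sizes
        int up_ne0   = n_ff;                  // a swapped export: "up" eats n_ff...
        int down_ne0 = n_embd;                // ...and "down" eats n_embd
        const bool swapped = (down_ne0 == n_embd);
        if (swapped) {
            std::swap(up_ne0, down_ne0);      // the same pointer swap as the loader
        }
        std::printf("up in-dim = %d, down in-dim = %d (was swapped: %s)\n",
                    up_ne0, down_ne0, swapped ? "yes" : "no");
        return 0;
    }
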
get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } break; + case PROJECTOR_TYPE_LDPV2: + { + // MobilVLM_V2 projection + vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); + vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); + vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = 
get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); + vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); + vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); + vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); + vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); + vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); + vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); + vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); + vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); + vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); + vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); + vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_GEMMA3: + { + vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + vision_model.projection = get_tensor(TN_MM_PROJECTOR); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // [IMG_BREAK] token embedding + vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + // for mistral small 3.1 + vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, 
false); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + } break; + default: + GGML_ASSERT(false && "unknown projector type"); + } + + // load data + { + std::vector<uint8_t> read_buf; + +#ifdef _WIN32 + int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); + if (!wlen) { + throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__)); + } + wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); + wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen); + if (!wlen) { + free(wbuf); + throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__)); + } +#if __GLIBCXX__ + int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); + __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in); + std::istream fin(&buffer); +#else // MSVC + // unused in our current build + auto fin = std::ifstream(wbuf, std::ios::binary); +#endif + free(wbuf); +#else + auto fin = std::ifstream(fname, std::ios::binary); +#endif + if (!fin) { + throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); + ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); + ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (auto & t : tensors_to_load) { + ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); + const size_t offset = tensor_offset[t->name]; + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); + } + size_t num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast<char *>(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } +#if defined(_WIN32) && defined(__GLIBCXX__) + close(fd); +#else + fin.close(); +#endif + + LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); + } + } + + void alloc_compute_meta() { + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + + // create a fake batch + clip_image_f32_batch batch; + clip_image_f32_ptr img(clip_image_f32_init()); + img->nx = ctx_clip.vision_model.hparams.warmup_image_size; + img->ny = ctx_clip.vision_model.hparams.warmup_image_size; + img->buf.resize(img->nx * img->ny * 3); + batch.entries.push_back(std::move(img)); + + ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); + ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); + + for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { + ggml_backend_t backend = ctx_clip.backend_ptrs[i]; + 
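// log the compute buffer size that the scheduler reserved on each backend for the warmup graph + 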
ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend); + if (size > 1) { + LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + } + + void get_bool(const std::string & key, bool & output, bool required = true) { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) throw std::runtime_error("Key not found: " + key); + return; + } + output = gguf_get_val_bool(ctx_gguf.get(), i); + } + + void get_i32(const std::string & key, int & output, bool required = true) { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) throw std::runtime_error("Key not found: " + key); + return; + } + output = gguf_get_val_i32(ctx_gguf.get(), i); + } + + void get_u32(const std::string & key, int & output, bool required = true) { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) throw std::runtime_error("Key not found: " + key); + return; + } + output = gguf_get_val_u32(ctx_gguf.get(), i); + } + + void get_f32(const std::string & key, float & output, bool required = true) { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) throw std::runtime_error("Key not found: " + key); + return; + } + output = gguf_get_val_f32(ctx_gguf.get(), i); + } + + void get_string(const std::string & key, std::string & output, bool required = true) { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) throw std::runtime_error("Key not found: " + key); + return; + } + output = std::string(gguf_get_val_str(ctx_gguf.get(), i)); + } + + void get_arr_int(const std::string & key, std::vector & output, bool required = true) { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) throw std::runtime_error("Key not found: " + key); + return; + } + int n = gguf_get_arr_n(ctx_gguf.get(), i); + output.resize(n); + const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i); + for (int i = 0; i < n; ++i) { + output[i] = values[i]; + } + } +}; + +// read and create ggml_context containing the tensors and their data +struct clip_ctx * clip_model_load(const char * fname, const int verbosity) { + return clip_init(fname, clip_context_params{ + /* use_gpu */ true, + /* verbosity */ static_cast(verbosity), + }); +} + +struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) { + g_logger_state.verbosity_thold = ctx_params.verbosity; + clip_ctx * ctx_clip = nullptr; + + try { + ctx_clip = new clip_ctx(ctx_params); + clip_model_loader loader(fname, *ctx_clip); + loader.load_hparams(); + loader.load_tensors(); + loader.alloc_compute_meta(); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what()); + delete ctx_clip; + return nullptr; + } + + return ctx_clip; +} + +void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { + ctx_clip->load_image_size = *load_image_size; // copy +} + +struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { + return &ctx_clip->load_image_size; +} + +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; + return load_image_size; +} + 
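As an aside, the typed getters defined above all follow one pattern: look up the key, throw if it is required but missing, and otherwise leave `output` untouched so the caller's default survives. A minimal usage sketch, for illustration only — the key names and the `read_example_hparams` wrapper below are hypothetical and not part of this patch:

    // illustrative sketch; assumes a clip_model_loader instance named `loader`
    static void read_example_hparams(clip_model_loader & loader) {
        int n_layer = 0;
        float eps = 1e-6f;   // the default survives if an optional key is absent
        std::string proj_type;
        loader.get_u32("clip.vision.block_count", n_layer);           // required: throws std::runtime_error if missing
        loader.get_f32("clip.vision.layer_norm_epsilon", eps, false); // optional: leaves eps untouched when absent
        loader.get_string("clip.projector_type", proj_type);          // required
    }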
+struct clip_image_u8 * clip_image_u8_init() { + return new clip_image_u8(); +} + +struct clip_image_f32 * clip_image_f32_init() { + return new clip_image_f32(); +} + +struct clip_image_f32_batch * clip_image_f32_batch_init() { + return new clip_image_f32_batch(); +} + +unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { + if (nx) *nx = img->nx; + if (ny) *ny = img->ny; + return img->buf.data(); +} + +void clip_image_size_free(struct clip_image_size * load_image_size) { + if (load_image_size == nullptr) { + return; + } + delete load_image_size; +} +void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; } +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; } + +size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { + return batch->entries.size(); +} + +size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return 0; + } + return batch->entries[idx]->nx; +} + +size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return 0; + } + return batch->entries[idx]->ny; +} + +clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return nullptr; + } + return batch->entries[idx].get(); +} + +void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { + img->nx = nx; + img->ny = ny; + img->buf.resize(3 * nx * ny); + memcpy(img->buf.data(), rgb_pixels, img->buf.size()); +} + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load(fname, &nx, &ny, &nc, 3); + if (!data) { + LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); + return false; + } + clip_build_img_from_pixels(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); + if (!data) { + LOG_ERR("%s: failed to decode image bytes\n", __func__); + return false; + } + clip_build_img_from_pixels(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + // TODO @ngxson : seems like this could be done more efficiently on cgraph + for (size_t i = 0; i < src.buf.size(); ++i) { + int c = i % 3; // rgb + dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +// set of tools to manupulate images +// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv +struct 
image_manipulation { + // Bilinear resize function + static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } + } + + // Bicubic resize function + // part of image will be cropped if the aspect ratio is different + static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; + } + + // llava-1.6 type of resize_and_pad + // if the ratio is not 1:1, padding with pad_color will be applied + // pad_color is single channel, default is 0 (black) + static void 
resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array pad_color = {0, 0, 0}) { + int target_width = target_resolution.width; + int target_height = target_resolution.height; + + float scale_w = static_cast(target_width) / image.nx; + float scale_h = static_cast(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height); + + // Fill the padded image with the fill color + for (size_t i = 0; i < padded_image.buf.size(); i += 3) { + padded_image.buf[i] = pad_color[0]; + padded_image.buf[i + 1] = pad_color[1]; + padded_image.buf[i + 2] = pad_color[2]; + } + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + dst = std::move(padded_image); + } + + static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 * (i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than max_dimension, it will be resized to max_dimension + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + return {0, 0}; + } + + float scale = std::min(1.0f, std::min(static_cast(max_dimension) / inp_size.width, + static_cast(max_dimension) / inp_size.height)); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); + int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); + + return {aligned_width, aligned_height}; + } + +private: + static inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); + } + + // Linear interpolation between two points + static inline float lerp(float s, float e, float t) { + return s + (e - s) * t; + } +}; + +/** + * implementation of LLaVA-UHD: + * - https://arxiv.org/pdf/2403.11703 + * - https://github.com/thunlp/LLaVA-UHD + * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 + * 
+ * overview: + * - an image always has a single overview (downscaled image) + * - an image can have zero or more slices, depending on the image size + * - each slice can then be considered as a separate image + * + * for example: + * + * [overview] --> [slice 1] --> [slice 2] + * | | + * +--> [slice 3] --> [slice 4] + */ +struct llava_uhd { + struct slice_coordinates { + int x; + int y; + clip_image_size size; + }; + + struct slice_instructions { + clip_image_size overview_size; // size of downscaled image + clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) + clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices + std::vector<slice_coordinates> slices; + bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6) + }; + + static int get_max_slices(struct clip_ctx * ctx) { + if (clip_is_minicpmv(ctx)) { + return 9; + } + return 0; + } + + static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + slice_instructions res; + const int patch_size = clip_get_patch_size(ctx); + const int slice_size = clip_get_image_size(ctx); + const int max_slice_nums = get_max_slices(ctx); + const int original_width = original_size.width; + const int original_height = original_size.height; + const float log_ratio = log((float)original_width / original_height); + const float ratio = (float)original_width * original_height / (slice_size * slice_size); + const int multiple = fmin(ceil(ratio), max_slice_nums); + const bool has_slices = (multiple > 1); + const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty(); + + if (has_pinpoints) { + // has pinpoints, use them to calculate the grid size (e.g. llava-1.6) + auto refine_size = llava_uhd::select_best_resolution( + ctx->vision_model.hparams.image_grid_pinpoints, + original_size); + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = refine_size; + res.grid_size = clip_image_size{0, 0}; + res.padding_refined = true; + + for (int y = 0; y < refine_size.height; y += slice_size) { + for (int x = 0; x < refine_size.width; x += slice_size) { + slice_coordinates slice; + slice.x = x; + slice.y = y; + slice.size.width = std::min(slice_size, refine_size.width - x); + slice.size.height = std::min(slice_size, refine_size.height - y); + res.slices.push_back(slice); + if (x == 0) { + res.grid_size.width++; + } + } + res.grid_size.height++; + } + + return res; + } + + // no pinpoints, dynamically calculate the grid size (e.g. 
minicpmv) + + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); + res.overview_size = best_size; + + if (!has_slices) { + // skip slicing logic + res.refined_size = clip_image_size{0, 0}; + res.grid_size = clip_image_size{0, 0}; + + } else { + auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); + auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); + res.grid_size = best_grid; + res.refined_size = refine_size; + + int width = refine_size.width; + int height = refine_size.height; + int grid_x = int(width / best_grid.width); + int grid_y = int(height / best_grid.height); + for (int patches_y = 0, ic = 0; + patches_y < refine_size.height && ic < best_grid.height; + patches_y += grid_y, ic += 1) { + for (int patches_x = 0, jc = 0; + patches_x < refine_size.width && jc < best_grid.width; + patches_x += grid_x, jc += 1) { + slice_coordinates slice; + slice.x = patches_x; + slice.y = patches_y; + slice.size.width = grid_x; + slice.size.height = grid_y; + res.slices.push_back(slice); + // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y); + } + } + } + + return res; + } + + static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) { + std::vector<clip_image_u8_ptr> output; + + // resize to overview size + clip_image_u8_ptr resized_img(clip_image_u8_init()); + image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height); + output.push_back(std::move(resized_img)); + if (inst.slices.empty()) { + // no slices, just return the resized image + return output; + } + + // resize to refined size + clip_image_u8_ptr refined_img(clip_image_u8_init()); + if (inst.padding_refined) { + image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size); + } else { + image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height); + } + + // create slices + for (const auto & slice : inst.slices) { + int x = slice.x; + int y = slice.y; + int w = slice.size.width; + int h = slice.size.height; + + clip_image_u8_ptr img_slice(clip_image_u8_init()); + image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h); + output.push_back(std::move(img_slice)); + } + + return output; + } + +private: + static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast<float>(width) / height; + height = static_cast<int>(scale_resolution / std::sqrt(r)); + width = static_cast<int>(height * r); + } + clip_image_size res; + res.width = ensure_divide(width, patch_size); + res.height = ensure_divide(height, patch_size); + return res; + } + + /** + * Selects the best resolution from a list of possible resolutions based on the original size. 
+ * + * @param original_size The original size of the image + * @param possible_resolutions A list of possible resolutions + * @return The best fit resolution + */ + static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) { + int original_width = original_size.width; + int original_height = original_size.height; + clip_image_size best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits<int>::max(); + + for (const auto & resolution : possible_resolutions) { + int width = resolution.width; + int height = resolution.height; + float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); + int downscaled_width = static_cast<int>(original_width * scale); + int downscaled_height = static_cast<int>(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; + } + + // used by llava 1.6 with custom list of pinpoints + static clip_image_size select_best_resolution(const std::vector<int> & pinpoints, const clip_image_size & original_size) { + std::vector<clip_image_size> possible_resolutions; + for (size_t i = 0; i < pinpoints.size(); i += 2) { + possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]}); + } + return select_best_resolution(original_size, possible_resolutions); + } + + static int ensure_divide(int length, int patch_size) { + return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size); + } + + static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + int grid_x = grid.width; + int grid_y = grid.height; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + clip_image_size grid_size; + grid_size.width = refine_width / grid_x; + grid_size.height = refine_height / grid_y; + + auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale); + int best_grid_width = best_grid_size.width; + int best_grid_height = best_grid_size.height; + + clip_image_size refine_size; + refine_size.width = best_grid_width * grid_x; + refine_size.height = best_grid_height * grid_y; + return refine_size; + } + + static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector<int> candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector<clip_image_size> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.push_back(clip_image_size{m, 
split_grids_nums / m}); + } + ++m; + } + } + + clip_image_size best_grid{1, 1}; + float min_error = std::numeric_limits<float>::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; + } +}; + +// TODO @ngxson : deprecate the load_image_size singleton pattern +int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { + const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size); + return inst.grid_size.width; +} + +// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { + clip_image_size original_size{img->nx, img->ny}; + bool pad_to_square = true; + auto & params = ctx->vision_model.hparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { + pad_to_square = false; + } + + if (clip_is_minicpmv(ctx)) { + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(res)); + } + return true; + } + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + clip_image_u8 resized; + auto patch_size = params.patch_size * 2; + auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size); + image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height); + + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std); + // res_imgs->data[0] = *res; + res_imgs->entries.push_back(std::move(img_f32)); + return true; + } + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 + || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution + ) { + clip_image_u8 resized_image; + int sz = params.image_size; + image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + //clip_image_save_to_bmp(resized_image, "resized.bmp"); + normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + } + else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + clip_image_u8 resized_image; + auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); + image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, 
*img_f32, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + } + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily + + if (pad_to_square) { + // for llava-1.5, we resize image to a square, and pad the shorter side with a background color + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + const int longer_side = std::max(img->nx, img->ny); + temp->nx = longer_side; + temp->ny = longer_side; + temp->buf.resize(3 * longer_side * longer_side); + + // background color in RGB from LLaVA (this is the mean rgb color * 255) + const std::array pad_color = {122, 116, 104}; + + // resize the image to the target_size + image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color); + + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(res)); + return true; + + } else if (!params.image_grid_pinpoints.empty()) { + // "spatial_unpad" with "anyres" processing for llava-1.6 + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(res)); + } + + return true; + + } + + GGML_ASSERT(false && "Unknown image preprocessing type"); +} + +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + +void clip_free(clip_ctx * ctx) { + if (ctx == nullptr) { + return; + } + delete ctx; +} + +// deprecated +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + const int32_t nx = ctx->vision_model.hparams.image_size; + const int32_t ny = ctx->vision_model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, nx, ny); +} + +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { + clip_image_f32 img; + img.nx = img_w; + img.ny = img_h; + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_get_image_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_size; +} + +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.patch_size; +} + +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.n_embd; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? 
"spatial_unpad" : "flat"; +} + +const int32_t * clip_image_grid(const struct clip_ctx * ctx) { + if (ctx->vision_model.hparams.image_grid_pinpoints.size()) { + return &ctx->vision_model.hparams.image_grid_pinpoints.front(); + } + return nullptr; +} + +size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_grid_pinpoints.size(); +} + +// deprecated +int clip_n_patches(const struct clip_ctx * ctx) { + clip_image_f32 img; + img.nx = ctx->vision_model.hparams.image_size; + img.ny = ctx->vision_model.hparams.image_size; + return clip_n_output_tokens(ctx, &img); +} + +// deprecated +int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + return clip_n_output_tokens(ctx, img); +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP + || ctx->proj_type == PROJECTOR_TYPE_LDPV2 + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + n_patches /= 4; + if (ctx->vision_model.mm_glm_tok_boi) { + n_patches += 2; // for BOI and EOI token embeddings + } + } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { + if (ctx->minicpmv_version == 2) { + n_patches = 96; + } + else if (ctx->minicpmv_version == 3) { + n_patches = 64; + } + else if (ctx->minicpmv_version == 4) { + n_patches = 64; + } + else { + GGML_ABORT("Unknown minicpmv version"); + } + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + int patch_size = params.patch_size * 2; + int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); + int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); + n_patches = x_patch * y_patch; + } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + int n_per_side = params.image_size / params.patch_size; + int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; + n_patches = n_per_side_2d_pool * n_per_side_2d_pool; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) { + // both W and H are divided by proj_scale_factor + n_patches /= (params.proj_scale_factor * params.proj_scale_factor); + } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + int n_merge = params.spatial_merge_size; + int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? 
n_merge : 1); + n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row + } + + return n_patches; +} + +static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) { + assert(embed_dim % 2 == 0); + int H = pos.size(); + int W = pos[0].size(); + + std::vector<float> omega(embed_dim / 2); + for (int i = 0; i < embed_dim / 2; ++i) { + omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2)); + } + + std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim))); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + float out_value = pos[h][w] * omega[d]; + emb[h][w][d] = sin(out_value); + emb[h][w][d + embed_dim / 2] = cos(out_value); + } + } + } + + return emb; +} + +static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) { + assert(embed_dim % 2 == 0); + std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) + std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) + + int H = emb_h.size(); + int W = emb_h[0].size(); + std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim))); + + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + emb[h][w][d] = emb_h[h][w][d]; + emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; + } + } + } + return emb; +} + +static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) { + int grid_h_size = image_size.first; + int grid_w_size = image_size.second; + + std::vector<float> grid_h(grid_h_size); + std::vector<float> grid_w(grid_w_size); + + for (int i = 0; i < grid_h_size; ++i) { + grid_h[i] = static_cast<float>(i); + } + for (int i = 0; i < grid_w_size; ++i) { + grid_w[i] = static_cast<float>(i); + } + + std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size)); + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid[h][w] = grid_w[w]; + } + } + std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid}; + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid_2d[0][h][w] = grid_h[h]; + grid_2d[1][h][w] = grid_w[w]; + } + } + + std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); + + int H = image_size.first; + int W = image_size.second; + std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim)); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; + } + } + + return pos_embed_2d; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + clip_image_f32_batch imgs; + clip_image_f32_ptr img_copy(clip_image_f32_init()); + *img_copy = *img; + imgs.entries.push_back(std::move(img_copy)); + + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph is going to be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 + } + + // build the inference graph + ggml_backend_sched_reset(ctx->sched.get()); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + 
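// from here on the flow is: allocate the graph on the scheduler, fill the input tensors (pixel values plus per-projector positions), compute, then read the embedding tensor back out + 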
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); + + // set inputs + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int n_pos = num_patches + (model.class_embedding ? 1 : 0); + const int pos_w = ctx->load_image_size.width / patch_size; + const int pos_h = ctx->load_image_size.height / patch_size; + + const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + + auto get_inp_tensor = [&gf](const char * name) { + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + // set input pixel values + { + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B + + for (size_t i = 0; i < imgs.entries.size(); i++) { + const int nx = imgs.entries[i]->nx; + const int ny = imgs.entries[i]->ny; + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + float * batch_entry = inp_raw.data() + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + } + } + } + } + set_input_f32("inp_raw", inp_raw); + } + + // set input per projector + switch (ctx->proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); + + // inspired from resampler of 
Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } + } + + set_input_f32("pos_embed", pos_embed); + } break; + case PROJECTOR_TYPE_QWEN2VL: + { + const int merge_ratio = 2; + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + std::vector positions(n_pos * 4); + int ptr = 0; + for (int y = 0; y < ph; y += merge_ratio) { + for (int x = 0; x < pw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions[ ptr] = y + dy; + positions[ num_patches + ptr] = x + dx; + positions[2 * num_patches + ptr] = y + dy; + positions[3 * num_patches + ptr] = x + dx; + ptr++; + } + } + } + } + + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + // pw * ph = number of tokens output by ViT after apply patch merger + // ipw * ipw = number of vision token been processed inside ViT + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; + + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = 112; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } + + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + + const int mpow = merge_ratio * merge_ratio; + std::vector positions(n_pos * 4); + + int ptr = 0; + for (int y = 0; y < iph; y += merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 * num_patches + remap] = y + dy; + 
positions[3 * num_patches + remap] = x + dx; + ptr++; + } + } + } + } + + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector pos_data(n_pos); + // dimension H + for (int i = 0; i < n_pos; i++) { + pos_data[i] = i / n_patches_per_col; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < n_pos; i++) { + pos_data[i] = i % n_patches_per_col; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + // llava and other models + std::vector positions(n_pos); + for (int i = 0; i < n_pos; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + { + // llava and other models + std::vector positions(n_pos); + for (int i = 0; i < n_pos; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); + + // The patches vector is used to get rows to index into the embeds with; + // we should skip dim 0 only if we have CLS to avoid going out of bounds + // when retrieving the rows. + int patch_offset = model.class_embedding ? 1 : 0; + std::vector patches(num_patches); + for (int i = 0; i < num_patches; i++) { + patches[i] = i + patch_offset; + } + set_input_i32("patches", patches); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: + { + // do nothing + } break; + default: + GGML_ABORT("Unknown projector type"); + } + + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + assert(itype < GGML_TYPE_COUNT); + ggml_type type = static_cast(itype); + + auto * ctx_clip = clip_init(fname_inp, clip_context_params{ + /* use_gpu */ false, + /* verbosity */ GGML_LOG_LEVEL_ERROR, + }); + + const auto & ctx_src = ctx_clip->ctx_gguf.get(); + const auto & ctx_data = ctx_clip->ctx_data.get(); + + auto * ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector k_names = { + ".*weight", + }; + + std::vector work(512); + std::vector conv_buf(512); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors and bigger than block size + quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type); + + if (quantize) { + new_type = type; + if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type + // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + } + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (size_t j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t 
*)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + default: + LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__); + gguf_free(ctx_out); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); + } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (size_t j = 0; j < pad; ++j) { + fout.put(0); + } + + LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + switch (ctx->proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: + return ctx->vision_model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->vision_model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_MINICPMV: + if (ctx->minicpmv_version == 2) { + return 4096; + } else if (ctx->minicpmv_version == 3) { + return 3584; + } else if (ctx->minicpmv_version == 4) { + return 3584; + } + GGML_ABORT("Unknown minicpmv version"); + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->vision_model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + return ctx->vision_model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_GEMMA3: + return ctx->vision_model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->vision_model.projection->ne[1]; + case PROJECTOR_TYPE_INTERNVL: + return ctx->vision_model.mm_3_w->ne[1]; + default: + GGML_ABORT("Unknown projector type"); + } +} + +int clip_is_minicpmv(const struct clip_ctx * ctx) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { + return ctx->minicpmv_version; + } + return 0; +} + +bool clip_is_glm(const struct clip_ctx * ctx) { + return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE; +} + +bool clip_is_qwen2vl(const struct clip_ctx * ctx) { + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; +} + +bool clip_is_llava(const struct clip_ctx * ctx) { + return ctx->has_llava_projector; +} + +bool clip_is_gemma3(const struct clip_ctx * ctx) { + return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; +} + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { + 
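// thin wrapper: copy the caller's packed RGB float buffer into a clip_image_f32, then reuse the single-image encode path + 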
clip_image_f32 clip_img; + clip_img.buf.resize(h * w * 3); + for (int i = 0; i < h*w*3; i++) + { + clip_img.buf[i] = img[i]; + } + clip_img.nx = w; + clip_img.ny = h; + clip_image_encode(ctx, n_threads, &clip_img, vec); + return true; +} + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx) { + return ctx->proj_type; +} diff --git a/llama/llama.cpp/examples/llava/clip.h b/llama/llama.cpp/tools/mtmd/clip.h similarity index 81% rename from llama/llama.cpp/examples/llava/clip.h rename to llama/llama.cpp/tools/mtmd/clip.h index 5fc45d3e2..0b0eb0295 100644 --- a/llama/llama.cpp/examples/llava/clip.h +++ b/llama/llama.cpp/tools/mtmd/clip.h @@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); -CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); @@ -59,18 +59,29 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); -CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); +GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx), + "use clip_n_output_tokens instead"); +GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img), + "use clip_n_output_tokens instead"); + +CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// for M-RoPE, this will be the number of token positions in X and Y directions +// for other models, X will be the total number of tokens and Y will be 1 +CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// this should be equal to the embedding dimension of the text model +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); -CLIP_API struct clip_image_size * clip_image_size_init(); -CLIP_API struct clip_image_u8 * clip_image_u8_init (); -CLIP_API struct clip_image_f32 * clip_image_f32_init(); -CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava +CLIP_API struct clip_image_size * clip_image_size_init(void); +CLIP_API struct clip_image_u8 * clip_image_u8_init (void); +CLIP_API struct clip_image_f32 * clip_image_f32_init(void); +CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava // nx, ny are the output image dimensions CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); @@ -114,8 +125,6 @@ CLIP_API bool 
clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); -CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); - CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff --git a/llama/llama.cpp/examples/llava/llava.cpp b/llama/llama.cpp/tools/mtmd/llava.cpp similarity index 95% rename from llama/llama.cpp/examples/llava/llava.cpp rename to llama/llama.cpp/tools/mtmd/llava.cpp index 5eb40bcd1..ebef8b3c1 100644 --- a/llama/llama.cpp/examples/llava/llava.cpp +++ b/llama/llama.cpp/tools/mtmd/llava.cpp @@ -2,6 +2,7 @@ #include "llava.h" #include "llama.h" +#include "ggml-cpp.h" #include #include @@ -112,7 +113,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair< } // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) { struct { struct ggml_context * ctx; } model; @@ -175,7 +176,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector model.ctx = ggml_init(params); - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base for (size_t i = 1; i < num_images; i++) { @@ -209,13 +210,17 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); - ggml_graph_compute_with_ctx(model.ctx, gf, 1); + + ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) }; + GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend"); + ggml_backend_graph_compute(backend.get(), gf); + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches - *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); + memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches + *n_img_pos_out = 
static_cast(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -313,7 +318,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); + n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -342,8 +347,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); @@ -381,7 +386,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input); *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -456,7 +462,7 @@ struct llava_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; - llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { pos .resize(n_tokens); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); @@ -468,7 +474,6 @@ struct llava_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, - /*n_embd =*/ n_embd, /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), @@ -492,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ n_eval = n_batch; } float * embd = image_embed->embed+i*n_embd; - llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); if (llama_decode(ctx_llama, llava_batch.batch)) { LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/llama/llama.cpp/examples/llava/llava.h b/llama/llama.cpp/tools/mtmd/llava.h similarity index 100% rename from llama/llama.cpp/examples/llava/llava.h rename to llama/llama.cpp/tools/mtmd/llava.h diff --git a/llama/llama.cpp/examples/llava/llava.go b/llama/llama.cpp/tools/mtmd/mtmd.go similarity index 92% rename from llama/llama.cpp/examples/llava/llava.go rename to llama/llama.cpp/tools/mtmd/mtmd.go index 37b031cb7..064790368 100644 --- a/llama/llama.cpp/examples/llava/llava.go +++ b/llama/llama.cpp/tools/mtmd/mtmd.go @@ -1,4 +1,4 @@ -package llava +package mtmd // #cgo CXXFLAGS: -std=c++11 // #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common diff --git a/llama/llama.go b/llama/llama.go index 5fce0a622..626ea13a3 100644 --- 
a/llama/llama.go +++ b/llama/llama.go @@ -2,10 +2,11 @@ package llama /* #cgo CFLAGS: -std=c11 +#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration #cgo CXXFLAGS: -std=c++17 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common -#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include @@ -16,7 +17,6 @@ package llama #include "llava.h" #include "gguf.h" -#include "mllama.h" #include "sampling_ext.h" extern bool llamaProgressCallback(float progress, void *user_data); @@ -39,8 +39,8 @@ import ( "unsafe" _ "github.com/ollama/ollama/llama/llama.cpp/common" - _ "github.com/ollama/ollama/llama/llama.cpp/examples/llava" _ "github.com/ollama/ollama/llama/llama.cpp/src" + _ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) @@ -198,7 +198,6 @@ type ModelParams struct { NumGpuLayers int MainGpu int UseMmap bool - UseMlock bool TensorSplit []float32 Progress func(float32) VocabOnly bool @@ -217,7 +216,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { cparams.n_gpu_layers = C.int(params.NumGpuLayers) cparams.main_gpu = C.int32_t(params.MainGpu) cparams.use_mmap = C.bool(params.UseMmap) - cparams.use_mlock = C.bool(params.UseMlock) cparams.vocab_only = C.bool(params.VocabOnly) if len(params.TensorSplit) > 0 { @@ -461,24 +459,6 @@ func (m *Model) NEmbd() int { return int(C.llama_model_n_embd(m.c)) } -func Quantize(infile, outfile string, ftype uint32) error { - cinfile := C.CString(infile) - defer C.free(unsafe.Pointer(cinfile)) - - coutfile := C.CString(outfile) - defer C.free(unsafe.Pointer(coutfile)) - - params := C.llama_model_quantize_default_params() - params.nthread = -1 - params.ftype = ftype - - if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 { - return fmt.Errorf("llama_model_quantize: %d", rc) - } - - return nil -} - // vision processing type ClipContext struct { c *C.struct_clip_ctx @@ -529,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, return embed, nil } -type MllamaContext struct { - c *C.struct_mllama_ctx -} - -func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) { - mp := C.CString(modelPath) - defer C.free(unsafe.Pointer(mp)) - c := C.mllama_model_load(mp, 1) - if c == nil { - return nil, fmt.Errorf("unable to load mllama model: %v", modelPath) - } - - projEmbedSize := int(C.mllama_n_embd(c)) - modelEmbedSize := llamaContext.Model().NEmbd() - if projEmbedSize != modelEmbedSize { - return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize) - } - - return &MllamaContext{c: c}, nil -} - -func (m *MllamaContext) Free() { - C.mllama_free(m.c) -} - -func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) { - img := C.mllama_image_init() - defer C.mllama_image_free(img) - - ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)) - if !ok { - return nil, errors.New("unable to load mllama image data") - } - - rows := make([]float32, m.EmbedSize(llamaContext)) - ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))) - if !ok { - return nil, errors.New("unable to make mllama embedding from 
image") - } - - embed := make([][]float32, 1) - embed[0] = rows - - return embed, nil -} - -func (m *MllamaContext) EmbedSize(llamaContext *Context) int { - numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c)) - numEmbed := llamaContext.Model().NEmbd() - - return numTokens * numEmbed -} - -func (c *Context) SetCrossAttention(state bool) { - C.llama_set_cross_attention(c.c, C.bool(state)) -} - func (c *Context) Synchronize() { C.llama_synchronize(c.c) } @@ -606,9 +529,6 @@ type SamplingParams struct { PenaltyRepeat float32 PenaltyFreq float32 PenaltyPresent float32 - Mirostat int - MirostatTau float32 - MirostatEta float32 PenalizeNl bool Seed uint32 Grammar string @@ -625,9 +545,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, cparams.penalty_repeat = C.float(params.PenaltyRepeat) cparams.penalty_freq = C.float(params.PenaltyFreq) cparams.penalty_present = C.float(params.PenaltyFreq) - cparams.mirostat = C.int32_t(params.Mirostat) - cparams.mirostat_tau = C.float(params.MirostatTau) - cparams.mirostat_eta = C.float(params.MirostatEta) cparams.seed = C.uint32_t(params.Seed) grammar := C.CString(params.Grammar) @@ -662,8 +579,8 @@ func SchemaToGrammar(schema []byte) []byte { cStr := C.CString(string(schema)) defer C.free(unsafe.Pointer(cStr)) - // Allocate buffer for grammar output with reasonable size - const maxLen = 32768 // 32KB + // Allocate buffer for grammar based on schema length but with upper bound + maxLen := min(1024*1024, len(schema)*4) buf := make([]byte, maxLen) // Call C function to convert schema to grammar @@ -685,7 +602,7 @@ type Grammar struct { mu sync.Mutex } -func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar { +func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar { cGrammar := C.CString(grammar) defer C.free(unsafe.Pointer(cGrammar)) @@ -705,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke cEogTokens[i] = C.uint32_t(token) } - g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens))) + g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens))) if g == nil { return nil } diff --git a/llama/mllama.cpp b/llama/mllama.cpp deleted file mode 100644 index 1ba8f5bef..000000000 --- a/llama/mllama.cpp +++ /dev/null @@ -1,887 +0,0 @@ -// NOTE: This is modified from clip.cpp for Mllama only -#include "mllama.h" - -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-cpu.h" -#include "ggml.h" -#include "gguf.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define REQUIRE(x) \ - do { \ - if (!(x)) { \ - throw std::runtime_error("REQUIRE failed: " #x); \ - } \ - } while (0) - -#define LOG(fmt, ...) 
fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__) - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include -#if __GLIBCXX__ -#include -#include -#include -#endif -#endif - -struct mllama_image { - int width; - int height; - - int num_channels = 3; - int num_tiles = 4; - - int aspect_ratio_id; - - std::vector data; -}; - -static std::string format(const char *fmt, ...) { - va_list args; - va_start(args, fmt); - std::vector b(128); - int n = vsnprintf(b.data(), b.size(), fmt, args); - REQUIRE(n >= 0 && n < b.size()); - va_end(args); - return std::string(b.data(), b.size()); -} - -// -// utilities to get data from a gguf file -// - -static int get_key_index(const gguf_context *ctx, const char *key) { - int key_index = gguf_find_key(ctx, key); - REQUIRE(key_index != -1); - return key_index; -} - -static std::vector get_u32_array(const gguf_context *ctx, const std::string &key) { - const int i = get_key_index(ctx, key.c_str()); - const int n = gguf_get_arr_n(ctx, i); - const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i); - - std::vector s(n); - for (size_t j = 0; j < s.size(); j++) { - s[j] = data[j]; - } - - return s; -} - -static uint32_t get_u32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str())); -} - -static float get_f32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str())); -} - -static std::string get_ftype(int ftype) { - return ggml_type_name(static_cast(ftype)); -} - -// -// mllama layers -// - -struct mllama_hparams { - uint32_t image_size; - uint32_t patch_size; - uint32_t hidden_size; - uint32_t n_intermediate; - uint32_t projection_dim; - uint32_t n_head; - uint32_t n_layer; - uint32_t n_global_layer; - uint32_t n_tiles; - - float eps; - - std::vector intermediate_layers; -}; - -struct mllama_layer { - // attention - struct ggml_tensor *k_w; - struct ggml_tensor *k_b; - struct ggml_tensor *q_w; - struct ggml_tensor *q_b; - struct ggml_tensor *v_w; - struct ggml_tensor *v_b; - - struct ggml_tensor *o_w; - struct ggml_tensor *o_b; - - struct ggml_tensor *attn_gate; - - // layernorm 1 - struct ggml_tensor *ln_1_w; - struct ggml_tensor *ln_1_b; - - // ff - struct ggml_tensor *ff_i_w; - struct ggml_tensor *ff_i_b; - - struct ggml_tensor *ff_o_w; - struct ggml_tensor *ff_o_b; - - struct ggml_tensor *ff_gate; - - // layernorm 2 - struct ggml_tensor *ln_2_w; - struct ggml_tensor *ln_2_b; -}; - -struct mllama_vision_model { - struct mllama_hparams hparams; - - // embeddings - struct ggml_tensor *class_embedding; - struct ggml_tensor *patch_embeddings; - struct ggml_tensor *position_embeddings; - struct ggml_tensor *position_embeddings_gate; - struct ggml_tensor *tile_position_embeddings; - struct ggml_tensor *tile_position_embeddings_gate; - struct ggml_tensor *pre_tile_position_embeddings; - struct ggml_tensor *pre_tile_position_embeddings_gate; - struct ggml_tensor *post_tile_position_embeddings; - struct ggml_tensor *post_tile_position_embeddings_gate; - - struct ggml_tensor *pre_ln_w; - struct ggml_tensor *pre_ln_b; - - std::vector layers; - std::vector global_layers; - - struct ggml_tensor *post_ln_w; - struct ggml_tensor *post_ln_b; - - struct ggml_tensor *mm_0_w; - struct ggml_tensor *mm_0_b; -}; - -struct mllama_ctx { - struct mllama_vision_model vision_model; - - uint32_t ftype = 1; - - struct gguf_context *ctx_gguf; - struct ggml_context *ctx_data; - - std::vector buf_compute_meta; - - // 
memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = nullptr; - - ggml_backend_t backend = nullptr; - ggml_gallocr_t compute_alloc = nullptr; -}; - -static ggml_tensor *mllama_image_build_encoder_layer( - struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings, - const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) { - struct ggml_tensor *cur = embeddings; - - { - // layernorm1 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); - ggml_set_name(cur, format("%d pre layernorm", il).c_str()); - } - - { - // self-attention - struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b != nullptr) { - Q = ggml_add(ctx0, Q, layer.q_b); - } - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - ggml_set_name(Q, format("%d query", il).c_str()); - - struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b != nullptr) { - K = ggml_add(ctx0, K, layer.k_b); - } - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - ggml_set_name(K, format("%d key", il).c_str()); - - struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b != nullptr) { - V = ggml_add(ctx0, V, layer.v_b); - } - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - ggml_set_name(V, format("%d value", il).c_str()); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); - KQ = ggml_soft_max_inplace(ctx0, KQ); - ggml_set_name(KQ, format("%d KQ", il).c_str()); - - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size); - ggml_set_name(KQV, format("%d KQV", il).c_str()); - - cur = ggml_mul_mat(ctx0, layer.o_w, KQV); - if (layer.o_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.o_b); - } - ggml_set_name(cur, format("%d self attention", il).c_str()); - - if (layer.attn_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate); - ggml_set_name(cur, format("%d self attention gate", il).c_str()); - } - } - - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - { - // layernorm2 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); - ggml_set_name(cur, format("%d post layernorm", il).c_str()); - } - - { - // feed forward - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b); - cur = ggml_gelu_inplace(ctx0, cur); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b); - ggml_set_name(cur, format("%d feed forward", il).c_str()); - - if (layer.ff_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate); - ggml_set_name(cur, format("%d feed forward gate", il).c_str()); - } - } - - // residual 2 - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - return embeddings; -} - -static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) { - const auto &model = 
ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - const int image_size_width = image_size; - const int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - - const int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - int num_tiles = 4; - int num_channels = 3; - if (imgs->data != nullptr) { - num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles; - num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels; - } - - struct ggml_init_params params = { - ctx->buf_compute_meta.size(), // mem_size - ctx->buf_compute_meta.data(), // mem_buffer - true, // no_alloc - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); - - struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size); - ggml_set_name(aspect_ratios, "aspect_ratios"); - ggml_set_input(aspect_ratios); - - if (model.pre_tile_position_embeddings != nullptr) { - struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios); - ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings"); - - pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.pre_tile_position_embeddings_gate != nullptr) { - pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate); - } - - inp = ggml_add(ctx0, inp, pre_tile_position_embeddings); - } - - struct ggml_tensor *embeddings = inp; - - if (model.class_embedding != nullptr) { - // concat class_embeddings and patch_embeddings - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - for (int i = 0; i < num_tiles; ++i) { - // repeat class embeddings for each tile - embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]); - } - - embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - if (model.position_embeddings_gate != nullptr) { - position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, position_embd); - - 
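// The gated-add pattern just applied to the position embeddings repeats
// throughout this graph builder for the pre-tile (above), tile, and post-tile
// position embeddings: each stream is multiplied by its learned gate (when
// present) before being accumulated. Minimal sketch of the recurring step,
// with `y` and `gate` standing in for the specific tensors used in each case:
//
//     if (gate != nullptr) {
//         y = ggml_mul_inplace(ctx0, y, gate);     // scale stream by its gate
//     }
//     embeddings = ggml_add(ctx0, embeddings, y);  // residual-style accumulate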
if (model.tile_position_embeddings != nullptr) { - struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios); - ggml_set_name(tile_position_embeddings, "tile_position_embeddings"); - - tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles); - if (model.tile_position_embeddings_gate != nullptr) { - tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings); - } - - // pre-layernorm - if (model.pre_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w); - if (model.pre_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b); - } - - ggml_set_name(embeddings, "pre layernorm"); - } - - const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8; - - embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0); - - std::vector intermediate_embeddings; - - // encoder - for (size_t il = 0; il < model.layers.size(); il++) { - if (hparams.intermediate_layers[il]) { - intermediate_embeddings.push_back(embeddings); - } - - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.layers[il], embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - // post-layernorm - if (model.post_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w); - if (model.post_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.post_ln_b); - } - - ggml_set_name(embeddings, "post layernorm"); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - - if (model.post_tile_position_embeddings != nullptr) { - struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios); - ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings"); - - post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.post_tile_position_embeddings_gate != nullptr) { - post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1); - - // global encoder - for (size_t il = 0; il < model.global_layers.size(); il++) { - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.global_layers[il], embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles); - for (size_t i = 0; i < intermediate_embeddings.size(); ++i) { - stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0); - } - - stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + 
num_padding_patches, num_tiles, batch_size); - stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0); - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0); - - // mllama projector - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b); - ggml_set_name(embeddings, "multi modal projector"); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) { - struct ggml_tensor *cur = ggml_get_tensor(ctx, name); - REQUIRE(cur != nullptr || optional); - return cur; -} - -static std::vector mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) { - std::vector layers(n); - for (size_t i = 0; i < layers.size(); i++) { - auto &layer = layers[i]; - layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false); - layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false); - layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false); - layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false); - - layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false); - layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true); - layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false); - layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true); - layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false); - layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true); - layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false); - layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true); - - layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false); - layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false); - layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false); - layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false); - - layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true); - layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true); - } - - return layers; -} - -// read and create ggml_context containing the tensors and their data -struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) { - struct ggml_context *meta = nullptr; - - struct gguf_init_params params = { - true, // no_alloc - &meta, // ctx - }; - - struct gguf_context *ctx = gguf_init_from_file(fname, params); - REQUIRE(ctx != nullptr); - - if (verbosity >= 1) { - const int n_tensors = gguf_get_n_tensors(ctx); - const int n_kv = gguf_get_n_kv(ctx); - const std::string ftype = get_ftype(get_u32(ctx, "general.file_type")); - const int idx_desc = get_key_index(ctx, "general.description"); - 
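// Comment added for clarity: two lookup styles are mixed in this logging
// block. get_key_index() REQUIREs the key, so "general.description" is
// mandatory, while optional metadata goes through gguf_find_key() and a -1
// check, as done for "general.name" just below:
//
//     const int idx = gguf_find_key(ctx, "general.name");  // -1 if absent
//     if (idx != -1) {
//         LOG("model name: %s", gguf_get_val_str(ctx, idx));
//     }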
const std::string description = gguf_get_val_str(ctx, idx_desc); - const int idx_name = gguf_find_key(ctx, "general.name"); - if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug - const std::string name = gguf_get_val_str(ctx, idx_name); - LOG("model name: %s", name.c_str()); - } - LOG("description: %s", description.c_str()); - LOG("GGUF version: %d", gguf_get_version(ctx)); - LOG("alignment: %zu", gguf_get_alignment(ctx)); - LOG("n_tensors: %d", n_tensors); - LOG("n_kv: %d", n_kv); - LOG("ftype: %s", ftype.c_str()); - LOG(""); - } - const int n_tensors = gguf_get_n_tensors(ctx); - - mllama_ctx *new_mllama = new mllama_ctx{}; - - ggml_backend_t backend = ggml_backend_init_best(); - if (backend == nullptr) { - LOG("%s: failed to initialize backend\n", __func__); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend)); - new_mllama->backend = backend; - - // load tensors - { - std::vector read_buf; - struct ggml_init_params params = { - (n_tensors + 1) * ggml_tensor_overhead(), // mem_size - nullptr, // mem_buffer - true, // no_alloc - }; - - new_mllama->ctx_data = ggml_init(params); - if (!new_mllama->ctx_data) { - LOG("ggml_init() failed"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - -#ifdef _WIN32 - int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); - if (!wlen) { - return NULL; - } - wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); - wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); - if (!wlen) { - free(wbuf); - return NULL; - } -#if __GLIBCXX__ - int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); - __gnu_cxx::stdio_filebuf buffer(fd, std::ios_base::in); - std::istream fin(&buffer); -#else // MSVC - // unused in our current build - auto fin = std::ifstream(wbuf, std::ios::binary); -#endif - free(wbuf); -#else - auto fin = std::ifstream(fname, std::ios::binary); -#endif - if (!fin) { - LOG("cannot open model file for loading tensors\n"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *t = ggml_get_tensor(meta, name); - struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t); - ggml_set_name(cur, name); - } - - // alloc memory and offload data - new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend); - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); - fin.seekg(offset, std::ios::beg); - if (!fin) { - LOG("failed to seek for tensor %s\n", name); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - -#if defined(_WIN32) && defined(__GLIBCXX__) - close(fd); -#else - fin.close(); -#endif - } - - // vision model - // 
load vision model - auto &vision_model = new_mllama->vision_model; - auto &hparams = vision_model.hparams; - hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length"); - hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count"); - hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length"); - hparams.n_layer = get_u32(ctx, "mllama.vision.block_count"); - hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count"); - hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles"); - hparams.image_size = get_u32(ctx, "mllama.vision.image_size"); - hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size"); - hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim"); - hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon"); - - std::vector intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices"); - hparams.intermediate_layers.resize(hparams.n_layer); - for (size_t i = 0; i < intermediate_layers_indices.size(); i++) { - hparams.intermediate_layers[intermediate_layers_indices[i]] = true; - } - - if (verbosity >= 2) { - LOG(""); - LOG("vision model hparams"); - LOG("image_size %d", hparams.image_size); - LOG("patch_size %d", hparams.patch_size); - LOG("v_hidden_size %d", hparams.hidden_size); - LOG("v_n_intermediate %d", hparams.n_intermediate); - LOG("v_projection_dim %d", hparams.projection_dim); - LOG("v_n_head %d", hparams.n_head); - LOG("v_n_layer %d", hparams.n_layer); - LOG("v_n_global_layer %d", hparams.n_global_layer); - LOG("v_eps %f", hparams.eps); - } - - vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true); - vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true); - - vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true); - vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true); - - vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true); - vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true); - vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true); - vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true); - - vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true); - vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true); - - vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true); - vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true); - - vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true); - vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true); - - vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false); - vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false); - - vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer); - vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", 
hparams.n_global_layer); - - ggml_free(meta); - - new_mllama->ctx_gguf = ctx; - - { - // measure mem requirement and allocate - new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend)); - struct mllama_image_batch batch; - batch.size = 1; - ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch); - ggml_gallocr_reserve(new_mllama->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0); - LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0); - } - - return new_mllama; -} - -struct mllama_image *mllama_image_init() { - return new mllama_image(); -} - -void mllama_image_free(struct mllama_image *img) { delete img; } -void mllama_image_batch_free(struct mllama_image_batch *batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; - } -} - -bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) { - img->width = width; - img->height = height; - img->num_channels = num_channels; - img->num_tiles = num_tiles; - img->aspect_ratio_id = aspect_ratio_id; - img->data.resize(n); - - memcpy(img->data.data(), data, n); - return true; -} - -inline int mllama(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); -} - -void mllama_free(mllama_ctx *ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); - delete ctx; -} - -bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) { - mllama_image_batch imgs{}; - imgs.size = 1; - imgs.data = img; - return mllama_image_batch_encode(ctx, n_threads, &imgs, vec); -} - -bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) { - int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - // build the inference graph - ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); - - // set inputs - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 
0 : 1); - - { - struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw)); - } - - { - struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings"); - if (embeddings != nullptr) { - void *zeros = malloc(ggml_nbytes(embeddings)); - memset(zeros, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings)); - free(zeros); - } - } - - { - struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions"); - if (positions != nullptr) { - int *positions_data = (int *)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - } - - { - struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios"); - if (aspect_ratios != nullptr) { - int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios)); - aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id; - ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios)); - free(aspect_ratios_data); - } - } - - if (ggml_backend_is_cpu(ctx->backend)) { - ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - // the last node is the embedding tensor - struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1); - - // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - - return true; -} - -int32_t mllama_image_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.image_size; -} - -int32_t mllama_patch_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.patch_size; -} - -int32_t mllama_hidden_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.hidden_size; -} - -int mllama_n_patches(const struct mllama_ctx *ctx) { - const auto &hparams = ctx->vision_model.hparams; - return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); -} - -int mllama_n_positions(const struct mllama_ctx *ctx) { - return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 
0 : 1); -} - -int mllama_n_tiles(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.n_tiles; -} - -int mllama_n_embd(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.projection_dim; -} - -size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) { - return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float); -} diff --git a/llama/mllama.h b/llama/mllama.h deleted file mode 100644 index 446dbb9ec..000000000 --- a/llama/mllama.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef MLLAMA_H -#define MLLAMA_H - -#include -#include - -#ifdef LLAMA_SHARED -#if defined(_WIN32) && !defined(__MINGW32__) -#ifdef LLAMA_BUILD -#define MLLAMA_API __declspec(dllexport) -#else -#define MLLAMA_API __declspec(dllimport) -#endif -#else -#define MLLAMA_API __attribute__((visibility("default"))) -#endif -#else -#define MLLAMA_API -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct mllama_ctx; - -struct mllama_image_batch { - struct mllama_image *data; - size_t size; -}; - -MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity); -MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity); - -MLLAMA_API void mllama_free(struct mllama_ctx *ctx); - -MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx); - -MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx); -MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx); - -MLLAMA_API struct mllama_image *mllama_image_init(); - -MLLAMA_API void mllama_image_free(struct mllama_image *img); -MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch); - -MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img); - -MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec); -MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec); - -#ifdef __cplusplus -} -#endif - -#endif // MLLAMA_H diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch index 8f5c3a779..edeeb4ffa 100644 --- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch +++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch @@ -24,7 +24,7 @@ problem. 
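// The hunks below only re-anchor patch 0001 against the newer vendored
// llama.cpp commit: the blob hashes on the `index` lines and the `@@` offsets
// move, while the payload (one added line in each backend's free function)
// stays the same. Judging from the surrounding context, the point of the
// patch is that each backend frees what it allocated in its own compiled
// module, so Windows builds never mix CRT heaps across DLL boundaries.
// Sketch of that invariant, with illustrative names:
//
//     static void example_buffer_free(ggml_backend_buffer_t buffer) {
//         example_ctx * ctx = (example_ctx *)buffer->context;
//         delete ctx;     // allocated by this backend
//         delete buffer;  // freed where it was allocated, too
//     }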
9 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index 273075f4..dd11f304 100644 +index b30b4cb3..0ce73a99 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -43,7 +43,7 @@ index 273075f4..dd11f304 100644 } static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { -@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_aligned_free(buffer->context, buffer->size); @@ -55,7 +55,7 @@ index 273075f4..dd11f304 100644 } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { -@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { +@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { }; static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { @@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644 /** diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index a7febef7..31750b6f 100644 +index b4b85abc..cb0d8528 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context { @@ -96,7 +96,7 @@ index a7febef7..31750b6f 100644 } static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { -@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context { +@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; @@ -104,7 +104,7 @@ index a7febef7..31750b6f 100644 } static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ +@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); @@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 266d8af4..12886cd3 100644 +index 576f9581..1b56f858 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) +@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); @@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp -index a0667b7d..bd83adc5 100644 +index 4f0abb5a..de1ec184 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp -@@ -468,6 +468,7 @@ static void 
ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); GGML_ASSERT(status); delete ctx; @@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp -index 1de34c96..4600f61e 100644 +index 0ea72994..ae3a3c33 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp -@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { +@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { ggml_sycl_set_device(ctx->device); delete ctx; @@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644 } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ -@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context { +@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; @@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644 } static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ +@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); @@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 39f3cd34..c569a8a5 100644 +index e2b357fd..68768029 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644 } static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe +@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index e51b43730..07aa4b0ea 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -10,10 +10,10 @@ logs instead of throwing an error 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp 
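// Patch 0002 is likewise re-anchored here (fresh index hashes, hunk offsets,
// and context lines). Per its commit message, an unknown BPE pre-tokenizer
// logs a warning and falls back to the default instead of throwing during
// model load. Sketch of the resulting branch (exact warning text assumed):
//
//     } else {
//         LLAMA_LOG_WARN("%s: unknown pre-tokenizer type '%s', using default\n",
//                        __func__, tokenizer_pre.c_str());
//         pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
//     }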
-index 48060517..a35b498c 100644 +index 9389ca80..806c1b3d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { +@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { if (type == LLAMA_VOCAB_TYPE_BPE) { add_space_prefix = false; clean_spaces = true; @@ -31,8 +31,8 @@ index 48060517..a35b498c 100644 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { - pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; +@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { + pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index c27dbd7b5..80d6b55e5 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -11,10 +11,10 @@ instead of forcing one or the error 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 983385f8..32f59819 100644 +index 62246c10..dca22d8b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_all = 0; // count outputs @@ -23,7 +23,7 @@ index 983385f8..32f59819 100644 for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } -@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} @@ -32,7 +32,7 @@ index 983385f8..32f59819 100644 auto * t_embd = cparams.embeddings ? 
res->get_embd() : nullptr; if (t_embd && res->get_embd_pooled()) { -@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 81d61a827..957109783 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode fixes loading vision models in llama.cpp on windows filesystems for paths that include wide characters --- - examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++ + tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) -diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp -index 75970615..d57b4bd6 100644 ---- a/examples/llava/clip.cpp -+++ b/examples/llava/clip.cpp -@@ -29,6 +29,19 @@ - #include - #include +diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp +index 41ba45a7..cdd8ca44 100644 +--- a/tools/mtmd/clip.cpp ++++ b/tools/mtmd/clip.cpp +@@ -31,6 +31,19 @@ + #include + #include +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN @@ -32,8 +32,8 @@ index 75970615..d57b4bd6 100644 + struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; - //#define CLIP_DEBUG_FUNCTIONS -@@ -1430,7 +1443,29 @@ struct clip_model_loader { + enum ffn_op_type { +@@ -2190,7 +2203,29 @@ struct clip_model_loader { { std::vector read_buf; @@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644 if (!fin) { throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); } -@@ -1457,7 +1492,11 @@ struct clip_model_loader { +@@ -2217,7 +2252,11 @@ struct clip_model_loader { ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); } } diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index 76ddc6197..deb53c225 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -15,10 +15,10 @@ adds support for the Solar Pro architecture 7 files changed, 248 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 62e1480b..f754bc8f 100644 +index f2bc8ca7..5ab3f572 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp -@@ -68,6 +68,7 @@ static const std::map LLM_ARCH_NAMES = { +@@ -69,6 +69,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, @@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644 { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, -@@ -140,6 +141,7 @@ static const std::map LLM_KV_NAMES = { +@@ -142,6 +143,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, @@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, -@@ -1482,6 +1484,24 @@ static const std::map> LLM_TENSOR_N +@@ -1502,6 +1504,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, @@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 
100644 { LLM_ARCH_WAVTOKENIZER_DEC, { -@@ -1660,6 +1680,7 @@ static const std::map LLM_TENSOR_INFOS = { +@@ -1680,6 +1700,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, @@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h -index 98ca00a1..439aaeab 100644 +index 41a023da..525c1b7d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h -@@ -72,6 +72,7 @@ enum llm_arch { +@@ -73,6 +73,7 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, @@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644 LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, -@@ -144,6 +145,7 @@ enum llm_kv { +@@ -146,6 +147,7 @@ enum llm_kv { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, @@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644 LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, -@@ -344,6 +346,7 @@ enum llm_tensor { +@@ -346,6 +348,7 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, @@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644 if (il < n_layer) { return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 80fcd65d..6e278945 100644 +index 7ee6a5b7..48dce407 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -55,6 +55,8 @@ struct llama_hparams { @@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644 uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; uint32_t n_lora_kv = 0; -@@ -153,6 +155,9 @@ struct llama_hparams { +@@ -154,6 +156,9 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_v_s() const; @@ -138,7 +138,7 @@ index 80fcd65d..6e278945 100644 }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index ea73a8a7..a012aeae 100644 +index 4cce5166..7f6617fa 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -439,6 +439,7 @@ namespace GGUFMeta { @@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 6b7bfecf..aba42819 100644 +index 3a4e72a3..831b68c0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; @@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), 
{n_embd, n_ff}, 0); -@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context { +@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context { } }; @@ -270,7 +270,7 @@ index 6b7bfecf..aba42819 100644 + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644 struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { ggml_tensor * cur; -@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { llm = std::make_unique(*this, params, gf); -@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_CHAMELEON: @@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644 return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h -index fd82d106..5865d5e9 100644 +index 6bdec263..43746c7d 100644 --- a/src/llama-model.h +++ b/src/llama-model.h -@@ -62,6 +62,7 @@ enum llm_type { +@@ -65,6 +65,7 @@ enum llm_type { LLM_TYPE_15B, LLM_TYPE_16B, LLM_TYPE_20B, + LLM_TYPE_22B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, - LLM_TYPE_34B, -@@ -307,6 +308,8 @@ struct llama_layer { +@@ -315,6 +316,8 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch deleted file mode 100644 index e5fa0462c..000000000 --- a/llama/patches/0006-add-mllama-support.patch +++ /dev/null @@ -1,1032 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 20 Apr 2025 16:12:36 -0700 -Subject: [PATCH] add mllama support - -adds support for the llama 3.2 vision architecture ---- - examples/llava/gemma3-cli.cpp | 3 +- - examples/llava/llava.cpp | 5 +- - examples/llava/mtmd.cpp | 6 +- - ggml/src/ggml-backend-reg.cpp | 6 +- - include/llama.h | 6 + - src/llama-arch.cpp | 44 +++++ - src/llama-arch.h | 10 ++ - src/llama-batch.cpp | 3 + - src/llama-context.cpp | 25 ++- - src/llama-context.h | 1 + - src/llama-cparams.h | 1 + - src/llama-graph.cpp | 25 +++ - src/llama-graph.h | 12 ++ - src/llama-hparams.cpp | 4 + - src/llama-hparams.h | 7 + - src/llama-kv-cache.cpp | 12 +- - src/llama-model-loader.cpp | 2 + - src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++- - src/llama-model.h | 12 ++ - src/llama-quant.cpp | 4 +- - 20 files changed, 475 insertions(+), 22 deletions(-) - -diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp -index 3d566475..654d1358 100644 ---- a/examples/llava/gemma3-cli.cpp -+++ b/examples/llava/gemma3-cli.cpp -@@ -106,7 +106,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; 
-- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -118,6 +118,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp -index 03a22cbb..5eb40bcd 100644 ---- a/examples/llava/llava.cpp -+++ b/examples/llava/llava.cpp -@@ -456,7 +456,7 @@ struct llava_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -468,6 +468,7 @@ struct llava_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ - n_eval = n_batch; - } - float * embd = image_embed->embed+i*n_embd; -- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); -+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); - if (llama_decode(ctx_llama, llava_batch.batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - return false; -diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp -index 3fd5bebc..f0cec596 100644 ---- a/examples/llava/mtmd.cpp -+++ b/examples/llava/mtmd.cpp -@@ -233,7 +233,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -245,6 +245,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, - - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); - float * embd = mtmd_get_output_embd(ctx); -- decode_embd_batch batch_img(embd, n_tokens, n_past, 0); -+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0); - int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_img.batch); - if (ret != 0) { -diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 405d8e31..82ae1b5b 100644 ---- a/ggml/src/ggml-backend-reg.cpp -+++ b/ggml/src/ggml-backend-reg.cpp -@@ -178,9 +178,9 @@ struct ggml_backend_registry { - #ifdef GGML_USE_CANN - register_backend(ggml_backend_cann_reg()); - #endif --#ifdef GGML_USE_BLAS -- register_backend(ggml_backend_blas_reg()); --#endif -+// #ifdef GGML_USE_BLAS -+// register_backend(ggml_backend_blas_reg()); -+// #endif - #ifdef GGML_USE_RPC - 
register_backend(ggml_backend_rpc_reg()); - #endif -diff --git a/include/llama.h b/include/llama.h -index 5657fbf0..f91896e4 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -255,6 +255,7 @@ extern "C" { - - llama_token * token; - float * embd; -+ int32_t n_embd; - llama_pos * pos; - int32_t * n_seq_id; - llama_seq_id ** seq_id; -@@ -357,6 +358,7 @@ extern "C" { - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings -+ bool cross_attn; // whether to use cross attention - - // Abort callback - // if it returns true, execution of llama_decode() will be aborted -@@ -458,6 +460,10 @@ extern "C" { - struct llama_context_params params), - "use llama_init_from_model instead"); - -+ // TODO (jmorganca): this should most likely be passed in as part of a batch -+ // and not set on the context for all batches. -+ LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); -+ - // Frees all allocated memory - LLAMA_API void llama_free(struct llama_context * ctx); - -diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index f754bc8f..0568565f 100644 ---- a/src/llama-arch.cpp -+++ b/src/llama-arch.cpp -@@ -6,6 +6,7 @@ - - static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, -+ { LLM_ARCH_MLLAMA, "mllama" }, - { LLM_ARCH_LLAMA4, "llama4" }, - { LLM_ARCH_DECI, "deci" }, - { LLM_ARCH_FALCON, "falcon" }, -@@ -142,6 +143,7 @@ static const std::map LLM_KV_NAMES = { - { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, - { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, -+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, - { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, - { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - -@@ -271,6 +273,40 @@ static const std::map> LLM_TENSOR_N - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, -+ { -+ LLM_ARCH_MLLAMA, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_OUTPUT, "output" }, -+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, -+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, -+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, -+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, -+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, -+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, -+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, -+ { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, -+ { 
LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, -+ { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, -+ }, -+ }, - { - LLM_ARCH_DECI, - { -@@ -1681,6 +1717,14 @@ static const std::map LLM_TENSOR_INFOS = { - // this tensor is loaded for T5, but never used - {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, - {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, - {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -diff --git a/src/llama-arch.h b/src/llama-arch.h -index 439aaeab..6a989034 100644 ---- a/src/llama-arch.h -+++ b/src/llama-arch.h -@@ -11,6 +11,7 @@ - enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_LLAMA4, -+ LLM_ARCH_MLLAMA, - LLM_ARCH_DECI, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, -@@ -146,6 +147,7 @@ enum llm_kv { - LLM_KV_ATTENTION_SLIDING_WINDOW, - LLM_KV_ATTENTION_SCALE, - LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, -+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, - LLM_KV_ATTENTION_KEY_LENGTH_MLA, - LLM_KV_ATTENTION_VALUE_LENGTH_MLA, - -@@ -347,6 +349,14 @@ enum llm_tensor { - LLM_TENSOR_CLS, - LLM_TENSOR_CLS_OUT, - LLM_TENSOR_BSKCN_TV, -+ LLM_TENSOR_CROSS_ATTN_K_NORM, -+ LLM_TENSOR_CROSS_ATTN_K_PROJ, -+ LLM_TENSOR_CROSS_ATTN_O_PROJ, -+ LLM_TENSOR_CROSS_ATTN_Q_NORM, -+ LLM_TENSOR_CROSS_ATTN_Q_PROJ, -+ LLM_TENSOR_CROSS_ATTN_V_PROJ, -+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, -+ LLM_TENSOR_CROSS_ATTN_MLP_GATE, - LLM_TENSOR_CONV1D, - LLM_TENSOR_CONVNEXT_DW, - LLM_TENSOR_CONVNEXT_NORM, -diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp -index 01d5ca57..8682b0e6 100644 ---- a/src/llama-batch.cpp -+++ b/src/llama-batch.cpp -@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one( - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - /*n_tokens =*/ 0, - /*tokens =*/ nullptr, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - - if (embd) { - batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); -+ batch.n_embd = embd; - } else { - batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); - } -diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 32f59819..0343ba8a 100644 ---- a/src/llama-context.cpp -+++ b/src/llama-context.cpp -@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) { - throw std::runtime_error(format("corrupt output buffer 
(j=%d, n_outputs=%d)", j, n_outputs)); - } - -- return logits + j*model.vocab.n_tokens(); -+ return logits + j*model.hparams.n_vocab; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); - #ifndef NDEBUG -@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) { - cparams.warmup = value; - } - -+void llama_context::set_cross_attn(bool value) { -+ cparams.cross_attn = value; -+} -+ - void llama_context::set_adapter_lora( - llama_adapter_lora * adapter, - float scale) { -@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) { - - const int64_t n_embd = hparams.n_embd; - -- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); -+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - -@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) { - - const llama_batch & batch = batch_allocr.batch; - -- const auto & vocab = model.vocab; - const auto & hparams = model.hparams; - -- const int32_t n_vocab = vocab.n_tokens(); -+ const int32_t n_vocab = hparams.n_vocab; - - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; -@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) { - - const bool logits_all = n_outputs_all == n_tokens_all; - -- sbatch.from_batch(batch, n_embd, -+ sbatch.from_batch(batch, batch.n_embd, - /* simple_split */ !kv_self->recurrent, - /* logits_all */ logits_all); - -@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) { - - int32_t llama_context::output_reserve(int32_t n_outputs) { - const auto & hparams = model.hparams; -- const auto & vocab = model.vocab; - - const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - - const auto n_batch = cparams.n_batch; -- const auto n_vocab = vocab.n_tokens(); -+ const auto n_vocab = hparams.n_vocab; - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead -@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { - void llama_context::output_reorder() { - auto & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { -- const uint32_t n_vocab = model.vocab.n_tokens(); -+ const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { - { - LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - -- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); -+ const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); - - io.write(&logits_size, sizeof(logits_size)); - -@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() { - /*.offload_kqv =*/ true, - /*.flash_attn =*/ false, - /*.no_perf =*/ true, -+ /*.cross_attn =*/ false, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, - }; -@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { - ctx->set_warmup(warmup); - } - -+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { -+ ctx->set_cross_attn(cross_attention); -+} -+ - void llama_synchronize(llama_context * ctx) { - ctx->synchronize(); - } -diff --git a/src/llama-context.h 
b/src/llama-context.h -index 04facb54..baa03276 100644 ---- a/src/llama-context.h -+++ b/src/llama-context.h -@@ -65,6 +65,7 @@ struct llama_context { - void set_embeddings (bool value); - void set_causal_attn(bool value); - void set_warmup(bool value); -+ void set_cross_attn(bool value); - - void set_adapter_lora( - llama_adapter_lora * adapter, -diff --git a/src/llama-cparams.h b/src/llama-cparams.h -index 30e550f0..85ad91b9 100644 ---- a/src/llama-cparams.h -+++ b/src/llama-cparams.h -@@ -29,6 +29,7 @@ struct llama_cparams { - bool offload_kqv; - bool flash_attn; - bool no_perf; -+ bool cross_attn; - bool warmup; - - enum llama_pooling_type pooling_type; -diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index a85e9728..d740c120 100644 ---- a/src/llama-graph.cpp -+++ b/src/llama-graph.cpp -@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { - } - } - -+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { -+ if (ubatch->embd) { -+ ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); -+ } -+} -+ - // - // llm_graph_context - // -@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { - return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); - } - -+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { -+ const int64_t n_embd = hparams.n_embd; -+ -+ auto inp = std::make_unique(); -+ -+ ggml_tensor * cur = nullptr; -+ -+ inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); -+ ggml_set_input(inp->cross_attn_state); -+ -+ cur = inp->cross_attn_state; -+ -+ cb(cur, "inp_cross_attn_state", -1); -+ -+ res->add_input(std::move(inp)); -+ -+ return cur; -+} -+ - ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, -diff --git a/src/llama-graph.h b/src/llama-graph.h -index d192dc14..260a2af2 100644 ---- a/src/llama-graph.h -+++ b/src/llama-graph.h -@@ -86,6 +86,7 @@ public: - - ggml_tensor * tokens = nullptr; // I32 [n_batch] - ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] - }; - - class llm_graph_input_pos : public llm_graph_input_i { -@@ -285,6 +286,16 @@ public: - const llama_cross * cross = nullptr; - }; - -+class llm_graph_input_cross_attn_state : public llm_graph_input_i { -+public: -+ llm_graph_input_cross_attn_state() = default; -+ virtual ~llm_graph_input_cross_attn_state() = default; -+ -+ void set_input(const llama_ubatch * ubatch) override; -+ -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -+}; -+ - // - // llm_graph_result - // -@@ -493,6 +504,7 @@ struct llm_graph_context { - ggml_tensor * build_inp_cls() const; - ggml_tensor * build_inp_s_copy() const; - ggml_tensor * build_inp_s_mask() const; -+ ggml_tensor * build_inp_cross_attn_state() const; - - ggml_tensor * build_inp_cross_embd() const; - ggml_tensor * build_inp_pos_bucket_enc() const; -diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index 8a667960..6a02de03 100644 ---- a/src/llama-hparams.cpp -+++ b/src/llama-hparams.cpp -@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const { - - GGML_ABORT("fatal error"); - } -+ -+bool llama_hparams::cross_attention_layers(uint32_t il) const { -+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -+} -diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 
6e278945..c8a34d52 100644 ---- a/src/llama-hparams.h -+++ b/src/llama-hparams.h -@@ -2,6 +2,8 @@ - - #include "llama.h" - -+#include -+ - #include - - // bump if necessary -@@ -42,6 +44,7 @@ struct llama_hparams { - uint32_t n_expert = 0; - uint32_t n_expert_used = 0; - uint32_t n_rel_attn_bkts = 0; -+ uint32_t n_vocab = 0; - - // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - uint32_t n_embd_head_k_mla = 0; -@@ -56,6 +59,7 @@ struct llama_hparams { - std::array n_ff_arr; - - std::array, 4> n_bskcn_arr = {}; -+ std::array cross_attn_layers; - - uint32_t n_layer_dense_lead = 0; - uint32_t n_lora_q = 0; -@@ -158,6 +162,9 @@ struct llama_hparams { - // Block skip connection - bool n_bskcn(uint32_t n, uint32_t il) const; - -+ // cross attention layers -+ bool cross_attention_layers(uint32_t il) const; -+ - bool is_swa(uint32_t il) const; - }; - -diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index 7c9d46d8..69f8d35a 100644 ---- a/src/llama-kv-cache.cpp -+++ b/src/llama-kv-cache.cpp -@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init( - return false; - } - -- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); -- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ ggml_tensor * k, *v; -+ -+ // for cross attention layers -+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { -+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); -+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); -+ } else { -+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); -+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ } - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - k_l.push_back(k); -diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index a012aeae..2e11507d 100644 ---- a/src/llama-model-loader.cpp -+++ b/src/llama-model-loader.cpp -@@ -315,6 +315,8 @@ namespace GGUFMeta { - return true; - } - -+ template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); -+ - template - bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { - const int kid = gguf_find_key(meta.get(), key.c_str()); -diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index aba42819..d051696c 100644 ---- a/src/llama-model.cpp -+++ b/src/llama-model.cpp -@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); - - // everything past this point is not vocab-related - if (hparams.vocab_only) { -@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); - - if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { - ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { - std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); - std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); - std::fill(hparams.n_ff_arr.begin(), 
hparams.n_ff_arr.end(), 0); -+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); - - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); -+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); - - // n_head_kv is optional, default to n_head - hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - -- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { -+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { - if (hparams.n_rot != hparams.n_embd_head_k) { - throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); - } -@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { - hparams.use_kq_norm = false; - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -+ -+ switch (hparams.n_layer) { -+ case 40: type = LLM_TYPE_11B; break; -+ case 100: type = LLM_TYPE_90B; break; -+ default: type = LLM_TYPE_UNKNOWN; -+ } -+ } break; - case LLM_ARCH_DECI: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_ff = hparams.n_ff(); - const int64_t n_embd_gqa = n_embd_v_gqa; -- const int64_t n_vocab = vocab.n_tokens(); -+ const int64_t n_vocab = hparams.n_vocab; - const int64_t n_token_types = vocab.n_token_types(); - const int64_t n_rot = hparams.n_rot; - const int64_t n_expert = hparams.n_expert; -@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - } - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); -+ -+ // output -+ { -+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); -+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); -+ -+ // if output is NULL, init from the input tok embed -+ if (output == NULL) { -+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); -+ } -+ } -+ -+ for (int i = 0; i < n_layer; ++i) { -+ auto & layer = layers[i]; -+ -+ if (hparams.cross_attention_layers(i)) { -+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); -+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); -+ 
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ } else { -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); -+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); -+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); -+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ } -+ } -+ } break; - case LLM_ARCH_DECI: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context { - } - }; - -+struct llm_build_mllama: public llm_graph_context { -+ llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { -+ // mutable variable, needed during the last layer of the computation to skip unused tokens -+ int32_t n_tokens = this->n_tokens; -+ -+ const int64_t n_embd_head = hparams.n_embd_head_v; -+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); -+ GGML_ASSERT(n_embd_head == hparams.n_rot); -+ -+ ggml_tensor * cur; -+ ggml_tensor * inpL; -+ ggml_tensor * inpCAS; -+ -+ inpL = build_inp_embd(model.tok_embd); -+ inpCAS = build_inp_cross_attn_state(); -+ -+ // inp_pos - contains the positions -+ ggml_tensor * inp_pos = build_inp_pos(); -+ -+ auto * inp_attn = build_attn_inp_kv_unified(); -+ const llama_kv_cache_unified * kv_self = static_cast(memory); -+ -+ for (int il = 0; il < n_layer; ++il) { -+ ggml_tensor * inpSA = inpL; -+ -+ // norm -+ cur = build_norm(inpL, -+ model.layers[il].attn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "attn_norm", il); -+ -+ if (hparams.cross_attention_layers(il)) { -+ if (!ubatch.embd && !cparams.cross_attn) { -+ continue; -+ } -+ -+ // cross attention layer -+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); -+ cb(Qcur, "Qcur", il); -+ -+ ggml_tensor * Kcur, * Vcur; -+ if (ubatch.embd) { -+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); -+ cb(Kcur, "Kcur", il); -+ 
-+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); -+ cb(Kcur, "Kcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); -+ -+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); -+ cb(Vcur, "Vcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); -+ } else { -+ Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); -+ cb(Kcur, "Kcur (view)", il); -+ -+ Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); -+ cb(Vcur, "Vcur (view)", il); -+ } -+ -+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); -+ cb(kq, "kq", il); -+ -+ // TODO: apply causal masks -+ struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); -+ cb(kq_soft_max, "kq_soft_max", il); -+ -+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); -+ cb(Vcur, "Vcur", il); -+ -+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); -+ cb(kqv, "kqv", il); -+ -+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); -+ cb(kqv_merged, "kqv_merged", il); -+ -+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); -+ cb(cur, "kqv_merged_cont", il); -+ -+ cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); -+ cb(cur, "cur", il); -+ -+ // TODO: do this in place once? -+ cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ // TODO: do this inplace once? 
-+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } else { -+ // self attention layer -+ -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); -+ -+ // compute Q and K and RoPE them -+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -+ cb(Qcur, "Qcur", il); -+ if (model.layers[il].bq) { -+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); -+ cb(Qcur, "Qcur", il); -+ } -+ -+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); -+ cb(Kcur, "Kcur", il); -+ if (model.layers[il].bk) { -+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); -+ cb(Kcur, "Kcur", il); -+ } -+ -+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); -+ cb(Vcur, "Vcur", il); -+ if (model.layers[il].bv) { -+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); -+ cb(Vcur, "Vcur", il); -+ } -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); -+ -+ Qcur = ggml_rope_ext( -+ ctx0, Qcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ Kcur = ggml_rope_ext( -+ ctx0, Kcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ cb(Qcur, "Qcur", il); -+ cb(Kcur, "Kcur", il); -+ cb(Vcur, "Vcur", il); -+ -+ cur = build_attn(inp_attn, gf, -+ model.layers[il].wo, model.layers[il].bo, -+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); -+ -+ if (il == n_layer - 1) { -+ // skip computing output for unused tokens -+ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); -+ n_tokens = n_outputs; -+ cur = ggml_get_rows(ctx0, cur, inp_out_ids); -+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); -+ } -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ cur = ggml_add(ctx0, cur, ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } -+ } -+ -+ cur = inpL; -+ -+ cur = build_norm(cur, -+ model.output_norm, NULL, -+ LLM_NORM_RMS, -1); -+ cb(cur, "result_norm", -1); -+ res->t_embd = cur; -+ -+ // lm_head -+ cur = build_lora_mm(model.output, cur); -+ -+ cb(cur, "result_output", -1); -+ res->t_logits = cur; -+ -+ ggml_build_forward_expand(gf, cur); -+ } -+}; -+ - struct llm_build_deci : public llm_graph_context { - llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph( - { - llm 
= std::make_unique(*this, params, gf); - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ llm = std::make_unique(*this, params, gf); -+ } break; - case LLM_ARCH_DECI: - { - llm = std::make_unique(*this, params, gf); -@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { - // use what we call a normal RoPE, operating on pairs of consecutive head values - case LLM_ARCH_LLAMA: - case LLM_ARCH_LLAMA4: -+ case LLM_ARCH_MLLAMA: - case LLM_ARCH_DECI: - case LLM_ARCH_BAICHUAN: - case LLM_ARCH_STARCODER: -diff --git a/src/llama-model.h b/src/llama-model.h -index 5865d5e9..72bab5be 100644 ---- a/src/llama-model.h -+++ b/src/llama-model.h -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - - struct llama_cparams; - struct llama_ubatch; -@@ -70,6 +71,7 @@ enum llm_type { - LLM_TYPE_40B, - LLM_TYPE_65B, - LLM_TYPE_70B, -+ LLM_TYPE_90B, - LLM_TYPE_236B, - LLM_TYPE_314B, - LLM_TYPE_671B, -@@ -310,6 +312,16 @@ struct llama_layer { - - struct ggml_tensor * bskcn_tv = nullptr; - -+ // cross attention -+ struct ggml_tensor * cross_attn_k_norm = nullptr; -+ struct ggml_tensor * cross_attn_k_proj = nullptr; -+ struct ggml_tensor * cross_attn_o_proj = nullptr; -+ struct ggml_tensor * cross_attn_q_norm = nullptr; -+ struct ggml_tensor * cross_attn_q_proj = nullptr; -+ struct ggml_tensor * cross_attn_v_proj = nullptr; -+ struct ggml_tensor * cross_attn_attn_gate = nullptr; -+ struct ggml_tensor * cross_attn_mlp_gate = nullptr; -+ - struct llama_layer_posnet posnet; - - struct llama_layer_convnext convnext; -diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 7dc54227..223e1f3f 100644 ---- a/src/llama-quant.cpp -+++ b/src/llama-quant.cpp -@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: - if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; - } -- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); -+ if (qs.n_attention_wv != n_attn_layer) { -+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); -+ } - } - - size_t total_size_org = 0; diff --git a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0006-fix-deepseek-deseret-regex.patch similarity index 97% rename from llama/patches/0008-fix-deepseek-deseret-regex.patch rename to llama/patches/0006-fix-deepseek-deseret-regex.patch index 9b2d33984..ff4b57577 100644 --- a/llama/patches/0008-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0006-fix-deepseek-deseret-regex.patch @@ -12,10 +12,10 @@ regex 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index a35b498c..032019c9 100644 +index 806c1b3d..10f34d33 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { +@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: regex_exprs = { "[\r\n]", diff --git a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch deleted file mode 100644 index 116545d67..000000000 --- a/llama/patches/0007-add-unpad-operator.patch +++ /dev/null @@ -1,419 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 13 Apr 2025 22:10:06 -0400 -Subject: [PATCH] add unpad operator - -adds the unpad operator to GGML ---- - ggml/include/ggml.h | 10 +++++ - ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ - 
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++ - ggml/src/ggml-cpu/ops.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ - ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++ - ggml/src/ggml-cuda/pad.cuh | 1 + - ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++ - ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++ - ggml/src/ggml.c | 25 ++++++++++++- - 10 files changed, 223 insertions(+), 2 deletions(-) - -diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index 8fcc16df..d19fc167 100644 ---- a/ggml/include/ggml.h -+++ b/ggml/include/ggml.h -@@ -488,6 +488,7 @@ extern "C" { - GGML_OP_UPSCALE, // nearest interpolate - GGML_OP_PAD, - GGML_OP_PAD_REFLECT_1D, -+ GGML_OP_UNPAD, - GGML_OP_ARANGE, - GGML_OP_TIMESTEP_EMBEDDING, - GGML_OP_ARGSORT, -@@ -1757,6 +1758,15 @@ extern "C" { - int p0, - int p1); - -+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] -+ GGML_API struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, -+ int p1, -+ int p2, -+ int p3); -+ - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 - // timesteps: [N,] - // return: [N, dim] -diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 50400328..432942bf 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.c -+++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm - { - ggml_compute_forward_pad_reflect_1d(params, tensor); - } break; -+ case GGML_OP_UNPAD: -+ { -+ ggml_compute_forward_unpad(params, tensor); -+ } break; - case GGML_OP_ARANGE: - { - ggml_compute_forward_arange(params, tensor); -@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: -diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 6050147b..66b8da68 100644 ---- a/ggml/src/ggml-cpu/ops.cpp -+++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d( - } - } - -+// ggml_compute_forward_unpad -+ -+static void ggml_compute_forward_unpad_f32( -+ const struct ggml_compute_params *params, -+ struct ggml_tensor *dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ GGML_ASSERT(src0->nb[0] == sizeof(float)); -+ GGML_ASSERT( dst->nb[0] == sizeof(float)); -+ -+ const int ith = params->ith; -+ const int nth = params->nth; -+ -+ GGML_TENSOR_UNARY_OP_LOCALS -+ -+ float * dst_ptr = (float *) dst->data; -+ -+ // TODO: optimize -+ -+ for (int64_t i2 = 0; i2 < ne2; ++i2) { -+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) { -+ for (int64_t i0 = 0; i0 < ne0; ++i0) { -+ for (int64_t i3 = 0; i3 < ne3; ++i3) { -+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; -+ -+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); -+ -+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ dst_ptr[dst_idx] = *src_ptr; -+ } -+ } -+ } -+ } -+ } -+} -+ -+void ggml_compute_forward_unpad( -+ const struct ggml_compute_params * params, -+ struct ggml_tensor * dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ switch (src0->type) { -+ case GGML_TYPE_F32: -+ { -+ ggml_compute_forward_unpad_f32(params, dst); -+ } break; -+ default: -+ { -+ GGML_ABORT("fatal error"); -+ } -+ } -+} -+ - 
// ggml_compute_forward_arange - - static void ggml_compute_forward_arange_f32( -diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h -index 410a3720..3eca1cf8 100644 ---- a/ggml/src/ggml-cpu/ops.h -+++ b/ggml/src/ggml-cpu/ops.h -@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params - void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); -+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 31750b6f..0fef9522 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg - case GGML_OP_PAD: - ggml_cuda_op_pad(ctx, dst); - break; -+ case GGML_OP_UNPAD: -+ ggml_cuda_op_unpad(ctx, dst); -+ break; - case GGML_OP_ARANGE: - ggml_cuda_op_arange(ctx, dst); - break; -@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g - case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; - case GGML_OP_PAD: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: -diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu -index 77432b04..7d45a7e1 100644 ---- a/ggml/src/ggml-cuda/pad.cu -+++ b/ggml/src/ggml-cuda/pad.cu -@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); - } -+ -+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { -+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 -+ // blockIdx.y: idx of ne1 -+ // blockIDx.x: idx of ne0 / BLOCK_SIZE -+ int nidx = threadIdx.x + blockIdx.x * blockDim.x; -+ if (nidx >= ne0) { -+ return; -+ } -+ -+ // operation -+ int offset_dst = -+ nidx + -+ blockIdx.y * ne0 + -+ blockIdx.z * ne0 * gridDim.y; -+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { -+ int offset_src = -+ nidx + -+ blockIdx.y * ne00 + -+ blockIdx.z * ne00 * ne01; -+ dst[offset_dst] = x[offset_src]; -+ } -+} -+ -+static void unpad_f32_cuda(const float * x, float * dst, -+ const int ne00, const int ne01, const int ne02, const int ne03, -+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { -+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; -+ dim3 gridDim(num_blocks, ne1, ne2*ne3); -+ unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -+} -+ -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { -+ const ggml_tensor * src0 = dst->src[0]; -+ const float * src0_d = (const float *)src0->data; -+ float * dst_d = 
(float *)dst->data; -+ cudaStream_t stream = ctx.stream(); -+ -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ GGML_ASSERT(dst->type == GGML_TYPE_F32); -+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors -+ -+ unpad_f32_cuda(src0_d, dst_d, -+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], -+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -+} -\ No newline at end of file -diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh -index 8fd386b0..e2ededc3 100644 ---- a/ggml/src/ggml-cuda/pad.cuh -+++ b/ggml/src/ggml-cuda/pad.cuh -@@ -3,3 +3,4 @@ - #define CUDA_PAD_BLOCK_SIZE 256 - - void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 12886cd3..b2e95a66 100644 ---- a/ggml/src/ggml-metal/ggml-metal.m -+++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte - GGML_METAL_KERNEL_TYPE_UPSCALE_F32, - GGML_METAL_KERNEL_TYPE_PAD_F32, - GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, -+ GGML_METAL_KERNEL_TYPE_UNPAD_F32, - GGML_METAL_KERNEL_TYPE_ARANGE_F32, - GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, - GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); -+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex - case GGML_OP_POOL_2D: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: - case GGML_OP_LEAKY_RELU: -@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node( - - const int nth = MIN(1024, ne0); - -+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; -+ } break; -+ case GGML_OP_UNPAD: -+ { -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ -+ id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; -+ -+ [encoder setComputePipelineState:pipeline]; -+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; -+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; -+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; -+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; -+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; -+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; -+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; -+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; -+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; -+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; -+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; -+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; -+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; -+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; -+ 
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; -+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; -+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; -+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; -+ -+ const int nth = MIN(1024, ne0); -+ - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARANGE: -diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 8d6e99e6..71f0f97f 100644 ---- a/ggml/src/ggml-metal/ggml-metal.metal -+++ b/ggml/src/ggml-metal/ggml-metal.metal -@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( - } - } - -+kernel void kernel_unpad_f32( -+ device const char * src0, -+ device char * dst, -+ constant int64_t & ne00, -+ constant int64_t & ne01, -+ constant int64_t & ne02, -+ constant int64_t & ne03, -+ constant uint64_t & nb00, -+ constant uint64_t & nb01, -+ constant uint64_t & nb02, -+ constant uint64_t & nb03, -+ constant int64_t & ne0, -+ constant int64_t & ne1, -+ constant int64_t & ne2, -+ constant int64_t & ne3, -+ constant uint64_t & nb0, -+ constant uint64_t & nb1, -+ constant uint64_t & nb2, -+ constant uint64_t & nb3, -+ uint3 tgpig[[threadgroup_position_in_grid]], -+ uint3 tpitg[[thread_position_in_threadgroup]], -+ uint3 ntg[[threads_per_threadgroup]]) { -+ -+ const int64_t i3 = tgpig.z; -+ const int64_t i2 = tgpig.y; -+ const int64_t i1 = tgpig.x; -+ -+ const int64_t i03 = i3; -+ const int64_t i02 = i2; -+ const int64_t i01 = i1; -+ -+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); -+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); -+ -+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { -+ if (i0 < ne00) { -+ dst_ptr[i0] = src0_ptr[i0]; -+ } -+ } -+ -+ return; -+ } -+} -+ - kernel void kernel_arange_f32( - device char * dst, - constant ggml_metal_kargs_arange & args, -diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 950772c7..2276b631 100644 ---- a/ggml/src/ggml.c -+++ b/ggml/src/ggml.c -@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "UPSCALE", - "PAD", - "PAD_REFLECT_1D", -+ "UNPAD", - "ARANGE", - "TIMESTEP_EMBEDDING", - "ARGSORT", -@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "OPT_STEP_ADAMW", - }; - --static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); - - static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "none", -@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "upscale(x)", - "pad(x)", - "pad_reflect_1d(x)", -+ "unpad(x)", - "arange(start, stop, step)", - "timestep_embedding(timesteps, dim, max_period)", - "argsort(x)", -@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "adamw(x)", - }; - --static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); - - static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); - -@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( - return result; - } - -+// ggml_unpad -+ -+struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, int p1, int p2, int p3) { -+ -+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, -+ a->ne[0] - p0, -+ a->ne[1] - p1, -+ a->ne[2] - p2, -+ a->ne[3] - p3); -+ -+ result->op = GGML_OP_UNPAD; -+ 
result->src[0] = a;
-+
-+    return result;
-+}
-+
- // ggml_arange
- 
- struct ggml_tensor * ggml_arange(
diff --git a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch
similarity index 93%
rename from llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
rename to llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch
index 5504b1d31..4c2192887 100644
--- a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
+++ b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index 90679822..56043678 100644
+index 5b3059c2..656b3eca 100644
 --- a/common/json-schema-to-grammar.cpp
 +++ b/common/json-schema-to-grammar.cpp
-@@ -346,7 +346,7 @@ private:
+@@ -349,7 +349,7 @@ private:
      friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
      std::function<json(const std::string &)> _fetch_json;
      bool _dotall;
diff --git a/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch
new file mode 100644
index 000000000..52116ce3f
--- /dev/null
+++ b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch
@@ -0,0 +1,352 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca
+Date: Tue, 15 Apr 2025 14:27:40 -0400
+Subject: [PATCH] ensure KV cache is fully defragmented
+
+Sometimes the KV cache requires defragmentation even without
+triggering the threshold heuristic. In this case, decoding
+will not be able to find a KV cache slot. This is particularly
+difficult for the caller to handle if it happens in between
+ubatches. To avoid this, we should immediately trigger a defrag.
+
+In addition, a heavily fragmented cache can require more than
+max_moves to defragment. Currently, we stop when we hit the limit,
+but this can leave a cache that still does not have adequate space
+even after defragmentation is triggered. Instead, we should do
+multiple batches of processing until everything is complete.
+---
+ src/llama-context.cpp  |  18 ++++---
+ src/llama-context.h    |   1 +
+ src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
+ src/llama-kv-cache.h   |  12 ++++-
+ 4 files changed, 59 insertions(+), 79 deletions(-)
+
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+         // TODO: not sure if this is needed
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            GGML_ABORT("TODO: handle this error");
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                GGML_ABORT("TODO: handle this error");
++            }
+         }
+ 
+         auto * gf = graph_init();
+diff --git a/src/llama-context.h b/src/llama-context.h
+index c0ceacb1..0264e937 100644
+--- a/src/llama-context.h
++++ b/src/llama-context.h
+@@ -5,6 +5,7 @@
+ #include "llama-cparams.h"
+ #include "llama-graph.h"
+ #include "llama-adapter.h"
++#include "llama-kv-cache.h"
+ 
+ #include "ggml-cpp.h"
+ #include "ggml-opt.h"
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index 3dcad65b..60e67b03 100644
+--- a/src/llama-kv-cache.cpp
++++ b/src/llama-kv-cache.cpp
+@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
+ }
+ 
+ bool llama_kv_cache_unified::update(llama_context & lctx) {
+-    bool need_reserve = false;
+-
+     auto * sched = lctx.get_sched();
+ 
+     if (has_shift) {
+@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
+         res->set_inputs(nullptr);
+ 
+         lctx.graph_compute(gf, false);
+-
+-        need_reserve = true;
+     }
+ 
+     {
+@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
+ 
+     if (do_defrag) {
+         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
++        const uint32_t n_max_nodes = lctx.graph_max_nodes();
++        const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
++        if (!defrag_prepare(n_max_nodes)) {
++            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
++            return false;
++        }
++
++        for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
++            std::vector<llama_kv_defrag_move> chunk;
++            auto end = std::min(i + max_moves, defrag_info.moves.size());
++            chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
+ 
+-        if (defrag_prepare(lctx.graph_max_nodes())) {
+             ggml_backend_sched_reset(sched);
+ 
+             auto * gf = lctx.graph_init();
+ 
+-            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
++            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
+ 
+             ggml_backend_sched_alloc_graph(sched, gf);
+ 
+             res->set_inputs(nullptr);
+ 
+             lctx.graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         do_defrag = false;
+     }
+ 
+-    return need_reserve;
++    // we never need to reserve a worst case graph
++    return false;
+ }
+ 
+ void llama_kv_cache_unified::defrag_sched(float thold) {
+@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
+         const llama_cparams & cparams,
+                ggml_context * ctx,
+-               ggml_cgraph * gf) const {
++               ggml_cgraph * gf,
++               const std::vector<llama_kv_defrag_move> & moves) const {
+     auto res = std::make_unique<llm_graph_result>();
+ 
+-    const auto & ids = defrag_info.ids;
+-
+ #if 0
+     // CPU defrag
+     //
+@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
+         ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
+     }
+ #else
+-    for (uint32_t i = 0; i < ids.size(); ++i) {
+-        const uint32_t id = ids[i];
+-
+-        if (i == id || id == ids.size()) {
+-            continue;
+-        }
+-
+-        uint32_t nm = 1;
+-
+-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+-            nm++;
+-        }
+-
++    for (const auto & move : moves) {
+         for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
+             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+             const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ 
+             ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
+-                    n_embd_k_gqa, nm,
++                    n_embd_k_gqa, move.len,
+                     ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
++                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
+ 
+             ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
+-                    n_embd_k_gqa, nm,
++                    n_embd_k_gqa, move.len,
+                     ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
++                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
+ 
+             ggml_tensor * view_v_src;
+             ggml_tensor * view_v_dst;
+ 
+             if (cparams.flash_attn) {
+                 // NOTE: the V cache is not transposed when using flash attention
+                 view_v_src = ggml_view_2d(ctx, v_l[il],
+-                        n_embd_v_gqa, nm,
++                        n_embd_v_gqa, move.len,
+                         ggml_row_size(v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
++                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
+ 
+                 view_v_dst = ggml_view_2d(ctx, v_l[il],
+-                        n_embd_v_gqa, nm,
++                        move.len, n_embd_v_gqa,
+                         ggml_row_size(v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
++                        ggml_row_size(v_l[il]->type, move.src));
+             } else {
+                 view_v_src = ggml_view_2d(ctx, v_l[il],
+-                        nm, n_embd_v_gqa,
++                        move.len, n_embd_v_gqa,
+                         ggml_row_size(v_l[il]->type, size),
+-                        ggml_row_size(v_l[il]->type, i));
++                        ggml_row_size(v_l[il]->type, move.src));
+ 
+                 view_v_dst = ggml_view_2d(ctx, v_l[il],
+-                        nm, n_embd_v_gqa,
++                        move.len, n_embd_v_gqa,
+                         ggml_row_size(v_l[il]->type, size),
+-                        ggml_row_size(v_l[il]->type, id));
++                        ggml_row_size(v_l[il]->type, move.dst));
+             }
+ 
+             ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
+             ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
+         }
+-
+-        i += nm - 1;
+     }
+ 
+     //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+ 
+     assert(n_used <= n_kv);
+ 
+-    //const int64_t t_start = ggml_time_us();
+-
+-    // number of cells moved
+-    uint32_t n_moves = 0;
+-
+-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
+-    //   - source view, destination view, copy operation
+-    //   - x2 for keys and values
+-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
+-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+-    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
++    defrag_info.moves.clear();
+ 
+     // determine which KV cells to move where
+     //
+@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+     //
+     // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+     //
+-    auto & ids = defrag_info.ids;
+-
+-    ids.clear();
+-    ids.resize(n_kv, n_kv);
++    std::vector<uint32_t> ids(n_kv, n_kv);
+ 
+     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+         const auto & cell0 = cells[i0];
+@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+         // are we moving a continuous block of memory?
+         bool cont = false;
+ 
+-        // should we stop searching for the next move?
+-        bool stop = false;
+-
+         // go back and move the nf cells to the hole
+         for (; i1 < n_kv; ++i1) {
+             auto & cell1 = cells[i1];
+ 
+             if (cell1.is_empty() || ids[i1] != n_kv) {
+-                if (n_moves == max_moves) {
+-                    stop = true;
+-                    break;
+-                }
+-
+                 cont = false;
+                 continue;
+             }
+@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+             head = n_used;
+ 
+             if (!cont) {
+-                n_moves++;
++                defrag_info.moves.push_back({i1, i0 + nf, 1});
+                 cont = true;
++            } else {
++                defrag_info.moves.back().len++;
+             }
+ 
+             nf++;
+@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+             }
+         }
+ 
+-        if (stop || n_moves == max_moves) {
+-            break;
+-        }
+-
+         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+ 
+         i0 += nh - 1;
+     }
+ 
+-    if (n_moves == 0) {
++    if (defrag_info.moves.size() == 0) {
+         return false;
+     }
+ 
+-    LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
+-
+-    LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
++    // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+ 
+     return true;
+ }
+diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
+index bf3b4b6a..928b9712 100644
+--- a/src/llama-kv-cache.h
++++ b/src/llama-kv-cache.h
+@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
+ private:
+     llama_kv_cache * kv;
+ };
++
++// block of KV slots to move when defragging
++struct llama_kv_defrag_move {
++    uint32_t src;
++    uint32_t dst;
++    uint32_t len;
++};
+ 
+ //
+ // llama_kv_cache_unified
+@@ -207,7 +214,7 @@ private:
+ 
+     // defrag
+     struct {
+-        std::vector<uint32_t> ids;
++        std::vector<llama_kv_defrag_move> moves;
+     } defrag_info;
+ 
+     // return true if cells have been moved
+@@ -249,7 +256,8 @@ private:
+     llm_graph_result_ptr build_graph_defrag(
+             const llama_cparams & cparams,
+                    ggml_context * ctx,
+-                   ggml_cgraph * gf) const;
++                   ggml_cgraph * gf,
++                   const std::vector<llama_kv_defrag_move> & moves) const;
+ 
+     void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+     void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
diff --git a/llama/patches/0011-sort-devices-by-score.patch b/llama/patches/0009-sort-devices-by-score.patch
similarity index 99%
rename from llama/patches/0011-sort-devices-by-score.patch
rename to llama/patches/0009-sort-devices-by-score.patch
index 8c3908cf6..e27d1ae92 100644
--- a/llama/patches/0011-sort-devices-by-score.patch
+++ b/llama/patches/0009-sort-devices-by-score.patch
@@ -11,7 +11,7 @@ with the fastest acceleration is loaded
  1 file changed, 13 insertions(+), 8 deletions(-)
 
 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 
405d8e31..4e67d243 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -157,7 +157,7 @@ struct ggml_backend_reg_entry { diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch similarity index 67% rename from llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch rename to llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch index eaba3c4c8..21c1fc42f 100644 --- a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch +++ b/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch @@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants 1 file changed, 2 insertions(+) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index f00700da..91d6a7d5 100644 +index ddea5ad3..45918bf6 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt -@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name) +@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) endforeach() ggml_add_cpu_backend_variant_impl(${tag_name}) @@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644 endfunction() ggml_add_backend(CPU) -@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS) +@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() + add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch deleted file mode 100644 index c9d4e9ad8..000000000 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ /dev/null @@ -1,361 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Tue, 15 Apr 2025 14:27:40 -0400 -Subject: [PATCH] ensure KV cache is fully defragmented - -Sometimes the KV cache requires defragmentation even without -triggering the threshold heuristic. In this case, decoding -will not being able to find a KV cache slot. This is particularly -difficult for the caller to handle if it happens in between -ubatches. To avoid this, we should immediately trigger a defrag. - -In addition, a heavily fragmented cache can require more than -max_moves to defragment. Currently, we stop when we hit the limit -but this can leave a cache that still does not have adequate space -even after defragmentation is triggered. Instead, we should do -multiple batches of processing until everything is complete. 
---
- src/llama-context.cpp  | 105 +++++++++++++----------------------
- src/llama-context.h    |   4 +-
- src/llama-kv-cache.cpp |  39 +++------------
- src/llama-kv-cache.h   |   9 +++-
- 4 files changed, 51 insertions(+), 106 deletions(-)
-
-diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 0343ba8a..4b3e6a83 100644
---- a/src/llama-context.cpp
-+++ b/src/llama-context.cpp
-@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
- 
- llm_graph_result_ptr llama_context::build_kv_self_defrag(
-         ggml_context * ctx0,
--        ggml_cgraph * gf) const {
-+        ggml_cgraph * gf,
-+        const std::vector<llama_kv_defrag_move> & moves) const {
-     auto res = std::make_unique<llm_graph_result>();
- 
-     const auto & hparams = model.hparams;
- 
--    const auto & ids = kv_self->defrag_info.ids;
--
- #if 0
-     // CPU defrag
-     //
-@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
-         ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
-     }
- #else
--    for (uint32_t i = 0; i < ids.size(); ++i) {
--        const uint32_t id = ids[i];
--
--        if (i == id || id == ids.size()) {
--            continue;
--        }
--
--        uint32_t nm = 1;
--
--        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
--            nm++;
--        }
--
-+    for (const auto & move : moves) {
-         for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
-             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-             const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
- 
-             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
--                    n_embd_k_gqa, nm,
-+                    n_embd_k_gqa, move.len,
-                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
--                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
-+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
- 
-             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
--                    n_embd_k_gqa, nm,
-+                    n_embd_k_gqa, move.len,
-                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
--                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
-+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
- 
-             ggml_tensor * view_v_src;
-             ggml_tensor * view_v_dst;
- 
-             if (cparams.flash_attn) {
-                 // NOTE: the V cache is not transposed when using flash attention
-                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
--                        n_embd_v_gqa, nm,
-+                        n_embd_v_gqa, move.len,
-                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
--                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
-+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
- 
-                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
--                        n_embd_v_gqa, nm,
-+                        n_embd_v_gqa, move.len,
-                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
--                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
-+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
-             } else {
-                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
--                        nm, n_embd_v_gqa,
-+                        move.len, n_embd_v_gqa,
-                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
--                        ggml_row_size(kv_self->v_l[il]->type, i));
-+                        ggml_row_size(kv_self->v_l[il]->type, move.src));
- 
-                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
--                        nm, n_embd_v_gqa,
-+                        move.len, n_embd_v_gqa,
-                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
--                        ggml_row_size(kv_self->v_l[il]->type, id));
-+                        ggml_row_size(kv_self->v_l[il]->type, move.dst));
-             }
- 
-             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-         }
--
--        i += nm - 1;
-     }
--
--    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
- #endif
- 
-     return res;
-@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
- void llama_context::kv_self_update() {
-     auto & kv = kv_self;
- 
--    bool need_reserve = false;
--
-     if (kv->has_shift) {
-         if (!kv->get_can_shift()) {
-             GGML_ABORT("The current context does not support K-shift");
-@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
-         res->set_inputs(nullptr);
- 
-         graph_compute(gf, false);
--
--        need_reserve = true;
-     }
- 
-     {
-@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
-     // defragment the KV cache if needed
-     if (kv->do_defrag) {
-         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
-+        const uint32_t n_max_nodes = graph_max_nodes();
-+        const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
-+        if (!kv->defrag_prepare(n_max_nodes)) {
-+            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
-+            return;
-+        }
- 
--        if (kv->defrag_prepare(graph_max_nodes())) {
--            ggml_backend_sched_reset(sched.get());
-+        for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
-+            std::vector<llama_kv_defrag_move> chunk;
-+            auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
-+            chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
- 
-+            ggml_backend_sched_reset(sched.get());
-             auto * gf = graph_init();
--
--            auto res = build_kv_self_defrag(ctx_compute.get(), gf);
--
-+            auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
-             ggml_backend_sched_alloc_graph(sched.get(), gf);
--
-             res->set_inputs(nullptr);
--
-             graph_compute(gf, false);
--
--            need_reserve = true;
-         }
- 
-         kv->do_defrag = false;
-     }
--
--    // reserve a worst case graph if needed
--    if (need_reserve) {
--        LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
--
--        // build worst-case graph
--        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
--        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
--
--        // simulate full KV cache
--        kv_self->n = kv_self->size;
--
--        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
--        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
--
--        auto * gf = graph_init();
--        graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
--
--        // initialize scheduler with the worst-case graph
--        ggml_backend_sched_reset(sched.get());
--        if (!ggml_backend_sched_reserve(sched.get(), gf)) {
--            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
--        }
--    }
- }
- 
- enum llama_pooling_type llama_context::pooling_type() const {
-@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
-     // find KV slot
-     {
-         if (!kv_self->find_slot(ubatch)) {
--            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
--
--            return 1;
-+            kv_self->defrag();
-+            kv_self_update();
-+            if (!kv_self->find_slot(ubatch)) {
-+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-+                return 1;
-+            }
-         }
- 
-         if (!kv_self->recurrent) {
-diff --git a/src/llama-context.h b/src/llama-context.h
-index baa03276..a59ff8fd 100644
---- a/src/llama-context.h
-+++ b/src/llama-context.h
-@@ -5,6 +5,7 @@
- #include "llama-cparams.h"
- #include "llama-graph.h"
- #include "llama-adapter.h"
-+#include "llama-kv-cache.h"
- 
- #include "ggml-cpp.h"
-
-@@ -180,7 +181,8 @@ private:
- 
-     llm_graph_result_ptr build_kv_self_defrag(
-             ggml_context * ctx0,
--            ggml_cgraph * gf) const;
-+            ggml_cgraph * gf,
-+            const std::vector<llama_kv_defrag_move> & moves) const;
- 
-     // TODO: read/write lora adapters and cvec
-     size_t state_write_data(llama_io_write_i & io);
-diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index 69f8d35a..35a750d3 100644
---- a/src/llama-kv-cache.cpp
-+++ b/src/llama-kv-cache.cpp
-@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
- 
-     assert(n_used <= n_kv);
- 
--    //const int64_t t_start = ggml_time_us();
--
--    // number of cells moved
--    uint32_t n_moves = 0;
--
--    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
--    //   - source view, destination view, copy operation
--    //   - x2 for keys and values
--    //const uint32_t max_moves = max_nodes()/(6*n_layer);
--    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
--    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
-+    defrag_info.moves.clear();
- 
-     // determine which KV cells to move where
-     //
-@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
-     //
-     // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
-     //
--    auto & ids = defrag_info.ids;
--
--    ids.clear();
--    ids.resize(n_kv, n_kv);
-+    std::vector<uint32_t> ids(n_kv, n_kv);
- 
-     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-         const auto & cell0 = cells[i0];
-@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
-         // are we moving a continuous block of memory?
-         bool cont = false;
- 
--        // should we stop searching for the next move?
--        bool stop = false;
--
-         // go back and move the nf cells to the hole
-         for (; i1 < n_kv; ++i1) {
-             auto & cell1 = cells[i1];
- 
-             if (cell1.is_empty() || ids[i1] != n_kv) {
--                if (n_moves == max_moves) {
--                    stop = true;
--                    break;
--                }
--
-                 cont = false;
-                 continue;
-             }
-@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
-             head = n_used;
- 
-             if (!cont) {
--                n_moves++;
-+                defrag_info.moves.push_back({i1, i0 + nf, 1});
-                 cont = true;
-+            } else {
-+                defrag_info.moves.back().len++;
-             }
- 
-             nf++;
-@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
-             }
-         }
- 
--        if (stop || n_moves == max_moves) {
--            break;
--        }
--
-         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
- 
-         i0 += nh - 1;
-     }
- 
--    if (n_moves == 0) {
-+    if (defrag_info.moves.size() == 0) {
-         return false;
-     }
- 
--    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
--
--    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
-+    // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
- 
-     return true;
- }
-diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
-index 56c74035..25cbcb56 100644
---- a/src/llama-kv-cache.h
-+++ b/src/llama-kv-cache.h
-@@ -43,6 +43,13 @@ private:
-     llama_kv_cache * kv;
- };
- 
-+// block of KV slots to move when defragging
-+struct llama_kv_defrag_move {
-+    uint32_t src;
-+    uint32_t dst;
-+    uint32_t len;
-+};
-+
- struct llama_kv_cell {
-     llama_pos pos   = -1;
-     llama_pos delta =  0;
-@@ -131,7 +138,7 @@ public:
-     // defrag
- 
-     struct {
--        std::vector<uint32_t> ids;
-+        std::vector<llama_kv_defrag_move> moves;
-     } defrag_info;
- 
-     // return true if cells have been moved
diff --git a/llama/patches/0011-remove-amx.patch b/llama/patches/0011-remove-amx.patch
new file mode 100644
index 000000000..296a37612
--- /dev/null
+++ 
b/llama/patches/0011-remove-amx.patch @@ -0,0 +1,25 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: jmorganca +Date: Thu, 1 May 2025 15:05:08 -0700 +Subject: [PATCH] remove amx + +disable amx as it reduces performance on some systems +--- + ggml/src/CMakeLists.txt | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt +index 45918bf6..0beaed86 100644 +--- a/ggml/src/CMakeLists.txt ++++ b/ggml/src/CMakeLists.txt +@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) +- if (NOT MSVC) +- # MSVC doesn't support AMX +- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) +- endif() + elseif (GGML_CPU) + ggml_add_cpu_backend_variant_impl("") + endif() diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch b/llama/patches/0012-fix-string-arr-kv-loading.patch similarity index 83% rename from llama/patches/0014-fix-string-arr-kv-loading.patch rename to llama/patches/0012-fix-string-arr-kv-loading.patch index 01f1b71eb..f879c50ee 100644 --- a/llama/patches/0014-fix-string-arr-kv-loading.patch +++ b/llama/patches/0012-fix-string-arr-kv-loading.patch @@ -9,8 +9,8 @@ such as vocab fields --- ggml/include/gguf.h | 1 + ggml/src/gguf.cpp | 7 +++++-- - src/llama-vocab.cpp | 2 +- - 3 files changed, 7 insertions(+), 3 deletions(-) + src/llama-vocab.cpp | 4 +--- + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee2020..3efb22f0 100644 @@ -53,15 +53,17 @@ index 381a9c7d..e45b453d 100644 } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 032019c9..ba37df35 100644 +index 10f34d33..9f5fd57b 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { - +@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { -- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); -+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); + const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); +- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); +- +- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); ++ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); #ifdef IS_BIG_ENDIAN diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0013-ollama-debug-tensor.patch similarity index 91% rename from llama/patches/0015-ollama-debug-tensor.patch rename to llama/patches/0013-ollama-debug-tensor.patch index a192bdea8..53d911277 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0013-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 432942bf..6d4abe4c 100644 +index a30e67f2..2462d2b8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0013-remove-amx.patch deleted file mode 100644 index 0bbc0a3a3..000000000 --- a/llama/patches/0013-remove-amx.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Tue, 8 Apr 2025 20:33:01 -0700 -Subject: [PATCH] remove amx - -disable amx as it reduces performance on some systems ---- - ggml/src/CMakeLists.txt | 4 ---- - 1 file changed, 4 deletions(-) - -diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index 91d6a7d5..d6b393a2 100644 ---- a/ggml/src/CMakeLists.txt -+++ b/ggml/src/CMakeLists.txt -@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) -- if (NOT MSVC) -- # MSVC doesn't support AMX -- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) -- endif() - elseif (GGML_CPU) - ggml_add_cpu_backend_variant_impl("") - endif() diff --git a/llama/patches/0021-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch similarity index 98% rename from llama/patches/0021-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0014-add-ollama-vocab-for-grammar-support.patch index 6193b755f..ee81800e2 100644 --- a/llama/patches/0021-add-ollama-vocab-for-grammar-support.patch +++ b/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch @@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644 const char * grammar_root, bool lazy, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp -index d1497985..b1a9dca3 100644 +index 804b11e0..15a10ca8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp -@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { +@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); } @@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); -@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( +@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( /* .vocab = */ vocab, /* .grammar_str = */ grammar_str, /* .grammar_root = */ grammar_root, diff --git a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch new file mode 100644 index 000000000..b71295c76 --- /dev/null +++ 
b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
@@ -0,0 +1,277 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Thu, 1 May 2025 13:45:12 -0700
+Subject: [PATCH] add argsort and cuda copy for i32
+
+---
+ ggml/src/ggml-cpu/ops.cpp     |  43 ++++++++++++++
+ ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
+ ggml/src/ggml-cuda/cpy.cu     |  49 ++++++++++++++++
+ 3 files changed, 192 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index becdae07..7a44b6cf 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
++++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
+     }
+ }
+ 
++static void ggml_compute_forward_argsort_i32(
++        const ggml_compute_params * params,
++        ggml_tensor * dst) {
++
++    const ggml_tensor * src0 = dst->src[0];
++
++    GGML_TENSOR_UNARY_OP_LOCALS
++
++    GGML_ASSERT(nb0 == sizeof(int32_t));
++
++    const int ith = params->ith;
++    const int nth = params->nth;
++
++    const int64_t nr = ggml_nrows(src0);
++
++    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
++
++    for (int64_t i = ith; i < nr; i += nth) {
++        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
++        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
++
++        for (int64_t j = 0; j < ne0; j++) {
++            dst_data[j] = j;
++        }
++
++        // C doesn't have a functional sort, so we do a bubble sort instead
++        for (int64_t j = 0; j < ne0; j++) {
++            for (int64_t k = j + 1; k < ne0; k++) {
++                if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
++                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
++                    int32_t tmp = dst_data[j];
++                    dst_data[j] = dst_data[k];
++                    dst_data[k] = tmp;
++                }
++            }
++        }
++    }
++}
++
+ void ggml_compute_forward_argsort(
+     const ggml_compute_params * params,
+     ggml_tensor * dst) {
+@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
+             {
+                 ggml_compute_forward_argsort_f32(params, dst);
+             } break;
++        case GGML_TYPE_I32:
++            {
++                ggml_compute_forward_argsort_i32(params, dst);
++            } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
+index 607ded85..53b02634 100644
+--- a/ggml/src/ggml-cuda/argsort.cu
++++ b/ggml/src/ggml-cuda/argsort.cu
+@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
+     }
+ }
+ 
++
++template<ggml_sort_order order>
++static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
++    extern __shared__ int shared_mem[];
++    int * indices = shared_mem;
++
++    const int tid = threadIdx.x;
++    const int row = blockIdx.y;
++
++    // Initialize all indices, handling the case where threads < ncols_pad
++    for (int i = tid; i < ncols_pad; i += blockDim.x) {
++        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
++    }
++    __syncthreads();
++
++    // Bitonic sort
++    for (int k = 2; k <= ncols_pad; k *= 2) {
++        for (int j = k/2; j > 0; j /= 2) {
++            for (int i = tid; i < ncols_pad; i += blockDim.x) {
++                const int ij = i ^ j;
++                if (ij > i) {
++                    // Only compare values within the actual data range
++                    if (i < ncols && ij < ncols) {
++                        if ((i & k) == 0) {
++                            if (order == GGML_SORT_ORDER_ASC) {
++                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
++                                    int tmp = indices[i];
++                                    indices[i] = indices[ij];
++                                    indices[ij] = tmp;
++                                }
++                            } else {
++                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
++                                    int tmp = indices[i];
++                                    indices[i] = indices[ij];
++                                    indices[ij] = tmp;
++                                }
++                            }
++                        } else {
++                            if (order == GGML_SORT_ORDER_ASC) {
++                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
++                                    int tmp = indices[i];
++                                    indices[i] = indices[ij];
++                                    indices[ij] = tmp;
++                                }
++                            } else {
++                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
++                                    int tmp = indices[i];
++                                    indices[i] = indices[ij];
++                                    indices[ij] = tmp;
++                                }
++                            }
++                        }
++                    }
++                }
++            }
++            __syncthreads();
++        }
++    }
++
++    // Write sorted indices to output, only threads handling valid data
++    for (int i = tid; i < ncols; i += blockDim.x) {
++        dst[row * ncols + i] = indices[i];
++    }
++}
++
++static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
++    // Bitonic sort requires ncols to be power of 2
++    const int ncols_pad = next_power_of_2(ncols);
++
++    // Ensure thread count doesn't exceed maximum (typically 1024)
++    const int max_threads = 1024; // This is the typical max for most GPUs
++    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
++
++    const dim3 block_dims(threads_per_block, 1, 1);
++    const dim3 block_nums(1, nrows, 1);
++    const size_t shared_mem = ncols_pad * sizeof(int);
++
++    // Check if shared memory size is within limits
++    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
++
++    // Instead of logging an error, use GGML_ASSERT with a descriptive message
++    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
++
++    // Launch kernels with the updated thread configuration
++    if (order == GGML_SORT_ORDER_ASC) {
++        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
++    } else if (order == GGML_SORT_ORDER_DESC) {
++        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
++    } else {
++        GGML_ABORT("fatal error");
++    }
++}
++
++
+ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+ 
+-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
++    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
+     GGML_ASSERT( dst->type == GGML_TYPE_I32);
+     GGML_ASSERT(ggml_is_contiguous(src0));
+ 
+@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ 
+     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ 
+-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
++    if (src0->type == GGML_TYPE_I32) {
++        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
++    } else {
++        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
++    }
+ }
+diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
+index 2d46176e..47383486 100644
+--- a/ggml/src/ggml-cuda/cpy.cu
++++ b/ggml/src/ggml-cuda/cpy.cu
+@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+     *dsti = *xi;
+ }
+ 
++static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
++    const int32_t * xi = (const int32_t *) cxi;
++    int32_t * dsti = (int32_t *) cdsti;
++
++    *dsti = *xi;
++}
++
+ template <cpy_kernel_t cpy_1>
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
+     cpy_1(cx + x_offset, cdst + dst_offset);
+ }
+ 
++// generic element-wise copy kernel for i32 tensors, mirroring the f32/f16 kernels above
++template <cpy_kernel_t cpy_1>
++static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
++                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
++                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
++                                   const int nb12, const int nb13) {
++    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
++
++    if (i >= ne) {
++        return;
++    }
++
++    const int64_t i03 = i/(ne00 * ne01 * ne02);
++    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
++    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
++    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
++    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
++
++    const int64_t i13 = i/(ne10 * ne11 * ne12);
++    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
++    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
++    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
++    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
++
++    cpy_1(cx + x_offset, cdst + dst_offset);
++}
++
++// host-side launcher for the i32 copy kernel
++static void ggml_cpy_i32_i32_cuda(
++    const char * cx, char * cdst, const int ne,
++    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
++    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
++
++    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
++    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
++        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
++}
++
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q8_0 * dsti = (block_q8_0 *) cdsti;
+@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
++    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
++        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));
+@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+         return (void*) cpy_f32_f16<cpy_1_f16_f16>;
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
++    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
++        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));
diff --git a/llama/patches/0016-add-model-quantizations.patch b/llama/patches/0016-add-model-quantizations.patch
deleted file mode 100644
index 3d078b03f..000000000
--- a/llama/patches/0016-add-model-quantizations.patch
+++ /dev/null
@@ -1,96 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca
-Date: Tue, 8 Apr 2025 20:39:32 -0700
-Subject: [PATCH] add model quantizations
-
-a temporary patch to add model quantization for
-models not supported in llama.cpp
----
- src/llama-arch.cpp  | 17 +++++++++++++++++
- src/llama-arch.h    |  1 +
- src/llama-model.cpp |  2 ++
- src/llama-quant.cpp |  4 ++++
- 4 files changed, 24 insertions(+)
-
-diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 0568565f..dd01df60 100644
---- a/src/llama-arch.cpp
-+++ b/src/llama-arch.cpp
-@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
-     { LLM_ARCH_PLM,              "plm" },
-     { LLM_ARCH_BAILINGMOE,       "bailingmoe" },
-+    { LLM_ARCH_MISTRAL3,         "mistral3" },
-     { LLM_ARCH_UNKNOWN,          "(unknown)" },
- };
- 
-@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-             { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
-         },
-     },
-+    {
-+        LLM_ARCH_MISTRAL3,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
-+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
-+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
-+        }
-+    },
-     {
-         LLM_ARCH_UNKNOWN,
-         {
-diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 6a989034..b6227eeb 100644
---- a/src/llama-arch.h
-+++ b/src/llama-arch.h
-@@ -75,6 +75,7 @@ enum llm_arch {
-     LLM_ARCH_CHAMELEON,
-     LLM_ARCH_SOLAR,
-     LLM_ARCH_WAVTOKENIZER_DEC,
-+    LLM_ARCH_MISTRAL3,
-     LLM_ARCH_PLM,
-     LLM_ARCH_BAILINGMOE,
-     LLM_ARCH_UNKNOWN,
-diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index d051696c..c8374159 100644
---- a/src/llama-model.cpp
-+++ b/src/llama-model.cpp
-@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-                     default: type = LLM_TYPE_UNKNOWN;
-                 }
-             } break;
-+        case LLM_ARCH_MISTRAL3: break;
-         default: throw std::runtime_error("unsupported model architecture");
-     }
- 
-@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
-         case LLM_ARCH_CHAMELEON:
-         case LLM_ARCH_SOLAR:
-         case LLM_ARCH_BAILINGMOE:
-+        case LLM_ARCH_MISTRAL3:
-             return LLAMA_ROPE_TYPE_NORM;
- 
-         // the pairs of head values are offset by n_rot/2
-diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
-index 
223e1f3f..8ae6dde8 100644
---- a/src/llama-quant.cpp
-+++ b/src/llama-quant.cpp
-@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
-         // This used to be a regex, but <regex> has an extreme cost to compile times.
-         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
- 
-+        // don't quantize vision stuff
-+        quantize &= name.find("v.") == std::string::npos;
-+        quantize &= name.find("mm.") == std::string::npos;
-+
-         // quantize only 2D and 3D tensors (experts)
-         quantize &= (ggml_n_dims(tensor) >= 2);
- 
diff --git a/llama/sampling_ext.cpp b/llama/sampling_ext.cpp
index 6a025c906..1cacd3015 100644
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -19,9 +19,6 @@ struct common_sampler *common_sampler_cinit(const struct llama_model *model, str
         sparams.penalty_repeat = params->penalty_repeat;
         sparams.penalty_freq = params->penalty_freq;
         sparams.penalty_present = params->penalty_present;
-        sparams.mirostat = params->mirostat;
-        sparams.mirostat_tau = params->mirostat_tau;
-        sparams.mirostat_eta = params->mirostat_eta;
         sparams.seed = params->seed;
         sparams.grammar = params->grammar;
         sparams.xtc_probability = 0.0;
@@ -117,6 +114,9 @@ void grammar_free(struct llama_grammar *g) {
         if (g->vocab != nullptr) {
             delete g->vocab;
         }
+        if (g->o_vocab != nullptr) {
+            delete g->o_vocab;
+        }
         llama_grammar_free_impl(g);
     }
 }
diff --git a/llama/sampling_ext.h b/llama/sampling_ext.h
index a9e610ba2..3302e6efa 100644
--- a/llama/sampling_ext.h
+++ b/llama/sampling_ext.h
@@ -20,9 +20,6 @@ extern "C"
         float penalty_repeat;
         float penalty_freq;
         float penalty_present;
-        int32_t mirostat;
-        float mirostat_tau;
-        float mirostat_eta;
         uint32_t seed;
         char *grammar;
     };
diff --git a/llm/llm_windows.go b/llm/llm_windows.go
index 915355a25..a350979ab 100644
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -7,6 +7,7 @@ import (
 const (
 	CREATE_DEFAULT_ERROR_MODE   = 0x04000000
 	ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
+	CREATE_NO_WINDOW            = 0x08000000
 )
 
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{
@@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
 	//
 	// Setting Above Normal priority class ensures when running as a "background service"
 	// with "programs" given best priority, we aren't starved of cpu cycles
-	CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
+	CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
 }
diff --git a/llm/memory.go b/llm/memory.go
index e05327f79..b5a8dd5c6 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,9 +1,12 @@
 package llm
 
 import (
+	"cmp"
 	"fmt"
 	"log/slog"
+	"maps"
 	"os"
+	"slices"
 	"strconv"
 	"strings"
 
@@ -108,9 +111,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
-		weight, graph := projectorMemoryRequirements(projector)
+		weight := projectorMemoryRequirements(projector)
 		projectorWeights += weight
-		projectorGraph += graph
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -120,12 +122,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	layers := f.Tensors().GroupLayers()
-	// add one layer worth of memory as a buffer
-	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.Size()
-	} else {
-		slog.Warn("model missing blk.0 layer size")
-	}
+	// add one layer (choosing the max layer) worth of 
memory as a buffer + layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int { + return cmp.Compare(a.Size(), b.Size()) + }).Size() var kvct string if envconfig.FlashAttention() && @@ -219,7 +219,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // For all the layers, find where they can fit on the GPU(s) - for i := range int(f.KV().BlockCount()) { + for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { // Some models have inconsistent layer sizes if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { layerSize = blk.Size() @@ -229,6 +229,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { // Stop allocating on GPU(s) once we hit the users target NumGPU + overflow += layerSize continue } @@ -245,13 +246,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) } } + + if len(gpusWithSpace) == 0 { + overflow += layerSize + } } if layerCount >= int(f.KV().BlockCount()) { fullyLoaded = true - } else { - for i := layerCount; i < int(f.KV().BlockCount()); i++ { - overflow += layerSize - } } // Determine if we need to consider output then find where it fits @@ -407,51 +408,21 @@ func (m MemoryEstimate) LogValue() slog.Value { return slog.GroupValue(attrs...) } -func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { +func projectorMemoryRequirements(filename string) (weights uint64) { file, err := os.Open(filename) if err != nil { - return 0, 0 + return 0 } defer file.Close() ggml, _, err := ggml.Decode(file, 1024) if err != nil { - return 0, 0 + return 0 } for _, layer := range ggml.Tensors().GroupLayers() { weights += layer.Size() } - switch arch := ggml.KV().Architecture(); arch { - case "mllama": - kv := func(n string) uint64 { - if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok { - return uint64(v) - } - - return 0 - } - - imageSize := kv("image_size") - - maxNumTiles := kv("max_num_tiles") - embeddingLength := kv("embedding_length") - headCount := kv("attention.head_count") - - numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) - if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } - - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 - - graphSize = 4 * (8 + - imageSize*imageSize*kv("num_channels")*maxNumTiles + - embeddingLength*numPatches*maxNumTiles + - 9*embeddingLength*numPaddedPatches*maxNumTiles + - numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - } - - return weights, graphSize + return weights } diff --git a/llm/memory_test.go b/llm/memory_test.go index 213784a02..1d4f7a98c 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() inputLayerCount := 5 - tensors := []ggml.Tensor{ + tensors := []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, diff --git a/llm/server.go b/llm/server.go index 1fcb597b8..d95d7f8ca 100644 --- a/llm/server.go +++ b/llm/server.go @@ -17,6 +17,7 
@@ import ( "os/exec" "path/filepath" "runtime" + "slices" "strconv" "strings" "sync" @@ -30,9 +31,37 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llama" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/model" ) +type filteredEnv []string + +func (e filteredEnv) LogValue() slog.Value { + var attrs []slog.Attr + for _, env := range e { + if key, value, ok := strings.Cut(env, "="); ok { + switch { + case strings.HasPrefix(key, "OLLAMA_"), + strings.HasPrefix(key, "CUDA_"), + strings.HasPrefix(key, "ROCR_"), + strings.HasPrefix(key, "ROCM_"), + strings.HasPrefix(key, "HIP_"), + strings.HasPrefix(key, "GPU_"), + strings.HasPrefix(key, "HSA_"), + strings.HasPrefix(key, "GGML_"), + slices.Contains([]string{ + "PATH", + "LD_LIBRARY_PATH", + "DYLD_LIBRARY_PATH", + }, key): + attrs = append(attrs, slog.String(key, value)) + } + } + } + return slog.GroupValue(attrs...) +} + type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error @@ -44,6 +73,7 @@ type LlamaServer interface { EstimatedVRAM() uint64 // Total VRAM across all GPUs EstimatedTotal() uint64 EstimatedVRAMByGPU(gpuID string) uint64 + Pid() int } // llmServer is an instance of the llama.cpp server @@ -147,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU)) } - if envconfig.Debug() { - params = append(params, "--verbose") - } - if opts.MainGPU > 0 { params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU)) } @@ -216,10 +242,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--no-mmap") } - if opts.UseMLock { - params = append(params, "--mlock") - } - // TODO - NUMA support currently doesn't work properly params = append(params, "--parallel", strconv.Itoa(numParallel)) @@ -324,7 +346,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a pathEnv = "LD_LIBRARY_PATH" } - var libraryPaths []string + // Note: we always put our dependency paths first + // since these are the exact version we compiled/linked against + libraryPaths := []string{discover.LibOllamaPath} if libraryPath, ok := os.LookupEnv(pathEnv); ok { libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } @@ -334,13 +358,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a c := compatible[0] if libpath, ok := libs[c]; ok { slog.Debug("adding gpu library", "path", libpath) - libraryPaths = append(libraryPaths, libpath) + libraryPaths = append([]string{libpath}, libraryPaths...) 
@@ -147,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
 	}

-	if envconfig.Debug() {
-		params = append(params, "--verbose")
-	}
-
 	if opts.MainGPU > 0 {
 		params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
 	}
@@ -216,10 +242,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--no-mmap")
 	}

-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-
 	// TODO - NUMA support currently doesn't work properly
 	params = append(params, "--parallel", strconv.Itoa(numParallel))
@@ -324,7 +346,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		pathEnv = "LD_LIBRARY_PATH"
 	}

-	var libraryPaths []string
+	// Note: we always put our dependency paths first
+	// since these are the exact version we compiled/linked against
+	libraryPaths := []string{discover.LibOllamaPath}
 	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
 		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 	}
@@ -334,13 +358,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		c := compatible[0]
 		if libpath, ok := libs[c]; ok {
 			slog.Debug("adding gpu library", "path", libpath)
-			libraryPaths = append(libraryPaths, libpath)
+			libraryPaths = append([]string{libpath}, libraryPaths...)
 			ggmlPaths = append(ggmlPaths, libpath)
 		}
 	}

-	// Note: we always put the dependency path first
-	// since this was the exact version we compiled/linked against
 	if gpus[0].DependencyPath != nil {
 		slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
 		// assume gpus from the same library have the same dependency path
@@ -407,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 	}

 	slog.Info("starting llama server", "cmd", s.cmd)
-	if envconfig.Debug() {
-		filteredEnv := []string{}
-		for _, ev := range s.cmd.Env {
-			if strings.HasPrefix(ev, "OLLAMA_") ||
-				strings.HasPrefix(ev, "CUDA_") ||
-				strings.HasPrefix(ev, "ROCR_") ||
-				strings.HasPrefix(ev, "ROCM_") ||
-				strings.HasPrefix(ev, "HIP_") ||
-				strings.HasPrefix(ev, "GPU_") ||
-				strings.HasPrefix(ev, "HSA_") ||
-				strings.HasPrefix(ev, "GGML_") ||
-				strings.HasPrefix(ev, "PATH=") ||
-				strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
-				strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
-				filteredEnv = append(filteredEnv, ev)
-			}
-		}
-		// Log at debug as the environment is inherited and might contain sensitive information
-		slog.Debug("subprocess", "environment", filteredEnv)
-	}
+	slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))

 	if err = s.cmd.Start(); err != nil {
 		var msg string
@@ -520,6 +523,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if errors.Is(err, context.DeadlineExceeded) {
 			return ServerStatusNotResponding, errors.New("server not responding")
 		}
+		if strings.Contains(err.Error(), "connection refused") {
+			return ServerStatusNotResponding, errors.New("connection refused")
+		}
 		return ServerStatusError, fmt.Errorf("health resp: %w", err)
 	}
 	defer resp.Body.Close()
@@ -640,6 +646,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	}
 }

+func (s *llmServer) Pid() int {
+	if s.cmd != nil && s.cmd.Process != nil {
+		return s.cmd.Process.Pid
+	}
+	return -1
+}
+
 var grammarJSON = `
 root   ::= object
 value  ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -666,9 +679,8 @@ ws ::= ([ \t\n] ws)?
 const maxBufferSize = 512 * format.KiloByte

 type ImageData struct {
-	Data          []byte `json:"data"`
-	ID            int    `json:"id"`
-	AspectRatioID int    `json:"aspect_ratio_id"`
+	Data []byte `json:"data"`
+	ID   int    `json:"id"`
 }

 type CompletionRequest struct {
@@ -714,6 +726,9 @@ type CompletionResponse struct {
 }

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
+	slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
+	slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
+
 	if len(req.Format) > 0 {
 		switch string(req.Format) {
 		case `null`, `""`:
@@ -877,6 +892,8 @@ type EmbeddingResponse struct {
 }

 func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
+	slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
+
 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		if errors.Is(err, context.Canceled) {
 			slog.Info("aborting embedding request due to client closing the connection")
@@ -1003,17 +1020,17 @@ func (s *llmServer) Close() error {
 	s.llamaModelLock.Unlock()

 	if s.cmd != nil {
-		slog.Debug("stopping llama server")
+		slog.Debug("stopping llama server", "pid", s.Pid())
 		if err := s.cmd.Process.Kill(); err != nil {
 			return err
 		}
 		// if ProcessState is already populated, Wait already completed, no need to wait again
 		if s.cmd.ProcessState == nil {
-			slog.Debug("waiting for llama server to exit")
+			slog.Debug("waiting for llama server to exit", "pid", s.Pid())
 			<-s.done
 		}

-		slog.Debug("llama server stopped")
+		slog.Debug("llama server stopped", "pid", s.Pid())
 	}

 	return nil
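Both Completion and Embedding gate admission on a weighted semaphore (s.sem), so at most numParallel requests run at once and a client that disconnects while queued gives its slot back through context cancellation. A self-contained sketch of that admission pattern (toy handler, not ollama's actual server):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"golang.org/x/sync/semaphore"
)

// handle mimics the s.sem.Acquire pattern above: block until a request
// slot frees up, but bail out cleanly if the client goes away first.
func handle(ctx context.Context, sem *semaphore.Weighted) error {
	if err := sem.Acquire(ctx, 1); err != nil {
		if errors.Is(err, context.Canceled) {
			return errors.New("aborting request due to client closing the connection")
		}
		return err
	}
	defer sem.Release(1)
	time.Sleep(10 * time.Millisecond) // stand-in for inference work
	return nil
}

func main() {
	sem := semaphore.NewWeighted(1) // numParallel == 1
	sem.TryAcquire(1)               // ...the slot is already held by another request

	ctx, cancel := context.WithCancel(context.Background())
	go func() { time.Sleep(5 * time.Millisecond); cancel() }() // client hangs up while queued
	fmt.Println(handle(ctx, sem))
}
```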
diff --git a/llm/server_test.go b/llm/server_test.go
index 6c8f7590b..b6a8705e5 100644
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
 	// of a mess, and but it's good enough, until we can refactoring the
 	// Completion method to be more testable.

-	ctx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := context.WithCancel(t.Context())
 	s := &llmServer{
 		sem: semaphore.NewWeighted(1), // required to prevent nil panic
 	}
diff --git a/logutil/logutil.go b/logutil/logutil.go
new file mode 100644
index 000000000..406caf540
--- /dev/null
+++ b/logutil/logutil.go
@@ -0,0 +1,29 @@
+package logutil
+
+import (
+	"io"
+	"log/slog"
+	"path/filepath"
+)
+
+const LevelTrace slog.Level = -8
+
+func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
+	return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			switch attr.Key {
+			case slog.LevelKey:
+				switch attr.Value.Any().(slog.Level) {
+				case LevelTrace:
+					attr.Value = slog.StringValue("TRACE")
+				}
+			case slog.SourceKey:
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+			return attr
+		},
+	}))
+}
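slog has no built-in TRACE level; logutil carves one out below Debug (-8 versus Debug's -4) and uses ReplaceAttr to render it by name. Call sites then gate on it naturally, which is what makes the prompt logging added above cheap in normal operation. A runnable sketch of that gating (LevelTrace copied locally so it stands alone):

```go
package main

import (
	"context"
	"log/slog"
	"os"
)

const LevelTrace slog.Level = -8 // matches logutil.LevelTrace above

func main() {
	// Handler opened at Debug: TRACE records fall below the threshold and
	// are cheaply discarded, so logging full prompts at TRACE is safe.
	debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
	debugLogger.Log(context.TODO(), LevelTrace, "completion request", "prompt", "why is the sky blue?") // dropped

	// Handler opened at TRACE: the same record is emitted, rendered as
	// DEBUG-4 unless a ReplaceAttr renames it the way logutil does.
	traceLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: LevelTrace}))
	traceLogger.Log(context.TODO(), LevelTrace, "completion request", "prompt", "why is the sky blue?")
}
```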
diff --git a/ml/backend.go b/ml/backend.go
index 0cd33bd8a..cb32d8185 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -5,6 +5,7 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
+	"math"
 	"os"
 	"slices"
 	"strconv"
@@ -118,6 +119,21 @@ type Context interface {
 	Layer(int) Context
 }

+// RopeOptions contains optional parameters for RoPE function
+type RopeOptions struct {
+	OriginalContextLen uint32
+}
+
+// RopeOption defines a function that modifies RopeOpts
+type RopeOption func(*RopeOptions)
+
+// WithContextLen sets a custom context length
+func WithContextLen(len uint32) RopeOption {
+	return func(opts *RopeOptions) {
+		opts.OriginalContextLen = len
+	}
+}
+
 type Tensor interface {
 	Dim(n int) int
 	Stride(n int) int
@@ -143,7 +159,7 @@ type Tensor interface {
 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	Sin(ctx Context) Tensor
@@ -160,7 +176,6 @@ type Tensor interface {
 	Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

 	Pad(ctx Context, shape ...int) Tensor
-	Unpad(ctx Context, shape ...int) Tensor

 	Stack(ctx Context, dim int, s ...Tensor) Tensor
@@ -172,6 +187,7 @@ type Tensor interface {
 	Duplicate(ctx Context) Tensor

 	TopK(ctx Context, k int) Tensor
+	Argsort(ctx Context) Tensor
 }

 // ScaledDotProductAttention implements a fused attention
@@ -214,35 +230,58 @@ func mul[T number](s ...T) T {
 	return p
 }

-type DumpOptions struct {
-	// Items is the number of elements to print at the beginning and end of each dimension.
-	Items int
+type DumpOptions func(*dumpOptions)

-	// Precision is the number of decimal places to print. Applies to float32 and float64.
-	Precision int
+// DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
+func DumpWithPrecision(n int) DumpOptions {
+	return func(opts *dumpOptions) {
+		opts.Precision = n
+	}
 }

-func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
-	if len(opts) < 1 {
-		opts = append(opts, DumpOptions{
-			Items:     3,
-			Precision: 4,
-		})
+// DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
+// is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
+// beginning and end of each dimension will be printed.
+func DumpWithThreshold(n int) DumpOptions {
+	return func(opts *dumpOptions) {
+		opts.Threshold = n
+	}
+}
+
+// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
+func DumpWithEdgeItems(n int) DumpOptions {
+	return func(opts *dumpOptions) {
+		opts.EdgeItems = n
+	}
+}
+
+type dumpOptions struct {
+	Precision, Threshold, EdgeItems int
+}
+
+func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
+	opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
+	for _, optsFunc := range optsFuncs {
+		optsFunc(&opts)
+	}
+
+	if mul(t.Shape()...) <= opts.Threshold {
+		opts.EdgeItems = math.MaxInt
 	}

 	switch t.DType() {
 	case DTypeF32:
-		return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
-			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
+		return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
+			return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
 		})
 	case DTypeF16, DTypeQ80, DTypeQ40:
 		f32 := ctx.Input().Empty(DTypeF32, t.Shape()...)
 		f32 = t.Copy(ctx, f32)
-		return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
-			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
+		return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
+			return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
 		})
 	case DTypeI32:
-		return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string {
+		return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
 			return strconv.FormatInt(int64(i), 10)
 		})
 	default:
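Dump's struct parameter becomes variadic functional options: each exported helper returns a closure over a private dumpOptions value, and Dump folds the closures over its defaults (precision 4, threshold 1000, 3 edge items). The same shape reduced to a self-contained toy, so the mechanics are visible without a real ml.Tensor:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

type dumpOptions struct{ precision, threshold, edgeItems int }

// DumpOption mirrors ml.DumpOptions: a function that edits private defaults.
type DumpOption func(*dumpOptions)

func WithPrecision(n int) DumpOption { return func(o *dumpOptions) { o.precision = n } }
func WithThreshold(n int) DumpOption { return func(o *dumpOptions) { o.threshold = n } }
func WithEdgeItems(n int) DumpOption { return func(o *dumpOptions) { o.edgeItems = n } }

func dump(values []float64, optFns ...DumpOption) string {
	opts := dumpOptions{precision: 4, threshold: 1000, edgeItems: 3}
	for _, fn := range optFns {
		fn(&opts) // apply caller overrides on top of the defaults
	}

	one := func(v float64) string { return strconv.FormatFloat(v, 'f', opts.precision, 64) }
	if len(values) <= opts.threshold { // small enough: print everything
		parts := make([]string, len(values))
		for i, v := range values {
			parts[i] = one(v)
		}
		return strings.Join(parts, " ")
	}

	var parts []string // otherwise: leading and trailing edge items only
	for _, v := range values[:opts.edgeItems] {
		parts = append(parts, one(v))
	}
	parts = append(parts, "...")
	for _, v := range values[len(values)-opts.edgeItems:] {
		parts = append(parts, one(v))
	}
	return strings.Join(parts, " ")
}

func main() {
	xs := []float64{1, 2, 3, 4, 5, 6}
	fmt.Println(dump(xs))                                     // under threshold: all six values
	fmt.Println(dump(xs, WithThreshold(4), WithPrecision(1))) // 1.0 2.0 3.0 ... 4.0 5.0 6.0
}
```

Compared with the old struct argument, a new knob like Threshold can be added without breaking existing call sites, which is presumably why the API changed shape.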
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 177ac6fd0..2821ad119 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -27,6 +27,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs"
 	fsggml "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 	"golang.org/x/sync/errgroup"
@@ -222,7 +223,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
 			C.ggml_set_name(tt, cname)

-			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) //nolint:staticcheck // TODO: check if buffer type supports this tensor
 			return tt
 		}
@@ -312,6 +313,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	g, ctx := errgroup.WithContext(ctx)
 	g.SetLimit(runtime.GOMAXPROCS(0))
 	for _, t := range meta.Tensors().Items() {
+		t := t
 		g.Go(func() error {
 			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
 			for i := range tts {
@@ -341,6 +343,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			var s uint64
 			for s < t.Size() {
+				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
+				if err := ctx.Err(); err != nil {
+					return err
+				}
+
 				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
 				if err != nil {
 					slog.Warn("file read error", "file", r.Name(), "error", err)
@@ -363,14 +370,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		})
 	}

-	// start a goroutine to cancel the errgroup if the parent context is done
-	go func() {
-		<-ctx.Done()
-		g.Go(func() error {
-			return ctx.Err()
-		})
-	}()
-
 	if err := g.Wait(); err != nil {
 		return nil, err
 	}
@@ -407,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			C.int(len(schedBackends)),
 			C.size_t(maxGraphNodes),
 			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
+			C._Bool(false),
 		),
 		schedBackends: schedBackends,
 		schedBufts:    schedBufts,
@@ -915,6 +915,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
 	}

 	return &Tensor{
@@ -1017,17 +1019,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
 	}
 }

-func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
-	if len(shape) != 4 {
-		panic("expected 4 dimensions")
-	}
-
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
-	}
-}
-
 func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	switch len(shape) {
 	case 1:
@@ -1071,7 +1062,17 @@ const (
 	ropeTypeVision C.int = 24
 )

-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
+	// Default options
+	opts := &ml.RopeOptions{
+		OriginalContextLen: 131072,
+	}
+
+	// Apply any provided options
+	for _, option := range options {
+		option(opts)
+	}
+
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{b: t.b}
 	}
@@ -1084,16 +1085,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			ctx.(*Context).ctx,
+			dequant,
+			positionIDs.(*Tensor).t,
+			ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
 			C.int(ropeType),
-			131072, // YaRN n_ctx_train
+			C.int(opts.OriginalContextLen),
 			C.float(ropeBase),
 			C.float(ropeScale),
-			0.,  // YaRN ext_factor
-			1.,  // YaRN attn_factor
-			32., // YaRN beta_fast
-			1.,  // YaRN beta_slow
+			C.float(0.0),
+			C.float(1.0),
+			C.float(32.0),
+			C.float(1.0),
 		),
 	}
 }
@@ -1187,3 +1191,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
 		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
 	}
 }
+
+func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
+	}
+}
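ggml_rope_ext previously received a hard-coded 131072 as the YaRN n_ctx_train; the RoPE wrapper above now threads that value through RopeOptions, keeping the old constant as the default unless a caller overrides it. A tiny sketch of how the default and an override resolve (names copied from the diff; no tensors involved):

```go
package main

import "fmt"

type RopeOptions struct{ OriginalContextLen uint32 }

type RopeOption func(*RopeOptions)

func WithContextLen(n uint32) RopeOption {
	return func(o *RopeOptions) { o.OriginalContextLen = n }
}

// resolve mirrors what the RoPE method above does before calling ggml_rope_ext.
func resolve(options ...RopeOption) uint32 {
	opts := &RopeOptions{OriginalContextLen: 131072} // the old hard-coded default
	for _, option := range options {
		option(opts)
	}
	return opts.OriginalContextLen
}

func main() {
	fmt.Println(resolve())                     // 131072
	fmt.Println(resolve(WithContextLen(4096))) // e.g. a model trained at a 4k context
}
```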
diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h
index 64671495b..778927f68 100644
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -38,7 +38,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
     GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
@@ -59,7 +59,7 @@ extern "C" {
     GGML_API enum ggml_status      ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t                ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t                ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
     GGML_API void                  ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool                  ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API void                  ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
@@ -248,7 +248,7 @@ extern "C" {
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);

         // initialize buffers from a max size graph (optional)
         reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

     // Initialize backend buffers from a measure graph
diff --git a/ml/backend/ggml/ggml/include/ggml-cpp.h b/ml/backend/ggml/ggml/include/ggml-cpp.h
index a12342c25..48aa79682 100644
--- a/ml/backend/ggml/ggml/include/ggml-cpp.h
+++ b/ml/backend/ggml/ggml/include/ggml-cpp.h
@@ -24,7 +24,7 @@
 typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;

 struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };

-typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;

 // ggml-backend
diff --git a/ml/backend/ggml/ggml/include/ggml-cpu.h b/ml/backend/ggml/ggml/include/ggml-cpu.h
index f5e11f1e1..de77a875e 100644
--- a/ml/backend/ggml/ggml/include/ggml-cpu.h
+++ b/ml/backend/ggml/ggml/include/ggml-cpu.h
@@ -133,6 +133,11 @@ extern "C" {

     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ml/backend/ggml/ggml/include/ggml-opt.h
b/ml/backend/ggml/ggml/include/ggml-opt.h index eb5eab9de..da0c24b46 100644 --- a/ml/backend/ggml/ggml/include/ggml-opt.h +++ b/ml/backend/ggml/ggml/include/ggml-opt.h @@ -37,13 +37,16 @@ extern "C" { // ====== Dataset ====== GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( - int64_t ne_datapoint, // number of elements per datapoint - int64_t ne_label, // number of elements per label - int64_t ndata, // total number of datapoints/labels - int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) + enum ggml_type type_data, // the type for the internal data tensor + enum ggml_type type_label, // the type for the internal labels tensor + int64_t ne_datapoint, // number of elements per datapoint + int64_t ne_label, // number of elements per label + int64_t ndata, // total number of datapoints/labels + int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); // get underlying tensors that store the data + GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset); GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] @@ -56,13 +59,19 @@ extern "C" { struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] int64_t ibatch); + GGML_API void ggml_opt_dataset_get_batch_host( + ggml_opt_dataset_t dataset, + void * data_batch, + size_t nb_data_batch, + void * labels_batch, + int64_t ibatch); // ====== Model / Context ====== enum ggml_opt_build_type { - GGML_OPT_BUILD_TYPE_FORWARD, - GGML_OPT_BUILD_TYPE_GRAD, - GGML_OPT_BUILD_TYPE_OPT, + GGML_OPT_BUILD_TYPE_FORWARD = 10, + GGML_OPT_BUILD_TYPE_GRAD = 20, + GGML_OPT_BUILD_TYPE_OPT = 30, }; // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss @@ -81,20 +90,22 @@ extern "C" { // userdata can be used to pass arbitrary data typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); - // returns the default optimizer params (constant) + // returns the default optimizer params (constant, hard-coded values) // userdata is not used GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); + // casts userdata to ggml_opt_optimizer_params and returns it + GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata); + // parameters for initializing a new optimization context struct ggml_opt_params { ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs - struct ggml_context * ctx_compute; // created in user code, holds non-static tensors - - // the forward graph is defined by inputs and outputs - // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts - struct ggml_tensor * inputs; - struct ggml_tensor * outputs; + // by default the forward graph needs to be reconstructed for each eval + // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically + struct ggml_context * ctx_compute; + struct ggml_tensor * inputs; + struct ggml_tensor * outputs; enum ggml_opt_loss_type loss_type; enum ggml_opt_build_type build_type; @@ -107,12 +118,9 @@ 
extern "C" { // get parameters for an optimization context with defaults set where possible // parameters for which no sensible defaults exist are supplied as arguments to this function - GGML_API ggml_opt_params ggml_opt_default_params( - ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, - enum ggml_opt_loss_type loss_type); + GGML_API struct ggml_opt_params ggml_opt_default_params( + ggml_backend_sched_t backend_sched, + enum ggml_opt_loss_type loss_type); GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); @@ -121,6 +129,7 @@ extern "C" { GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); // get underlying tensors that store data + // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against @@ -128,11 +137,12 @@ extern "C" { GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels + // get the gradient accumulator for a node from the forward graph GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); // ====== Optimization Result ====== - GGML_API ggml_opt_result_t ggml_opt_result_init(); + GGML_API ggml_opt_result_t ggml_opt_result_init(void); GGML_API void ggml_opt_result_free(ggml_opt_result_t result); GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); @@ -144,11 +154,20 @@ extern "C" { // ====== Computation ====== - // do forward pass, increment result if not NULL - GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + // if not using static graphs, this function must be called prior to ggml_opt_alloc + GGML_API void ggml_opt_prepare_alloc( + ggml_opt_context_t opt_ctx, + struct ggml_context * ctx_compute, + struct ggml_cgraph * gf, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs); - // do forward pass, increment result if not NULL, do backward pass - GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + // allocate the next graph for evaluation, either forward or forward + backward + // must be called exactly once prior to calling ggml_opt_eval + GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward); + + // do forward pass, increment result if not NULL, do backward pass if allocated + GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); // ############################################################################ // ## The high-level functions start here. 
They do not depend on any private ## @@ -200,9 +219,9 @@ extern "C" { // fit model defined by inputs and outputs to dataset GGML_API void ggml_opt_fit( ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs - ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs - ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] - ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used + struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs + struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] + struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used ggml_opt_dataset_t dataset, // dataset with data and optionally also labels enum ggml_opt_loss_type loss_type, // loss to minimize ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) diff --git a/ml/backend/ggml/ggml/include/ggml-rpc.h b/ml/backend/ggml/ggml/include/ggml-rpc.h index c8b6097f7..1e6741127 100644 --- a/ml/backend/ggml/ggml/include/ggml-rpc.h +++ b/ml/backend/ggml/ggml/include/ggml-rpc.h @@ -7,7 +7,7 @@ extern "C" { #endif -#define RPC_PROTO_MAJOR_VERSION 1 +#define RPC_PROTO_MAJOR_VERSION 2 #define RPC_PROTO_MINOR_VERSION 0 #define RPC_PROTO_PATCH_VERSION 0 #define GGML_RPC_MAX_SERVERS 16 diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index d19fc1678..e91dedf14 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -393,8 +393,8 @@ extern "C" { // precision enum ggml_prec { - GGML_PREC_DEFAULT, - GGML_PREC_F32, + GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default + GGML_PREC_F32 = 10, }; // model file types @@ -481,6 +481,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -488,7 +489,6 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, - GGML_OP_UNPAD, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -673,11 +673,18 @@ extern "C" { GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation) GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok) + GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor); + + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN + GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ 
-761,7 +768,7 @@ extern "C" { // Tensor flags GGML_API void ggml_set_input(struct ggml_tensor * tensor); GGML_API void ggml_set_output(struct ggml_tensor * tensor); - GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API void ggml_set_param(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor); // @@ -931,7 +938,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_repeat_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride // concat a and b along dim // used in stable-diffusion @@ -1661,7 +1668,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // depthwise + // depthwise (via im2col and mul_mat) GGML_API struct ggml_tensor * ggml_conv_2d_dw( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1673,6 +1680,22 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + // Depthwise 2D convolution + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1758,15 +1781,6 @@ extern "C" { int p0, int p1); - // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] - GGML_API struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, - int p1, - int p2, - int p3); - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] @@ -2035,15 +2049,14 @@ extern "C" { GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand( - struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) - struct ggml_context * ctx_compute, // context for gradient computation - struct ggml_cgraph * cgraph, - bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static + struct ggml_context * ctx, // context for gradient computation + struct ggml_cgraph * cgraph, + struct ggml_tensor ** grad_accs); // graph allocation in a context GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt index d6b393a21..0beaed866 100644 --- 
a/ml/backend/ggml/ggml/src/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -214,7 +214,7 @@ add_library(ggml target_link_libraries(ggml PUBLIC ggml-base) if (CMAKE_SYSTEM_NAME MATCHES "Linux") - target_link_libraries(ggml PRIVATE dl stdc++fs) + target_link_libraries(ggml PRIVATE dl) endif() function(ggml_add_backend_library backend) @@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM foreach (feat NATIVE + SSE42 AVX AVX2 BMI2 AVX_VNNI FMA F16C AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8 AMX_BF16) @@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) + ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") endif() diff --git a/ml/backend/ggml/ggml/src/ggml-alloc.c b/ml/backend/ggml/ggml/src/ggml-alloc.c index a3d3f6901..5fd379f6a 100644 --- a/ml/backend/ggml/ggml/src/ggml-alloc.c +++ b/ml/backend/ggml/ggml/src/ggml-alloc.c @@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { size_t node_size = 0; if (!node->data && !node->view_src) { - GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API + // If we previously had data but don't now then reallocate + if (talloc->buffer_id < 0) { + return false; + } node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); } return talloc->size_max >= node_size; diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp index 1487f322f..4e67d243a 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp @@ -178,9 +178,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_CANN register_backend(ggml_backend_cann_reg()); #endif -// #ifdef GGML_USE_BLAS -// register_backend(ggml_backend_blas_reg()); -// #endif +#ifdef GGML_USE_BLAS + register_backend(ggml_backend_blas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index dd11f304f..0ce73a997 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { return SIZE_MAX; } -size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +size_t 
ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); @@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); } -size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) { return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } @@ -674,6 +674,8 @@ struct ggml_backend_sched { char * context_buffer; size_t context_buffer_size; + bool op_offload; + int debug; }; @@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int node_backend_id = tensor_backend_id(node); - assert(node_backend_id != -1); // all nodes should be assigned by now + assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback // check if we should start a new split based on the sources of the current node bool need_new_split = false; @@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, - bool parallel) { + bool parallel, + bool op_offload) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); @@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( } sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->op_offload = op_offload; ggml_backend_sched_reset(sched); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt index e73a3b69b..bdaec2881 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt @@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) list(APPEND ARCH_DEFINITIONS GGML_AVX) - else () + elseif (GGML_SSE42) list(APPEND ARCH_FLAGS /arch:SSE4.2) list(APPEND ARCH_DEFINITIONS GGML_SSE42) endif() @@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) else () - list(APPEND ARCH_FLAGS -msse4.2) - list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_SSE42) + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() if (GGML_F16C) list(APPEND 
ARCH_FLAGS -mf16c) list(APPEND ARCH_DEFINITIONS GGML_F16C) @@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") message(STATUS "z15 target") - list(APPEND ARCH_FLAGS -march=z15 -mtune=z15) + list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") message(STATUS "z16 target") - list(APPEND ARCH_FLAGS -march=z16 -mtune=z16) + list(APPEND ARCH_FLAGS -march=z16) + elseif (${S390X_M} MATCHES "9175|9176") + # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version. + message(STATUS "z17 target") + list(APPEND ARCH_FLAGS -march=z17) else() message(STATUS "Unknown target") message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") @@ -422,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") @@ -432,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) - set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) + set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP}) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) if (NOT DOTPROD_ENABLED MATCHES -1) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) endif() if (NOT I8MM_ENABLED MATCHES -1) @@ -450,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (NOT SME_ENABLED MATCHES -1) - 
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c) - set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2") + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c) + set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2") endif() set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 902ee4346..d775a0363 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -263,7 +263,7 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - int score = 0; + int score = 1; cpuid_x86 is; #ifdef GGML_FMA diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 175cba329..8ff6d64a4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" -#elif defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data #endif #define UNUSED GGML_UNUSED diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c index 91a81bdc3..ccd0651eb 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -20,12 +20,6 @@ #define GROUP_MAX_EPS_IQ1_M 1e-7f #define GROUP_MAX_EPS_IQ1_S 1e-12f -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid warnings for hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) -#endif - #define UNUSED GGML_UNUSED // some compilers don't provide _mm256_set_m128i, e.g. 
gcc 7 @@ -6596,7 +6590,118 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } *s = hsum_float_8(acc); +#elif defined(__VXE__) || defined(__VXE2__) + uint32_t aux[3]; + uint32_t utmp[4]; + const int32x4_t v_z = vec_splat_s32(0); + const uint8x16_t v_3m = vec_splat_u8(0x03); + + const uint8x16_t v_0c = vec_splat_u8(1); + const uint8x16_t v_1c = vec_sl(v_0c, 1); + const uint8x16_t v_2c = vec_sl(v_0c, 2); + const uint8x16_t v_3c = vec_sl(v_0c, 3); + + uint8x16_t q3h[4]; + uint8x16_t q3b[2]; + int8x16_t q3bytes[4]; + int8x16_t q8bytes[4]; + uint8x16_t qhbits[2]; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict x0l = x[i].qs; + const uint8_t * restrict x0h = x[i].hmask; + const int8_t * restrict y0 = y[i].qs; + + qhbits[0] = vec_xl(0 , x0h); + qhbits[1] = vec_xl(16, x0h); + + int32_t isum = 0; + + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= 32; + + for (int j = 0; j < QK_K/128; ++j) { + int32x4_t isum0, isum1, isum2, isum3; + + q3b[0] = vec_xl(0 , x0l); + q3b[1] = vec_xl(16, x0l); + x0l += 32; + + q8bytes[0] = vec_xl(0 , y0); + q8bytes[1] = vec_xl(16 , y0); + q8bytes[2] = vec_xl(32 , y0); + q8bytes[3] = vec_xl(48 , y0); + q8bytes[4] = vec_xl(64 , y0); + q8bytes[5] = vec_xl(80 , y0); + q8bytes[6] = vec_xl(96 , y0); + q8bytes[7] = vec_xl(112, y0); + y0 += 128; + + q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2); + q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2); + q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1); + q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1); + + q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]); + q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]); + q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]); + q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]); + + isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]); + isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]); + isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]); + isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]); + + isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0]; + isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; + isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; + isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; + + scale += 4; + + q3h[0] = vec_andc(v_2c, qhbits[0]); + q3h[1] = vec_andc(v_2c, qhbits[1]); + q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1); + q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1); + + q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]); + q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]); + q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]); + q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]); + + isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]); + isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]); + isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]); + isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]); + + isum += (isum0[0] + isum0[1] + isum0[2] + 
isum0[3]) * scale[0]; + isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; + isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; + isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits[0] = vec_sr(qhbits[0], 4); + qhbits[1] = vec_sr(qhbits[1], 4); + } + } + + sum += d * isum; + } + + *s = sum; #else // scalar version // This function is written like this so the compiler can manage to vectorize most of it diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 6d4abe4c7..2462d2b85 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -52,19 +52,6 @@ #include "llamafile/sgemm.h" #endif -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) - -// disable POSIX deprecation warnings -// these functions are never going away, anyway -#pragma warning(disable: 4996) - -// unreachable code because of multiple instances of code after GGML_ABORT -#pragma warning(disable: 4702) -#endif - // Note: once we move threading into a separate C++ file // will use std::hardware_destructive_interference_size instead of hardcoding it here // and we'll use C++ attribute syntax. @@ -217,7 +204,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_F16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, @@ -358,7 +345,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_K, }, [GGML_TYPE_BF16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -1934,6 +1921,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D_DW: + { + ggml_compute_forward_conv_2d_dw(params, tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor); @@ -1962,10 +1953,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; - case GGML_OP_UNPAD: - { - ggml_compute_forward_unpad(params, tensor); - } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -2274,6 +2261,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: { @@ -2288,7 +2276,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: @@ -3172,6 +3159,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i 
+= 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}

 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
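All four converters above share one tiered shape: take the widest SIMD step that still fits (16 floats under AVX-512), fall through to narrower steps, and let a plain scalar loop mop up the remainder so every trailing element is handled. The same structure in Go, with the vector bodies reduced to fixed-size chunk loops purely for illustration (a sketch, not ollama code):

```go
package main

import "fmt"

// halveAll processes the slice in 8-element chunks first (the stand-in for
// a SIMD body), then finishes the leftovers with a scalar tail, mirroring
// the "i + 15 < n", "i + 7 < n", "i < n" ladder in the C code above.
func halveAll(x []float32) []float32 {
	y := make([]float32, len(x))
	i := 0
	for ; i+8 <= len(x); i += 8 { // wide step: whole chunks only
		for j := 0; j < 8; j++ {
			y[i+j] = x[i+j] / 2
		}
	}
	for ; i < len(x); i++ { // scalar tail: whatever is left over
		y[i] = x[i] / 2
	}
	return y
}

func main() {
	fmt.Println(halveAll([]float32{2, 4, 6, 8, 10, 12, 14, 16, 18, 20}))
}
```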
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp
index 4b688a67e..e013e8b41 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -11,24 +11,26 @@
 #include

 #ifdef GGML_USE_CPU_HBM
-#include "ggml-cpu-hbm.h"
+# include "ggml-cpu-hbm.h"
 #endif

 #ifdef GGML_USE_CPU_KLEIDIAI
-#include "kleidiai/kleidiai.h"
-#endif
-
-#if defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
+# include "kleidiai/kleidiai.h"
 #endif

 #if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-    #define NOMINMAX
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+#  define NOMINMAX
+# endif
+# include <windows.h>
+#else
+# include <unistd.h>
 #endif
-#include <windows.h>
+
+#if defined(__APPLE__)
+# include <sys/types.h>
+# include <sys/sysctl.h>
 #endif

 // ggml-backend interface
@@ -70,8 +72,10 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty
 }

 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) return true;
+    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) {
+            return true;
+        }
     }
     return false;
 }
@@ -330,9 +334,18 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d
 }

 static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    *total = status.ullTotalPhys;
+    *free = status.ullAvailPhys;
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    *total = pages * page_size;
+    *free = *total;
+#endif

     GGML_UNUSED(dev);
 }
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index f6374f789..1d46158f9 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -1054,6 +1054,493 @@ class tinyBLAS_Q0_AVX {
     } \
 } \

+template <typename TA, typename TB, typename TC>
+class tinyBLAS_BF16_PPC {
+  public:
+    tinyBLAS_BF16_PPC(int64_t k,
+                      const TA *A, int64_t lda,
+                      const TB *B, int64_t ldb,
+                      TC *C, int64_t ldc,
+                      int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
+    }
+
+  private:
+    void vector_permute_store(vec_t *c, int numVec, unsigned char *vecOffset) {
+        vec_t t[8], s[8];
+        vec_t swiz1 = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
+        vec_t swiz2 = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
+        vec_t swiz3 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vec_t swiz4 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+
+        if (numVec == 2) {
+            t[0] = vec_perm(c[0], c[1], swiz1);
+            t[1] = vec_perm(c[2], c[3], swiz1);
+            s[0] = vec_perm(t[0], t[1], swiz3);
+            s[1] = vec_perm(t[0], t[1], swiz4);
+            vec_xst(s[0], 0, (vec_t*)vecOffset);
+            vec_xst(s[1], 0, (vec_t*)(vecOffset + 16));
+        } else if (numVec == 4) {
+            t[0] = vec_perm(c[0], c[1], swiz1);
+            t[1] = vec_perm(c[0], c[1], swiz2);
+            t[2] = vec_perm(c[2], c[3], swiz1);
+            t[3] = vec_perm(c[2], c[3], swiz2);
+            s[0] = vec_perm(t[0], t[2], swiz3);
+            s[1] = vec_perm(t[0], t[2], swiz4);
+            s[2] = vec_perm(t[1], t[3], swiz3);
+            s[3] = vec_perm(t[1], t[3], swiz4);
+            for (int i = 0; i < 4; ++i)
+                vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16));
+        } else if (numVec == 8) {
+            for (int i = 0; i < 4; i += 2) {
+                t[i+0] = vec_perm(c[i+0], c[i+1], swiz1);
+                t[i+1] = vec_perm(c[i+0], c[i+1], swiz2);
+            }
+            for (int i = 4; i < 8; i += 2) {
+                t[i+0] = vec_perm(c[i+0], c[i+1], swiz1);
+                t[i+1] = vec_perm(c[i+0], c[i+1], swiz2);
+            }
+            s[0] = vec_perm(t[0], t[2], swiz3);
+            s[1] = vec_perm(t[0], t[2], swiz4);
+            s[2] = vec_perm(t[1], t[3], swiz3);
+            s[3] = vec_perm(t[1], t[3], swiz4);
+            s[4] = vec_perm(t[4], t[6], swiz3);
+            s[5] = vec_perm(t[4], t[6], swiz4);
+            s[6] = vec_perm(t[5], t[7], swiz3);
+            s[7] = vec_perm(t[5], t[7], swiz4);
+            for (int i = 0; i < 8; ++i)
+                vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16));
+        }
+    }
+
+    void packNormal(const TA* a, int64_t lda, int rows, int cols, unsigned char* vec) {
+        int64_t i, j;
+        TA *aoffset = NULL;
+        unsigned char *vecOffset = NULL;
+        TA * aoffsets[8];
+        vector unsigned char c_arr[8];
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                if (cols == 4) {
+                    aoffsets[0] = aoffset;
+                    for (int it = 1; it < 4; ++it)
+                        aoffsets[it] = aoffsets[it-1] + lda;
+                    aoffset += 4 * lda;
+                    for (int i = 0; i < 4; ++i)
+                        c_arr[i] = vec_xl(0, (vector unsigned char*)aoffsets[i]);
+                    vector_permute_store(c_arr, 4, vecOffset);
+                    for (int i = 0; i<4; i++)
+                        aoffsets[i] = aoffsets[i]+lda;
+                    vecOffset +=64;
+                }
+                i = 
(cols >> 3); + if (i > 0) { + aoffsets[0] = aoffset; + for (int it = 1; it < 8; ++it) { + aoffsets[it] = aoffsets[it-1] + lda; + } + aoffset += 8 * lda; + do { + for (int it = 0; it < 8; ++it) + c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]); + vector_permute_store(c_arr, 8, vecOffset); + for (int it = 0; it < 8; ++it) + aoffsets[it] = aoffsets[it] + 8*lda; + vecOffset += 128; + i--; + } while(i > 0); + } + j--; + } while(j > 0); + } + if (rows & 4) { + aoffsets[0] = aoffset; + for (int it = 1; it < 4; ++it) + aoffsets[it] = aoffsets[it-1] + lda; + aoffset += 4 * lda; + if (cols == 4) { + for (int it = 0; it < 4; ++it) + c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]); + vector_permute_store(c_arr, 2, vecOffset); + for (int it = 0; it< 4; it++) + aoffsets[it] = aoffsets[it] + lda; + vecOffset += 32; + } + i = (cols >> 3); + if (i > 0) { + do { + for (int it = 0; it < 4; ++it) + c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]); + vector_permute_store(c_arr, 4, vecOffset); + for (int it = 0; it< 4; it++) + aoffsets[it] = aoffsets[it] + 8*lda; + vecOffset += 64; + i--; + } while(i > 0); + } + } + if (rows & 3) { + aoffsets[0] = aoffset; + for (int it = 1; it < 4; ++it) + aoffsets[it] = aoffsets[it-1] + lda; + if (cols == 4) { + switch(rows) { + case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]); + case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]); + case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]); + break; + } + vector_permute_store(c_arr, 2, vecOffset); + for (int it = 0; it< 4; it++) + aoffsets[it] = aoffsets[it] + lda; + vecOffset += 32; + } + i = (cols >> 3); + if (i > 0) { + do { + switch(rows) { + case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]); + case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]); + case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]); + break; + } + vector_permute_store(c_arr, 4, vecOffset); + for (int it = 0; it <4; it++) + aoffsets[it] = aoffsets[it] + 8* lda; + vecOffset += 64; + i--; + } while(i > 0); + } + } + } + + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + int m_rem = MIN(m - m0, 8); + int n_rem = MIN(n - n0, 8); + + if (m_rem >= 8 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 8) { + mc = 4; + nc = 8; + gemm<4,8>(m0, m, n0, n); + } else if (m_rem >=8 && n_rem >=4){ + mc = 8; + nc = 4; + gemm<8,4>(m0, m, n0, n); + } else if ((m_rem < 4) && (n_rem >= 8)) { + nc = 8; + switch(m_rem) { + case 1: + mc = 1; + gemm_Mx8<1>(m0, m, n0, n); + break; + case 2: + mc = 2; + gemm_Mx8<2>(m0, m, n0, n); + break; + case 3: + mc = 3; + gemm_Mx8<3>(m0, m, n0, n); + break; + default: + return; + } + } else if (m_rem >= 4 && n_rem >= 4) { + mc = 4; + nc = 4; + gemm_small<4, 4>(m0, m, n0, n); + } else if ((m_rem > 4) && (n_rem < 4)) { + mc = 4; + switch(n_rem) { + case 1: + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 2: + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 3: + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + + default: + return; + } + } else { + switch((m_rem << 4) | n_rem) { + case 0x43: + mc = 4; + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 0x41: + mc = 4; + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 0x34: + mc = 3; + nc = 4; + gemm_small<3, 4>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm_small<3, 
3>(m0, m, n0, n);
+                    break;
+                case 0x32:
+                    mc = 3;
+                    nc = 2;
+                    gemm_small<3, 2>(m0, m, n0, n);
+                    break;
+                case 0x31:
+                    mc = 3;
+                    nc = 1;
+                    gemm_small<3, 1>(m0, m, n0, n);
+                    break;
+                case 0x24:
+                    mc = 2;
+                    nc = 4;
+                    gemm_small<2,4>(m0, m, n0, n);
+                    break;
+                case 0x23:
+                    mc = 2;
+                    nc = 3;
+                    gemm_small<2, 3>(m0, m, n0, n);
+                    break;
+                case 0x22:
+                    mc = 2;
+                    nc = 2;
+                    gemm_small<2, 2>(m0, m, n0, n);
+                    break;
+                case 0x21:
+                    mc = 2;
+                    nc = 1;
+                    gemm_small<2, 1>(m0, m, n0, n);
+                    break;
+                case 0x14:
+                    mc = 1;
+                    nc = 4;
+                    gemm_small<1, 4>(m0, m, n0, n);
+                    break;
+                case 0x13:
+                    mc = 1;
+                    nc = 3;
+                    gemm_small<1, 3>(m0, m, n0, n);
+                    break;
+                case 0x12:
+                    mc = 1;
+                    nc = 2;
+                    gemm_small<1, 2>(m0, m, n0, n);
+                    break;
+                case 0x11:
+                    mc = 1;
+                    nc = 1;
+                    gemm_small<1, 1>(m0, m, n0, n);
+                    break;
+                default:
+                    return;
+            }
+        }
+        mp = m0 + (m - m0) / mc * mc;
+        np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+    void KERNEL_4x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[8] , vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int l = 0; l < k; l+=8) {
+            packNormal((A+(ii*lda)+l), lda, 4, 8, (uint8_t*)vec_A);
+            packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B);
+            for (int x = 0; x < 4; x++) {
+                __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
+                __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+    }
+
+    void KERNEL_8x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[8], vec_B[4] , vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int l = 0; l < k; l+=8) {
+            packNormal((A+(ii*lda)+l), lda, 8, 8, (uint8_t*)vec_A);
+            packNormal((B+(jj*ldb)+l), ldb, 8, 4, (uint8_t*)vec_B);
+            for (int x = 0; x < 4; x++) {
+                __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
+                __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x+4], vec_B[x]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii+4, jj);
+    }
+
+
+    void KERNEL_8x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[8], vec_B[8], vec_C[4];
+        acc_t acc_0, acc_1, acc_2, acc_3;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        __builtin_mma_xxsetaccz(&acc_2);
+        __builtin_mma_xxsetaccz(&acc_3);
+        for (int l = 0; l < k; l+=8) {
+            packNormal(A+(ii*lda)+l, lda, 8, 8, (uint8_t*)vec_A);
+            packNormal(B+(jj*ldb)+l, ldb, 8, 8, (uint8_t*)vec_B);
+            for (int x = 0; x < 4; x++) {
+                __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
+                __builtin_mma_xvbf16ger2pp(&acc_1, (vec_t)vec_A[x], (vec_t)vec_B[x+4]);
+                __builtin_mma_xvbf16ger2pp(&acc_2, (vec_t)vec_A[x+4], (vec_t)vec_B[x]);
+                __builtin_mma_xvbf16ger2pp(&acc_3, (vec_t)vec_A[x+4], (vec_t)vec_B[x+4]);
+            }
+        }
+
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+        SAVE_ACC(&acc_2, ii+4, jj);
+        SAVE_ACC(&acc_3, ii+4, jj+4);
+    }
+
+    template <int RM, int RN>
+    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            vec_t vec_C[4];
+            acc_t acc_0;
+            __builtin_mma_xxsetaccz(&acc_0);
+            vec_t vec_A[2], vec_B[2];
+            for (int l = 0; l < k; l += 4) {
+                packNormal((A+(ii*lda)+l), lda, RM, 4, (uint8_t*)vec_A);
+                packNormal((B+(jj*ldb)+l), ldb, RN, 4, (uint8_t*)vec_B);
+                for (int x = 0; x < 2; x++) {
+                    __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
+                }
+            }
+            __builtin_mma_disassemble_acc(vec_C, &acc_0);
+            for (int I = 0; I < RM; I++) {
+                for (int J = 0; J < RN; J++) {
+                    *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
+                }
+            }
+        }
+    }
+
+    template <int RM>
+    void gemm_Mx8(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int RN = 8;
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            vec_t vec_C[4];
+            acc_t acc_0, acc_1;
+            __builtin_mma_xxsetaccz(&acc_0);
+            __builtin_mma_xxsetaccz(&acc_1);
+            vec_t vec_A[4], vec_B[8];
+            for (int l = 0; l < k; l += 8) {
+                packNormal((A+(ii*lda)+l), lda, RM, 8, (uint8_t*)vec_A);
+                packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B);
+                for (int x = 0; x < 4; x++) {
+                    __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]);
+                    __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]);
+                }
+            }
+            __builtin_mma_disassemble_acc(vec_C, &acc_0);
+            for (int I = 0; I < RM; I++) {
+                for (int J = 0; J < 4; J++) {
+                    *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
+                }
+            }
+            __builtin_mma_disassemble_acc(vec_C, &acc_1);
+            for (int I = 0; I < RM; I++) {
+                for (int J = 0; J < 4; J++) {
+                    *((TC*)(C+ii+((jj+J+4)*ldc)+I)) = *((TC*)&vec_C[I]+J);
+                }
+            }
+        }
+    }
+
+    template <int RM, int RN>
+    inline void kernel(int64_t ii, int64_t jj) {
+        if constexpr(RM == 4 && RN == 8) {
+            KERNEL_4x8(ii,jj);
+        } else if constexpr(RM == 8 && RN == 8) {
+            KERNEL_8x8(ii,jj);
+        } else if constexpr(RM == 8 && RN == 4) {
+            KERNEL_8x4(ii,jj);
+        } else {
+            static_assert(false, "RN/RM values not supported");
+        }
+    }
+
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            kernel<RM, RN>(ii, jj);
+        }
+    }
+
+    const TA *const A;
+    const TB *const B;
+    TC *C;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+
 template <typename TA, typename TB, typename TC>
 class tinyBLAS_Q0_PPC {
   public:
@@ -2202,6 +2689,7 @@ class tinyBLAS_PPC {
         boffset = vec;
         j = (rows >> 3);
         if (j > 0) {
+
             do {
                 aoffset1 = aoffset;
                 aoffset2 = aoffset1 + lda;
@@ -2875,9 +3363,22 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                 (float *)C, ldc};
             return tb.matmul(m, n);
         }
+#elif defined(__MMA__)
+        if ((k % 8))
+            return false;
+        if(Btype == GGML_TYPE_BF16) {
+            tinyBLAS_BF16_PPC<ggml_bf16_t, ggml_bf16_t, float> tb{ k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc,
+                params->ith, params->nth};
+            tb.matmul(m, n);
+            return true;
+        }
 #endif
         return false;
     }
+
     case GGML_TYPE_F16: {
 #if defined(__AVX512F__)
         if (Btype == GGML_TYPE_F16) {
diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
index 66b8da68f..654e2f280 100644
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
@@ -8,19 +8,6 @@
 
 #include <float.h>
 
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-
-// disable POSIX deprecation warnings
-// these functions are never going away, anyway
-#pragma warning(disable: 4996)
-
-// unreachable code because of multiple instances of code after GGML_ABORT
-#pragma warning(disable: 4702)
-#endif
-
 // ggml_compute_forward_dup
 
 static void ggml_compute_forward_dup_same_cont(
@@ -4222,7 +4209,7 @@ static void ggml_compute_forward_get_rows_f16(
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            ggml_fp16_to_fp32_row(
+            ggml_cpu_fp16_to_fp32(
                      (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                            (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
 }
@@ -4263,7 +4250,7 @@ static void ggml_compute_forward_get_rows_bf16(
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            ggml_bf16_to_fp32_row(
+            ggml_cpu_bf16_to_fp32(
                     (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                            (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
 }
@@ -6064,6 +6051,178 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }
 
+// 
ggml_compute_forward_conv_2d_dw + +struct ggml_conv_2d_dw_params { + int64_t channels; + int64_t batch; + int64_t src_w; + int64_t src_h; + int64_t dst_w; + int64_t dst_h; + int64_t knl_w; + int64_t knl_h; + int stride_x; + int stride_y; + int pad_x; + int pad_y; + int dilation_x; + int dilation_y; +}; + +static void ggml_compute_forward_conv_2d_dw_cwhn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t c = p.channels; + const float * knl_data = (const float *)kernel->data; + + const int64_t rows_total = p.dst_h * p.batch; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + +#ifdef GGML_SIMD + const int64_t pkg_size = GGML_F32_EPR; + const int64_t pkg_count = c / pkg_size; + const int64_t c_pkg_end = pkg_count * pkg_size; +#else + const int64_t c_pkg_end = 0; +#endif + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + +#ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } + } + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } +#endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } + } + dst_data[c_i] = sum; + } + } + } +} + +static void ggml_compute_forward_conv_2d_dw_whcn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t n = p.channels * p.batch; + const int64_t per_thread = (n + params->nth - 1) / params->nth; + const int64_t start = params->ith * per_thread; + const int64_t end = MIN(start + per_thread, n); + + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < 
p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = sum; + } + } + } +} + +void ggml_compute_forward_conv_2d_dw( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * src = dst->src[1]; + ggml_conv_2d_dw_params p; + p.channels = src->ne[2]; + p.batch = src->ne[3]; + p.src_w = src->ne[0]; + p.src_h = src->ne[1]; + p.dst_w = dst->ne[0]; + p.dst_h = dst->ne[1]; + p.knl_w = kernel->ne[0]; + p.knl_h = kernel->ne[1]; + p.stride_x = dst->op_params[0]; + p.stride_y = dst->op_params[1]; + p.pad_x = dst->op_params[2]; + p.pad_y = dst->op_params[3]; + p.dilation_x = dst->op_params[4]; + p.dilation_y = dst->op_params[5]; + + GGML_ASSERT(kernel->ne[3] == p.channels); + GGML_ASSERT(dst->ne[3] == p.batch); + + if (ggml_is_contiguous(src)) { + ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p); + } else if (ggml_is_contiguous_channels(src)) { + // kernel should also have channels most contiguous in memory + GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]); + ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p); + } else { + GGML_ABORT("non-contiguous memory layout not supported"); + } +} + // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( @@ -6531,61 +6690,6 @@ void ggml_compute_forward_pad_reflect_1d( } } -// ggml_compute_forward_unpad - -static void ggml_compute_forward_unpad_f32( - const struct ggml_compute_params *params, - struct ggml_tensor *dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT( dst->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float * dst_ptr = (float *) dst->data; - - // TODO: optimize - - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - dst_ptr[dst_idx] = *src_ptr; - } - } - } - } - } -} - -void ggml_compute_forward_unpad( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_unpad_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_arange static void ggml_compute_forward_arange_f32( @@ -6718,6 +6822,45 @@ static void ggml_compute_forward_argsort_f32( } } +static void ggml_compute_forward_argsort_i32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(nb0 == sizeof(int32_t)); + + 
const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_nrows(src0); + + ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } + } + } + } +} + void ggml_compute_forward_argsort( const ggml_compute_params * params, ggml_tensor * dst) { @@ -6729,6 +6872,10 @@ void ggml_compute_forward_argsort( { ggml_compute_forward_argsort_f32(params, dst); } break; + case GGML_TYPE_I32: + { + ggml_compute_forward_argsort_i32(params, dst); + } break; default: { GGML_ABORT("fatal error"); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h index 3eca1cf86..dc081b9e6 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h @@ -65,13 +65,13 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); -void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h index 04d10cec2..45c31cf1f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h @@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F32_EPR 4 #define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_ZERO {0.0f} #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) #define 
GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp index dfe2218e3..02d406182 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp @@ -2,12 +2,6 @@ #include -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) -#endif - // precomputed gelu table for f16 (128 KB) ggml_fp16_t ggml_table_gelu_f16[1 << 16]; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt index 8623214c7..c9ff4aa32 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt @@ -12,12 +12,30 @@ if (CUDAToolkit_FOUND) # 61 == Pascal, __dp4a instruction (per-byte integer dot product) # 70 == V100, FP16 tensor cores # 75 == Turing, int8 tensor cores + # 80 == Ampere, asynchronous data loading, faster tensor core instructions + # 86 == RTX 3000, needs CUDA v11.1 + # 89 == RTX 4000, needs CUDA v11.8 + # + # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run + # XX-real == compile CUDA code as device code for this specific architecture + # no suffix == compile as both PTX and device code + # + # The default behavior for a non-native is to build virtual architectures as needed to cover all features needed + # for best performance and to also build real architectures for the most commonly used GPUs. if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") set(CMAKE_CUDA_ARCHITECTURES "native") elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80") + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") + set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") + endif() else() - set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80") + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") + set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") + endif() endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") @@ -100,7 +118,7 @@ if (CUDAToolkit_FOUND) set(CUDA_CXX_FLAGS "") - set(CUDA_FLAGS -use_fast_math) + set(CUDA_FLAGS -use_fast_math -extended-lambda) if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") # Options are: @@ -133,6 +151,7 @@ if (CUDAToolkit_FOUND) COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" OUTPUT_VARIABLE CUDA_CCVER ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE ) else() if (CUDA_CCFULLVER MATCHES Apple) @@ -143,7 +162,7 @@ if (CUDAToolkit_FOUND) string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) endif() - message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER}) list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu index 96bfe1c9d..e084607c0 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu +++ 
b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu
@@ -1,47 +1,61 @@
 #include "acc.cuh"
 
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+    const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
+    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
     if (i >= ne) {
         return;
     }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
+
+    int64_t src1_idx = i - offset;
+
+    int64_t tmp = src1_idx;
+    const int64_t i13 = tmp / s13;
+    tmp -= i13 * s13;
+    const int64_t i12 = tmp / s12;
+    tmp -= i12 * s12;
+    const int64_t i11 = tmp / s11;
+    tmp -= i11 * s11;
+    const int64_t i10 = tmp;
+
+    float val = x[i];
+    if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
+        val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
     }
+    dst[i] = val;
 }
 
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+    const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
+    const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
 }
 
 void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
 
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
+    GGML_ASSERT(ggml_is_contiguously_allocated(dst));
 
-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+    const int64_t s1 = dst->op_params[0] / sizeof(float);
+    const int64_t s2 = dst->op_params[1] / sizeof(float);
+    const int64_t s3 = dst->op_params[2] / sizeof(float);
+    const int64_t offset = dst->op_params[3] / sizeof(float);
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
 }
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
index 607ded855..53b02634c 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     }
 }
 
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
+    extern __shared__ int shared_mem[];
+    int * indices = shared_mem;
+
+    const int tid = threadIdx.x;
+    const int row = blockIdx.y;
+
+    // Initialize all indices, handling the case where threads < ncols_pad
+    for (int i = tid; i < ncols_pad; i += blockDim.x) {
+        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
+    }
+    __syncthreads();
+
+    // Bitonic sort
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k/2; j > 0; j /= 2) {
+            for (int i = tid; i < ncols_pad; i += blockDim.x) {
+                const int ij = i ^ j;
+                if (ij > i) {
+                    // Only compare values within the actual data range
+                    if (i < ncols && ij < ncols) {
+                        if ((i & k) == 0) {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        } else {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write sorted indices to output, only threads handling valid data
+    for (int i = tid; i < ncols; i += blockDim.x) {
+        dst[row * ncols + i] = indices[i];
+    }
+}
+
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // Bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    // Ensure thread count doesn't exceed maximum (typically 1024)
+    const int max_threads = 1024; // This is the typical max for most GPUs
+    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
+
+    const dim3 block_dims(threads_per_block, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // The required shared memory must fit on the device
+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
+
+    // One block per row
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
     GGML_ASSERT( dst->type == GGML_TYPE_I32);
     GGML_ASSERT(ggml_is_contiguous(src0));
 
@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
 
-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    if (src0->type == GGML_TYPE_I32) {
+        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    } else {
+        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    }
 }
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
index 8284a0017..64fb4ff4c 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
@@ -78,13 +78,13 @@
 // Moore Threads
 #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
 
-#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
 
 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
-#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
 
 #ifdef __CUDA_ARCH_LIST__
@@ -130,10 +130,6 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
 #define GGML_CUDA_MAX_STREAMS 8
 
 [[noreturn]]
@@ -300,6 +296,25 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+// The compiler is not always able to unroll loops if they contain continue expressions.
+// In such cases loop unrolling can still be achieved via recursion:
+template <int n>
+struct ggml_cuda_unroll {
+    template <typename Func, typename... Args>
+    __device__ void operator()(const Func & f, Args... args) const {
+        f(n - 1, args...);
+        ggml_cuda_unroll<n - 1>{}(f, args...);
+    }
+};
+
+template <>
+struct ggml_cuda_unroll<1> {
+    template <typename Func, typename... Args>
+    __device__ void operator()(const Func & f, Args... args) const {
+        f(0, args...);
+    }
+};
+
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
index a224ec0e1..c6dec4276 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
@@ -1,6 +1,8 @@
 #include "convert.cuh"
 #include "dequantize.cuh"
 
+#include <cstdint>
+
 #define CUDA_Q8_0_NE_ALIGN 2048
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }
 
 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(
+        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }
 
+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
     const src_t * x = (const src_t *) vx;
 
-    y[i] = float(x[i]);
+    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
+    y[iy] = float(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void convert_unary_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
 }
 
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
         default:
             return nullptr;
     }
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
-            return convert_unary_cuda<nv_bfloat16>;
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cont_cuda<nv_bfloat16>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float>;
         case GGML_TYPE_BF16:
             return convert_unary_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
 }
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh
index 411a13cf1..b65b98e08 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
 
 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
 
 to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+
+// TODO more general support for non-contiguous inputs
+
+template<typename T>
+using to_t_nc_cuda_t = void (*)(const void * x, T * y,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
+    int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
+
+typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh
index ecb659997..63d0c482f 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh
@@ -2,6 +2,17 @@
 
 #include "common.cuh"
 
+
+static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
+#ifdef CP_ASYNC_AVAILABLE
+    return __cvta_generic_to_shared(generic_ptr);
+#else
+    GGML_UNUSED(generic_ptr);
+    NO_DEVICE_CODE;
+    return 0;
+#endif // CP_ASYNC_AVAILABLE
+}
+
 // Copies data from global to shared memory, cg == cache global.
 // Both the src and dst pointers must be aligned to 16 bytes.
 // Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
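A quick illustration of the unrolling helper that this patch adds to common.cuh: `#pragma unroll` can silently fail once the loop body contains an early `return` or `continue`, so the body is wrapped in a lambda and instantiated once per iteration through template recursion. The sketch below is illustrative only — the `unroll_sketch` name and the host-side `main` are not part of the patch (`ggml_cuda_unroll` itself is `__device__`-only), but the recursion is the same:

// Minimal host-side sketch of the unroll-via-recursion pattern (assumed names).
#include <cstdio>

template <int n>
struct unroll_sketch {
    template <typename Func, typename... Args>
    void operator()(const Func & f, Args... args) const {
        f(n - 1, args...);                 // one concrete copy of the body per n
        unroll_sketch<n - 1>{}(f, args...);
    }
};

template <>
struct unroll_sketch<1> {
    template <typename Func, typename... Args>
    void operator()(const Func & f, Args... args) const {
        f(0, args...);                     // recursion terminates at n == 1
    }
};

int main() {
    int sum = 0;
    auto body = [&](const int i) {
        if (i == 2) {
            return;                        // early exit is fine: each copy is a separate call
        }
        sum += 1 << i;
    };
    unroll_sketch<5>{}(body);              // runs body(4), body(3), ..., body(0)
    std::printf("sum = %d\n", sum);        // 16 + 8 + 2 + 1 = 27
    return 0;
}

After instantiation each `f(i, ...)` is a distinct statement, so per-iteration early exits compile to straight-line code — which is how `flash_attn_ext_f16_load_tile` uses `ggml_cuda_unroll<5>{}(load)` later in this patch.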
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
index ed25646e8..4abd01d79 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
     *dsti = *xi;
 }
 
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t * xi = (const int32_t *) cxi;
+    int32_t * dsti = (int32_t *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -68,6 +75,44 @@
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
+// I32 -> I32 copy kernel, using the same indexing scheme as cpy_f32_f16 above.
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+    const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// Host-side launcher for the I32 copy kernel.
+static void ggml_cpy_i32_i32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q8_0 * dsti = (block_q8_0 *) cdsti;
@@ -592,6 +637,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
     }
+#else
+    GGML_UNUSED(disable_indirection_for_this_node);
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
@@ -631,6 +678,8 @@
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if 
(src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -639,6 +688,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; } +#else + GGML_UNUSED(disable_indirection_for_this_node); #endif } @@ -684,6 +735,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { return (void*) cpy_f32_f16; } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { return (void*) cpy_f32_f16; + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + return (void*) cpy_i32_i32; } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh index 56121705b..b7180d595 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh @@ -516,7 +516,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) { nullptr; } -template // D == head size +template // D == head size __launch_bounds__(D, 1) static __global__ void flash_attn_stream_k_fixup( float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) { @@ -665,13 +665,13 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); GGML_ABORT("fatal error"); } else { - fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); + fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D); fprintf(stderr, "Only f16 is supported.\n"); GGML_ABORT("fatal error"); } } -template +template void launch_fattn( ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared, const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE @@ -691,7 +691,7 @@ void launch_fattn( GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16); GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) && - "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big"); + "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big"); GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding."); @@ -719,6 +719,7 @@ void launch_fattn( size_t nb23 = V->nb[3]; if (need_f16_K && K->type != GGML_TYPE_F16) { + GGML_ASSERT(ggml_is_contiguously_allocated(K)); K_f16.alloc(ggml_nelements(K)); to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type); to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream); @@ -733,6 +734,7 @@ void 
launch_fattn( } if (need_f16_V && V->type != GGML_TYPE_F16) { + GGML_ASSERT(ggml_is_contiguously_allocated(V)); V_f16.alloc(ggml_nelements(V)); to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type); to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream); @@ -752,10 +754,13 @@ void launch_fattn( const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3]; const dim3 block_dim(warp_size, nwarps, 1); + int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy. + CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared)); + dim3 blocks_num; if (stream_k) { // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup. - const int max_blocks = 2*nsm; + const int max_blocks = max_blocks_per_sm*nsm; const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks; const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves); @@ -767,14 +772,11 @@ void launch_fattn( blocks_num.y = 1; blocks_num.z = 1; - dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float)); + dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float)); } else { GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0); const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size. - int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy. - CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared)); - // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave: parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1); @@ -851,19 +853,19 @@ void launch_fattn( if (stream_k) { if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles. - const dim3 block_dim_combine(D, 1, 1); + const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2}; - flash_attn_stream_k_fixup + flash_attn_stream_k_fixup <<>> ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]); } } else if (parallel_blocks > 1) { - const dim3 block_dim_combine(D, 1, 1); + const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z); const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2); - flash_attn_combine_results + flash_attn_combine_results <<>> (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 04804a15c..491780abd 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -13,104 +13,217 @@ typedef tile<16, 16, float> tile_C_KQ_16; typedef tile<16, 4, half2> tile_C_VKQ; typedef tile<16, 8, half2> tile_C_VKQ_16; -template +// Config options for specific head sizes. +// Should not affect results, only speed/register pressure/shared memory use. +// +// nbatch_fa: number of KV rows per softmax rescaling of KQ rowsums and VKQ accumulators. +// nwarps_max: maximum number of warps per CUDA block, up to 8 warps in total can run per SM (given enough shared memory). +// Q_in_reg: whether the Q values should be kept permanently in registers. 
+// nstages_target: targeted number of pipeline stages for cp_async (if available), 0 means synchronous data loading. +// nbatch_K2: number of K half2 values in direction of DKQ to load in parallel. +// nbatch_V2: number of V half2 values in direction of DV to load in parallel. +// nbatch_combine: number of VKQ half2 values in direction of DV to combine in parallel. + +template +struct fattn_mma_f16_config; + +template <> +struct fattn_mma_f16_config< 64, 64> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 32; + static constexpr int nbatch_V2 = 32; + static constexpr int nbatch_combine = 32; +}; + +template <> +struct fattn_mma_f16_config< 80, 80> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 40; + static constexpr int nbatch_V2 = 40; + static constexpr int nbatch_combine = 40; +}; + +template <> +struct fattn_mma_f16_config< 96, 96> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 48; + static constexpr int nbatch_V2 = 48; + static constexpr int nbatch_combine = 48; +}; + +template <> +struct fattn_mma_f16_config<112, 112> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 56; + static constexpr int nbatch_V2 = 56; + static constexpr int nbatch_combine = 56; +}; + +template <> +struct fattn_mma_f16_config<128, 128> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 64; + static constexpr int nbatch_V2 = 64; + static constexpr int nbatch_combine = 64; +}; + +template <> +struct fattn_mma_f16_config<256, 256> { + static constexpr int nbatch_fa = 32; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 128; + static constexpr int nbatch_V2 = 128; + static constexpr int nbatch_combine = 128; +}; + +template <> +struct fattn_mma_f16_config<576, 512> { + static constexpr int nbatch_fa = 32; + static constexpr int nwarps_max = 8; + static constexpr bool Q_in_reg = false; + static constexpr int nstages_target = 1; + static constexpr int nbatch_K2 = 160; + static constexpr int nbatch_V2 = 128; + static constexpr int nbatch_combine = 128; +}; + +// ------------------------------------------------------------------------------------------------------------------ + +template static __device__ __forceinline__ void flash_attn_ext_f16_load_tile( - const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int stride_KV) { - constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts. + const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV) { - // If cp.async is available, load up to the highest power of 2 in D asynchronously: -#ifdef CP_ASYNC_AVAILABLE - static_assert(D >= 64 && D < 512, "bad D"); - constexpr int k0_sync_start = D/2 < 64 ? 
32 : (D/2 < 128 ? 64 : 128); - - const unsigned int tile_KV_32 = __cvta_generic_to_shared(tile_KV); - - constexpr int preload = 64; - constexpr int h2_per_chunk = 16/sizeof(half2); - constexpr int chunks_per_row = k0_sync_start / h2_per_chunk; - constexpr int stride_i = WARP_SIZE / chunks_per_row; -#pragma unroll - for (int i0 = 0; i0 < KQ_per_iter; i0 += nwarps*stride_i) { - const int i = i0 + threadIdx.y*stride_i + (chunks_per_row == WARP_SIZE ? 0 : threadIdx.x / chunks_per_row); - const int k = (chunks_per_row == WARP_SIZE ? threadIdx.x : threadIdx.x % chunks_per_row)*h2_per_chunk; - - cp_async_cg_16(tile_KV_32 + (i*D2_padded + k)*sizeof(half2), KV + i*stride_KV + k); - } -#else - constexpr int k0_sync_start = 0; -#endif // CP_ASYNC_AVAILABLE - static_assert(k0_sync_start % WARP_SIZE == 0, "bad k0_sync_start"); - - // If D is not a power of 2, the rest is loaded synchronously. // K/V data is loaded with decreasing granularity for D for better memory bandwidth. - static_assert(KQ_per_iter % (4*nwarps) == 0, "out of bounds"); -#pragma unroll - for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { - const int k0_start = stride_k == WARP_SIZE ? k0_sync_start : D/2 - (D/2) % (2*stride_k); - const int k0_stop = D/2 - (D/2) % (1*stride_k); - const int stride_i = WARP_SIZE / stride_k; + // The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes. - if (k0_start == k0_stop || k0_stop <= k0_sync_start) { - continue; - } + if (use_cp_async) { + constexpr int preload = 64; + constexpr int h2_per_chunk = 16/sizeof(half2); + const int chunks_per_row = D2 / h2_per_chunk; -#pragma unroll - for (int i0 = 0; i0 < KQ_per_iter; i0 += nwarps*stride_i) { - const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV); -#pragma unroll - for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { - const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + auto load = [&] __device__ (const int n) { + const int stride_k = WARP_SIZE >> n; + const int k0_start = stride_k == WARP_SIZE ? 0 : chunks_per_row - chunks_per_row % (2*stride_k); + const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k); + const int stride_i = WARP_SIZE / stride_k; - tile_KV[i*D2_padded + k] = KV[i*stride_KV + k]; + if (k0_start == k0_stop) { + return; } - } + +#pragma unroll + for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) { + const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + + if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) { + break; + } + +#pragma unroll + for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { + const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + + cp_async_cg_16(tile_KV_32 + i*(stride_tile*sizeof(half2)) + k*16, KV + i*stride_KV + k*h2_per_chunk); + } + } + }; + ggml_cuda_unroll<5>{}(load); + } else { + static_assert(nbatch_fa % (4*nwarps) == 0, "out of bounds"); + auto load = [&] __device__ (const int n) { + const int stride_k = WARP_SIZE >> n; + const int k0_start = stride_k == WARP_SIZE ? 0 : D2 - D2 % (2*stride_k); + const int k0_stop = D2 - D2 % (1*stride_k); + const int stride_i = WARP_SIZE / stride_k; + + if (k0_start == k0_stop) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) { + const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 
0 : threadIdx.x / stride_k); + + if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) { + break; + } + +#pragma unroll + for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { + const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + + tile_KV[i*stride_tile + k] = KV[i*stride_KV + k]; + } + } + }; + ggml_cuda_unroll<3>{}(load); } } -template +template static __device__ __forceinline__ void flash_attn_ext_f16_load_mask( const half2 * const __restrict__ mask_h2, half2 * const __restrict__ tile_mask, const int stride_mask) { - static_assert(KQ_per_iter == 2*WARP_SIZE || KQ_per_iter == WARP_SIZE, "bad KQ_per_iter"); -#ifdef CP_ASYNC_AVAILABLE - constexpr int preload = KQ_per_iter * sizeof(half); - constexpr int cols_per_warp = 8*WARP_SIZE/KQ_per_iter; - constexpr int stride_j = nwarps * cols_per_warp; + static_assert(nbatch_fa == 2*WARP_SIZE || WARP_SIZE % nbatch_fa == 0, "bad KQ_per_iter"); - const unsigned int tile_mask_32 = __cvta_generic_to_shared(tile_mask); + if (use_cp_async) { + constexpr int preload = nbatch_fa >= 32 ? nbatch_fa * sizeof(half) : 64; + constexpr int cols_per_warp = 8*WARP_SIZE/nbatch_fa; + constexpr int stride_j = nwarps * cols_per_warp; + + const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared(tile_mask); +#pragma unroll + for (int j0 = 0; j0 < ncols1; j0 += stride_j) { + const int j = j0 + threadIdx.y*cols_per_warp + + (nbatch_fa == 2*WARP_SIZE ? threadIdx.x / (WARP_SIZE/4) : threadIdx.x / (WARP_SIZE/cols_per_warp)); + + if (j0 + stride_j > ncols1 && j >= ncols1) { + break; + } + + const int i = 4 * (threadIdx.x % (nbatch_fa/8)); + + cp_async_cg_16(tile_mask_32 + j*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half2), mask_h2 + j*stride_mask + i); + } + return; + } + + constexpr int cols_per_warp = 2*WARP_SIZE/nbatch_fa; + constexpr int stride_j = nwarps * cols_per_warp; #pragma unroll for (int j0 = 0; j0 < ncols1; j0 += stride_j) { - const int j = j0 + threadIdx.y*cols_per_warp + - (KQ_per_iter == 2*WARP_SIZE ? threadIdx.x / (WARP_SIZE/4) : threadIdx.x / (WARP_SIZE/8)); + const int j = j0 + threadIdx.y*cols_per_warp + (nbatch_fa == 2*WARP_SIZE ? 0 : threadIdx.x / (WARP_SIZE/cols_per_warp)); if (j0 + stride_j > ncols1 && j >= ncols1) { break; } - const int i = 4 * (KQ_per_iter == 2*WARP_SIZE ? threadIdx.x % (WARP_SIZE/4) : threadIdx.x % (WARP_SIZE/8)); + const int i = nbatch_fa == 2*WARP_SIZE ? threadIdx.x : threadIdx.x % (WARP_SIZE/cols_per_warp); - cp_async_cg_16(tile_mask_32 + j*(KQ_per_iter*sizeof(half) + 16) + i*sizeof(half2), mask_h2 + j*stride_mask + i); + tile_mask[j*(nbatch_fa/2 + 4) + i] = mask_h2[j*stride_mask + i]; } -#else - constexpr int cols_per_warp = 2*WARP_SIZE/KQ_per_iter; - constexpr int stride_j = nwarps * cols_per_warp; -#pragma unroll - for (int j0 = 0; j0 < ncols1; j0 += stride_j) { - const int j = j0 + threadIdx.y*cols_per_warp + (KQ_per_iter == 2*WARP_SIZE ? 0 : threadIdx.x / (WARP_SIZE/2)); - - if (j0 + stride_j > ncols1 && j >= ncols1) { - break; - } - - const int i = KQ_per_iter == 2*WARP_SIZE ? 
threadIdx.x : threadIdx.x % (WARP_SIZE/2); - - tile_mask[j*(KQ_per_iter/2 + 4) + i] = mask_h2[j*stride_mask + i]; - } -#endif // CP_ASYNC_AVAILABLE } -template +template static __device__ __forceinline__ void flash_attn_ext_f16_iter( const float2 * const __restrict__ Q_f2, const half2 * const __restrict__ K_h2, @@ -123,9 +236,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const float logit_softcap, const int ne01, const int ne02, - const int stride_KV, + const int stride_K, + const int stride_V, const int stride_mask, const int jt, + half2 * const __restrict__ tile_Q, half2 * const __restrict__ tile_K, half2 * const __restrict__ tile_V, half2 * const __restrict__ tile_mask, @@ -135,59 +250,107 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( float * const __restrict__ KQ_rowsum, const int kb0) { #ifdef NEW_MMA_AVAILABLE + typedef fattn_mma_f16_config c; + +#ifdef CP_ASYNC_AVAILABLE + constexpr int nstages = c::nstages_target; +#else + constexpr int nstages = 0; +#endif // CP_ASYNC_AVAILABLE + constexpr int cols_per_warp = ntiles * tile_B::I; constexpr int cols_per_thread = ntiles == 1 ? 2 : ntiles; constexpr int np = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column. - constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts. - const int k_VKQ_0 = kb0 * KQ_per_iter; - tile_C_KQ KQ_C[KQ_per_iter/(np*tile_C_KQ::I) * ntiles]; + constexpr int stride_tile_Q = DKQ/2 + 4; + constexpr int stride_tile_K = c::nbatch_K2 + 4; + constexpr int stride_tile_V = c::nbatch_V2 + 4; + + const int k_VKQ_0 = kb0 * c::nbatch_fa; + tile_C_KQ KQ_C[c::nbatch_fa/(np*tile_C_KQ::I) * ntiles]; // Use wide variants of tiles if ntiles >= 2. tile_B_16 * Q_B_16 = (tile_B_16 *) Q_B; tile_C_VKQ_16 * VKQ_C_16 = (tile_C_VKQ_16 *) VKQ_C; tile_C_KQ_16 * KQ_C_16 = (tile_C_KQ_16 *) KQ_C; -#ifdef CP_ASYNC_AVAILABLE - cp_async_wait_all(); - __syncthreads(); - flash_attn_ext_f16_load_tile(V_h2 + k_VKQ_0*stride_KV, tile_V, stride_KV); -#else - if (ncols2 > 1 || mask_h2) { - flash_attn_ext_f16_load_mask(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask); - } - flash_attn_ext_f16_load_tile(K_h2 + k_VKQ_0*stride_KV, tile_K, stride_KV); - __syncthreads(); -#endif // CP_ASYNC_AVAILABLE - - // Calculate tile of KQ: -#pragma unroll - for (int i_KQ_00 = 0; i_KQ_00 < KQ_per_iter; i_KQ_00 += np*tile_A::I) { - const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*tile_A::I; -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += tile_A::J) { - tile_A K_A; - load_ldmatrix(K_A, tile_K + i_KQ_0*D2_padded + k_KQ_0, D2_padded); - if (ntiles == 1) { - mma(KQ_C[i_KQ_00/(np*tile_A::I)], K_A, Q_B[k_KQ_0/tile_A::J]); - } else { -#pragma unroll - for (int t = 0; t < ntiles/2; ++t) { - // Wide version of KQ_C is column-major => swap A and B. - mma(KQ_C_16[i_KQ_00/(np*tile_A::I) * ntiles/2 + t], Q_B_16[k_KQ_0/tile_A::J * ntiles/2 + t], K_A); - } - } + if constexpr (nstages > 1) { + static_assert(c::nbatch_K2 == DKQ/2, "batching not implemented for multi stage loading"); + constexpr bool use_cp_async = true; + cp_async_wait_all(); + __syncthreads(); + flash_attn_ext_f16_load_tile + (V_h2 + k_VKQ_0*stride_V, tile_V, c::nbatch_V2, stride_V); + } else { + constexpr bool use_cp_async = nstages == 1; + if (ncols2 > 1 || mask_h2) { + flash_attn_ext_f16_load_mask(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask); } } -#ifndef CP_ASYNC_AVAILABLE - __syncthreads(); // Only needed if tile_K == tile_V. 
-#endif // CP_ASYNC_AVAILABLE +#pragma unroll + for (int k0_start = 0; k0_start < DKQ/2; k0_start += c::nbatch_K2) { + const int k0_stop = k0_start + c::nbatch_K2 < DKQ/2 ? k0_start + c::nbatch_K2 : DKQ/2; + const int k0_diff = k0_stop - k0_start; + + if (nstages <= 1) { + constexpr bool use_cp_async = nstages == 1; + flash_attn_ext_f16_load_tile + (K_h2 + k_VKQ_0*stride_K + k0_start, tile_K, k0_diff, stride_K); + if (use_cp_async) { + cp_async_wait_all(); + } + __syncthreads(); + } + + // Calculate tile of KQ: + if constexpr (c::Q_in_reg) { +#pragma unroll + for (int i_KQ_00 = 0; i_KQ_00 < c::nbatch_fa; i_KQ_00 += np*tile_A::I) { + const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*tile_A::I; +#pragma unroll + for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += tile_A::J) { + tile_A K_A; + load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K); + if (ntiles == 1) { + mma(KQ_C[i_KQ_00/(np*tile_A::I)], K_A, Q_B[k_KQ_0/tile_A::J]); + } else { +#pragma unroll + for (int t = 0; t < ntiles/2; ++t) { + // Wide version of KQ_C is column-major => swap A and B. + mma(KQ_C_16[i_KQ_00/(np*tile_A::I) * ntiles/2 + t], Q_B_16[k_KQ_0/tile_A::J * ntiles/2 + t], K_A); + } + } + } + } + } else { + static_assert(ntiles == 2, "ntiles != 2 not implemented"); +#pragma unroll + for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += tile_A::J) { + load_ldmatrix(Q_B_16[0], tile_Q + (threadIdx.y / np)*(tile_B_16::I*stride_tile_Q) + k_KQ_0, stride_tile_Q); + +#pragma unroll + for (int i_KQ_00 = 0; i_KQ_00 < c::nbatch_fa; i_KQ_00 += np*tile_A::I) { + const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*tile_A::I; + + tile_A K_A; + load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K); + + // Wide version of KQ_C is column-major => swap A and B. + mma(KQ_C_16[i_KQ_00/(np*tile_A::I)], Q_B_16[0], K_A); + } + } + } + + if (nstages <= 1) { + __syncthreads(); // Only needed if tile_K == tile_V. + } + } if (use_logit_softcap) { - static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int i = 0; i < KQ_per_iter/(np*tile_C_KQ::I) * ntiles; ++i) { + for (int i = 0; i < c::nbatch_fa/(np*tile_C_KQ::I) * ntiles; ++i) { #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { KQ_C[i].x[l] = logit_softcap*tanhf(KQ_C[i].x[l]); @@ -205,7 +368,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( if (ntiles == 1) { if (ncols2 > 1 || mask_h2) { #pragma unroll - for (int i00 = 0; i00 < KQ_per_iter; i00 += np*tile_C_KQ::I) { + for (int i00 = 0; i00 < c::nbatch_fa; i00 += np*tile_C_KQ::I) { const int i0 = i00 + (threadIdx.y % np)*tile_C_KQ::I; #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { @@ -213,16 +376,16 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const int j = ((threadIdx.y / np)*tile_C_KQ::J + tile_C_KQ::get_j(l)) / ncols2; KQ_C[i00/(np*tile_C_KQ::I)].x[l] += slope * - __half2float(((const half *) tile_mask)[j*(KQ_per_iter + 8) + i]); + __half2float(((const half *) tile_mask)[j*(c::nbatch_fa + 8) + i]); } } } // Calculate softmax for each KQ column using the current max. value. // The divisor is stored in KQ_rowsum and will be applied at the end. 
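Editor's note: the use_logit_softcap branch above implements Gemma-2-style logit softcapping, which bounds each attention logit to (-cap, cap). Only the multiplication by the cap appears in the kernel; presumably the division by the cap has already been folded into the Q scale on the host side. A minimal scalar sketch of the intended math (function name hypothetical):

#include <cmath>

// Reference semantics of logit softcapping: the raw logit s is squashed into
// (-cap, cap) while staying approximately linear for |s| << cap.
static float softcap_logit(float s, float cap) {
    return cap * std::tanh(s / cap);
}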
- static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ::I); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ::I); ++k) { #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k].x[l]); @@ -238,10 +401,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } - static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); - + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ::I); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ::I); ++k) { #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { KQ_C[k].x[l] = expf(KQ_C[k].x[l] - KQ_max_new[l % 2]); @@ -252,7 +414,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } else { // ntiles > 1 if (ncols2 > 1 || mask_h2) { #pragma unroll - for (int i00 = 0; i00 < KQ_per_iter; i00 += np*tile_C_KQ_16::J) { + for (int i00 = 0; i00 < c::nbatch_fa; i00 += np*tile_C_KQ_16::J) { const int i0 = i00 + (threadIdx.y % np)*tile_C_KQ_16::J; #pragma unroll for (int t = 0; t < ntiles/2; ++t) { @@ -261,7 +423,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const int i = (i0 + tile_C_KQ_16::get_j(l0)) / 2; const int j = ((threadIdx.y / np)*cols_per_warp + t*tile_C_KQ_16::I + tile_C_KQ_16::get_i(l0)) / ncols2; - const float2 tmp = __half22float2(tile_mask[j*(KQ_per_iter/2 + 4) + i]); + const float2 tmp = __half22float2(tile_mask[j*(c::nbatch_fa/2 + 4) + i]); const int KQ_index = i00/(np*tile_C_KQ_16::J) * ntiles/2 + t; KQ_C_16[KQ_index].x[l0 + 0] += slope*tmp.x; KQ_C_16[KQ_index].x[l0 + 1] += slope*tmp.y; @@ -272,9 +434,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( // Calculate softmax for each KQ column using the current max. value. // The divisor is stored in KQ_rowsum and will be applied at the end. 
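Editor's note: the KQ_max_new / KQ_rowsum bookkeeping in both the ntiles == 1 and ntiles > 1 branches is the standard online-softmax recurrence: scores are consumed in chunks, the running maximum may only grow, and previously accumulated sums (and, in the kernel, the VKQ accumulators) are rescaled by exp(m_old - m_new) whenever it does. A hedged scalar sketch of that recurrence, with hypothetical names and arbitrary chunking:

#include <cmath>
#include <vector>

// Scalar model of the streaming softmax used above: one running maximum m and
// one running denominator l per column; division by l is deferred to the end.
static float online_softmax_denominator(const std::vector<float> & s, size_t chunk) {
    float m = -1e30f; // running maximum over all scores seen so far
    float l = 0.0f;   // running sum of exp(s_i - m)
    for (size_t i0 = 0; i0 < s.size(); i0 += chunk) {
        float m_new = m;
        for (size_t i = i0; i < i0 + chunk && i < s.size(); ++i) {
            m_new = std::fmax(m_new, s[i]);
        }
        l *= std::exp(m - m_new); // rescale old partial sums (KQ_max_scale in the kernel)
        for (size_t i = i0; i < i0 + chunk && i < s.size(); ++i) {
            l += std::exp(s[i] - m_new);
        }
        m = m_new;
    }
    return l;
}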
- static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ_16::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ_16::J); ++k) { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { #pragma unroll @@ -294,9 +456,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } - static_assert(KQ_per_iter % (np*tile_C_KQ_16::J) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ_16::J) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ_16::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ_16::J); ++k) { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { #pragma unroll @@ -325,7 +487,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( if (ntiles == 1) { const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]); #pragma unroll - for (int i = 0; i < D/tile_C_VKQ::I; ++i) { + for (int i = 0; i < DV/tile_C_VKQ::I; ++i) { #pragma unroll for (int l = 0; l < tile_C_VKQ::ne; ++l) { VKQ_C[i].x[l] *= KQ_max_scale_h2; @@ -336,7 +498,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int col = 0; col < cols_per_thread; ++col) { const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]); #pragma unroll - for (int i = 0; i < D/tile_C_VKQ_16::J; ++i) { + for (int i = 0; i < DV/tile_C_VKQ_16::J; ++i) { #pragma unroll for (int l0 = 0; l0 < tile_C_VKQ_16::ne; l0 += 2) { VKQ_C_16[i*ntiles/2 + col/2].x[l0 + col % 2] *= KQ_max_scale_h2; @@ -347,16 +509,16 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } // Convert KQ C tiles into B tiles for VKQ calculation: - tile_B B[KQ_per_iter/(np*2*tile_B::J) * ntiles]; + tile_B B[c::nbatch_fa/(np*2*tile_B::J) * ntiles]; tile_B_16 * B_16 = (tile_B_16 *) B; - static_assert(KQ_per_iter % (np*2*tile_B::J) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*2*tile_B::J) == 0, "bad loop size"); if (ntiles == 1) { #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*2*tile_B::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*2*tile_B::J); ++k) { B[k] = get_transposed(get_half2(KQ_C[k])); } } else { - for (int k = 0; k < KQ_per_iter/(np*2*tile_B_16::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*2*tile_B_16::J); ++k) { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { B_16[k*ntiles/2 + t] = get_half2(KQ_C_16[k*ntiles/2 + t]); @@ -364,52 +526,67 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } -#ifdef CP_ASYNC_AVAILABLE - // Preload K tile for next iteration: - cp_async_wait_all(); - __syncthreads(); - if (!last_iter) { - if (ncols2 > 1 || mask_h2) { - flash_attn_ext_f16_load_mask(mask_h2 + (k_VKQ_0 + KQ_per_iter)/2, tile_mask, stride_mask); + if (nstages > 1) { + // Preload K tile for next iteration: + constexpr bool use_cp_async = true; + cp_async_wait_all(); + __syncthreads(); + if (!last_iter) { + if (ncols2 > 1 || mask_h2) { + flash_attn_ext_f16_load_mask + (mask_h2 + (k_VKQ_0 + c::nbatch_fa)/2, tile_mask, stride_mask); + } + flash_attn_ext_f16_load_tile + (K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, c::nbatch_K2, stride_K); } - flash_attn_ext_f16_load_tile(K_h2 + (k_VKQ_0 + KQ_per_iter)*stride_KV, tile_K, stride_KV); } -#else - flash_attn_ext_f16_load_tile(V_h2 + k_VKQ_0*stride_KV, tile_V, stride_KV); - __syncthreads(); -#endif // CP_ASYNC_AVAILABLE - // Calculate VKQ tile: #pragma unroll - for (int i_VKQ_0 = 0; 
i_VKQ_0 < D; i_VKQ_0 += tile_C_VKQ::I) { - static_assert((KQ_per_iter/2) % (np*tile_A::J) == 0, "bad loop size"); -#pragma unroll - for (int k00 = 0; k00 < KQ_per_iter/2; k00 += np*tile_A::J) { - const int k0 = k00 + (threadIdx.y % np)*tile_A::J; + for (int i0_start = 0; i0_start < DV; i0_start += 2*c::nbatch_V2) { + const int i0_stop = i0_start + 2*c::nbatch_V2 < DV ? i0_start + 2*c::nbatch_V2 : DV; + const int i0_diff = i0_stop - i0_start; - tile_A A; - load_ldmatrix_trans(A, tile_V + 2*k0*D2_padded + i_VKQ_0/2, D2_padded); - if (ntiles == 1) { - mma(VKQ_C[i_VKQ_0/tile_C_VKQ::I], A, B[k00/(np*tile_A::J)]); - } else { + if (nstages <= 1) { + constexpr bool use_cp_async = nstages == 1; + flash_attn_ext_f16_load_tile + (V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V); + if (use_cp_async) { + cp_async_wait_all(); + } + __syncthreads(); + } + + // Calculate VKQ tile: #pragma unroll - for (int t = 0; t < ntiles/2; ++t) { - // Wide version of VKQ_C is column-major => swap A and B. - mma(VKQ_C_16[i_VKQ_0/tile_C_VKQ::I * ntiles/2 + t], B_16[k00/(np*tile_A::J) * ntiles/2 + t], A); + for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += tile_C_VKQ::I) { + static_assert((c::nbatch_fa/2) % (np*tile_A::J) == 0, "bad loop size"); +#pragma unroll + for (int k00 = 0; k00 < c::nbatch_fa/2; k00 += np*tile_A::J) { + const int k0 = k00 + (threadIdx.y % np)*tile_A::J; + + tile_A A; + load_ldmatrix_trans(A, tile_V + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V); + if (ntiles == 1) { + mma(VKQ_C[i_VKQ_0/tile_C_VKQ::I], A, B[k00/(np*tile_A::J)]); + } else { +#pragma unroll + for (int t = 0; t < ntiles/2; ++t) { + // Wide version of VKQ_C is column-major => swap A and B. + mma(VKQ_C_16[i_VKQ_0/tile_C_VKQ::I * ntiles/2 + t], B_16[k00/(np*tile_A::J) * ntiles/2 + t], A); + } } } } + + if (nstages <= 1) { + __syncthreads(); // Only needed if tile_K == tile_V. + } } - -#ifndef CP_ASYNC_AVAILABLE - __syncthreads(); // Only needed if tile_K == tile_V. -#endif // CP_ASYNC_AVAILABLE - #else GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2); GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup); GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV); + GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K); GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K); GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B); @@ -419,7 +596,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( #endif // NEW_MMA_AVAILABLE } -template +template static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const float2 * const __restrict__ Q_f2, const half2 * const __restrict__ K_h2, @@ -434,7 +611,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int ne02, const int stride_Q1, const int stride_Q2, - const int stride_KV, + const int stride_K, + const int stride_V, const int stride_mask, const int jt, const int kb0_start, @@ -442,6 +620,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( #ifdef NEW_MMA_AVAILABLE //In this kernel Q, K, V are matrices while i, j, k are matrix indices. 
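Editor's note: configurations with nstages_target == 2 double-buffer the K/V tiles: while the MMA instructions consume the tile in one shared-memory buffer, cp.async instructions already stream the next tile into the other, with cp_async_wait_all() and __syncthreads() fencing the handover. The sketch below shows the same two-stage pattern using CUDA's pipeline primitives rather than the kernel's own cp_async_cg_16 PTX wrapper; the buffer layout and sizes are illustrative only, and both pointers are assumed 16-byte aligned with TILE_BYTES a multiple of 16.

#include <cuda_pipeline.h>

// Two-stage (double-buffered) streaming of tiles through shared memory.
// `smem` must point to 2*TILE_BYTES of dynamic shared memory.
template <int TILE_BYTES>
__device__ void process_stream(const char * __restrict__ src, int ntiles,
                               char * __restrict__ smem) {
    // Kick off the asynchronous copy of tile 0 before entering the loop.
    for (int b = threadIdx.x*16; b < TILE_BYTES; b += blockDim.x*16) {
        __pipeline_memcpy_async(smem + b, src + b, 16);
    }
    __pipeline_commit();

    for (int t = 0; t < ntiles; ++t) {
        char * cur = smem + (t % 2)*TILE_BYTES;
        char * nxt = smem + ((t + 1) % 2)*TILE_BYTES;

        __pipeline_wait_prior(0); // copy of tile t has landed in `cur`
        __syncthreads();

        if (t + 1 < ntiles) { // start fetching tile t+1 while tile t is used
            const char * src_nxt = src + (size_t)(t + 1)*TILE_BYTES;
            for (int b = threadIdx.x*16; b < TILE_BYTES; b += blockDim.x*16) {
                __pipeline_memcpy_async(nxt + b, src_nxt + b, 16);
            }
            __pipeline_commit();
        }

        // ... compute on `cur` here (the KQ/VKQ matrix multiplications in the kernel) ...
        __syncthreads();
    }
}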
+ typedef fattn_mma_f16_config c; + +#ifdef CP_ASYNC_AVAILABLE + constexpr int nstages = c::nstages_target; +#else + constexpr int nstages = 0; +#endif // CP_ASYNC_AVAILABLE + constexpr int ncols = ncols1 * ncols2; constexpr int cols_per_warp = ntiles * tile_B::I; constexpr int cols_per_thread = ntiles == 1 ? 2 : ntiles; @@ -449,22 +635,19 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( static_assert(nwarps * (cols_per_warp/ncols2) % ncols1 == 0, "bad nwarps"); - static_assert(D % nwarps == 0, "bad D"); - static_assert(KQ_per_iter % nwarps == 0, "bad KQ_per_iter"); + constexpr int stride_tile_Q = DKQ/2 + 4; + constexpr int stride_tile_K = c::nbatch_K2 + 4; + constexpr int stride_tile_V = c::nbatch_V2 + 4; - constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts. + constexpr int stride_tile_KV_max = stride_tile_K > stride_tile_V ? stride_tile_K : stride_tile_V; - // Temporary shared buffer for loading K/V data with KQ_per_iter*D logical elements: - extern __shared__ half2 tile_K[]; -#ifdef CP_ASYNC_AVAILABLE - half2 * tile_V = tile_K + KQ_per_iter*D2_padded; -#else - half2 * tile_V = tile_K; -#endif // CP_ASYNC_AVAILABLE - half2 * tile_mask = tile_V + KQ_per_iter*D2_padded; + extern __shared__ half2 tile_Q[]; + half2 * tile_K = c::Q_in_reg ? tile_Q : tile_Q + ncols * stride_tile_Q; + half2 * tile_V = nstages > 1 ? tile_K + c::nbatch_fa * stride_tile_K : tile_K; + half2 * tile_mask = nstages > 1 ? tile_V + c::nbatch_fa * stride_tile_V : tile_V + c::nbatch_fa * stride_tile_KV_max; - tile_B Q_B[D/(2*tile_B::J) * ntiles]; - tile_C_VKQ VKQ_C[D/tile_C_VKQ::I * ntiles]; + tile_B Q_B[(c::Q_in_reg ? DKQ/(2*tile_B::J) : 1) * ntiles]; + tile_C_VKQ VKQ_C[DV/tile_C_VKQ::I * ntiles]; tile_B_16 * Q_B_16 = (tile_B_16 *) Q_B; tile_C_VKQ_16 * VKQ_C_16 = (tile_C_VKQ_16 *) VKQ_C; @@ -476,13 +659,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( KQ_max[col] = -FLT_MAX/2.0f; } - // Temporarily load Q data into tile_K, will be loaded into registers afterwards. + // Load Q data into tile_Q, either temporarily or permanently. + // Q in registers is faster, but register pressure is the biggest bottleneck. // The loading is done with decreasing granularity for D for better memory bandwidth. const half2 scale_h2 = make_half2(scale, scale); #pragma unroll for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { - const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k); - const int k0_stop = D/2 - (D/2) % (1*stride_k); + const int k0_start = stride_k == WARP_SIZE ? 0 : DKQ/2 - (DKQ/2) % (2*stride_k); + const int k0_stop = DKQ/2 - (DKQ/2) % (1*stride_k); const int stride_jc = WARP_SIZE / stride_k; if (k0_start == k0_stop) { @@ -506,14 +690,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); const float2 tmp = Q_f2[(jt*ncols1 + j)*stride_Q1 + c*stride_Q2 + k]; - tile_K[jc*D2_padded + k] = scale_h2 * make_half2(tmp.x, tmp.y); + tile_Q[jc*stride_tile_Q + k] = scale_h2 * make_half2(tmp.x, tmp.y); } } else { #pragma unroll for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { const int k = k0 + (stride_k == WARP_SIZE ? 
threadIdx.x : threadIdx.x % stride_k); - tile_K[jc*D2_padded + k] = make_half2(0.0f, 0.0f); + tile_Q[jc*stride_tile_Q + k] = make_half2(0.0f, 0.0f); } } } @@ -521,18 +705,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( __syncthreads(); - { + if (c::Q_in_reg) { const int j0 = (threadIdx.y / np) * cols_per_warp; #pragma unroll - for (int k0 = 0; k0 < D/2; k0 += tile_B::J) { + for (int k0 = 0; k0 < DKQ/2; k0 += tile_B::J) { if (ntiles == 1) { - load_ldmatrix(Q_B[k0/tile_B::J], tile_K + j0*D2_padded + k0, D2_padded); + load_ldmatrix(Q_B[k0/tile_B::J], tile_Q + j0*stride_tile_Q + k0, stride_tile_Q); } else { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { load_ldmatrix(Q_B_16[k0/tile_B_16::J * ntiles/2 + t], - tile_K + (j0 + t*tile_B_16::I)*D2_padded + k0, D2_padded); + tile_Q + (j0 + t*tile_B_16::I)*stride_tile_Q + k0, stride_tile_Q); } } } @@ -540,35 +724,37 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( __syncthreads(); - // Preload mask and K data for first iteration when using cp_async: -#ifdef CP_ASYNC_AVAILABLE - if (ncols2 > 1 || mask_h2) { - flash_attn_ext_f16_load_mask(mask_h2 + kb0_start*KQ_per_iter/2, tile_mask, stride_mask); + // Preload mask and K data for first iteration when using cp_async with multiple stages: + if constexpr (nstages > 1) { + static_assert(c::nbatch_K2 == DKQ/2, "batching not implemented for multi-stage pipeline"); + constexpr bool use_cp_async = true; + if (ncols2 > 1 || mask_h2) { + flash_attn_ext_f16_load_mask + (mask_h2 + kb0_start*c::nbatch_fa/2, tile_mask, stride_mask); + } + flash_attn_ext_f16_load_tile + (K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, c::nbatch_K2, stride_K); } - flash_attn_ext_f16_load_tile(K_h2 + kb0_start*KQ_per_iter*stride_KV, tile_K, stride_KV); -#endif // CP_ASYNC_AVAILABLE // Iterate over ne11 == previous tokens: for (int kb0 = kb0_start; kb0 < kb0_stop-1; ++kb0) { constexpr bool last_iter = false; - flash_attn_ext_f16_iter + flash_attn_ext_f16_iter (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap, - ne01, ne02, stride_KV, stride_mask, jt, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); + ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); } { // kb0_start is always < kb0_stop so the last iter can be executed unconditionally. constexpr bool last_iter = true; - flash_attn_ext_f16_iter + flash_attn_ext_f16_iter (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap, - ne01, ne02, stride_KV, stride_mask, jt, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1); + ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1); } - // With cp_async there is no __syncthreads at the end of the iter, + // With multi-stage loading there is no __syncthreads at the end of the iter, // there can be a race condition on shared memory access for combining/writing back results. -#ifdef CP_ASYNC_AVAILABLE - if (nwarps*cols_per_warp > KQ_per_iter) { + if (nstages > 1 && nwarps*cols_per_warp > c::nbatch_fa) { __syncthreads(); } -#endif // CP_ASYNC_AVAILABLE // Finally, sum up partial KQ rowsums. // The partial sums are spread across 8/4 threads each, does not need full reduce. @@ -584,38 +770,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } } - // Write VKQ accumulators to shared memory in column-major format. 
- // It's faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM. - // Also for np > 1 the combination is done via these values in shared memory. - if (ntiles == 1) { - const int jc_cwd = threadIdx.y*tile_B::I + tile_B::get_i(-1); // jc combine write data -#pragma unroll - for (int k0 = 0; k0 < D/2; k0 += tile_B::J) { - const tile_B B = get_transposed(VKQ_C[k0/tile_B::J]); // Conversion of C to B matrix puts it in column-major format. + // Combine VKQ accumulator values if np > 1. + // It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM. + // So also write VKQ accumulators to shared memory in column-major format if np == 1. -#pragma unroll - for (int l = 0; l < tile_B::ne; ++l) { - const int k = k0 + tile_B::get_j(l); - - tile_K[jc_cwd*D2_padded + k] = B.x[l]; - } - } - } else { -#pragma unroll - for (int t = 0; t < ntiles/2; ++t) { - const int j0 = threadIdx.y*cols_per_warp + t*tile_C_VKQ_16::I; -#pragma unroll - for (int k0 = 0; k0 < D/2; k0 += tile_C_VKQ_16::J) { -#pragma unroll - for (int l = 0; l < tile_C_VKQ_16::ne; ++l) { - const int j = j0 + tile_C_VKQ_16::get_i(l); - const int k = k0 + tile_C_VKQ_16::get_j(l); - - tile_K[j*D2_padded + k] = VKQ_C_16[k0/tile_C_VKQ_16::J * ntiles/2 + t].x[l]; - } - } - } - } + constexpr int nbatch_combine = c::Q_in_reg ? DV/2 : DV/4; + constexpr int tile_stride = nbatch_combine + 4; + static_assert((DV/2) % nbatch_combine == 0, "bad nbatch_combine"); if constexpr (ntiles == 1) { const int jc_cwmo = (threadIdx.x % (2*tile_C_VKQ::J)) / tile_C_VKQ::J; // jc combine write meta offset @@ -624,7 +785,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( if (((!needs_fixup && !is_fixup) || np > 1) && threadIdx.x < 2*tile_C_VKQ::J) { // Use the 16 bytes of padding in each row to store the meta data: KQ max, KQ rowsum, KQ max scale. - ((float2 *) tile_K)[jc_cwm*(D2_padded/2) + D/4] = KQ_cmr; + ((float2 *) tile_Q)[jc_cwm*(tile_stride/2) + nbatch_combine/2] = KQ_cmr; } __syncthreads(); @@ -649,7 +810,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( if (((!needs_fixup && !is_fixup) || np > 1) && (ntiles == 4 || threadIdx.x % 4 < cols_per_thread)) { // Use the 16 bytes of padding in each row to store the meta data: KQ max, KQ rowsum, KQ max scale. - ((float2 *) tile_K)[jc_cwm*(D2_padded/2) + D/4] = KQ_cmr; + ((float2 *) tile_Q)[jc_cwm*(tile_stride/2) + nbatch_combine/2] = KQ_cmr; } __syncthreads(); @@ -676,11 +837,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( constexpr int nmeta = np*cols_per_warp >= WARP_SIZE ? np*cols_per_warp/WARP_SIZE : 1; const int jc_meta = threadIdx.y*cols_per_warp + (np*cols_per_warp < WARP_SIZE ? threadIdx.x % (np*cols_per_warp) : threadIdx.x); - float2 * const meta_ptr = ((float2 *) tile_K) + jc_meta*(D2_padded/2) + D/4; + float2 * const meta_ptr = ((float2 *) tile_Q) + jc_meta*(tile_stride/2) + nbatch_combine/2; float2 meta[nmeta]; #pragma unroll for (int imeta = 0; imeta < nmeta; ++imeta) { - meta[imeta] = meta_ptr[imeta * WARP_SIZE * D2_padded/2]; + meta[imeta] = meta_ptr[imeta * WARP_SIZE * tile_stride/2]; } float KQ_cmn = meta[0].x; // KQ combine max new, max between all parallel warps. 
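Editor's note: the meta values written above are combined across the np parallel warps of a Q column with XOR-shuffle butterflies (the loops immediately below). Stripped of the cols_per_warp bounds, this is the classic warp-level reduction; a self-contained sketch over a full warp:

// Butterfly max-reduction: after log2(32) = 5 XOR-shuffle steps every lane
// holds the maximum of all 32 inputs. The kernel runs the equivalent loop
// only down to `cols_per_warp` so that lanes belonging to different Q columns
// never mix their values.
__device__ float warp_reduce_max(float v) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        v = fmaxf(v, __shfl_xor_sync(0xFFFFFFFF, v, offset, 32));
    }
    return v;
}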
@@ -690,10 +851,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } #pragma unroll for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) { - if (offset >= WARP_SIZE) { - continue; + if (offset < WARP_SIZE) { + KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE)); } - KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE)); } float KQ_cms[nmeta]; // KQ combine max scale per warp. @@ -709,18 +869,19 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } #pragma unroll for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) { - if (offset >= WARP_SIZE) { - continue; + if (offset < WARP_SIZE) { + KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE); } - KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE); } + __syncthreads(); + // Write back combined meta data: #pragma unroll for (int imeta = 0; imeta < nmeta; ++imeta) { if (np*cols_per_warp >= WARP_SIZE || threadIdx.x < np*cols_per_warp) { // Combined KQ max scale + rowsum. - meta_ptr[imeta * WARP_SIZE * D2_padded/2] = make_float2(KQ_cms[imeta], KQ_crs); + meta_ptr[imeta * WARP_SIZE * tile_stride/2] = make_float2(KQ_cms[imeta], KQ_crs); } } @@ -734,90 +895,125 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols; dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs); } - } - - if (np > 1) { + } else if (np > 1) { + // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch. + // Therefore, all other warps also need to execute a __syncthreads(). + // Otherwise the points at which warps synchronize with each other would become misaligned. __syncthreads(); } - if (np == 1 || threadIdx.y % np == 0) { - // The first 2*2*gridDim.x*ncols floats in dstk_fixup are for storing max. values and row sums. - // The values after that are for the partial results of the individual blocks. - float2 * dstk_fixup_data = dstk_fixup + gridDim.x*(2*ncols) + blockIdx.x*(ncols*(D/2)); +#pragma unroll + for (int k00 = 0; k00 < DV/2; k00 += nbatch_combine) { + if (ntiles == 1) { + const int jc_cwd = threadIdx.y*tile_B::I + tile_B::get_i(-1); // jc combine write data +#pragma unroll + for (int k0 = 0; k0 < nbatch_combine; k0 += tile_B::J) { + const tile_B B = get_transposed(VKQ_C[(k00 + k0)/tile_B::J]); // Conversion of C to B matrix puts it in column-major format. #pragma unroll - for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { - const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k); - const int k0_stop = D/2 - (D/2) % (1*stride_k); - const int stride_jc = WARP_SIZE / stride_k; + for (int l = 0; l < tile_B::ne; ++l) { + const int k = k0 + tile_B::get_j(l); - if (k0_start == k0_stop) { - continue; + tile_Q[jc_cwd*tile_stride + k] = B.x[l]; + } } - + } else { #pragma unroll - for (int jc0_dst = 0; jc0_dst < ncols; jc0_dst += (nwarps/np)*stride_jc) { - const int jc_dst = jc0_dst + (threadIdx.y/np)*stride_jc + (stride_k == WARP_SIZE ? 
0 : threadIdx.x / stride_k); - - if (jc0_dst + (nwarps/np)*stride_jc > ncols && jc_dst >= ncols) { - break; - } - - const int jc_tile_K = (jc_dst/cols_per_warp)*(np*cols_per_warp) + jc_dst % cols_per_warp; - - const int j_dst = jc_dst / ncols2; - const int c_dst = jc_dst % ncols2; - - if (!is_fixup && jt*ncols1 + j_dst >= ne01) { - continue; - } - - const float * meta_j = (const float *) tile_K + jc_tile_K*D2_padded + D/2; + for (int t = 0; t < ntiles/2; ++t) { + const int j0 = threadIdx.y*cols_per_warp + t*tile_C_VKQ_16::I; #pragma unroll - for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { - const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); - - float2 dstk_val = make_float2(0.0f, 0.0f); + for (int k0 = 0; k0 < nbatch_combine; k0 += tile_C_VKQ_16::J) { #pragma unroll - for (int ip = 0; ip < np; ++ip) { - const float KQ_crs = np == 1 ? 1.0f : meta_j[ip*cols_per_warp * D2_padded + 0]; - const float2 dstk_val_add = __half22float2(tile_K[(jc_tile_K + ip*cols_per_warp) * D2_padded + k]); - dstk_val.x += dstk_val_add.x*KQ_crs; - dstk_val.y += dstk_val_add.y*KQ_crs; - } + for (int l = 0; l < tile_C_VKQ_16::ne; ++l) { + const int j = j0 + tile_C_VKQ_16::get_i(l); + const int k = k0 + tile_C_VKQ_16::get_j(l); - if (!needs_fixup && !is_fixup) { - const float KQ_rowsum_j = meta_j[1]; - dstk_val.x /= KQ_rowsum_j; - dstk_val.y /= KQ_rowsum_j; - } - - if (is_fixup) { - dstk_fixup_data[jc_dst*(D/2) + k] = dstk_val; - } else { - dstk[((jt*ncols1 + j_dst)*ne02 + c_dst)*(D/2) + k] = dstk_val; + tile_Q[j*tile_stride + k] = VKQ_C_16[(k00 + k0)/tile_C_VKQ_16::J * ntiles/2 + t].x[l]; } } } } - } - if (np > 1) { __syncthreads(); + + if (np == 1 || threadIdx.y % np == 0) { + // The first 2*2*gridDim.x*ncols floats in dstk_fixup are for storing max. values and row sums. + // The values after that are for the partial results of the individual blocks. + float2 * dstk_fixup_data = dstk_fixup + gridDim.x*(2*ncols) + blockIdx.x*(ncols*(DV/2)); + +#pragma unroll + for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { + const int k0_start = stride_k == WARP_SIZE ? 0 : nbatch_combine - nbatch_combine % (2*stride_k); + const int k0_stop = nbatch_combine - nbatch_combine % (1*stride_k); + const int stride_jc = WARP_SIZE / stride_k; + + if (k0_start == k0_stop) { + continue; + } + +#pragma unroll + for (int jc0_dst = 0; jc0_dst < ncols; jc0_dst += (nwarps/np)*stride_jc) { + const int jc_dst = jc0_dst + (threadIdx.y/np)*stride_jc + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + + if (jc0_dst + (nwarps/np)*stride_jc > ncols && jc_dst >= ncols) { + break; + } + + const int jc_tile_K = (jc_dst/cols_per_warp)*(np*cols_per_warp) + jc_dst % cols_per_warp; + + const int j_dst = jc_dst / ncols2; + const int c_dst = jc_dst % ncols2; + + if (!is_fixup && jt*ncols1 + j_dst >= ne01) { + continue; + } + + const float * meta_j = (const float *) tile_Q + jc_tile_K*tile_stride + nbatch_combine; +#pragma unroll + for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { + const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + + float2 dstk_val = make_float2(0.0f, 0.0f); +#pragma unroll + for (int ip = 0; ip < np; ++ip) { + const float KQ_crs = np == 1 ? 
1.0f : meta_j[ip*cols_per_warp * tile_stride + 0]; + const float2 dstk_val_add = __half22float2(tile_Q[(jc_tile_K + ip*cols_per_warp) * tile_stride + k]); + dstk_val.x += dstk_val_add.x*KQ_crs; + dstk_val.y += dstk_val_add.y*KQ_crs; + } + + if (!needs_fixup && !is_fixup) { + const float KQ_rowsum_j = meta_j[1]; + dstk_val.x /= KQ_rowsum_j; + dstk_val.y /= KQ_rowsum_j; + } + + if (is_fixup) { + dstk_fixup_data[jc_dst*(DV/2) + k00 + k] = dstk_val; + } else { + dstk[((jt*ncols1 + j_dst)*ne02 + c_dst)*(DV/2) + k00 + k] = dstk_val; + } + } + } + } + } + if (np > 1) { + __syncthreads(); + } } #else GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2); GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup); GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1); - GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask); + GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop); NO_DEVICE_CODE; #endif // NEW_MMA_AVAILABLE } -template -__launch_bounds__(nwarps*WARP_SIZE, 2) +template +__launch_bounds__(nwarps*WARP_SIZE, 1) static __global__ void flash_attn_ext_f16( const char * __restrict__ Q, const char * __restrict__ K, @@ -857,24 +1053,27 @@ static __global__ void flash_attn_ext_f16( #if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) // Skip unused kernel variants for faster compilation: - if (use_logit_softcap && !(D == 128 || D == 256)) { + if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) { NO_DEVICE_CODE; return; } - static_assert(FATTN_KQ_STRIDE % KQ_per_iter == 0, "bad KQ_per_iter"); + typedef fattn_mma_f16_config c; + + static_assert(FATTN_KQ_STRIDE % fattn_mma_f16_config::nbatch_fa == 0, "bad nbatch_fa"); const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. const int stride_Q1 = nb01 / sizeof(float2); const int stride_Q2 = nb02 / sizeof(float2); - const int stride_KV = nb11 / sizeof(half2); + const int stride_K = nb11 / sizeof(half2); + const int stride_V = nb21 / sizeof(half2); const int stride_mask = nb31 / sizeof(half2); const int iter_k = ne11 / FATTN_KQ_STRIDE; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - constexpr int kb_niter = FATTN_KQ_STRIDE / KQ_per_iter; // Number of kernel iterations per assigned KQ slice. + constexpr int kb_niter = FATTN_KQ_STRIDE / c::nbatch_fa; // Number of kernel iterations per assigned KQ slice. // kbc == k block continuous, current index in continuous ijk space. int kbc = (blockIdx.x + 0)*iter_k*iter_j*(ne02/ncols2) / gridDim.x; @@ -893,9 +1092,9 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb12*(channel*ncols2 / gqa_ratio)); // K and V have same shape + const half2 * V_h2 = (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr; - float2 * dstk = ((float2 *) dst) + channel*(ncols2 * D/2); + float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); const float slope = ncols2 == 1 ? 
get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f; @@ -905,14 +1104,14 @@ static __global__ void flash_attn_ext_f16( constexpr bool is_fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. if (kb0_start == 0) { constexpr bool needs_fixup = false; // CUDA block is working on an entire tile. - flash_attn_ext_f16_process_tile + flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); + ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); } else { constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile. - flash_attn_ext_f16_process_tile + flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); + ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); } kbc += iter_k; @@ -931,9 +1130,9 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb12*(channel*ncols2 / gqa_ratio)); // K and V have same shape + const half2 * V_h2 = (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); // K and V have same shape const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr; - float2 * dstk = ((float2 *) dst) + channel*(ncols2 * D/2); + float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f; @@ -942,9 +1141,9 @@ static __global__ void flash_attn_ext_f16( constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. constexpr bool needs_fixup = false; - flash_attn_ext_f16_process_tile + flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); + ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); @@ -960,28 +1159,42 @@ static __global__ void flash_attn_ext_f16( #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) } -template +template void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - constexpr int ncols = ncols1 * ncols2; - constexpr int KQ_per_iter = D <= 128 && ncols1 <= 64 ? 64 : 32; - constexpr int nwarps = (KQ_per_iter == 32 && ncols <= 16) ? 2 : 4; - constexpr int ntiles = ncols <= 8 ? 1 : (ncols <= 64 ? 2 : 4); - constexpr int cols_per_warp = ntiles * tile_B::I; + const ggml_tensor * KQV = dst; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; - static_assert(D % tile_B::J == 0, "bad D"); + typedef fattn_mma_f16_config c; + + constexpr int nbatch_K2 = c::nbatch_K2 < 1 ? DKQ/2 : c::nbatch_K2; + constexpr int nbatch_V2 = c::nbatch_V2 < 1 ? 
DV /2 : c::nbatch_V2; + constexpr int nbatch_combine = c::nbatch_combine < 1 ? DV /2 : c::nbatch_combine; + + const int nstages = cp_async_available(cc) ? c::nstages_target : 0; + + constexpr int ncols = ncols1 * ncols2; + constexpr int ntiles = ncols <= 8 ? 1 : 2; // Number of tiles per warp. + constexpr int cols_per_warp = ntiles * tile_B::I; + constexpr int nwarps_max_x = ncols / cols_per_warp; + constexpr int nwarps_max_y = c::nbatch_fa / tile_A::I; + constexpr int nwarps = nwarps_max_x*nwarps_max_y <= c::nwarps_max ? nwarps_max_x*nwarps_max_y : c::nwarps_max; + + static_assert(DKQ % tile_B::J == 0, "bad DKQ"); + static_assert(DV % tile_A::J == 0, "bad DV"); static_assert(ncols % cols_per_warp == 0, "bad ncols"); - const ggml_tensor * KQV = dst; - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; + const size_t nbytes_shared_KV_1stage = c::nbatch_fa * std::max(c::nbatch_K2 + 4, c::nbatch_V2 + 4) * sizeof(half2); + const size_t nbytes_shared_KV_2stage = c::nbatch_fa * (c::nbatch_K2 + 4 + c::nbatch_V2 + 4) * sizeof(half2); + const size_t nbytes_shared_Q = ncols * (DKQ/2 + 4) * sizeof(half2); + const size_t nbytes_shared_mask = ncols1 * (c::nbatch_fa/2 + 4) * sizeof(half2); + const size_t nbytes_shared_combine = nwarps*cols_per_warp * (nbatch_combine + 4) * sizeof(half2); - const int KQ_shared_rows = cp_async_available(cc) ? 2*KQ_per_iter : KQ_per_iter; + const size_t nbytes_shared_KV = nstages <= 1 ? nbytes_shared_KV_1stage : nbytes_shared_KV_2stage; - const size_t nbytes_shared_KV = KQ_shared_rows * (D + 8) * sizeof(half); - const size_t nbytes_shared_mask = ncols1 * (KQ_per_iter + 8) * sizeof(half); - const size_t nbytes_shared_combine = nwarps*cols_per_warp * (D + 8) * sizeof(half); - - const size_t nbytes_shared_total = std::max(nbytes_shared_KV + nbytes_shared_mask, nbytes_shared_combine); + const size_t nbytes_shared_total = std::max(nbytes_shared_combine, c::Q_in_reg ? 
+ std::max(nbytes_shared_Q, nbytes_shared_KV + nbytes_shared_mask) : + nbytes_shared_Q + nbytes_shared_KV + nbytes_shared_mask); float logit_softcap; memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float)); @@ -989,59 +1202,73 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml fattn_kernel_t fattn_kernel; if (logit_softcap == 0.0f) { constexpr bool use_logit_softcap = false; - fattn_kernel = flash_attn_ext_f16; + fattn_kernel = flash_attn_ext_f16; + +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total)); + shared_memory_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) } else { constexpr bool use_logit_softcap = true; - fattn_kernel = flash_attn_ext_f16; + fattn_kernel = flash_attn_ext_f16; + +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total)); + shared_memory_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) } - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, FATTN_KQ_STRIDE, true, true, true); } -#define DECL_FATTN_MMA_F16_CASE(D, ncols1, ncols2) \ - template void ggml_cuda_flash_attn_ext_mma_f16_case \ - (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ +#define DECL_FATTN_MMA_F16_CASE(DKQ, DV, ncols1, ncols2) \ + template void ggml_cuda_flash_attn_ext_mma_f16_case \ + (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ -#define DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(D, ncols) \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/1, 1); \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/2, 2); \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \ +#define DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(DKQ, DV, ncols) \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 1, 1); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 2, 2); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 4, 4); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 8, 8); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/16, 16); \ -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 16) 
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64) -// Kernels with ncols == 128 are only 4% faster due to register pressure. -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory. 
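Editor's note: for readers unpicking the DECL_FATTN_MMA_F16_CASE machinery, each use declares an extern explicit template instantiation, so every (DKQ, DV, ncols1, ncols2) combination is compiled exactly once in its own translation unit rather than in every file that includes this header. Assuming the template parameter order shown in the macro definition above, the first Deepseek declaration below expands to roughly:

extern template void ggml_cuda_flash_attn_ext_mma_f16_case<576, 512, 1, 16>
    (ggml_backend_cuda_context & ctx, ggml_tensor * dst);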
+// The number of viable configurations for Deepseek is very limited: +extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16); +extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16); +extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu index e0039e175..9283560d5 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -307,7 +307,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false); } break; case 128: { @@ -315,7 +315,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false); } break; default: { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu index fcb6f848f..32673adb5 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -318,7 +318,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false); } break; case 128: { @@ -326,7 +326,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false); } break; default: { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh index e17d2d0e4..d96e39212 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16( for (int j = 0; j < ncols; ++j) { KQ[j*D + tid] = -HALF_MAX_HALF; } + __syncthreads(); half2 VKQ[ncols] = {{0.0f, 0.0f}}; @@ -315,7 +316,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, constexpr bool need_f16_K = D != 128; constexpr bool need_f16_V = D != 128 && D != 64; constexpr size_t nbytes_shared = 0; - launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); + launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); } template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh index d42ddca49..7064675d5 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, constexpr bool need_f16_K = D != 128; constexpr bool need_f16_V = D != 
128 && D != 64; constexpr size_t nbytes_shared = 0; - launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); + launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); } template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu index bc21b27a0..c5668adb1 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -490,7 +490,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm fattn_kernel = flash_attn_ext_f16< D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>; } - launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size); + launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size); } void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu index 7a2d1e453..9c5c803d0 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu @@ -8,58 +8,32 @@ #include "fattn-wmma-f16.cuh" #include "fattn.cuh" -template +template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * Q = dst->src[0]; - if (Q->ne[1] <= 8/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); - return; + if constexpr (ncols2 <= 8) { + if (Q->ne[1] <= 8/ncols2) { + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + return; + } } if (Q->ne[1] <= 16/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } if (Q->ne[1] <= 32/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); } -template -static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - - switch (Q->ne[0]) { - case 64: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 64, ncols2>(ctx, dst); - break; - case 80: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 80, ncols2>(ctx, dst); - break; - case 96: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 96, ncols2>(ctx, dst); - break; - case 112: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<112, ncols2>(ctx, dst); - break; - case 128: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst); - break; - case 256: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst); - break; - default: - GGML_ABORT("fatal error"); - break; - } -} - -static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +template +static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; @@ -68,27 +42,79 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg float max_bias = 0.0f; memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float)); - const float use_gqa_opt = mask && max_bias == 0.0f; + const bool use_gqa_opt = mask && max_bias 
== 0.0f; GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); const int gqa_ratio = Q->ne[2] / K->ne[2]; if (use_gqa_opt && gqa_ratio % 8 == 0) { - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - if (use_gqa_opt && gqa_ratio == 4) { - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<4>(ctx, dst); + if (use_gqa_opt && gqa_ratio % 4 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - if (use_gqa_opt && gqa_ratio == 2) { - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<2>(ctx, dst); + if (use_gqa_opt && gqa_ratio % 2 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); +} + +static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * KQV = dst; + const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; + const ggml_tensor * mask = dst->src[3]; + + switch (Q->ne[0]) { + case 64: + GGML_ASSERT(V->ne[0] == 64); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 64, 64>(ctx, dst); + break; + case 80: + GGML_ASSERT(V->ne[0] == 80); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 80, 80>(ctx, dst); + break; + case 96: + GGML_ASSERT(V->ne[0] == 96); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 96, 96>(ctx, dst); + break; + case 112: + GGML_ASSERT(V->ne[0] == 112); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<112, 112>(ctx, dst); + break; + case 128: + GGML_ASSERT(V->ne[0] == 128); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst); + break; + case 256: + GGML_ASSERT(V->ne[0] == 256); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst); + break; + case 576: { + // For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels. 
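Editor's note: the cascade of gqa_ratio checks above, together with the gqa_ratio % 16 assertion in the 576/512 Deepseek case that follows, amounts to choosing ncols2 (the number of K/V columns shared per Q tile) as the largest supported divisor of the GQA ratio. A condensed, hedged summary of that selection logic; the real code dispatches to a separate template instantiation per case rather than returning a runtime value:

// Sketch of the ncols2 selection implied by the dispatch above.
static int pick_ncols2(int gqa_ratio, bool use_gqa_opt, bool is_mla_576) {
    if (is_mla_576) {
        return 16;                             // gqa_ratio % 16 == 0 is asserted
    }
    if (use_gqa_opt && gqa_ratio % 8 == 0) return 8;
    if (use_gqa_opt && gqa_ratio % 4 == 0) return 4;
    if (use_gqa_opt && gqa_ratio % 2 == 0) return 2;
    return 1;
}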
+ GGML_ASSERT(V->ne[0] == 512); + float max_bias = 0.0f; + memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float)); + + const bool use_gqa_opt = mask && max_bias == 0.0f; + GGML_ASSERT(use_gqa_opt); + + GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); + const int gqa_ratio = Q->ne[2] / K->ne[2]; + GGML_ASSERT(gqa_ratio % 16 == 0); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst); + } break; + default: + GGML_ABORT("fatal error"); + break; + } } #define FATTN_VEC_F16_CASE(D, type_K, type_V) \ @@ -299,7 +325,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16; const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion; - const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0; + const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0; if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) { if (prec == GGML_PREC_DEFAULT) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu index 4cef53a98..963e4d03d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu @@ -10,10 +10,11 @@ static __global__ void k_get_rows( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. + const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2; + const int i10 = blockIdx.x; + const int i11 = blockIdx.z / ne12; + const int i12 = blockIdx.z % ne12; if (i00 >= ne00) { return; @@ -33,8 +34,8 @@ static __global__ void k_get_rows( dfloat2 v; dequantize_kernel(src0_row, ib, iqs, v); - dst_row[iybs + iqs + 0] = v.x; - dst_row[iybs + iqs + y_offset] = v.y; + dst_row[iybs + iqs + 0] = float(v.x); + dst_row[iybs + iqs + y_offset] = float(v.y); } template @@ -46,10 +47,11 @@ static __global__ void k_get_rows_float( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - const int i00 = blockIdx.x*blockDim.x + threadIdx.x; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. 
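// Illustrative sketch (not part of this patch): why the swap is needed. CUDA
// allows up to 2^31-1 blocks in gridDim.x but only 65535 in gridDim.y and
// gridDim.z, so the row count ne10, which can exceed 65535, must be mapped to
// the x dimension. A minimal sketch of the launch geometry for the float
// variant; the quantized variant is the same except each thread handles two
// elements per row. get_rows_grid_sketch is a hypothetical helper.
static dim3 get_rows_grid_sketch(const int64_t ne00, const int64_t ne10,
                                 const int64_t ne11, const int64_t ne12) {
    // columns are tiled over y, rows over x, batch dimensions over z
    const int64_t blocks_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
    return dim3((unsigned int) ne10, (unsigned int) blocks_y, (unsigned int) (ne11*ne12));
}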
+ const int i00 = blockIdx.y * blockDim.x + threadIdx.x; + const int i10 = blockIdx.x; + const int i11 = blockIdx.z / ne12; + const int i12 = blockIdx.z % ne12; if (i00 >= ne00) { return; @@ -60,7 +62,7 @@ static __global__ void k_get_rows_float( dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); - dst_row[i00] = src0_row[i00]; + dst_row[i00] = float(src0_row[i00]); } template @@ -86,120 +88,159 @@ static __global__ void k_get_rows_back_float( dst[dst_row*ncols + col] = sum; } -template -static void get_rows_cuda( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - +template +static void get_rows_cuda_q( + const void * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(block_num_x, ne10, ne11*ne12); + const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(ne10, block_num_y, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); GGML_ASSERT(ne00 % 2 == 0); k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); } -template +template static void get_rows_cuda_float( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne13 == 1); - + const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; - const dim3 block_nums(block_num_x, ne10, ne11*ne12); + const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const dim3 block_nums(ne10, 
block_num_y, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); +} - GGML_UNUSED(dst); +template +static void ggml_cuda_get_rows_switch_src0_type( + const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + switch (src0_type) { + case GGML_TYPE_F16: + get_rows_cuda_float((const half *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float((const float *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + // TODO: k-quants + GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type)); + break; + } +} + +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream) { + switch (dst_type) { + case GGML_TYPE_F32: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F16: + 
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type)); + break; + } } void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const void * src0_d = (const void *) src0->data; - const int32_t * src1_d = (const int32_t *) src1->data; - float * dst_d = (float *) dst->data; - cudaStream_t stream = ctx.stream(); + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ne13 == 1); GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - switch (src0->type) { - case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - default: - // TODO: k-quants - GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - break; - } + get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); } void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh index a1ca643f1..3c5bea5f4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh @@ -3,6 +3,13 @@ #define CUDA_GET_ROWS_BLOCK_SIZE 256 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256 +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream); + void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 0fef9522d..cb0d8528d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && 
ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) { // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + const size_t original_size = ggml_nbytes(tensor); + const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size) { ggml_cuda_set_device(ctx->device); @@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { + GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor)); size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } } @@ -802,6 +803,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; @@ -853,6 +855,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; @@ -891,6 +894,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; @@ -972,6 +976,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); size_t total_size = 0; @@ -1413,6 +1418,11 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + // const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; + const int64_t nb2 = dst->nb[2]; const int64_t nb3 = dst->nb[3]; @@ -1529,6 +1539,8 @@ static void ggml_cuda_op_mul_mat( // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared: if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { + 
GGML_ASSERT(ggml_is_contiguously_allocated(src0)); + GGML_ASSERT(!src0->view_src); const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream)); @@ -1548,7 +1560,10 @@ static void ggml_cuda_op_mul_mat( dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size); if (src1_on_device && src1_is_contiguous) { - quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream); + quantize_src1( + dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10, + nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float), + src1_padded_col_size, ne11, ne12, ne13, stream); CUDA_CHECK(cudaGetLastError()); } } @@ -1643,7 +1658,9 @@ static void ggml_cuda_op_mul_mat( } if (quantize_src1 && !src1_is_contiguous) { - quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream); + quantize_src1( + src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10, + src1_padded_col_size, src1_ncols, 1, 1, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1713,15 +1730,15 @@ static __global__ void k_compute_batched_ptrs( size_t nb12, size_t nb13, size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) { - int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; - int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; + const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { return; } - int64_t i03 = i13 / r3; - int64_t i02 = i12 / r2; + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; @@ -1735,6 +1752,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); + // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. + // As long as dst is contiguous this does not matter though. 
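// Illustrative sketch (not part of this patch): what the new contiguity assert
// below guarantees. For a contiguous tensor each byte stride is the running
// product of the element size and the preceding dimensions, so mixing element
// offsets and byte offsets for dst, as this function does, stays consistent.
// Rough sketch for non-quantized types only; the real ggml_is_contiguous also
// accounts for block-quantized layouts.
static bool is_contiguous_sketch(const int64_t ne[4], const size_t nb[4], const size_t type_size) {
    size_t expected = type_size;
    for (int i = 0; i < 4; ++i) {
        if (nb[i] != expected) {
            return false;
        }
        expected *= ne[i];
    }
    return true;
}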
+ GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_TENSOR_BINARY_OP_LOCALS const int64_t ne_dst = ggml_nelements(dst); @@ -1743,21 +1764,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - void * src0_ddq = src0->data; - half * src0_f16 = (half *) src0_ddq; - float * src1_ddf = (float *) src1->data; - float * dst_ddf = (float *) dst->data; + const half * src0_f16 = (const half *) src0->data; + float * dst_ddf = (float *) dst->data; + + const half * src1_f16 = (const half *) src1->data; + const size_t ts_src1 = ggml_type_size(src1->type); + GGML_ASSERT(nb10 == ts_src1); + int64_t s11 = nb11 / ts_src1; + int64_t s12 = nb12 / ts_src1; + int64_t s13 = nb13 / ts_src1; + ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); // convert src1 to fp16 - ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); if (src1->type != GGML_TYPE_F16) { - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type); const int64_t ne_src1 = ggml_nelements(src1); src1_f16_alloc.alloc(ne_src1); GGML_ASSERT(to_fp16_cuda != nullptr); - to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + + to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + + src1_f16 = src1_f16_alloc.get(); + s11 = ne10; + s12 = ne11*s11; + s13 = ne12*s12; } - half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get(); ggml_cuda_pool_alloc dst_f16(ctx.pool()); char * dst_t; @@ -1817,13 +1848,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co int i02 = i12 / r2; CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), - (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), - beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, - cu_compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half), + src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11, + beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } } @@ -1834,15 +1865,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co CUBLAS_CHECK( cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA - (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB - beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC + alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA + src1_f16, CUDA_R_16F, s11, s12, // strideB + beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC ne12*ne13, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx - const int ne23 = ne12*ne13; + const int64_t ne23 = ne12*ne13; ggml_cuda_pool_alloc ptrs_src(ctx.pool(), 2*ne23); ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); @@ -1854,8 +1885,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ne12, ne13, ne23, nb02, nb03, - src1->type == GGML_TYPE_F16 ? 
nb12 : nb12/2, - src1->type == GGML_TYPE_F16 ? nb13 : nb13/2, + src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half), + src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half), nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); @@ -1864,8 +1895,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, - (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10, - beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, + (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11, + beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, ne23, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1881,13 +1912,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); - bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) + // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. + // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data. + // Therefore, in such cases use cuBLAS. + const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE + && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; + + bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; - bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) + bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; - bool use_mul_mat_q = ggml_is_quantized(src0->type) + bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; bool any_gpus_with_slow_fp16 = false; @@ -1922,12 +1959,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) - ggml_cuda_mul_mat_vec(ctx, src0, src1, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) - && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); + } else if (!split && 
use_mul_mat_vec_q) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_q) { + ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); + } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && + !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec) { @@ -1941,196 +1982,147 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } } -struct mmid_row_mapping { - int32_t i1; - int32_t i2; -}; - -static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous, - int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping, - const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0, - int64_t ne11, int64_t ne10, - size_t nb11, size_t nb12) { - int32_t iid1 = blockIdx.x; - int32_t id = blockIdx.y; - - const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0); - - if (row_id_i != i02) { - return; - } - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - __shared__ int src1_row; - if (threadIdx.x == 0) { - src1_row = atomicAdd(cur_src1_row, 1); - row_mapping[src1_row] = {id, iid1}; - } - __syncthreads(); - - const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12); - float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11); - - for (int i = threadIdx.x; i < ne10; i += blockDim.x) { - src1_row_contiguous[i] = src1_row_original[i]; - } -} - -static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous, - const mmid_row_mapping * __restrict__ row_mapping, - int64_t ne0, - size_t nb1, size_t nb2) { - int32_t i = blockIdx.x; - - const int32_t i1 = row_mapping[i].i1; - const int32_t i2 = row_mapping[i].i2; - - const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1); - float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2); - - for (int j = threadIdx.x; j < ne0; j += blockDim.x) { - dst_row_original[j] = dst_row_contiguous[j]; - } -} - static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); + GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (ne2 == 1) { + if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + } else { + ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst); + } + return; + } + + if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) { + ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); + return; + } + } cudaStream_t stream = ctx.stream(); - const int64_t n_as = ne02; - const int64_t n_ids = ids->ne[0]; + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const ggml_type type_src1_sorted = (src0->type == 
GGML_TYPE_F16 && !fast_fp16_hardware_available(cc)) + || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type; + const ggml_type type_dst_sorted = GGML_TYPE_F32; + const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted); + const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_to_sorted_host; + ids_to_sorted_host.reserve(2*ne_get_rows); + std::vector ids_from_sorted_host(ne_get_rows); + + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool(), 2*ne_get_rows); + + std::vector tokens_per_expert(ne02); + + ggml_cuda_pool_alloc src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted); + ggml_cuda_pool_alloc dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted); std::vector ids_host(ggml_nbytes(ids)); - const char * ids_dev = (const char *) ids->data; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); - ggml_tensor src0_row = *src0; - ggml_tensor src1_row = *src1; - ggml_tensor dst_row = *dst; - - char * src0_original = (char *) src0->data; - char * src1_original = (char *) src1->data; - char * dst_original = (char *) dst->data; - - src0_row.ne[2] = 1; - src0_row.ne[3] = 1; - src0_row.nb[3] = nb02; - - src1_row.ne[1] = 1; - src1_row.ne[2] = 1; - src1_row.ne[3] = 1; - src1_row.nb[2] = nb11; - src1_row.nb[3] = nb11; - - dst_row.ne[1] = 1; - dst_row.ne[2] = 1; - dst_row.ne[3] = 1; - dst_row.nb[2] = nb1; - dst_row.nb[3] = nb1; - - if (ne12 == 1) { - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(i02 >= 0 && i02 < n_as); - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - const int64_t i1 = id; - const int64_t i2 = i12; - - src0_row.data = src0_original + i02*nb02; - src1_row.data = src1_original + i11*nb11 + i12*nb12; - dst_row.data = dst_original + i1*nb1 + i2*nb2; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - } - } - } else { - ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); - ggml_cuda_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); - - src1_row.data = src1_contiguous.get(); - dst_row.data = dst_contiguous.get(); - - for (int64_t i02 = 0; i02 < n_as; i02++) { - int64_t num_src1_rows = 0; - - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); - - if (row_id_i != i02) { - continue; - } - - num_src1_rows++; + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size(); + ids_to_sorted_host.push_back(i12*ne11 + iex % ne11); + tokens_per_expert[i02]++; + break; } } - - if (num_src1_rows == 0) { - continue; - } - - ggml_cuda_pool_alloc 
dev_cur_src1_row(ctx.pool(), 1); - ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); - CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); - - { - dim3 block_dims(std::min((unsigned int)ne10, 768u)); - dim3 grid_dims(ids->ne[1], n_ids); - k_copy_src1_to_contiguous<<>>( - src1_original, src1_contiguous.get(), - dev_cur_src1_row.get(), dev_row_mapping.get(), - ids_dev, i02, ids->nb[1], ids->nb[0], - ne11, ne10, - nb11, nb12); - CUDA_CHECK(cudaGetLastError()); - } - - src0_row.data = src0_original + i02*nb02; - - GGML_ASSERT(nb11 == sizeof(float)*ne10); - GGML_ASSERT(nb1 == sizeof(float)*ne0); - - src1_row.ne[1] = num_src1_rows; - src1_row.nb[1] = nb11; - src1_row.nb[2] = num_src1_rows*nb11; - src1_row.nb[3] = num_src1_rows*nb11; - - dst_row.ne[1] = num_src1_rows; - dst_row.nb[1] = nb1; - dst_row.nb[2] = num_src1_rows*nb1; - dst_row.nb[3] = num_src1_rows*nb1; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - - { - dim3 block_dims(std::min((unsigned int)ne0, 768u)); - dim3 grid_dims(num_src1_rows); - k_copy_dst_from_contiguous<<>>( - dst_original, dst_contiguous.get(), - dev_row_mapping.get(), - ne0, - nb1, nb2); - CUDA_CHECK(cudaGetLastError()); - } } } + GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows)); + + ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end()); + + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows; + const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows; + + get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, + ne10, nb11, nb12, nb13, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream); + CUDA_CHECK(cudaGetLastError()); + + char * src1_data_cur = (char *) src1_sorted.ptr; + char * dst_data_cur = (char *) dst_sorted.ptr; + for (int64_t i02 = 0; i02 < ne02; ++i02) { + if (tokens_per_expert[i02] == 0) { + continue; + } + + ggml_tensor src0_slice = *src0; + src0_slice.ne[2] = 1; + src0_slice.nb[3] = src0_slice.nb[2]; + src0_slice.op = GGML_OP_VIEW; + src0_slice.view_src = dst->src[0]; // non-const pointer to src0 + src0_slice.data = (char *) src0->data + i02*nb02; + + ggml_tensor src1_slice; + memset(&src1_slice, 0, sizeof(src1_slice)); + src1_slice.buffer = src1->buffer; + src1_slice.type = type_src1_sorted; + src1_slice.ne[0] = ne10; + src1_slice.ne[1] = tokens_per_expert[i02]; + src1_slice.ne[2] = 1; + src1_slice.ne[3] = 1; + src1_slice.nb[0] = ts_src1_sorted; + src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0]; + src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1]; + src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2]; + src1_slice.data = src1_data_cur; + + ggml_tensor dst_slice; + memset(&dst_slice, 0, sizeof(dst_slice)); + dst_slice.buffer = dst->buffer; + dst_slice.type = type_dst_sorted; + dst_slice.ne[0] = ne0; + dst_slice.ne[1] = tokens_per_expert[i02]; + dst_slice.ne[2] = 1; + dst_slice.ne[3] = 1; + dst_slice.nb[0] = ts_dst_sorted; + dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0]; + dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1]; + dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; + dst_slice.data = dst_data_cur; + + ggml_cuda_mul_mat(ctx, 
&src0_slice, &src1_slice, &dst_slice); + CUDA_CHECK(cudaGetLastError()); + + src1_data_cur += src1_slice.nb[2]; + dst_data_cur += dst_slice.nb[2]; + } + + get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type, + ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + nb1, nb2, nb3, stream); } static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { @@ -2246,9 +2238,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; - case GGML_OP_UNPAD: - ggml_cuda_op_unpad(ctx, dst); - break; case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; @@ -2495,7 +2484,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_MUL_MAT_ID) { + if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); @@ -3209,9 +3198,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - const size_t ts = ggml_type_size(op->src[0]->type); - const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2]; - return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts; + return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } case GGML_OP_IM2COL: case GGML_OP_POOL_2D: @@ -3225,7 +3212,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: @@ -3238,16 +3224,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; #endif // FLASH_ATTN_AVAILABLE if (op->src[1]->ne[0] != op->src[2]->ne[0]) { - // different head sizes of K and V are not supported yet - return false; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) { + return false; + } + const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2]; + return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0; } if (op->src[0]->ne[0] == 192) { return false; } - if (op->src[0]->ne[0] == 576) { - // DeepSeek MLA - return false; - } if (op->src[0]->ne[3] != 1) { return false; } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu index b36b43d54..e1cf843de 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu @@ -1,37 +1,10 @@ #include "mmq.cuh" +#include "quantize.cuh" -void ggml_cuda_op_mul_mat_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { +#include - const int64_t ne00 = src0->ne[0]; - - const int64_t ne10 = src1->ne[0]; - const 
int64_t ne11 = src1->ne[1]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - const int64_t row_diff = row_high - row_low; - const int64_t stride00 = ne00 / ggml_blck_size(src0->type); - - int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - - // The stream-k decomposition is only faster for recent NVIDIA GPUs. - // Also its fixup needs to allocate a temporary buffer in the memory pool. - // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && - ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; - - switch (src0->type) { +static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { + switch (args.type_x) { case GGML_TYPE_Q4_0: mul_mat_q_case(ctx, args, stream); break; @@ -90,10 +63,206 @@ void ggml_cuda_op_mul_mat_q( GGML_ABORT("fatal error"); break; } +} + +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. + + GGML_TENSOR_BINARY_OP_LOCALS; + + cudaStream_t stream = ctx.stream(); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); + + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + + const char * src0_d = (const char *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + // If src0 is a temporary compute buffer, clear any potential padding. 
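// Illustrative sketch (not part of this patch): where the padding cleared below
// comes from. The CUDA buffer type rounds each quantized row up to
// MATRIX_ROW_PADDING elements (see ggml_backend_cuda_buffer_type_get_alloc_size
// earlier in this diff), so a compute buffer may be larger than
// ggml_nbytes(tensor). padded_alloc_size_sketch is a hypothetical helper.
static size_t padded_alloc_size_sketch(const size_t nbytes, const ggml_type type, const int64_t ne0) {
    size_t size = nbytes;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        size += ggml_row_size(type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }
    return size; // the tail beyond nbytes is what the memset below zeroes
}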
+ if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { + const size_t size_data = ggml_nbytes(src0); + const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0); + if (size_alloc > size_data) { + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); + GGML_ASSERT(!src0->view_src); + CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream)); + } + } + + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; + + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; + + if (!ids) { + const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, + ne00, ne01, ne1, s01, ne11, s1, + ne02, ne12, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); + return; + } + + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_host(ggml_nbytes(ids)); + std::vector ids_src1_host; + ids_src1_host.reserve(ne_get_rows); + std::vector ids_dst_host; + ids_dst_host.reserve(ne_get_rows); + std::vector tokens_per_expert_host(ne02); + std::vector expert_bounds_host(ne02 + 1); + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool()); + + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11); + ids_dst_host.push_back(i12*ne1 + iex); + tokens_per_expert_host[i02]++; + break; + } + } + } + } + + int32_t cumsum = 0; + for (int64_t i = 0; i < ne02; ++i) { + expert_bounds_host[i] = cumsum; + cumsum += tokens_per_expert_host[i]; + } + expert_bounds_host[ne02] = cumsum; + + std::vector ids_buf_host; + ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size()); + ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end()); + ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end()); + ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end()); + 
ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device. + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_src1_dev = ids_buf_dev.ptr; + const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size(); + const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size(); + + const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + const int64_t ne11_flat = ne12*n_expert_used; + const int64_t ne12_flat = 1; + const int64_t ne13_flat = 1; + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[2] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. + const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, + ne00, ne01, ne_get_rows, s01, ne_get_rows, s1, + ne02, ne02, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); +} + +void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + const int64_t stride01 = ne00 / ggml_blck_size(src0->type); + + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. 
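// Illustrative sketch (not part of this patch): the stream-k gating condition
// spelled out as a hypothetical helper; it mirrors the use_stream_k expression
// directly below.
static bool use_stream_k_sketch(const int cc, const int64_t src1_ncols, const int64_t ne11) {
    return GGML_CUDA_CC_IS_NVIDIA(cc)                                 // only faster on recent NVIDIA GPUs
        && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA  // Volta or newer compiled in
        && src1_ncols == ne11;                                        // single stream, so the fixup buffer is race-free
}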
+ const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && + ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; + const mmq_args args = { + src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, + ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst, + 1, 1, 0, 0, 0, + 1, 1, 0, 0, 0, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_ddf_i); + GGML_UNUSED(src1_padded_row_size); } bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh index 532358018..80baf459c 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh @@ -13,9 +13,10 @@ using namespace ggml_cuda_mma; #define MMQ_ITER_K 256 #define MMQ_NWARPS 8 -typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride); -typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00); -typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max); +typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); +typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00); +typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted, + float * __restrict__ dst, const int stride, const int i_max, const int j_max); enum mmq_q8_1_ds_layout { MMQ_Q8_1_DS_LAYOUT_D4, @@ -155,25 +156,27 @@ static constexpr __device__ int get_mmq_y_device() { #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8} static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) { - return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 : - type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 : - type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K : - type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K : - type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_NL ? 
MMQ_DP4A_TXS_Q8_0 : - tile_x_sizes{0, 0, 0}; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0; + case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1; + case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K; + case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K; + case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; + default: return tile_x_sizes{0, 0, 0}; + } } #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) @@ -189,25 +192,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding."); static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_NL ? 
MMQ_MMA_TILE_X_K_Q8_0 : - 0; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; + default: return 0; + } } #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) @@ -229,7 +234,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */ // ------------------------------------------------------------ template static __device__ __forceinline__ void load_tiles_q4_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -285,7 +290,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); const int * x_qs = (const int *) x; @@ -324,7 +329,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q4_1( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -380,7 +385,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); const int * x_qs = (const int *) x; @@ -419,7 +424,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -491,7 +496,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q5_1( - const char * __restrict__ x, int * __restrict__ 
x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -561,7 +566,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q8_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -617,7 +622,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y); const int * x_qs = (const int *) x; @@ -647,7 +652,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -728,7 +733,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); const int * x_qs = (const int *) x; @@ -758,7 +763,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -835,7 +840,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; const int * x_qs = (const int *) x; @@ -867,7 +872,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -951,7 +956,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q2_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + 
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1007,7 +1012,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y); const int * x_qs = (const int *) x; @@ -1070,7 +1075,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1197,7 +1202,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q3_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1294,7 +1299,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y); const int * x_qs = (const int *) x; @@ -1336,7 +1341,7 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co } template static __device__ __forceinline__ void load_tiles_q4_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1433,7 +1438,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); const int * x_qs = (const int *) x; @@ -1465,7 +1470,7 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1574,7 +1579,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, 
float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); const int * x_qs = (const int *) x; @@ -1606,7 +1611,7 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q6_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1689,7 +1694,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y); const int * x_qs = (const int *) x; @@ -1722,7 +1727,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1831,7 +1836,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_iq4_nl( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1889,7 +1894,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1947,7 +1952,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2003,7 +2008,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2066,7 +2071,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ 
-2122,7 +2127,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2185,7 +2190,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq1_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2241,7 +2246,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq4_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2302,8 +2307,8 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void mmq_write_back_dp4a( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -2320,15 +2325,15 @@ static __device__ __forceinline__ void mmq_write_back_dp4a( continue; } - dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } template static __device__ __forceinline__ void mmq_write_back_mma( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { typedef tile<16, 8, int> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); @@ -2358,7 +2363,7 @@ static __device__ __forceinline__ void mmq_write_back_mma( continue; } - dst[j*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; + dst[ids_dst[j]*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; } } } @@ -2514,17 +2519,18 @@ struct mmq_type_traits { }; template -static __device__ void mul_mat_q_process_tile( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0, - const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) { +static __device__ __forceinline__ void mul_mat_q_process_tile( + const char * __restrict__ x, const int offset_x, const int * __restrict__ y, + const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int stride_row_x, const int ncols_y, const int stride_col_dst, + const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { constexpr int qk = ggml_cuda_type_traits::qk; 
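The write-back hunks above introduce an ids_dst indirection: results for output column j now land at dst[ids_dst[j]*stride + i] rather than dst[j*stride + i]. For a regular matrix multiplication the table is the identity (the kernel fills ids_dst_shared[j] = j further down), so nothing moves; for MUL_MAT_ID it scatters each output column to the destination row chosen by the router. The same hunks also switch scalar parameters from const int & to by-value const int, which plausibly spares the compiler from having to materialize the referenced integers in memory. A minimal host-side sketch of the indirection, with a flat sum layout and a hypothetical helper name (write_back_indirect is not part of the patch):

    #include <cstdio>

    // Stand-in for the write-back helpers above: output column j lands at
    // dst[ids_dst[j]*stride + i]. The real kernels keep sums in per-thread
    // registers; a flat array is used here for clarity.
    static void write_back_indirect(const float * sum, const int * ids_dst, float * dst,
                                    int stride, int rows, int cols) {
        for (int j = 0; j < cols; ++j) {
            for (int i = 0; i < rows; ++i) {
                dst[ids_dst[j]*stride + i] = sum[j*rows + i];
            }
        }
    }

    int main() {
        const int rows = 2, cols = 3, stride = 2;
        const float sum[rows*cols] = {1, 2, 3, 4, 5, 6};
        float dst[stride*cols] = {};
        const int routed[cols] = {2, 0, 1}; // MUL_MAT_ID: rows picked by the router
        write_back_indirect(sum, routed, dst, stride, rows, cols);
        for (int r = 0; r < cols; ++r) {
            printf("dst row %d: %.0f %.0f\n", r, dst[r*stride + 0], dst[r*stride + 1]);
        }
        return 0; // prints 3 4 / 5 6 / 1 2: column 0 was scattered to row 2
    }
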
constexpr int mmq_y = get_mmq_y_device(); constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; - extern __shared__ char data_mul_mat_q[]; - int * tile_y = (int *) data_mul_mat_q; + extern __shared__ int data_mul_mat_q[]; + int * tile_y = data_mul_mat_q + mmq_x; int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE); #ifdef NEW_MMA_AVAILABLE @@ -2539,16 +2545,11 @@ static __device__ void mul_mat_q_process_tile( float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int tile_x_max_i = ne01 - it*mmq_y - 1; - const int tile_y_max_j = ne11 - jt*mmq_x - 1; - - const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); - for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { - load_tiles(x, tile_x, stride01*it*mmq_y + kb0, tile_x_max_i, stride01); + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2564,7 +2565,7 @@ static __device__ void mul_mat_q_process_tile( __syncthreads(); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2581,12 +2582,10 @@ static __device__ void mul_mat_q_process_tile( } if (fixup) { - write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); + write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); } else { - write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j); + write_back(sum, ids_dst, dst, stride_col_dst, tile_x_max_i, tile_y_max_j); } - - GGML_UNUSED(ne00); GGML_UNUSED(ne10); } @@ -2605,8 +2604,11 @@ template #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { + const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst, + const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst, + const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { @@ -2617,26 +2619,88 @@ static __global__ void mul_mat_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = 
get_mmq_y_device(); + const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x + const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y + + // Initialize the ids for writing back data with just the index. + // For regular matrix multiplications this is never changed. + // For MoE the correct indices are loaded from ids_dst. + extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory. +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + __syncthreads(); + // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { + const int wt = blockIdx.z / nchannels_y; + const int zt = blockIdx.z - wt*nchannels_y; + const int jt = blockIdx.y; + const int it = blockIdx.x; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_dst; + int col_diff = ncols_dst; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + + // __syncthreads(); // There is no previous tile that could cause a race condition. +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + __syncthreads(); + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; + constexpr bool fixup = false; mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - blockIdx.x, blockIdx.y, 0, ne00/qk); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, + tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); return; } #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x - const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y - // kbc == k block continuous, current index in continuous ijk space. 
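The stream-k schedule that follows flattens (tile row, sample, channel, tile column, k-block) into a single linear index so that the gridDim.x CUDA blocks each get an equal contiguous slice of work regardless of how the problem divides into tiles; the old code flattened only (tile column, tile row, k-block). A host-side decode that mirrors the kernel's division chain, under made-up sizes:

    #include <cstdio>

    struct TilePos { int it, wt, zt, jt; long long kb0; };

    // Decode a flattened stream-k work index the way the kernel does:
    // tile row (it) outermost, then sample (wt), channel (zt), tile
    // column (jt), and the k-block within the row innermost.
    static TilePos decode_kbc(long long kbc, long long nsamples_y, long long nchannels_y,
                              long long ntx, long long blocks_per_ne00) {
        TilePos p;
        long long tmp = kbc;
        p.it = (int)(tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00));
        tmp -= p.it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
        p.wt = (int)(tmp / (nchannels_y*ntx*blocks_per_ne00));
        tmp -= p.wt * (nchannels_y*ntx*blocks_per_ne00);
        p.zt = (int)(tmp / (ntx*blocks_per_ne00));
        tmp -= p.zt * (ntx*blocks_per_ne00);
        p.jt = (int)(tmp / blocks_per_ne00);
        p.kb0 = tmp % blocks_per_ne00;
        return p;
    }

    int main() {
        // 2 samples, 2 channels, 3x2 tiles, 4 k-blocks per row: 96 work items.
        const TilePos p = decode_kbc(57, 2, 2, 3, 4);
        printf("it=%d wt=%d zt=%d jt=%d kb0=%lld\n", p.it, p.wt, p.zt, p.jt, p.kb0);
        return 0; // it=1 wt=0 zt=0 jt=2 kb0=1
    }
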
- int64_t kbc = (int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x; - int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x; + int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter; @@ -2645,13 +2709,66 @@ static __global__ void mul_mat_q( int kb0_start = kbc % blocks_per_ne00; int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) { - const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile. - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile. + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_dst; + int col_diff = ncols_dst; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + kbc += blocks_per_ne00; + kbc -= kbc % blocks_per_ne00; + + kb0_start = 0; + kb0_stop = min(blocks_per_ne00, kbc_stop - kbc); + + continue; + } + + __syncthreads(); +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + __syncthreads(); + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. 
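In the MoE path, expert_bounds appears to be a prefix table over tokens sorted by expert: expert zt owns destination columns [expert_bounds[zt], expert_bounds[zt + 1]), and a tile with jt*mmq_x >= col_diff bails out early because that expert received fewer tokens than the tile grid covers. A worked example under that assumption:

    #include <cstdio>

    int main() {
        // Tokens sorted by expert; expert_bounds is the prefix sum of counts.
        // Expert 0 owns columns [0,2), expert 1 owns none, expert 2 owns [2,5).
        const int expert_bounds[] = {0, 2, 2, 5};
        const int nexperts = 3, mmq_x = 2;
        for (int zt = 0; zt < nexperts; ++zt) {
            const int col_low  = expert_bounds[zt];
            const int col_diff = expert_bounds[zt + 1] - col_low;
            const int ntiles   = (col_diff + mmq_x - 1) / mmq_x;
            printf("expert %d: cols [%d, %d), %d tile(s) of width %d\n",
                   zt, col_low, col_low + col_diff, ntiles, mmq_x);
        }
        return 0;
    }
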
mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); kbc += blocks_per_ne00; kbc -= kbc % blocks_per_ne00; @@ -2664,55 +2781,108 @@ static __global__ void mul_mat_q( return; } - const int jt = kbc / (blocks_per_ne00*nty); - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_dst; + int col_diff = ncols_dst; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + + // The memory layout for the fixup buffer is always contiguous, therefore reset ids: + __syncthreads(); +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + __syncthreads(); + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. 
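The fixup buffer exists because stream-k can split one output tile's k-range across two or more CUDA blocks: the block that completes the tile writes dst, any block whose slice ends mid-tile parks its partial sums in tmp_fixup, and a second kernel folds the parked partials back in. The idea in miniature, on the host; worker and buffer names are illustrative, not the kernel's:

    #include <cstdio>

    int main() {
        const float x[4] = {1, 2, 3, 4};
        const float y[4] = {5, 6, 7, 8};
        float dst = 0.0f, tmp_fixup = 0.0f;
        for (int k = 2; k < 4; ++k) dst       += x[k]*y[k]; // worker finishing the tile writes dst
        for (int k = 0; k < 2; ++k) tmp_fixup += x[k]*y[k]; // worker ending mid-tile parks its partial
        dst += tmp_fixup;                                   // fixup pass combines the partials
        printf("%.1f\n", dst);                              // 70.0 = 1*5 + 2*6 + 3*7 + 4*8
        return 0;
    }
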
mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } template static __global__ void mul_mat_q_stream_k_fixup( - float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) { - + const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, + const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst, + const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; - const int nty = (ne01 + mmq_y - 1) / mmq_y; + const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; + const int nty = (nrows_x + mmq_y - 1) / mmq_y; + + const int bidx0 = blockIdx.x; + + // kbc == k block continuous, current index in continuous ijk space. + int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + + kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter; + kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter; + + const bool did_not_have_any_data = kbc0 == kbc0_stop; + const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0; + const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0; + if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { + return; + } bool any_fixup = false; - const int bidx_start = ((blockIdx.y*nty + blockIdx.x) * block_num_mmq) / (gridDim.y*gridDim.x); - const int bidx_stop = ((blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq + gridDim.y*gridDim.x - 1) / (gridDim.y*gridDim.x); + // Iterate over previous blocks and sum up partial sums written to fixup buffer. + // All CUDA blocks that get here must have a previous block that needs a fixup. 
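The old fixup kernel scanned all MMQ blocks forward and matched tiles against blockIdx; the rewrite below exploits the fact that each block's slice start is a pure function of its index. A block recomputes its own kbc0/kbc0_stop, returns early when no fixup applies, and otherwise walks bidx downward, re-deriving each predecessor's slice with the same formula and accumulating tmp_last_tile until it reaches a block that started in an earlier tile. The proportional split that makes this possible, in isolation:

    #include <cstdio>

    // Block b owns [start(b), start(b+1)). Adjacent slices share their
    // boundary exactly, so any predecessor's slice can be re-derived from
    // the block index alone, with no extra bookkeeping.
    static long long slice_start(long long bidx, long long work_items, long long nblocks) {
        return bidx * work_items / nblocks; // same proportional split as the kernel
    }

    int main() {
        const long long work_items = 10, nblocks = 4;
        for (long long b = 0; b < nblocks; ++b) {
            printf("block %lld owns [%lld, %lld)\n", b,
                   slice_start(b, work_items, nblocks),
                   slice_start(b + 1, work_items, nblocks));
        }
        return 0; // [0,2) [2,5) [5,7) [7,10)
    }
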
+ int64_t bidx = bidx0 - 1; + int64_t kbc_stop = kbc0; + while(true) { + int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; - int64_t kbc_0; - int64_t kbc_stop_0 = (int64_t) bidx_start*blocks_per_ne00*ntx*nty / block_num_mmq; - - for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) { - kbc_0 = kbc_stop_0; - kbc_stop_0 = (int64_t) (bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq; - - const int64_t kbc = kbc_0 - (kbc_0 % blocks_per_ne00) % blocks_per_iter; - const int64_t kbc_stop = kbc_stop_0 - (kbc_stop_0 % blocks_per_ne00) % blocks_per_iter; - - // Skip fixup tile if the MMQ CUDA block never wrote anything to it: - if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) { - continue; - } - - const int jt = kbc_stop / (blocks_per_ne00*nty); - const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; - - // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block: - if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) { + if (kbc == kbc_stop) { // Did not have any data. + bidx--; + kbc_stop = kbc; continue; } @@ -2729,16 +2899,72 @@ static __global__ void mul_mat_q_stream_k_fixup( sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i]; } } + + // If this block started in a previous tile we are done and don't need to combine additional partial results. + if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) { + break; + } + bidx--; + kbc_stop = kbc; } if (!any_fixup) { return; } - dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y; + int tmp = kbc0; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; - const int i_max = ne01 - blockIdx.x*mmq_y - 1; - const int j_max = ne11 - blockIdx.y*mmq_x - 1; + if (!ids_dst) { + const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = ncols_dst - jt*mmq_x - 1; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j > j_max) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + if (need_check && i > i_max) { + continue; + } + + dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + } + } + return; + } + + __shared__ int ids_dst_shared[mmq_x]; + const int col_low = expert_bounds[zt + 0]; + const int col_high = expert_bounds[zt + 1]; + const int col_diff = col_high - col_low; + + for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { + ids_dst_shared[j] = ids_dst[col_low + j]; + } + __syncthreads(); + + const int offset_dst = it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = col_diff - jt*mmq_x - 1; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -2756,26 +2982,27 @@ static __global__ void mul_mat_q_stream_k_fixup( continue; } - dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + 
i0/WARP_SIZE]; } } } struct mmq_args { - const char * x; const char * y; float * dst; - int64_t ne00; int64_t ne01; int64_t stride01; - int64_t ne10; int64_t ne11; int64_t stride11; - int64_t ne0; + const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst; + int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst; + int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst; + int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst; bool use_stream_k; }; template -static int mmq_get_shmem(const int mmq_x, const int mmq_y, const int cc) { +static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) { const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y); const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type); - const int shmem_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); - const int shmem_y = mmq_x*sizeof(block_q8_1_mmq); - return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); + const size_t nbs_ids = mmq_x*sizeof(int); + const size_t nbs_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); + const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq); + return nbs_ids + nbs_x + GGML_PAD(nbs_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); } template @@ -2787,86 +3014,114 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1); - const int shmem = mmq_get_shmem(mmq_x, mmq_y, cc); + const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc); #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shmem_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - shmem_limit_raised[id] = true; + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + shared_memory_limit_raised[id] = true; } #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - const int nty = (args.ne01 + mmq_y - 1) / mmq_y; - const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; - const dim3 block_nums_xy_tiling(nty, ntx, 1); + const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; + const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x; + const int ntzw = args.nchannels_y * args.nsamples_y; + const dim3 block_nums_xy_tiling(nty, ntx, ntzw); + + GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0); + GGML_ASSERT(args.nsamples_y % args.nsamples_x == 0); + const int channel_ratio = args.nchannels_y / args.nchannels_x; + const int sample_ratio = args.nsamples_y / args.nsamples_x; if (!args.use_stream_k) { - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, 
args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } return; } - const dim3 block_nums_mmq(nsm, 1, 1); + const dim3 block_nums_stream_k(nsm, 1, 1); + const bool fixup_needed = ntx*nty*ntzw % nsm != 0; ggml_cuda_pool & pool = ctx.pool(id); - ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); + ggml_cuda_pool_alloc tmp_fixup(pool); + if (fixup_needed) { + tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y); + } - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + if (!fixup_needed) { + return; + } + + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + if (!fixup_needed) { + return; + } + + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } } template void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t 
stream) { - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int smpbo = ggml_cuda_info().devices[id].smpbo; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); - const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; int mmq_x_best = 0; - int nparts_best = INT_MAX; + int ntiles_x_best = INT_MAX; - for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) { + for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) { const int granularity = mmq_get_granularity_host(mmq_x, cc); - if (mmq_x % granularity != 0 || mmq_get_shmem(mmq_x, mmq_y, cc) > smpbo) { + if (mmq_x % granularity != 0 || mmq_get_nbytes_shared(mmq_x, mmq_y, cc) > smpbo) { continue; } - const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x; - const int nwaves_xy_tiling = ntiles_x*block_num_y; - const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling; + const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x; - if (nparts < nparts_best) { - mmq_x_best = mmq_x; - nparts_best = nparts; + if (ntiles_x < ntiles_x_best) { + mmq_x_best = mmq_x; + ntiles_x_best = ntiles_x; } } @@ -2950,6 +3205,9 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); // ------------------------------------------------------------------------------------------------------------------------- +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu index b39961cd1..d8c385e23 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu @@ -4,18 +4,23 @@ template static __global__ void mul_mat_vec( - const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel = blockIdx.y; - const int64_t sample = blockIdx.z; - const int tid = threadIdx.x; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + const int64_t row = blockIdx.x; + const int64_t channel_dst = blockIdx.y; + const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int64_t channel_y = ids ? 
channel_dst % nchannels_y : channel_dst; + const int64_t sample_dst = blockIdx.z; + const int64_t sample_x = sample_dst / sample_ratio; + const int64_t sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - x += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row; - y += sample *stride_sample_y + channel *stride_channel_y; - dst += sample *stride_sample_dst + channel *stride_channel_dst; + x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += sample_y *stride_sample_y + channel_y *stride_channel_y; + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -31,12 +36,19 @@ static __global__ void mul_mat_vec( float sumf = 0.0f; - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { + const float2 * x2 = (const float2 *) x; + + for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + const float2 tmpx = x2[col2]; + const float2 tmpy = y2[col2]; + sumf += tmpx.x*tmpy.x; + sumf += tmpx.y*tmpy.y; + } + } else if constexpr (std::is_same::value) { const half2 * x2 = (const half2 *) x; if (std::is_same::value) { - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); const float2 tmpy = y2[col2]; @@ -59,8 +71,6 @@ static __global__ void mul_mat_vec( } } else if constexpr (std::is_same::value) { const int * x2 = (const int *) x; - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const int tmpx = x2[col2]; const float2 tmpy = y2[col2]; @@ -92,17 +102,17 @@ static __global__ void mul_mat_vec( template static void launch_mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); - GGML_ASSERT(nchannels_y % nchannels_x == 0); - GGML_ASSERT(nsamples_y % nsamples_x == 0); - const int64_t channel_ratio = nchannels_y / nchannels_x; - const int64_t sample_ratio = nsamples_y / nsamples_x; + GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); + GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const int64_t channel_ratio = nchannels_dst / nchannels_x; + const int64_t sample_ratio = nsamples_dst / nsamples_x; int device; int warp_size; @@ -124,48 +134,48 @@ static void launch_mul_mat_vec_cuda( } const int smem = warp_size*sizeof(float); - const dim3 block_nums(nrows, nchannels_y, nsamples_y); + const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, 
stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 64: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 160: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -175,28 +185,28 @@ static void launch_mul_mat_vec_cuda( template static void mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, 
const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { - switch (prec) { - case GGML_PREC_DEFAULT: { + if constexpr(std::is_same::value) { + if (prec == GGML_PREC_DEFAULT) { launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case GGML_PREC_F32: { - launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + return; + } } + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS; @@ -204,21 +214,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); - GGML_ASSERT(ne11 == 1); - GGML_ASSERT(ne12 == ne2); + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. GGML_ASSERT(ne13 == ne3); - GGML_ASSERT(nb00 == ts_src0); - GGML_ASSERT(nb10 == ts_src1); - GGML_ASSERT(nb0 == ts_dst); + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + GGML_ASSERT( nb0 == ts_dst); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? 
(const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s1 = dst->nb[1] / ts_dst; const int64_t s02 = src0->nb[2] / ts_src0; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s2 = dst->nb[2] / ts_dst; @@ -226,14 +239,33 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int64_t s13 = src1->nb[3] / ts_src1; const int64_t s3 = dst->nb[3] / ts_dst; + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? s11 : s12; + + GGML_ASSERT(ncols_dst == 1); + switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0->data; + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); @@ -262,27 +294,34 @@ void ggml_cuda_op_mul_mat_vec( const int64_t stride_row = ne00; const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; + const int64_t nchannels_dst = 1; const int64_t stride_channel_x = 0; const int64_t stride_channel_y = 0; const int64_t stride_channel_dst = 0; const int64_t nsamples_x = 1; - const int64_t nsamples_y = 1; + const int64_t nsamples_dst = 1; const int64_t stride_sample_x = 0; const int64_t stride_sample_y = 0; const int64_t stride_sample_dst = 0; switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0_dd_i; + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, 
stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh index 78a1cd4a6..756e7e1cc 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh @@ -3,7 +3,7 @@ // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available #define MMV_MAX_ROWS 512 -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( ggml_backend_cuda_context & ctx, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu index eef8585a7..dc7adf509 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu @@ -1,50 +1,57 @@ #include "mmvq.cuh" +#include "quantize.cuh" #include "vecdotq.cuh" +#include + typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 : - type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 : - type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 : - type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 : - type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 : - type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 : - type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 : - type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 : - type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 : - type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 : - type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 : - type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 : - type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 : - type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 : - type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 : - type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 : - type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 : - type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 : - type == GGML_TYPE_IQ3_S ? 
vec_dot_iq3_s_q8_1 : - nullptr; + switch (type) { + case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1; + case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1; + case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1; + case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1; + case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1; + case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1; + case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1; + case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; + case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; + case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; + case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1; + case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1; + case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1; + case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1; + case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1; + case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; + case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; + case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; + case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; + default: return nullptr; + } } static constexpr __device__ int get_vdr_mmvq(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : - type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : - type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : - type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : - type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : - type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : - type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : - type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : - type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : - type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_XS ? 
VDR_IQ4_XS_Q8_1_MMVQ : - 1; + switch (type) { + case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ; + case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ; + case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ; + case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ; + case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ; + case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ; + case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ; + case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ; + case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ; + case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ; + case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; + case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; + case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; + default: return 1; + } } enum mmvq_parameter_table_id { @@ -73,9 +80,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { return MMVQ_PARAMETERS_GENERIC; } -static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) { +static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -90,7 +97,7 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } } else if (table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -107,9 +114,9 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } -static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) { +static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: return 1; case 2: @@ -127,19 +134,21 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta return 1; } -template +template // tell the compiler to use as many registers as it wants, see nwarps definition below -__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) +__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); constexpr mmvq_parameter_table_id table_id = get_device_table_id(); - constexpr int nwarps = calc_nwarps(ncols_y, table_id); - constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id); + constexpr int nwarps = 
calc_nwarps(ncols_dst, table_id); + constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id); constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); @@ -147,13 +156,21 @@ static __global__ void mul_mat_vec_q( const int tid = warp_size*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; - // partial sum for each thread - float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}}; + // The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1. + const int channel_dst = blockIdx.y; + const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; - const block_q8_1 * y = (const block_q8_1 *) vy; + // partial sum for each thread + float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; + + const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; + const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx @@ -162,18 +179,19 @@ static __global__ void mul_mat_vec_q( const int kqs = vdr * (tid % (qi/vdr)); #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs); + tmp[j][i] += vec_dot_q_cuda( + vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); } } } - __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size]; + __shared__ float tmp_shared[nwarps-1 > 0 ? 
nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; @@ -185,9 +203,11 @@ static __global__ void mul_mat_vec_q( return; } + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0; + // sum up partial sums and write back result #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { #pragma unroll @@ -197,88 +217,121 @@ static __global__ void mul_mat_vec_q( tmp[j][i] = warp_reduce_sum(tmp[j][i]); } - if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) { - dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; + if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) { + dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; } } - - GGML_UNUSED(nrows_x); } -static std::pair calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) { - const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id); - const dim3 block_nums(nblocks, 1, 1); - const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1); +static std::pair calc_launch_params( + const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y, + const int warp_size, const mmvq_parameter_table_id table_id) { + const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); + const dim3 block_nums(nblocks, nchannels_y, nsamples_y); + const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1); return {block_nums, block_dims}; } template -static void mul_mat_vec_q_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +static void mul_mat_vec_q_switch_ncols_dst( + const void * vx, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); - GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); + GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); + + const int channel_ratio = nchannels_dst / nchannels_x; + const int sample_ratio = nsamples_dst / nsamples_x; const int device = ggml_cuda_get_device(); const int warp_size = ggml_cuda_info().devices[device].warp_size; const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); - switch (ncols_y) { + GGML_ASSERT(!ids || ncols_dst == 1); + switch (ncols_dst) { case 1: { - constexpr int c_ncols_y = 1; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int 
c_ncols_dst = 1; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 2: { - constexpr int c_ncols_y = 2; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 2; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 3: { - constexpr int c_ncols_y = 3; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 3; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 4: { - constexpr int c_ncols_y = 4; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 4; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 5: { - constexpr int c_ncols_y = 5; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 5; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 6: { - constexpr int c_ncols_y = 6; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 6; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 7: { - constexpr int c_ncols_y = 7; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, 
nrows_dst); + constexpr int c_ncols_dst = 7; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 8: { - constexpr int c_ncols_y = 8; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 8; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } default: @@ -287,137 +340,224 @@ static void mul_mat_vec_q_cuda( } } -static void mul_mat_vec_q4_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +static void mul_mat_vec_q_switch_type( + const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { + switch (type_x) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + 
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, 
stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + default: + GGML_ABORT("fatal error"); + break; + } } -static void mul_mat_vec_q4_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +void ggml_cuda_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + GGML_TENSOR_BINARY_OP_LOCALS; -static void mul_mat_vec_q5_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + cudaStream_t stream = ctx.stream(); - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); -static void mul_mat_vec_q5_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. 
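The ids ? ... : ... selections in this function implement a layout swap: for plain MUL_MAT the second dst dimension indexes columns and the third indexes channels, while for MUL_MAT_ID each dst channel selects one expert row via ids, so the column and channel roles trade places. A minimal host-side sketch of that selection, reusing the names from the patch; pick_mmv_layout itself is a hypothetical helper for illustration, not part of this change:

#include <cstdint>

// Hypothetical helper mirroring the "ids ?" selections in ggml_cuda_mul_mat_vec_q.
struct mmv_layout {
    int64_t ncols_dst;                            // columns written per channel
    int64_t nchannels_y, nchannels_dst;           // src1 / dst channel counts
    int64_t stride_col_y, stride_col_dst;         // per-column strides (elements)
    int64_t stride_channel_y, stride_channel_dst; // per-channel strides (elements)
};

static mmv_layout pick_mmv_layout(bool has_ids,
        int64_t ne1, int64_t ne2, int64_t ne11, int64_t ne12,
        int64_t s1, int64_t s2, int64_t s11, int64_t s12) {
    if (has_ids) {
        // MUL_MAT_ID: channels enumerate experts, so the column and channel
        // strides swap relative to plain MUL_MAT.
        return {ne2, ne11, ne1, s12, s2, s11, s1};
    }
    return {ne1, ne12, ne2, s11, s1, s12, s2}; // plain MUL_MAT
}

With has_ids == true this yields exactly the assignments made below, e.g. stride_channel_dst becomes s1 rather than s2 and stride_col_dst becomes s2 rather than s1.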
-static void mul_mat_vec_q8_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + // If src0 is a temporary compute buffer, clear any potential padding. + if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { + const size_t size_data = ggml_nbytes(src0); + const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0); + if (size_alloc > size_data) { + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); + GGML_ASSERT(!src0->view_src); + CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream)); + } + } -static void mul_mat_vec_q2_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1); + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = ne10_padded / QK8_1; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; -static void mul_mat_vec_q3_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const int64_t s12 = ne11*s11; + const int64_t s13 = ne12*s12; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? 
s11 : s12; -static void mul_mat_vec_q4_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q5_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q6_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq3_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_m_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_nl_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq3_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int 
nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_switch_type( + src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, stream); } void ggml_cuda_op_mul_mat_vec_q( @@ -440,68 +580,12 @@ void ggml_cuda_op_mul_mat_vec_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_S: - mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_M: - mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_NL: - mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_XS: - 
mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_S: - mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - default: - GGML_ABORT("fatal error"); - break; - } + const int stride_row_x = ne00 / ggml_blck_size(src0->type); + const int stride_col_y = src1_padded_row_size / QK8_1; + + mul_mat_vec_q_switch_type( + src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh index d9e42fdd6..39dc7d33e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh @@ -2,6 +2,9 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. +void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_vec_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu index 7d45a7e19..77432b046 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu @@ -47,49 +47,3 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); } - -static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { - // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 - // blockIdx.y: idx of ne1 - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { - int offset_src = - nidx + - blockIdx.y * ne00 + - blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } -} - -static void unpad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, const int ne03, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2*ne3); - unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -} - -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - unpad_f32_cuda(src0_d, dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -} \ No newline at end of file diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh index 
e2ededc3c..8fd386b00 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh @@ -3,4 +3,3 @@ #define CUDA_PAD_BLOCK_SIZE 256 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu index 1702e4ce2..cb9318145 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu @@ -1,30 +1,40 @@ #include "quantize.cuh" #include -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { - const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1( + const float * __restrict__ x, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { + const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const int64_t ix1 = blockIdx.y; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t i_padded = ix1*kx0_padded + ix0; + const int64_t & i00 = i0; + const int64_t & i01 = i1; + const int64_t & i02 = i2; + const int64_t & i03 = i3; + + const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0; block_q8_1 * y = (block_q8_1 *) vy; - const int64_t ib = i_padded / QK8_1; // block index - const int64_t iqs = i_padded % QK8_1; // quant index + const int64_t ib = i_cont / QK8_1; // block index + const int64_t iqs = i_cont % QK8_1; // quant index - const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f; + const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; amax = warp_reduce_max(amax); - sum = warp_reduce_sum(sum); + sum = warp_reduce_sum(sum); - const float d = amax / 127; + const float d = amax / 127; const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); y[ib].qs[iqs] = q; @@ -39,29 +49,38 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest template static __global__ void quantize_mmq_q8_1( - const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { + const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32; constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32; - const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; + const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const float4 * x4 = (const float4 *) x; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t ix1 = kx1*blockIdx.z + blockIdx.y; + const int64_t i00 = i0; + const int64_t i01 = ids ? 
ids[i1] : i1; + const int64_t i02 = i2; + const int64_t i03 = i3; + + const float4 * x4 = (const float4 *) x; block_q8_1_mmq * y = (block_q8_1_mmq *) vy; const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel - const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel - const int64_t iqs = ix0 % (4*QK8_1); // quant index in block + const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel + const int64_t iqs = i0 % (4*QK8_1); // quant index in block // Load 4 floats per thread and calculate max. abs. value between them: - const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); + const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); float amax = fabsf(xi.x); amax = fmaxf(amax, fabsf(xi.y)); amax = fmaxf(amax, fabsf(xi.z)); @@ -77,7 +96,7 @@ static __global__ void quantize_mmq_q8_1( if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) { sum = xi.x + xi.y + xi.z + xi.w; - // Exchange calculate sum across vals_per_sum/4 threads. + // Calculate sums across vals_per_sum/4 threads. #pragma unroll for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); @@ -127,40 +146,41 @@ static __global__ void quantize_mmq_q8_1( } void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(!ids); + GGML_ASSERT(ne0 % QK8_1 == 0); - GGML_ASSERT(kx0_padded % QK8_1 == 0); - - const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, kx1*channels, 1); + const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx0, kx0_padded); - - GGML_UNUSED(type_x); + quantize_q8_1<<>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2); + GGML_UNUSED(type_src0); } void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ne0 % (4*QK8_1) == 0); - GGML_ASSERT(kx0_padded % (4*QK8_1) == 0); - - const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); - const dim3 num_blocks(block_num_x, kx1, channels); + const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1); - switch (mmq_get_q8_1_ds_layout(type_x)) { + switch (mmq_get_q8_1_ds_layout(type_src0)) { case MMQ_Q8_1_DS_LAYOUT_D4: quantize_mmq_q8_1 - <<>>(x, 
vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_DS4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_D2S6: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; default: GGML_ABORT("fatal error"); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh index 03bf322b9..725ab5244 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh @@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); typedef void (*quantize_cuda_t)( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu index f9589080a..eb3d7cdba 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu @@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); const float * src0_d = (const float *) src0->data; float * dst_d = (float *) dst->data; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu new file mode 100644 index 000000000..fb26abeb0 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
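Before the flash-attention template instances, one note on the quantize.cu and quantize.cuh hunks just above: quantize_row_q8_1_cuda and quantize_mmq_q8_1_cuda now receive explicit source strides (s01/s02/s03) plus the padded row length ne0 in place of the old kx0/kx1/channels/kx0_padded arguments. The q8_1 destination stays contiguous, so the kernel derives the block and in-block quant indices from a flattened index, while the source load honors the strides and zero-fills columns past ne00. A compact sketch of that index math, assuming the usual 32-wide q8_1 block size from ggml:

#include <cstdint>

// Sketch of the indexing in quantize_q8_1 above, not the kernel itself.
// i0..i3 are logical dst coordinates; ne0 is the padded row length,
// ne00 the true row length (padding quantizes a zero instead).
constexpr int64_t QK8_1_SKETCH = 32; // assumed q8_1 block size, as in ggml

static void q8_1_slot(int64_t i0, int64_t i1, int64_t i2, int64_t i3,
                      int64_t ne0, int64_t ne1, int64_t ne2,
                      int64_t & ib, int64_t & iqs) {
    const int64_t i_cont = ((i3*ne2 + i2)*ne1 + i1)*ne0 + i0; // contiguous dst index
    ib  = i_cont / QK8_1_SKETCH; // q8_1 block index
    iqs = i_cont % QK8_1_SKETCH; // quant index within the block
}

This is also why, in ggml_cuda_mul_mat_vec_q above, the quantized side uses ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING) as the padded row length and then derives s11 = ne10_padded / QK8_1 for the vec-dot kernel.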
+ +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu index 80108615a..dc1682902 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 1, 8); -DECL_FATTN_MMA_F16_CASE(80, 1, 8); -DECL_FATTN_MMA_F16_CASE(96, 1, 8); -DECL_FATTN_MMA_F16_CASE(112, 1, 8); -DECL_FATTN_MMA_F16_CASE(128, 1, 8); -DECL_FATTN_MMA_F16_CASE(256, 1, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu index 66161c0ab..9d3cfd8ed 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 16, 1); -DECL_FATTN_MMA_F16_CASE(80, 16, 1); -DECL_FATTN_MMA_F16_CASE(96, 16, 1); -DECL_FATTN_MMA_F16_CASE(112, 16, 1); -DECL_FATTN_MMA_F16_CASE(128, 16, 1); -DECL_FATTN_MMA_F16_CASE(256, 16, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu index ee88c72aa..2e1883af4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 16, 2); -DECL_FATTN_MMA_F16_CASE(80, 16, 2); -DECL_FATTN_MMA_F16_CASE(96, 16, 2); -DECL_FATTN_MMA_F16_CASE(112, 16, 2); -DECL_FATTN_MMA_F16_CASE(128, 16, 2); -DECL_FATTN_MMA_F16_CASE(256, 16, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu index d888a5a42..2074e954a 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 16, 4); 
-DECL_FATTN_MMA_F16_CASE(80, 16, 4); -DECL_FATTN_MMA_F16_CASE(96, 16, 4); -DECL_FATTN_MMA_F16_CASE(112, 16, 4); -DECL_FATTN_MMA_F16_CASE(128, 16, 4); -DECL_FATTN_MMA_F16_CASE(256, 16, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu new file mode 100644 index 000000000..f011a208c --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu index d93a2d08e..24c64cf00 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 2, 4); -DECL_FATTN_MMA_F16_CASE(80, 2, 4); -DECL_FATTN_MMA_F16_CASE(96, 2, 4); -DECL_FATTN_MMA_F16_CASE(112, 2, 4); -DECL_FATTN_MMA_F16_CASE(128, 2, 4); -DECL_FATTN_MMA_F16_CASE(256, 2, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu index 617464c94..163b1d939 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 2, 8); -DECL_FATTN_MMA_F16_CASE(80, 2, 8); -DECL_FATTN_MMA_F16_CASE(96, 2, 8); -DECL_FATTN_MMA_F16_CASE(112, 2, 8); -DECL_FATTN_MMA_F16_CASE(128, 2, 8); -DECL_FATTN_MMA_F16_CASE(256, 2, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu index 970d2b686..0543532ea 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 32, 1); -DECL_FATTN_MMA_F16_CASE(80, 32, 1); -DECL_FATTN_MMA_F16_CASE(96, 32, 1); 
-DECL_FATTN_MMA_F16_CASE(112, 32, 1); -DECL_FATTN_MMA_F16_CASE(128, 32, 1); -DECL_FATTN_MMA_F16_CASE(256, 32, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu index 65cd377c3..407b6cf4c 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 32, 2); -DECL_FATTN_MMA_F16_CASE(80, 32, 2); -DECL_FATTN_MMA_F16_CASE(96, 32, 2); -DECL_FATTN_MMA_F16_CASE(112, 32, 2); -DECL_FATTN_MMA_F16_CASE(128, 32, 2); -DECL_FATTN_MMA_F16_CASE(256, 32, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu new file mode 100644 index 000000000..f5fd0e236 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
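Across these template-instance files, DECL_FATTN_MMA_F16_CASE gains a second head-size argument: the first is now the K/Q head size (DKQ) and the second the V head size (DV), where the old single value covered both. Existing instances become DKQ == DV pairs, and the new 576/512 instances (declared only for ncols2 = 16 in this patch) cover MLA-style attention where the K/Q and V head sizes differ. A hedged sketch of the declaration shape, assuming the macro forwards to a templated case function as the includes of fattn-mma-f16.cuh suggest:

// Sketch only; the real macro lives in fattn-mma-f16.cuh.
#define DECL_FATTN_MMA_F16_CASE(DKQ, DV, ncols1, ncols2)             \
    template void ggml_cuda_flash_attn_ext_mma_f16_case             \
        <DKQ, DV, ncols1, ncols2>(ggml_backend_cuda_context & ctx,  \
                                  ggml_tensor * dst)

Splitting the instances across autogenerated files like the ones here keeps per-translation-unit compile times manageable as the instance count grows.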
+ +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu index f4a8bf348..5e4668502 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 4, 2); -DECL_FATTN_MMA_F16_CASE(80, 4, 2); -DECL_FATTN_MMA_F16_CASE(96, 4, 2); -DECL_FATTN_MMA_F16_CASE(112, 4, 2); -DECL_FATTN_MMA_F16_CASE(128, 4, 2); -DECL_FATTN_MMA_F16_CASE(256, 4, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu index de191a8ab..1ada657f1 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 4, 4); -DECL_FATTN_MMA_F16_CASE(80, 4, 4); -DECL_FATTN_MMA_F16_CASE(96, 4, 4); -DECL_FATTN_MMA_F16_CASE(112, 4, 4); -DECL_FATTN_MMA_F16_CASE(128, 4, 4); -DECL_FATTN_MMA_F16_CASE(256, 4, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu index e8cb0e1b3..bad296b41 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 4, 8); -DECL_FATTN_MMA_F16_CASE(80, 4, 8); -DECL_FATTN_MMA_F16_CASE(96, 4, 8); -DECL_FATTN_MMA_F16_CASE(112, 4, 8); -DECL_FATTN_MMA_F16_CASE(128, 4, 8); -DECL_FATTN_MMA_F16_CASE(256, 4, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 4, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu index a532e9629..0d7a9c728 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 64, 1); -DECL_FATTN_MMA_F16_CASE(80, 64, 
1); -DECL_FATTN_MMA_F16_CASE(96, 64, 1); -DECL_FATTN_MMA_F16_CASE(112, 64, 1); -DECL_FATTN_MMA_F16_CASE(128, 64, 1); -DECL_FATTN_MMA_F16_CASE(256, 64, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 64, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu index bf25181aa..9d5a9976f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 1); -DECL_FATTN_MMA_F16_CASE(80, 8, 1); -DECL_FATTN_MMA_F16_CASE(96, 8, 1); -DECL_FATTN_MMA_F16_CASE(112, 8, 1); -DECL_FATTN_MMA_F16_CASE(128, 8, 1); -DECL_FATTN_MMA_F16_CASE(256, 8, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu index 378c132e6..a6e6f093d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 2); -DECL_FATTN_MMA_F16_CASE(80, 8, 2); -DECL_FATTN_MMA_F16_CASE(96, 8, 2); -DECL_FATTN_MMA_F16_CASE(112, 8, 2); -DECL_FATTN_MMA_F16_CASE(128, 8, 2); -DECL_FATTN_MMA_F16_CASE(256, 8, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu index 372641be9..86d4ffae2 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 4); -DECL_FATTN_MMA_F16_CASE(80, 8, 4); -DECL_FATTN_MMA_F16_CASE(96, 8, 4); -DECL_FATTN_MMA_F16_CASE(112, 8, 4); -DECL_FATTN_MMA_F16_CASE(128, 8, 4); -DECL_FATTN_MMA_F16_CASE(256, 8, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu index 9ff5968b6..680a13ca6 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 8); -DECL_FATTN_MMA_F16_CASE(80, 8, 8); -DECL_FATTN_MMA_F16_CASE(96, 8, 8); -DECL_FATTN_MMA_F16_CASE(112, 8, 8); -DECL_FATTN_MMA_F16_CASE(128, 8, 8); -DECL_FATTN_MMA_F16_CASE(256, 8, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh index 40091a0ef..ba195e1d1 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh @@ -1,3 +1,5 @@ +#pragma once + #include "common.cuh" #include diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 297933adb..3656c2383 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -2071,6 +2071,10 @@ typedef struct { float attn_factor; float beta_fast; float beta_slow; + int32_t sect_0; + int32_t sect_1; + int32_t sect_2; + int32_t sect_3; } ggml_metal_kargs_rope; typedef struct { @@ -2163,21 +2167,42 @@ typedef struct { } ggml_metal_kargs_mul_mv_ext; typedef struct { - int32_t nei0; - int32_t nei1; - uint64_t nbi1; + int32_t ne10; + int32_t ne11; // n_expert_used (bcast) + uint64_t nb11; + uint64_t nb12; + int32_t neh11; // n_tokens + uint64_t nbh11; + int32_t ne20; // n_expert_used + uint64_t nb21; +} ggml_metal_kargs_mul_mm_id_map0; + +typedef struct { + int32_t ne20; // n_expert_used + int32_t neh0; + int32_t neh1; + uint64_t nbh1; + uint64_t nbh2; + int32_t ne0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_mul_mm_id_map1; + +typedef struct { int32_t ne00; int32_t ne02; uint64_t nb01; uint64_t nb02; - int32_t ne11; - int32_t ne12; - int32_t ne13; - uint64_t nb10; - uint64_t nb11; - uint64_t nb12; - int32_t ne0; - int32_t ne1; + uint64_t nb03; + int32_t neh12; + uint64_t nbh10; + uint64_t nbh11; + uint64_t nbh12; + uint64_t nbh13; + int32_t neh0; + int32_t neh1; + int16_t r2; + int16_t r3; } ggml_metal_kargs_mul_mm_id; typedef struct { @@ -5166,8 +5191,148 @@ kernel void kernel_rope_neox( } } +template +kernel void kernel_rope_multi( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + // mrope theta calculations + // note: the rest is the same as kernel_rope_neox + const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; + const int 
sec_w01 = args.sect_0 + args.sect_1; // end of section 1 + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + + float theta_base; + if (sector < args.sect_0) { + theta_base = (float) pos[i2]; + } else if (sector < sec_w01) { + theta_base = (float) pos[i2 + args.ne02]; + } else if (sector < sec_w012) { + theta_base = (float) pos[i2 + args.ne02 * 2]; + } else { + theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_vision( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < 2*args.n_dims) { // different from kernel_rope_multi + const int ic = i0/2; + + // mrope theta calculations (only support 2 dimensions) + const int sect_dims = args.sect_0 + args.sect_1; + const int sector = ic % sect_dims; + + float p; + float theta_base; + if (sector < args.sect_1) { + p = (float) sector; + theta_base = (float) pos[i2]; + } else { + p = (float) sector - args.sect_0; + theta_base = (float) pos[i2 + args.ne02]; + } + + const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p); + // end of mrope + + const float freq_factor = src2 != src0 ? 
((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims]; // different from kernel_rope_multi + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + typedef decltype(kernel_rope_norm) kernel_rope_norm_t; typedef decltype(kernel_rope_neox) kernel_rope_neox_t; +typedef decltype(kernel_rope_multi) kernel_rope_multi_t; +typedef decltype(kernel_rope_vision) kernel_rope_vision_t; template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; @@ -5175,6 +5340,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_ template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; +template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi; +template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi; + +template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; +template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; + typedef void (im2col_t)( device const float * x, device char * dst, @@ -5428,51 +5599,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, @@ -5690,7 +5816,7 @@ kernel void kernel_flash_attn_ext( { float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... 
Q-1] = -__FLT16_MAX__/2 }; + float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -5950,7 +6076,7 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { float S = { 0.0f }; - float M = { -__FLT16_MAX__/2 }; + float M = { -__FLT_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -6197,7 +6323,7 @@ kernel void kernel_flash_attn_ext_vec( { float S = 0.0f; - float M = -__FLT16_MAX__/2; + float M = -__FLT_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; @@ -8834,127 +8960,219 @@ kernel void kernel_mul_mm( } } -// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids -// TODO: this kernel needs to be reimplemented from scratch for better performance -template -void kernel_mul_mm_id_impl( - int32_t ne00, - int32_t ne02, - uint64_t nb01, - uint64_t nb02, - int32_t ne11, - int32_t ne12, - uint64_t nb10, - uint64_t nb11, - uint64_t nb12, - int32_t ne0, - int32_t ne1, - int64_t ne0ne1, - device const char * src0, - device const char * src1, - threadgroup ushort2 * rowids, - device char * dst, - threadgroup char * shmem, +template +kernel void kernel_mul_mm_id_map0( + constant ggml_metal_kargs_mul_mm_id_map0 & args, + device const char * src1, + device const char * src2, + device char * hsrc1, + device char * htpe, + device char * hids, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int ide = tgpig[0]; // expert id + + int n_all = 0; + + device int32_t * ids_i32 = (device int32_t *) (hids); + + for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens + device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21); + + for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used + if (src2_i32[i20] != ide) { + continue; + } + + device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11); + device T4 * hsrc1_f32x4 = (device T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11); + + for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) { + hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]); + } + + if (tpitg.x == 0) { + ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all; + } + + ++n_all; + } + } + + if (tpitg.x == 0) { + device int32_t * tpe_i32 = (device int32_t *) (htpe); + tpe_i32[ide] = n_all; + } +} + +typedef decltype(kernel_mul_mm_id_map0) kernel_mul_mm_id_map0_t; + +template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0; + +template +kernel void kernel_mul_mm_id_map1( + constant ggml_metal_kargs_mul_mm_id_map1 & args, + device const char * hdst, + device const char * hids, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i20 = tgpig[0]; // used expert + const int i21 = tgpig[1]; // token + + device const int32_t * ids_i32 = (device const int32_t *) (hids); + device float4 * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2); + + const int id = ids_i32[i21*args.ne20 + i20]; + + const int ide = id / args.neh1; + const int idt = id % args.neh1; + + device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2); + + for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) { + 
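kernel_mul_mm_id_map0 above and kernel_mul_mm_id_map1 (whose copy loop continues just below) replace the old shared-memory rowids scheme with an explicit gather/scatter: map0 packs every activation row routed to a given expert into a contiguous f16 staging buffer (hsrc1), records per-expert row counts (htpe) and, per (token, used-expert slot), the packed row index (hids); after the dense per-expert matmul, map1 copies each result row back to its original position. A CPU sketch of the same bookkeeping, hedged: the buffer names follow the kernels, but layouts are simplified to flat row-major f32 (the GPU path stages f16), and the src1 broadcast via i20 % ne11 is ignored:

#include <cstdint>

// map0: pack rows per expert and remember where each (token, slot) went.
void map0(const float* src1, const int32_t* ids,
          int n_tokens, int slots, int n_expert, int n_embd,
          float* hsrc1,    // [n_expert * n_tokens][n_embd], per-expert capacity n_tokens
          int32_t* htpe,   // [n_expert]: rows actually used per expert
          int32_t* hids) { // [n_tokens * slots] -> packed row index
    for (int e = 0; e < n_expert; ++e) {
        int n_all = 0; // rows packed so far for expert e
        for (int t = 0; t < n_tokens; ++t) {
            for (int s = 0; s < slots; ++s) {
                if (ids[t*slots + s] != e) continue;
                const int row = e*n_tokens + n_all; // contiguous slot in hsrc1
                for (int i = 0; i < n_embd; ++i)
                    hsrc1[(int64_t)row*n_embd + i] = src1[(int64_t)t*n_embd + i];
                hids[t*slots + s] = row;
                ++n_all;
            }
        }
        htpe[e] = n_all; // the matmul kernel reads this to clip its grid
    }
}

// map1: scatter each matmul output row back to its (token, slot) position.
void map1(const float* hdst, const int32_t* hids,
          int n_tokens, int slots, int n_out, float* dst) {
    for (int t = 0; t < n_tokens; ++t)
        for (int s = 0; s < slots; ++s) {
            const int row = hids[t*slots + s];
            for (int i = 0; i < n_out; ++i)
                dst[((int64_t)t*slots + s)*n_out + i] = hdst[(int64_t)row*n_out + i];
        }
}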
dst_f32x4[i0] = hdst_f32x4[i0]; + } +} + +typedef decltype(kernel_mul_mm_id_map1) kernel_mul_mm_id_map1_t; + +template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1; + +template +kernel void kernel_mul_mm_id( + constant ggml_metal_kargs_mul_mm_id & args, + device const char * src0, + device const char * src1, + device const char * tpe, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], ushort tiitg[[thread_index_in_threadgroup]], ushort sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup half * sa = (threadgroup half *)(shmem); - threadgroup float * sb = (threadgroup float *)(shmem + 4096); + threadgroup T * sa = (threadgroup T *)(shmem); + threadgroup half * sb = (threadgroup half *)(shmem + 4096); const int r0 = tgpig.y; const int r1 = tgpig.x; + const int im = tgpig.z; - if (r1*BLOCK_SIZE_N >= ne1) return; + device const int32_t * tpe_i32 = (device const int32_t *) (tpe); + + const int neh1 = tpe_i32[im]; + + if (r1*BLOCK_SIZE_N >= neh1) { + return; + } // if this block is of 64x32 shape or smaller - short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix - short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; - simdgroup_half8x8 ma[4]; - simdgroup_float8x8 mb[2]; + simdgroup_T8x8 ma[4]; + simdgroup_half8x8 mb[2]; simdgroup_float8x8 mc[8]; - for (int i = 0; i < 8; i++){ + + for (short i = 0; i < 8; i++){ mc[i] = make_filled_simdgroup_matrix(0.f); } + short il = (tiitg % THREAD_PER_ROW); - ushort offset1 = il/nl; + const int i12 = im%args.neh12; + const int i13 = im/args.neh12; - threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col]; + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1; - device const float * y = (device const float *)(src1 - + nb12 * id[1] - + nb11 * (id[0] % ne11) - + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + device const block_q * x = (device const block_q *)(src0 + + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + device const half * y = (device const half *)(src1 + + args.nbh13*i13 + + args.nbh12*i12 + + args.nbh11*(r1*BLOCK_SIZE_N + thread_col) + + args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory - half4x4 temp_a; + T4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + #pragma unroll(16) + for (short i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ + + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ + + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2+nl-1)/nl : x; + x = (il < 2) ? 
x + (2 + nl - 1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); - #pragma unroll(BLOCK_SIZE_K/8) - for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { #pragma unroll(4) - for (int i = 0; i < 4; i++) { + for (short i = 0; i < 4; i++) { simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) - for (int i = 0; i < 2; i++) { + for (short i = 0; i < 2; i++) { simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) - for (int i = 0; i < 8; i++){ + for (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } + + lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; + lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; } } - { + if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) { + device float * C = (device float *) dst + + (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ + (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); if (sgitg == 0) { for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j]; - int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1; - - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + joff; + device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0; device float4 * D4 = (device float4 *) D; threadgroup float * C = temp_str + (j*BLOCK_SIZE_M); @@ -8974,66 +9192,6 @@ void kernel_mul_mm_id_impl( } } -template -kernel void kernel_mul_mm_id( - constant ggml_metal_kargs_mul_mm_id & args, - device const char * src0s, - device const char * src1, - device char * dst, - device const char * ids, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - const int32_t i02 = tgpig.z; - - tgpig.z = 0; - - device const char * src0 = src0s + i02*args.nb02; - - // row indices - threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192); - - // TODO: parallelize this loop - int32_t _ne1 = 0; - for (ushort ii1 = 0; ii1 < args.nei1; ii1++) { - for (ushort ii0 = 0; ii0 < args.nei0; ii0++) { - int32_t id = ((device int32_t *) (ids + 
ii1*args.nbi1))[ii0]; - if (id == i02) { - if (tiitg == 0) { - rowids[_ne1] = ushort2(ii0, ii1); - } - _ne1++; - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - kernel_mul_mm_id_impl( - args.ne00, - args.ne02, - args.nb01, - args.nb02, - args.ne11, - args.ne12, - args.nb10, - args.nb11, - args.nb12, - args.ne0, - _ne1, - (int64_t)args.ne0*args.ne1, - src0, - src1, - rowids, - dst, - shmem, - tgpig, - tiitg, - sgitg); -} - #define QK_NL 16 // @@ -9074,63 +9232,64 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get // matrix-matrix multiplication // -typedef decltype(kernel_mul_mm) mat_mm_t; +typedef decltype(kernel_mul_mm) mul_mm_t; -template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm; #endif -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel 
mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication // -typedef decltype(kernel_mul_mm_id) mat_mm_id_t; +typedef decltype(kernel_mul_mm_id) mul_mm_id; -template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #endif -template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id 
kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; + // // matrix-vector multiplication diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h index 8721b272d..17eab976f 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h @@ -207,6 +207,10 @@ typedef struct { float attn_factor; float beta_fast; float beta_slow; + int32_t sect_0; + int32_t sect_1; + int32_t sect_2; + int32_t sect_3; } ggml_metal_kargs_rope; typedef struct { @@ -299,21 +303,42 @@ typedef struct { } ggml_metal_kargs_mul_mv_ext; typedef struct { - int32_t nei0; - int32_t nei1; - uint64_t nbi1; + int32_t ne10; + int32_t ne11; // n_expert_used (bcast) + uint64_t nb11; + uint64_t nb12; + int32_t neh11; // n_tokens + uint64_t nbh11; + int32_t ne20; // n_expert_used + uint64_t nb21; +} ggml_metal_kargs_mul_mm_id_map0; + +typedef struct { + int32_t ne20; // n_expert_used + int32_t neh0; + int32_t neh1; + uint64_t nbh1; + uint64_t nbh2; + int32_t ne0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_mul_mm_id_map1; + +typedef struct { int32_t ne00; int32_t ne02; uint64_t nb01; uint64_t nb02; - int32_t ne11; - int32_t ne12; - int32_t ne13; - uint64_t nb10; - uint64_t nb11; - uint64_t nb12; - int32_t ne0; - int32_t ne1; + uint64_t nb03; + int32_t neh12; + uint64_t nbh10; + uint64_t nbh11; + uint64_t nbh12; + uint64_t nbh13; + int32_t neh0; + int32_t neh1; + int16_t r2; + int16_t r3; } ggml_metal_kargs_mul_mm_id; typedef struct { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index b2e95a66c..1b56f858c 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ static struct ggml_backend_device g_ggml_backend_metal_device; // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -306,30 +306,36 @@ enum ggml_metal_kernel_type { 
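The ROPE_MULTI/ROPE_VISION kernel types registered in the enum below correspond to kernel_rope_multi and kernel_rope_vision added earlier in the Metal source: positions arrive as up to four parallel streams (sect_0..sect_3 give each stream's share of the rotary dimensions), and each rotary pair picks its position stream according to which section its index lands in. A CPU sketch of that selection, hedged — the function name and the flat pos layout are simplifications of the Metal code:

#include <cmath>
#include <cstdint>

// ic: rotary pair index; n_dims: rotated dims; sect[4] mirrors sect_0..sect_3;
// pos: four position streams laid out back to back, n_rows entries each.
float mrope_theta(int ic, int n_dims, const int sect[4],
                  const int32_t* pos, int row, int n_rows, float freq_base) {
    const int sect_dims = sect[0] + sect[1] + sect[2] + sect[3];
    const int sector    = ic % sect_dims;

    int stream = 3; // which of the 4 position streams drives this pair
    if      (sector < sect[0])                     stream = 0;
    else if (sector < sect[0] + sect[1])           stream = 1;
    else if (sector < sect[0] + sect[1] + sect[2]) stream = 2;

    const float theta_base = (float) pos[row + stream*n_rows];
    // same frequency schedule as the kernel: theta_base * freq_base^(-i0/n_dims), i0 = 2*ic
    return theta_base * std::pow(freq_base, -2.0f*ic/(float)n_dims);
}

kernel_rope_vision follows the same pattern with only two sections, except that a per-sector position p (rather than the pair index) drives the frequency exponent and the rotated pair is offset by n_dims instead of n_dims/2.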
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, + GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, + GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, + GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, + GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, GGML_METAL_KERNEL_TYPE_IM2COL_F16, @@ -341,7 +347,6 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, - GGML_METAL_KERNEL_TYPE_UNPAD_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -491,7 +496,264 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + +struct ggml_metal_heap { + // number of times the heap was unused + int n_unused; + + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; + + // current offset in the heap - we reset this after each node in order to reuse the memory + size_t offs; + + // the currently allocated MTLBuffer objects in this heap + id obj; + + NSMutableArray * bufs; +}; + +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct 
ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypePlacement; + desc.size = size; + + heap->n_unused = 0; + heap->n_alloc = 0; + + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + [desc release]; + + heap->bufs = [[NSMutableArray alloc] init]; + + return heap; +} + +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->offs = 0; + + // count how many graph computes the heap ended up being unused + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + heap->n_unused++; + } + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; + + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + ggml_metal_heap_reset(heap); + + [heap->obj release]; + [heap->bufs release]; + + free(heap); +} + +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr +@end + +// +// ggml_metal_mem_pool +// + +struct ggml_metal_mem_pool { + id device; + + int n_heaps; // total number of heaps ever created (including those that were removed) + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->n_heaps = 0; + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if 
(heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + // remove in reverse order + for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + + if (i == 0) { + break; + } + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { + const size_t alignment = 256; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + if (heap == NULL) { + GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } + + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; + + return buf; +} + +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + struct ggml_backend_metal_context { + id device; id queue; dispatch_queue_t d_queue; @@ -516,7 +778,7 @@ struct ggml_backend_metal_context { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -706,9 +968,11 @@ static struct ggml_backend_metal_context * 
ggml_metal_init(ggml_backend_dev_t de struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -769,7 +1033,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -986,30 +1253,36 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && use_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, 
has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, mul_mm_id_map0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, mul_mm_id_map1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, mul_mm_id_f16_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, mul_mm_id_bf16_f16, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16, mul_mm_id_q4_0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16, mul_mm_id_q4_1_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16, mul_mm_id_q5_0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16, mul_mm_id_q5_1_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16, mul_mm_id_q8_0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16, mul_mm_id_q2_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16, mul_mm_id_q3_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16, mul_mm_id_q4_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16, mul_mm_id_q5_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16, mul_mm_id_q6_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16, mul_mm_id_iq2_xxs_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16, mul_mm_id_iq2_xs_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16, mul_mm_id_iq3_xxs_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16, mul_mm_id_iq3_s_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16, mul_mm_id_iq2_s_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16, mul_mm_id_iq1_s_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, mul_mm_id_iq1_m_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, mul_mm_id_iq4_nl_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, mul_mm_id_iq4_xs_f16, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, rope_multi_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, rope_vision_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, rope_vision_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); @@ -1021,7 +1294,6 @@ static struct ggml_backend_metal_context * 
ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1183,6 +1455,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + // ctx->cmd_bufs[i].obj is auto released + + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); + } + dispatch_release(ctx->d_queue); free(ctx); @@ -1367,16 +1645,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); case GGML_OP_ROPE: - { - const int mode = ((const int32_t *) op->op_params)[2]; - if (mode & GGML_ROPE_TYPE_MROPE) { - return false; - } - if (mode & GGML_ROPE_TYPE_VISION) { - return false; - } - return true; - } + return true; case GGML_OP_IM2COL: return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_1D: @@ -1386,7 +1655,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: @@ -1489,10 +1757,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1508,7 +1777,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1519,7 +1788,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -1530,6 +1799,8 @@ static void ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + ggml_metal_mem_pool_clear(mem_pool); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0;
@@ -2176,26 +2447,76 @@ static void ggml_metal_encode_node(
 const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
- ggml_metal_kargs_soft_max args = {
+// use this branch to test the ggml_metal_mem_pool functionality
+#if 0
+ // cpy to tmp buffer in MTLHeap
+
+ id<MTLBuffer> h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
+ if (!h_src0) {
+ GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
+ return false;
+ }
+
+ offs_src0 = 0;
+
+ ggml_metal_kargs_cpy args_cpy = {
 /*.ne00 =*/ ne00,
 /*.ne01 =*/ ne01,
 /*.ne02 =*/ ne02,
- /*.scale =*/ scale,
- /*.max_bias =*/ max_bias,
- /*.m0 =*/ m0,
- /*.m1 =*/ m1,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne00,
+ /*.ne1 =*/ ne01,
+ /*.ne2 =*/ ne02,
+ /*.ne3 =*/ ne03,
+ /*.nb0 =*/ nb00,
+ /*.nb1 =*/ nb01,
+ /*.nb2 =*/ nb02,
+ /*.nb3 =*/ nb03,
+ };
+
+ if (src0->type == GGML_TYPE_F16) {
+ [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline];
+ } else {
+ [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline];
+ }
+ [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+ [encoder setBuffer:h_src0 offset:0 atIndex:2];
+
+ GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+ int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type));
+
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)];
+
+#else
+ id<MTLBuffer> h_src0 = id_src0;
+#endif
+ // softmax
+
+ ggml_metal_kargs_soft_max args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.scale =*/ scale,
+ /*.max_bias =*/ max_bias,
+ /*.m0 =*/ m0,
+ /*.m1 =*/ m1,
 /*.n_head_log2 =*/ n_head_log2,
 };
 [encoder setComputePipelineState:pipeline];
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0];
 if (id_src1) {
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
 } else {
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+ [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1];
 }
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
- [encoder setBytes:&args length:sizeof(args) atIndex:3];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+ [encoder setBytes:&args length:sizeof(args) atIndex:3];
 [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
@@ -2686,7 +3007,7 @@ static void ggml_metal_encode_node(
 [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
 [encoder setThreadgroupMemoryLength:8192 atIndex:0];
- [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
 } else {
 id<MTLComputePipelineState> pipeline = nil;
@@ -2906,8 +3227,6 @@ static void ggml_metal_encode_node(
 } break;
 case GGML_OP_MUL_MAT_ID:
 {
- const int n_as = src0->ne[2];
-
 // src2 = ids
 const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
 GGML_ASSERT(ne03 == 1);
 GGML_ASSERT(ne13 == 1);
+ const uint32_t r2 = 1;
+ const uint32_t r3 = 1;
+
 // find the break-even point where the matrix-matrix kernel becomes more
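
The #if 0 branch above is a self-test for the memory pool: it stages src0 into a pool-backed temporary via a CPY kernel and points the softmax at the copy, while the #else path aliases the original buffer so the normal path stays zero-copy. A toy C++ version of the same toggle; TEST_MEM_POOL and stage_input are invented names for illustration:

#include <cstring>
#include <vector>

// With the toggle on, the input is copied into pool-backed storage first
// (exercising the allocator); with it off, the caller reads the original
// buffer directly, matching the zero-copy h_src0 = id_src0 path.
#define TEST_MEM_POOL 0

float * stage_input(std::vector<float> & pool_storage, float * src, std::size_t n) {
#if TEST_MEM_POOL
    pool_storage.resize(n);                                   // stands in for ggml_metal_mem_pool_alloc
    std::memcpy(pool_storage.data(), src, n * sizeof(float));
    return pool_storage.data();                               // downstream kernel reads the staged copy
#else
    (void) pool_storage;
    return src;                                               // no staging: alias the original buffer
#endif
}

int main() {
    std::vector<float> pool;
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    return stage_input(pool, x, 4) == x ? 0 : 1;              // 0 with the toggle off
}
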
efficient compared // to the matrix-vector kernel // ne20 = n_used_experts - // ne21 = n_rows - const int dst_rows = ne20*ne21; - const int dst_rows_min = n_as; - const int dst_rows_max = (device.maxThreadgroupMemoryLength/2 - 8192)/4; - - // max size of the rowids array in the kernel shared buffer - //GGML_ASSERT(dst_rows <= dst_rows_max); + // ne21 = n_rows (batch size) + const int ne21_mm_id_min = 32; // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel if ([device supportsFamily:MTLGPUFamilyApple7] && ne00 % 32 == 0 && ne00 >= 64 && - //ne01 / ne02 >= 512 && // NOTE: this is based on Mixtral shapes, might need adjustments - dst_rows > dst_rows_min && - dst_rows <= dst_rows_max) { + (ne21 >= ne21_mm_id_min)) { + GGML_ASSERT(ne00 % 4 == 0); // some Metal matrix data types require aligned pointers // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) @@ -2949,62 +3265,169 @@ static void ggml_metal_encode_node( default: break; } - id pipeline = nil; + const int64_t neh10 = ne10; // n_embd + const int64_t neh11 = ne21; // n_tokens + const int64_t neh12 = ne02; // n_expert - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; - case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; - case 
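
The old heuristic bounded dst_rows between n_as and a maximum derived from threadgroup memory; the new one only asks whether the batch is large enough (ne21 >= 32) for the tiled kernel to win, alongside the existing hardware and alignment checks. As a predicate:

#include <cstdint>

// The replacement heuristic: the tiled matrix-matrix path needs Apple7+
// simdgroup hardware, an inner dimension that is a multiple of 32 and at
// least 64 wide, and a batch of at least ne21_mm_id_min rows to amortize
// tile setup; otherwise the matrix-vector kernel is reused.
bool use_mul_mm_id(bool supports_apple7, int64_t ne00 /*inner dim*/, int64_t ne21 /*batch rows*/) {
    const int64_t ne21_mm_id_min = 32;
    return supports_apple7 && ne00 % 32 == 0 && ne00 >= 64 && ne21 >= ne21_mm_id_min;
}
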
GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; - default: GGML_ABORT("MUL_MAT_ID not implemented"); + const uint64_t nbh10 = ggml_type_size(GGML_TYPE_F16); + const uint64_t nbh11 = nbh10*neh10; + const uint64_t nbh12 = nbh11*neh11; + const uint64_t nbh13 = nbh12*neh12; + + const size_t s_src1 = ggml_type_size(GGML_TYPE_F16)*neh10*neh11*neh12; + id h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1); + if (!h_src1) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1); + return false; } - ggml_metal_kargs_mul_mm_id args = { - /*.nei0 =*/ ne20, - /*.nei1 =*/ ne21, - /*.nbi1 =*/ nb21, - /*.ne00 =*/ ne00, - /*.ne02 =*/ ne02, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.ne13 =*/ ne13, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.ne0 =*/ ne0, - /*.ne1 =*/ ne1, - }; + const int64_t neh0 = ne0; + const int64_t neh1 = ne21; + const int64_t neh2 = ne02; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4]; + const uint64_t nbh0 = ggml_type_size(GGML_TYPE_F32); + const uint64_t nbh1 = nbh0*neh0; + const uint64_t nbh2 = nbh1*neh1; + //const uint64_t nbh3 = nbh2*neh2; - [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; + const size_t s_dst = ggml_type_size(GGML_TYPE_F32)*neh0*neh1*neh2; + id h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst); + if (!h_dst) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst); + return false; + } - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + // tokens per expert + const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02; + id h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe); + if (!h_tpe) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe); + return false; + } + + // id map + // [n_expert_used, n_tokens] + const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne20*ne21; + id h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids); + if (!h_ids) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids); + return false; + } + + { + const int nth = MIN(1024, ne10/4); + + ggml_metal_kargs_mul_mm_id_map0 args = { + ne10, + ne11, // n_expert_used (bcast) + nb11, + nb12, + neh11, // n_tokens + nbh11, + ne20, // n_expert_used + nb21, + }; + + id pipeline = nil; + + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer: h_src1 offset:0 atIndex:3]; + [encoder setBuffer: h_tpe offset:0 atIndex:4]; + [encoder setBuffer: h_ids offset:0 atIndex:5]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne02, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + + { + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16 ].pipeline; break; + case 
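
Four pool allocations back the new path: src1 staged to half precision and grouped by expert (h_src1), a padded f32 destination (h_dst), a tokens-per-expert counter array (h_tpe), and the (n_expert_used, n_tokens) id map (h_ids). Note these are worst-case sizes: room is reserved as if every expert processed every token. A small C++ calculator for the same byte counts; the dimension values below are example numbers, not taken from the diff:

#include <cstdint>
#include <cstdio>

// Byte counts for the four pool allocations, following the shapes in the diff:
// src1 staged as f16 grouped by expert, an f32 destination staging buffer, one
// i32 counter per expert, and one i32 id per (expert_used, token) pair.
int main() {
    const int64_t n_embd = 4096, n_rows = 14336, n_tokens = 64, n_expert = 8, n_expert_used = 2;
    const std::size_t s_src1 = sizeof(uint16_t) * n_embd * n_tokens * n_expert;  // h_src1
    const std::size_t s_dst  = sizeof(float)    * n_rows * n_tokens * n_expert;  // h_dst
    const std::size_t s_tpe  = sizeof(int32_t)  * n_expert;                      // h_tpe
    const std::size_t s_ids  = sizeof(int32_t)  * n_expert_used * n_tokens;      // h_ids
    std::printf("src1 %zu, dst %zu, tpe %zu, ids %zu bytes\n", s_src1, s_dst, s_tpe, s_ids);
}
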
GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16 ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16 ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break; + default: GGML_ABORT("MUL_MAT_ID not implemented"); + } + + ggml_metal_kargs_mul_mm_id args = { + /*.ne00 =*/ ne00, + /*.ne02 =*/ ne02, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.neh12 =*/ neh12, + /*.nbh10 =*/ nbh10, + /*.nbh11 =*/ nbh11, + /*.nbh12 =*/ nbh12, + /*.nbh13 =*/ nbh13, + /*.neh0 =*/ neh0, + /*.neh1 =*/ neh1, + /*.r2 =*/ r2, + /*.r3 =*/ r3, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer: h_src1 offset:0 atIndex:2]; + [encoder setBuffer: h_tpe offset:0 atIndex:3]; + [encoder setBuffer: h_dst offset:0 atIndex:4]; + + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, ne02) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } + + { + GGML_ASSERT(ne0 % 4 == 0); + + const int nth = MIN(1024, ne0/4); + + ggml_metal_kargs_mul_mm_id_map1 args = { + ne20, // n_expert_used + neh0, + neh1, + nbh1, + nbh2, + ne0, + nb1, + nb2, + }; + + id pipeline = nil; + + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args 
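
The three dispatches above implement a gather/matmul/scatter pipeline: map0 (one threadgroup per expert) groups token rows by expert and records where each (slot, token) row landed, the mul_mm_id kernel multiplies each expert's contiguous block, and map1 uses the recorded ids to scatter rows back into token order. A CPU reference of the bookkeeping, with scalar rows instead of embeddings and invented names:

#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 4, n_expert = 3, n_expert_used = 2;
    // ids[t][s]: expert chosen for slot s of token t (the role of src2).
    const int ids[4][2] = {{0,2},{1,2},{0,1},{2,1}};

    std::vector<int> tpe(n_expert, 0);                     // h_tpe: tokens per expert
    std::vector<int> where(n_tokens * n_expert_used, -1);  // h_ids: staged row per (token, slot)
    for (int e = 0; e < n_expert; ++e) {                   // map0: one threadgroup per expert
        int n_all = 0;
        for (int t = 0; t < n_tokens; ++t)
            for (int s = 0; s < n_expert_used; ++s)
                if (ids[t][s] == e) {
                    // row for (t,s) is staged at slot e*n_tokens + n_all in h_src1
                    where[t*n_expert_used + s] = e*n_tokens + n_all++;
                }
        tpe[e] = n_all;
    }
    // map1: recover (expert, position) from each recorded id and copy back.
    for (int t = 0; t < n_tokens; ++t)
        for (int s = 0; s < n_expert_used; ++s) {
            const int id = where[t*n_expert_used + s];
            std::printf("token %d slot %d -> expert %d, row %d (of %d)\n",
                        t, s, id / n_tokens, id % n_tokens, tpe[id / n_tokens]);
        }
}
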
length:sizeof(args) atIndex:0]; + [encoder setBuffer: h_dst offset:0 atIndex:1]; + [encoder setBuffer: h_ids offset:0 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne20, ne21, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } } else { id pipeline = nil; @@ -3198,7 +3621,7 @@ static void ggml_metal_encode_node( [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4]; const int64_t _ne1 = 1; - const int64_t ne123 = dst_rows; + const int64_t ne123 = ne20*ne21; if (smem > 0) { [encoder setThreadgroupMemoryLength:smem atIndex:0]; @@ -3402,6 +3825,7 @@ static void ggml_metal_encode_node( } break; case GGML_OP_ROPE: { + // make sure we have one or more position id(ne10) per token(ne02) GGML_ASSERT(ne10 % ne02 == 0); GGML_ASSERT(ne10 >= ne02); @@ -3428,20 +3852,42 @@ static void ggml_metal_encode_node( memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + // mrope + const int sect_0 = ((const int32_t *) dst->op_params)[11]; + const int sect_1 = ((const int32_t *) dst->op_params)[12]; + const int sect_2 = ((const int32_t *) dst->op_params)[13]; + const int sect_3 = ((const int32_t *) dst->op_params)[14]; id pipeline = nil; - if (!is_neox) { + if (is_neox) { switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } else if (is_mrope && !is_vision) { + GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } else if (is_vision) { + GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break; default: GGML_ABORT("fatal error"); }; } else { switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; default: GGML_ABORT("fatal error"); }; } @@ -3472,6 +3918,10 @@ static void ggml_metal_encode_node( /*.attn_factor =*/ attn_factor, /*.beta_fast =*/ beta_fast, /*.beta_slow =*/ beta_slow, + /* sect_0 =*/ sect_0, + /* sect_1 =*/ sect_1, + /* sect_2 =*/ sect_2, + /* sect_3 =*/ sect_3, }; [encoder setComputePipelineState:pipeline]; @@ -3734,36 +4184,6 @@ static void 
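
ROPE now selects between four kernels: norm, neox, multi (M-RoPE) and vision, with the four mrope section sizes read from op_params 11..14. Each rotated pair picks a position channel by locating its index inside the concatenated sections; a C++ sketch of that lookup:

// Per-pair section lookup as done in kernel_rope_multi: the rotated pair index
// selects one of up to four position channels, stored ne02 entries apart in
// the positions tensor (theta_base = pos[i2 + channel*ne02]).
int mrope_pos_channel(int ic, const int sect[4]) {
    const int sect_dims = sect[0] + sect[1] + sect[2] + sect[3];
    const int sector    = ic % sect_dims;
    if (sector < sect[0])                     return 0;
    if (sector < sect[0] + sect[1])           return 1;
    if (sector < sect[0] + sect[1] + sect[2]) return 2;
    return 3;
}
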
ggml_metal_encode_node( const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_UNPAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: @@ -4634,6 +5054,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4687,25 +5109,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4714,14 +5136,14 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4729,20 +5151,20 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? 
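
As before, every command buffer is waited on and its status inspected, so an out-of-memory or execution failure surfaces as GGML_STATUS_FAILED rather than silently wrong output; only the storage moved from ctx->command_buffers[i] to ctx->cmd_bufs[i].obj so each buffer can carry its memory pool alongside. A sketch of the sweep, with status values reduced to two for brevity:

#include <cstdio>

enum status { COMPLETED, FAILED };
struct cmd_buf { status st; const char * err; };

// Simplified sweep over the command buffers after submission: the real code
// blocks in [cmd_buf waitUntilCompleted] before each status check.
bool check_all(const cmd_buf * bufs, int n_cb) {
    for (int i = 0; i < n_cb; ++i) {
        if (bufs[i].st != COMPLETED) {
            std::printf("command buffer %d failed: %s\n", i, bufs[i].err ? bufs[i].err : "unknown");
            return false;   // maps to GGML_STATUS_FAILED
        }
    }
    return true;
}

int main() {
    const cmd_buf bufs[2] = {{COMPLETED, nullptr}, {FAILED, "out of memory"}};
    return check_all(bufs, 2) ? 0 : 1;
}
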
ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5126,8 +5548,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5139,22 +5562,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } + + if (!res) { + break; + } } [encoder endEncoding]; if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 71f0f97ff..9cfddf450 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -2713,8 +2713,148 @@ kernel void kernel_rope_neox( } } +template +kernel void kernel_rope_multi( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + // mrope theta calculations + // note: the rest is the same as kernel_rope_neox + const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; + const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1 + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + + float theta_base; + if (sector < args.sect_0) { + theta_base = (float) pos[i2]; + } else if (sector < sec_w01) { + theta_base = (float) pos[i2 + args.ne02]; + } else if (sector < sec_w012) { + theta_base = (float) pos[i2 + args.ne02 * 2]; + } else { + theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = src2 != src0 ? 
((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_vision( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < 2*args.n_dims) { // different from kernel_rope_multi + const int ic = i0/2; + + // mrope theta calculations (only support 2 dimensions) + const int sect_dims = args.sect_0 + args.sect_1; + const int sector = ic % sect_dims; + + float p; + float theta_base; + if (sector < args.sect_1) { + p = (float) sector; + theta_base = (float) pos[i2]; + } else { + p = (float) sector - args.sect_0; + theta_base = (float) pos[i2 + args.ne02]; + } + + const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p); + // end of mrope + + const float freq_factor = src2 != src0 ? 
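
kernel_rope_vision differs from kernel_rope_multi in two ways visible above: it is active over 2*n_dims elements rather than n_dims, and it rotates pairs that sit n_dims apart instead of n_dims/2. An index-only C++ sketch of the two layouts; the trig (rope_yarn) is identical in both kernels:

// kernel_rope_multi rotates the pair (ic, ic + n_dims/2) inside the first
// n_dims elements; kernel_rope_vision rotates (ic, ic + n_dims) and so spans
// 2*n_dims elements, matching src[args.n_dims/2] vs src[args.n_dims] above.
void pair_multi (int ic, int n_dims, int * a, int * b) { *a = ic; *b = ic + n_dims / 2; }
void pair_vision(int ic, int n_dims, int * a, int * b) { *a = ic; *b = ic + n_dims;     }
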
((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims]; // different from kernel_rope_multi + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + typedef decltype(kernel_rope_norm) kernel_rope_norm_t; typedef decltype(kernel_rope_neox) kernel_rope_neox_t; +typedef decltype(kernel_rope_multi) kernel_rope_multi_t; +typedef decltype(kernel_rope_vision) kernel_rope_vision_t; template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; @@ -2722,6 +2862,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_ template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; +template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi; +template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi; + +template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; +template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; + typedef void (im2col_t)( device const float * x, device char * dst, @@ -2975,51 +3121,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, @@ -3237,7 +3338,7 @@ kernel void kernel_flash_attn_ext( { float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... 
Q-1] = -__FLT16_MAX__/2 }; + float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -3497,7 +3598,7 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { float S = { 0.0f }; - float M = { -__FLT16_MAX__/2 }; + float M = { -__FLT_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3744,7 +3845,7 @@ kernel void kernel_flash_attn_ext_vec( { float S = 0.0f; - float M = -__FLT16_MAX__/2; + float M = -__FLT_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; @@ -6381,127 +6482,219 @@ kernel void kernel_mul_mm( } } -// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids -// TODO: this kernel needs to be reimplemented from scratch for better performance -template -void kernel_mul_mm_id_impl( - int32_t ne00, - int32_t ne02, - uint64_t nb01, - uint64_t nb02, - int32_t ne11, - int32_t ne12, - uint64_t nb10, - uint64_t nb11, - uint64_t nb12, - int32_t ne0, - int32_t ne1, - int64_t ne0ne1, - device const char * src0, - device const char * src1, - threadgroup ushort2 * rowids, - device char * dst, - threadgroup char * shmem, +template +kernel void kernel_mul_mm_id_map0( + constant ggml_metal_kargs_mul_mm_id_map0 & args, + device const char * src1, + device const char * src2, + device char * hsrc1, + device char * htpe, + device char * hids, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int ide = tgpig[0]; // expert id + + int n_all = 0; + + device int32_t * ids_i32 = (device int32_t *) (hids); + + for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens + device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21); + + for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used + if (src2_i32[i20] != ide) { + continue; + } + + device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11); + device T4 * hsrc1_f32x4 = (device T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11); + + for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) { + hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]); + } + + if (tpitg.x == 0) { + ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all; + } + + ++n_all; + } + } + + if (tpitg.x == 0) { + device int32_t * tpe_i32 = (device int32_t *) (htpe); + tpe_i32[ide] = n_all; + } +} + +typedef decltype(kernel_mul_mm_id_map0) kernel_mul_mm_id_map0_t; + +template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0; + +template +kernel void kernel_mul_mm_id_map1( + constant ggml_metal_kargs_mul_mm_id_map1 & args, + device const char * hdst, + device const char * hids, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i20 = tgpig[0]; // used expert + const int i21 = tgpig[1]; // token + + device const int32_t * ids_i32 = (device const int32_t *) (hids); + device float4 * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2); + + const int id = ids_i32[i21*args.ne20 + i20]; + + const int ide = id / args.neh1; + const int idt = id % args.neh1; + + device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2); + + for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) { + 
dst_f32x4[i0] = hdst_f32x4[i0]; + } +} + +typedef decltype(kernel_mul_mm_id_map1) kernel_mul_mm_id_map1_t; + +template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1; + +template +kernel void kernel_mul_mm_id( + constant ggml_metal_kargs_mul_mm_id & args, + device const char * src0, + device const char * src1, + device const char * tpe, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], ushort tiitg[[thread_index_in_threadgroup]], ushort sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup half * sa = (threadgroup half *)(shmem); - threadgroup float * sb = (threadgroup float *)(shmem + 4096); + threadgroup T * sa = (threadgroup T *)(shmem); + threadgroup half * sb = (threadgroup half *)(shmem + 4096); const int r0 = tgpig.y; const int r1 = tgpig.x; + const int im = tgpig.z; - if (r1*BLOCK_SIZE_N >= ne1) return; + device const int32_t * tpe_i32 = (device const int32_t *) (tpe); + + const int neh1 = tpe_i32[im]; + + if (r1*BLOCK_SIZE_N >= neh1) { + return; + } // if this block is of 64x32 shape or smaller - short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix - short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; - simdgroup_half8x8 ma[4]; - simdgroup_float8x8 mb[2]; + simdgroup_T8x8 ma[4]; + simdgroup_half8x8 mb[2]; simdgroup_float8x8 mc[8]; - for (int i = 0; i < 8; i++){ + + for (short i = 0; i < 8; i++){ mc[i] = make_filled_simdgroup_matrix(0.f); } + short il = (tiitg % THREAD_PER_ROW); - ushort offset1 = il/nl; + const int i12 = im%args.neh12; + const int i13 = im/args.neh12; - threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col]; + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1; - device const float * y = (device const float *)(src1 - + nb12 * id[1] - + nb11 * (id[0] % ne11) - + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + device const block_q * x = (device const block_q *)(src0 + + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + device const half * y = (device const half *)(src1 + + args.nbh13*i13 + + args.nbh12*i12 + + args.nbh11*(r1*BLOCK_SIZE_N + thread_col) + + args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory - half4x4 temp_a; + T4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + #pragma unroll(16) + for (short i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ + + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ + + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2+nl-1)/nl : x; + x = (il < 2) ? 
x + (2 + nl - 1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); - #pragma unroll(BLOCK_SIZE_K/8) - for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { #pragma unroll(4) - for (int i = 0; i < 4; i++) { + for (short i = 0; i < 4; i++) { simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) - for (int i = 0; i < 2; i++) { + for (short i = 0; i < 2; i++) { simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) - for (int i = 0; i < 8; i++){ + for (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } + + lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; + lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; } } - { + if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) { + device float * C = (device float *) dst + + (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ + (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); if (sgitg == 0) { for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j]; - int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1; - - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + joff; + device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0; device float4 * D4 = (device float4 *) D; threadgroup float * C = temp_str + (j*BLOCK_SIZE_M); @@ -6521,66 +6714,6 @@ void kernel_mul_mm_id_impl( } } -template -kernel void kernel_mul_mm_id( - constant ggml_metal_kargs_mul_mm_id & args, - device const char * src0s, - device const char * src1, - device char * dst, - device const char * ids, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - const int32_t i02 = tgpig.z; - - tgpig.z = 0; - - device const char * src0 = src0s + i02*args.nb02; - - // row indices - threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192); - - // TODO: parallelize this loop - int32_t _ne1 = 0; - for (ushort ii1 = 0; ii1 < args.nei1; ii1++) { - for (ushort ii0 = 0; ii0 < args.nei0; ii0++) { - int32_t id = ((device int32_t *) (ids + 
ii1*args.nbi1))[ii0]; - if (id == i02) { - if (tiitg == 0) { - rowids[_ne1] = ushort2(ii0, ii1); - } - _ne1++; - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - kernel_mul_mm_id_impl( - args.ne00, - args.ne02, - args.nb01, - args.nb02, - args.ne11, - args.ne12, - args.nb10, - args.nb11, - args.nb12, - args.ne0, - _ne1, - (int64_t)args.ne0*args.ne1, - src0, - src1, - rowids, - dst, - shmem, - tgpig, - tiitg, - sgitg); -} - #define QK_NL 16 // @@ -6621,63 +6754,64 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get // matrix-matrix multiplication // -typedef decltype(kernel_mul_mm) mat_mm_t; +typedef decltype(kernel_mul_mm) mul_mm_t; -template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm; #endif -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel 
mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication // -typedef decltype(kernel_mul_mm_id) mat_mm_id_t; +typedef decltype(kernel_mul_mm_id) mul_mm_id; -template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #endif -template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id 
kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; + // // matrix-vector multiplication diff --git a/ml/backend/ggml/ggml/src/ggml-opt.cpp b/ml/backend/ggml/ggml/src/ggml-opt.cpp index 7c3e24103..58d77578f 100644 --- a/ml/backend/ggml/ggml/src/ggml-opt.cpp +++ b/ml/backend/ggml/ggml/src/ggml-opt.cpp @@ -28,16 +28,19 @@ struct ggml_opt_dataset { }; struct ggml_opt_context { - ggml_backend_sched_t backend_sched = nullptr; - ggml_cgraph * allocated_graph = nullptr; - ggml_cgraph * allocated_graph_copy = nullptr; - struct ggml_context * ctx_static = nullptr; - struct ggml_context * ctx_static_cpu = nullptr; - struct ggml_context * ctx_compute = nullptr; - struct ggml_context * ctx_copy = nullptr; - ggml_backend_buffer_t buf_static = nullptr; - ggml_backend_buffer_t buf_static_cpu = nullptr; - std::mt19937 rng; + ggml_backend_sched_t backend_sched = nullptr; + ggml_cgraph * allocated_graph = nullptr; + ggml_cgraph * allocated_graph_copy = nullptr; + struct ggml_context * ctx_static = nullptr; + struct ggml_context * ctx_cpu = nullptr; + struct ggml_context * ctx_compute = nullptr; + struct ggml_context * ctx_copy = nullptr; + ggml_backend_buffer_t buf_static = nullptr; + ggml_backend_buffer_t buf_cpu = nullptr; + std::mt19937 rng; + enum ggml_opt_loss_type loss_type; + enum ggml_opt_build_type build_type; + enum ggml_opt_build_type build_type_alloc; struct ggml_tensor * inputs = nullptr; struct ggml_tensor * outputs = nullptr; @@ -50,6 +53,11 @@ struct ggml_opt_context { struct ggml_cgraph * gf = nullptr; struct ggml_cgraph * gb_grad = nullptr; struct ggml_cgraph * gb_opt = nullptr; + bool static_graphs = false; + bool eval_ready = false; + std::vector grad_accs; + std::vector grad_m; + std::vector grad_v; int64_t iter = 1; int32_t opt_period = 1; @@ -73,7 +81,13 @@ struct ggml_opt_result { // ====== Dataset ====== -ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) { +ggml_opt_dataset_t ggml_opt_dataset_init( + enum ggml_type type_data, + enum ggml_type type_label, + int64_t ne_datapoint, + int64_t ne_label, + int64_t ndata, + int64_t ndata_shard) { GGML_ASSERT(ne_datapoint > 0); GGML_ASSERT(ne_label >= 0); GGML_ASSERT(ndata > 0); @@ -92,11 +106,11 @@ 
ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, result->ctx = ggml_init(params); } - result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata); + result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata); result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata; if (ne_label > 0) { - result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata); + result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata); result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata; } else { result->labels = nullptr; @@ -119,6 +133,10 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) { delete dataset; } +int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) { + return dataset->ndata; +} + struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) { return dataset->data; } @@ -144,6 +162,8 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch)); GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch)); GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr)); + GGML_ASSERT( data_batch->type == dataset->data->type); + GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type); const size_t nb_data_batch = ggml_nbytes(data_batch); GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0); @@ -171,6 +191,31 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * } } +void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) { + GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr)); + GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0); + + const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data; + + GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size())); + + for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) { + const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch]; + + const char * ptr_data = (const char *) dataset->data->data + ishard *dataset->nbs_data; + char * ptr_data_batch = (char *) data_batch + ishard_batch*dataset->nbs_data; + memcpy(ptr_data_batch, ptr_data, dataset->nbs_data); + + if (!labels_batch) { + continue; + } + + const char * ptr_labels = (const char *) dataset->labels->data + ishard *dataset->nbs_labels; + char * ptr_labels_batch = (char *) labels_batch + ishard_batch*dataset->nbs_labels; + memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels); + } +} + // ====== Model / Context ====== struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) { @@ -187,17 +232,18 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us return result; } +struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) { + return *((struct ggml_opt_optimizer_params *) userdata); +} + struct ggml_opt_params ggml_opt_default_params( ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, enum ggml_opt_loss_type loss_type) { return { /*backend_sched =*/ backend_sched, - /*ctx_compute =*/ ctx_compute, - /*inputs =*/ inputs, - /*logits =*/ outputs, + /*ctx_compute =*/ nullptr, + /*inputs =*/ nullptr, + /*logits =*/ nullptr, /*loss_type =*/ loss_type, 
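
ggml_opt_dataset_get_batch_host above is the host-side counterpart of get_batch: it walks the shard permutation and memcpys each selected shard into the batch, so reshuffling the permutation reshuffles batches without moving the underlying dataset. A CPU sketch with the same structure:

#include <cstdint>
#include <cstring>
#include <vector>

// Copy whole shards into the batch in permuted order. nbs is the shard size
// in bytes; nb_batch must be a multiple of nbs, as asserted in the diff.
void get_batch_host(const char * data, std::size_t nbs, const std::vector<int64_t> & perm,
                    char * batch, std::size_t nb_batch, int64_t ibatch) {
    const int64_t shards_per_batch = nb_batch / nbs;
    for (int64_t i = 0; i < shards_per_batch; ++i) {
        const int64_t ishard = perm[ibatch * shards_per_batch + i];
        std::memcpy(batch + i * nbs, data + ishard * nbs, nbs);
    }
}
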
/*build_type =*/ GGML_OPT_BUILD_TYPE_OPT, /*opt_period =*/ 1, @@ -266,195 +312,246 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { return dst; } -static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) { - GGML_ASSERT(graph); - if (opt_ctx->allocated_graph == graph) { - return; - } +static void ggml_opt_build(ggml_opt_context_t opt_ctx) { + GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc"); + GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically"); - ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph + const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD && + !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1); - { - ggml_init_params params = { - /*.mem_size =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE, - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - ggml_free(opt_ctx->ctx_copy); - opt_ctx->ctx_copy = ggml_init(params); - } - - opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph); - - ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); - opt_ctx->allocated_graph = graph; -} - -ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { - ggml_opt_context_t result = new struct ggml_opt_context; - result->backend_sched = params.backend_sched; - result->ctx_compute = params.ctx_compute; - result->inputs = params.inputs; - result->outputs = params.outputs; - result->opt_period = params.opt_period; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; - - GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically"); - GGML_ASSERT(result->opt_period >= 1); - - const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD || - (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1); - - ggml_set_input(result->inputs); - ggml_set_output(result->outputs); - - result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass. - ggml_build_forward_expand(result->gf, result->outputs); + ggml_set_input(opt_ctx->inputs); + ggml_set_output(opt_ctx->outputs); int n_param = 0; - for (int i = 0; i < result->gf->n_nodes; ++i) { - if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { + for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) { + const struct ggml_tensor * node = opt_ctx->gf->nodes[i]; + if (node->flags & GGML_TENSOR_FLAG_PARAM) { n_param++; } + GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented"); } - { + if (!opt_ctx->ctx_static) { // The static context is used for: - // - gradients (1 tensor per param if using gradient accumulation) + // - gradients (1 per loss, 1 tensor per param if using gradient accumulation) // - optimizer momenta (2 tensors per param) - // - labels - // - loss + its gradient (up to 5 tensors) - // - pred - // - ncorrect (2 tensors). - const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 
2 : 0); - const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead(); + // - labels (if using static graphs) + // - loss (if using static graphs, up to 5 tensors) + // - pred (if using static graphs) + // - ncorrect (if using static graphs, 2 tensors). + constexpr size_t n_loss = 1; + const size_t tensors_per_param = (accumulate ? 1 : 0) + + (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0); + const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0; + const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead(); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - result->ctx_static = ggml_init(params); + opt_ctx->ctx_static = ggml_init(params); } + GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc); + { - // The static cpu context is used for: - // - optimizer parameters (1 for the entire context) + // The cpu context is allocated statically if using static graphs, dynamically otherwise. + // It is used for: + // - optimizer parameters (1 shared for all optimizer invocations) const size_t size_meta = 1 * ggml_tensor_overhead(); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - result->ctx_static_cpu = ggml_init(params); + ggml_free(opt_ctx->ctx_cpu); + opt_ctx->ctx_cpu = ggml_init(params); + + ggml_backend_buffer_free(opt_ctx->buf_cpu); + opt_ctx->buf_cpu = nullptr; } + struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute; - switch (params.loss_type) { + switch (opt_ctx->loss_type) { case GGML_OPT_LOSS_TYPE_MEAN: { - result->loss = ggml_sum(result->ctx_static, result->outputs); - ggml_set_name(result->loss, "loss_sum"); - const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); - result->loss = ggml_scale(result->ctx_static, result->loss, scale); - ggml_set_name(result->loss, "loss_mean"); - result->loss_per_datapoint = true; + opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs); + ggml_set_name(opt_ctx->loss, "loss_sum"); + const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs)); + opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale); + ggml_set_name(opt_ctx->loss, "loss_mean"); + opt_ctx->loss_per_datapoint = true; break; } case GGML_OPT_LOSS_TYPE_SUM: { - result->loss = ggml_sum(result->ctx_static, result->outputs); - ggml_set_name(result->loss, "loss_sum"); - result->loss_per_datapoint = false; + opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs); + ggml_set_name(opt_ctx->loss, "loss_sum"); + opt_ctx->loss_per_datapoint = false; break; } case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: { - result->labels = ggml_dup_tensor(result->ctx_static, result->outputs); - ggml_set_input(result->labels); - ggml_set_name(result->labels, "labels"); - result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels); - ggml_set_name(result->loss, "loss_cross_entropy"); - if (result->opt_period > 1) { - result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period); - ggml_set_name(result->loss, "loss_cross_entropy_scaled"); + opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs); + ggml_set_input(opt_ctx->labels); + ggml_set_name(opt_ctx->labels, "labels"); + opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels); + ggml_set_name(opt_ctx->loss, "loss_cross_entropy"); + if 
(opt_ctx->opt_period > 1) { + opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period); + ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled"); } - result->loss_per_datapoint = true; + opt_ctx->loss_per_datapoint = true; break; } case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: { - result->labels = ggml_dup_tensor(result->ctx_static, result->outputs); - ggml_set_input(result->labels); - ggml_set_name(result->labels, "labels"); - result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels); - ggml_set_name(result->loss, "loss_error"); - result->loss = ggml_sqr(result->ctx_static, result->loss); - ggml_set_name(result->loss, "loss_squared_error"); - result->loss = ggml_sum(result->ctx_static, result->loss); - ggml_set_name(result->loss, "loss_sum_squared_error"); - const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); - result->loss = ggml_scale(result->ctx_static, result->loss, scale); - ggml_set_name(result->loss, "loss_mean_squared_error"); - result->loss_per_datapoint = true; + opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs); + ggml_set_input(opt_ctx->labels); + ggml_set_name(opt_ctx->labels, "labels"); + opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels); + ggml_set_name(opt_ctx->loss, "loss_error"); + opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss); + ggml_set_name(opt_ctx->loss, "loss_squared_error"); + opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss); + ggml_set_name(opt_ctx->loss, "loss_sum_squared_error"); + const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs)); + opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale); + ggml_set_name(opt_ctx->loss, "loss_mean_squared_error"); + opt_ctx->loss_per_datapoint = true; break; } } - ggml_set_output(result->loss); - ggml_set_loss(result->loss); - ggml_build_forward_expand(result->gf, result->loss); + ggml_set_output(opt_ctx->loss); + ggml_set_loss(opt_ctx->loss); + ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss); - result->pred = ggml_argmax(result->ctx_static, result->outputs); - ggml_set_name(result->pred, "pred"); - ggml_set_output(result->pred); - ggml_build_forward_expand(result->gf, result->pred); + if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) { + opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs); + ggml_set_name(opt_ctx->pred, "pred"); + ggml_set_output(opt_ctx->pred); + ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred); - if (result->labels) { - result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels)); - ggml_set_name(result->ncorrect, "ncorrect"); - ggml_set_output(result->ncorrect); - ggml_build_forward_expand(result->gf, result->ncorrect); - } else { - result->ncorrect = nullptr; + opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels)); + ggml_set_name(opt_ctx->ncorrect, "ncorrect"); + ggml_set_output(opt_ctx->ncorrect); + ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect); } - if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) { - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - return result; + if (opt_ctx->buf_static) { + if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) { + return; + } + } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) { + opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors( + 
opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0)); + return; } - // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. - result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf); - ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate); + if (opt_ctx->grad_accs.empty()) { + GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD); - if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) { - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - ggml_graph_reset(result->gb_grad); - return result; - } + const int n_nodes = opt_ctx->gf->n_nodes; + opt_ctx->grad_accs.resize(n_nodes); + for (int i = 0; i < n_nodes; ++i) { + ggml_tensor * node = opt_ctx->gf->nodes[i]; + if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { + opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + } else { + opt_ctx->grad_accs[i] = nullptr; + } + } - GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT); - - // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. - result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad); - - result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7); - ggml_set_input(result->adamw_params); - ggml_set_name(result->adamw_params, "adamw_params"); - - for (int i = result->gf->n_nodes-1; i >= 0; --i) { - struct ggml_tensor * node = result->gb_opt->nodes[i]; - struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node); - - if (node->flags & GGML_TENSOR_FLAG_PARAM) { - struct ggml_tensor * m = ggml_dup_tensor(result->ctx_static, node); - struct ggml_tensor * v = ggml_dup_tensor(result->ctx_static, node); - struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params); - ggml_build_forward_expand(result->gb_opt, opt_step); + if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) { + opt_ctx->grad_m.resize(n_nodes); + opt_ctx->grad_v.resize(n_nodes); + for (int i = 0; i < n_nodes; ++i) { + ggml_tensor * node = opt_ctx->gf->nodes[i]; + if (node->flags & GGML_TENSOR_FLAG_PARAM) { + opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + } else { + opt_ctx->grad_m[i] = nullptr; + opt_ctx->grad_v[i] = nullptr; + } + } } } - result->buf_static = ggml_backend_alloc_ctx_tensors( - result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); + // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. 
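+    // In short (as built in this function): gf is forward-only; gb_grad duplicates
+    // gf and appends the backward pass, writing into the grad accumulators above
+    // when accumulation is enabled; gb_opt duplicates gb_grad and appends one
+    // ggml_opt_step_adamw node per parameter.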
+ opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true); + ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data()); - result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type()); + if (opt_ctx->buf_static) { + if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) { + return; + } + } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) { + opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0)); + ggml_graph_reset(opt_ctx->gb_grad); + } - ggml_graph_reset(result->gb_opt); + GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT); + + // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. + opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true); + + opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7); + ggml_set_input(opt_ctx->adamw_params); + ggml_set_name(opt_ctx->adamw_params, "adamw_params"); + + for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) { + struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i]; + struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node); + + if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) { + struct ggml_tensor * m = opt_ctx->grad_m[i]; + struct ggml_tensor * v = opt_ctx->grad_v[i]; + struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params); + + ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str()); + ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str()); + ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str()); + + ggml_build_forward_expand(opt_ctx->gb_opt, opt_step); + } + } + + if (!opt_ctx->buf_static) { + opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors( + opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0)); + ggml_graph_reset(opt_ctx->gb_opt); + } + + opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type()); +} + +ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { + ggml_opt_context_t result = new struct ggml_opt_context; + result->backend_sched = params.backend_sched; + result->ctx_compute = params.ctx_compute; + result->loss_type = params.loss_type; + result->build_type = params.build_type; + result->build_type_alloc = params.build_type; + result->inputs = params.inputs; + result->outputs = params.outputs; + result->opt_period = params.opt_period; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; + + GGML_ASSERT(result->opt_period >= 1); + + result->static_graphs = result->ctx_compute; + + if (!result->static_graphs) { + GGML_ASSERT(!result->inputs); + GGML_ASSERT(!result->outputs); + return result; + } + + GGML_ASSERT(result->inputs); + GGML_ASSERT(result->outputs); + + result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass. 
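+    // Usage note (assuming the API as declared here): with a compute context the
+    // graphs are static, built once via ggml_opt_build() and reused for every eval;
+    // without one, callers must supply a fresh graph via ggml_opt_prepare_alloc()
+    // before each ggml_opt_alloc()/ggml_opt_eval() pair.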
+ ggml_build_forward_expand(result->gf, result->outputs); + + ggml_opt_build(result); return result; } @@ -464,9 +561,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) { return; } ggml_backend_buffer_free(opt_ctx->buf_static); - ggml_backend_buffer_free(opt_ctx->buf_static_cpu); + ggml_backend_buffer_free(opt_ctx->buf_cpu); ggml_free(opt_ctx->ctx_static); - ggml_free(opt_ctx->ctx_static_cpu); + ggml_free(opt_ctx->ctx_cpu); delete opt_ctx; } @@ -582,8 +679,79 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl // ====== Computation ====== -static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) { - if (graph != opt_ctx->gf) { +void ggml_opt_prepare_alloc( + ggml_opt_context_t opt_ctx, + struct ggml_context * ctx_compute, + struct ggml_cgraph * gf, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs) { + GGML_ASSERT(!opt_ctx->static_graphs); + opt_ctx->ctx_compute = ctx_compute; + opt_ctx->gf = gf; + opt_ctx->inputs = inputs; + opt_ctx->outputs = outputs; +} + +void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { + GGML_ASSERT(!opt_ctx->eval_ready); + if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) { + ggml_graph_reset(opt_ctx->gb_grad); + } + if (backward) { + const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; + opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD; + } else { + opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD; + } + + if (!opt_ctx->static_graphs) { + ggml_opt_build(opt_ctx); + } + + struct ggml_cgraph * graph = nullptr; + switch (opt_ctx->build_type) { + case GGML_OPT_BUILD_TYPE_FORWARD: { + graph = opt_ctx->gf; + } break; + case GGML_OPT_BUILD_TYPE_GRAD: { + graph = opt_ctx->gb_grad; + } break; + case GGML_OPT_BUILD_TYPE_OPT: { + graph = opt_ctx->gb_opt; + } break; + } + GGML_ASSERT(graph); + + if (opt_ctx->allocated_graph == graph) { + opt_ctx->eval_ready = true; + return; + } + + ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph + + if (opt_ctx->static_graphs) { + ggml_init_params params = { + /*.mem_size =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_free(opt_ctx->ctx_copy); + opt_ctx->ctx_copy = ggml_init(params); + + opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph); + } else { + opt_ctx->allocated_graph_copy = graph; + } + + ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); + opt_ctx->allocated_graph = graph; + + opt_ctx->eval_ready = true; +} + +void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { + GGML_ASSERT(opt_ctx->eval_ready); + if (opt_ctx->allocated_graph == opt_ctx->gb_opt) { struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud); GGML_ASSERT(opt_pars.adamw.alpha > 0.0f); @@ -609,9 +777,19 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, adamw_par_data[6] = beta2h; } - ggml_opt_alloc_graph(opt_ctx, graph); ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt; + opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; + + if (!opt_ctx->static_graphs) { + opt_ctx->gf = nullptr; + opt_ctx->gb_grad = nullptr; + opt_ctx->gb_opt = nullptr; + 
opt_ctx->allocated_graph = nullptr; + opt_ctx->allocated_graph_copy = nullptr; + } + + opt_ctx->eval_ready = false; if (!result) { return; @@ -635,12 +813,14 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss)); result->loss.push_back(loss); - GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32); - std::vector<int32_t> pred(ndata); - ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred)); - result->pred.insert(result->pred.end(), pred.begin(), pred.end()); + if (opt_ctx->pred) { + GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32); + std::vector<int32_t> pred(ndata); + ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred)); + result->pred.insert(result->pred.end(), pred.begin(), pred.end()); + } - if (!opt_ctx->labels || result->ncorrect < 0) { + if (!opt_ctx->ncorrect || result->ncorrect < 0) { result->ncorrect = -1; return; } @@ -652,26 +832,6 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, result->ncorrect += ncorrect; } -void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result); -} - -void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) { - if (opt_ctx->opt_period == 1) { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result); - return; - } - - const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; - if (opt_i_next == 0) { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result); - ggml_opt_reset(opt_ctx, /*optimizer =*/ false); - } else { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result); - } - opt_ctx->opt_i = opt_i_next; -} - // ====== High-Level Functions ====== void ggml_opt_epoch( @@ -700,16 +860,18 @@ void ggml_opt_epoch( int64_t ibatch = 0; int64_t t_loop_start = ggml_time_us(); for (; ibatch < ibatch_split; ++ibatch) { + ggml_opt_alloc(opt_ctx, /*backward =*/ true); ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); - ggml_opt_forward_backward(opt_ctx, result_train); + ggml_opt_eval(opt_ctx, result_train); if (callback_train) { callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start); } } t_loop_start = ggml_time_us(); for (; ibatch < nbatches; ++ibatch) { + ggml_opt_alloc(opt_ctx, /*backward =*/ false); ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); - ggml_opt_forward(opt_ctx, result_eval); + ggml_opt_eval(opt_ctx, result_eval); if (callback_eval) { callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start); } @@ -726,13 +888,26 @@ void ggml_opt_epoch_callback_progress_bar( int64_t t_start_us) { fprintf(stderr, "%s[", train ? "train: " : "val: "); - constexpr int64_t bar_length = 25; + // The progress bar consists of partially filled blocks; Unicode has 8 separate fill levels.
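+    // For example, with bar_length = 8, ibatch_max = 64 and ibatch = 20: cell j
+    // shows the largest fill k/8 satisfying ibatch_max*(8*j + k)/bar_length < 8*ibatch,
+    // so cells 0 and 1 are full and cell 2 shows the 3/8 block (8*(16 + 3) = 152 < 160).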
+ constexpr int64_t bar_length = 8; + const int64_t ibatch8 = 8 * ibatch; for (int64_t j = 0; j < bar_length; ++j) { - const int64_t ibatch_j = ibatch_max * j/bar_length; - if (ibatch_j < ibatch) { - fprintf(stderr, "="); - } else if (ibatch_max * (j - 1)/bar_length < ibatch) { - fprintf(stderr, ">"); + if (ibatch_max * (8*j + 8) / bar_length < ibatch8) { + fprintf(stderr, "\u2588"); // full block + } else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) { + fprintf(stderr, "\u2589"); // 7/8 filled + } else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) { + fprintf(stderr, "\u258A"); // 6/8 filled + } else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) { + fprintf(stderr, "\u258B"); // 5/8 filled + } else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) { + fprintf(stderr, "\u258C"); // 4/8 filled + } else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) { + fprintf(stderr, "\u258D"); // 3/8 filled + } else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) { + fprintf(stderr, "\u258E"); // 2/8 filled + } else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) { + fprintf(stderr, "\u258F"); // 1/8 filled } else { fprintf(stderr, " "); } @@ -764,8 +939,8 @@ void ggml_opt_epoch_callback_progress_bar( const int64_t t_eta_m = t_eta_s / 60; t_eta_s -= t_eta_m * 60; - fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, " - "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r", + fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% " + "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r", idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc, t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s); if (ibatch == ibatch_max) { @@ -806,7 +981,10 @@ void ggml_opt_fit( int64_t epoch = 1; - ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type); + ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type); + params.ctx_compute = ctx_compute; + params.inputs = inputs; + params.outputs = outputs; params.opt_period = opt_period; params.get_opt_pars = get_opt_pars; params.get_opt_pars_ud = &epoch; diff --git a/ml/backend/ggml/ggml/src/ggml-quants.c b/ml/backend/ggml/ggml/src/ggml-quants.c index ac918a60d..84ec6dfe3 100644 --- a/ml/backend/ggml/ggml/src/ggml-quants.c +++ b/ml/backend/ggml/ggml/src/ggml-quants.c @@ -19,12 +19,6 @@ #define GROUP_MAX_EPS_IQ1_M 1e-7f #define GROUP_MAX_EPS_IQ1_S 1e-12f -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid warnings for hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) -#endif - #define UNUSED GGML_UNUSED // reference implementation for deterministic creation of model files diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 2276b6312..8a6546240 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -4,6 +4,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-threading.h" +#include "ggml-cpu.h" #include "ggml.h" // FIXME: required here for quantization functions @@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } -// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library -// currently, the ggml_cpu_has_* functions are entirely compile-time void 
ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { - int64_t i = 0; -#if defined(__F16C__) - //if (ggml_cpu_has_f16c()) { - for (; i + 7 < n; i += 8) { - __m256 x_vec = _mm256_loadu_ps(x + i); - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storeu_si128((__m128i *)(y + i), y_vec); - } - for(; i + 3 < n; i += 4) { - __m128 x_vec = _mm_loadu_ps(x + i); - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storel_epi64((__m128i *)(y + i), y_vec); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } } void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { - int64_t i = 0; -#if defined(__AVX512F__) - //if (ggml_cpu_has_avx512()) { - for (; i + 16 <= n; i += 16) { - _mm512_storeu_ps(y + i, - _mm512_castsi512_ps( - _mm512_slli_epi32( - _mm512_cvtepu16_epi32( - _mm256_loadu_si256( - (const __m256i *)(x + i))), - 16))); - } - //} -#endif -#if defined(__AVX2__) - //if (ggml_cpu_has_avx2()) { - for (; i + 8 <= n; i += 8) { - _mm256_storeu_ps(y + i, - _mm256_castsi256_ps( - _mm256_slli_epi32( - _mm256_cvtepu16_epi32( - _mm_loadu_si128( - (const __m128i *)(x + i))), - 16))); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_BF16_TO_FP32(x[i]); } } @@ -956,6 +915,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -963,7 +923,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", - "UNPAD", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -1051,6 +1010,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", @@ -1058,7 +1018,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", - "unpad(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1340,12 +1299,23 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 2); } +bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) { + return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); +} + bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; } +bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) { + return + tensor->nb[0] > tensor->nb[2] && + tensor->nb[1] > tensor->nb[0] && + tensor->nb[2] == ggml_type_size(tensor->type); +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2762,11 +2732,11 @@ void ggml_mul_mat_set_prec( c = ggml_mul_mat_id(ctx, as, b, ids); as -> [cols, rows, n_expert] - ids -> [n_experts_used, n_tokens] (i32) b -> [cols, n_expert_used, n_tokens] + ids -> [n_expert_used, n_tokens] (i32) c -> [rows, n_expert_used, n_tokens] - in b, n_experts_used can be broadcasted to match the n_expert_used of ids + in b, n_expert_used can be broadcasted to match the n_expert_used of ids c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids */ @@ -4052,6 +4022,46 @@ 
struct ggml_tensor * ggml_conv_2d_dw( return result; } +// ggml_conv_2d_dw_direct + +struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + GGML_ASSERT(a->ne[2] == 1); + GGML_ASSERT(a->ne[3] == b->ne[2]); + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1); + ne[2] = b->ne[2]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + if (ggml_is_contiguous_channels(b)) { + // Result will be permuted the same way as input (CWHN order) + const int64_t type_size = ggml_type_size(result->type); + GGML_ASSERT(ggml_blck_size(result->type) == 1); + result->nb[0] = result->ne[2] * type_size; + result->nb[1] = result->ne[0] * result->nb[0]; + result->nb[2] = type_size; + } + + int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_2D_DW; + result->src[0] = a; + result->src[1] = b; + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { @@ -4264,25 +4274,6 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } -// ggml_unpad - -struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, int p1, int p2, int p3) { - - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, - a->ne[0] - p0, - a->ne[1] - p1, - a->ne[2] - p2, - a->ne[3] - p3); - - result->op = GGML_OP_UNPAD; - result->src[0] = a; - - return result; -} - // ggml_arange struct ggml_tensor * ggml_arange( @@ -5508,7 +5499,7 @@ static void ggml_compute_backward( // tensor = src0 * 1 + src1 * 0 if (src0_needs_grads) { // dsrc0 = dtensor * 1 - ggml_add_or_set(ctx, cgraph, isrc0, grad); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad, src0)); } if (src1_needs_grads) { // dsrc1 = dtensor * 0 -> noop @@ -5789,10 +5780,9 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * } void ggml_build_backward_expand( - struct ggml_context * ctx_static, - struct ggml_context * ctx_compute, - struct ggml_cgraph * cgraph, - bool accumulate) { + struct ggml_context * ctx, + struct ggml_cgraph * cgraph, + struct ggml_tensor ** grad_accs) { GGML_ASSERT(cgraph->n_nodes > 0); GGML_ASSERT(cgraph->grads); GGML_ASSERT(cgraph->grad_accs); @@ -5865,21 +5855,24 @@ void ggml_build_backward_expand( GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); - const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); - GGML_ASSERT(igrad != GGML_HASHSET_FULL); - GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad)); - if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { - cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node); - cgraph->grads[igrad] = cgraph->grad_accs[igrad]; - ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name); + const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(ihash != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash)); + if (grad_accs && grad_accs[i]) { + 
cgraph->grad_accs[ihash] = grad_accs[i]; + cgraph->grads[ihash] = cgraph->grad_accs[ihash]; + } else if (node->flags & GGML_TENSOR_FLAG_LOSS) { + // loss tensors always need a gradient accumulator + cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + cgraph->grads[ihash] = cgraph->grad_accs[ihash]; } - grads_needed[igrad] = true; + grads_needed[ihash] = true; } for (int i = n_nodes_f - 1; i >= 0; --i) { // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation // use allocator to automatically make inplace operations - ggml_compute_backward(ctx_compute, cgraph, i, grads_needed); + ggml_compute_backward(ctx, cgraph, i, grads_needed); } free(grads_needed); @@ -6025,8 +6018,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } -struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL); +struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) { + struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads || force_grads); ggml_graph_cpy(cgraph, result); return result; } @@ -6045,6 +6038,9 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { } void ggml_graph_reset(struct ggml_cgraph * cgraph) { + if (!cgraph) { + return; + } GGML_ASSERT(cgraph->grads != NULL); for (int i = 0; i < cgraph->n_nodes; i++) { @@ -6354,8 +6350,8 @@ void ggml_set_output(struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_OUTPUT; } -void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { - GGML_UNUSED(ctx); // TODO: remove this parameter +void ggml_set_param(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); tensor->flags |= GGML_TENSOR_FLAG_PARAM; } diff --git a/ml/backend/ggml/ggml/src/ggml.go b/ml/backend/ggml/ggml/src/ggml.go index 678f55ca6..91f1f1ada 100644 --- a/ml/backend/ggml/ggml/src/ggml.go +++ b/ml/backend/ggml/ggml/src/ggml.go @@ -3,6 +3,7 @@ package ggml // #cgo CXXFLAGS: -std=c++17 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU // #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu +// #cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration // #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++ // #include <stdlib.h> // #include "ggml-backend.h" import "C" diff --git a/ml/backend/ggml/quantization.go b/ml/backend/ggml/quantization.go new file mode 100644 index 000000000..bb31e455d --- /dev/null +++ b/ml/backend/ggml/quantization.go @@ -0,0 +1,83 @@ +package ggml + +// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src +// #include <stdlib.h> +// #include <stdint.h> +// #include "ggml.h" +// #include "ggml-cpu.h" +// #include "ggml-backend.h" +// #include "ggml-quants.h" +import "C" + +import ( + "unsafe" + + fsggml "github.com/ollama/ollama/fs/ggml" +) + +// ConvertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it +func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 { + f32s := make([]float32, nelements) + elems := C.int64_t(nelements) + switch dtype { + case C.GGML_TYPE_F16: + C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_0: + C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_1: + C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), 
(*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_0: + C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_1: + C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q8_0: + C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q2_K: + C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q3_K: + C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_K: + C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_K: + C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q6_K: + C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_BF16: + C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + default: + panic("unsupported quantization format") + } + return f32s +} + +func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte { + buf := make([]byte, len(f32s)*4) // upper bound on size + nPerRow := C.int64_t(shape[0]) + nrows := C.int64_t(1) + if len(shape) > 1 { + nrows = C.int64_t(shape[1]) + } + shape2 := C.int64_t(1) + if len(shape) > 2 { + shape2 = C.int64_t(shape[2]) + } + nelements_matrix := nPerRow * nrows + newSize := C.size_t(0) + for i03 := C.int64_t(0); i03 < shape2; i03++ { + f32s_03 := i03 * nelements_matrix + buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows + newSize += C.ggml_quantize_chunk( + uint32(newType), + (*C.float)(&f32s[f32s_03]), + unsafe.Pointer((uintptr)(unsafe.Pointer(&buf[0]))+uintptr(buf_03)), + 0, + nrows, + nPerRow, + nil) + } + return buf[:newSize] +} + +func QuantizationVersion() uint32 { + return uint32(C.GGML_QNT_VERSION) +} diff --git a/model/process_text.go b/model/bytepairencoding.go similarity index 68% rename from model/process_text.go rename to model/bytepairencoding.go index 90b220a2e..6bb9a003e 100644 --- a/model/process_text.go +++ b/model/bytepairencoding.go @@ -2,117 +2,16 @@ package model import ( "cmp" + "context" "iter" "log/slog" - "slices" "strings" - "sync" "github.com/dlclark/regexp2" heap "github.com/emirpasic/gods/v2/trees/binaryheap" + "github.com/ollama/ollama/logutil" ) -type Special int32 - -const ( - SpecialBOS Special = iota - SpecialEOS -) - -const ( - TOKEN_TYPE_NORMAL = iota + 1 - TOKEN_TYPE_UNKNOWN - TOKEN_TYPE_CONTROL - TOKEN_TYPE_USER_DEFINED - TOKEN_TYPE_UNUSED - TOKEN_TYPE_BYTE -) - -type TextProcessor interface { - Encode(s string, addSpecial bool) ([]int32, error) - Decode([]int32) (string, error) - Is(int32, Special) bool - Vocabulary() *Vocabulary -} - -type Vocabulary struct { - Values []string - Types []int32 - Scores []float32 - Merges []string - - BOS, EOS, EOT int32 - AddBOS, AddEOS, AddEOT bool - - specialOnce sync.Once - special []string - - valuesOnce sync.Once - values map[string]int32 - - mergeOnce sync.Once - merge map[string]int32 -} - -func (v *Vocabulary) Is(id int32, special Special) bool { - switch special { - case SpecialBOS: - return id == v.BOS - case SpecialEOS: - return id == v.EOS || id == v.EOT - default: - return false - } -} - -func (v *Vocabulary) Encode(s string) int32 { - v.valuesOnce.Do(func() { - v.values = 
make(map[string]int32, len(v.Values)) - for i, value := range v.Values { - v.values[value] = int32(i) - } - }) - - if id, ok := v.values[s]; ok { - return id - } - - return -1 -} - -func (v *Vocabulary) Decode(id int32) string { - return v.Values[id] -} - -func (v *Vocabulary) SpecialVocabulary() []string { - v.specialOnce.Do(func() { - for i := range v.Values { - if slices.Contains([]int{105, 106}, i) { - v.special = append(v.special, v.Values[i]) - } else if v.Types[i] == TOKEN_TYPE_CONTROL { - v.special = append(v.special, v.Values[i]) - } - } - }) - - return v.special -} - -func (v *Vocabulary) Merge(left, right string) int { - v.mergeOnce.Do(func() { - v.merge = make(map[string]int32, len(v.Merges)) - for i, merge := range v.Merges { - v.merge[merge] = int32(i) - } - }) - - if id, ok := v.merge[left+" "+right]; ok { - return int(id) - } - - return -1 -} - type BytePairEncoding struct { pre *regexp2.Regexp vocab *Vocabulary @@ -302,24 +201,10 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids) + if addSpecial && len(ids) > 0 { - if bpe.vocab.AddBOS { - if ids[0] == bpe.vocab.BOS { - slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS) - } - - slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS) - ids = append([]int32{bpe.vocab.BOS}, ids...) - } - - if bpe.vocab.AddEOS { - if ids[len(ids)-1] == bpe.vocab.EOS { - slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS) - } - - slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS) - ids = append(ids, bpe.vocab.EOS) - } + ids = bpe.vocab.addSpecials(ids) } return ids, nil @@ -349,5 +234,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String()) return sb.String(), nil } diff --git a/model/process_text_test.go b/model/bytepairencoding_test.go similarity index 100% rename from model/process_text_test.go rename to model/bytepairencoding_test.go diff --git a/model/input/input.go b/model/input/input.go index d66f52a0d..bd9b53ec6 100644 --- a/model/input/input.go +++ b/model/input/input.go @@ -2,16 +2,30 @@ package input import "github.com/ollama/ollama/ml" +// Multimodal is a multimodal embedding or a component of one. +// For example, it could be a row of an image that can be processed +// independently. +type Multimodal struct { + // Tensor is the embedding data. Implementations may choose what to + // store here or it may be nil if not needed. However, any ml.Tensor + // objects must be stored here and not in Data. + Tensor ml.Tensor + + // Data is implementation-specific opaque data, such as metadata on how + // to layout Tensor. It may be nil if not needed. It may also store larger + // objects such as complete images if they are to be processed later. + Data any +} + // Input represents one token in the input stream type Input struct { // Token is a single element of text. Token int32 - // Multimodal is opaque data representing a non-text - // element such as an image (or part of one if the image - // can be processed in pieces). It may be either together - // with Token or on its own. - Multimodal any + // Multimodal represents a non-text element such as an + // image (or part of one if the image can be processed in pieces). + // It may be used either together with Token or on its own. 
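+	// As a sketch (adapted from the mistral3 changes below; the names are
+	// illustrative), an encoder can return one entry per image row so each
+	// row can be scheduled independently:
+	//
+	//	rows := make([]Multimodal, size.Y)
+	//	for i := range rows {
+	//		rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i,
+	//			features.Dim(0), features.Stride(1), size.X)
+	//	}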
+ Multimodal []Multimodal // MultimodalHash is a unique representation of the data // stored in Multimodal, used for caching and comparing @@ -32,7 +46,7 @@ type Input struct { // Positions slice. type MultimodalIndex struct { Index int - Multimodal any + Multimodal []Multimodal } // Batch contains the inputs for a model forward pass diff --git a/model/model.go b/model/model.go index ab96c4c70..98381c904 100644 --- a/model/model.go +++ b/model/model.go @@ -19,6 +19,7 @@ import ( "github.com/ollama/ollama/fs" fsggml "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" _ "github.com/ollama/ollama/ml/backend" "github.com/ollama/ollama/model/input" @@ -39,12 +40,13 @@ type MultimodalProcessor interface { // EncodeMultimodal processes a single input (such as an image) and // generates an output (typically an embedding) that can be used by the model. // - // The return value is most typically an ml.Tensor, however, different - // type are possible, such as an object containing a tensor plus - // additional metadata, a slice of tensors or even just the original input. + // The return value is one or more tensors, each with optional model-specific + // opaque metadata. Typically, the tensors might be views into an embedding + // with each view representing a chunk of data that can be processed independently + // in different batches. // // The result may be cached by the runner. - EncodeMultimodal(ml.Context, []byte) (any, error) + EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error) // PostTokenize is called after tokenization to allow the model to edit the // input stream to correctly arrange multimodal elements. @@ -202,7 +204,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value { names := fn(tagsCopy) for _, name := range names { if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil { - slog.Debug("found tensor", "", tensor) + slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor) vv.Set(reflect.ValueOf(tensor)) break } diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index d418f6827..a87534c54 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -43,8 +43,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index bf396b6a0..89d1788ef 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(1), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - 
EOT: int32(106), - AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false), + EOS: append( + []int32{ + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id", 106)), + }, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), ImageProcessor: newImageProcessor(c), @@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) { return &m, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -108,22 +112,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps) - return visionOutputs, nil + return []input.Multimodal{{Tensor: visionOutputs}}, nil } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input for _, inp := range inputs { - if inp.Multimodal == nil { + if len(inp.Multimodal) == 0 { result = append(result, inp) } else { - inputMultimodal := inp.Multimodal.(ml.Tensor) + inputMultimodal := inp.Multimodal[0].Tensor result = append(result, - input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n" - input.Input{Token: 255999}, // "<start_of_image>" - input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder + input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n" + input.Input{Token: 255999}, // "<start_of_image>" + input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder ) // add image token placeholders diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index c1e843d8f..a40614af2 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -7,7 +7,6 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" - "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" ) @@ -20,9 +19,6 @@ type TextConfig struct { } type TextModel struct { - model.Base - model.SentencePieceModel - TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []TextLayer `gguf:"blk"` OutputNorm *nn.RMSNorm `gguf:"output_norm"` @@ -45,15 +41,6 @@ func newTextModel(c fs.Config) *TextModel { numBlocks := int(c.Uint("block_count")) m := TextModel{ - SentencePieceModel: model.NewSentencePieceModel( - &model.Vocabulary{ - Values: c.Strings("tokenizer.ggml.tokens"), - Scores: c.Floats("tokenizer.ggml.scores"), - Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), - }, - ), Layers: make([]TextLayer, numBlocks), TextConfig: &TextConfig{ hiddenSize: int(c.Uint("embedding_length")), @@ -178,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor // set image embeddings var except []int for _, image := range batch.Multimodal { - visionOutputs := image.Multimodal.(ml.Tensor) + visionOutputs := image.Multimodal[0].Tensor ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1)))) for i := range visionOutputs.Dim(1) { diff --git
a/model/models/llama/model.go b/model/models/llama/model.go index 3e5a54278..6e214f0fb 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -43,10 +43,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index 632d313ec..af5173a16 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -4,7 +4,6 @@ import ( "bytes" "image" "slices" - "sync" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -41,10 +40,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), ImageProcessor: newImageProcessor(c), @@ -60,7 +62,7 @@ func New(c fs.Config) (model.Model, error) { return &m, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) < 1 { return nil, model.ErrNoVisionModel } @@ -100,70 +102,79 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3)) projectedOutputs := m.Projector.Forward(ctx, visionOutputs) - return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil + + var multimodal []input.Multimodal + aspectRatio := image.Point{ratioW, ratioH} + + var offset int + patchesPerChunk := projectedOutputs.Dim(1) + if aspectRatio.Y*aspectRatio.X > 1 { + patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1) + + for range aspectRatio.Y { + for x := range aspectRatio.X { + view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset, + projectedOutputs.Dim(0), projectedOutputs.Stride(1), + patchesPerChunk) + var separator separator + if x < aspectRatio.X-1 { + separator.x = true // <|tile_x_separator|> + } else { + separator.y = true // <|tile_y_separator|> + } + multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator}) + offset += patchesPerChunk + } + } + } + + view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset, + projectedOutputs.Dim(0), projectedOutputs.Stride(1), + patchesPerChunk) + multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}}) + + return 
multimodal, nil } -type chunks struct { - *Model - ml.Tensor - aspectRatio image.Point - - dataOnce sync.Once - data []float32 -} - -type chunk struct { - *chunks - s, n int -} - -func (r *chunk) floats() []float32 { - r.dataOnce.Do(func() { - temp := r.Backend().NewContext() - defer temp.Close() - temp.Forward(r.Tensor).Compute(r.Tensor) - r.data = r.Floats() - }) - - return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)] +type separator struct { + x bool + y bool } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input for _, inp := range inputs { - if inp.Multimodal == nil { + if len(inp.Multimodal) == 0 { result = append(result, inp) continue } - t := inp.Multimodal.(*chunks) var imageInputs []input.Input imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|> - var offset int - patchesPerChunk := t.Dim(1) - if t.aspectRatio.Y*t.aspectRatio.X > 1 { - patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1) + for i, mm := range inp.Multimodal { + patchesPerChunk := mm.Tensor.Dim(1) - for range t.aspectRatio.Y { - for x := range t.aspectRatio.X { - imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> - imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) - if x < t.aspectRatio.X-1 { - imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|> - } - offset += patchesPerChunk + if i < len(inp.Multimodal)-1 { + separator := mm.Data.(*separator) + + imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> + imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) + + if separator.x { + imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|> } - - imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|> + if separator.y { + imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|> + } + } else { + imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|> + imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> + imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) + imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|> } } - imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|> - imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> - imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) - imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|> - result = append(result, imageInputs...) 
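+		// Net effect, using the tokens named above: <|image_start|>, then for each
+		// non-final tile its <|patch|> run followed by <|tile_x_separator|> or
+		// <|tile_y_separator|>, then <|image|>, the final <|patch|> run, and the
+		// image-end marker.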
} diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index 3f9f578f1..d98587bd0 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -210,12 +210,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx) for _, mi := range batch.Multimodal { - f32s := mi.Multimodal.(*chunk).floats() - img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize) - if err != nil { - panic(err) - } - + img := mi.Multimodal[0].Tensor ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) } diff --git a/model/models/llama4/model_vision.go b/model/models/llama4/model_vision.go index 3bf9cee75..e6b1afef6 100644 --- a/model/models/llama4/model_vision.go +++ b/model/models/llama4/model_vision.go @@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { } hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps) - hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0) + hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0) hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions) return hiddenStates } diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index f749fdcd2..0d384b94c 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -4,7 +4,6 @@ import ( "bytes" "image" "slices" - "sync" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -16,6 +15,8 @@ import ( type Model struct { model.Base + model.BytePairEncoding + *TextModel *VisionModel `gguf:"v,vision"` *MultiModalProjector `gguf:"mm"` @@ -36,6 +37,21 @@ func New(c fs.Config) (model.Model, error) { } m := &Model{ + BytePairEncoding: model.NewBytePairEncoding( + c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), + }, + ), TextModel: textModel, VisionModel: newVisionModel(c), ImageProcessor: newImageProcessor(c), @@ -88,7 +104,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector { } } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -112,37 +128,14 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size) // split into patches to be sent to the text transformer - parent := imageFeatures{tensor: features} - rows := make([]*imageRow, size.Y) + rows := make([]input.Multimodal, size.Y) for i := range rows { - rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}} + rows[i].Tensor = features.View(ctx, 
features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X) } return rows, nil } -type imageFeatures struct { - tensor ml.Tensor - - dataOnce sync.Once - data []float32 -} - -type imageRow struct { - parent *imageFeatures - s int - shape []int -} - -func (r *imageRow) data() []float32 { - n := 1 - for _, s := range r.shape { - n *= s - } - - return r.parent.data[r.s*n : (r.s+1)*n] -} - // PostTokenize arranges Mistral 3's inputs for the forward pass // In Mistral 3 and Pixtral, the input patches are arranged as follows: // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END] @@ -151,15 +144,14 @@ func (r *imageRow) data() []float32 { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input for _, inp := range inputs { - if inp.Multimodal == nil { + if len(inp.Multimodal) == 0 { result = append(result, inp) } else { - inputMultimodal := inp.Multimodal.([]*imageRow) - for i, row := range inputMultimodal { + for i, row := range inp.Multimodal { // [IMG] - result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]}) - result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...) - if i == len(inputMultimodal)-1 { + result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)}) + result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...) + if i == len(inp.Multimodal)-1 { // [IMG_END] result = append(result, input.Input{Token: 13}) } else { diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go index 1bf72acd8..17939800f 100644 --- a/model/models/mistral3/model_text.go +++ b/model/models/mistral3/model_text.go @@ -9,7 +9,6 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" - "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" ) @@ -20,9 +19,6 @@ type TextOptions struct { } type TextModel struct { - model.Base - model.BytePairEncoding - TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` OutputNorm *nn.RMSNorm `gguf:"output_norm"` @@ -110,20 +106,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor // image embeddings for _, image := range batch.Multimodal { - row := image.Multimodal.(*imageRow) - row.parent.dataOnce.Do(func() { - // use a new, throwaway context so the image tensor is not added to the graph - temp := m.Backend().NewContext() - temp.Forward(row.parent.tensor).Compute(row.parent.tensor) - row.parent.data = row.parent.tensor.Floats() - temp.Close() - }) - - imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...) 
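Aside: the View call above is what allows the imageFeatures/imageRow float round-trip deleted here to go away entirely: each image row becomes a zero-copy view into the projector output rather than a recomputed []float32. A rough model of the offset arithmetic follows, counting float32 elements where the real Stride(1) is in backend units; rowViews is a hypothetical helper, not ollama API.

package main

import "fmt"

// rowViews mirrors features.View(ctx, features.Stride(1)*size.X*i, ...):
// with dim values per patch and width patches per image row, the i-th row
// starts at offset dim*width*i and spans width patches.
func rowViews(features []float32, dim, width, height int) [][]float32 {
	views := make([][]float32, height)
	for i := range views {
		off := dim * width * i
		views[i] = features[off : off+dim*width] // a view, not a copy
	}
	return views
}

func main() {
	dim, width, height := 4, 3, 2
	features := make([]float32, dim*width*height)
	for i := range features {
		features[i] = float32(i)
	}
	for i, row := range rowViews(features, dim, width, height) {
		fmt.Printf("row %d: starts at %v, %d values\n", i, row[0], len(row))
	}
}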
- if err != nil { - panic(err) - } - + imageFeature := image.Multimodal[0].Tensor ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1)))) } @@ -148,18 +131,6 @@ func NewTextModel(c fs.Config) (*TextModel, error) { } textModel := &TextModel{ - BytePairEncoding: model.NewBytePairEncoding( - c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), - &model.Vocabulary{ - Values: c.Strings("tokenizer.ggml.tokens"), - Types: c.Ints("tokenizer.ggml.token_type"), - Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)), - AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)), - AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - }, - ), Layers: make([]Layer, c.Uint("block_count")), TextOptions: &TextOptions{ hiddenSize: int(c.Uint("embedding_length")), diff --git a/model/models/mllama/imageproc.go b/model/models/mllama/imageproc.go deleted file mode 100644 index 13f2fb8b3..000000000 --- a/model/models/mllama/imageproc.go +++ /dev/null @@ -1,201 +0,0 @@ -package mllama - -import ( - "fmt" - "image" - _ "image/jpeg" - _ "image/png" - "io" - "math" - "slices" - - "golang.org/x/image/draw" - - "github.com/ollama/ollama/model/imageproc" -) - -func getSupportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } - } - } - - return ratios -} - -func clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } - - return a -} - -func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) - } - - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } - } - - var minUpscale float64 - var maxDownscale float64 - var upscale bool - - for _, s := range scales { - if s > 1.0 { - upscale = true - if minUpscale == 0 { - minUpscale = s - } else { - minUpscale = math.Min(minUpscale, s) - } - } else { - maxDownscale = math.Max(maxDownscale, s) - } - } - - selectedScale := maxDownscale - if upscale { - selectedScale = minUpscale - } - - var selectedCanvas image.Point - for n, pcs := range possibleCanvasSizes { - if scales[n] == selectedScale { - // choose the smallest possible canvas - if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { - selectedCanvas = pcs - } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { - selectedCanvas = pcs - } - } - } - return selectedCanvas -} - -func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth 
:= float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } - - return image.Point{w, h} -} - -func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { - if format == "png" { - img = imageproc.Composite(img) - } - - b := img.Bounds() - tileSize := outputSize.Y - - canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) - - return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio -} - -func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { - paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, - } - - dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) - draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) - - return dst -} - -func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { - b := img.Bounds() - width := b.Max.X - b.Min.X - height := b.Max.Y - b.Min.Y - tileHeight := height / numTilesSize.Y - tileWidth := width / numTilesSize.X - - images := []image.Image{} - - for h := range numTilesSize.Y { - for w := range numTilesSize.X { - rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { - SubImage(image.Rectangle) image.Image - }).SubImage(rect)) - } - } - - return images -} - -func packImages(img image.Image, aspectRatio image.Point) []float32 { - subImages := splitToTiles(img, aspectRatio) - - var pixelVals []float32 - - rescale := true - channelFirst := true - - for _, subImg := range subImages { - vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) - pixelVals = append(pixelVals, vals...) 
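Aside: the canvas-selection rule in the getOptimalTiledCanvas code being deleted here survives as ImageProcessor.optimalTiledCanvas in process_image.go further down. A compact restatement of that rule as a hypothetical free function, for reference: compute the limiting scale for every tile arrangement, prefer the smallest scale that still upscales if any arrangement requires upscaling, otherwise the largest downscale, and break ties by the smallest canvas area.

package main

import (
	"fmt"
	"image"
	"math"
)

func optimalCanvas(img image.Point, maxTiles, tileSize int) image.Point {
	var best image.Point
	bestScale := 0.0
	for w := 1; w <= maxTiles; w++ {
		for h := 1; h <= maxTiles/w; h++ { // all arrangements with w*h <= maxTiles
			canvas := image.Point{w * tileSize, h * tileSize}
			scale := math.Min(
				float64(canvas.X)/float64(img.X),
				float64(canvas.Y)/float64(img.Y),
			)
			better := false
			switch {
			case bestScale == 0: // first candidate
				better = true
			case bestScale > 1: // upscaling: prefer the smallest sufficient scale
				better = scale > 1 && scale < bestScale
			default: // downscaling: prefer the largest scale; any upscale wins
				better = scale > bestScale
			}
			if better || (scale == bestScale && canvas.X*canvas.Y < best.X*best.Y) {
				best, bestScale = canvas, scale
			}
		}
	}
	return best
}

func main() {
	fmt.Println(optimalCanvas(image.Point{1024, 768}, 4, 560))
}

optimalCanvas(image.Point{1024, 768}, 4, 560) returns {1120, 1120}, matching the TestOptimalTiledCanvas cases retained in process_image_test.go below.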
- } - - return pixelVals -} - -func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { - outputSize := image.Point{560, 560} - maxTiles := 4 - - img, format, err := image.Decode(imageData) - if err != nil { - return nil, nil, fmt.Errorf("failed to decode image: %w", err) - } - - newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) - newImage = padImage(newImage, outputSize, aspectRatio) - - data := packImages(newImage, aspectRatio) - aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 - - opts := map[string]any{ - "aspectRatioIndex": aspectRatioIndex, - } - - return data, opts, nil -} diff --git a/model/models/mllama/imageproc_test.go b/model/models/mllama/imageproc_test.go deleted file mode 100644 index a14b91bd1..000000000 --- a/model/models/mllama/imageproc_test.go +++ /dev/null @@ -1,420 +0,0 @@ -package mllama - -import ( - "bytes" - "image" - "image/png" - "testing" - - "github.com/google/go-cmp/cmp" -) - -func TestAspectRatios(t *testing.T) { - type aspectCase struct { - MaxTiles int - Expected []image.Point - } - - cases := []aspectCase{ - { - MaxTiles: 1, - Expected: []image.Point{{1, 1}}, - }, - { - MaxTiles: 2, - Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}}, - }, - { - MaxTiles: 3, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}}, - }, - { - MaxTiles: 4, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}}, - }, - } - - for _, c := range cases { - actual := getSupportedAspectRatios(c.MaxTiles) - - if diff := cmp.Diff(actual, c.Expected); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - } -} - -func TestGetImageSizeFitToCanvas(t *testing.T) { - type imageSizeCase struct { - ImageRect image.Point - CanvasRect image.Point - TileSize int - Expected image.Point - } - - cases := []imageSizeCase{ - { - ImageRect: image.Point{400, 400}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{400, 400}, - }, - { - ImageRect: image.Point{1024, 768}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{640, 480}, - }, - { - ImageRect: image.Point{500, 500}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 750, - Expected: image.Point{750, 750}, - }, - { - ImageRect: image.Point{500, 1000}, - CanvasRect: image.Point{2000, 2000}, - TileSize: 2000, - Expected: image.Point{1000, 2000}, - }, - { - ImageRect: image.Point{4000, 3000}, - CanvasRect: image.Point{2000, 1000}, - TileSize: 1000, - Expected: image.Point{1333, 1000}, - }, - { - ImageRect: image.Point{667, 1000}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 560, - Expected: image.Point{667, 1000}, - }, - } - - for _, c := range cases { - actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect image rect: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestGetOptimalTiledCanvas(t *testing.T) { - type tiledCanvasSizeCase struct { - ImageSize image.Point - MaxImageTiles int - TileSize int - Expected image.Point - } - - cases := []tiledCanvasSizeCase{ - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 1000, - Expected: image.Point{2000, 1000}, - }, - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{800, 600}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{640, 480}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 560}, - }, - { - ImageSize: image.Point{320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{1320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1680, 560}, - }, - { - ImageSize: image.Point{2000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{10000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{480, 640}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1120}, - }, - { - ImageSize: image.Point{200, 320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{200, 1320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1680}, - }, - { - ImageSize: image.Point{200, 2000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{200, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{10000, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - } - - for _, c := range cases { - actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect tiled canvas: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestSplitToTiles(t *testing.T) { - type splitCase struct { - TestImage image.Image - NumTilesSize image.Point - Expected []image.Image - } - - cases := []splitCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - NumTilesSize: image.Point{1, 1}, - Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)), - NumTilesSize: image.Point{2, 1}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - }, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)), - NumTilesSize: image.Point{2, 2}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - image.NewRGBA(image.Rect(0, 500, 500, 1000)), - image.NewRGBA(image.Rect(500, 500, 1000, 1000)), - }, - }, - } - - for _, c := range cases { - actual := splitToTiles(c.TestImage, c.NumTilesSize) - - if len(actual) != len(c.Expected) { - t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected)) - } - - for i := range actual { - if actual[i].Bounds() != c.Expected[i].Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds()) - } - } - } -} - -func TestResize(t *testing.T) { - type resizeCase struct { - TestImage image.Image - OutputSize image.Point - MaxImageTiles int - ExpectedImage image.Image - ExpectedAspectRatio image.Point - } - - cases := []resizeCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 1, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 2, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - } - - for _, c := range cases { - actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) - - if actualImage.Bounds() != c.ExpectedImage.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds()) - } - - if actualAspectRatio != c.ExpectedAspectRatio { - t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio) - } - } -} - -func TestPad(t *testing.T) { - type padCase struct { - TestImage image.Image - OutputSize image.Point - AspectRatio image.Point - Expected image.Image - } - - cases := []padCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)), - OutputSize: image.Point{560, 560}, - AspectRatio: image.Point{2, 2}, - Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), 
- }, - } - - for _, c := range cases { - actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio) - - if actual.Bounds() != c.Expected.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) - } - } -} - -func TestPackImages(t *testing.T) { - type packCase struct { - TestImage image.Image - AspectRatio image.Point - ExpectedVals int - } - - cases := []packCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), - AspectRatio: image.Point{2, 2}, - ExpectedVals: 2 * 2 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - AspectRatio: image.Point{1, 1}, - ExpectedVals: 1 * 1 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)), - AspectRatio: image.Point{1, 2}, - ExpectedVals: 1 * 2 * 3 * 560 * 560, - }, - } - - for _, c := range cases { - actualVals := packImages(c.TestImage, c.AspectRatio) - if len(actualVals) != c.ExpectedVals { - t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) - } - } -} - -func TestPreprocess(t *testing.T) { - type preprocessCase struct { - TestImage image.Image - ExpectedVals int - ExpectedAspectRatioID int - } - - cases := []preprocessCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - ExpectedVals: 0, - ExpectedAspectRatioID: 1, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedVals: 0, - ExpectedAspectRatioID: 6, - }, - } - - for _, c := range cases { - var buf bytes.Buffer - err := png.Encode(&buf, c.TestImage) - if err != nil { - t.Fatal(err) - } - - imgData, opts, err := Preprocess(&buf) - if err != nil { - t.Fatalf("error processing: %q", err) - } - - if len(imgData) == 0 { - t.Errorf("no image data returned") - } - - ar, ok := opts["aspectRatioIndex"] - if !ok { - t.Fatalf("no aspect ratio found") - } - - aspectRatioID := ar.(int) - - if aspectRatioID != c.ExpectedAspectRatioID { - t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) - } - } -} diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 149876c9c..547e2cb32 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -2,9 +2,6 @@ package mllama import ( "bytes" - "encoding/binary" - "fmt" - "hash/fnv" "image" "slices" @@ -34,10 +31,6 @@ const ( ) func New(c fs.Config) (model.Model, error) { - // Verify unified config - if c.Uint("vision.block_count") == 0 { - return nil, fmt.Errorf("non-unified vision model not supported") - } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), @@ -45,10 +38,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), ImageProcessor: newImageProcessor(c), @@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) { return &m, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, 
multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -73,67 +69,48 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er return nil, err } - f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image) + f32s, ratio, err := m.ImageProcessor.ProcessImage(image) if err != nil { return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, - m.ImageProcessor.imageSize, - m.ImageProcessor.imageSize, - m.ImageProcessor.numChannels, - m.ImageProcessor.maxNumTiles, - ) + if ratio.numTiles() < m.maxNumTiles { + // Pad tiles to maxNumTiles + f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles) + f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles] + } + + pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles) if err != nil { return nil, err } - aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1) + aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) if err != nil { return nil, err } positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32) crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio) - return m.Projector.Forward(ctx, crossAttentionStates), nil + projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates) + + return []input.Multimodal{{Tensor: projectedOutputs}}, nil } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { - var images []input.Input - fnvHash := fnv.New64a() - for i := range inputs { - if inputs[i].Multimodal == nil { - if len(images) > 0 { - inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)} - inputs[i].MultimodalHash = images[0].MultimodalHash - for j := 1; j < len(images); j++ { - inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor)) - fnvHash.Reset() - binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash) - binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash) - inputs[i].MultimodalHash = fnvHash.Sum64() - } - images = nil - } - } else { - images = append(images, inputs[i]) - inputs[i].Token = -1 + if inputs[i].Multimodal != nil { + inputs[i].Token = 128256 // <|image|> } } - inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 }) - return inputs, nil } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var crossAttentionStates ml.Tensor if len(batch.Multimodal) > 0 { - images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) - if len(images) > 0 { - crossAttentionStates = images[len(images)-1] - } + crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor } positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) @@ -147,7 +124,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } // TODO: attention mask, cross attention mask - return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil } func init() { diff --git a/model/models/mllama/model_text.go 
b/model/models/mllama/model_text.go index 490eb696c..9bd414afc 100644 --- a/model/models/mllama/model_text.go +++ b/model/models/mllama/model_text.go @@ -18,7 +18,7 @@ type TextSelfAttention struct { RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` } -func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { batchSize := hiddenState.Dim(1) headDim := opts.hiddenSize / opts.numHeads ropeType := uint32(0) @@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct { MLP *TextMLP } -func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) - hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts) + hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts) // In the final layer (outputs != nil), optimize by pruning to just the token positions // we need logits for. @@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct { MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"` } -func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) @@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, } type TextDecoderLayer interface { - Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor + Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor } type TextDecoder struct { Layers []TextDecoderLayer } -func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { for i, layer := range d.Layers { layerType := selfAttentionLayer if slices.Contains(opts.crossAttentionLayers, int32(i)) { @@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, lastLayerOutputs = outputs } - hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts) + hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, 
cache, opts) } } @@ -214,9 +214,9 @@ type TextModel struct { *TextModelOptions } -func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { +func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs) - hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) + hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) return m.Output.Forward(ctx, hiddenState) } diff --git a/model/models/mllama/model_vision.go b/model/models/mllama/model_vision.go index bd3d150a3..77ea53731 100644 --- a/model/models/mllama/model_vision.go +++ b/model/models/mllama/model_vision.go @@ -15,7 +15,7 @@ type VisionSelfAttention struct { Query *nn.Linear `gguf:"attn_q"` Key *nn.Linear `gguf:"attn_k"` Value *nn.Linear `gguf:"attn_v"` - Output *nn.Linear `gguf:"attn_out"` + Output *nn.Linear `gguf:"attn_output"` Gate ml.Tensor `gguf:"attn_gate"` } @@ -45,36 +45,29 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) hiddenState = sa.Output.Forward(ctx, attention) - if sa.Gate != nil { - hiddenState = hiddenState.Mul(ctx, sa.Gate) - } - return hiddenState } type VisionMLP struct { - Down *nn.Linear `gguf:"ffn_down"` Up *nn.Linear `gguf:"ffn_up"` - - Gate ml.Tensor `gguf:"ffn_gate"` + Down *nn.Linear `gguf:"ffn_down"` } func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { - hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx) - hiddenState = mlp.Up.Forward(ctx, hiddenState) - if mlp.Gate != nil { - hiddenState = hiddenState.Mul(ctx, mlp.Gate) - } + hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx) + hiddenState = mlp.Down.Forward(ctx, hiddenState) return hiddenState } type VisionEncoderLayer struct { - AttentionNorm *nn.LayerNorm `gguf:"ln1"` + AttentionNorm *nn.LayerNorm `gguf:"attn_norm"` SelfAttention *VisionSelfAttention + AttentionGate ml.Tensor `gguf:"attn_gate"` - MLPNorm *nn.LayerNorm `gguf:"ln2"` + MLPNorm *nn.LayerNorm `gguf:"ffn_norm"` MLP *VisionMLP + MLPGate ml.Tensor `gguf:"ffn_gate"` } func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { @@ -83,13 +76,22 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts // self attention hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) + + if e.AttentionGate != nil { + hiddenState = hiddenState.Mul(ctx, e.AttentionGate) + } hiddenState = hiddenState.Add(ctx, residual) residual = hiddenState // feed forward hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLP.Forward(ctx, hiddenState, opts) - return hiddenState.Add(ctx, residual) + hiddenState = hiddenState.Add(ctx, residual) + if e.MLPGate != nil { + hiddenState = hiddenState.Mul(ctx, e.MLPGate) + } + + return hiddenState } type VisionEncoder struct { @@ -114,9 +116,9 @@ type PrecomputedAspectRatioEmbedding struct { Gate ml.Tensor 
`gguf:"gate"` } -func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor { embeddings := e.Embedding.Forward(ctx, aspectRatioIDs) - embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles) + embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles) if e.Gate != nil { embeddings = embeddings.Mul(ctx, e.Gate) } @@ -132,7 +134,7 @@ type PrecomputedPositionEmbedding struct { TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"` } -func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor { positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs) if e.PositionEmbeddingGate != nil { positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate) @@ -141,7 +143,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi hiddenState = hiddenState.Add(ctx, positionEmbedding) tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs) - tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles) + tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles) if e.TilePositionEmbeddingGate != nil { tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate) } @@ -150,9 +152,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi } type VisionModelOptions struct { - hiddenSize, numHeads, numTiles int - imageSize, patchSize int - eps float32 + hiddenSize, numHeads int + imageSize, patchSize int + eps float32 intermediateLayersIndices []int32 } @@ -181,14 +183,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa numPositions++ } + numTiles := pixelValues.Dim(3) + hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) - hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles) + hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) - hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) - hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1) + hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) + hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1) - hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions) + hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions) hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps) numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8 @@ -199,18 +203,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, 
m.eps) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize) hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions) hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...) - hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0) return hiddenState.Concat(ctx, hiddenStates, 0) } @@ -222,7 +226,6 @@ func newVisionModel(c fs.Config) *VisionModel { VisionModelOptions: &VisionModelOptions{ hiddenSize: int(c.Uint("vision.embedding_length")), numHeads: int(c.Uint("vision.attention.head_count")), - numTiles: int(c.Uint("vision.max_num_tiles")), imageSize: int(c.Uint("vision.image_size")), patchSize: int(c.Uint("vision.patch_size")), diff --git a/model/models/mllama/process_image.go b/model/models/mllama/process_image.go index 1b0506d32..8e60508ff 100644 --- a/model/models/mllama/process_image.go +++ b/model/models/mllama/process_image.go @@ -2,17 +2,31 @@ package mllama import ( "image" - "image/color" "math" "slices" "golang.org/x/image/draw" "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/model/imageproc" ) +type supportedAspectRatio struct { + rank, width, height int +} + +func (a supportedAspectRatio) Point() image.Point { + return image.Point{a.width, a.height} +} + +func (a supportedAspectRatio) numTiles() int { + return a.width * a.height +} + type ImageProcessor struct { imageSize, numChannels, maxNumTiles int + + mean, std [3]float32 } func newImageProcessor(c fs.Config) ImageProcessor { @@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor { imageSize: int(c.Uint("vision.image_size")), numChannels: int(c.Uint("vision.num_channels")), maxNumTiles: int(c.Uint("vision.max_num_tiles")), + + mean: imageproc.ClipDefaultMean, + std: imageproc.ClipDefaultSTD, } } -func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } +func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) { + for w := 1; w <= p.maxNumTiles; w++ { + for h := 1; h <= p.maxNumTiles/w; h++ { + ratios = append(ratios, 
supportedAspectRatio{len(ratios) + 1, w, h}) } } - return ratios } -func (p *ImageProcessor) clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } +func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point { + tw := min(max(imageSize.X, p.imageSize), canvasSize.X) + th := min(max(imageSize.Y, p.imageSize), canvasSize.Y) - return a -} + r := math.Min( + float64(tw)/float64(imageSize.X), + float64(th)/float64(imageSize.Y), + ) -func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } + w := min(int(math.Floor(float64(imageSize.X)*r)), tw) + h := min(int(math.Floor(float64(imageSize.Y)*r)), th) return image.Point{w, h} } -func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := p.supportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) +func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point { + possibleTileArrangements := p.supportedAspectRatios() + possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements)) + for i, pta := range possibleTileArrangements { + possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize} } - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } + scales := make([]float64, len(possibleCanvasSizes)) + for i, pcs := range possibleCanvasSizes { + scales[i] = min( + float64(pcs.Y)/float64(imageSize.Y), + float64(pcs.X)/float64(imageSize.X), + ) } var minUpscale float64 @@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles return selectedCanvas } -func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { +func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X height := b.Max.Y - b.Min.Y tileHeight := height / numTilesSize.Y tileWidth := width / numTilesSize.X - images := []image.Image{} + images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X) for h := range numTilesSize.Y { for w := range numTilesSize.X { rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { + if subImg, ok := img.(interface { SubImage(image.Rectangle) image.Image - }).SubImage(rect)) + }); ok { + images = append(images, subImg.SubImage(rect)) + } else { + // Handle the case where img does not implement SubImage + // This is a fallback and may not be efficient + newImg := 
image.NewRGBA(rect) + draw.Draw(newImg, rect, img, rect.Min, draw.Src) + images = append(images, newImg) + } } } return images } -// remove the "alpha" channel by drawing over a prefilled image -// -//nolint:unused -func (p *ImageProcessor) compositeImage(img image.Image) image.Image { - dst := image.NewRGBA(img.Bounds()) - - white := color.RGBA{255, 255, 255, 255} - draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) - draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) - - return dst -} - -func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { +func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) { b := img.Bounds() - tileSize := outputSize.Y - canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := p.fitToCanvas(b.Max, canvasSize, tileSize) + canvasSize := p.optimalTiledCanvas(b.Max) + aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize} + newSize := p.fitToCanvas(b.Max, canvasSize) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) @@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag return dst, aspectRatio } -func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image { +func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image { paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, + X: p.imageSize * aspectRatio.X, + Y: p.imageSize * aspectRatio.Y, } dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) @@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin return dst } -func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { +func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 { subImages := p.splitToTiles(img, aspectRatio) var pixelVals []float32 @@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st gVal := float32(g>>8) / 255.0 bVal := float32(b>>8) / 255.0 - rVal = (rVal - mean[0]) / std[0] - gVal = (gVal - mean[1]) / std[1] - bVal = (bVal - mean[2]) / std[2] + rVal = (rVal - p.mean[0]) / p.std[0] + gVal = (gVal - p.mean[1]) / p.std[1] + bVal = (bVal - p.mean[2]) / p.std[2] rVals = append(rVals, rVal) gVals = append(gVals, gVal) @@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st return pixelVals } -func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) { - outputSize := image.Point{p.imageSize, p.imageSize} +func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) { + newImage, newImageRatio := p.resize(img) + newImage = p.pad(newImage, newImageRatio) + pixelValues := p.pack(newImage, newImageRatio) - // clip values - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} + supportedAspectRatios := p.supportedAspectRatios() + aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool { + return i.width == newImageRatio.X && i.height == newImageRatio.Y + }) - newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles) - newImage = p.pad(newImage, outputSize, aspectRatio) - - data := 
p.pack(newImage, aspectRatio, mean, std) - aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1 - return data, aspectRatioIndex, nil + return pixelValues, supportedAspectRatios[aspectRatioID], nil } diff --git a/model/models/mllama/process_image_test.go b/model/models/mllama/process_image_test.go new file mode 100644 index 000000000..a9669b182 --- /dev/null +++ b/model/models/mllama/process_image_test.go @@ -0,0 +1,387 @@ +package mllama + +import ( + "image" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestSupportedAspectRatios(t *testing.T) { + cases := []struct { + p ImageProcessor + want []supportedAspectRatio + }{ + { + p: ImageProcessor{maxNumTiles: 1}, + want: []supportedAspectRatio{ + {1, 1, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 2}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 2, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 3}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 2, 1}, + {5, 3, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 4}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 1, 4}, + {5, 2, 1}, + {6, 2, 2}, + {7, 3, 1}, + {8, 4, 1}, + }, + }, + } + + for _, tt := range cases { + actual := tt.p.supportedAspectRatios() + if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestFitToCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + canvas image.Point + expect image.Point + }{ + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{400, 400}, + canvas: image.Point{640, 480}, + expect: image.Point{400, 400}, + }, + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{1024, 768}, + canvas: image.Point{640, 480}, + expect: image.Point{640, 480}, + }, + { + p: ImageProcessor{imageSize: 750}, + image: image.Point{500, 500}, + canvas: image.Point{1000, 1000}, + expect: image.Point{750, 750}, + }, + { + p: ImageProcessor{imageSize: 2000}, + image: image.Point{500, 1000}, + canvas: image.Point{2000, 2000}, + expect: image.Point{1000, 2000}, + }, + { + p: ImageProcessor{imageSize: 1000}, + image: image.Point{4000, 3000}, + canvas: image.Point{2000, 1000}, + expect: image.Point{1333, 1000}, + }, + { + p: ImageProcessor{imageSize: 560}, + image: image.Point{667, 1000}, + canvas: image.Point{1000, 1000}, + expect: image.Point{667, 1000}, + }, + } + + for _, tt := range cases { + actual := tt.p.fitToCanvas(tt.image, tt.canvas) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestOptimalTiledCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + expect image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 1000}, + image: image.Point{1024, 768}, + expect: image.Point{2000, 1000}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1024, 768}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{800, 600}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{640, 480}, + expect: image.Point{1120, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{320, 200}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1320, 
200}, + expect: image.Point{1680, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{2000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{480, 640}, + expect: image.Point{560, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 320}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 1320}, + expect: image.Point{560, 1680}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 2000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 10000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 10000}, + expect: image.Point{1120, 1120}, + }, + } + + for _, tt := range cases { + actual := tt.p.optimalTiledCanvas(tt.image) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestSplitToTiles(t *testing.T) { + cases := []struct { + imageMax image.Point + numTiles image.Point + expect []image.Image + }{ + { + imageMax: image.Point{1024, 768}, + numTiles: image.Point{1, 1}, + expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, + }, + { + imageMax: image.Point{1000, 500}, + numTiles: image.Point{2, 1}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + }, + }, + { + imageMax: image.Point{1000, 1000}, + numTiles: image.Point{2, 2}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + image.NewRGBA(image.Rect(0, 500, 500, 1000)), + image.NewRGBA(image.Rect(500, 500, 1000, 1000)), + }, + }, + } + + var p ImageProcessor + + for _, tt := range cases { + actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles) + + if len(actual) != len(tt.expect) { + t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect)) + } + + for i := range actual { + if actual[i].Bounds() != tt.expect[i].Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds()) + } + } + } +} + +func TestResize(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + expectImage image.Image + expectAspectRatio image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 1, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 2, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{10, 10}, + expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{2560, 1920}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), + expectAspectRatio: image.Point{2, 2}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: 
image.Point{1024, 768}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + expectAspectRatio: image.Point{2, 2}, + }, + } + + for _, tt := range cases { + actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax}) + + if actualImage.Bounds() != tt.expectImage.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds()) + } + + if actualAspectRatio != tt.expectAspectRatio { + t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio) + } + } +} + +func TestPad(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + aspectRatio image.Point + expect image.Image + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1000, 667}, + aspectRatio: image.Point{2, 2}, + expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), + }, + } + + for _, tt := range cases { + actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio) + + if actual.Bounds() != tt.expect.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds()) + } + } +} + +func TestPackImages(t *testing.T) { + cases := []struct { + imageMax image.Point + aspectRatio image.Point + expectVals int + }{ + { + imageMax: image.Point{1120, 1120}, + aspectRatio: image.Point{2, 2}, + expectVals: 2 * 2 * 3 * 560 * 560, + }, + { + imageMax: image.Point{560, 560}, + aspectRatio: image.Point{1, 1}, + expectVals: 1 * 1 * 3 * 560 * 560, + }, + { + imageMax: image.Point{1120, 560}, + aspectRatio: image.Point{1, 2}, + expectVals: 1 * 2 * 3 * 560 * 560, + }, + } + + for _, tt := range cases { + var p ImageProcessor + actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio) + if len(actualVals) != tt.expectVals { + t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals) + } + } +} + +func TestPreprocess(t *testing.T) { + cases := []struct { + imageMax image.Point + expectAspectRatioID int + }{ + { + imageMax: image.Point{10, 10}, + expectAspectRatioID: 1, + }, + { + imageMax: image.Point{1024, 768}, + expectAspectRatioID: 6, + }, + } + + p := ImageProcessor{imageSize: 560, maxNumTiles: 4} + for _, tt := range cases { + img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax})) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + if len(img) == 0 { + t.Errorf("no image data returned") + } + + if aspectRatio.rank != tt.expectAspectRatioID { + t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID) + } + } +} diff --git a/model/models/models.go b/model/models/models.go index 73b4c53a5..133e51761 100644 --- a/model/models/models.go +++ b/model/models/models.go @@ -7,4 +7,5 @@ import ( _ "github.com/ollama/ollama/model/models/llama4" _ "github.com/ollama/ollama/model/models/mistral3" _ "github.com/ollama/ollama/model/models/mllama" + _ "github.com/ollama/ollama/model/models/qwen25vl" ) diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go new file mode 100644 index 000000000..7de9b6eb1 --- /dev/null +++ b/model/models/qwen25vl/model.go @@ -0,0 +1,159 @@ +package qwen25vl + +import ( + "bytes" + "fmt" + "image" + "slices" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/model/input" +) + +type Model struct { + model.Base + model.BytePairEncoding + + 
*TextModel + *VisionModel `gguf:"v,vision"` + + ImageProcessor +} + +// Implement MultimodalProcessor interface +var _ model.MultimodalProcessor = (*Model)(nil) + +func New(c fs.Config) (model.Model, error) { + m := &Model{ + BytePairEncoding: model.NewBytePairEncoding( + c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), + }, + ), + TextModel: NewTextModel(c), + VisionModel: newVisionModel(c), + ImageProcessor: newImageProcessor(c), + } + + m.Cache = kvcache.NewCausalCache(m.TextModel.Shift) + + return m, nil +} + +func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) { + image, _, err := image.Decode(bytes.NewReader(multimodalData)) + if err != nil { + return nil, nil, err + } + + f32s, grid, err := m.ImageProcessor.ProcessImage(image) + if err != nil { + return nil, nil, err + } + + // Calculate tensor dimensions + patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize * + m.ImageProcessor.patchSize * m.ImageProcessor.patchSize + numPatches := grid.Temporal * grid.Height * grid.Width + + pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches) + if err != nil { + return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err) + } + + return pixelValues, grid, nil +} + +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { + if len(m.VisionModel.Layers) == 0 { + return nil, model.ErrNoVisionModel + } + + pixels, grid, err := m.PixelValues(ctx, multimodalData) + if err != nil { + return nil, err + } + + visionOutputs := m.VisionModel.Forward(ctx, pixels, grid) + return []input.Multimodal{{Tensor: visionOutputs}}, nil +} + +// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass +func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { + var result []input.Input + + var ( + imageToken int32 = 151655 + visionStartToken int32 = 151652 + visionEndToken int32 = 151653 + ) + + nImg := 0 + for _, inp := range inputs { + if inp.Multimodal == nil { + // If not a multimodal input, add it to the result unchanged + result = append(result, inp) + } else { + // Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix + // the image tokens with a prompt, so we add a prefix here + nImg++ + pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true) + if err != nil { + return nil, fmt.Errorf("failed to encode image prompt: %w", err) + } + for i := range pre { + result = append(result, input.Input{Token: pre[i]}) + } + + patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1) + + // First add the vision start token + result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 1}) + + // Add the image token with the multimodal tensor data at the first position + result = append(result, input.Input{ + Token: imageToken, + Multimodal: inp.Multimodal, + MultimodalHash: inp.MultimodalHash, + }) + + // Add the placeholder tokens for the 
remaining positions (patchesPerChunk-1) + result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...) + + result = append(result, input.Input{Token: visionEndToken}) + } + } + + return result, nil +} + +func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + if err != nil { + return nil, err + } + + outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) + if err != nil { + return nil, err + } + + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache) +} + +func init() { + model.Register("qwen25vl", New) +} diff --git a/model/models/qwen25vl/model_text.go b/model/models/qwen25vl/model_text.go new file mode 100644 index 000000000..800fd961d --- /dev/null +++ b/model/models/qwen25vl/model_text.go @@ -0,0 +1,150 @@ +package qwen25vl + +import ( + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/model/input" +) + +type TextOptions struct { + ctxLen, hiddenSize, numHeads, numKVHeads int + eps, ropeBase, ropeScale float32 + ropeDim, defaultContextLen uint32 +} + +type TextModel struct { + TokenEmbedding *nn.Embedding `gguf:"token_embd"` + Layers []Layer `gguf:"blk"` + OutputNorm *nn.RMSNorm `gguf:"output_norm"` + Output *nn.Linear `gguf:"output,alt:token_embd"` + + *TextOptions +} + +func NewTextModel(c fs.Config) *TextModel { + m := TextModel{ + Layers: make([]Layer, c.Uint("block_count")), + TextOptions: &TextOptions{ + ctxLen: int(c.Uint("context_length")), + hiddenSize: int(c.Uint("embedding_length")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + eps: c.Float("attention.layer_norm_rms_epsilon"), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), + ropeDim: c.Uint("rope.dimension_count", 128), + defaultContextLen: c.Uint("context_length", 128000), + }, + } + + return &m +} + +// SelfAttention implements the multi-head self-attention mechanism +// with separate projections for query, key, value and output transformations +type SelfAttention struct { + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_output"` +} + +func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { + batchSize := hiddenState.Dim(1) + headDim := opts.hiddenSize / opts.numHeads + + q := sa.Query.Forward(ctx, hiddenState) + q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) + q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen)) + + k := sa.Key.Forward(ctx, hiddenState) + k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen)) + + v := sa.Value.Forward(ctx, hiddenState) + v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + + scaleFactor := 1.0 / math.Sqrt(float64(headDim)) + kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) + kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize) + + return sa.Output.Forward(ctx, kqv) +} + +// Shift applies rotary position embeddings to the key tensor for causal attention caching +func (m *TextModel) Shift(ctx ml.Context, layer
int, key, shift ml.Tensor) (ml.Tensor, error) { + return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil +} + +// MLP implements the feed-forward network component with SwiGLU activation +type MLP struct { + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` + Gate *nn.Linear `gguf:"ffn_gate"` +} + +func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor { + // Apply SwiGLU activation gating + hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState)) + // Project back to hidden dimension + return mlp.Down.Forward(ctx, hiddenState) +} + +// Layer represents a single transformer layer combining self-attention and feed-forward components +type Layer struct { + AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + SelfAttention *SelfAttention + MLPNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLP *MLP +} + +func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { + // Self-attention branch with residual connection + residual := hiddenState + + hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps) + hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts) + + // In the final layer (outputs != nil), optimize by pruning to just the token positions + // we need logits for. + if outputs != nil { + hiddenState = hiddenState.Rows(ctx, outputs) + residual = residual.Rows(ctx, outputs) + } + + hiddenState = hiddenState.Add(ctx, residual) + // Feed-forward branch with residual connection + residual = hiddenState + hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps) + hiddenState = l.MLP.Forward(ctx, hiddenState, opts) + return hiddenState.Add(ctx, residual) +} + +func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) { + // Initial token embedding + hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx) + + for _, mi := range batch.Multimodal { + img := mi.Multimodal[0].Tensor + ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) + } + + // Process through transformer layers + for i, layer := range m.Layers { + cache.SetLayer(i) + + var lastLayerOutputs ml.Tensor + if i == len(m.Layers)-1 { + lastLayerOutputs = outputs + } + + hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions) + } + + hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps) + return m.Output.Forward(ctx, hiddenStates), nil +} diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go new file mode 100644 index 000000000..01eef392b --- /dev/null +++ b/model/models/qwen25vl/model_vision.go @@ -0,0 +1,391 @@ +package qwen25vl + +import ( + "fmt" + "math" + "slices" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" +) + +// We only support batch size of 1 +var batchSize int = 1 + +func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor { + x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)) + x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx) + return x2.Neg(ctx).Concat(ctx, x1, 0) +} + +func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin 
ml.Tensor) ml.Tensor { + return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin)) +} + +func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor { + // Create a flat slice for the mask (all -inf initially to block all attention) + flat := make([]float32, seqLength*seqLength) + for i := range flat { + flat[i] = float32(math.Inf(-1)) // Negative infinity to block attention + } + + // Fill in the mask with zeros for tokens that CAN attend to each other + for i := 1; i < len(bounds); i++ { + start := bounds[i-1] + end := bounds[i] + + // Enable attention within this sequence block by setting values to 0 + for row := start; row < end; row++ { + for col := start; col < end; col++ { + idx := row*seqLength + col + flat[idx] = 0.0 // 0 allows attention, -inf blocks it + } + } + } + + mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength) + if err != nil { + panic(err) + } + // Reshape to match [seqLength, seqLength, 1] for broadcasting + mask = mask.Reshape(ctx, seqLength, seqLength, 1) + + return mask +} + +type VisionSelfAttention struct { + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_out"` +} + +func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor { + query := sa.Query.Forward(ctx, hiddenStates) + key := sa.Key.Forward(ctx, hiddenStates) + value := sa.Value.Forward(ctx, hiddenStates) + + query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize) + key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize) + value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize) + + query = applyRotaryPositionalEmbedding(ctx, query, cos, sin) + key = applyRotaryPositionalEmbedding(ctx, key, cos, sin) + + // Scale factor for scaled dot-product attention + scale := 1.0 / math.Sqrt(float64(opts.headDim)) + + // Scaled dot-product attention + query = query.Permute(ctx, 0, 2, 1, 3) + key = key.Permute(ctx, 0, 2, 1, 3) + value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) + kq := key.MulmatFullPrec(ctx, query) + kq = kq.Scale(ctx, scale) + if mask != nil { + kq = kq.Add(ctx, mask) + } + kq = kq.Softmax(ctx) + kqv := value.Mulmat(ctx, kq) + attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) + + return sa.Output.Forward(ctx, attention) +} + +// VisionMLP implements the multi-layer perceptron +type VisionMLP struct { + Gate *nn.Linear `gguf:"ffn_gate"` + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` +} + +func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor { + // Using activation as specified in config (likely GELU or SiLU/Swish) + gateOutput := mlp.Gate.Forward(ctx, hiddenStates) + upOutput := mlp.Up.Forward(ctx, hiddenStates) + hiddenStates = gateOutput.SILU(ctx).Mul(ctx, upOutput) + + return mlp.Down.Forward(ctx, hiddenStates) +} + +type VisionEncoderLayer struct { + Norm1 *nn.RMSNorm `gguf:"ln1"` + SelfAttention *VisionSelfAttention + Norm2 *nn.RMSNorm `gguf:"ln2"` + MLP *VisionMLP +} + +func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor { + residual := hiddenStates + hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, cos, sin, 
mask, opts) + hiddenStates = hiddenStates.Add(ctx, residual) + + residual = hiddenStates + hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts) + return hiddenStates.Add(ctx, residual) +} + +// VisionModelOptions contains configuration options +type VisionModelOptions struct { + hiddenSize int + numHeads int + headDim int + patchSize int + numChannels int + eps float32 + ropeTheta float32 + spatialMergeSize int + windowSize int + fullAttnBlocks []int32 + temporalPatchSize int +} + +type PatchEmbedding struct { + PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"` + PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"` +} + +func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor { + numPatches := pixelValues.Shape()[1] + + // Reshape the input tensor to match the expected dimensions + pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches) + + // Permute the tensor to bring the temporal dimension to the front + pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) + + // Split the tensor into parts for the temporal convolutions + in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx) + in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches) + in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx) + in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches) + + s0, s1 := opts.patchSize, opts.patchSize // Use full stride + p0, p1 := 0, 0 // padding + d0, d1 := 1, 1 // dilation + out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1) + out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1) + + // Add the outputs from the two temporal convolutions + out := out0.Add(ctx, out1) + + // Reshape the output tensor to match the expected dimensions + return out.Reshape(ctx, opts.hiddenSize, numPatches) +} + +// VisionPatchMerger implements patch merging for the Qwen vision model +type VisionPatchMerger struct { + LNQ *nn.RMSNorm `gguf:"ln_q"` + MLP0 *nn.Linear `gguf:"mlp.0"` + MLP2 *nn.Linear `gguf:"mlp.2"` +} + +// Forward computes patch merging for the vision model +func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor { + normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps) + + hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize) + + // Reshape the normalized output to view the hidden size dimension + reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize), batchSize) + hidden := pm.MLP0.Forward(ctx, reshaped) + activated := hidden.GELU(ctx) + + output := pm.MLP2.Forward(ctx, activated) + + return output +} + +// VisionModel implements the Qwen vision model +type VisionModel struct { + PatchEmbedding *PatchEmbedding + Layers []VisionEncoderLayer `gguf:"blk"` + PatchMerger *VisionPatchMerger `gguf:"merger"` + + *VisionModelOptions +} + +// Forward computes the vision model for an input tensor +func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor { + // Extract patch embeddings + hiddenStates := 
m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions) + + positionEmbedding := m.PositionalEmbedding(ctx, grid) + + windowIndex, bounds := m.WindowIndex(ctx, grid) + + spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize + + hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit) + hiddenStates = hiddenStates.Rows(ctx, windowIndex) + hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit) + + positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit) + positionEmbedding = positionEmbedding.Rows(ctx, windowIndex) + positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit) + positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0) + + cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx) + cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1)) + sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1)) + + mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads) + // Apply encoder layers + for i, layer := range m.Layers { + if slices.Contains(m.fullAttnBlocks, int32(i)) { + hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions) + } else { + hiddenStates = layer.Forward( + ctx, + hiddenStates, + cos, + sin, + mask, + m.VisionModelOptions, + ) + } + } + + hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions) + reverseWindowIndex := windowIndex.Argsort(ctx) + return hiddenStates.Rows(ctx, reverseWindowIndex) +} + +// WindowIndex divides the grid into windows and returns: +// 1. A tensor containing flattened indices of all grid points organized by windows +// 2. A slice of boundaries that mark where each window's data begins and ends +// in the flattened representation, scaled by spatialMergeSize squared +// +// The boundaries slice always starts with 0 and contains cumulative ending +// positions for each window, allowing downstream processing to identify +// window boundaries in the tensor data. 
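+// +// A rough worked example of the window math (hypothetical numbers, not taken from a real model config): with windowSize=112, spatialMergeSize=2 and patchSize=14, vitMergerWindowSize = 112/2/14 = 4, so an 8x8 merged grid is covered by a 2x2 arrangement of 4x4 windows; each full window then contributes 16 positions, and bounds grows by 16*(2*2) = 64 per window, giving [0, 64, 128, 192, 256].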
+ +func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) { + vitMergerWindowSize := m.windowSize / m.spatialMergeSize / m.patchSize + + llmGridH := grid.Height / m.spatialMergeSize + llmGridW := grid.Width / m.spatialMergeSize + + // Calculate window parameters + numWindowsH := int(math.Ceil(float64(llmGridH) / float64(vitMergerWindowSize))) + numWindowsW := int(math.Ceil(float64(llmGridW) / float64(vitMergerWindowSize))) + + // Flattened grid indices, collected in window order + var index []int32 + + // Initialize bounds with the first element as 0 + bounds := []int{0} + totalSeqLen := 0 + + // Process each window without padding + for wh := range numWindowsH { + for ww := range numWindowsW { + // Calculate window boundaries + hStart := wh * vitMergerWindowSize + wStart := ww * vitMergerWindowSize + hEnd := min(hStart+vitMergerWindowSize, llmGridH) + wEnd := min(wStart+vitMergerWindowSize, llmGridW) + + // Calculate sequence length for this window + seqLen := (hEnd - hStart) * (wEnd - wStart) + + // Collect indices for this window + for h := hStart; h < hEnd; h++ { + for w := wStart; w < wEnd; w++ { + index = append(index, int32(h*llmGridW+w)) + } + } + + totalSeqLen += seqLen + bounds = append(bounds, totalSeqLen*(m.spatialMergeSize*m.spatialMergeSize)+bounds[0]) + } + } + + t, err := ctx.Input().FromIntSlice(index, len(index)) + if err != nil { + panic(err) + } + + return t, bounds +} + +// PositionalEmbedding generates rotary position embeddings for attention mechanisms +func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor { + dim := m.headDim / 2 + freq := dim / 2 + theta := float64(m.ropeTheta) + merge := m.spatialMergeSize + + // Create frequency patterns for position encoding + maxGridSize := max(grid.Height, grid.Width) + freqVals := make([]float32, freq*maxGridSize) + for i := range maxGridSize { + for j := range freq { + freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim))) + } + } + freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize) + if err != nil { + panic(fmt.Errorf("failed to create tensor from frequencies: %w", err)) + } + + // Create position coordinates (y,x pairs) for the grid + // Equivalent to generating position ids with torch.arange() in PyTorch + coords := make([]int32, 0, grid.Height*grid.Width*2) + for y := range grid.Height { + for x := range grid.Width { + coords = append(coords, int32(y), int32(x)) + } + } + pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height) + if err != nil { + panic(fmt.Errorf("failed to create tensor from positions: %w", err)) + } + + // Reshape and permute positions to match spatial merging pattern + pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge) + pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge) + pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge) + + // Use position indices to look up corresponding frequency values + positionalEmbedding := freqs.Rows(ctx, pos) + positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2) + return positionalEmbedding +} + +// newVisionModel creates a new instance of the Qwen vision model +func newVisionModel(c fs.Config) *VisionModel { + patchSize := int(c.Uint("vision.patch_size", 14)) + hiddenSize := int(c.Uint("vision.embedding_length", 1280)) + numHeads :=
int(c.Uint("vision.attention.head_count", 16)) + numChannels := int(c.Uint("vision.num_channels", 3)) + eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6) + ropeTheta := c.Float("vision.rope.freq_base", 10000.0) + spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2)) + windowSize := int(c.Uint("vision.window_size", 112)) + fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31}) + temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2)) + + model := &VisionModel{ + Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)), + VisionModelOptions: &VisionModelOptions{ + hiddenSize: hiddenSize, + numHeads: numHeads, + headDim: hiddenSize / numHeads, + patchSize: patchSize, + numChannels: numChannels, + eps: eps, + ropeTheta: ropeTheta, + spatialMergeSize: spatialMergeSize, + windowSize: windowSize, + temporalPatchSize: temporalPatchSize, + fullAttnBlocks: fullAttnBlocks, + }, + } + + return model +} diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go new file mode 100644 index 000000000..dc91bdea5 --- /dev/null +++ b/model/models/qwen25vl/process_image.go @@ -0,0 +1,184 @@ +package qwen25vl + +import ( + "fmt" + "image" + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/model/imageproc" +) + +// ImageProcessor contains configuration for the Qwen 2.5 VL image processing +type ImageProcessor struct { + numChannels int + patchSize int + temporalPatchSize int + mergeSize int + minPixels int + maxPixels int + factor int + rescaleFactor float32 + imageMean []float32 + imageStd []float32 +} + +// newImageProcessor creates a new image processor with default values +func newImageProcessor(c fs.Config) ImageProcessor { + patchSize := int(c.Uint("vision.patch_size", 14)) + mergeSize := int(c.Uint("vision.spatial_merge_size", 2)) + + return ImageProcessor{ + numChannels: int(c.Uint("vision.num_channels", 3)), // not set + patchSize: patchSize, + temporalPatchSize: 2, + mergeSize: mergeSize, + minPixels: 56 * 56, + maxPixels: int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit + factor: patchSize * mergeSize, + rescaleFactor: 1.0 / 255.0, + imageMean: imageproc.ClipDefaultMean[:], + imageStd: imageproc.ClipDefaultSTD[:], + } +} + +// SmartResize implements the smart resize algorithm +func (p *ImageProcessor) SmartResize(height, width int) (int, int) { + factor := p.factor + + if height < factor || width < factor { + panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor)) + } else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 { + panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio)) + } + + round := func(x float64) int { return int(math.RoundToEven(x)) } + + hBar := round(float64(height)/float64(factor)) * factor + wBar := round(float64(width)/float64(factor)) * factor + + if hBar*wBar > p.maxPixels { + beta := math.Sqrt(float64(height*width) / float64(p.maxPixels)) + + hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor + wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor + } else if hBar*wBar < p.minPixels { + beta := math.Sqrt(float64(p.minPixels) / float64(height*width)) + + hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor + wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor + } + + return hBar, wBar +} + +type Grid struct { + Height int + Width int + Temporal int +} + +func (p 
*ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) { + origWidth := img.Bounds().Dx() + origHeight := img.Bounds().Dy() + + // Calculate smart resize dimensions + resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth) + + // Resize image using existing functions + resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear) + + normalizedPixels := imageproc.Normalize( + resizedImg, + [3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]}, + [3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]}, + true, // rescale + true, // channelFirst + ) + + // Calculate grid dimensions + grid := &Grid{ + Height: resizedHeight / p.patchSize, + Width: resizedWidth / p.patchSize, + Temporal: 1, // For single images, temporal dimension is 1 + } + + patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid) + if err != nil { + return nil, nil, fmt.Errorf("failed to create patches: %v", err) + } + + // Return patches and grid dimensions + return patches, grid, nil +} + +func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) { + channels := p.numChannels + patchSize := p.patchSize + mergeSize := p.mergeSize + temporalPatchSize := p.temporalPatchSize + + // Calculate output dimensions + numPatches := grid.Temporal * grid.Height * grid.Width + patchDim := channels * temporalPatchSize * patchSize * patchSize + + result := make([]float32, numPatches*patchDim) + patchIndex := 0 + + // Single temporal frame handling (copies to all frames) + for range grid.Temporal { + for h := 0; h < grid.Height; h += mergeSize { + for w := 0; w < grid.Width; w += mergeSize { + // Handle the 2x2 merged patches + for mh := range mergeSize { + for mw := range mergeSize { + baseOffset := patchIndex * patchDim + + // Extract patch data for first temporal frame + for c := range channels { + channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize) + + for py := range patchSize { + for px := range patchSize { + // Calculate source pixel coordinates + y := (h+mh)*patchSize + py + x := (w+mw)*patchSize + px + + // Source index in input tensor (CHW format) + srcIdx := c*height*width + y*width + x + + // Destination index in first temporal frame + dstIdx := channelOffset + (py * patchSize) + px + + if srcIdx < len(pixels) && dstIdx < len(result) { + result[dstIdx] = pixels[srcIdx] + } + } + } + } + + // Copy first temporal frame to all other frames + if temporalPatchSize > 1 { + for c := range channels { + channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize) + firstFrameOffset := channelOffset + frameSize := patchSize * patchSize + + // Copy first frame to all other frames + for tp := 1; tp < temporalPatchSize; tp++ { + currentFrameOffset := channelOffset + (tp * frameSize) + copy(result[currentFrameOffset:currentFrameOffset+frameSize], + result[firstFrameOffset:firstFrameOffset+frameSize]) + } + } + } + + patchIndex++ + } + } + } + } + } + + return result, nil +} diff --git a/model/models/qwen2vl/imageproc.go b/model/models/qwen2vl/imageproc.go deleted file mode 100644 index 964b39072..000000000 --- a/model/models/qwen2vl/imageproc.go +++ /dev/null @@ -1,74 +0,0 @@ -package qwen2vl - -import ( - "fmt" - "image" - _ "image/jpeg" - _ "image/png" - "io" - "math" - - "github.com/ollama/ollama/model/imageproc" -) - -const ( - DefaultFactor = 28 - DefaultMinPixels = 56 * 56 - DefaultMaxPixels = 14 * 14 * 4 * 1280 -) - -// smartResize 
calculates the size of the image to resize to based on the -// factor, minPixels, and maxPixels. -func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point { - // 1. Both dimensions of size are divisible by factor - // 2. The area of the image is between minPixels and maxPixels - // 3. The aspect ratio of the image is as close to 1:1 as possible - - if size.Y < factor || size.X < factor { - panic("image is too small to resize") - } else if max(size.X, size.Y)/min(size.X, size.Y) > 200 { - panic("aspect ratio must be less than 200:1") - } - - f := float64(factor) - width := float64(size.X) - height := float64(size.Y) - - xBar := math.Round(width/f) * f - yBar := math.Round(height/f) * f - - if xBar*yBar > float64(maxPixels) { - beta := math.Sqrt(height * width / float64(maxPixels)) - xBar = math.Floor(width/beta/f) * f - yBar = math.Floor(height/beta/f) * f - } else if xBar*yBar < float64(minPixels) { - beta := math.Sqrt(float64(minPixels) / (height * width)) - xBar = math.Ceil(width*beta/f) * f - yBar = math.Ceil(height*beta/f) * f - } - - return image.Point{int(xBar), int(yBar)} -} - -func resizeImage(img image.Image, format string, size image.Point) image.Image { - if format == "png" { - img = imageproc.Composite(img) - } - - return imageproc.Resize(img, size, imageproc.ResizeBilinear) -} - -func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { - img, format, err := image.Decode(imageData) - if err != nil { - return nil, nil, fmt.Errorf("failed to decode image: %w", err) - } - - size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) - img = resizeImage(img, format, size) - - data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true) - - opts := map[string]any{} - return data, opts, nil -} diff --git a/model/models/qwen2vl/imageproc_test.go b/model/models/qwen2vl/imageproc_test.go deleted file mode 100644 index 817b61a5c..000000000 --- a/model/models/qwen2vl/imageproc_test.go +++ /dev/null @@ -1,78 +0,0 @@ -package qwen2vl - -import ( - "bytes" - "image" - "image/png" - "testing" -) - -func TestSmartResize(t *testing.T) { - type smartResizeCase struct { - TestImage image.Image - Expected image.Point - } - - cases := []smartResizeCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)), - Expected: image.Point{980, 980}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - Expected: image.Point{1036, 756}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), - Expected: image.Point{980, 980}, - }, - } - - for _, c := range cases { - b := c.TestImage.Bounds().Max - actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) - if actual != c.Expected { - t.Errorf("expected: %v, actual: %v", c.Expected, actual) - } - } -} - -func TestPreprocess(t *testing.T) { - type preprocessCase struct { - TestImage image.Image - ExpectedLen int - } - - cases := []preprocessCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)), - ExpectedLen: 252 * 252 * 3 * 1, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), - ExpectedLen: 980 * 980 * 3 * 1, - }, - } - - for _, c := range cases { - var buf bytes.Buffer - err := png.Encode(&buf, c.TestImage) - if err != nil { - t.Fatal(err) - } - - imgData, _, err := Preprocess(&buf) - if err != nil { - t.Fatalf("error processing: %q", err) - } - - switch len(imgData) { - case 0: - t.Errorf("no image data returned") - case c.ExpectedLen: - // ok - default: - 
t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen) - } - } -} diff --git a/model/process_text_spm.go b/model/sentencepiece.go similarity index 87% rename from model/process_text_spm.go rename to model/sentencepiece.go index 446d5d604..7d725f04f 100644 --- a/model/process_text_spm.go +++ b/model/sentencepiece.go @@ -2,10 +2,13 @@ package model import ( "container/heap" + "context" "fmt" "log/slog" "strconv" "strings" + + "github.com/ollama/ollama/logutil" ) const spmWhitespaceSep = "▁" @@ -22,7 +25,7 @@ func (spm SentencePieceModel) Vocabulary() *Vocabulary { } func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel { - slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5]) + slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5]) counter := map[int]int{} var maxTokenLen int @@ -36,7 +39,7 @@ func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel { } } - slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL], + slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL], "user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE], "max token len", maxTokenLen) @@ -179,24 +182,10 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error) } } + slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids) + if addSpecial && len(ids) > 0 { - if spm.vocab.AddBOS { - if ids[0] == spm.vocab.BOS { - slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS) - } - - slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS) - ids = append([]int32{spm.vocab.BOS}, ids...) 
- } - - if spm.vocab.AddEOS { - if ids[len(ids)-1] == spm.vocab.EOS { - slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS) - } - - slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS) - ids = append(ids, spm.vocab.EOS) - } + ids = spm.vocab.addSpecials(ids) } return ids, nil @@ -257,5 +246,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String()) return sb.String(), nil } diff --git a/model/process_text_spm_test.go b/model/sentencepiece_test.go similarity index 100% rename from model/process_text_spm_test.go rename to model/sentencepiece_test.go diff --git a/model/textprocessor.go b/model/textprocessor.go new file mode 100644 index 000000000..4a36f2352 --- /dev/null +++ b/model/textprocessor.go @@ -0,0 +1,17 @@ +package model + +const ( + TOKEN_TYPE_NORMAL = iota + 1 + TOKEN_TYPE_UNKNOWN + TOKEN_TYPE_CONTROL + TOKEN_TYPE_USER_DEFINED + TOKEN_TYPE_UNUSED + TOKEN_TYPE_BYTE +) + +type TextProcessor interface { + Encode(s string, addSpecial bool) ([]int32, error) + Decode([]int32) (string, error) + Is(int32, Special) bool + Vocabulary() *Vocabulary +} diff --git a/model/vocabulary.go b/model/vocabulary.go new file mode 100644 index 000000000..24adbaca3 --- /dev/null +++ b/model/vocabulary.go @@ -0,0 +1,112 @@ +package model + +import ( + "log/slog" + "slices" + "sync" +) + +type Special int32 + +const ( + SpecialBOS Special = iota + SpecialEOS +) + +type Vocabulary struct { + Values []string + Types []int32 + Scores []float32 + Merges []string + + BOS, EOS []int32 + AddBOS, AddEOS bool + + specialOnce sync.Once + special []string + + valuesOnce sync.Once + values map[string]int32 + + mergeOnce sync.Once + merge map[string]int32 +} + +func (v *Vocabulary) Is(id int32, special Special) bool { + switch special { + case SpecialBOS: + return slices.Contains(v.BOS, id) + case SpecialEOS: + return slices.Contains(v.EOS, id) + default: + return false + } +} + +func (v *Vocabulary) addSpecials(ids []int32) []int32 { + if v.AddBOS && len(v.BOS) > 0 { + if slices.Contains(v.BOS, ids[0]) { + slog.Warn("adding bos token to prompt which already has it", "id", v.BOS) + } + + slog.Debug("adding bos token to prompt", "id", v.BOS) + ids = append([]int32{v.BOS[0]}, ids...) 
+ } + + if v.AddEOS && len(v.EOS) > 0 { + if slices.Contains(v.EOS, ids[len(ids)-1]) { + slog.Warn("adding eos token to prompt which already has it", "id", v.EOS) + } + + slog.Debug("adding eos token to prompt", "id", v.EOS) + ids = append(ids, v.EOS[0]) + } + + return ids +} + +func (v *Vocabulary) Encode(s string) int32 { + v.valuesOnce.Do(func() { + v.values = make(map[string]int32, len(v.Values)) + for i, value := range v.Values { + v.values[value] = int32(i) + } + }) + + if id, ok := v.values[s]; ok { + return id + } + + return -1 +} + +func (v *Vocabulary) Decode(id int32) string { + return v.Values[id] +} + +func (v *Vocabulary) SpecialVocabulary() []string { + v.specialOnce.Do(func() { + for i := range v.Values { + if v.Types[i] == TOKEN_TYPE_CONTROL { + v.special = append(v.special, v.Values[i]) + } + } + }) + + return v.special +} + +func (v *Vocabulary) Merge(left, right string) int { + v.mergeOnce.Do(func() { + v.merge = make(map[string]int32, len(v.Merges)) + for i, merge := range v.Merges { + v.merge[merge] = int32(i) + } + }) + + if id, ok := v.merge[left+" "+right]; ok { + return int(id) + } + + return -1 +} diff --git a/parser/parser.go b/parser/parser.go index a14ac5ff4..96eae9c04 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -39,7 +39,17 @@ func (f Modelfile) String() string { return sb.String() } -var deprecatedParameters = []string{"penalize_newline"} +var deprecatedParameters = []string{ + "penalize_newline", + "low_vram", + "f16_kv", + "logits_all", + "vocab_only", + "use_mlock", + "mirostat", + "mirostat_tau", + "mirostat_eta", +} // CreateRequest creates a new *api.CreateRequest from an existing Modelfile func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) { @@ -139,10 +149,28 @@ func fileDigestMap(path string) (map[string]string, error) { var files []string if fi.IsDir() { - files, err = filesForModel(path) + fs, err := filesForModel(path) if err != nil { return nil, err } + + for _, f := range fs { + f, err := filepath.EvalSymlinks(f) + if err != nil { + return nil, err + } + + rel, err := filepath.Rel(path, f) + if err != nil { + return nil, err + } + + if !filepath.IsLocal(rel) { + return nil, fmt.Errorf("insecure path: %s", rel) + } + + files = append(files, f) + } } else { files = []string{path} } @@ -215,11 +243,11 @@ func filesForModel(path string) ([]string, error) { return nil, err } - for _, safetensor := range matches { - if ct, err := detectContentType(safetensor); err != nil { + for _, match := range matches { + if ct, err := detectContentType(match); err != nil { return nil, err } else if ct != contentType { - return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor) + return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, match) } } diff --git a/parser/parser_test.go b/parser/parser_test.go index 097c058fb..7d5a808ba 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) { "num_gqa 1": {"num_gqa", "1"}, "num_gpu 1": {"num_gpu", "1"}, "main_gpu 1": {"main_gpu", "1"}, - "low_vram true": {"low_vram", "true"}, - "logits_all true": {"logits_all", "true"}, - "vocab_only true": {"vocab_only", "true"}, "use_mmap true": {"use_mmap", "true"}, - "use_mlock true": {"use_mlock", "true"}, "num_thread 1": {"num_thread", "1"}, "num_keep 1": {"num_keep", "1"}, "seed 1": {"seed", "1"}, @@ -496,9 +492,6 @@ func TestParseFileParameters(t *testing.T) { "repeat_penalty 1.0": {"repeat_penalty", "1.0"}, "presence_penalty
1.0": {"presence_penalty", "1.0"}, "frequency_penalty 1.0": {"frequency_penalty", "1.0"}, - "mirostat 1": {"mirostat", "1"}, - "mirostat_tau 1.0": {"mirostat_tau", "1.0"}, - "mirostat_eta 1.0": {"mirostat_eta", "1.0"}, "penalize_newline true": {"penalize_newline", "true"}, "stop ### User:": {"stop", "### User:"}, "stop ### User: ": {"stop", "### User:"}, @@ -769,7 +762,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) { return fmt.Sprintf("sha256:%x", h.Sum(nil)), n } -func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) { +func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) { t.Helper() f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf") diff --git a/progress/bar.go b/progress/bar.go index 410b6e23f..f3d21a8fd 100644 --- a/progress/bar.go +++ b/progress/bar.go @@ -64,7 +64,7 @@ func formatDuration(d time.Duration) string { func (b *Bar) String() string { termWidth, _, err := term.GetSize(int(os.Stderr.Fd())) if err != nil { - termWidth = 80 + termWidth = defaultTermWidth } var pre strings.Builder diff --git a/progress/progress.go b/progress/progress.go index 0cd0ea1f9..9f54275ec 100644 --- a/progress/progress.go +++ b/progress/progress.go @@ -4,8 +4,16 @@ import ( "bufio" "fmt" "io" + "os" "sync" "time" + + "golang.org/x/term" +) + +const ( + defaultTermWidth = 80 + defaultTermHeight = 24 ) type State interface { @@ -83,6 +91,11 @@ func (p *Progress) Add(key string, state State) { } func (p *Progress) render() { + _, termHeight, err := term.GetSize(int(os.Stderr.Fd())) + if err != nil { + termHeight = defaultTermHeight + } + p.mu.Lock() defer p.mu.Unlock() @@ -102,8 +115,9 @@ func (p *Progress) render() { fmt.Fprint(p.w, "\033[1G") // render progress lines - for i, state := range p.states { - fmt.Fprint(p.w, state.String(), "\033[K") + maxHeight := min(len(p.states), termHeight) + for i := len(p.states) - maxHeight; i < len(p.states); i++ { + fmt.Fprint(p.w, p.states[i].String(), "\033[K") if i < len(p.states)-1 { fmt.Fprint(p.w, "\n") } diff --git a/runner/llamarunner/cache.go b/runner/llamarunner/cache.go index 2e55b09dc..2e273e69c 100644 --- a/runner/llamarunner/cache.go +++ b/runner/llamarunner/cache.go @@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt), "used", numPast, "remaining", len(prompt)-numPast) + slot.Inputs = prompt[:numPast] prompt = prompt[numPast:] - slot.Inputs = slot.Inputs[:numPast] return slot, prompt, nil } diff --git a/runner/llamarunner/image.go b/runner/llamarunner/image.go index e7e30a4d8..1d0c1a4f5 100644 --- a/runner/llamarunner/image.go +++ b/runner/llamarunner/image.go @@ -5,7 +5,6 @@ import ( "fmt" "hash/maphash" "log/slog" - "slices" "sync" "time" @@ -18,8 +17,7 @@ type ImageContext struct { // mu is required to be held when generating embeddings or accessing the cache mu sync.Mutex - clip *llama.ClipContext - mllama *llama.MllamaContext + clip *llama.ClipContext // cache of images to embeddings images []imageCache @@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte var c ImageContext if arch == "clip" { c.clip, err = llama.NewClipContext(llamaContext, modelPath) - } else if arch == "mllama" { - c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath) } else { return nil, fmt.Errorf("unknown vision model architecture: %s", arch) } @@ -58,12 +54,9 @@ func (c 
*ImageContext) Free(modelPath string) { if c.clip != nil { c.clip.Free() } - if c.mllama != nil { - c.mllama.Free() - } } -func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) { +func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) { if c == nil { return nil, nil } @@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect embed, err := c.findImage(hash) if err != nil { - if c.mllama != nil { - embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId) - if err != nil { - return nil, err - } - } else if c.clip != nil { + if c.clip != nil { embed, err = c.clip.NewEmbed(llamaContext, data) if err != nil { return nil, err @@ -105,33 +93,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int { return 0 } - // Mllama maps an image to 1 embedding token (llava creates many tokens) - // and doesn't support more than a single image per request. - // The embeddings are large (100 MB), so allocating a big batch can fail - // on some systems - if c.mllama != nil { - return 1 - } - return configuredBatchSize } func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int { - if c != nil && c.mllama != nil { - return c.mllama.EmbedSize(llamaContext) - } else { - return llamaContext.Model().NEmbd() - } -} - -func (c *ImageContext) NeedCrossAttention(inputs ...input) bool { - if c == nil || c.mllama == nil { - return false - } - - return slices.ContainsFunc(inputs, func(input input) bool { - return input.embed != nil - }) + return llamaContext.Model().NEmbd() } type imageCache struct { diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index d8169be40..7aa9b96a2 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -11,7 +11,6 @@ import ( "net" "net/http" "os" - "path/filepath" "regexp" "runtime" "strconv" @@ -23,8 +22,10 @@ import ( "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llama" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/runner/common" ) @@ -56,10 +57,6 @@ type Sequence struct { // input cache being used by this sequence cache *InputCacheSlot - // does this sequence require cross-attention layers to be processed? 
- if we have seen - // an image for certain multi-modal models - crossAttention bool - // channel to send responses over responses chan string @@ -204,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error) return nil, fmt.Errorf("invalid image index: %d", n) } - embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID) + embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data) if err != nil { return nil, err } @@ -367,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) defer s.mu.Unlock() var batch *llama.Batch - crossAttention := false seqIdx := s.nextSeq - 1 for range s.seqs { @@ -415,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) batch = tokenBatch } else { batch = embedBatch - seq.crossAttention = s.image.NeedCrossAttention(input) } - } else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention { + } else if embedding != batch.IsEmbedding() { s.nextSeq = seqIdx break } @@ -426,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) break } - crossAttention = seq.crossAttention batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id) seq.pendingInputs = append(seq.pendingInputs, input) seq.iBatch = batch.NumTokens() - 1 @@ -439,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) return nil } - s.lc.SetCrossAttention(crossAttention) - err := s.lc.Decode(batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) } - if crossAttention { - // synchronize state to ensure the cross attention batch is complete. - // needed specifically for multi-GPU systems otherwise an inflight - // task may be incorrectly invalidated causing a crash - s.lc.Synchronize() - } - for i, seq := range s.seqs { if seq == nil { continue @@ -583,9 +568,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { PenaltyRepeat: req.Options.RepeatPenalty, PenaltyFreq: req.Options.FrequencyPenalty, PenaltyPresent: req.Options.PresencePenalty, - Mirostat: req.Options.Mirostat, - MirostatTau: req.Options.MirostatTau, - MirostatEta: req.Options.MirostatEta, Seed: uint32(req.Options.Seed), Grammar: req.Grammar, } @@ -624,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...) 
- s.seqs[i] = seq s.cond.Signal() found = true @@ -683,8 +663,6 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - slog.Debug("embedding request", "content", req.Content) - seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true}) if err != nil { http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError) @@ -818,9 +796,8 @@ func Execute(args []string) error { kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)") port := fs.Int("port", 8080, "Port to expose the server on") threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") - verbose := fs.Bool("verbose", false, "verbose output (default: disabled)") + _ = fs.Bool("verbose", false, "verbose output (default: disabled)") noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") - mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing") tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") @@ -834,22 +811,7 @@ func Execute(args []string) error { if err := fs.Parse(args); err != nil { return err } - level := slog.LevelInfo - if *verbose { - level = slog.LevelDebug - } - handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - return attr - }, - }) - slog.SetDefault(slog.New(handler)) + slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.Info("starting go runner") llama.BackendInit() @@ -876,7 +838,6 @@ func Execute(args []string) error { NumGpuLayers: *nGpuLayers, MainGpu: *mainGpu, UseMmap: !*noMmap && lpaths.String() == "", - UseMlock: *mlock, TensorSplit: tensorSplitFloats, Progress: func(progress float32) { server.progress = progress diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 01f435e4b..43880a41b 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt), "used", numPast, "remaining", int32(len(prompt))-numPast) + slot.Inputs = prompt[:numPast] prompt = prompt[numPast:] - slot.Inputs = slot.Inputs[:numPast] return slot, prompt, nil } @@ -284,7 +284,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error { copy(newInputs[numKeep:], slot.Inputs[numKeep+discard:]) // Reset the cache - _ = c.cache.Remove(slot.Id, 0, -1) + _ = c.cache.Remove(slot.Id, 0, math.MaxInt32) slot.Inputs = []input.Input{} // Return error with inputs that need to be reprocessed diff --git a/runner/ollamarunner/cache_test.go b/runner/ollamarunner/cache_test.go index 062b654cf..6897b5e46 100644 --- a/runner/ollamarunner/cache_test.go +++ b/runner/ollamarunner/cache_test.go @@ -3,7 +3,6 @@ package ollamarunner import ( "errors" "fmt" - "image" "testing" "time" @@ -12,10 +11,6 @@ import ( ) func TestCountCommon(t *testing.T) { - imgA := 
image.NewRGBA(image.Rect(0, 0, 100, 100)) - imgB := image.NewRGBA(image.Rect(0, 0, 50, 50)) - imgC := image.NewRGBA(image.Rect(50, 50, 100, 100)) - tests := []struct { name string t1 []input.Input @@ -36,20 +31,20 @@ func TestCountCommon(t *testing.T) { }, { name: "Image Prefix", - t1: []input.Input{{Multimodal: imgA, MultimodalHash: 1}}, - t2: []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}}, + t1: []input.Input{{MultimodalHash: 1}}, + t2: []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}}, expected: 1, }, { name: "Mixed", - t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}}, - t2: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}}, + t1: []input.Input{{Token: 1}, {MultimodalHash: 1}}, + t2: []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}}, expected: 2, }, { name: "Mixed, Same Length", - t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}}, - t2: []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}}, + t1: []input.Input{{Token: 1}, {MultimodalHash: 1}}, + t2: []input.Input{{Token: 1}, {MultimodalHash: 2}}, expected: 1, }, { diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go new file mode 100644 index 000000000..d78612fee --- /dev/null +++ b/runner/ollamarunner/multimodal.go @@ -0,0 +1,116 @@ +package ollamarunner + +import ( + "errors" + + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model/input" +) + +// Tensors can't be used across multiple compute graphs. This is a problem +// if a single embedding is split across batches using views since all of +// the views will have the same source tensor. We also don't want to +// recompute the entire embedding for each batch. +// +// To avoid this, we compute all of the tensors for the embedding on the +// first use and then store the result in system memory. When we need +// additional tensors, we recreate them from the stored data. + +// multimodalEntry represents the embeddings of a single object (such +// as an image). +type multimodalEntry struct { + // mm is the original set of tensors created by EncodeMultimodal + mm []input.Multimodal + + // data is the computed result of mm. 
Nil if not yet computed + data [][]float32 +} + +// multimodalStore maps from an individual tensor (of which there +// may be many in a single multimodal object) to its parent embedding +type multimodalStore map[ml.Tensor]*multimodalEntry + +func newMultimodalStore() multimodalStore { + return make(multimodalStore) +} + +// addMultimodal stores an embedding for later use in a compute graph +func (m multimodalStore) addMultimodal(embedding []input.Multimodal) { + entry := &multimodalEntry{mm: embedding} + + for _, e := range embedding { + if e.Tensor != nil { + m[e.Tensor] = entry + } + } +} + +// getMultimodal takes a source set of tensors (which may contain a whole or +// parts of one or more images) and returns the equivalent that can be used in +// the current context +func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) { + out := make([]input.Multimodal, len(in)) + for i := range out { + if in[i].Tensor != nil { + var err error + out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve) + if err != nil { + return nil, err + } + } + + out[i].Data = in[i].Data + } + + return out, nil +} + +func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) { + entry := m[in] + + if entry.data == nil { + computeCtx := backend.NewContext() + defer computeCtx.Close() + + var tensors []ml.Tensor + for _, t := range entry.mm { + if t.Tensor != nil { + tensors = append(tensors, t.Tensor) + } + } + + if len(tensors) == 0 { + return nil, nil + } + + computeCtx.Forward(tensors...) + entry.data = make([][]float32, len(entry.mm)) + + if !reserve { + computeCtx.Compute(tensors...) + + for i, t := range entry.mm { + if t.Tensor != nil { + entry.data[i] = t.Tensor.Floats() + } + } + } else { + err := computeCtx.Reserve() + if err != nil { + return nil, err + } + } + } + + for i, t := range entry.mm { + if in == t.Tensor { + if !reserve { + return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...) 
+ } else { + return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil + } + } + } + + return nil, errors.New("multimodal tensor not found") +} diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 7ca6dc8c1..cd42d4341 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -1,18 +1,19 @@ package ollamarunner import ( + "bytes" "context" "encoding/json" "errors" "flag" "fmt" "hash/maphash" + "image" "log" "log/slog" "net" "net/http" "os" - "path/filepath" "regexp" "runtime" "strconv" @@ -21,10 +22,13 @@ import ( "time" "unicode/utf8" + "golang.org/x/image/bmp" "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" @@ -34,14 +38,13 @@ import ( _ "github.com/ollama/ollama/model/models" ) -type contextList struct { - list []ml.Context -} - type Sequence struct { // ctxs are used for allocating tensors that last the lifetime of the sequence, such as // multimodal embeddings - ctxs *contextList + ctxs []ml.Context + + // mmStore holds multimodal embeddings to manage memory and enable splitting across batches + mmStore multimodalStore // batch index iBatch int @@ -104,7 +107,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe startTime := time.Now() - inputs, ctxs, err := s.inputs(prompt, images) + inputs, ctxs, mmStore, err := s.inputs(prompt, images) if err != nil { return nil, fmt.Errorf("failed to process inputs: %w", err) } else if len(inputs) == 0 { @@ -159,6 +162,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe return &Sequence{ ctxs: ctxs, + mmStore: mmStore, inputs: inputs, numPromptInputs: len(inputs), startProcessingTime: startTime, @@ -177,8 +181,11 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe // inputs processes the prompt and images into a list of inputs // by splitting the prompt on [img-] tags, tokenizing text and // decoding images -func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *contextList, error) { +func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) { var inputs []input.Input + var ctxs []ml.Context + var mmStore multimodalStore + var parts []string var matches [][]string @@ -188,23 +195,17 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, * re := regexp.MustCompile(`\[img-(\d+)\]`) parts = re.Split(prompt, -1) matches = re.FindAllStringSubmatch(prompt, -1) + mmStore = newMultimodalStore() } else { parts = []string{prompt} } - var contexts contextList - runtime.AddCleanup(&contexts, func(ctxs []ml.Context) { - for _, ctx := range ctxs { - ctx.Close() - } - }, contexts.list) - postTokenize := false for i, part := range parts { // text - tokenize tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0) if err != nil { - return nil, nil, err + return nil, nil, nil, err } for _, t := range tokens { @@ -224,20 +225,23 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, * } if imageIndex < 0 { - return nil, nil, fmt.Errorf("invalid image index: %d", n) + return nil, nil, nil, fmt.Errorf("invalid image index: %d", n) } ctx := s.model.Backend().NewContext() - contexts.list = append(contexts.list, ctx) + runtime.SetFinalizer(ctx, func(c
ml.Context) { c.Close() }) + ctxs = append(ctxs, ctx) imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data) if err != nil { - return nil, nil, err + return nil, nil, nil, err } s.multimodalHash.Reset() _, _ = s.multimodalHash.Write(images[imageIndex].Data) imageHash := s.multimodalHash.Sum64() + mmStore.addMultimodal(imageEmbeddings) + inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash}) postTokenize = true } @@ -247,11 +251,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, * var err error inputs, err = multimodalProcessor.PostTokenize(inputs) if err != nil { - return nil, nil, err + return nil, nil, nil, err } } - return inputs, &contexts, nil + return inputs, ctxs, mmStore, nil } type Server struct { @@ -370,6 +374,9 @@ func (s *Server) processBatch() error { } defer s.mu.Unlock() + ctx := s.model.Backend().NewContext() + defer ctx.Close() + var batchInputs []int32 var batch input.Batch @@ -440,7 +447,11 @@ func (s *Server) processBatch() error { batchInputs = append(batchInputs, inp.Token) if inp.Multimodal != nil { - batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal}) + mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false) + if err != nil { + return err + } + batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm}) } batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs))) @@ -466,9 +477,6 @@ func (s *Server) processBatch() error { return nil } - ctx := s.model.Backend().NewContext() - defer ctx.Close() - modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) @@ -723,18 +731,75 @@ func (m *multiLPath) String() string { return strings.Join(*m, ", ") } -// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded -// to the GPU -/*func (s *Server) reserveWorstCaseGraph() error { +func (s *Server) reserveWorstCaseGraph() error { ctx := s.model.Backend().NewContext() defer ctx.Close() + var err error + inputs := make([]input.Input, s.batchSize) + mmStore := newMultimodalStore() + + // Multimodal strategy: + // - Encode a 2048x2048 image. This assumes that a single image of this + // size is sufficient to trigger the worst case. This is currently true + // because for existing models, only a single image fits in a batch. + // - Add the embedding to a full batch of tokens - this is necessary because + // the model may be looking for non-image data, such as <image> tags. + // - Run PostTokenize to execute any transformations between generated + // embeddings and what the forward pass expects. + // - The result may now be larger than a batch (images may not fit in a + // single batch), so trim based on what will fit and must be grouped together. + // - Fill out the rest of the space with text tokens. 
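
To make the deferred-compute flow above concrete, here is a minimal sketch (not part of the patch) of one pass through the multimodalStore lifecycle; processor, backend, seqCtx, and batchCtx are hypothetical stand-ins for the corresponding Server and Sequence fields:

// multimodalLifecycle is an illustrative sketch only: encode an image once,
// register its tensors, then materialize them in whichever batch needs them.
func multimodalLifecycle(processor model.MultimodalProcessor, backend ml.Backend, seqCtx, batchCtx ml.Context, imageBytes []byte) ([]input.Multimodal, error) {
	store := newMultimodalStore()

	// Prompt processing: build the vision graph for this image and remember
	// which tensors belong to it. Nothing is computed yet.
	embeddings, err := processor.EncodeMultimodal(seqCtx, imageBytes)
	if err != nil {
		return nil, err
	}
	store.addMultimodal(embeddings)

	// Batch construction: materialize the embeddings in the batch's context.
	// reserve=false runs the graph and copies out real values; reserve=true
	// (used by reserveWorstCaseGraph) only reserves memory and returns empty
	// tensors of the right shapes.
	return store.getMultimodal(backend, batchCtx, embeddings, false)
}
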
+ if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok { + mmCtx := s.model.Backend().NewContext() + defer mmCtx.Close() + + img := image.NewGray(image.Rect(0, 0, 2048, 2048)) + var buf bytes.Buffer + bmp.Encode(&buf, img) + + if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil { + mmStore.addMultimodal(inputs[0].Multimodal) + + inputs, err = multimodalProcessor.PostTokenize(inputs) + if err != nil { + return err + } + + for i, inp := range inputs { + minBatch := 1 + inp.SameBatch + if minBatch > s.batchSize { + inputs = inputs[i:min(i+minBatch, len(inputs))] + break + } else if i+minBatch > s.batchSize { + inputs = inputs[:i] + break + } + } + + if len(inputs) < s.batchSize { + newInputs := make([]input.Input, s.batchSize) + copy(newInputs, inputs) + inputs = newInputs + } + } + } + var batch input.Batch - inputs := make([]int32, s.batchSize) + batchInputs := make([]int32, len(inputs)) batch.Positions = make([]int32, len(inputs)) batch.Sequences = make([]int, len(inputs)) - for i := range inputs { + for i, inp := range inputs { + batchInputs[i] = inp.Token + if inp.Multimodal != nil { + mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true) + if err != nil { + return err + } + batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm}) + } + batch.Positions[i] = int32(i) } @@ -743,8 +808,7 @@ func (m *multiLPath) String() string { batch.Outputs[i] = int32(i) } - var err error - batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs)) + batch.Inputs, err = ctx.Input().FromIntSlice(batchInputs, len(batchInputs)) if err != nil { return err } @@ -768,7 +832,7 @@ func (m *multiLPath) String() string { } return nil -}*/ +} func (s *Server) loadModel( ctx context.Context, @@ -805,10 +869,10 @@ func (s *Server) loadModel( s.seqs = make([]*Sequence, s.parallel) s.seqsSem = semaphore.NewWeighted(int64(s.parallel)) - /*err = s.reserveWorstCaseGraph() + err = s.reserveWorstCaseGraph() if err != nil { panic(err) - }*/ + } s.status = llm.ServerStatusReady s.ready.Done() @@ -826,9 +890,8 @@ func Execute(args []string) error { kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)") port := fs.Int("port", 8080, "Port to expose the server on") threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") - verbose := fs.Bool("verbose", false, "verbose output (default: disabled)") + _ = fs.Bool("verbose", false, "verbose output (default: disabled)") _ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") - _ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing") tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") @@ -842,22 +905,7 @@ func Execute(args []string) error { if err := fs.Parse(args); err != nil { return err } - level := slog.LevelInfo - if *verbose { - level = slog.LevelDebug - } - handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - return attr - }, - }) - slog.SetDefault(slog.New(handler)) 
+ slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.Info("starting ollama engine") server := &Server{ diff --git a/sample/samplers.go b/sample/samplers.go index f0846c8dd..d395650d9 100644 --- a/sample/samplers.go +++ b/sample/samplers.go @@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa vocabIds[i] = uint32(i) } - grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)}) + grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS) if grammar == nil { return nil, errors.New("sample: failed to initialize grammar") } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 60485df85..e4c0b3d93 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -121,7 +121,7 @@ function buildOllama() { if ($env:HIP_PATH) { write-host "Building ROCm backend libraries" if (-Not (get-command -ErrorAction silent ninja)) { - $NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe) | split-path -parent + $NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe).Directory.FullName $env:PATH="$NINJA_DIR;$env:PATH" } $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe" diff --git a/server/create.go b/server/create.go index 50e669db0..68e003dfd 100644 --- a/server/create.go +++ b/server/create.go @@ -15,6 +15,7 @@ import ( "path/filepath" "slices" "strings" + "sync/atomic" "github.com/gin-gonic/gin" @@ -23,7 +24,6 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/llama" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is } defer bin.Close() - f, _, err := ggml.Decode(bin, 1024) + f, _, err := ggml.Decode(bin, -1) if err != nil { return nil, err } @@ -425,9 +425,14 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) { ft := layer.GGML.KV().FileType() - fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)}) - - want, err := ggml.ParseFileType(quantizeType) + var doneBytes atomic.Uint64 + totalBytes := uint64(layer.Size) - layer.GGML.Tensors().Offset + fnWrap := func(n uint64) { + done := doneBytes.Add(n) + progress := float32(done) / float32(totalBytes) + fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))}) + } + ftype, err := ggml.ParseFileType(quantizeType) if err != nil { return nil, err } @@ -436,6 +441,11 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr if err != nil { return nil, err } + fp, err := os.Open(blob) + if err != nil { + return nil, err + } + defer fp.Close() temp, err := os.CreateTemp(filepath.Dir(blob), quantizeType) if err != nil { @@ -444,15 +454,15 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr defer temp.Close() defer os.Remove(temp.Name()) - if err := llama.Quantize(blob, temp.Name(), uint32(want)); err != nil { + if err := 
quantize(fp, temp, layer.GGML, ftype, fnWrap); err != nil { return nil, err } - + temp.Seek(0, io.SeekStart) + fn(api.ProgressResponse{Status: "verifying conversion"}) newLayer, err := NewLayer(temp, layer.MediaType) if err != nil { return nil, err } - if _, err := temp.Seek(0, io.SeekStart); err != nil { return nil, err } @@ -462,7 +472,6 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err)) return nil, err } - return &layerGGML{newLayer, f}, nil } diff --git a/server/images.go b/server/images.go index be629f4cb..352f10f2b 100644 --- a/server/images.go +++ b/server/images.go @@ -106,6 +106,11 @@ func (m *Model) Capabilities() []model.Capability { capabilities = append(capabilities, model.CapabilityInsert) } + // Check for vision capability in projector-based models + if len(m.ProjectorPaths) > 0 { + capabilities = append(capabilities, model.CapabilityVision) + } + return capabilities } diff --git a/server/images_test.go b/server/images_test.go index 22e5b7e6a..363b298e1 100644 --- a/server/images_test.go +++ b/server/images_test.go @@ -3,6 +3,7 @@ package server import ( "bytes" "encoding/binary" + "errors" "os" "path/filepath" "strings" @@ -91,11 +92,7 @@ func createMockGGUFData(architecture string, vision bool) []byte { func TestModelCapabilities(t *testing.T) { // Create a temporary directory for test files - tempDir, err := os.MkdirTemp("", "model_capabilities_test") - if err != nil { - t.Fatalf("Failed to create temp directory: %v", err) - } - defer os.RemoveAll(tempDir) + tempDir := t.TempDir() // Create different types of mock model files completionModelPath := filepath.Join(tempDir, "model.bin") @@ -104,21 +101,13 @@ func TestModelCapabilities(t *testing.T) { // Create a simple model file for tests that don't depend on GGUF content simpleModelPath := filepath.Join(tempDir, "simple_model.bin") - err = os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644) - if err != nil { - t.Fatalf("Failed to create completion model file: %v", err) - } - err = os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644) - if err != nil { - t.Fatalf("Failed to create completion model file: %v", err) - } - err = os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644) - if err != nil { - t.Fatalf("Failed to create embedding model file: %v", err) - } - err = os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644) - if err != nil { - t.Fatalf("Failed to create simple model file: %v", err) + if err := errors.Join( + os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644), + os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), + os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), + os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), + ); err != nil { + t.Fatalf("Failed to create model files: %v", err) } toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") @@ -236,27 +225,18 @@ func TestModelCapabilities(t *testing.T) { func TestModelCheckCapabilities(t *testing.T) { // Create a temporary directory for test files - tempDir, err := os.MkdirTemp("", "model_check_capabilities_test") - if err != nil { - t.Fatalf("Failed to create temp directory: %v", err) - } - defer os.RemoveAll(tempDir) + tempDir := t.TempDir() visionModelPath := filepath.Join(tempDir, "vision_model.bin") simpleModelPath := 
filepath.Join(tempDir, "model.bin") embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") - err = os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644) - if err != nil { - t.Fatalf("Failed to create simple model file: %v", err) - } - err = os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644) - if err != nil { - t.Fatalf("Failed to create vision model file: %v", err) - } - err = os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644) - if err != nil { - t.Fatalf("Failed to create embedding model file: %v", err) + if err := errors.Join( + os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), + os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), + os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), + ); err != nil { + t.Fatalf("Failed to create model files: %v", err) } toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") diff --git a/server/internal/cmd/opp/internal/safetensors/safetensors.go b/server/internal/cmd/opp/internal/safetensors/safetensors.go deleted file mode 100644 index 7a45b91df..000000000 --- a/server/internal/cmd/opp/internal/safetensors/safetensors.go +++ /dev/null @@ -1,224 +0,0 @@ -// safetensors provides a reader for the safetensor directories and files. -package safetensors - -import ( - "encoding/json" - "fmt" - "io" - "io/fs" - "iter" - "slices" - "strconv" - "strings" -) - -// Tensor represents a single tensor in a safetensors file. -// -// It's zero value is not valid. Use [Model.Tensors] to get valid tensors. -// -// It is not safe for use across multiple goroutines. -type Tensor struct { - name string - dataType string - shape []int64 - - fsys fs.FS - fname string // entry name in fsys - offset int64 - size int64 -} - -type Model struct { - fsys fs.FS -} - -func Read(fsys fs.FS) (*Model, error) { - return &Model{fsys: fsys}, nil -} - -func (m *Model) Tensors() iter.Seq2[*Tensor, error] { - return func(yield func(*Tensor, error) bool) { - entries, err := fs.Glob(m.fsys, "*.safetensors") - if err != nil { - yield(nil, err) - return - } - for _, e := range entries { - tt, err := m.readTensors(e) - if err != nil { - yield(nil, err) - return - } - for _, t := range tt { - if !yield(t, nil) { - return - } - } - } - } -} - -func (m *Model) readTensors(fname string) ([]*Tensor, error) { - f, err := m.fsys.Open(fname) - if err != nil { - return nil, err - } - defer f.Close() - - finfo, err := f.Stat() - if err != nil { - return nil, err - } - - headerSize, err := readInt64(f) - if err != nil { - return nil, err - } - - data := make([]byte, headerSize) - _, err = io.ReadFull(f, data) - if err != nil { - return nil, err - } - - var raws map[string]json.RawMessage - if err := json.Unmarshal(data, &raws); err != nil { - return nil, err - } - - endOfHeader := 8 + headerSize // 8 bytes for header size plus the header itself - - // TODO(bmizerany): do something with metadata? This could be another - // header read if needed. We also need to figure out if the metadata is - // present in only one .safetensors file or if each file may have their - // own and if it needs to follow each tensor. Currently, I (bmizerany) - // am only seeing them show up with one entry for file type which is - // always "pt". - - tt := make([]*Tensor, 0, len(raws)) - for name, raw := range raws { - if name == "__metadata__" { - // TODO(bmizerany): do something with metadata? 
- continue - } - var v struct { - DataType string `json:"dtype"` - Shape []int64 `json:"shape"` - Offsets []int64 `json:"data_offsets"` - } - if err := json.Unmarshal(raw, &v); err != nil { - return nil, fmt.Errorf("error unmarshalling layer %q: %w", name, err) - } - if len(v.Offsets) != 2 { - return nil, fmt.Errorf("invalid offsets for %q: %v", name, v.Offsets) - } - - // TODO(bmizerany): after collecting, validate all offests make - // tensors contiguous? - begin := endOfHeader + v.Offsets[0] - end := endOfHeader + v.Offsets[1] - if err := checkBeginEnd(finfo.Size(), begin, end); err != nil { - return nil, err - } - - // TODO(bmizerany): just yield.. don't be silly and make a slice :) - tt = append(tt, &Tensor{ - name: name, - dataType: v.DataType, - shape: v.Shape, - fsys: m.fsys, - fname: fname, - offset: begin, - size: end - begin, - }) - } - return tt, nil -} - -func checkBeginEnd(size, begin, end int64) error { - if begin < 0 { - return fmt.Errorf("begin must not be negative: %d", begin) - } - if end < 0 { - return fmt.Errorf("end must not be negative: %d", end) - } - if end < begin { - return fmt.Errorf("end must be >= begin: %d < %d", end, begin) - } - if end > size { - return fmt.Errorf("end must be <= size: %d > %d", end, size) - } - return nil -} - -func readInt64(r io.Reader) (int64, error) { - var v uint64 - var buf [8]byte - if _, err := io.ReadFull(r, buf[:]); err != nil { - return 0, err - } - for i := range buf { - v |= uint64(buf[i]) << (8 * i) - } - return int64(v), nil -} - -type Shape []int64 - -func (s Shape) String() string { - var b strings.Builder - b.WriteByte('[') - for i, v := range s { - if i > 0 { - b.WriteByte(',') - } - b.WriteString(strconv.FormatInt(v, 10)) - } - b.WriteByte(']') - return b.String() -} - -func (t *Tensor) Name() string { return t.name } -func (t *Tensor) DataType() string { return t.dataType } -func (t *Tensor) Size() int64 { return t.size } -func (t *Tensor) Shape() Shape { return slices.Clone(t.shape) } - -func (t *Tensor) Reader() (io.ReadCloser, error) { - f, err := t.fsys.Open(t.fname) - if err != nil { - return nil, err - } - r := newSectionReader(f, t.offset, t.size) - rc := struct { - io.Reader - io.Closer - }{r, f} - return rc, nil -} - -// newSectionReader returns a new io.Reader that reads from r starting at -// offset. It is a convenience function for creating a io.SectionReader when r -// may not be an io.ReaderAt. -// -// If r is already a ReaderAt, it is returned directly, otherwise if r is an -// io.Seeker, a new io.ReaderAt is returned that wraps r after seeking to the -// beginning of the file. -// -// If r is an io.Seeker, -// or slow path. The slow path is used when r does not implement io.ReaderAt, -// in which case it must discard the data it reads. -func newSectionReader(r io.Reader, offset, n int64) io.Reader { - if r, ok := r.(io.ReaderAt); ok { - return io.NewSectionReader(r, offset, n) - } - if r, ok := r.(io.ReadSeeker); ok { - r.Seek(offset, io.SeekStart) - return io.LimitReader(r, n) - } - // Discard to offset and return a limited reader. 
- _, err := io.CopyN(io.Discard, r, offset) - if err != nil { - return nil - } - return io.LimitReader(r, n) -} diff --git a/server/internal/cmd/opp/opp.go b/server/internal/cmd/opp/opp.go deleted file mode 100644 index 6976927c7..000000000 --- a/server/internal/cmd/opp/opp.go +++ /dev/null @@ -1,375 +0,0 @@ -package main - -import ( - "bytes" - "cmp" - "context" - "encoding/json" - "errors" - "flag" - "fmt" - "io" - "log" - "mime" - "net/http" - "os" - "runtime" - "strings" - "sync" - "sync/atomic" - "time" - - "github.com/ollama/ollama/server/internal/cache/blob" - "github.com/ollama/ollama/server/internal/client/ollama" - "github.com/ollama/ollama/server/internal/cmd/opp/internal/safetensors" - "golang.org/x/sync/errgroup" -) - -var stdout io.Writer = os.Stdout - -const usage = `Opp is a tool for pushing and pulling Ollama models. - -Usage: - - opp [flags] - -Commands: - - push Upload a model to the Ollama server. - pull Download a model from the Ollama server. - import Import a model from a local safetensor directory. - -Examples: - - # Pull a model from the Ollama server. - opp pull library/llama3.2:latest - - # Push a model to the Ollama server. - opp push username/my_model:8b - - # Import a model from a local safetensor directory. - opp import /path/to/safetensor - -Envionment Variables: - - OLLAMA_MODELS - The directory where models are pushed and pulled from - (default ~/.ollama/models). -` - -func main() { - flag.Usage = func() { - fmt.Fprint(os.Stderr, usage) - } - flag.Parse() - - ctx := context.Background() - - err := func() error { - switch cmd := flag.Arg(0); cmd { - case "pull": - rc, err := ollama.DefaultRegistry() - if err != nil { - log.Fatal(err) - } - - return cmdPull(ctx, rc) - case "push": - rc, err := ollama.DefaultRegistry() - if err != nil { - log.Fatal(err) - } - return cmdPush(ctx, rc) - case "import": - c, err := ollama.DefaultCache() - if err != nil { - log.Fatal(err) - } - return cmdImport(ctx, c) - default: - if cmd == "" { - flag.Usage() - } else { - fmt.Fprintf(os.Stderr, "unknown command %q\n", cmd) - } - os.Exit(2) - return errors.New("unreachable") - } - }() - if err != nil { - fmt.Fprintf(os.Stderr, "opp: %v\n", err) - os.Exit(1) - } -} - -func cmdPull(ctx context.Context, rc *ollama.Registry) error { - model := flag.Arg(1) - if model == "" { - flag.Usage() - os.Exit(1) - } - - tr := http.DefaultTransport.(*http.Transport).Clone() - // TODO(bmizerany): configure transport? - rc.HTTPClient = &http.Client{Transport: tr} - - var mu sync.Mutex - p := make(map[blob.Digest][2]int64) // digest -> [total, downloaded] - - var pb bytes.Buffer - printProgress := func() { - pb.Reset() - mu.Lock() - for d, s := range p { - // Write progress to a buffer first to avoid blocking - // on stdout while holding the lock. - stamp := time.Now().Format("2006/01/02 15:04:05") - fmt.Fprintf(&pb, "%s %s pulling %d/%d (%.1f%%)\n", stamp, d.Short(), s[1], s[0], 100*float64(s[1])/float64(s[0])) - if s[0] == s[1] { - delete(p, d) - } - } - mu.Unlock() - io.Copy(stdout, &pb) - } - - ctx = ollama.WithTrace(ctx, &ollama.Trace{ - Update: func(l *ollama.Layer, n int64, err error) { - if err != nil && !errors.Is(err, ollama.ErrCached) { - fmt.Fprintf(stdout, "opp: pull %s ! 
%v\n", l.Digest.Short(), err) - return - } - - mu.Lock() - p[l.Digest] = [2]int64{l.Size, n} - mu.Unlock() - }, - }) - - errc := make(chan error) - go func() { - errc <- rc.Pull(ctx, model) - }() - - t := time.NewTicker(time.Second) - defer t.Stop() - for { - select { - case <-t.C: - printProgress() - case err := <-errc: - printProgress() - return err - } - } -} - -func cmdPush(ctx context.Context, rc *ollama.Registry) error { - args := flag.Args()[1:] - flag := flag.NewFlagSet("push", flag.ExitOnError) - flagFrom := flag.String("from", "", "Use the manifest from a model by another name.") - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: opp push \n") - flag.PrintDefaults() - } - flag.Parse(args) - - model := flag.Arg(0) - if model == "" { - return fmt.Errorf("missing model argument") - } - - from := cmp.Or(*flagFrom, model) - m, err := rc.ResolveLocal(from) - if err != nil { - return err - } - - ctx = ollama.WithTrace(ctx, &ollama.Trace{ - Update: func(l *ollama.Layer, n int64, err error) { - switch { - case errors.Is(err, ollama.ErrCached): - fmt.Fprintf(stdout, "opp: uploading %s %d (existed)", l.Digest.Short(), n) - case err != nil: - fmt.Fprintf(stdout, "opp: uploading %s %d ! %v\n", l.Digest.Short(), n, err) - case n == 0: - l := m.Layer(l.Digest) - mt, p, _ := mime.ParseMediaType(l.MediaType) - mt, _ = strings.CutPrefix(mt, "application/vnd.ollama.image.") - switch mt { - case "tensor": - fmt.Fprintf(stdout, "opp: uploading tensor %s %s\n", l.Digest.Short(), p["name"]) - default: - fmt.Fprintf(stdout, "opp: uploading %s %s\n", l.Digest.Short(), l.MediaType) - } - } - }, - }) - - return rc.Push(ctx, model, &ollama.PushParams{ - From: from, - }) -} - -type trackingReader struct { - io.Reader - n *atomic.Int64 -} - -func (r *trackingReader) Read(p []byte) (n int, err error) { - n, err = r.Reader.Read(p) - r.n.Add(int64(n)) - return n, err -} - -func cmdImport(ctx context.Context, c *blob.DiskCache) error { - args := flag.Args()[1:] - flag := flag.NewFlagSet("import", flag.ExitOnError) - flagAs := flag.String("as", "", "Import using the provided name.") - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: opp import \n") - flag.PrintDefaults() - } - flag.Parse(args) - if *flagAs == "" { - return fmt.Errorf("missing -as flag") - } - as := ollama.CompleteName(*flagAs) - - dir := cmp.Or(flag.Arg(0), ".") - fmt.Fprintf(os.Stderr, "Reading %s\n", dir) - - m, err := safetensors.Read(os.DirFS(dir)) - if err != nil { - return err - } - - var total int64 - var tt []*safetensors.Tensor - for t, err := range m.Tensors() { - if err != nil { - return err - } - tt = append(tt, t) - total += t.Size() - } - - var n atomic.Int64 - done := make(chan error) - go func() { - layers := make([]*ollama.Layer, len(tt)) - var g errgroup.Group - g.SetLimit(runtime.GOMAXPROCS(0)) - var ctxErr error - for i, t := range tt { - if ctx.Err() != nil { - // The context may cancel AFTER we exit the - // loop, and so if we use ctx.Err() after the - // loop we may report it as the error that - // broke the loop, when it was not. This can - // manifest as a false-negative, leading the - // user to think their import failed when it - // did not, so capture it if and only if we - // exit the loop because of a ctx.Err() and - // report it. 
- ctxErr = ctx.Err() - break - } - g.Go(func() (err error) { - rc, err := t.Reader() - if err != nil { - return err - } - defer rc.Close() - tr := &trackingReader{rc, &n} - d, err := c.Import(tr, t.Size()) - if err != nil { - return err - } - if err := rc.Close(); err != nil { - return err - } - - layers[i] = &ollama.Layer{ - Digest: d, - Size: t.Size(), - MediaType: mime.FormatMediaType("application/vnd.ollama.image.tensor", map[string]string{ - "name": t.Name(), - "dtype": t.DataType(), - "shape": t.Shape().String(), - }), - } - - return nil - }) - } - - done <- func() error { - if err := errors.Join(g.Wait(), ctxErr); err != nil { - return err - } - m := &ollama.Manifest{Layers: layers} - data, err := json.MarshalIndent(m, "", " ") - if err != nil { - return err - } - d := blob.DigestFromBytes(data) - err = blob.PutBytes(c, d, data) - if err != nil { - return err - } - return c.Link(as, d) - }() - }() - - fmt.Fprintf(stdout, "Importing %d tensors from %s\n", len(tt), dir) - - csiHideCursor(stdout) - defer csiShowCursor(stdout) - - csiSavePos(stdout) - writeProgress := func() { - csiRestorePos(stdout) - nn := n.Load() - fmt.Fprintf(stdout, "Imported %s/%s bytes (%d%%)%s\n", - formatNatural(nn), - formatNatural(total), - nn*100/total, - ansiClearToEndOfLine, - ) - } - - ticker := time.NewTicker(time.Second) - defer ticker.Stop() - for { - select { - case <-ticker.C: - writeProgress() - case err := <-done: - writeProgress() - fmt.Println() - fmt.Println("Successfully imported", as) - return err - } - } -} - -func formatNatural(n int64) string { - switch { - case n < 1024: - return fmt.Sprintf("%d B", n) - case n < 1024*1024: - return fmt.Sprintf("%.1f KB", float64(n)/1024) - case n < 1024*1024*1024: - return fmt.Sprintf("%.1f MB", float64(n)/(1024*1024)) - default: - return fmt.Sprintf("%.1f GB", float64(n)/(1024*1024*1024)) - } -} - -const ansiClearToEndOfLine = "\033[K" - -func csiSavePos(w io.Writer) { fmt.Fprint(w, "\033[s") } -func csiRestorePos(w io.Writer) { fmt.Fprint(w, "\033[u") } -func csiHideCursor(w io.Writer) { fmt.Fprint(w, "\033[?25l") } -func csiShowCursor(w io.Writer) { fmt.Fprint(w, "\033[?25h") } diff --git a/server/internal/internal/backoff/backoff_test.go b/server/internal/internal/backoff/backoff_test.go index 11ace22a8..f474118f0 100644 --- a/server/internal/internal/backoff/backoff_test.go +++ b/server/internal/internal/backoff/backoff_test.go @@ -3,7 +3,6 @@ package backoff import ( - "context" "testing" "testing/synctest" "time" @@ -29,7 +28,7 @@ func TestLoopAllocs(t *testing.T) { } func BenchmarkLoop(b *testing.B) { - ctx := context.Background() + ctx := b.Context() synctest.Run(func() { for n := range Loop(ctx, 100*time.Millisecond) { if n == b.N { diff --git a/server/model.go b/server/model.go index e733fbdb1..2149ff855 100644 --- a/server/model.go +++ b/server/model.go @@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - f, _, err := ggml.Decode(blob, 1024) + f, _, err := ggml.Decode(blob, -1) if err != nil { return nil, err } diff --git a/server/modelpath_test.go b/server/modelpath_test.go index 849e0fa73..96429f958 100644 --- a/server/modelpath_test.go +++ b/server/modelpath_test.go @@ -1,7 +1,6 @@ package server import ( - "os" "path/filepath" "testing" @@ -11,9 +10,7 @@ import ( func TestGetBlobsPath(t *testing.T) { // GetBlobsPath expects an actual directory to exist - dir, err := os.MkdirTemp("", "ollama-test") - require.NoError(t, err) - defer os.RemoveAll(dir) + tempDir := 
t.TempDir() tests := []struct { name string @@ -24,19 +21,19 @@ func TestGetBlobsPath(t *testing.T) { { "empty digest", "", - filepath.Join(dir, "blobs"), + filepath.Join(tempDir, "blobs"), nil, }, { "valid with colon", "sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9", - filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), + filepath.Join(tempDir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), nil, }, { "valid with dash", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9", - filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), + filepath.Join(tempDir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), nil, }, { @@ -60,7 +57,7 @@ func TestGetBlobsPath(t *testing.T) { } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - t.Setenv("OLLAMA_MODELS", dir) + t.Setenv("OLLAMA_MODELS", tempDir) got, err := GetBlobsPath(tc.digest) diff --git a/server/prompt.go b/server/prompt.go index 5b5b958f1..147a02b69 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -3,47 +3,32 @@ package server import ( "bytes" "context" - "encoding/binary" "errors" "fmt" "log/slog" + "slices" "strings" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/template" ) type tokenizeFunc func(context.Context, string) ([]int, error) -var errTooManyImages = errors.New("vision model only supports a single image per message") - // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { var system []api.Message - isMllama := checkMllamaModelFamily(m) - - var imageNumTokens int // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent - if isMllama { - // Our mllama implementation packs all of the embeddings into a single token - imageNumTokens = 1 - } else { - // Clip images are represented as 768 tokens, each an embedding - imageNumTokens = 768 - } + // Clip images are represented as 768 tokens, each an embedding + imageNumTokens := 768 n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n; i >= 0; i-- { - if isMllama && len(msgs[i].Images) > 1 { - return "", nil, errTooManyImages - } - // always include the last message if i == n { continue @@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. 
currMsgIdx := n for cnt, msg := range msgs[currMsgIdx:] { - prefix := "" - imgPrompt := "" + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 { + return "", nil, errors.New("this model only supports one image while more than one image requested") + } + + var prefix string prompt := msg.Content for _, i := range msg.Images { - var imgData llm.ImageData - - if isMllama { - if len(m.ProjectorPaths) == 0 { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } - } else { - data, opts, err := mllama.Preprocess(bytes.NewReader(i)) - if err != nil { - return "", nil, err - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - return "", nil, err - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - return "", nil, fmt.Errorf("missing aspect ratio for image") - } - - imgData = llm.ImageData{ - ID: len(images), - Data: buf.Bytes(), - AspectRatioID: ar, - } - } - imgPrompt = "<|image|>" - } else { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } + imgData := llm.ImageData{ + ID: len(images), + Data: i, } imgTag := fmt.Sprintf("[img-%d]", imgData.ID) @@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. images = append(images, imgData) } - msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt + msgs[currMsgIdx+cnt].Content = prefix + prompt } // truncate any messages that do not fit into the context window @@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. return b.String(), images, nil } - -func checkMllamaModelFamily(m *Model) bool { - for _, arch := range m.Config.ModelFamilies { - if arch == "mllama" { - return true - } - } - return false -} diff --git a/server/prompt_test.go b/server/prompt_test.go index 62aec86a9..fb6c96c0c 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -2,9 +2,6 @@ package server import ( "bytes" - "context" - "image" - "image/png" "testing" "github.com/google/go-cmp/cmp" @@ -15,10 +12,9 @@ import ( func TestChatPrompt(t *testing.T) { type expect struct { - prompt string - images [][]byte - aspectRatioID int - error error + prompt string + images [][]byte + error error } tmpl, err := template.Parse(` @@ -29,28 +25,6 @@ func TestChatPrompt(t *testing.T) { t.Fatal(err) } visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} - mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}} - - createImg := func(width, height int) ([]byte, error) { - img := image.NewRGBA(image.Rect(0, 0, width, height)) - var buf bytes.Buffer - - if err := png.Encode(&buf, img); err != nil { - return nil, err - } - - return buf.Bytes(), nil - } - - imgBuf, err := createImg(5, 5) - if err != nil { - t.Fatal(err) - } - - imgBuf2, err := createImg(6, 6) - if err != nil { - t.Fatal(err) - } cases := []struct { name string @@ -228,97 +202,13 @@ func TestChatPrompt(t *testing.T) { images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")}, }, }, - { - name: "messages with mllama (no images)", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. 
", - }, - }, - { - name: "messages with mllama single prompt", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "multiple messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ", - images: [][]byte{imgBuf, imgBuf2}, - aspectRatioID: 1, - }, - }, - { - name: "earlier image with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "There are four hotdogs."}, - {Role: "user", Content: "Which ones have mustard?"}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "too many images with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. 
And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}}, - }, - expect: expect{ - error: errTooManyImages, - }, - }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { model := tt.model opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}} - prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) + prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) if tt.error == nil && err != nil { t.Fatal(err) } else if tt.error != nil && err != tt.error { @@ -342,10 +232,6 @@ func TestChatPrompt(t *testing.T) { if !bytes.Equal(images[i].Data, tt.images[i]) { t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) } - } else { - if images[i].AspectRatioID != tt.aspectRatioID { - t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID) - } } } }) diff --git a/server/quantization.go b/server/quantization.go new file mode 100644 index 000000000..adfc948ec --- /dev/null +++ b/server/quantization.go @@ -0,0 +1,227 @@ +package server + +import ( + "fmt" + "io" + "log/slog" + "maps" + "os" + "strings" + "unsafe" + + fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml/backend/ggml" +) + +type quantizer struct { + *os.File + offset uint64 + from, to *fsggml.Tensor + progressFn func(n uint64) +} + +func (q quantizer) WriteTo(w io.Writer) (int64, error) { + quantize := q.from.Kind != q.to.Kind + sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size())) + if !quantize { + n, err := io.Copy(w, sr) + q.progressFn(q.from.Size()) + return n, err + } + data, err := io.ReadAll(sr) + if err != nil { + slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err) + return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err) + } + var f32s []float32 + newType := fsggml.TensorType(q.to.Kind) + if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 { + f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements()) + } else { + f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements()) + } + data = ggml.Quantize(newType, f32s, q.from.Shape) + n, err := w.Write(data) + q.progressFn(q.from.Size()) + return int64(n), err +} + +type quantizeState struct { + nAttnV int // Number of attn_*v* weight tensors + nFfnDown int // Number of ffn_down tensors + iAttnV int // Running counter of number of attn_v tensors that have been processed + iFfnDown int // Running counter of number of ffn_down tensors that have been processed + hasOutput bool // used to figure out if a model shares tok_embd with the output weight +} + +func useMoreBits(iLayer, nLayers int) bool { + return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2 +} + +func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType { + // Ported from llama_tensor_get_type, removed unsupported quantization types + nExperts := max(1, kv.Uint("expert_count", 0)) + if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") { + nx := shape[0] + qk_k := newType.BlockSize() + if nx%qk_k != 0 { + newType = fsggml.TensorTypeQ8_0 + } else if newType != fsggml.TensorTypeQ8_0 { + newType = fsggml.TensorTypeQ6_K + } + } else if strings.Contains(name, "attn_v.weight") { + if (ftype == fsggml.FileTypeQ4_K_M) && + useMoreBits(qs.iAttnV, qs.nAttnV) { + newType 
= fsggml.TensorTypeQ6_K + } else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 { + newType = fsggml.TensorTypeQ5_K + } + + // TODO + // if (qs.model.type == LLM_TYPE_70B) { + // // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is + // // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with + // // nearly negligible increase in model size by quantizing this tensor with more bits: + // if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K; + // } + + if nExperts == 8 { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + newType = fsggml.TensorTypeQ8_0 + } + qs.iAttnV++ + } else if strings.Contains(name, "attn_k.weight") { + if nExperts == 8 { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + newType = fsggml.TensorTypeQ8_0 + } + } else if strings.Contains(name, "ffn_down") { + iLayer := qs.iFfnDown + n_layer := qs.nFfnDown + if ftype == fsggml.FileTypeQ4_K_M { + if useMoreBits(iLayer, n_layer) { + newType = fsggml.TensorTypeQ6_K + } + } else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 { + newType = fsggml.TensorTypeQ5_K + } + qs.iFfnDown++ + } else if strings.Contains(name, "attn_output.weight") { + if nExperts == 8 { + if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M { + newType = fsggml.TensorTypeQ5_K + } + } + } else if strings.Contains(name, "attn_qkv.weight") { + if ftype == fsggml.FileTypeQ4_K_M { + newType = fsggml.TensorTypeQ5_K + } + } + + if newType.IsQuantized() { + nx := shape[0] + ny := uint64(1) + if len(shape) > 1 { + ny = shape[1] + } + qk_k := newType.BlockSize() + if nx%qk_k != 0 { + slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. 
Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String())) + newType = fsggml.TensorTypeF16 + } + } + return newType +} + +func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error { + kv := maps.Clone(orig.KV()) + kv["general.file_type"] = newFileType + // kv["general.quantization_version"] = ggml.QuantizationVersion() + qs := &quantizeState{} + // Build up the quantize state so newType can adjust types + layerCount := 0 + for k, l := range orig.Tensors().GroupLayers() { + if strings.HasPrefix(k, "blk.") { + layerCount++ + } + for _, tensor := range l { + if strings.Contains(tensor.Name, "attn_v.weight") || + strings.Contains(tensor.Name, "attn_qkv.weight") || + strings.Contains(tensor.Name, "attn_kv_b.weight") { + qs.nAttnV++ + } else if tensor.Name == "output.weight" { + qs.hasOutput = true + } + } + } + qs.nFfnDown = layerCount + + origTensors := orig.Tensors().Items() + outputTensors := make([]*fsggml.Tensor, len(origTensors)) + for i, tensor := range origTensors { + tensor := tensor + newType := newType(tensor, kv, qs, newFileType) + newTensor := &fsggml.Tensor{ + Name: tensor.Name, + Shape: tensor.Shape, + Kind: uint32(newType), + } + outputTensors[i] = newTensor + outputTensors[i].WriterTo = quantizer{ + File: in, + offset: orig.Tensors().Offset + tensor.Offset, + from: tensor, + to: newTensor, + progressFn: progressFn, + } + } + return fsggml.WriteGGUF(out, kv, outputTensors) +} + +func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType { + defaultType := ftype.ToTensorType() + name := t.Name + quantize := strings.HasSuffix(name, "weight") + + // don't quantize vision stuff + quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v.")) + quantize = quantize && !strings.Contains(name, "mm.") + + // quantize only 2D and 3D tensors (experts) + quantize = quantize && (len(t.Shape) >= 2) + + // do not quantize norm tensors + quantize = quantize && !strings.Contains(name, "_norm.weight") + + // do not quantize expert gating tensors + quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight") + + // do not quantize positional embeddings and token types (BERT) + quantize = quantize && (name != "position_embd.weight") + quantize = quantize && (name != "token_types.weight") + + // do not quantize Mamba's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight") + + // do not quantize RWKV's time_mix_first tensors + quantize = quantize && !strings.Contains(name, "time_mix_first.weight") + quantize = quantize && !strings.Contains(name, "time_mix_w1.weight") + quantize = quantize && !strings.Contains(name, "time_mix_w2.weight") + quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight") + quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight") + quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight") + + // do not quantize relative position bias (T5) + quantize = quantize && !strings.Contains(name, "attn_rel_b.weight") + + newType := fsggml.TensorType(t.Kind) + if quantize { + // get more optimal quantization type based on the tensor shape, layer, etc. 
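+ // Worked example (illustrative): with a Q4_K_M target and 32 layers,
+ // useMoreBits selects layers 0-3 and 28-31 plus every third middle
+ // layer (6, 9, ..., 27), so attn_v and ffn_down tensors counted on
+ // those layers are bumped from Q4_K to Q6_K.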
+ newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype) + if newType != defaultType { + slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType) + } + } + return newType +} diff --git a/server/quantization_test.go b/server/quantization_test.go new file mode 100644 index 000000000..495297df3 --- /dev/null +++ b/server/quantization_test.go @@ -0,0 +1,659 @@ +package server + +import ( + "bytes" + "fmt" + "math" + "os" + "strings" + "testing" + + fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml/backend/ggml" +) + +func TestGetTensorNewType(t *testing.T) { + cases := []struct { + name string + kv map[string]any + qs quantizeState + newType fsggml.TensorType + tensor_name string + shape []uint64 + ftype fsggml.FileType + expected fsggml.TensorType + expectedPanic string + }{ + { + name: "output_unsupported", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "output.weight", + shape: []uint64{100, 100}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeF16, + }, + { + name: "output_Q8", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "output.weight", + shape: []uint64{1024, 1024}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "attn_v.weight_q4_k_m", + qs: quantizeState{ + iAttnV: 2, + nAttnV: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "attn_v.weight_q4_k_s", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_S, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_v.weight_8_expert", + qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ8_0, + }, + { + name: "attn_k.weight_8_expert", + qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_k.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ8_0, + }, + { + name: "ffn_down_q4_k_m", + qs: quantizeState{ + iFfnDown: 1, + nFfnDown: 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ4_0, + }, + { + name: "ffn_down_q4_k_m_6", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "ffn_down_q4_k_s", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_S, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_qkv.weight_q4_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: 
fsggml.TensorTypeQ5_K, + }, + } + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + if tt.expectedPanic != "" { + defer func() { + e := recover() + if !strings.Contains(fmt.Sprintf("%v", e), tt.expectedPanic) { + t.Fatalf("incorrect panic\ngot: %v\nexpected: %s", e, tt.expectedPanic) + } + }() + } else { + defer func() { + e := recover() + if e != nil { + t.Fatalf("hit unexpected panic %v", e) + } + }() + } + ret := getTensorNewType(tt.kv, &tt.qs, tt.newType, tt.tensor_name, tt.shape, tt.ftype) + if ret != tt.expected { + t.Fatalf("incorrect type returned\ngot: %d\nexpected: %d", ret, tt.expected) + } + }) + } +} + +func TestQuantizeModel(t *testing.T) { + cases := []struct { + name string + kv map[string]any + tensors []*fsggml.Tensor + newType string + expectedTensorTypes map[string]fsggml.TensorType + }{ + { + name: "f16_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{512, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{256, 4}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + }, + newType: "Q4_K", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn.weight": fsggml.TensorTypeQ4_K, + "output.weight": fsggml.TensorTypeQ6_K, + }, + }, + { + name: "f32_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn_v.weight", Kind: uint32(fsggml.TensorTypeF32), + Offset: uint64(0), Shape: []uint64{512, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF32), + Offset: uint64(0), Shape: []uint64{512}, + WriterTo: bytes.NewReader(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...)), + }, + }, + newType: "Q4_K", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn_v.weight": fsggml.TensorTypeQ6_K, + "output.weight": fsggml.TensorTypeF32, + }, + }, + { + name: "f16_q8_0", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{32, 16, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{256, 4}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + }, + newType: "Q8_0", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn.weight": fsggml.TensorTypeQ8_0, + "output.weight": fsggml.TensorTypeQ8_0, + }, + }, + } + + 
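+ // The loop below round-trips each case: write a GGUF with the source
+ // tensors, quantize it to newType, decode the result, and verify that
+ // every tensor landed on its expected type.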
for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + f, err := os.CreateTemp(t.TempDir(), tt.name) + if err != nil { + t.Fatal(err.Error()) + } + defer f.Close() + err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) + if err != nil { + t.Fatalf("failed to create initial model: %s", err) + } + fp, err := os.Open(f.Name()) + if err != nil { + t.Fatal(err.Error()) + } + defer fp.Close() + meta, _, err := fsggml.Decode(fp, -1) + if err != nil { + t.Fatal(err.Error()) + } + progressCalled := false + progress := func(n uint64) { + // fmt.Fprintf(os.Stderr, "progress: %f\n", p) + progressCalled = true + } + tmp, err := os.CreateTemp(t.TempDir(), tt.name+".out") + if err != nil { + t.Fatal(err.Error()) + } + defer tmp.Close() + ftype, err := fsggml.ParseFileType(tt.newType) + if err != nil { + t.Fatal(err.Error()) + } + + err = quantize(fp, tmp, meta, ftype, progress) + if err != nil { + t.Fatalf("error during quantize: %s", err) + } + if !progressCalled { + t.Fatalf("progress was not reported") + } + // Now attempt to load it back and make sure types match expected + fpNew, err := os.Open(tmp.Name()) + if err != nil { + t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) + } + defer fpNew.Close() + newMeta, _, err := fsggml.Decode(fpNew, -1) + if err != nil { + t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) + } + tensors := newMeta.Tensors() + for _, l := range tensors.GroupLayers() { + for _, tensor := range l { + if fsggml.TensorType(tensor.Kind) != tt.expectedTensorTypes[tensor.Name] { + t.Fatalf("incorrect output type for %s\ngot:%s\nexpected:%s", tensor.Name, fsggml.TensorType(tensor.Kind), tt.expectedTensorTypes[tensor.Name]) + } + } + } + }) + } +} + +func TestConvertToF32(t *testing.T) { + expected := make([]float32, 256) + for i := range expected { + expected[i] = float32(i) + } + for dtype, data := range quantBytes { + // Skip the no-op + if dtype == fsggml.TensorTypeF32 { + continue + } + t.Run(dtype.String(), func(t *testing.T) { + fp32 := ggml.ConvertToF32(data, uint32(dtype), 256) + similarity := cosineSimilarity(expected, fp32) + if similarity < 0.999 { + t.Fatalf("Results not similar enough: %s %f", dtype.String(), similarity) + } + }) + } +} + +func dotProduct[V float32 | float64](v1, v2 []V) V { + var result V = 0 + for i := range v1 { + result += v1[i] * v2[i] + } + return result +} + +func magnitude[V float32 | float64](v []V) V { + var result V = 0 + for _, val := range v { + result += val * val + } + return V(math.Sqrt(float64(result))) +} + +func cosineSimilarity[V float32 | float64](v1, v2 []V) V { + return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2)) +} + +// Precomputed quantized data - arange 256 +// # For gguf-py supported types +// import gguf +// import numpy as np +// print(repr(gguf.quantize(np.arange(256, dtype=np.float16), gguf.GGMLQuantizationType.Q4_0))) +// +// For types not supported by gguf-py converted via ggml_fp32_to_fp16_row and quantize_XXX +// +// data := make([]byte, 256*2) +// fp32 := make([]float32, 256) +// for i := range 256 { +// fp32[i] = float32(i) +// } +// l := C.quantize_q6_K((*C.float)(&fp32[0]), unsafe.Pointer(&data[0]), 1, 256, nil) +// for i := range data[:int(l)] { +// fmt.Printf("%d, ", data[i]) +// } +var ( + quantBytes = map[fsggml.TensorType][]byte{ + fsggml.TensorTypeQ4_0: { + 192, 195, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, + 21, 21, 21, 4, 4, 224, 199, 36, 36, 36, 36, 19, 19, + 19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 240, 201, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 
2, 2, 2, + 1, 1, 240, 203, 18, 18, 18, 18, 18, 18, 18, 18, 1, + 1, 1, 1, 1, 1, 1, 1, 248, 204, 18, 18, 17, 17, + 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, + 205, 17, 17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 248, 206, 17, 17, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, 207, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, + }, + fsggml.TensorTypeQ4_1: { + 34, 64, 0, 0, 128, 128, 145, 145, 162, 162, 179, 179, 196, + 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 80, 128, 128, + 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, 247, + 247, 34, 64, 0, 84, 128, 128, 145, 145, 162, 162, 179, 179, + 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 86, 128, + 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, + 247, 247, 34, 64, 0, 88, 128, 128, 145, 145, 162, 162, 179, + 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 89, + 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, + 230, 247, 247, 34, 64, 0, 90, 128, 128, 145, 145, 162, 162, + 179, 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, + 91, 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, + 230, 230, 247, 247, + }, + fsggml.TensorTypeQ5_0: { + 192, 191, 1, 0, 0, 0, 128, 127, 127, 110, 110, 93, 93, + 76, 76, 59, 59, 42, 42, 25, 25, 8, 224, 195, 0, 0, + 0, 0, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, + 21, 21, 21, 4, 4, 240, 197, 0, 0, 0, 0, 53, 37, + 37, 37, 37, 36, 36, 20, 20, 20, 20, 19, 19, 3, 3, + 3, 240, 199, 0, 0, 0, 0, 36, 36, 36, 36, 19, 19, + 19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 248, 200, 0, + 0, 0, 0, 35, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 2, 2, 2, 2, 2, 248, 201, 0, 0, 0, 0, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, + 1, 1, 248, 202, 0, 0, 0, 0, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 2, 2, 1, 1, 1, 1, 1, 248, 203, + 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 1, + 1, 1, 1, 1, 1, 1, 1, + }, + fsggml.TensorTypeQ5_1: { + 0, 60, 0, 0, 0, 0, 255, 255, 0, 17, 34, 51, 68, + 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, + 0, 80, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, + 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 84, + 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, + 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 86, 0, 0, + 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, + 187, 204, 221, 238, 255, 0, 60, 0, 88, 0, 0, 255, 255, + 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, + 221, 238, 255, 0, 60, 0, 89, 0, 0, 255, 255, 0, 17, + 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, + 255, 0, 60, 0, 90, 0, 0, 255, 255, 0, 17, 34, 51, + 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, + 60, 0, 91, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, + 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, + }, + fsggml.TensorTypeQ8_0: { + 208, 51, 0, 4, 8, 12, 16, 20, 25, 29, 33, 37, 41, + 45, 49, 53, 57, 61, 66, 70, 74, 78, 82, 86, 90, 94, + 98, 102, 107, 111, 115, 119, 123, 127, 240, 55, 65, 67, 69, + 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, + 123, 125, 127, 252, 57, 86, 87, 88, 90, 91, 92, 94, 95, + 96, 98, 99, 100, 102, 103, 104, 106, 107, 108, 110, 111, 112, + 114, 115, 116, 118, 119, 120, 122, 123, 124, 126, 127, 0, 60, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 124, 125, 126, 127, 2, 61, 102, 103, 104, 105, 105, + 106, 107, 108, 109, 109, 110, 111, 112, 113, 113, 114, 115, 116, + 117, 117, 118, 119, 120, 
121, 121, 122, 123, 124, 125, 125, 126, + 127, 4, 62, 106, 107, 108, 108, 109, 110, 110, 111, 112, 112, + 113, 114, 114, 115, 116, 116, 117, 118, 118, 119, 120, 120, 121, + 122, 122, 123, 124, 124, 125, 126, 126, 127, 6, 63, 109, 110, + 110, 111, 112, 112, 113, 113, 114, 114, 115, 116, 116, 117, 117, + 118, 118, 119, 120, 120, 121, 121, 122, 122, 123, 124, 124, 125, + 125, 126, 126, 127, 4, 64, 112, 112, 113, 113, 114, 114, 115, + 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, + 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127, + }, + fsggml.TensorTypeBF16: { + 0, 0, 128, 63, 0, 64, 64, 64, 128, 64, 160, 64, 192, + 64, 224, 64, 0, 65, 16, 65, 32, 65, 48, 65, 64, 65, + 80, 65, 96, 65, 112, 65, 128, 65, 136, 65, 144, 65, 152, + 65, 160, 65, 168, 65, 176, 65, 184, 65, 192, 65, 200, 65, + 208, 65, 216, 65, 224, 65, 232, 65, 240, 65, 248, 65, 0, + 66, 4, 66, 8, 66, 12, 66, 16, 66, 20, 66, 24, 66, + 28, 66, 32, 66, 36, 66, 40, 66, 44, 66, 48, 66, 52, + 66, 56, 66, 60, 66, 64, 66, 68, 66, 72, 66, 76, 66, + 80, 66, 84, 66, 88, 66, 92, 66, 96, 66, 100, 66, 104, + 66, 108, 66, 112, 66, 116, 66, 120, 66, 124, 66, 128, 66, + 130, 66, 132, 66, 134, 66, 136, 66, 138, 66, 140, 66, 142, + 66, 144, 66, 146, 66, 148, 66, 150, 66, 152, 66, 154, 66, + 156, 66, 158, 66, 160, 66, 162, 66, 164, 66, 166, 66, 168, + 66, 170, 66, 172, 66, 174, 66, 176, 66, 178, 66, 180, 66, + 182, 66, 184, 66, 186, 66, 188, 66, 190, 66, 192, 66, 194, + 66, 196, 66, 198, 66, 200, 66, 202, 66, 204, 66, 206, 66, + 208, 66, 210, 66, 212, 66, 214, 66, 216, 66, 218, 66, 220, + 66, 222, 66, 224, 66, 226, 66, 228, 66, 230, 66, 232, 66, + 234, 66, 236, 66, 238, 66, 240, 66, 242, 66, 244, 66, 246, + 66, 248, 66, 250, 66, 252, 66, 254, 66, 0, 67, 1, 67, + 2, 67, 3, 67, 4, 67, 5, 67, 6, 67, 7, 67, 8, + 67, 9, 67, 10, 67, 11, 67, 12, 67, 13, 67, 14, 67, + 15, 67, 16, 67, 17, 67, 18, 67, 19, 67, 20, 67, 21, + 67, 22, 67, 23, 67, 24, 67, 25, 67, 26, 67, 27, 67, + 28, 67, 29, 67, 30, 67, 31, 67, 32, 67, 33, 67, 34, + 67, 35, 67, 36, 67, 37, 67, 38, 67, 39, 67, 40, 67, + 41, 67, 42, 67, 43, 67, 44, 67, 45, 67, 46, 67, 47, + 67, 48, 67, 49, 67, 50, 67, 51, 67, 52, 67, 53, 67, + 54, 67, 55, 67, 56, 67, 57, 67, 58, 67, 59, 67, 60, + 67, 61, 67, 62, 67, 63, 67, 64, 67, 65, 67, 66, 67, + 67, 67, 68, 67, 69, 67, 70, 67, 71, 67, 72, 67, 73, + 67, 74, 67, 75, 67, 76, 67, 77, 67, 78, 67, 79, 67, + 80, 67, 81, 67, 82, 67, 83, 67, 84, 67, 85, 67, 86, + 67, 87, 67, 88, 67, 89, 67, 90, 67, 91, 67, 92, 67, + 93, 67, 94, 67, 95, 67, 96, 67, 97, 67, 98, 67, 99, + 67, 100, 67, 101, 67, 102, 67, 103, 67, 104, 67, 105, 67, + 106, 67, 107, 67, 108, 67, 109, 67, 110, 67, 111, 67, 112, + 67, 113, 67, 114, 67, 115, 67, 116, 67, 117, 67, 118, 67, + 119, 67, 120, 67, 121, 67, 122, 67, 123, 67, 124, 67, 125, + 67, 126, 67, 127, 67, + }, + fsggml.TensorTypeF16: { + 0, 0, 0, 60, 0, 64, 0, 66, 0, 68, 0, 69, 0, 70, 0, 71, 0, + 72, 128, 72, 0, 73, 128, 73, 0, 74, 128, 74, 0, 75, 128, 75, + 0, 76, 64, 76, 128, 76, 192, 76, 0, 77, 64, 77, 128, 77, 192, + 77, 0, 78, 64, 78, 128, 78, 192, 78, 0, 79, 64, 79, 128, 79, + 192, 79, 0, 80, 32, 80, 64, 80, 96, 80, 128, 80, 160, 80, + 192, 80, 224, 80, 0, 81, 32, 81, 64, 81, 96, 81, 128, 81, + 160, 81, 192, 81, 224, 81, 0, 82, 32, 82, 64, 82, 96, 82, + 128, 82, 160, 82, 192, 82, 224, 82, 0, 83, 32, 83, 64, 83, + 96, 83, 128, 83, 160, 83, 192, 83, 224, 83, 0, 84, 16, 84, + 32, 84, 48, 84, 64, 84, 80, 84, 96, 84, 112, 84, 128, 84, + 144, 84, 160, 84, 176, 84, 192, 84, 208, 84, 224, 84, 240, + 84, 0, 85, 16, 85, 32, 
85, 48, 85, 64, 85, 80, 85, 96, 85, + 112, 85, 128, 85, 144, 85, 160, 85, 176, 85, 192, 85, 208, + 85, 224, 85, 240, 85, 0, 86, 16, 86, 32, 86, 48, 86, 64, + 86, 80, 86, 96, 86, 112, 86, 128, 86, 144, 86, 160, 86, + 176, 86, 192, 86, 208, 86, 224, 86, 240, 86, 0, 87, 16, + 87, 32, 87, 48, 87, 64, 87, 80, 87, 96, 87, 112, 87, 128, + 87, 144, 87, 160, 87, 176, 87, 192, 87, 208, 87, 224, 87, + 240, 87, 0, 88, 8, 88, 16, 88, 24, 88, 32, 88, 40, 88, + 48, 88, 56, 88, 64, 88, 72, 88, 80, 88, 88, 88, 96, 88, + 104, 88, 112, 88, 120, 88, 128, 88, 136, 88, 144, 88, 152, + 88, 160, 88, 168, 88, 176, 88, 184, 88, 192, 88, 200, 88, + 208, 88, 216, 88, 224, 88, 232, 88, 240, 88, 248, 88, 0, + 89, 8, 89, 16, 89, 24, 89, 32, 89, 40, 89, 48, 89, 56, 89, + 64, 89, 72, 89, 80, 89, 88, 89, 96, 89, 104, 89, 112, 89, + 120, 89, 128, 89, 136, 89, 144, 89, 152, 89, 160, 89, 168, + 89, 176, 89, 184, 89, 192, 89, 200, 89, 208, 89, 216, 89, + 224, 89, 232, 89, 240, 89, 248, 89, 0, 90, 8, 90, 16, 90, + 24, 90, 32, 90, 40, 90, 48, 90, 56, 90, 64, 90, 72, 90, 80, + 90, 88, 90, 96, 90, 104, 90, 112, 90, 120, 90, 128, 90, + 136, 90, 144, 90, 152, 90, 160, 90, 168, 90, 176, 90, 184, + 90, 192, 90, 200, 90, 208, 90, 216, 90, 224, 90, 232, 90, + 240, 90, 248, 90, 0, 91, 8, 91, 16, 91, 24, 91, 32, 91, 40, + 91, 48, 91, 56, 91, 64, 91, 72, 91, 80, 91, 88, 91, 96, 91, + 104, 91, 112, 91, 120, 91, 128, 91, 136, 91, 144, 91, 152, + 91, 160, 91, 168, 91, 176, 91, 184, 91, 192, 91, 200, 91, + 208, 91, 216, 91, 224, 91, 232, 91, 240, 91, 248, 91, + }, + fsggml.TensorTypeF32: { + 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128, + 64, 0, 0, 160, 64, 0, 0, 192, 64, 0, 0, 224, 64, 0, 0, 0, 65, 0, + 0, 16, 65, 0, 0, 32, 65, 0, 0, 48, 65, 0, 0, 64, 65, 0, 0, 80, 65, + 0, 0, 96, 65, 0, 0, 112, 65, 0, 0, 128, 65, 0, 0, 136, 65, 0, 0, + 144, 65, 0, 0, 152, 65, 0, 0, 160, 65, 0, 0, 168, 65, 0, 0, 176, + 65, 0, 0, 184, 65, 0, 0, 192, 65, 0, 0, 200, 65, 0, 0, 208, 65, 0, + 0, 216, 65, 0, 0, 224, 65, 0, 0, 232, 65, 0, 0, 240, 65, 0, 0, 248, + 65, 0, 0, 0, 66, 0, 0, 4, 66, 0, 0, 8, 66, 0, 0, 12, 66, 0, 0, 16, + 66, 0, 0, 20, 66, 0, 0, 24, 66, 0, 0, 28, 66, 0, 0, 32, 66, 0, 0, + 36, 66, 0, 0, 40, 66, 0, 0, 44, 66, 0, 0, 48, 66, 0, 0, 52, 66, 0, + 0, 56, 66, 0, 0, 60, 66, 0, 0, 64, 66, 0, 0, 68, 66, 0, 0, 72, 66, + 0, 0, 76, 66, 0, 0, 80, 66, 0, 0, 84, 66, 0, 0, 88, 66, 0, 0, 92, 66, + 0, 0, 96, 66, 0, 0, 100, 66, 0, 0, 104, 66, 0, 0, 108, 66, 0, 0, 112, + 66, 0, 0, 116, 66, 0, 0, 120, 66, 0, 0, 124, 66, 0, 0, 128, 66, 0, 0, + 130, 66, 0, 0, 132, 66, 0, 0, 134, 66, 0, 0, 136, 66, 0, 0, 138, 66, + 0, 0, 140, 66, 0, 0, 142, 66, 0, 0, 144, 66, 0, 0, 146, 66, 0, 0, 148, + 66, 0, 0, 150, 66, 0, 0, 152, 66, 0, 0, 154, 66, 0, 0, 156, 66, 0, 0, + 158, 66, 0, 0, 160, 66, 0, 0, 162, 66, 0, 0, 164, 66, 0, 0, 166, 66, + 0, 0, 168, 66, 0, 0, 170, 66, 0, 0, 172, 66, 0, 0, 174, 66, 0, 0, 176, + 66, 0, 0, 178, 66, 0, 0, 180, 66, 0, 0, 182, 66, 0, 0, 184, 66, 0, 0, + 186, 66, 0, 0, 188, 66, 0, 0, 190, 66, 0, 0, 192, 66, 0, 0, 194, 66, 0, + 0, 196, 66, 0, 0, 198, 66, 0, 0, 200, 66, 0, 0, 202, 66, 0, 0, 204, 66, + 0, 0, 206, 66, 0, 0, 208, 66, 0, 0, 210, 66, 0, 0, 212, 66, 0, 0, 214, 66, + 0, 0, 216, 66, 0, 0, 218, 66, 0, 0, 220, 66, 0, 0, 222, 66, 0, 0, 224, 66, + 0, 0, 226, 66, 0, 0, 228, 66, 0, 0, 230, 66, 0, 0, 232, 66, 0, 0, 234, 66, + 0, 0, 236, 66, 0, 0, 238, 66, 0, 0, 240, 66, 0, 0, 242, 66, 0, 0, 244, 66, + 0, 0, 246, 66, 0, 0, 248, 66, 0, 0, 250, 66, 0, 0, 252, 66, 0, 0, 254, 66, + 0, 0, 0, 67, 0, 0, 1, 67, 0, 0, 2, 67, 0, 0, 3, 67, 0, 0, 
4, 67, 0, 0, 5, 67, + 0, 0, 6, 67, 0, 0, 7, 67, 0, 0, 8, 67, 0, 0, 9, 67, 0, 0, 10, 67, 0, 0, 11, + 67, 0, 0, 12, 67, 0, 0, 13, 67, 0, 0, 14, 67, 0, 0, 15, 67, 0, 0, 16, 67, + 0, 0, 17, 67, 0, 0, 18, 67, 0, 0, 19, 67, 0, 0, 20, 67, 0, 0, 21, 67, 0, 0, + 22, 67, 0, 0, 23, 67, 0, 0, 24, 67, 0, 0, 25, 67, 0, 0, 26, 67, 0, 0, 27, + 67, 0, 0, 28, 67, 0, 0, 29, 67, 0, 0, 30, 67, 0, 0, 31, 67, 0, 0, 32, 67, + 0, 0, 33, 67, 0, 0, 34, 67, 0, 0, 35, 67, 0, 0, 36, 67, 0, 0, 37, 67, 0, 0, + 38, 67, 0, 0, 39, 67, 0, 0, 40, 67, 0, 0, 41, 67, 0, 0, 42, 67, 0, 0, 43, 67, + 0, 0, 44, 67, 0, 0, 45, 67, 0, 0, 46, 67, 0, 0, 47, 67, 0, 0, 48, 67, 0, 0, + 49, 67, 0, 0, 50, 67, 0, 0, 51, 67, 0, 0, 52, 67, 0, 0, 53, 67, 0, 0, 54, 67, + 0, 0, 55, 67, 0, 0, 56, 67, 0, 0, 57, 67, 0, 0, 58, 67, 0, 0, 59, 67, 0, 0, + 60, 67, 0, 0, 61, 67, 0, 0, 62, 67, 0, 0, 63, 67, 0, 0, 64, 67, 0, 0, 65, 67, + 0, 0, 66, 67, 0, 0, 67, 67, 0, 0, 68, 67, 0, 0, 69, 67, 0, 0, 70, 67, 0, 0, 71, + 67, 0, 0, 72, 67, 0, 0, 73, 67, 0, 0, 74, 67, 0, 0, 75, 67, 0, 0, 76, 67, 0, + 0, 77, 67, 0, 0, 78, 67, 0, 0, 79, 67, 0, 0, 80, 67, 0, 0, 81, 67, 0, 0, 82, + 67, 0, 0, 83, 67, 0, 0, 84, 67, 0, 0, 85, 67, 0, 0, 86, 67, 0, 0, 87, 67, 0, + 0, 88, 67, 0, 0, 89, 67, 0, 0, 90, 67, 0, 0, 91, 67, 0, 0, 92, 67, 0, 0, 93, + 67, 0, 0, 94, 67, 0, 0, 95, 67, 0, 0, 96, 67, 0, 0, 97, 67, 0, 0, 98, 67, 0, + 0, 99, 67, 0, 0, 100, 67, 0, 0, 101, 67, 0, 0, 102, 67, 0, 0, 103, 67, 0, 0, + 104, 67, 0, 0, 105, 67, 0, 0, 106, 67, 0, 0, 107, 67, 0, 0, 108, 67, 0, 0, 109, + 67, 0, 0, 110, 67, 0, 0, 111, 67, 0, 0, 112, 67, 0, 0, 113, 67, 0, 0, 114, 67, + 0, 0, 115, 67, 0, 0, 116, 67, 0, 0, 117, 67, 0, 0, 118, 67, 0, 0, 119, 67, 0, + 0, 120, 67, 0, 0, 121, 67, 0, 0, 122, 67, 0, 0, 123, 67, 0, 0, 124, 67, 0, 0, + 125, 67, 0, 0, 126, 67, 0, 0, 127, 67, + }, + fsggml.TensorTypeQ4_K: { + 52, 52, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 8, 15, 128, + 128, 129, 129, 146, 146, 147, 147, 164, 164, 165, 165, 166, 182, + 183, 183, 184, 200, 201, 201, 202, 218, 218, 219, 219, 236, 236, + 237, 237, 254, 254, 255, 202, 202, 202, 203, 203, 203, 219, 219, + 219, 220, 220, 220, 220, 220, 236, 237, 237, 237, 237, 237, + 237, 237, 238, 254, 254, 254, 254, 254, 255, 255, 255, 255, 220, + 220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 237, 237, 237, + 238, 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 255, 255, + 255, 255, 255, 255, 255, 237, 237, 237, 237, 237, 237, 237, 238, + 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 254, 254, 254, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + }, + fsggml.TensorTypeQ2_K: { + 1, 2, 3, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 184, 184, + 184, 185, 249, 249, 249, 249, 249, 250, 250, 254, 254, 254, 254, + 255, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 171, 69, 0, 0, + }, + fsggml.TensorTypeQ5_K: { + 32, 48, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 7, 15, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 19, 20, 37, 38, 55, 56, 73, 74, + 91, 92, 109, 110, 127, 112, 128, 129, 146, 147, 164, 165, 182, 183, + 200, 201, 218, 219, 236, 237, 254, 133, 133, 149, 150, 150, 150, + 167, 167, 167, 168, 184, 184, 185, 185, 201, 202, 202, 202, 219, + 219, 219, 219, 236, 236, 236, 237, 253, 253, 254, 254, 254, 255, 
+ 169, 169, 169, 169, 186, 186, 186, 186, 186, 187, 187, 203, 203, + 203, 204, 204, 204, 220, 220, 221, 221, 221, 221, 237, 237, 238, + 238, 238, 238, 254, 255, 255, 203, 203, 203, 204, 204, 204, 204, + 204, 220, 220, 220, 221, 221, 221, 221, 221, 237, 237, 238, 238, + 238, 238, 238, 238, 254, 255, 255, 255, 255, 255, 255, 255, + }, + fsggml.TensorTypeQ6_K: { + 96, 110, 92, 90, 88, 70, 68, 50, 48, 46, 44, 42, 24, 22, 4, 2, 80, + 95, 78, 77, 76, 59, 58, 57, 40, 39, 38, 21, 20, 19, 2, 1, 75, 75, + 74, 57, 57, 56, 55, 39, 38, 37, 21, 20, 20, 19, 2, 2, 72, 55, 55, + 54, 54, 37, 37, 36, 36, 19, 19, 18, 18, 1, 1, 0, 35, 35, 35, 35, + 34, 18, 18, 18, 17, 17, 17, 1, 1, 0, 0, 0, 35, 35, 34, 34, 18, + 18, 18, 17, 17, 17, 17, 1, 0, 0, 0, 0, 35, 35, 35, 19, 19, 18, 18, + 18, 18, 18, 1, 1, 1, 1, 1, 1, 34, 34, 18, 18, 18, 18, 17, 17, 17, + 17, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 248, 240, 231, 224, 216, 208, 200, 192, 184, 176, + 166, 160, 152, 144, 136, 128, 235, 43, + }, + fsggml.TensorTypeQ3_K: { + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 20, 23, 23, 7, 7, 6, 6, 6, 2, + 1, 1, 1, 1, 0, 0, 22, 22, 6, 6, 5, 5, 5, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238, 204, 170, 136, 102, 68, + 34, 1, 5, 5, 5, 5, 189, 63, + }, + } +) diff --git a/server/routes.go b/server/routes.go index 16f22cf93..d0b8f487e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -4,10 +4,10 @@ import ( "bytes" "cmp" "context" - "encoding/binary" "encoding/json" "errors" "fmt" + "image" "io" "io/fs" "log/slog" @@ -17,7 +17,6 @@ import ( "net/netip" "os" "os/signal" - "path/filepath" "regexp" "slices" "strings" @@ -26,6 +25,7 @@ import ( "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" + "golang.org/x/image/webp" "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" @@ -33,7 +33,7 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/model/models/mllama" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" @@ -98,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C return nil, nil, nil, err } + if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 { + return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. 
To re-download, run 'ollama pull llama3.2-vision'") + } + if err := model.CheckCapabilities(caps...); err != nil { return nil, nil, nil, fmt.Errorf("%s %w", name, err) } @@ -204,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - isMllama := checkMllamaModelFamily(m) - if isMllama && len(req.Images) > 1 { - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"}) + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image, but more than one was requested"}) return } images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { - if isMllama && len(m.ProjectorPaths) > 0 { - data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar} - } else { - images[i] = llm.ImageData{ID: i, Data: req.Images[i]} - } + images[i] = llm.ImageData{ID: i, Data: req.Images[i]} } prompt := req.Prompt @@ -267,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { for _, i := range images { imgPrompt := "" - if isMllama { - imgPrompt = "<|image|>" - } msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)}) } @@ -295,8 +272,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { prompt = b.String() } - slog.Debug("generate request", "images", len(images), "prompt", prompt) - ch := make(chan any) go func() { // TODO (jmorganca): avoid building the response twice both here and below @@ -1170,6 +1145,7 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { corsConfig.AllowOrigins = envconfig.AllowedOrigins() r := gin.Default() + r.HandleMethodNotAllowed = true r.Use( cors.New(corsConfig), allowedHostsMiddleware(s.addr), @@ -1225,26 +1201,8 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { } func Serve(ln net.Listener) error { - level := slog.LevelInfo - if envconfig.Debug() { - level = slog.LevelDebug - } - + slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.Info("server config", "env", envconfig.Values()) - handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - - return attr - }, - }) - - slog.SetDefault(slog.New(handler)) blobsDir, err := GetBlobsPath("") if err != nil { @@ -1323,6 +1281,10 @@ func Serve(ln net.Listener) error { s.sched.Run(schedCtx) + // register the experimental webp decoder + // so webp images can be used in multimodal inputs + image.RegisterFormat("webp", "RIFF????WEBP", webp.Decode, webp.DecodeConfig) + // At startup we retrieve GPU information so we can get log messages before loading a model // This will log warnings to the log in case
we have problems with detected GPUs gpus := discover.GetGPUInfo() @@ -1340,31 +1302,29 @@ func Serve(ln net.Listener) error { func waitForStream(c *gin.Context, ch chan any) { c.Header("Content-Type", "application/json") + var latest api.ProgressResponse for resp := range ch { switch r := resp.(type) { case api.ProgressResponse: - if r.Status == "success" { - c.JSON(http.StatusOK, r) - return - } + latest = r case gin.H: status, ok := r["status"].(int) if !ok { status = http.StatusInternalServerError } - if errorMsg, ok := r["error"].(string); ok { - c.JSON(status, gin.H{"error": errorMsg}) - return - } else { - c.JSON(status, gin.H{"error": "unexpected error format in progress response"}) - return + errorMsg, ok := r["error"].(string) + if !ok { + errorMsg = "unknown error" } + c.JSON(status, gin.H{"error": errorMsg}) + return default: - c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"}) + c.JSON(http.StatusInternalServerError, gin.H{"error": "unknown message type"}) return } } - c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"}) + + c.JSON(http.StatusOK, latest) } func streamResponse(c *gin.Context, ch chan any) { @@ -1522,8 +1482,6 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - slog.Debug("chat request", "images", len(images), "prompt", prompt) - ch := make(chan any) go func() { defer close(ch) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 466dc04f1..3b3d99100 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -24,7 +24,7 @@ import ( var stream bool = false -func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) { +func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) { t.Helper() t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir())) diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 56121d41b..6bbf5b112 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -87,7 +87,7 @@ func TestGenerateChat(t *testing.T) { }, } - go s.sched.Run(context.TODO()) + go s.sched.Run(t.Context()) _, digest := createBinFile(t, ggml.KV{ "general.architecture": "llama", @@ -99,7 +99,7 @@ func TestGenerateChat(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []ggml.Tensor{ + }, []*ggml.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -158,7 +158,7 @@ func TestGenerateChat(t *testing.T) { _, digest := createBinFile(t, ggml.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []ggml.Tensor{}) + }, []*ggml.Tensor{}) w := createRequest(t, s.CreateHandler, api.CreateRequest{ Model: "bert", Files: map[string]string{"bert.gguf": digest}, @@ -631,7 +631,7 @@ func TestGenerate(t *testing.T) { }, } - go s.sched.Run(context.TODO()) + go s.sched.Run(t.Context()) _, digest := createBinFile(t, ggml.KV{ "general.architecture": "llama", @@ -643,7 +643,7 @@ func TestGenerate(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []ggml.Tensor{ + }, []*ggml.Tensor{ {Name: "token_embd.weight", 
Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -698,7 +698,7 @@ func TestGenerate(t *testing.T) { _, digest := createBinFile(t, ggml.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []ggml.Tensor{}) + }, []*ggml.Tensor{}) w := createRequest(t, s.CreateHandler, api.CreateRequest{ Model: "bert", diff --git a/server/routes_test.go b/server/routes_test.go index 2894b1555..7c44bc957 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -21,6 +21,8 @@ import ( "testing" "unicode" + "github.com/gin-gonic/gin" + "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/openai" @@ -474,14 +476,24 @@ func TestRoutes(t *testing.T) { t.Fatalf("failed to read response body: %v", err) } - var retrieveResp api.RetrieveModelResponse - err = json.Unmarshal(body, &retrieveResp) + var m openai.Model + err = json.Unmarshal(body, &m) if err != nil { t.Fatalf("failed to unmarshal response body: %v", err) } - if retrieveResp.Id != "show-model" || retrieveResp.OwnedBy != "library" { - t.Errorf("expected model 'show-model' owned by 'library', got %v", retrieveResp) + if m.Id != "show-model" || m.OwnedBy != "library" { + t.Errorf("expected model 'show-model' owned by 'library', got %v", m) + } + }, + }, + { + Name: "Method Not Allowed", + Method: http.MethodGet, + Path: "/api/show", + Expected: func(t *testing.T, resp *http.Response) { + if resp.StatusCode != 405 { + t.Errorf("expected status code 405, got %d", resp.StatusCode) } }, }, @@ -517,7 +529,7 @@ func TestRoutes(t *testing.T) { for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { u := httpSrv.URL + tc.Path - req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil) + req, err := http.NewRequestWithContext(t.Context(), tc.Method, u, nil) if err != nil { t.Fatalf("failed to create request: %v", err) } @@ -872,3 +884,87 @@ func TestFilterThinkTags(t *testing.T) { } } } + +func TestWaitForStream(t *testing.T) { + gin.SetMode(gin.TestMode) + + cases := []struct { + name string + messages []any + expectCode int + expectBody string + }{ + { + name: "error", + messages: []any{ + gin.H{"error": "internal server error"}, + }, + expectCode: http.StatusInternalServerError, + expectBody: `{"error":"internal server error"}`, + }, + { + name: "error status", + messages: []any{ + gin.H{"status": http.StatusNotFound, "error": "not found"}, + }, + expectCode: http.StatusNotFound, + expectBody: `{"error":"not found"}`, + }, + { + name: "unknown error", + messages: []any{ + gin.H{"msg": "something else"}, + }, + expectCode: http.StatusInternalServerError, + expectBody: `{"error":"unknown error"}`, + }, + { + name: "unknown type", + messages: []any{ + struct{}{}, + }, + expectCode: http.StatusInternalServerError, + expectBody: `{"error":"unknown message type"}`, + }, + { + name: "progress success", + messages: []any{ + api.ProgressResponse{Status: "success"}, + }, + expectCode: http.StatusOK, + expectBody: `{"status":"success"}`, + }, + { + name: "progress more than success", + messages: []any{ + api.ProgressResponse{Status: "success"}, + api.ProgressResponse{Status: "one more thing"}, + }, + expectCode: http.StatusOK, + expectBody: `{"status":"one more thing"}`, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t 
*testing.T) { + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + + ch := make(chan any, len(tt.messages)) + for _, msg := range tt.messages { + ch <- msg + } + close(ch) + + waitForStream(c, ch) + + if w.Code != tt.expectCode { + t.Errorf("expected status %d, got %d", tt.expectCode, w.Code) + } + + if diff := cmp.Diff(tt.expectBody, w.Body.String()); diff != "" { + t.Errorf("body mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/server/sched.go b/server/sched.go index 8513fbf4b..3fc54e55a 100644 --- a/server/sched.go +++ b/server/sched.go @@ -8,6 +8,7 @@ import ( "os" "reflect" "runtime" + "slices" "sort" "strconv" "strings" @@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) { continue } numParallel := int(envconfig.NumParallel()) - // TODO (jmorganca): mllama doesn't support parallel yet - // see https://github.com/ollama/ollama/issues/4165 - if checkMllamaModelFamily(pending.model) && numParallel != 1 { + // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1 + // ref: https://github.com/ollama/ollama/issues/4165 + if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 { numParallel = 1 - slog.Warn("mllama doesn't support parallel requests yet") + slog.Warn("mllama does not currently support parallel requests") } for { @@ -147,6 +148,7 @@ func (s *Scheduler) processPending(ctx context.Context) { s.loadedMu.Unlock() if runner != nil { if runner.needsReload(ctx, pending) { + slog.Debug("reloading", "runner", runner) runnerToExpire = runner } else { // Runner is usable, return it @@ -282,7 +284,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Trigger an expiration to unload once it's done runnerToExpire.refMu.Lock() - slog.Debug("resetting model to expire immediately to make room", "modelPath", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount) + slog.Debug("resetting model to expire immediately to make room", "runner", runnerToExpire, "refCount", runnerToExpire.refCount) if runnerToExpire.expireTimer != nil { runnerToExpire.expireTimer.Stop() runnerToExpire.expireTimer = nil @@ -295,13 +297,13 @@ func (s *Scheduler) processPending(ctx context.Context) { // Wait for the unload to happen // Note: at this point we're queueing up all incoming requests, even if they were for // a different model that's loaded and not scheduled to be removed.
- slog.Debug("waiting for pending requests to complete and unload to occur", "modelPath", runnerToExpire.modelPath) + slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire) select { case <-ctx.Done(): slog.Debug("shutting down scheduler pending loop") return case <-s.unloadedCh: - slog.Debug("unload completed", "modelPath", runnerToExpire.modelPath) + slog.Debug("unload completed", "runner", runnerToExpire) continue } } @@ -331,16 +333,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) { runner.refCount-- if runner.refCount <= 0 { if runner.sessionDuration <= 0 { - slog.Debug("runner with zero duration has gone idle, expiring to unload", "modelPath", runner.modelPath) + slog.Debug("runner with zero duration has gone idle, expiring to unload", "runner", runner) if runner.expireTimer != nil { runner.expireTimer.Stop() runner.expireTimer = nil } s.expiredCh <- runner } else if runner.expireTimer == nil { - slog.Debug("runner with non-zero duration has gone idle, adding timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration) + slog.Debug("runner with non-zero duration has gone idle, adding timer", "runner", runner, "duration", runner.sessionDuration) runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() { - slog.Debug("timer expired, expiring to unload", "modelPath", runner.modelPath) + slog.Debug("timer expired, expiring to unload", "runner", runner) runner.refMu.Lock() defer runner.refMu.Unlock() if runner.expireTimer != nil { @@ -351,18 +353,18 @@ func (s *Scheduler) processCompleted(ctx context.Context) { }) runner.expiresAt = time.Now().Add(runner.sessionDuration) } else { - slog.Debug("runner with non-zero duration has gone idle, resetting timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration) + slog.Debug("runner with non-zero duration has gone idle, resetting timer", "runner", runner, "duration", runner.sessionDuration) runner.expireTimer.Reset(runner.sessionDuration) runner.expiresAt = time.Now().Add(runner.sessionDuration) } } - slog.Debug("after processing request finished event", "modelPath", runner.modelPath, "refCount", runner.refCount) + slog.Debug("after processing request finished event", "runner", runner, "refCount", runner.refCount) runner.refMu.Unlock() case runner := <-s.expiredCh: - slog.Debug("runner expired event received", "modelPath", runner.modelPath) + slog.Debug("runner expired event received", "runner", runner) runner.refMu.Lock() if runner.refCount > 0 { - slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount) + slog.Debug("expired event with positive ref count, retrying", "runner", runner, "refCount", runner.refCount) go func(runner *runnerRef) { // We can't unload yet, but want to as soon as the current request completes // So queue up another expired event @@ -374,17 +376,29 @@ func (s *Scheduler) processCompleted(ctx context.Context) { } s.loadedMu.Lock() - slog.Debug("got lock to unload", "modelPath", runner.modelPath) - finished := runner.waitForVRAMRecovery() - runner.unload() - delete(s.loaded, runner.modelPath) - s.loadedMu.Unlock() - slog.Debug("runner released", "modelPath", runner.modelPath) - runner.refMu.Unlock() - - <-finished - slog.Debug("sending an unloaded event", "modelPath", runner.modelPath) - s.unloadedCh <- struct{}{} + slog.Debug("got lock to unload expired event", "runner", runner) + runnerToUnload := s.loaded[runner.modelPath] + if runnerToUnload == nil { + // 
If runnerToUnload is nil, we already processed an event and + // unloaded it. This double unload can happen if the initial + // request is canceled and we're trying to load another model + // that requires this one to be evicted, or the settings change + // and require a reload + s.loadedMu.Unlock() + runner.refMu.Unlock() + slog.Debug("duplicate expired event, ignoring", "runner", runner) + } else { + slog.Debug("starting background wait for VRAM recovery", "runner", runner) + finished := runner.waitForVRAMRecovery() + runner.unload() + delete(s.loaded, runner.modelPath) + s.loadedMu.Unlock() + slog.Debug("runner terminated and removed from list, blocking for VRAM recovery", "runner", runner) + <-finished + runner.refMu.Unlock() + slog.Debug("sending an unloaded event", "runner", runner) + s.unloadedCh <- struct{}{} + } } } } @@ -406,7 +420,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm pending.successCh <- runner go func() { <-pending.ctx.Done() - slog.Debug("context for request finished") + slog.Debug("context for request finished", "runner", runner) finished <- pending }() } @@ -441,11 +455,19 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis estimatedVRAM: llama.EstimatedVRAM(), estimatedTotal: llama.EstimatedTotal(), loading: true, + pid: llama.Pid(), } runner.numParallel = numParallel runner.refMu.Lock() // hold lock until running or aborted s.loadedMu.Lock() + if oldRunner, ok := s.loaded[req.model.ModelPath]; ok { + // Shouldn't happen, but safeguard against leaking a runner + slog.Warn("model was still loaded", "old_runner", oldRunner, "new_runner", runner) + oldRunner.refMu.Lock() + oldRunner.unload() + oldRunner.refMu.Unlock() + } s.loaded[req.model.ModelPath] = runner slog.Info("loaded runners", "count", len(s.loaded)) s.loadedMu.Unlock() @@ -455,11 +477,14 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis if err = llama.WaitUntilRunning(req.ctx); err != nil { slog.Error("error loading llama server", "error", err) req.errCh <- err - slog.Debug("triggering expiration for failed load", "model", runner.modelPath) + slog.Debug("triggering expiration for failed load", "runner", runner) s.expiredCh <- runner return } - slog.Debug("finished setting up runner", "model", req.model.ModelPath) + slog.Debug("finished setting up", "runner", runner) + if runner.pid < 0 { + runner.pid = llama.Pid() + } runner.refCount++ runner.loading = false go func() { @@ -544,6 +569,7 @@ type runnerRef struct { refCount uint // prevent unloading if > 0 llama llm.LlamaServer + pid int loading bool // True only during initial load, then false forever gpus discover.GpuInfoList // Recorded at time of provisioning estimatedVRAM uint64 @@ -628,6 +654,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") { finished <- struct{}{} + slog.Debug("no need to wait for VRAM recovery", "runner", runner) return finished } start := time.Now() @@ -646,7 +673,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { for { <-ticker.C if time.Now().After(expiresAt) { - slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "model", runner.modelPath) + slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "runner", runner) finished <- struct{}{} } @@ -659,7 
+686,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { } // If we're within ~80% of the estimated memory usage recovered, bail out if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 { - slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "model", runner.modelPath) + slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner) finished <- struct{}{} return } @@ -668,6 +695,33 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { return finished } +func (runner *runnerRef) LogValue() slog.Value { + if runner == nil { + return slog.StringValue("nil") + } + attrs := []slog.Attr{} + if runner.model != nil { + attrs = append(attrs, slog.String("name", runner.model.Name)) + } + if len(runner.gpus) > 0 { + attrs = append(attrs, + slog.String("inference", runner.gpus[0].Library), + slog.Int("devices", len(runner.gpus)), + ) + } + attrs = append(attrs, + slog.String("size", format.HumanBytes2(runner.estimatedTotal)), + slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)), + slog.Int("parallel", runner.numParallel), + slog.Int("pid", runner.pid), + slog.String("model", runner.modelPath), + ) + if runner.Options != nil { + attrs = append(attrs, slog.Int("num_ctx", runner.Options.NumCtx)) + } + return slog.GroupValue(attrs...) +} + type ByDurationAndName []*runnerRef func (a ByDurationAndName) Len() int { return len(a) } @@ -790,12 +844,12 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef { rc := runner.refCount runner.refMu.Unlock() if rc == 0 { - slog.Debug("found an idle runner to unload") + slog.Debug("found an idle runner to unload", "runner", runner) return runner } } // None appear idle, just wait for the one with the shortest duration - slog.Debug("no idle runners, picking the shortest duration", "count", len(runnerList)) + slog.Debug("no idle runners, picking the shortest duration", "runner_count", len(runnerList), "runner", runnerList[0]) return runnerList[0] } diff --git a/server/sched_test.go b/server/sched_test.go index 274e18cec..01fb9a703 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -26,7 +26,7 @@ func TestMain(m *testing.M) { } func TestInitScheduler(t *testing.T) { - ctx, done := context.WithCancel(context.Background()) + ctx, done := context.WithCancel(t.Context()) defer done() s := InitScheduler(ctx) s.loadedMu.Lock() @@ -35,7 +35,7 @@ func TestInitScheduler(t *testing.T) { } func TestLoad(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond) defer done() s := InitScheduler(ctx) var f *ggml.GGML // value not used in tests @@ -126,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est "tokenizer.ggml.tokens": []string{" "}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []ggml.Tensor{ + }, []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, })) @@ -167,7 +167,7 @@ func getCpuFn() discover.GpuInfoList { } func TestRequestsSameModelSameRequest(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := 
context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() s := InitScheduler(ctx) s.getGpuFn = getGpuFn @@ -210,7 +210,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) { } func TestRequestsSimpleReloadSameModel(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() s := InitScheduler(ctx) s.getGpuFn = getGpuFn @@ -258,7 +258,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) { } func TestRequestsMultipleLoadedModels(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() s := InitScheduler(ctx) s.getGpuFn = getGpuFn @@ -355,7 +355,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) { } func TestGetRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 3*time.Second) + ctx, done := context.WithTimeout(t.Context(), 3*time.Second) defer done() a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}) @@ -408,7 +408,7 @@ func TestGetRunner(t *testing.T) { } func TestExpireRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond) defer done() s := InitScheduler(ctx) req := &LlmRequest{ @@ -455,7 +455,7 @@ func TestExpireRunner(t *testing.T) { // TODO - add one scenario that triggers the bogus finished event with positive ref count func TestPrematureExpired(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() // Same model, same request @@ -502,7 +502,7 @@ func TestPrematureExpired(t *testing.T) { } func TestUseLoadedRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) req := &LlmRequest{ ctx: ctx, opts: api.DefaultOptions(), @@ -529,7 +529,7 @@ func TestUseLoadedRunner(t *testing.T) { } func TestUpdateFreeSpace(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() gpus := discover.GpuInfoList{ { @@ -562,7 +562,7 @@ func TestUpdateFreeSpace(t *testing.T) { } func TestFilterGPUsWithoutLoadingModels(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() gpus := discover.GpuInfoList{ { @@ -596,7 +596,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) { } func TestFindRunnerToUnload(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1} @@ -616,7 +616,7 @@ func TestFindRunnerToUnload(t *testing.T) { } func TestNeedsReload(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} @@ -663,7 +663,7 @@ func TestNeedsReload(t *testing.T) { } func TestUnloadAllRunners(t *testing.T) { - ctx, done := 
context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} @@ -695,7 +695,7 @@ func TestUnload(t *testing.T) { } func TestAlreadyCanceled(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() dctx, done2 := context.WithCancel(ctx) done2() @@ -712,7 +712,7 @@ func TestAlreadyCanceled(t *testing.T) { } func TestHomogeneousGPUs(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() s := InitScheduler(ctx) @@ -792,3 +792,4 @@ func (s *mockLlm) Close() error { func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal } func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] } +func (s *mockLlm) Pid() int { return -1 } diff --git a/types/syncmap/syncmap.go b/types/syncmap/syncmap.go new file mode 100644 index 000000000..ff21cd999 --- /dev/null +++ b/types/syncmap/syncmap.go @@ -0,0 +1,38 @@ +package syncmap + +import ( + "maps" + "sync" +) + +// SyncMap is a simple, generic thread-safe map implementation. +type SyncMap[K comparable, V any] struct { + mu sync.RWMutex + m map[K]V +} + +func NewSyncMap[K comparable, V any]() *SyncMap[K, V] { + return &SyncMap[K, V]{ + m: make(map[K]V), + } +} + +func (s *SyncMap[K, V]) Load(key K) (V, bool) { + s.mu.RLock() + defer s.mu.RUnlock() + val, ok := s.m[key] + return val, ok +} + +func (s *SyncMap[K, V]) Store(key K, value V) { + s.mu.Lock() + defer s.mu.Unlock() + s.m[key] = value +} + +func (s *SyncMap[K, V]) Items() map[K]V { + s.mu.RLock() + defer s.mu.RUnlock() + // shallow copy map items + return maps.Clone(s.m) +}
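
For context, a minimal usage sketch of the new types/syncmap package introduced above; the keys and values ("alpha", "beta") are illustrative only and not part of the change:

package main

import (
	"fmt"

	"github.com/ollama/ollama/types/syncmap"
)

func main() {
	// SyncMap works for any comparable key type and any value type.
	m := syncmap.NewSyncMap[string, int]()
	m.Store("alpha", 1)
	m.Store("beta", 2)

	if v, ok := m.Load("alpha"); ok {
		fmt.Println("alpha =", v) // alpha = 1
	}

	// Items returns a shallow copy, so callers can range over the
	// snapshot without holding the map's lock.
	for k, v := range m.Items() {
		fmt.Println(k, v)
	}
}

Having Items return a clone keeps lock hold times short, at the cost of an O(n) copy per call.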
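Similarly, the server/sched.go hunks above replace ad-hoc "modelPath" log attributes with a single "runner" attribute that slog expands through the new runnerRef.LogValue method (the standard slog.LogValuer interface). A standalone sketch of that pattern, with a toy type and made-up field values standing in for the real runnerRef:

package main

import (
	"log/slog"
	"os"
)

type runner struct {
	name string
	pid  int
}

// LogValue is invoked by slog whenever a *runner is passed as an
// attribute value, expanding it into a group of structured fields.
func (r *runner) LogValue() slog.Value {
	if r == nil {
		return slog.StringValue("nil")
	}
	return slog.GroupValue(
		slog.String("name", r.name),
		slog.Int("pid", r.pid),
	)
}

func main() {
	h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})
	slog.SetDefault(slog.New(h))
	// Renders roughly as:
	//   level=DEBUG msg="runner expired event received" runner.name=demo runner.pid=42
	slog.Debug("runner expired event received", "runner", &runner{name: "demo", pid: 42})
}

Centralizing the formatting in LogValue keeps every scheduler log line consistent without repeating the attribute list at each call site.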