Compare commits: v0.12.11-r...main (45 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 8b1b89a984 | |
| | 47e272c35a | |
| | 417a81fda3 | |
| | dba62ff3a5 | |
| | d70e935526 | |
| | 5c1063df7f | |
| | cb485b2019 | |
| | b2af50960f | |
| | eac5b8bfbd | |
| | 604e43b28d | |
| | 53985b3c4d | |
| | b6e02cbbd2 | |
| | 91935631ac | |
| | 8de30b568a | |
| | 485da9fd35 | |
| | 0796d79d19 | |
| | 92981ae3f2 | |
| | 8ed1adf3db | |
| | 440a3823a6 | |
| | 718961de68 | |
| | 330f62a7fa | |
| | 584e2d646f | |
| | 1fd4cb87b2 | |
| | 4aba2e8b72 | |
| | 2f36d769aa | |
| | 399eacf486 | |
| | 231cc878cb | |
| | aa676b313f | |
| | dd0ed0ef17 | |
| | d5649821ae | |
| | 4cea757e70 | |
| | a751bc159c | |
| | 5d31242fbf | |
| | d7fd72193f | |
| | 72ff5b9d8c | |
| | ce29f695b4 | |
| | 12b174b10e | |
| | 333203d871 | |
| | c114987523 | |
| | b48083f33f | |
| | 482bec824f | |
| | 684a9a8c5a | |
| | 54a76d3773 | |
| | 8a75d8b015 | |
| | f206357412 | |
@@ -15,6 +15,8 @@ ml/backend/**/*.cu linguist-vendored
ml/backend/**/*.cuh linguist-vendored
ml/backend/**/*.m linguist-vendored
ml/backend/**/*.metal linguist-vendored
+ml/backend/**/*.comp linguist-vendored
+ml/backend/**/*.glsl linguist-vendored
ml/backend/**/CMakeLists.txt linguist-vendored

llama/build-info.cpp linguist-generated
@@ -366,6 +366,7 @@ jobs:
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+lib/ollama/vulkan*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
@@ -226,12 +226,9 @@ jobs:
if: always()
run: go test -count=1 -benchtime=1x ./...

-# TODO(bmizerany): replace this heavy tool with just the
-# tools/checks/binaries we want and then make them all run in parallel
-# across jobs, not on a single tiny vm on Github Actions.
-- uses: golangci/golangci-lint-action@v6
+- uses: golangci/golangci-lint-action@v9
with:
args: --timeout 10m0s -v
only-new-issues: true

patches:
runs-on: ubuntu-latest
@@ -240,4 +237,4 @@ jobs:
- name: Verify patches apply cleanly and do not change files
run: |
make -f Makefile.sync clean checkout apply-patches sync
git diff --compact-summary --exit-code
@@ -1,41 +1,77 @@
run:
timeout: 5m
version: "2"
linters:
default: none
enable:
- asasalint
- bidichk
- bodyclose
- containedctx
- copyloopvar
- errcheck
- errorlint
- exptostd
- gocheckcompilerdirectives
- gofmt
- gofumpt
- gosimple
- gocritic
- govet
- ineffassign
- intrange
- makezero
- misspell
- modernize
- nilerr
- nilnil
- nolintlint
- nosprintfhostport
- perfsprint
- prealloc
- sloglint
- staticcheck
- unconvert
- unused
- usestdlibvars
- usetesting
- wastedassign
- whitespace
disable:
- usestdlibvars
- errcheck
linters-settings:
staticcheck:
checks:
- all
- -SA1019 # omit Deprecated check
severity:
default-severity: error
rules:
- linters:
- gofmt
- goimports
- intrange
severity: info
settings:
errcheck:
exclude-functions:
- fmt.Fprintf
perfsprint:
strconcat: false
concat-loop: false
staticcheck:
checks:
- all
# Using a deprecated function, variable, constant or field.
# https://staticcheck.dev/docs/checks/#SA1019
- -SA1019
# Incorrect or missing package comment.
# https://staticcheck.dev/docs/checks/#ST1000
- -ST1000
# Poorly chosen identifier.
# https://staticcheck.dev/docs/checks/#ST1003
- -ST1003
# The documentation of an exported function should start with the function's name.
# https://staticcheck.dev/docs/checks/#ST1020
- -ST1020
# The documentation of an exported type should start with type's name.
# https://staticcheck.dev/docs/checks/#ST1021
- -ST1021
# The documentation of an exported variable or constant should start with variable's name.
# https://staticcheck.dev/docs/checks/#ST1022
- -ST1022
usestdlibvars:
http-method: false
http-status-code: false

formatters:
enable:
- gci
- gofmt
- gofumpt
settings:
gci:
sections:
- standard
- default
- localmodule
@@ -16,7 +16,7 @@ See the [development documentation](./docs/development.md) for instructions on h

* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
-* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
+* Documentation: small updates to fill in or correct missing documentation are helpful, however large documentation additions can be hard to maintain over time.

### Issues that may not be accepted
@@ -43,7 +43,7 @@ Tips for proposals:
* Explain how the change will be tested.

Additionally, for bonus points: Provide draft documentation you would expect to
-see if the change were accepted.
+see if the changes were accepted.

## Pull requests
@@ -66,7 +66,6 @@ Examples:

llm/backend/mlx: support the llama architecture
CONTRIBUTING: provide clarity on good commit messages, and bad
docs: simplify manual installation with shorter curl commands

Bad Examples:
Dockerfile (14 lines changed)
@@ -39,14 +39,14 @@ ENV CC=clang CXX=clang++
FROM base-${TARGETARCH} AS base
ARG CMAKEVERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
-COPY CMakeLists.txt CMakePresets.json .
-COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
ENV LDFLAGS=-s

FROM base AS cpu
RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CPU' \
&& cmake --build --parallel ${PARALLEL} --preset 'CPU' \

@@ -57,6 +57,8 @@ ARG CUDA11VERSION=11.8
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' \
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \

@@ -67,6 +69,8 @@ ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 12' \
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \

@@ -78,6 +82,8 @@ ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
ENV PATH=/usr/local/cuda-13/bin:$PATH
ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 13' \
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \

@@ -87,6 +93,8 @@ RUN --mount=type=cache,target=/root/.ccache \
FROM base AS rocm-6
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'ROCm 6' \
&& cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \

@@ -118,6 +126,8 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --install build --component CUDA --strip --parallel ${PARALLEL}

FROM base AS vulkan
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'Vulkan' \
&& cmake --build --parallel --preset 'Vulkan' \
README.md (15 lines changed)
@@ -299,6 +299,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
+- [AI-UI](https://github.com/bajahaw/ai-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)

@@ -365,7 +366,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
-- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
+- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VS Code extension for multi-file/whole-repo coding
+- [Void](https://github.com/voideditor/void) (Open source AI code editor and Cursor alternative)
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)

@@ -397,7 +399,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
- [Ollama Chat WebUI for Docker](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
-- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
+- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VS Code extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
- [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)

@@ -426,6 +428,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
+- [KDeps](https://github.com/kdeps/kdeps) (Kdeps is an offline-first AI framework for building Dockerized full-stack AI applications declaratively using Apple PKL and integrates APIs with Ollama on the backend.)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
- [Hillnote](https://hillnote.com) (A Markdown-first workspace designed to supercharge your AI workflow. Create documents ready to integrate with Claude, ChatGPT, Gemini, Cursor, and more - all while keeping your work on your device.)

@@ -615,7 +618,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
-- [AI Summmary Helper plugin](https://github.com/philffm/ai-summary-helper)
+- [AI Summary Helper plugin](https://github.com/philffm/ai-summary-helper)
- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
- [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language

@@ -623,7 +626,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
-- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
+- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)

@@ -633,12 +636,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.

### Observability
-- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
+- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration to Ollama.
- [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.

-## Security
+### Security
- [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)
@@ -14,7 +14,7 @@ Please include the following details in your report:

## Security best practices

-While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
+While the maintainer team does its best to secure Ollama, users are encouraged to implement their own security best practices, such as:

- Regularly updating to the latest version of Ollama
- Securing access to hosted instances of Ollama
@@ -366,6 +366,9 @@ type TokenLogprob struct {

	// Logprob is the log probability of this token.
	Logprob float64 `json:"logprob"`
+
+	// Bytes contains the raw byte representation of the token
+	Bytes []int `json:"bytes,omitempty"`
}

// Logprob contains log probability information for a generated token.
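For orientation, here is a minimal sketch (not part of the change set) of how a caller might use the new `Bytes` field; the helper name `tokenText` and the sample values are illustrative, and only the two fields visible in the hunk above are assumed to exist on `api.TokenLogprob`:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
)

// tokenText reassembles a token's raw text from TokenLogprob.Bytes,
// assuming one int per raw byte, as the `bytes` JSON tag suggests.
func tokenText(tl api.TokenLogprob) string {
	b := make([]byte, len(tl.Bytes))
	for i, v := range tl.Bytes {
		b[i] = byte(v)
	}
	return string(b)
}

func main() {
	tl := api.TokenLogprob{Logprob: -0.12, Bytes: []int{72, 105}}
	fmt.Println(tokenText(tl), tl.Logprob) // prints: Hi -0.12
}
```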
@@ -397,8 +397,8 @@ func checkUserLoggedIn(uiServerPort int) bool {
// handleConnectURLScheme fetches the connect URL and opens it in the browser
func handleConnectURLScheme() {
	if checkUserLoggedIn(uiServerPort) {
-		slog.Info("user is already logged in, opening settings instead")
-		sendUIRequestMessage("/")
+		slog.Info("user is already logged in, opening app instead")
+		showWindow(wv.webview.Window())
		return
	}
@@ -434,37 +434,30 @@ func openInBrowser(url string) {
	}
}

-// parseURLScheme parses an ollama:// URL and returns whether it's a connect URL and the UI path
-func parseURLScheme(urlSchemeRequest string) (isConnect bool, uiPath string, err error) {
+// parseURLScheme parses an ollama:// URL and validates it
+// Supports: ollama:// (open app) and ollama://connect (OAuth)
+func parseURLScheme(urlSchemeRequest string) (isConnect bool, err error) {
	parsedURL, err := url.Parse(urlSchemeRequest)
	if err != nil {
-		return false, "", err
+		return false, fmt.Errorf("invalid URL: %w", err)
	}

	// Check if this is a connect URL
	if parsedURL.Host == "connect" || strings.TrimPrefix(parsedURL.Path, "/") == "connect" {
-		return true, "", nil
+		return true, nil
	}

-	// Extract the UI path
-	path := "/"
-	if parsedURL.Path != "" && parsedURL.Path != "/" {
-		// For URLs like ollama:///settings, use the path directly
-		path = parsedURL.Path
-	} else if parsedURL.Host != "" {
-		// For URLs like ollama://settings (without triple slash),
-		// the "settings" part is parsed as the host, not the path.
-		// We need to convert it to a path by prepending "/"
-		// This also handles ollama://settings/ where Windows adds a trailing slash
-		path = "/" + parsedURL.Host
+	// Allow bare ollama:// or ollama:/// to open the app
+	if (parsedURL.Host == "" && parsedURL.Path == "") || parsedURL.Path == "/" {
+		return false, nil
	}

-	return false, path, nil
+	return false, fmt.Errorf("unsupported ollama:// URL path: %s", urlSchemeRequest)
}

// handleURLSchemeInCurrentInstance processes URL scheme requests in the current instance
func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
-	isConnect, uiPath, err := parseURLScheme(urlSchemeRequest)
+	isConnect, err := parseURLScheme(urlSchemeRequest)
	if err != nil {
		slog.Error("failed to parse URL scheme request", "url", urlSchemeRequest, "error", err)
		return
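The host-versus-path subtlety that the removed comments describe is easy to check with net/url directly; a small illustrative sketch (not part of the diff):

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// "ollama://connect" parses "connect" into Host, while
	// "ollama:///connect" (triple slash) parses it into Path.
	for _, s := range []string{"ollama://connect", "ollama:///connect", "ollama://"} {
		u, err := url.Parse(s)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%-20s host=%q path=%q\n", s, u.Host, u.Path)
	}
}
```

This is why the new parseURLScheme accepts "connect" in either position.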
@@ -473,6 +466,8 @@ func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
	if isConnect {
		handleConnectURLScheme()
	} else {
-		sendUIRequestMessage(uiPath)
+		if wv.webview != nil {
+			showWindow(wv.webview.Window())
+		}
	}
}
@@ -24,27 +24,14 @@ bool firstTimeRun,startHidden; // Set in run before initialization
for (NSURL *url in urls) {
	if ([url.scheme isEqualToString:@"ollama"]) {
		NSString *path = url.path;
-		if (!path || [path isEqualToString:@""]) {
-			// For URLs like ollama://settings (without triple slash),
-			// the "settings" part is parsed as the host, not the path.
-			// We need to convert it to a path by prepending "/"
-			if (url.host && ![url.host isEqualToString:@""]) {
-				path = [@"/" stringByAppendingString:url.host];
-			} else {
-				path = @"/";
-			}
-		}
-
-		if ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"]) {
+		if (path && ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"])) {
			// Special case: handle connect by opening browser instead of app
			handleConnectURL();
		} else {
			// Set app to be active and visible
			[NSApp setActivationPolicy:NSApplicationActivationPolicyRegular];
			[NSApp activateIgnoringOtherApps:YES];

			// Open the path with the UI
			[self uiRequest:path];
		}

		break;

@@ -260,7 +247,7 @@ bool firstTimeRun,startHidden; // Set in run before initialization
}

- (void)openHelp:(id)sender {
-	NSURL *url = [NSURL URLWithString:@"https://github.com/ollama/ollama/tree/main/docs"];
+	NSURL *url = [NSURL URLWithString:@"https://docs.ollama.com/"];
	[[NSWorkspace sharedWorkspace] openURL:url];
}
@@ -138,7 +138,7 @@ func (app *appCallbacks) HandleURLScheme(urlScheme string) {

// handleURLSchemeRequest processes URL scheme requests from other instances
func handleURLSchemeRequest(urlScheme string) {
-	isConnect, uiPath, err := parseURLScheme(urlScheme)
+	isConnect, err := parseURLScheme(urlScheme)
	if err != nil {
		slog.Error("failed to parse URL scheme request", "url", urlScheme, "error", err)
		return

@@ -147,7 +147,9 @@ func handleURLSchemeRequest(urlScheme string) {
	if isConnect {
		handleConnectURLScheme()
	} else {
-		sendUIRequestMessage(uiPath)
+		if wv.webview != nil {
+			showWindow(wv.webview.Window())
+		}
	}
}
@@ -15,6 +15,7 @@ import {
import { parseJsonlFromResponse } from "./util/jsonl-parsing";
import { ollamaClient as ollama } from "./lib/ollama-client";
import type { ModelResponse } from "ollama/browser";
+import { API_BASE } from "./lib/config";

// Extend Model class with utility methods
declare module "@/gotypes" {

@@ -27,8 +28,6 @@ Model.prototype.isCloud = function (): boolean {
  return this.model.endsWith("cloud");
};

-const API_BASE = import.meta.env.DEV ? "http://127.0.0.1:3001" : "";
-
// Helper function to convert Uint8Array to base64
function uint8ArrayToBase64(uint8Array: Uint8Array): string {
  const chunkSize = 0x8000; // 32KB chunks to avoid stack overflow
@@ -0,0 +1,10 @@
// API configuration
const DEV_API_URL = "http://127.0.0.1:3001";

// Base URL for fetch API calls (can be relative in production)
export const API_BASE = import.meta.env.DEV ? DEV_API_URL : "";

// Full host URL for Ollama client (needs full origin in production)
export const OLLAMA_HOST = import.meta.env.DEV
  ? DEV_API_URL
  : window.location.origin;
@@ -1,4 +1,5 @@
import { Ollama } from "ollama/browser";
+import { OLLAMA_HOST } from "./config";

let _ollamaClient: Ollama | null = null;

@@ -6,7 +7,7 @@ export const ollamaClient = new Proxy({} as Ollama, {
  get(_target, prop) {
    if (!_ollamaClient) {
      _ollamaClient = new Ollama({
-        host: window.location.origin,
+        host: OLLAMA_HOST,
      });
    }
    const value = _ollamaClient[prop as keyof Ollama];
@@ -0,0 +1,114 @@
Ollama Benchmark Tool
---------------------

A Go-based command-line tool for benchmarking Ollama models with configurable parameters and multiple output formats.

## Features

* Benchmark multiple models in a single run
* Support for both text and image prompts
* Configurable generation parameters (temperature, max tokens, seed, etc.)
* Supports markdown, benchstat, and CSV output formats
* Detailed performance metrics (prefill, generate, load, total durations)

## Building from Source

```
go build -o bench bench.go
./bench -model gpt-oss:20b -epochs 6 -format csv
```

### Using Go Run (without building)

```
go run bench.go -model gpt-oss:20b -epochs 3
```

## Usage

### Basic Example

```
./bench -model gemma3 -epochs 6
```

### Benchmark Multiple Models

```
./bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
benchstat -col /name gemma.bench
```

### With Image Prompt

```
./bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
```

### Advanced Example

```
./bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
```

## Command Line Options

| Option | Description | Default |
|---|---|---|
| -model | Comma-separated list of models to benchmark | (required) |
| -epochs | Number of iterations per model | 6 |
| -max-tokens | Maximum tokens for model response | 200 |
| -temperature | Temperature parameter | 0.0 |
| -seed | Random seed | 0 (random) |
| -timeout | Timeout in seconds | 300 |
| -p | Prompt text | (built-in story prompt) |
| -image | Image file to include in prompt | |
| -k | Keep-alive duration in seconds | 0 |
| -format | Output format (markdown, benchstat, csv) | markdown |
| -output | Output file for results | "" (stdout) |
| -v | Verbose mode | false |
| -debug | Show debug information | false |

## Output Formats

### Markdown Format

The default markdown format is suitable for copying and pasting into a GitHub issue and will look like:

```
| Model | Step | Count | Duration | nsPerToken | tokensPerSec |
|-------|------|-------|----------|------------|--------------|
| gpt-oss:20b | prefill | 124 | 30.006458ms | 241987.56 | 4132.44 |
| gpt-oss:20b | generate | 200 | 2.646843954s | 13234219.77 | 75.56 |
| gpt-oss:20b | load | 1 | 121.674208ms | - | - |
| gpt-oss:20b | total | 1 | 2.861047625s | - | - |
```

### Benchstat Format

Compatible with Go's benchstat tool for statistical analysis:

```
BenchmarkModel/name=gpt-oss:20b/step=prefill 128 78125.00 ns/token 12800.00 token/sec
BenchmarkModel/name=gpt-oss:20b/step=generate 512 19531.25 ns/token 51200.00 token/sec
BenchmarkModel/name=gpt-oss:20b/step=load 1 1500000000 ns/request
```

### CSV Format

Machine-readable comma-separated values:

```
NAME,STEP,COUNT,NS_PER_COUNT,TOKEN_PER_SEC
gpt-oss:20b,prefill,128,78125.00,12800.00
gpt-oss:20b,generate,512,19531.25,51200.00
gpt-oss:20b,load,1,1500000000,0
```

## Metrics Explained

The tool reports four types of metrics for each model:

* prefill: Time spent processing the prompt
* generate: Time spent generating the response
* load: Model loading time (one-time cost)
* total: Total request duration
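As a rough sketch of how these rates relate to the durations the tool collects (illustrative only; the calculation mirrors what bench.go, shown below, applies to the prefill and generate steps):

```go
package main

import (
	"fmt"
	"time"
)

// tokensPerSec converts a token count and its elapsed duration into a rate.
func tokensPerSec(count int, d time.Duration) float64 {
	if d <= 0 {
		return 0
	}
	return float64(count) / d.Seconds()
}

func main() {
	// 200 generated tokens in 2.646843954s ≈ 75.56 token/sec,
	// matching the sample markdown row above.
	fmt.Printf("%.2f token/sec\n", tokensPerSec(200, 2646843954*time.Nanosecond))
}
```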
@@ -0,0 +1,309 @@
package main

import (
	"cmp"
	"context"
	"flag"
	"fmt"
	"io"
	"os"
	"runtime"
	"slices"
	"strings"
	"sync"
	"time"

	"github.com/ollama/ollama/api"
)

type flagOptions struct {
	models      *string
	epochs      *int
	maxTokens   *int
	temperature *float64
	seed        *int
	timeout     *int
	prompt      *string
	imageFile   *string
	keepAlive   *float64
	format      *string
	outputFile  *string
	debug       *bool
	verbose     *bool
}

type Metrics struct {
	Model    string
	Step     string
	Count    int
	Duration time.Duration
}

var once sync.Once

const DefaultPrompt = `Please write a descriptive story about a llama named Alonso who grows up to be President of the Land of Llamas. Include details about Alonso's childhood, adolescent years, and how he grew up to be a political mover and shaker. Write the story with a sense of whimsy.`

func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool) {
	switch format {
	case "benchstat":
		if verbose {
			printHeader := func() {
				fmt.Printf("sysname: %s\n", runtime.GOOS)
				fmt.Printf("machine: %s\n", runtime.GOARCH)
			}
			once.Do(printHeader)
		}
		for _, m := range metrics {
			if m.Step == "generate" || m.Step == "prefill" {
				if m.Count > 0 {
					nsPerToken := float64(m.Duration.Nanoseconds()) / float64(m.Count)
					tokensPerSec := float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9

					fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d %.2f ns/token %.2f token/sec\n",
						m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
				} else {
					fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d 0 ns/token 0 token/sec\n",
						m.Model, m.Step, m.Count)
				}
			} else {
				var suffix string
				if m.Step == "load" {
					suffix = "/step=load"
				}
				fmt.Fprintf(w, "BenchmarkModel/name=%s%s 1 %d ns/request\n",
					m.Model, suffix, m.Duration.Nanoseconds())
			}
		}
	case "csv":
		printHeader := func() {
			headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
			fmt.Fprintln(w, strings.Join(headings, ","))
		}
		once.Do(printHeader)

		for _, m := range metrics {
			if m.Step == "generate" || m.Step == "prefill" {
				var nsPerToken float64
				var tokensPerSec float64
				if m.Count > 0 {
					nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
					tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
				}
				fmt.Fprintf(w, "%s,%s,%d,%.2f,%.2f\n", m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
			} else {
				fmt.Fprintf(w, "%s,%s,1,%d,0\n", m.Model, m.Step, m.Duration.Nanoseconds())
			}
		}
	case "markdown":
		printHeader := func() {
			fmt.Fprintln(w, "| Model | Step | Count | Duration | nsPerToken | tokensPerSec |")
			fmt.Fprintln(w, "|-------|------|-------|----------|------------|--------------|")
		}
		once.Do(printHeader)

		for _, m := range metrics {
			var nsPerToken, tokensPerSec float64
			var nsPerTokenStr, tokensPerSecStr string

			if m.Step == "generate" || m.Step == "prefill" {
				nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
				tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
				nsPerTokenStr = fmt.Sprintf("%.2f", nsPerToken)
				tokensPerSecStr = fmt.Sprintf("%.2f", tokensPerSec)
			} else {
				nsPerTokenStr = "-"
				tokensPerSecStr = "-"
			}

			fmt.Fprintf(w, "| %s | %s | %d | %v | %s | %s |\n",
				m.Model, m.Step, m.Count, m.Duration, nsPerTokenStr, tokensPerSecStr)
		}
	default:
		fmt.Fprintf(os.Stderr, "Unknown output format '%s'\n", format)
	}
}

func BenchmarkChat(fOpt flagOptions) error {
	models := strings.Split(*fOpt.models, ",")

	// Write results to the requested output file, or stdout by default.
	out := io.Writer(os.Stdout)
	if *fOpt.outputFile != "" {
		f, err := os.Create(*fOpt.outputFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "ERROR: Couldn't create output file '%s': %v\n", *fOpt.outputFile, err)
			return err
		}
		defer f.Close()
		out = f
	}

	// todo - add multi-image support
	var imgData api.ImageData
	var err error
	if *fOpt.imageFile != "" {
		imgData, err = readImage(*fOpt.imageFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "ERROR: Couldn't read image '%s': %v\n", *fOpt.imageFile, err)
			return err
		}
	}

	if *fOpt.debug && imgData != nil {
		fmt.Fprintf(os.Stderr, "Read file '%s'\n", *fOpt.imageFile)
	}

	client, err := api.ClientFromEnvironment()
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: Couldn't create ollama client: %v\n", err)
		return err
	}

	for _, model := range models {
		for range *fOpt.epochs {
			options := make(map[string]interface{})
			if *fOpt.maxTokens > 0 {
				options["num_predict"] = *fOpt.maxTokens
			}
			options["temperature"] = *fOpt.temperature
			if fOpt.seed != nil && *fOpt.seed > 0 {
				options["seed"] = *fOpt.seed
			}

			var keepAliveDuration *api.Duration
			if *fOpt.keepAlive > 0 {
				duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
				keepAliveDuration = &duration
			}

			req := &api.ChatRequest{
				Model: model,
				Messages: []api.Message{
					{
						Role:    "user",
						Content: *fOpt.prompt,
					},
				},
				Options:   options,
				KeepAlive: keepAliveDuration,
			}

			if imgData != nil {
				req.Messages[0].Images = []api.ImageData{imgData}
			}

			var responseMetrics *api.Metrics

			ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
			defer cancel()

			err = client.Chat(ctx, req, func(resp api.ChatResponse) error {
				if *fOpt.debug {
					fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Message.Thinking, resp.Message.Content))
				}

				if resp.Done {
					responseMetrics = &resp.Metrics
				}
				return nil
			})

			if *fOpt.debug {
				fmt.Fprintln(os.Stderr)
			}

			if err != nil {
				if ctx.Err() == context.DeadlineExceeded {
					fmt.Fprintf(os.Stderr, "ERROR: Chat request timed out with model '%s' after %ds\n", model, *fOpt.timeout)
					continue
				}
				fmt.Fprintf(os.Stderr, "ERROR: Couldn't chat with model '%s': %v\n", model, err)
				continue
			}

			if responseMetrics == nil {
				fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
				continue
			}

			metrics := []Metrics{
				{
					Model:    model,
					Step:     "prefill",
					Count:    responseMetrics.PromptEvalCount,
					Duration: responseMetrics.PromptEvalDuration,
				},
				{
					Model:    model,
					Step:     "generate",
					Count:    responseMetrics.EvalCount,
					Duration: responseMetrics.EvalDuration,
				},
				{
					Model:    model,
					Step:     "load",
					Count:    1,
					Duration: responseMetrics.LoadDuration,
				},
				{
					Model:    model,
					Step:     "total",
					Count:    1,
					Duration: responseMetrics.TotalDuration,
				},
			}

			OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)

			if *fOpt.keepAlive > 0 {
				time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
			}
		}
	}
	return nil
}

func readImage(filePath string) (api.ImageData, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	data, err := io.ReadAll(file)
	if err != nil {
		return nil, err
	}

	return api.ImageData(data), nil
}

func main() {
	fOpt := flagOptions{
		models:      flag.String("model", "", "Model to benchmark"),
		epochs:      flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
		maxTokens:   flag.Int("max-tokens", 200, "Maximum tokens for model response"),
		temperature: flag.Float64("temperature", 0, "Temperature parameter"),
		seed:        flag.Int("seed", 0, "Random seed"),
		timeout:     flag.Int("timeout", 60*5, "Timeout in seconds"),
		prompt:      flag.String("p", DefaultPrompt, "Prompt to use"),
		imageFile:   flag.String("image", "", "Filename for an image to include"),
		keepAlive:   flag.Float64("k", 0, "Keep alive duration in seconds"),
		format:      flag.String("format", "markdown", "Output format [markdown|benchstat|csv]"),
		outputFile:  flag.String("output", "", "Output file for results (stdout if empty)"),
		verbose:     flag.Bool("v", false, "Show system information"),
		debug:       flag.Bool("debug", false, "Show debug information"),
	}

	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS]\n\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Description:\n")
		fmt.Fprintf(os.Stderr, "  Model benchmarking tool with configurable parameters\n\n")
		fmt.Fprintf(os.Stderr, "Options:\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExamples:\n")
		fmt.Fprintf(os.Stderr, "  bench -model gpt-oss:20b -epochs 3 -temperature 0.7\n")
	}
	flag.Parse()

	if !slices.Contains([]string{"markdown", "benchstat", "csv"}, *fOpt.format) {
		fmt.Fprintf(os.Stderr, "ERROR: Unknown format '%s'\n", *fOpt.format)
		os.Exit(1)
	}

	if len(*fOpt.models) == 0 {
		fmt.Fprintf(os.Stderr, "ERROR: No model(s) specified to benchmark.\n")
		flag.Usage()
		return
	}

	if err := BenchmarkChat(fOpt); err != nil {
		os.Exit(1)
	}
}
@@ -0,0 +1,463 @@
package main

import (
	"bytes"
	"crypto/rand"
	"encoding/json"
	"io"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
)

func createTestFlagOptions() flagOptions {
	models := "test-model"
	format := "benchstat"
	epochs := 1
	maxTokens := 100
	temperature := 0.7
	seed := 42
	timeout := 30
	prompt := "test prompt"
	imageFile := ""
	keepAlive := 5.0
	verbose := false
	debug := false

	return flagOptions{
		models:      &models,
		format:      &format,
		epochs:      &epochs,
		maxTokens:   &maxTokens,
		temperature: &temperature,
		seed:        &seed,
		timeout:     &timeout,
		prompt:      &prompt,
		imageFile:   &imageFile,
		keepAlive:   &keepAlive,
		verbose:     &verbose,
		debug:       &debug,
	}
}

func captureOutput(f func()) string {
	oldStdout := os.Stdout
	oldStderr := os.Stderr
	defer func() {
		os.Stdout = oldStdout
		os.Stderr = oldStderr
	}()

	r, w, _ := os.Pipe()
	os.Stdout = w
	os.Stderr = w

	f()

	w.Close()
	var buf bytes.Buffer
	io.Copy(&buf, r)
	return buf.String()
}

func createMockOllamaServer(t *testing.T, responses []api.ChatResponse) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/chat" {
			t.Errorf("Expected path /api/chat, got %s", r.URL.Path)
			http.Error(w, "Not found", http.StatusNotFound)
			return
		}

		if r.Method != "POST" {
			t.Errorf("Expected POST method, got %s", r.Method)
			http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
			return
		}

		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)

		for _, resp := range responses {
			jsonData, err := json.Marshal(resp)
			if err != nil {
				t.Errorf("Failed to marshal response: %v", err)
				return
			}
			w.Write(jsonData)
			w.Write([]byte("\n"))
			if f, ok := w.(http.Flusher); ok {
				f.Flush()
			}
			time.Sleep(10 * time.Millisecond) // Simulate some delay
		}
	}))
}

func TestBenchmarkChat_Success(t *testing.T) {
	fOpt := createTestFlagOptions()

	mockResponses := []api.ChatResponse{
		{
			Model: "test-model",
			Message: api.Message{
				Role:    "assistant",
				Content: "test response part 1",
			},
			Done: false,
		},
		{
			Model: "test-model",
			Message: api.Message{
				Role:    "assistant",
				Content: "test response part 2",
			},
			Done: true,
			Metrics: api.Metrics{
				PromptEvalCount:    10,
				PromptEvalDuration: 100 * time.Millisecond,
				EvalCount:          50,
				EvalDuration:       500 * time.Millisecond,
				TotalDuration:      600 * time.Millisecond,
				LoadDuration:       50 * time.Millisecond,
			},
		},
	}

	server := createMockOllamaServer(t, mockResponses)
	defer server.Close()

	t.Setenv("OLLAMA_HOST", server.URL)

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err != nil {
			t.Errorf("Expected no error, got %v", err)
		}
	})

	if !strings.Contains(output, "BenchmarkModel/name=test-model/step=prefill") {
		t.Errorf("Expected output to contain prefill metrics, got: %s", output)
	}
	if !strings.Contains(output, "BenchmarkModel/name=test-model/step=generate") {
		t.Errorf("Expected output to contain generate metrics, got: %s", output)
	}
	if !strings.Contains(output, "ns/token") {
		t.Errorf("Expected output to contain ns/token metric, got: %s", output)
	}
}

func TestBenchmarkChat_ServerError(t *testing.T) {
	fOpt := createTestFlagOptions()

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Error(w, "Internal server error", http.StatusInternalServerError)
	}))
	defer server.Close()

	t.Setenv("OLLAMA_HOST", server.URL)

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err != nil {
			t.Errorf("Expected error to be handled internally, got returned error: %v", err)
		}
	})

	if !strings.Contains(output, "ERROR: Couldn't chat with model") {
		t.Errorf("Expected error message about chat failure, got: %s", output)
	}
}

func TestBenchmarkChat_Timeout(t *testing.T) {
	fOpt := createTestFlagOptions()
	shortTimeout := 1 // Very short timeout
	fOpt.timeout = &shortTimeout

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Simulate a long delay that will cause timeout
		time.Sleep(2 * time.Second)

		w.Header().Set("Content-Type", "application/json")
		response := api.ChatResponse{
			Model: "test-model",
			Message: api.Message{
				Role:    "assistant",
				Content: "test response",
			},
			Done: true,
			Metrics: api.Metrics{
				PromptEvalCount:    10,
				PromptEvalDuration: 100 * time.Millisecond,
				EvalCount:          50,
				EvalDuration:       500 * time.Millisecond,
				TotalDuration:      600 * time.Millisecond,
				LoadDuration:       50 * time.Millisecond,
			},
		}
		jsonData, _ := json.Marshal(response)
		w.Write(jsonData)
	}))
	defer server.Close()

	t.Setenv("OLLAMA_HOST", server.URL)

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err != nil {
			t.Errorf("Expected timeout to be handled internally, got returned error: %v", err)
		}
	})

	if !strings.Contains(output, "ERROR: Chat request timed out") {
		t.Errorf("Expected timeout error message, got: %s", output)
	}
}

func TestBenchmarkChat_NoMetrics(t *testing.T) {
	fOpt := createTestFlagOptions()

	mockResponses := []api.ChatResponse{
		{
			Model: "test-model",
			Message: api.Message{
				Role:    "assistant",
				Content: "test response",
			},
			Done: false, // Never sends Done=true
		},
	}

	server := createMockOllamaServer(t, mockResponses)
	defer server.Close()

	t.Setenv("OLLAMA_HOST", server.URL)

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err != nil {
			t.Errorf("Expected no error, got %v", err)
		}
	})

	if !strings.Contains(output, "ERROR: No metrics received") {
		t.Errorf("Expected no metrics error message, got: %s", output)
	}
}

func TestBenchmarkChat_MultipleModels(t *testing.T) {
	fOpt := createTestFlagOptions()
	models := "model1,model2"
	epochs := 2
	fOpt.models = &models
	fOpt.epochs = &epochs

	callCount := 0
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		callCount++

		w.Header().Set("Content-Type", "application/json")

		var req api.ChatRequest
		body, _ := io.ReadAll(r.Body)
		json.Unmarshal(body, &req)

		response := api.ChatResponse{
			Model: req.Model,
			Message: api.Message{
				Role:    "assistant",
				Content: "test response for " + req.Model,
			},
			Done: true,
			Metrics: api.Metrics{
				PromptEvalCount:    10,
				PromptEvalDuration: 100 * time.Millisecond,
				EvalCount:          50,
				EvalDuration:       500 * time.Millisecond,
				TotalDuration:      600 * time.Millisecond,
				LoadDuration:       50 * time.Millisecond,
			},
		}
		jsonData, _ := json.Marshal(response)
		w.Write(jsonData)
	}))
	defer server.Close()

	t.Setenv("OLLAMA_HOST", server.URL)

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err != nil {
			t.Errorf("Expected no error, got %v", err)
		}
	})

	// Should be called 4 times (2 models × 2 epochs)
	if callCount != 4 {
		t.Errorf("Expected 4 API calls, got %d", callCount)
	}

	if !strings.Contains(output, "BenchmarkModel/name=model1") || !strings.Contains(output, "BenchmarkModel/name=model2") {
		t.Errorf("Expected output for both models, got: %s", output)
	}
}

func TestBenchmarkChat_WithImage(t *testing.T) {
	fOpt := createTestFlagOptions()

	tmpfile, err := os.CreateTemp(t.TempDir(), "testimage")
	if err != nil {
		t.Fatalf("Failed to create temp file: %v", err)
	}
	defer os.Remove(tmpfile.Name())

	content := []byte("fake image data")
	if _, err := tmpfile.Write(content); err != nil {
		t.Fatalf("Failed to write to temp file: %v", err)
	}
	tmpfile.Close()

	tmpfileName := tmpfile.Name()
	fOpt.imageFile = &tmpfileName

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Verify the request contains image data
		var req api.ChatRequest
		body, _ := io.ReadAll(r.Body)
		json.Unmarshal(body, &req)

		if len(req.Messages) == 0 || len(req.Messages[0].Images) == 0 {
			t.Error("Expected request to contain images")
		}

		w.Header().Set("Content-Type", "application/json")
		response := api.ChatResponse{
			Model: "test-model",
			Message: api.Message{
				Role:    "assistant",
				Content: "test response with image",
			},
			Done: true,
			Metrics: api.Metrics{
				PromptEvalCount:    10,
				PromptEvalDuration: 100 * time.Millisecond,
				EvalCount:          50,
				EvalDuration:       500 * time.Millisecond,
				TotalDuration:      600 * time.Millisecond,
				LoadDuration:       50 * time.Millisecond,
			},
		}
		jsonData, _ := json.Marshal(response)
		w.Write(jsonData)
	}))
	defer server.Close()

	t.Setenv("OLLAMA_HOST", server.URL)

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err != nil {
			t.Errorf("Expected no error, got %v", err)
		}
	})

	if !strings.Contains(output, "BenchmarkModel/name=test-model") {
		t.Errorf("Expected benchmark output, got: %s", output)
	}
}

func TestBenchmarkChat_ImageError(t *testing.T) {
	randFileName := func() string {
		const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
		const length = 8

		result := make([]byte, length)
		rand.Read(result) // Fill with random bytes

		for i := range result {
			result[i] = charset[result[i]%byte(len(charset))]
		}

		return string(result) + ".txt"
	}

	fOpt := createTestFlagOptions()
	imageFile := randFileName()
	fOpt.imageFile = &imageFile

	output := captureOutput(func() {
		err := BenchmarkChat(fOpt)
		if err == nil {
			t.Error("Expected error from image reading, got nil")
		}
	})

	if !strings.Contains(output, "ERROR: Couldn't read image") {
		t.Errorf("Expected image read error message, got: %s", output)
	}
}

func TestReadImage_Success(t *testing.T) {
	tmpfile, err := os.CreateTemp(t.TempDir(), "testimage")
	if err != nil {
		t.Fatalf("Failed to create temp file: %v", err)
	}
	defer os.Remove(tmpfile.Name())

	content := []byte("fake image data")
	if _, err := tmpfile.Write(content); err != nil {
		t.Fatalf("Failed to write to temp file: %v", err)
	}
	tmpfile.Close()

	imgData, err := readImage(tmpfile.Name())
	if err != nil {
		t.Errorf("Expected no error, got %v", err)
	}

	if imgData == nil {
		t.Error("Expected image data, got nil")
	}

	expected := api.ImageData(content)
	if string(imgData) != string(expected) {
		t.Errorf("Expected image data %v, got %v", expected, imgData)
	}
}

func TestReadImage_FileNotFound(t *testing.T) {
	imgData, err := readImage("nonexistentfile.jpg")
	if err == nil {
		t.Error("Expected error for non-existent file, got nil")
	}
	if imgData != nil {
		t.Error("Expected nil image data for non-existent file")
	}
}

func TestOptionsMapCreation(t *testing.T) {
	fOpt := createTestFlagOptions()

	options := make(map[string]interface{})
	if *fOpt.maxTokens > 0 {
		options["num_predict"] = *fOpt.maxTokens
	}
	options["temperature"] = *fOpt.temperature
	if fOpt.seed != nil && *fOpt.seed > 0 {
		options["seed"] = *fOpt.seed
	}

	if options["num_predict"] != *fOpt.maxTokens {
		t.Errorf("Expected num_predict %d, got %v", *fOpt.maxTokens, options["num_predict"])
	}
	if options["temperature"] != *fOpt.temperature {
		t.Errorf("Expected temperature %f, got %v", *fOpt.temperature, options["temperature"])
	}
	if options["seed"] != *fOpt.seed {
		t.Errorf("Expected seed %d, got %v", *fOpt.seed, options["seed"])
	}
}
@@ -206,6 +206,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
		conv = &commandrModel{}
	case "GptOssForCausalLM":
		conv = &gptossModel{}
+	case "DeepseekOCRForCausalLM":
+		conv = &deepseekocr{}
	default:
		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
	}
@@ -0,0 +1,136 @@
package convert

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

type deepseekocr struct {
	ModelParameters
	LanguageConfig struct {
		MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
		HiddenSize            uint32 `json:"hidden_size"`
		HiddenLayers          uint32 `json:"num_hidden_layers"`
		IntermediateSize      uint32 `json:"intermediate_size"`
		NumAttentionHeads     uint32 `json:"num_attention_heads"`
		NumKeyValueHeads      uint32 `json:"num_key_value_heads"`
		NumRoutedExperts      uint32 `json:"n_routed_experts"`
		NumSharedExperts      uint32 `json:"n_shared_experts"`
		NumExpertsPerToken    uint32 `json:"num_experts_per_tok"`
		FirstKDenseReplace    uint32 `json:"first_k_dense_replace"`
	} `json:"language_config"`

	VisionConfig struct {
		ImageSize uint32 `json:"image_size"`
		Width     struct {
			Vision struct {
				Heads     uint32 `json:"heads"`
				ImageSize uint32 `json:"image_size"`
				Layers    uint32 `json:"layers"`
				PatchSize uint32 `json:"patch_size"`
				Width     uint32 `json:"width"`
			} `json:"clip-l-14-224"`
			Sam struct {
				GlobalAttentionIndexes []int32 `json:"global_attn_indexes"`
				Heads                  uint32  `json:"heads"`
				Layers                 uint32  `json:"layers"`
				Width                  uint32  `json:"width"`
			} `json:"sam_vit_b"`
		}
	} `json:"vision_config"`
}

func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "deepseekocr"
	kv["block_count"] = m.LanguageConfig.HiddenLayers
	kv["context_length"] = m.LanguageConfig.MaxPositionEmbeddings
	kv["embedding_length"] = m.LanguageConfig.HiddenSize
	kv["feed_forward_length"] = m.LanguageConfig.IntermediateSize
	kv["attention.head_count"] = m.LanguageConfig.NumAttentionHeads
	kv["attention.head_count_kv"] = m.LanguageConfig.NumKeyValueHeads
	kv["expert_count"] = m.LanguageConfig.NumRoutedExperts
	kv["expert_used_count"] = m.LanguageConfig.NumExpertsPerToken
	kv["leading_dense_block_count"] = m.LanguageConfig.FirstKDenseReplace

	kv["vision.block_count"] = m.VisionConfig.Width.Vision.Layers
	kv["vision.embedding_length"] = m.VisionConfig.Width.Vision.Width
	kv["vision.head_count"] = m.VisionConfig.Width.Vision.Heads
	kv["vision.image_size"] = m.VisionConfig.Width.Vision.ImageSize
	kv["vision.patch_size"] = m.VisionConfig.Width.Vision.PatchSize

	kv["sam.block_count"] = m.VisionConfig.Width.Sam.Layers
	kv["sam.embedding_length"] = m.VisionConfig.Width.Sam.Width
	kv["sam.head_count"] = m.VisionConfig.Width.Sam.Heads
	kv["sam.global_attention_indexes"] = m.VisionConfig.Width.Sam.GlobalAttentionIndexes
	return kv
}

func (m *deepseekocr) Tensors(s []Tensor) (out []*ggml.Tensor) {
	merges := make([]merge, m.LanguageConfig.HiddenLayers*3)
	for i := range m.LanguageConfig.HiddenLayers {
		merges[i*3+0] = merge{
			fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
		}
		merges[i*3+1] = merge{
			fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
		}
		merges[i*3+2] = merge{
			fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
		}
	}

	out, s = mergeTensors(s, merges...)
	for _, t := range s {
		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}
	return out
}

func (m *deepseekocr) Replacements() []string {
	return []string{
		"model.embed_tokens", "token_embd",
		"model.layers", "blk",
		"input_layernorm", "attn_norm",
		"self_attn.q_proj", "attn_q",
		"self_attn.k_proj", "attn_k",
		"self_attn.v_proj", "attn_v",
		"self_attn.o_proj", "attn_output",
		"post_attention_layernorm", "ffn_norm",
		"mlp.gate_proj", "ffn_gate",
		"mlp.up_proj", "ffn_up",
		"mlp.down_proj", "ffn_down",
		"mlp.gate", "ffn_gate_inp",
		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
		"mlp.shared_experts.up_proj", "ffn_up_shexp",
		"mlp.shared_experts.down_proj", "ffn_down_shexp",
		"model.norm", "output_norm",
		"lm_head", "output",

		"model.vision_model", "v",
		"embeddings.patch_embedding", "patch_embd",
		"embeddings.class_embedding", "class_embd",
		"embeddings.position_embedding", "position_embd",
		"transformer.layers", "blk",

		"model.projector", "mm",
		"model.image_newline", "mm.image_newline",
		//nolint:misspell // this misspelling is upstream. fixing it breaks the model
		"model.view_seperator", "mm.view_seperator",

		"model.sam_model.patch_embed.proj", "s.patch_embd",
		"model.sam_model.pos_embed", "s.position_embd",
		"model.sam_model.blocks", "s.blk",
		"model.sam_model.neck", "s.neck",
		"model.sam_model.net_", "s.net_",
	}
}
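The `Replacements` list is laid out as old/new pairs, the shape `strings.NewReplacer` takes directly. A minimal sketch of applying such pairs to a tensor name (illustrative only; the converter's actual renaming path is outside this diff):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Two pairs from the list above; NewReplacer applies them in one pass.
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
	)
	fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.weight"))
	// Output: blk.0.attn_q.weight
}
```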
@@ -110,9 +110,12 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {

	for name, mxfp4 := range mxfp4s {
		dims := mxfp4.blocks.Shape()
		if !strings.HasSuffix(name, ".weight") {
			name = name + ".weight"
		}
		if strings.Contains(name, "ffn_down_exps") {
			out = append(out, &ggml.Tensor{
				Name:     name + ".weight",
				Name:     name,
				Kind:     uint32(ggml.TensorTypeMXFP4),
				Shape:    []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
				WriterTo: mxfp4,

@@ -121,12 +124,12 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
			// gate_up_exps is interleaved, need to split into gate_exps and up_exps
			// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
			out = append(out, &ggml.Tensor{
				Name:     strings.Replace(name, "gate_up", "gate", 1) + ".weight",
				Name:     strings.Replace(name, "gate_up", "gate", 1),
				Kind:     uint32(ggml.TensorTypeMXFP4),
				Shape:    []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
				WriterTo: mxfp4.slice(1, 0, int(dims[1]), 2),
			}, &ggml.Tensor{
				Name:     strings.Replace(name, "gate_up", "up", 1) + ".weight",
				Name:     strings.Replace(name, "gate_up", "up", 1),
				Kind:     uint32(ggml.TensorTypeMXFP4),
				Shape:    []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
				WriterTo: mxfp4.slice(1, 1, int(dims[1]), 2),
@@ -44,7 +44,10 @@ func (t tensorBase) Kind() uint32 {
		t.name == "v.positional_embedding_vlm" ||
		t.name == "v.tile_position_embd.weight" ||
		t.name == "v.pre_tile_position_embd.weight" ||
		t.name == "v.post_tile_position_embd.weight" {
		t.name == "v.post_tile_position_embd.weight" ||
		t.name == "s.position_embd" ||
		strings.HasSuffix(t.name, "rel_pos_h") ||
		strings.HasSuffix(t.name, "rel_pos_w") {
		// these tensors are always F32
		return tensorKindFP32
	}
@@ -96,7 +96,10 @@ type safetensor struct {

func (st safetensor) Kind() uint32 {
	kind := st.tensorBase.Kind()
	if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
	if st.dtype == "BF16" &&
		!strings.HasPrefix(st.name, "v.") &&
		!strings.HasPrefix(st.name, "s.") &&
		kind != tensorKindFP32 {
		kind = tensorKindBF16
	}

@@ -2,10 +2,12 @@ package convert

import (
	"cmp"
	"errors"
	"io"
	"iter"
	"path"
	"slices"
	"strconv"
	"strings"

	"github.com/pdevine/tensor"

@@ -94,6 +96,26 @@ func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []
		return matched
	})

	slices.SortStableFunc(matched, func(a, b Tensor) int {
		x := strings.Split(a.Name(), ".")
		y := strings.Split(b.Name(), ".")
		if len(x) != len(y) {
			return cmp.Compare(len(x), len(y))
		}

		vals := make([]int, len(x))
		for i := range x {
			vals[i] = strings.Compare(x[i], y[i])
			m, err := strconv.ParseInt(x[i], 0, 0)
			n, err2 := strconv.ParseInt(y[i], 0, 0)
			if errors.Join(err, err2) == nil {
				vals[i] = cmp.Compare(m, n)
			}
		}

		return cmp.Or(vals...)
	})

	if len(matched) > 0 {
		out = append(out, &ggml.Tensor{
			Name: merges[i].name,
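The comparator above falls back to numeric comparison whenever both name components parse as integers, so `blk.2` sorts before `blk.10` instead of after it. A standalone sketch of the same idea:

```go
package main

import (
	"cmp"
	"fmt"
	"slices"
	"strconv"
	"strings"
)

// compareNames orders dotted names component-wise, numerically where possible.
func compareNames(a, b string) int {
	x, y := strings.Split(a, "."), strings.Split(b, ".")
	if len(x) != len(y) {
		return cmp.Compare(len(x), len(y))
	}
	for i := range x {
		c := strings.Compare(x[i], y[i])
		if m, err := strconv.Atoi(x[i]); err == nil {
			if n, err := strconv.Atoi(y[i]); err == nil {
				c = cmp.Compare(m, n)
			}
		}
		if c != 0 {
			return c
		}
	}
	return 0
}

func main() {
	names := []string{"blk.10.weight", "blk.2.weight", "blk.1.weight"}
	slices.SortFunc(names, compareNames)
	fmt.Println(names) // [blk.1.weight blk.2.weight blk.10.weight]
}
```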
@@ -3,8 +3,10 @@ package convert

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"iter"
	"math/rand/v2"
	"slices"
	"strings"
	"testing"

@@ -951,3 +953,45 @@ func TestMerge(t *testing.T) {
		}
	})
}

func TestMergeOrder(t *testing.T) {
	for range 8 {
		t.Run("", func(t *testing.T) {
			tensors := make([]Tensor, 16)
			for i := range tensors {
				tensors[i] = &fakeTensor{
					name:  fmt.Sprintf("layer.%d.weight", i),
					shape: []uint64{1},
					data:  []float32{float32(i)},
				}
			}

			rand.Shuffle(len(tensors), func(i, j int) {
				tensors[i], tensors[j] = tensors[j], tensors[i]
			})

			matched, unmatched := mergeTensors(tensors, merge{"layer.*.weight", "layer.weight"})
			if len(unmatched) != 0 {
				t.Error("expected no remaining tensors, got", len(unmatched))
			}

			if len(matched) != 1 {
				t.Error("expected 1 merged tensor, got", len(matched))
			}

			var b bytes.Buffer
			if _, err := matched[0].WriteTo(&b); err != nil {
				t.Fatal(err)
			}

			var f32s [16]float32
			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
				t.Fatal(err)
			}

			if !slices.IsSorted(f32s[:]) {
				t.Errorf("merged tensor data is not in order: %+v", f32s)
			}
		})
	}
}
@@ -2,6 +2,7 @@ package discover

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"log/slog"

@@ -10,12 +11,21 @@ import (
	"reflect"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/ollama/ollama/format"
)

func GetCPUMem() (memInfo, error) {
	mem, err := getCPUMem()
	if err != nil {
		return memInfo{}, err
	}
	return getCPUMemByCgroups(mem), nil
}

func getCPUMem() (memInfo, error) {
	var mem memInfo
	var total, available, free, buffers, cached, freeSwap uint64
	f, err := os.Open("/proc/meminfo")

@@ -56,6 +66,32 @@ func GetCPUMem() (memInfo, error) {
	return mem, nil
}

func getCPUMemByCgroups(mem memInfo) memInfo {
	total, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.max")
	if err == nil {
		mem.TotalMemory = total
	}
	used, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.current")
	if err == nil {
		mem.FreeMemory = mem.TotalMemory - used
	}
	return mem
}

func getUint64ValueFromFile(path string) (uint64, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()
	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()
		return strconv.ParseUint(line, 10, 64)
	}
	return 0, errors.New("empty file content")
}

const CpuInfoFilename = "/proc/cpuinfo"

type linuxCpuInfo struct {

@@ -74,7 +110,41 @@ func GetCPUDetails() []CPU {
		return nil
	}
	defer file.Close()
	return linuxCPUDetails(file)
	cpus := linuxCPUDetails(file)
	return overwriteThreadCountByLinuxCgroups(cpus)
}

func overwriteThreadCountByLinuxCgroups(cpus []CPU) []CPU {
	file, err := os.Open("/sys/fs/cgroup/cpu.max")
	if err != nil {
		return cpus
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		if sl := strings.Split(line, " "); len(sl) == 2 {
			allowdUs, err := strconv.ParseInt(sl[0], 10, 64)
			if err != nil {
				slog.Warn("failed to parse CPU allowed micro secs", "error", err)
				return cpus
			}
			unitUs, err := strconv.ParseInt(sl[1], 10, 64)
			if err != nil {
				slog.Warn("failed to parse CPU unit micro secs", "error", err)
				return cpus
			}

			threads := int(max(allowdUs/unitUs, 1))

			cpu := cpus[0]
			cpu.CoreCount = threads
			cpu.ThreadCount = threads
			return []CPU{cpu}
		}
	}
	return cpus
}

func linuxCPUDetails(file io.Reader) []CPU {
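`/sys/fs/cgroup/cpu.max` holds a quota and a period in microseconds, so a container capped at two CPUs typically reads `200000 100000`. A minimal standalone sketch of the same computation on a sample value (illustrative, not the discovery code itself):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	line := "200000 100000" // sample cgroup v2 cpu.max: quota then period, in microseconds
	parts := strings.Split(line, " ")
	quota, _ := strconv.ParseInt(parts[0], 10, 64)
	period, _ := strconv.ParseInt(parts[1], 10, 64)
	fmt.Println("effective threads:", max(quota/period, 1)) // effective threads: 2
}
```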
@@ -65,6 +65,10 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
	}

	slog.Info("discovering available GPUs...")

	// Warn if any user-overrides are set which could lead to incorrect GPU discovery
	overrideWarnings()

	requested := envconfig.LLMLibrary()
	jetpack := cudaJetpack()

@@ -90,7 +94,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
	var dirs []string
	if dir != "" {
		if requested != "" && filepath.Base(dir) != requested {
			slog.Debug("skipping available library at users request", "requested", requested, "libDir", dir)
			slog.Debug("skipping available library at user's request", "requested", requested, "libDir", dir)
			continue
		} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
			continue

@@ -113,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
	// In the second pass, we more deeply initialize the GPUs to weed out devices that
	// aren't supported by a given library. We run this phase in parallel to speed up discovery.
	// Only devices that need verification are included in this pass
	slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
	slog.Debug("evaluating which, if any, devices to filter out", "initial_count", len(devices))
	ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	var wg sync.WaitGroup

@@ -121,11 +125,21 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
	supportedMu := sync.Mutex{}
	supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
	for i := range devices {
		libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
		if !devices[i].NeedsInitValidation() {
			// No need to validate, add to the supported map
			supportedMu.Lock()
			if _, ok := supported[devices[i].Library]; !ok {
				supported[devices[i].Library] = make(map[string]map[string]int)
			}
			if _, ok := supported[devices[i].Library][libDir]; !ok {
				supported[devices[i].Library][libDir] = make(map[string]int)
			}
			supported[devices[i].Library][libDir][devices[i].ID] = i
			supportedMu.Unlock()
			continue
		}
		libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
		slog.Debug("verifying device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
		slog.Debug("verifying if device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
		wg.Add(1)
		go func(i int) {
			defer wg.Done()

@@ -449,3 +463,24 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map

	return devices
}

func overrideWarnings() {
	anyFound := false
	m := envconfig.AsMap()
	for _, k := range []string{
		"CUDA_VISIBLE_DEVICES",
		"HIP_VISIBLE_DEVICES",
		"ROCR_VISIBLE_DEVICES",
		"GGML_VK_VISIBLE_DEVICES",
		"GPU_DEVICE_ORDINAL",
		"HSA_OVERRIDE_GFX_VERSION",
	} {
		if e, found := m[k]; found && e.Value != "" {
			anyFound = true
			slog.Warn("user overrode visible devices", k, e.Value)
		}
	}
	if anyFound {
		slog.Warn("if GPUs are not correctly discovered, unset and try again")
	}
}
@@ -13,9 +13,23 @@ Embeddings turn text into numeric vectors you can store in a vector database, se

## Generate embeddings

Use `/api/embed` with a single string.

<Tabs>
  <Tab title="CLI">
    Generate embeddings directly from the command line:

    ```shell
    ollama run embeddinggemma "Hello world"
    ```

    You can also pipe text to generate embeddings:

    ```shell
    echo "Hello world" | ollama run embeddinggemma
    ```

    Output is a JSON array.

  </Tab>
  <Tab title="cURL">
    ```shell
    curl -X POST http://localhost:11434/api/embed \
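For completeness, a minimal `/api/embed` request sketch against the default local endpoint (the model name is just an example; the response carries an `embeddings` array):

```shell
curl -X POST http://localhost:11434/api/embed -d '{
  "model": "embeddinggemma",
  "input": "Hello world"
}'
```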
@@ -9,15 +9,9 @@ sidebarTitle: Cloud

Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer.

Ollama currently supports the following cloud models, with more coming soon:
### Supported models

- `deepseek-v3.1:671b-cloud`
- `gpt-oss:20b-cloud`
- `gpt-oss:120b-cloud`
- `kimi-k2:1t-cloud`
- `qwen3-coder:480b-cloud`
- `glm-4.6:cloud`
- `minimax-m2:cloud`
For a list of supported models, see Ollama's [model library](https://ollama.com/search?c=cloud).

### Running Cloud models
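A cloud model runs with the same CLI command as a local one, for example (the model tag here is illustrative, taken from the library linked above):

```shell
ollama run gpt-oss:120b-cloud
```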
@@ -223,7 +223,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

## How can I use Ollama in Visual Studio Code?

There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
There is already a large collection of plugins available for VS Code as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.

## How do I use Ollama with GPU acceleration in Docker?

@@ -1,34 +1,34 @@
---
title: VS Code
title: VS Code
---

## Install

Install [VSCode](https://code.visualstudio.com/download).
Install [VS Code](https://code.visualstudio.com/download).

## Usage with Ollama
## Usage with Ollama

1. Open Copilot side bar found in top right window
   <div style={{ display: 'flex', justifyContent: 'center' }}>
     <img
       src="/images/vscode-sidebar.png"
       alt="VSCode chat Sidebar"
       width="75%"
     />
   </div>
2. Select the model drowpdown > **Manage models**
   <div style={{ display: 'flex', justifyContent: 'center' }}>
     <img
       src="/images/vscode-models.png"
       alt="VSCode model picker"
       width="75%"
     />
   </div>
   <div style={{ display: "flex", justifyContent: "center" }}>
     <img
       src="/images/vscode-sidebar.png"
       alt="VS Code chat Sidebar"
       width="75%"
     />
   </div>
2. Select the model dropdown > **Manage models**
   <div style={{ display: "flex", justifyContent: "center" }}>
     <img
       src="/images/vscode-models.png"
       alt="VS Code model picker"
       width="75%"
     />
   </div>
3. Enter **Ollama** under **Provider Dropdown** and select desired models (e.g `qwen3, qwen3-coder:480b-cloud`)
   <div style={{ display: 'flex', justifyContent: 'center' }}>
     <img
       src="/images/vscode-model-options.png"
       alt="VSCode model options dropdown"
       width="75%"
     />
   </div>
   <div style={{ display: "flex", justifyContent: "center" }}>
     <img
       src="/images/vscode-model-options.png"
       alt="VS Code model options dropdown"
       width="75%"
     />
   </div>
@@ -149,9 +149,6 @@ PARAMETER <parameter> <parametervalue>

| Parameter      | Description                                                                                                                                                                         | Value Type | Example Usage      |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ---------- | ------------------ |
| mirostat       | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)                                                                     | int        | mirostat 0         |
| mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
| mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                     | float      | mirostat_tau 5.0   |
| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                               | int        | num_ctx 4096       |
| repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                      | int        | repeat_last_n 64   |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float      | repeat_penalty 1.1 |
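For instance, a Modelfile that sets two of the parameters above (the base model name is illustrative):

```
FROM llama3.2
PARAMETER num_ctx 4096
PARAMETER repeat_penalty 1.1
```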
@@ -111,6 +111,12 @@ components:
          description: Model keep-alive duration (for example `5m` or `0` to unload immediately)
        options:
          $ref: "#/components/schemas/ModelOptions"
        logprobs:
          type: boolean
          description: Whether to return log probabilities of the output tokens
        top_logprobs:
          type: integer
          description: Number of most likely tokens to return at each token position when logprobs are enabled
    GenerateResponse:
      type: object
      properties:

@@ -150,6 +156,11 @@ components:
        eval_duration:
          type: integer
          description: Time spent generating tokens in nanoseconds
        logprobs:
          type: array
          items:
            $ref: "#/components/schemas/Logprob"
          description: Log probability information for the generated tokens when logprobs are enabled
    GenerateStreamEvent:
      type: object
      properties:

@@ -287,6 +298,12 @@ components:
            - type: string
            - type: number
          description: Model keep-alive duration (for example `5m` or `0` to unload immediately)
        logprobs:
          type: boolean
          description: Whether to return log probabilities of the output tokens
        top_logprobs:
          type: integer
          description: Number of most likely tokens to return at each token position when logprobs are enabled
    ChatResponse:
      type: object
      properties:

@@ -344,6 +361,11 @@ components:
        eval_duration:
          type: integer
          description: Time spent generating tokens in nanoseconds
        logprobs:
          type: array
          items:
            $ref: "#/components/schemas/Logprob"
          description: Log probability information for the generated tokens when logprobs are enabled
    ChatStreamEvent:
      type: object
      properties:

@@ -706,6 +728,41 @@ components:
        version:
          type: string
          description: Version of Ollama
    TokenLogprob:
      type: object
      description: Log probability information for a single token alternative
      properties:
        token:
          type: string
          description: The text representation of the token
        logprob:
          type: number
          description: The log probability of this token
        bytes:
          type: array
          items:
            type: integer
          description: The raw byte representation of the token
    Logprob:
      type: object
      description: Log probability information for a generated token
      properties:
        token:
          type: string
          description: The text representation of the token
        logprob:
          type: number
          description: The log probability of this token
        bytes:
          type: array
          items:
            type: integer
          description: The raw byte representation of the token
        top_logprobs:
          type: array
          items:
            $ref: "#/components/schemas/TokenLogprob"
          description: Most likely tokens and their log probabilities at this position
    ErrorResponse:
      type: object
      properties:
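As a sketch of how the new fields compose, a non-streaming generate request asking for the top three alternatives per output token might look like this (model name illustrative):

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "stream": false,
  "logprobs": true,
  "top_logprobs": 3
}'
```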
@@ -249,6 +249,9 @@ func (kv KV) OllamaEngineRequired() bool {
		"qwen25vl",
		"qwen3", "qwen3moe",
		"qwen3vl", "qwen3vlmoe",
		"deepseekocr",
		"deepseek2",
		"nomic-bert",
	}, kv.Architecture())
}

@@ -305,7 +305,7 @@ func readGGUFV1StringsData(llm *gguf, r io.Reader, a *array[string]) (any, error

			a.values[i] = e
		} else {
			discardGGUFString(llm, r)
			_ = discardGGUFString(llm, r)
		}
	}

@@ -568,7 +568,6 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
	g.SetLimit(runtime.GOMAXPROCS(0))
	// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
	for _, t := range ts {
		t := t
		w := io.NewOffsetWriter(f, offset+int64(t.Offset))
		g.Go(func() error {
			_, err := t.WriteTo(w)

go.mod (1 change)
@@ -17,7 +17,6 @@ require (
	github.com/x448/float16 v0.8.4
	golang.org/x/sync v0.12.0
	golang.org/x/sys v0.36.0

)

require (
@@ -388,9 +388,9 @@ func NewFunctionNameMap() *FunctionNameMap {
	}
}

// Init initializes the handler with tools and optional last message
// Init initializes the handler with tools, optional last message, and think value
// Implements the Parser interface
func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
	// Initialize the harmony parser
	if h.HarmonyParser == nil {
		h.HarmonyParser = &HarmonyParser{
@@ -14,6 +14,23 @@ import (
	"github.com/ollama/ollama/api"
)

func assertBytesMatchToken(t *testing.T, label, token string, ints []int) {
	t.Helper()

	raw := []byte(token)
	if len(ints) != len(raw) {
		t.Errorf("%s expected %d bytes for token %q, got %d (%v)", label, len(raw), token, len(ints), ints)
		return
	}

	for i, b := range raw {
		if ints[i] != int(b) {
			t.Errorf("%s byte[%d] mismatch for token %q: got %d want %d", label, i, token, ints[i], int(b))
			return
		}
	}
}

func TestAPIGenerate(t *testing.T) {
	initialTimeout := 60 * time.Second
	streamTimeout := 30 * time.Second

@@ -466,6 +483,7 @@ func TestAPIGenerateLogprobs(t *testing.T) {
			if lp.Logprob > 0 {
				t.Errorf("logprob[%d] has positive logprob %f (should be <= 0)", i, lp.Logprob)
			}
			assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d]", i), lp.Token, lp.Bytes)

			// Check top_logprobs if requested
			if test.topLogprobs > 0 {

@@ -482,6 +500,9 @@ func TestAPIGenerateLogprobs(t *testing.T) {
					t.Errorf("logprob[%d].top_logprobs not sorted: %f < %f", i, lp.TopLogprobs[j-1].Logprob, lp.TopLogprobs[j].Logprob)
				}
			}
			for j, top := range lp.TopLogprobs {
				assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
			}
		} else if len(lp.TopLogprobs) > 0 {
			t.Errorf("logprob[%d] has top_logprobs but none were requested", i)
		}

@@ -544,11 +565,15 @@ func TestAPIChatLogprobs(t *testing.T) {
			if lp.Logprob > 0 {
				t.Errorf("logprob[%d] has positive logprob %f", i, lp.Logprob)
			}
			assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d]", i), lp.Token, lp.Bytes)
			if len(lp.TopLogprobs) == 0 {
				t.Errorf("logprob[%d] expected top_logprobs but got none", i)
			}
			if len(lp.TopLogprobs) > 3 {
				t.Errorf("logprob[%d] has %d top_logprobs, expected max 3", i, len(lp.TopLogprobs))
			}
			for j, top := range lp.TopLogprobs {
				assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
			}
		}
	}
}
@@ -3,7 +3,6 @@ package kvcache

import (
	"errors"
	"fmt"
	"log/slog"
	"math"
	"slices"

@@ -40,18 +39,18 @@ type Causal struct {

	// ** current forward pass **

	// the active layer for Get and Put
	curLayer int

	// starting location for data storage for this batch
	curLoc int

	// size of the current batch
	curBatchSize int

	// locations for data storage for this batch
	curLoc ml.Tensor

	// mask of the cache as used by this batch
	curMask ml.Tensor

	// the active layer for Get and Put
	curLayer int

	// locations in the cache that are needed for this batch
	curCellRange cellRange

@@ -206,45 +205,47 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
	c.curPositions = batch.Positions
	c.opts.Except = nil

	var locs []int32
	if !reserve {
		c.updateSlidingWindow()

		var err error
		c.curLoc, err = c.findStartLoc()
		if errors.Is(err, ErrKvCacheFull) {
			c.defrag()
			c.curLoc, err = c.findStartLoc()
		}
		locs, err = c.findLocs()
		if err != nil {
			return err
		}

		for i, pos := range batch.Positions {
			seq := batch.Sequences[i]
			loc := int(locs[i])

			c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
			c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}

			seqRange, ok := c.cellRanges[seq]
			if !ok {
				seqRange = newRange()
			}

			seqRange.min = min(seqRange.min, c.curLoc+i)
			c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
			seqRange.min = min(seqRange.min, loc)
			c.curCellRange.min = min(c.curCellRange.min, loc)

			seqRange.max = max(seqRange.max, c.curLoc+i)
			c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
			seqRange.max = max(seqRange.max, loc)
			c.curCellRange.max = max(c.curCellRange.max, loc)

			c.cellRanges[seq] = seqRange
		}
	} else {
		// If we are reserving memory, don't update any of the cache metadata but set the size
		// to the worst case.
		c.curLoc = 0
		locs = make([]int32, c.curBatchSize)
		for i := range locs {
			locs[i] = int32(i)
		}
		c.curCellRange.min = 0
		c.curCellRange.max = len(c.cells) - 1
	}

	c.curLoc = ctx.Input().FromInts(locs, len(locs))
	c.curMask = c.buildMask(ctx)

	return nil

@@ -257,22 +258,20 @@ func newRange() cellRange {
	}
}

// Find the first contiguous block of at least curBatchSize
func (c *Causal) findStartLoc() (int, error) {
	var start, count int
// Returns a slice of locations where each token in the batch should be stored
func (c *Causal) findLocs() ([]int32, error) {
	loc := make([]int32, 0, c.curBatchSize)

	for i := range c.cells {
		if len(c.cells[i].sequences) == 0 {
			count++
			if count >= c.curBatchSize {
				return start, nil
			loc = append(loc, int32(i))
			if len(loc) >= c.curBatchSize {
				return loc, nil
			}
		} else {
			start = i + 1
			count = 0
		}
	}

	return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
	return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
}

func (c *Causal) updateSlidingWindow() {

@@ -402,145 +401,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
	return maskTensor
}

func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
	for i, key := range c.keys {
		if key == nil {
			continue
		}

		kHeadDim := key.Dim(0)
		numKVHeads := key.Dim(1)
		rowSize := key.Stride(2)

		kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
		kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)

		value := c.values[i]
		var vSrcView, vDstView ml.Tensor
		if c.config.PermutedV {
			vHeadDim := value.Dim(1)
			elemSize := value.Stride(0)

			vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
			vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
		} else {
			vHeadDim := value.Dim(0)
			rowSize := value.Stride(2)

			vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
			vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
		}

		ctx.Forward(
			kSrcView.Copy(ctx, kDstView),
			vSrcView.Copy(ctx, vDstView),
		)
	}
}

func (c *Causal) defrag() {
	slog.Debug("defragmenting kv cache")

	// Defrag strategy:
	// - Search for empty holes at the beginning of the cache,
	//   filling them with active data starting at the end
	// - If there are contiguous elements that need to be moved,
	//   combine them into a single operation by holding new moves
	//   until we see that the next one is non-contiguous
	// - Fill up the context with the maximum number of operations it
	//   can hold then compute that and continue with a new context
	//
	// We could try to optimize placement by grouping blocks from
	// the same sequences together but most likely the next forward
	// pass will disrupt this anyways, so the real world benefit
	// seems limited as this time.

	ctx := c.backend.NewContext()

	// For every move, 6 tensors are required per layer (2 views and a
	// copy for each of k and v). We also need to refer to the original
	// k and v cache tensors - once per layer, not per move.
	layers := 0
	for _, key := range c.keys {
		if key == nil {
			continue
		}
		layers++
	}

	maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
	moves := 0

	var pendingSrc, pendingDst, pendingLen int
	src := len(c.cells) - 1

	for dst := 0; dst < src; dst++ {
		if len(c.cells[dst].sequences) == 0 {
			for ; src > dst; src-- {
				if len(c.cells[src].sequences) != 0 {
					c.cells[dst] = c.cells[src]
					c.cells[src] = cacheCell{}

					if pendingLen > 0 {
						if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
							pendingSrc = src
							pendingLen++
							break
						} else {
							c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
							moves++
						}
					}

					pendingSrc = src
					pendingDst = dst
					pendingLen = 1

					break
				}
			}
		}

		if moves >= maxMoves {
			ctx.Compute()
			ctx.Close()
			ctx = c.backend.NewContext()

			moves = 0
		}
	}

	if pendingLen > 0 {
		c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
		moves++
	}

	if moves > 0 {
		ctx.Compute()
	}
	ctx.Close()

	// Reset range metadata
	for seq := range c.cellRanges {
		seqRange := newRange()

		for i, cell := range c.cells {
			if slices.Contains(cell.sequences, seq) {
				if i < seqRange.min {
					seqRange.min = i
				}
				if i > seqRange.max {
					seqRange.max = i
				}
			}
		}

		c.cellRanges[seq] = seqRange
	}

	c.updateSlidingWindow()
}

func (c *Causal) SetLayer(layer int) {
	c.curLayer = layer
}

@@ -625,18 +485,25 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
		}
	}

	rowSize := c.keys[c.curLayer].Stride(2)
	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, rowSize*c.curLoc, kHeadDim*numKVHeads*batchSize)))
	key = key.Reshape(ctx, kHeadDim*numKVHeads, batchSize)
	keyCache := c.keys[c.curLayer]
	keyCache = keyCache.Reshape(ctx, kHeadDim*numKVHeads, len(c.cells))
	ctx.Forward(keyCache.SetRows(ctx, key, c.curLoc))

	if c.config.PermutedV {
		elemSize := c.values[c.curLayer].Stride(0)
		value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
		value = value.Permute(ctx, 2, 0, 1, 3)

		value = value.Permute(ctx, 1, 2, 0, 3)
		ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
		valueCache := c.values[c.curLayer]
		valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)

		ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
	} else {
		rowSize := c.values[c.curLayer].Stride(2)
		value = value.Reshape(ctx, vHeadDim*numKVHeads, batchSize)
		valueCache := c.values[c.curLayer]
		valueCache = valueCache.Reshape(ctx, vHeadDim*numKVHeads, len(c.cells))

		ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, rowSize*c.curLoc, vHeadDim*numKVHeads*batchSize)))
		ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
	}
}
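The switch from contiguous `View`+`Copy` writes to `SetRows` means a batch no longer needs one contiguous free block (which is what made defragmentation necessary); each token row is scattered to an arbitrary free cell. A plain-slice analogy of the scatter semantics (not the real tensor op):

```go
// setRows copies row i of src into dst at index locs[i], mirroring how the
// SetRows op above scatters batch entries into arbitrary cache cells.
func setRows(dst, src [][]float32, locs []int32) {
	for i, loc := range locs {
		copy(dst[loc], src[i])
	}
}
```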
@ -80,10 +80,10 @@ func TestIssue7978(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestSchemaToGrammer(t *testing.T) {
|
||||
func TestSchemaToGrammar(t *testing.T) {
|
||||
cases := []struct {
|
||||
schema string
|
||||
prefix []byte // nil is check as nil
|
||||
prefix []byte // nil is checked as nil
|
||||
}{
|
||||
{`invalid`, nil},
|
||||
|
||||
|
|
@ -92,7 +92,7 @@ func TestSchemaToGrammer(t *testing.T) {
|
|||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("x", func(t *testing.T) {
|
||||
t.Run(c.schema, func(t *testing.T) {
|
||||
g := SchemaToGrammar([]byte(c.schema))
|
||||
if c.prefix == nil && g != nil {
|
||||
t.Fatalf("grammar = %v, want nil", g)
|
||||
|
|
|
|||
|
|
@ -20,10 +20,10 @@ fix vulkan PCI ID and ID handling
|
|||
ggml/src/ggml-cuda/vendors/hip.h | 3 +
|
||||
ggml/src/ggml-impl.h | 8 +
|
||||
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 209 +++++++++++--
|
||||
ggml/src/mem_hip.cpp | 452 +++++++++++++++++++++++++++
|
||||
ggml/src/mem_nvml.cpp | 209 +++++++++++++
|
||||
9 files changed, 926 insertions(+), 30 deletions(-)
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 209 +++++++++--
|
||||
ggml/src/mem_hip.cpp | 529 +++++++++++++++++++++++++++
|
||||
ggml/src/mem_nvml.cpp | 209 +++++++++++
|
||||
9 files changed, 1003 insertions(+), 30 deletions(-)
|
||||
create mode 100644 ggml/src/mem_hip.cpp
|
||||
create mode 100644 ggml/src/mem_nvml.cpp
|
||||
|
||||
|
|
@@ -58,7 +58,7 @@ index f9a6587f1..03f359ae9 100644

 target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c9333689f..41b00af83 100644
index c9333689f..f1a20e7fe 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {

@@ -111,7 +111,7 @@ index c9333689f..41b00af83 100644
+    if (ggml_hip_mgmt_init() == 0) {
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing AMD specific memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
+            ggml_hip_mgmt_release();
+            return;
+        }
@@ -243,7 +243,7 @@ index 05ff6a5a6..032dee76d 100644
 /* .async                = */ true,
 /* .host_buffer          = */ false,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3a6bbe564..d2c278a35 100644
index 3a6bbe564..ca02ea079 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -229,6 +229,7 @@ class vk_memory_logger;

@@ -337,7 +337,7 @@ index 3a6bbe564..d2c278a35 100644
+    if (ggml_hip_mgmt_init() == 0) {
+        int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing AMD specific memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
+            ggml_hip_mgmt_release();
+            return;
+        }
@@ -548,11 +548,12 @@ index 3a6bbe564..d2c278a35 100644
 }
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..5a7f5d465
index 000000000..c1949b899
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,452 @@
@@ -0,0 +1,529 @@
+#include "ggml.h"
+#include "ggml-impl.h"
+
+#ifdef _WIN32
+// AMD Device Library eXtra (ADLX)

@@ -570,7 +571,6 @@ index 000000000..5a7f5d465
+// Unused function parameters are commented out to avoid unnecessary type
+// definitions.
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
@@ -990,15 +990,92 @@ index 000000000..5a7f5d465
+
+#else  // #ifdef _WIN32
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <filesystem>
+
+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <glob.h>
+namespace fs = std::filesystem;
+
+extern "C" {
+
+// TODO Linux implementation of accurate VRAM reporting
+int ggml_hip_mgmt_init() {
+    return -1;
+    return 0;
+}
+void ggml_hip_mgmt_release() {}
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
+    return -1;
+    GGML_LOG_INFO("%s searching for device %s\n", __func__, id);
+    const std::string drmDeviceGlob = "/sys/class/drm/card*/device/uevent";
+    const std::string drmTotalMemoryFile = "mem_info_vram_total";
+    const std::string drmUsedMemoryFile = "mem_info_vram_used";
+    const std::string drmUeventPCISlotLabel = "PCI_SLOT_NAME=";
+
+    glob_t glob_result;
+    glob(drmDeviceGlob.c_str(), GLOB_NOSORT, NULL, &glob_result);
+
+    for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
+        const char* device_file = glob_result.gl_pathv[i];
+        std::ifstream file(device_file);
+        if (!file.is_open()) {
+            std::cerr << "Failed to open sysfs node" << std::endl;
+            globfree(&glob_result);
+            return 1;
+        }
+
+        std::string line;
+        while (std::getline(file, line)) {
+            // Check for PCI_SLOT_NAME label
+            if (line.find(drmUeventPCISlotLabel) == 0) {
+                std::istringstream iss(line.substr(drmUeventPCISlotLabel.size()));
+                std::string pciSlot;
+                iss >> pciSlot;
+                if (pciSlot == std::string(id)) {
+                    std::string dir = fs::path(device_file).parent_path().string();
+
+                    std::string totalFile = dir + "/" + drmTotalMemoryFile;
+                    std::ifstream totalFileStream(totalFile.c_str());
+                    if (!totalFileStream.is_open()) {
+                        GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, totalFile.c_str());
+                        file.close();
+                        globfree(&glob_result);
+                        return 1;
+                    }
+
+                    uint64_t memory;
+                    totalFileStream >> memory;
+                    *total = memory;
+
+                    std::string usedFile = dir + "/" + drmUsedMemoryFile;
+                    std::ifstream usedFileStream(usedFile.c_str());
+                    if (!usedFileStream.is_open()) {
+                        GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, usedFile.c_str());
+                        file.close();
+                        globfree(&glob_result);
+                        return 1;
+                    }
+
+                    uint64_t memoryUsed;
+                    usedFileStream >> memoryUsed;
+                    *free = memory - memoryUsed;
+
+                    file.close();
+                    globfree(&glob_result);
+                    return 0;
+                }
+            }
+        }
+
+        file.close();
+    }
+    GGML_LOG_DEBUG("%s unable to find matching device\n", __func__);
+    globfree(&glob_result);
+    return 1;
+}
+
+} // extern "C"
@@ -38,7 +38,7 @@ index 44ae76d66..639d551a2 100644
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d2c278a35..221e29509 100644
index ca02ea079..c12b069e5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();

@@ -11,7 +11,7 @@ vidmem optimization.
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 221e29509..18b7cbccf 100644
index c12b069e5..76c78c2ea 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr

@@ -50,7 +50,7 @@ Subject: [PATCH] Vulkan MMQ Integer Dot Refactor and K-Quant support (#16536)
 create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 18b7cbccf..53b57c179 100644
index 76c78c2ea..7669ed206 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -488,6 +488,7 @@ struct vk_device_struct {

@@ -58,7 +58,7 @@ index 639d551a2..e5c446d1d 100644
 GGML_API size_t gguf_type_size(enum gguf_type type);
 GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 53b57c179..b2855b078 100644
index 7669ed206..63a762ec2 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;

@@ -31,7 +31,7 @@ Add new backend tests.
 6 files changed, 371 insertions(+), 117 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b2855b078..aaf4334b5 100644
index 63a762ec2..db92a7901 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -458,6 +458,11 @@ static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {

@@ -9,7 +9,7 @@ Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index aaf4334b5..3604ceb04 100644
index db92a7901..e959674d1 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {

@@ -20,7 +20,7 @@ Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3604ceb04..80185d9f0 100644
index e959674d1..903050b0b 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
@@ -0,0 +1,25 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Tue, 18 Nov 2025 11:13:04 -0800
Subject: [PATCH] ggml-cuda: skip large batches

cuda panics on batches larger than 1024 so mark it as unsupported to
fallback to cpu
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f1a20e7fe..1a71e07c9 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
     if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
         return false;
     }
+    if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
+        return false;
+    }
 #ifdef GGML_USE_MUSA
     const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
     if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
@@ -0,0 +1,28 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Tue, 18 Nov 2025 09:58:23 -0800
Subject: [PATCH] win: exit instead of abort

---
 ggml/src/ggml.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be35c1be..923c33d05 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
         fprintf(stderr, "%s\n", message);
         ggml_print_backtrace();
     }
-
+#if defined(_WIN32)
+    fflush(stderr);
+    fflush(stdout);
+    exit(1);
+#else
     abort();
+#endif
 }

 // ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
@@ -1,16 +0,0 @@
{
  "env": {
    "browser": true,
    "es6": true,
    "node": true
  },
  "extends": [
    "eslint:recommended",
    "plugin:@typescript-eslint/eslint-recommended",
    "plugin:@typescript-eslint/recommended",
    "plugin:import/recommended",
    "plugin:import/electron",
    "plugin:import/typescript"
  ],
  "parser": "@typescript-eslint/parser"
}
@@ -1,92 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock
.DS_Store

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# next.js build output
.next

# nuxt.js build output
.nuxt

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# Webpack
.webpack/

# Vite
.vite/

# Electron-Forge
out/
@@ -1,21 +0,0 @@
# Desktop

This app builds upon Ollama to provide a desktop experience for running models.

## Developing

First, build the `ollama` binary:

```shell
cd ..
go build .
```

Then run the desktop app with `npm start`:

```shell
cd macapp
npm install
npm start
```
(Eight binary image assets deleted, 402-891 B each.)
@@ -1,79 +0,0 @@
import type { ForgeConfig } from '@electron-forge/shared-types'
import { MakerSquirrel } from '@electron-forge/maker-squirrel'
import { MakerZIP } from '@electron-forge/maker-zip'
import { PublisherGithub } from '@electron-forge/publisher-github'
import { AutoUnpackNativesPlugin } from '@electron-forge/plugin-auto-unpack-natives'
import { WebpackPlugin } from '@electron-forge/plugin-webpack'
import * as path from 'path'
import * as fs from 'fs'

import { mainConfig } from './webpack.main.config'
import { rendererConfig } from './webpack.renderer.config'

const packageJson = JSON.parse(fs.readFileSync(path.resolve(__dirname, './package.json'), 'utf8'))

const config: ForgeConfig = {
  packagerConfig: {
    appVersion: process.env.VERSION || packageJson.version,
    asar: true,
    icon: './assets/icon.icns',
    extraResource: [
      path.join(__dirname, '../dist/darwin/ollama'),
      ...fs.readdirSync(path.join(__dirname, '../dist/darwin-amd64/lib/ollama')).map(f => path.join(__dirname, '../dist/darwin-amd64/lib/ollama', f)),
      path.join(__dirname, './assets/iconTemplate.png'),
      path.join(__dirname, './assets/iconTemplate@2x.png'),
      path.join(__dirname, './assets/iconUpdateTemplate.png'),
      path.join(__dirname, './assets/iconUpdateTemplate@2x.png'),
      path.join(__dirname, './assets/iconDarkTemplate.png'),
      path.join(__dirname, './assets/iconDarkTemplate@2x.png'),
      path.join(__dirname, './assets/iconDarkUpdateTemplate.png'),
      path.join(__dirname, './assets/iconDarkUpdateTemplate@2x.png'),
    ],
    ...(process.env.SIGN
      ? {
          osxSign: {
            identity: process.env.APPLE_IDENTITY,
          },
          osxNotarize: {
            tool: 'notarytool',
            appleId: process.env.APPLE_ID || '',
            appleIdPassword: process.env.APPLE_PASSWORD || '',
            teamId: process.env.APPLE_TEAM_ID || '',
          },
        }
      : {}),
    osxUniversal: {
      x64ArchFiles: '*',
    },
  },
  rebuildConfig: {},
  makers: [new MakerSquirrel({}), new MakerZIP({}, ['darwin'])],
  hooks: {
    readPackageJson: async (_, packageJson) => {
      return { ...packageJson, version: process.env.VERSION || packageJson.version }
    },
  },
  plugins: [
    new AutoUnpackNativesPlugin({}),
    new WebpackPlugin({
      mainConfig,
      devContentSecurityPolicy: `default-src * 'unsafe-eval' 'unsafe-inline'; img-src data: 'self'`,
      renderer: {
        config: rendererConfig,
        nodeIntegration: true,
        entryPoints: [
          {
            html: './src/index.html',
            js: './src/renderer.tsx',
            name: 'main_window',
            preload: {
              js: './src/preload.ts',
            },
          },
        ],
      },
    }),
  ],
}

export default config
@@ -1,80 +0,0 @@
{
  "name": "ollama",
  "productName": "Ollama",
  "version": "0.0.0",
  "description": "ollama",
  "main": ".webpack/main",
  "scripts": {
    "start": "electron-forge start",
    "package": "electron-forge package --arch universal",
    "package:sign": "SIGN=1 electron-forge package --arch universal",
    "make": "electron-forge make --arch universal",
    "make:sign": "SIGN=1 electron-forge make --arch universal",
    "publish": "SIGN=1 electron-forge publish",
    "lint": "eslint --ext .ts,.tsx ."
  },
  "keywords": [],
  "author": {
    "name": "Jeffrey Morgan",
    "email": "jmorganca@gmail.com"
  },
  "license": "MIT",
  "devDependencies": {
    "@babel/core": "^7.22.5",
    "@babel/preset-react": "^7.22.5",
    "@electron-forge/cli": "^6.2.1",
    "@electron-forge/maker-deb": "^6.2.1",
    "@electron-forge/maker-rpm": "^6.2.1",
    "@electron-forge/maker-squirrel": "^6.2.1",
    "@electron-forge/maker-zip": "^6.2.1",
    "@electron-forge/plugin-auto-unpack-natives": "^6.2.1",
    "@electron-forge/plugin-webpack": "^6.2.1",
    "@electron-forge/publisher-github": "^6.2.1",
    "@electron/universal": "^1.4.1",
    "@svgr/webpack": "^8.0.1",
    "@types/chmodr": "^1.0.0",
    "@types/node": "^20.4.0",
    "@types/react": "^18.2.14",
    "@types/react-dom": "^18.2.6",
    "@types/uuid": "^9.0.2",
    "@typescript-eslint/eslint-plugin": "^5.60.0",
    "@typescript-eslint/parser": "^5.60.0",
    "@vercel/webpack-asset-relocator-loader": "^1.7.3",
    "babel-loader": "^9.1.2",
    "chmodr": "^1.2.0",
    "copy-webpack-plugin": "^11.0.0",
    "css-loader": "^6.8.1",
    "electron": "25.9.2",
    "eslint": "^8.43.0",
    "eslint-plugin-import": "^2.27.5",
    "fork-ts-checker-webpack-plugin": "^7.3.0",
    "node-loader": "^2.0.0",
    "postcss": "^8.4.24",
    "postcss-import": "^15.1.0",
    "postcss-loader": "^7.3.3",
    "postcss-preset-env": "^8.5.1",
    "style-loader": "^3.3.3",
    "svg-inline-loader": "^0.8.2",
    "tailwindcss": "^3.3.2",
    "ts-loader": "^9.4.3",
    "ts-node": "^10.9.1",
    "typescript": "~4.5.4",
    "url-loader": "^4.1.1",
    "webpack": "^5.88.0",
    "webpack-cli": "^5.1.4",
    "webpack-dev-server": "^4.15.1"
  },
  "dependencies": {
    "@electron/remote": "^2.0.10",
    "@heroicons/react": "^2.0.18",
    "@segment/analytics-node": "^1.0.0",
    "copy-to-clipboard": "^3.3.3",
    "electron-squirrel-startup": "^1.0.0",
    "electron-store": "^8.1.0",
    "react": "^18.2.0",
    "react-dom": "^18.2.0",
    "uuid": "^9.0.0",
    "winston": "^3.10.0",
    "winston-daily-rotate-file": "^4.7.1"
  }
}

@@ -1,7 +0,0 @@
module.exports = {
  plugins: {
    'postcss-import': {},
    tailwindcss: {},
    autoprefixer: {},
  },
}

@@ -1,34 +0,0 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

html,
body {
  background: transparent;
}

.drag {
  -webkit-app-region: drag;
}

.no-drag {
  -webkit-app-region: no-drag;
}

.blink {
  -webkit-animation: 1s blink step-end infinite;
  -moz-animation: 1s blink step-end infinite;
  -ms-animation: 1s blink step-end infinite;
  -o-animation: 1s blink step-end infinite;
  animation: 1s blink step-end infinite;
}

@keyframes blink {
  from,
  to {
    color: transparent;
  }
  50% {
    color: black;
  }
}

@@ -1,122 +0,0 @@
import { useState } from 'react'
import copy from 'copy-to-clipboard'
import { CheckIcon, DocumentDuplicateIcon } from '@heroicons/react/24/outline'
import Store from 'electron-store'
import { getCurrentWindow, app } from '@electron/remote'

import { install } from './install'
import OllamaIcon from './ollama.svg'

const store = new Store()

enum Step {
  WELCOME = 0,
  CLI,
  FINISH,
}

export default function () {
  const [step, setStep] = useState<Step>(Step.WELCOME)
  const [commandCopied, setCommandCopied] = useState<boolean>(false)

  const command = 'ollama run llama3.2'

  return (
    <div className='drag'>
      <div className='mx-auto flex min-h-screen w-full flex-col justify-between bg-white px-4 pt-16'>
        {step === Step.WELCOME && (
          <>
            <div className='mx-auto text-center'>
              <h1 className='mb-6 mt-4 text-2xl tracking-tight text-gray-900'>Welcome to Ollama</h1>
              <p className='mx-auto w-[65%] text-sm text-gray-400'>
                Let's get you up and running with your own large language models.
              </p>
              <button
                onClick={() => setStep(Step.CLI)}
                className='no-drag rounded-dm mx-auto my-8 w-[40%] rounded-md bg-black px-4 py-2 text-sm text-white hover:brightness-110'
              >
                Next
              </button>
            </div>
            <div className='mx-auto'>
              <OllamaIcon />
            </div>
          </>
        )}
        {step === Step.CLI && (
          <>
            <div className='mx-auto flex flex-col space-y-28 text-center'>
              <h1 className='mt-4 text-2xl tracking-tight text-gray-900'>Install the command line</h1>
              <pre className='mx-auto text-4xl text-gray-400'>&gt; ollama</pre>
              <div className='mx-auto'>
                <button
                  onClick={async () => {
                    try {
                      await install()
                      setStep(Step.FINISH)
                    } catch (e) {
                      console.error('could not install: ', e)
                    } finally {
                      getCurrentWindow().show()
                      getCurrentWindow().focus()
                    }
                  }}
                  className='no-drag rounded-dm mx-auto w-[60%] rounded-md bg-black px-4 py-2 text-sm text-white hover:brightness-110'
                >
                  Install
                </button>
                <p className='mx-auto my-4 w-[70%] text-xs text-gray-400'>
                  You will be prompted for administrator access
                </p>
              </div>
            </div>
          </>
        )}
        {step === Step.FINISH && (
          <>
            <div className='mx-auto flex flex-col space-y-20 text-center'>
              <h1 className='mt-4 text-2xl tracking-tight text-gray-900'>Run your first model</h1>
              <div className='flex flex-col'>
                <div className='group relative flex items-center'>
                  <pre className='language-none text-2xs w-full rounded-md bg-gray-100 px-4 py-3 text-start leading-normal'>
                    {command}
                  </pre>
                  <button
                    className={`no-drag absolute right-[5px] px-2 py-2 ${
                      commandCopied
                        ? 'text-gray-900 opacity-100 hover:cursor-auto'
                        : 'text-gray-200 opacity-50 hover:cursor-pointer'
                    } hover:font-bold hover:text-gray-900 group-hover:opacity-100`}
                    onClick={() => {
                      copy(command)
                      setCommandCopied(true)
                      setTimeout(() => setCommandCopied(false), 3000)
                    }}
                  >
                    {commandCopied ? (
                      <CheckIcon className='h-4 w-4 font-bold text-gray-500' />
                    ) : (
                      <DocumentDuplicateIcon className='h-4 w-4 text-gray-500' />
                    )}
                  </button>
                </div>
                <p className='mx-auto my-4 w-[70%] text-xs text-gray-400'>
                  Run this command in your favorite terminal.
                </p>
              </div>
              <button
                onClick={() => {
                  store.set('first-time-run', true)
                  window.close()
                }}
                className='no-drag rounded-dm mx-auto w-[60%] rounded-md bg-black px-4 py-2 text-sm text-white hover:brightness-110'
              >
                Finish
              </button>
            </div>
          </>
        )}
      </div>
    </div>
  )
}

@@ -1,4 +0,0 @@
declare module '*.svg' {
  const content: string
  export default content
}

@@ -1,9 +0,0 @@
<!DOCTYPE html>
<html>
  <head>
    <meta charset="UTF-8" />
  </head>
  <body>
    <div id="app"></div>
  </body>
</html>

@@ -1,302 +0,0 @@
import { spawn, ChildProcess } from 'child_process'
import { app, autoUpdater, dialog, Tray, Menu, BrowserWindow, MenuItemConstructorOptions, nativeTheme } from 'electron'
import Store from 'electron-store'
import winston from 'winston'
import 'winston-daily-rotate-file'
import * as path from 'path'

import { v4 as uuidv4 } from 'uuid'
import { installed } from './install'

require('@electron/remote/main').initialize()

if (require('electron-squirrel-startup')) {
  app.quit()
}

const store = new Store()

let welcomeWindow: BrowserWindow | null = null

declare const MAIN_WINDOW_WEBPACK_ENTRY: string

const logger = winston.createLogger({
  transports: [
    new winston.transports.Console(),
    new winston.transports.File({
      filename: path.join(app.getPath('home'), '.ollama', 'logs', 'server.log'),
      maxsize: 1024 * 1024 * 20,
      maxFiles: 5,
    }),
  ],
  format: winston.format.printf(info => info.message),
})

app.on('ready', () => {
  const gotTheLock = app.requestSingleInstanceLock()
  if (!gotTheLock) {
    app.exit(0)
    return
  }

  app.on('second-instance', () => {
    if (app.hasSingleInstanceLock()) {
      app.releaseSingleInstanceLock()
    }

    if (proc) {
      proc.off('exit', restart)
      proc.kill()
    }

    app.exit(0)
  })

  app.focus({ steal: true })

  init()
})

function firstRunWindow() {
  // Create the browser window.
  welcomeWindow = new BrowserWindow({
    width: 400,
    height: 500,
    frame: false,
    fullscreenable: false,
    resizable: false,
    movable: true,
    show: false,
    webPreferences: {
      nodeIntegration: true,
      contextIsolation: false,
    },
  })

  require('@electron/remote/main').enable(welcomeWindow.webContents)

  welcomeWindow.loadURL(MAIN_WINDOW_WEBPACK_ENTRY)
  welcomeWindow.on('ready-to-show', () => welcomeWindow.show())
  welcomeWindow.on('closed', () => {
    if (process.platform === 'darwin') {
      app.dock.hide()
    }
  })
}

let tray: Tray | null = null
let updateAvailable = false
const assetPath = app.isPackaged ? process.resourcesPath : path.join(__dirname, '..', '..', 'assets')

function trayIconPath() {
  return nativeTheme.shouldUseDarkColors
    ? updateAvailable
      ? path.join(assetPath, 'iconDarkUpdateTemplate.png')
      : path.join(assetPath, 'iconDarkTemplate.png')
    : updateAvailable
      ? path.join(assetPath, 'iconUpdateTemplate.png')
      : path.join(assetPath, 'iconTemplate.png')
}

function updateTrayIcon() {
  if (tray) {
    tray.setImage(trayIconPath())
  }
}

function updateTray() {
  const updateItems: MenuItemConstructorOptions[] = [
    { label: 'An update is available', enabled: false },
    {
      label: 'Restart to update',
      click: () => autoUpdater.quitAndInstall(),
    },
    { type: 'separator' },
  ]

  const menu = Menu.buildFromTemplate([
    ...(updateAvailable ? updateItems : []),
    { role: 'quit', label: 'Quit Ollama', accelerator: 'Command+Q' },
  ])

  if (!tray) {
    tray = new Tray(trayIconPath())
  }

  tray.setToolTip(updateAvailable ? 'An update is available' : 'Ollama')
  tray.setContextMenu(menu)
  tray.setImage(trayIconPath())

  nativeTheme.off('updated', updateTrayIcon)
  nativeTheme.on('updated', updateTrayIcon)
}

let proc: ChildProcess = null

function server() {
  const binary = app.isPackaged
    ? path.join(process.resourcesPath, 'ollama')
    : path.resolve(process.cwd(), '..', 'ollama')

  proc = spawn(binary, ['serve'])

  proc.stdout.on('data', data => {
    logger.info(data.toString().trim())
  })

  proc.stderr.on('data', data => {
    logger.error(data.toString().trim())
  })

  proc.on('exit', restart)
}

function restart() {
  setTimeout(server, 1000)
}

app.on('before-quit', () => {
  if (proc) {
    proc.off('exit', restart)
    proc.kill('SIGINT') // send SIGINT signal to the server, which also stops any loaded llms
  }
})

const updateURL = `https://ollama.com/api/update?os=${process.platform}&arch=${
  process.arch
}&version=${app.getVersion()}&id=${id()}`

let latest = ''
async function isNewReleaseAvailable() {
  try {
    const response = await fetch(updateURL)

    if (!response.ok) {
      return false
    }

    if (response.status === 204) {
      return false
    }

    const data = await response.json()

    const url = data?.url
    if (!url) {
      return false
    }

    if (latest === url) {
      return false
    }

    latest = url

    return true
  } catch (error) {
    logger.error(`update check failed - ${error}`)
    return false
  }
}

async function checkUpdate() {
  const available = await isNewReleaseAvailable()
  if (available) {
    logger.info('checking for update')
    autoUpdater.checkForUpdates()
  }
}

function init() {
  if (app.isPackaged) {
    checkUpdate()
    setInterval(() => {
      checkUpdate()
    }, 60 * 60 * 1000)
  }

  updateTray()

  if (process.platform === 'darwin') {
    if (app.isPackaged) {
      if (!app.isInApplicationsFolder()) {
        const chosen = dialog.showMessageBoxSync({
          type: 'question',
          buttons: ['Move to Applications', 'Do Not Move'],
          message: 'Ollama works best when run from the Applications directory.',
          defaultId: 0,
          cancelId: 1,
        })

        if (chosen === 0) {
          try {
            app.moveToApplicationsFolder({
              conflictHandler: conflictType => {
                if (conflictType === 'existsAndRunning') {
                  dialog.showMessageBoxSync({
                    type: 'info',
                    message: 'Cannot move to Applications directory',
                    detail:
                      'Another version of Ollama is currently running from your Applications directory. Close it first and try again.',
                  })
                }
                return true
              },
            })
            return
          } catch (e) {
            logger.error(`[Move to Applications] Failed to move to applications folder - ${e.message}}`)
          }
        }
      }
    }
  }

  server()

  if (store.get('first-time-run') && installed()) {
    if (process.platform === 'darwin') {
      app.dock.hide()
    }

    app.setLoginItemSettings({ openAtLogin: app.getLoginItemSettings().openAtLogin })
    return
  }

  // This is the first run or the CLI is no longer installed
  app.setLoginItemSettings({ openAtLogin: true })
  firstRunWindow()
}

// Quit when all windows are closed, except on macOS. There, it's common
// for applications and their menu bar to stay active until the user quits
// explicitly with Cmd + Q.
app.on('window-all-closed', () => {
  if (process.platform !== 'darwin') {
    app.quit()
  }
})

function id(): string {
  const id = store.get('id') as string

  if (id) {
    return id
  }

  const uuid = uuidv4()
  store.set('id', uuid)
  return uuid
}

autoUpdater.setFeedURL({ url: updateURL })

autoUpdater.on('error', e => {
  logger.error(`update check failed - ${e.message}`)
  console.error(`update check failed - ${e.message}`)
})

autoUpdater.on('update-downloaded', () => {
  updateAvailable = true
  updateTray()
})

@@ -1,21 +0,0 @@
import * as fs from 'fs'
import { exec as cbExec } from 'child_process'
import * as path from 'path'
import { promisify } from 'util'

const app = process && process.type === 'renderer' ? require('@electron/remote').app : require('electron').app
const ollama = app.isPackaged ? path.join(process.resourcesPath, 'ollama') : path.resolve(process.cwd(), '..', 'ollama')
const exec = promisify(cbExec)
const symlinkPath = '/usr/local/bin/ollama'

export function installed() {
  return fs.existsSync(symlinkPath) && fs.readlinkSync(symlinkPath) === ollama
}

export async function install() {
  const command = `do shell script "mkdir -p ${path.dirname(
    symlinkPath
  )} && ln -F -s \\"${ollama}\\" \\"${symlinkPath}\\"" with administrator privileges`

  await exec(`osascript -e '${command}'`)
}

(1 deleted image asset, 17 KiB)

@@ -1,7 +0,0 @@
import App from './app'
import './app.css'
import { createRoot } from 'react-dom/client'

const container = document.getElementById('app')
const root = createRoot(container)
root.render(<App />)

@@ -1,6 +0,0 @@
/** @type {import('tailwindcss').Config} */
module.exports = {
  content: ['./src/**/*.{js,ts,jsx,tsx,mdx}'],
  theme: {},
  plugins: [],
}

@@ -1,20 +0,0 @@
{
  "compilerOptions": {
    "target": "ES6",
    "allowJs": true,
    "module": "commonjs",
    "skipLibCheck": true,
    "esModuleInterop": true,
    "noImplicitAny": true,
    "sourceMap": true,
    "baseUrl": ".",
    "outDir": "dist",
    "moduleResolution": "node",
    "resolveJsonModule": true,
    "paths": {
      "*": ["node_modules/*"]
    },
    "jsx": "react-jsx"
  },
  "include": ["src/**/*"]
}

@@ -1,20 +0,0 @@
import type { Configuration } from 'webpack'

import { rules } from './webpack.rules'
import { plugins } from './webpack.plugins'

export const mainConfig: Configuration = {
  /**
   * This is the main entry point for your application, it's the first file
   * that runs in the main process.
   */
  entry: './src/index.ts',
  // Put your normal webpack config below here
  module: {
    rules,
  },
  plugins,
  resolve: {
    extensions: ['.js', '.ts', '.jsx', '.tsx', '.css', '.json'],
  },
}

@@ -1,14 +0,0 @@
import type IForkTsCheckerWebpackPlugin from 'fork-ts-checker-webpack-plugin'
import { DefinePlugin } from 'webpack'

// eslint-disable-next-line @typescript-eslint/no-var-requires
const ForkTsCheckerWebpackPlugin: typeof IForkTsCheckerWebpackPlugin = require('fork-ts-checker-webpack-plugin')

export const plugins = [
  new ForkTsCheckerWebpackPlugin({
    logger: 'webpack-infrastructure',
  }),
  new DefinePlugin({
    'process.env.TELEMETRY_WRITE_KEY': JSON.stringify(process.env.TELEMETRY_WRITE_KEY),
  }),
]

@@ -1,19 +0,0 @@
import type { Configuration } from 'webpack'

import { rules } from './webpack.rules'
import { plugins } from './webpack.plugins'

rules.push({
  test: /\.css$/,
  use: [{ loader: 'style-loader' }, { loader: 'css-loader' }, { loader: 'postcss-loader' }],
})

export const rendererConfig: Configuration = {
  module: {
    rules,
  },
  plugins,
  resolve: {
    extensions: ['.js', '.ts', '.jsx', '.tsx', '.css'],
  },
}

@@ -1,35 +0,0 @@
import type { ModuleOptions } from 'webpack'

export const rules: Required<ModuleOptions>['rules'] = [
  // Add support for native node modules
  {
    // We're specifying native_modules in the test because the asset relocator loader generates a
    // "fake" .node file which is really a cjs file.
    test: /native_modules[/\\].+\.node$/,
    use: 'node-loader',
  },
  {
    test: /[/\\]node_modules[/\\].+\.(m?js|node)$/,
    parser: { amd: false },
    use: {
      loader: '@vercel/webpack-asset-relocator-loader',
      options: {
        outputAssetBase: 'native_modules',
      },
    },
  },
  {
    test: /\.tsx?$/,
    exclude: /(node_modules|\.webpack)/,
    use: {
      loader: 'ts-loader',
      options: {
        transpileOnly: true,
      },
    },
  },
  {
    test: /\.svg$/,
    use: ['@svgr/webpack'],
  },
]

@@ -146,7 +146,6 @@ type Tensor interface {
	FromFloats([]float32)
	FromInts([]int32)

	Neg(ctx Context) Tensor
	Add(ctx Context, t2 Tensor) Tensor
	Sub(ctx Context, t2 Tensor) Tensor
	Mul(ctx Context, t2 Tensor) Tensor

@@ -174,6 +173,7 @@ type Tensor interface {
	Cos(ctx Context) Tensor
	Tanh(ctx Context) Tensor
	GELU(ctx Context, up ...Tensor) Tensor
	QuickGELU(ctx Context, up ...Tensor) Tensor
	SILU(ctx Context, up ...Tensor) Tensor
	RELU(ctx Context, up ...Tensor) Tensor
	Sigmoid(ctx Context) Tensor

@@ -185,7 +185,6 @@ type Tensor interface {
	View(ctx Context, offset int, shape ...int) Tensor
	Permute(ctx Context, shape ...int) Tensor
	Contiguous(ctx Context, shape ...int) Tensor
	Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

	Pad(ctx Context, shape ...int) Tensor

@@ -195,9 +194,14 @@ type Tensor interface {
	Repeat(ctx Context, dim, n int) Tensor
	Concat(ctx Context, t2 Tensor, dim int) Tensor
	Rows(ctx Context, t2 Tensor) Tensor
	SetRows(ctx Context, src Tensor, idxs Tensor) Tensor
	Copy(ctx Context, t2 Tensor) Tensor
	Duplicate(ctx Context) Tensor

	Slice(ctx Context, dim, low, high, step int) Tensor
	Chunk(ctx Context, dim int, size int) []Tensor
	ChunkSections(ctx Context, dim int, sections ...int) []Tensor

	TopK(ctx Context, k int) Tensor
	Argsort(ctx Context) Tensor
	Mean(ctx Context) Tensor

@@ -205,7 +209,8 @@ type Tensor interface {
	Stddev(ctx Context) Tensor
	Sqr(ctx Context) Tensor
	Sqrt(ctx Context) Tensor
	Clamp(ctx Context, min, max float32) Tensor

	Interpolate(ctx Context, dims [4]int, samplingMode SamplingMode) Tensor
}

// ScaledDotProductAttention implements a fused attention

@@ -229,7 +234,7 @@ type Tensor interface {
//	kqv := value.Mulmat(ctx, kq)
//	return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
type ScaledDotProductAttention interface {
	ScaledDotProductAttention(ctx Context, key, value, mask, sinks Tensor, scale float64) Tensor
	ScaledDotProductAttention(ctx Context, key, value, mask, sinks Tensor, vmla Tensor, scale float64) Tensor
}

type number interface {

@@ -371,3 +376,10 @@ const (
	DTypeI32
	DTypeMXFP4
)

type SamplingMode int

const (
	SamplingModeNearest SamplingMode = iota
	SamplingModeBilinear
)

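The new slicing surface composes naturally. As a minimal hedged sketch (not part of this patch), a caller might split a fused QKV projection with `ChunkSections`; the `splitQKV` helper, the `qkv` tensor, and the three extents are hypothetical, and only the method signature comes from the interface above:

```go
// splitQKV is illustrative only: it partitions a fused projection into
// query/key/value views along dim 0 using the ChunkSections method added
// above. ChunkSections panics if qDim+kDim+vDim != qkv.Dim(0).
func splitQKV(ctx ml.Context, qkv ml.Tensor, qDim, kDim, vDim int) (q, k, v ml.Tensor) {
	parts := qkv.ChunkSections(ctx, 0, qDim, kDim, vDim) // three no-copy views
	return parts[0], parts[1], parts[2]
}
```
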
@@ -314,7 +314,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
				"altup_proj", "altup_unembd_proj",
				"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
				createTensor(tensor{source: t}, output.bts, blocks)
			case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm.") || strings.HasPrefix(t.Name, "s."):
				// TODO: assign vision tensors to the gpu if possible
				createTensor(tensor{source: t}, output.bts, blocks)
			case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):

@@ -499,7 +499,6 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range b.meta.Tensors().Items() {
		t := t
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
			for i := range tts {

@@ -1137,13 +1136,6 @@ func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
	}
}

func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_neg(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,

@@ -1346,6 +1338,13 @@ func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	}
}

func (t *Tensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_set_rows(ctx.(*Context).ctx, t.t, src.(*Tensor).t, idxs.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,

@@ -1386,6 +1385,10 @@ func inferShape(t *Tensor, shape []int) {
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	if !C.ggml_is_contiguous(t.t) {
		return t.Contiguous(ctx, shape...)
	}

	if slices.Contains(shape, -1) {
		inferShape(t, shape)
	}

@@ -1575,6 +1578,16 @@ func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	}
}

func (t *Tensor) QuickGELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	var tt *C.struct_ggml_tensor
	if len(t2) > 0 {
		tt = C.ggml_geglu_quick_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t)
	} else {
		tt = C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t)
	}
	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) SILU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
	if len(t2) > 0 {
		return &Tensor{

@@ -1632,21 +1645,7 @@ func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	}
}

func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sinks ml.Tensor, scale float64) ml.Tensor {
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sinks ml.Tensor, vmla ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t

@@ -1663,6 +1662,16 @@ func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sin
			C.ggml_flash_attn_ext_add_sinks(kqv, sinks.(*Tensor).t)
		}
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)

		if vmla != nil {
			var cur ml.Tensor = &Tensor{b: t.b, t: kqv}
			cur = cur.Permute(ctx, 0, 2, 1, 3)
			cur = vmla.Mulmat(ctx, cur)
			cur = cur.Permute(ctx, 0, 2, 1, 3)
			cur = cur.Contiguous(ctx)
			kqv = cur.(*Tensor).t
		}

		return &Tensor{b: t.b, t: kqv}
	} else {
		kq := key.MulmatFullPrec(ctx, query)

@@ -1675,6 +1684,10 @@ func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask, sin
		}

		kqv := value.Mulmat(ctx, kq)
		if vmla != nil {
			kqv = vmla.Mulmat(ctx, kqv)
		}

		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}

@@ -1732,9 +1745,82 @@ func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
	}
}

func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
func (t *Tensor) Interpolate(ctx ml.Context, dims [4]int, samplingMode ml.SamplingMode) ml.Tensor {
	var mode C.uint32_t
	switch samplingMode {
	case ml.SamplingModeNearest:
		mode = C.GGML_SCALE_MODE_NEAREST
	case ml.SamplingModeBilinear:
		mode = C.GGML_SCALE_MODE_BILINEAR
	default:
		panic("unsupported interpolate mode")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
		t: C.ggml_interpolate(ctx.(*Context).ctx, t.t, C.int64_t(dims[0]), C.int64_t(dims[1]), C.int64_t(dims[2]), C.int64_t(dims[3]), mode),
	}
}

// Slice returns a view of the tensor sliced along dim from low to high in step steps.
// Slice panics if the dimension is invalid or the slice parameters are out of range.
// If dim=0 and step>1, the tensor is a copy rather than a view to ensure proper shape.
func (t *Tensor) Slice(ctx ml.Context, dim int, low, high, step int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	} else if low < 0 || high > t.Dim(dim) || low >= high || step < 1 {
		panic("invalid slice parameters")
	}

	if dim == 0 && step > 1 {
		// dim=0,step>1 is a special case so handle it here first
		return t.View(ctx,
			low*t.Stride(0), 1,
			step*t.Stride(0), (high-low+1)/step,
			t.Stride(1), t.Dim(1),
			// preserve dim 3 by merging it into dim 2
			t.Stride(2), t.Dim(2)*t.Dim(3),
		).Contiguous(ctx, (high-low+1)/step, t.Dim(1), t.Dim(2), t.Dim(3))
	}

	args := []int{
		low * t.Stride(dim), t.Dim(0),
		t.Stride(1), t.Dim(1),
		t.Stride(2), t.Dim(2),
		t.Stride(3), t.Dim(3),
	}

	if step == 1 {
		args[dim*2+1] = high - low
		return t.View(ctx, args[0], args[1:]...)
	} else {
		args[dim*2] = step * t.Stride(dim)
		args[dim*2+1] = (high - low + 1) / step
		return t.View(ctx, args[0], args[1:]...)
	}
}

// Chunk the tensor into chunk sized tensors along dim. Each sub-tensor is a view of
// the original.
func (t *Tensor) Chunk(ctx ml.Context, dim, chunk int) []ml.Tensor {
	sections := make([]int, 0, t.Dim(dim)/chunk+1)
	for rest := t.Dim(dim); rest > 0; rest -= chunk {
		sections = append(sections, min(chunk, rest))
	}
	return t.ChunkSections(ctx, dim, sections...)
}

// ChunkSections split the tensor into section sized tensors along dim. Each sub-tensor is a
// view of the original. The size of the dim must equal the sum of sections.
func (t *Tensor) ChunkSections(ctx ml.Context, dim int, sections ...int) []ml.Tensor {
	var offset int
	s := make([]ml.Tensor, len(sections))
	for i, section := range sections {
		s[i] = t.Slice(ctx, dim, offset, offset+section, 1)
		offset += section
	}
	if offset != t.Dim(dim) {
		panic("sections do not sum to tensor dimension")
	}
	return s
}

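A hand-worked trace of the view arithmetic may help (assuming, as with ggml's nb values, that `Stride` returns byte strides); the wrapper below is hypothetical and only restates what `Slice` computes for `dim=1, low=1, high=3, step=1` on a contiguous 4x4x4x4 f32 tensor:

```go
// sliceRows is an illustrative helper, not part of the patch. For the
// shape above, Slice starts the view at byte offset 1*t.Stride(1) (one
// row of four floats in) and shrinks dim 1 to high-low = 2, which is
// why the corresponding TestSlice case below expects values 4..11 in
// its first block.
func sliceRows(ctx ml.Context, t ml.Tensor, low, high int) ml.Tensor {
	return t.Slice(ctx, 1, low, high, 1).Contiguous(ctx)
}
```
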
@@ -3513,7 +3513,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
    if (ggml_hip_mgmt_init() == 0) {
        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
        if (status == 0) {
            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
            GGML_LOG_DEBUG("%s device %s utilizing AMD specific memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
            ggml_hip_mgmt_release();
            return;
        }

@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                return false;
            }
            if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
                return false;
            }
#ifdef GGML_USE_MUSA
            const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
            if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {

@@ -13212,7 +13212,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
    if (ggml_hip_mgmt_init() == 0) {
        int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
        if (status == 0) {
            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
            GGML_LOG_DEBUG("%s device %s utilizing AMD specific memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
            ggml_hip_mgmt_release();
            return;
        }

@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
        fprintf(stderr, "%s\n", message);
        ggml_print_backtrace();
    }

#if defined(_WIN32)
    fflush(stderr);
    fflush(stdout);
    exit(1);
#else
    abort();
#endif
}

// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "ggml-impl.h"

#ifdef _WIN32
// AMD Device Library eXtra (ADLX)

@@ -16,7 +17,6 @@
// Unused function parameters are commented out to avoid unnecessary type
// definitions.

#include "ggml-impl.h"
#include <filesystem>
#include <mutex>

@@ -436,15 +436,92 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {

#else // #ifdef _WIN32

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <filesystem>

#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <glob.h>
namespace fs = std::filesystem;

extern "C" {

// TODO Linux implementation of accurate VRAM reporting
int ggml_hip_mgmt_init() {
    return -1;
    return 0;
}
void ggml_hip_mgmt_release() {}
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
    return -1;
    GGML_LOG_INFO("%s searching for device %s\n", __func__, id);
    const std::string drmDeviceGlob = "/sys/class/drm/card*/device/uevent";
    const std::string drmTotalMemoryFile = "mem_info_vram_total";
    const std::string drmUsedMemoryFile = "mem_info_vram_used";
    const std::string drmUeventPCISlotLabel = "PCI_SLOT_NAME=";

    glob_t glob_result;
    glob(drmDeviceGlob.c_str(), GLOB_NOSORT, NULL, &glob_result);

    for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
        const char* device_file = glob_result.gl_pathv[i];
        std::ifstream file(device_file);
        if (!file.is_open()) {
            std::cerr << "Failed to open sysfs node" << std::endl;
            globfree(&glob_result);
            return 1;
        }

        std::string line;
        while (std::getline(file, line)) {
            // Check for PCI_SLOT_NAME label
            if (line.find(drmUeventPCISlotLabel) == 0) {
                std::istringstream iss(line.substr(drmUeventPCISlotLabel.size()));
                std::string pciSlot;
                iss >> pciSlot;
                if (pciSlot == std::string(id)) {
                    std::string dir = fs::path(device_file).parent_path().string();

                    std::string totalFile = dir + "/" + drmTotalMemoryFile;
                    std::ifstream totalFileStream(totalFile.c_str());
                    if (!totalFileStream.is_open()) {
                        GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, totalFile.c_str());
                        file.close();
                        globfree(&glob_result);
                        return 1;
                    }

                    uint64_t memory;
                    totalFileStream >> memory;
                    *total = memory;

                    std::string usedFile = dir + "/" + drmUsedMemoryFile;
                    std::ifstream usedFileStream(usedFile.c_str());
                    if (!usedFileStream.is_open()) {
                        GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, usedFile.c_str());
                        file.close();
                        globfree(&glob_result);
                        return 1;
                    }

                    uint64_t memoryUsed;
                    usedFileStream >> memoryUsed;
                    *free = memory - memoryUsed;

                    file.close();
                    globfree(&glob_result);
                    return 0;
                }
            }
        }

        file.close();
    }
    GGML_LOG_DEBUG("%s unable to find matching device\n", __func__);
    globfree(&glob_result);
    return 1;
}

} // extern "C"

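The Linux path above boils down to: find the drm card whose uevent carries the requested PCI slot, then read two sysfs counters next to it. A minimal Go sketch of the same lookup (sysfs paths from the patch; the PCI slot string in `main` is a made-up example):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// vramForPCISlot mirrors the sysfs walk in the C++ above: match
// PCI_SLOT_NAME in each card's uevent, then read the vram counters.
func vramForPCISlot(slot string) (free, total uint64, err error) {
	readUint := func(p string) (uint64, error) {
		b, err := os.ReadFile(p)
		if err != nil {
			return 0, err
		}
		return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
	}

	cards, _ := filepath.Glob("/sys/class/drm/card*/device/uevent")
	for _, uevent := range cards {
		b, err := os.ReadFile(uevent)
		if err != nil || !strings.Contains(string(b), "PCI_SLOT_NAME="+slot) {
			continue
		}
		dir := filepath.Dir(uevent)
		total, err := readUint(filepath.Join(dir, "mem_info_vram_total"))
		if err != nil {
			return 0, 0, err
		}
		used, err := readUint(filepath.Join(dir, "mem_info_vram_used"))
		if err != nil {
			return 0, 0, err
		}
		return total - used, total, nil
	}
	return 0, 0, fmt.Errorf("no drm device with PCI slot %s", slot)
}

func main() {
	// "0000:03:00.0" is a hypothetical PCI slot for illustration.
	if free, total, err := vramForPCISlot("0000:03:00.0"); err == nil {
		fmt.Printf("free %d of %d bytes\n", free, total)
	}
}
```
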
@@ -2,6 +2,7 @@ package ggml

import (
	"errors"
	"fmt"
	"os"
	"testing"

@@ -368,10 +369,714 @@ func TestPermute(t *testing.T) {
	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			ctx := setup(t)
			got := tt.input(ctx).Permute(ctx, tt.shape...).Contiguous(ctx)
			got := tt.input(ctx).Permute(ctx, tt.shape...)
			got = got.Contiguous(ctx)
			if diff := cmp.Diff(tt.want(ctx), got, EquateTensors(ctx)); diff != "" {
				t.Errorf("Permute() result mismatch (-want +got):\n%s", diff)
			}
		})
	}
}

func TestSlice(t *testing.T) {
	cases := []struct {
		dim   int
		low   int
		high  int
		step  int
		input func(ml.Context) ml.Tensor
		want  func(ml.Context) ml.Tensor
	}{
		{
			dim: 0, low: 1, high: 3, step: 1,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					1, 2,
					5, 6,
					9, 10,
					13, 14,

					17, 18,
					21, 22,
					25, 26,
					29, 30,

					33, 34,
					37, 38,
					41, 42,
					45, 46,

					49, 50,
					53, 54,
					57, 58,
					61, 62,

					65, 66,
					69, 70,
					73, 74,
					77, 78,

					81, 82,
					85, 86,
					89, 90,
					93, 94,

					97, 98,
					101, 102,
					105, 106,
					109, 110,

					113, 114,
					117, 118,
					121, 122,
					125, 126,

					129, 130,
					133, 134,
					137, 138,
					141, 142,

					145, 146,
					149, 150,
					153, 154,
					157, 158,

					161, 162,
					165, 166,
					169, 170,
					173, 174,

					177, 178,
					181, 182,
					185, 186,
					189, 190,

					193, 194,
					197, 198,
					201, 202,
					205, 206,

					209, 210,
					213, 214,
					217, 218,
					221, 222,

					225, 226,
					229, 230,
					233, 234,
					237, 238,

					241, 242,
					245, 246,
					249, 250,
					253, 254,
				}, 2, 4, 4, 4)
			},
		},
		{
			dim: 1, low: 1, high: 3, step: 1,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					4, 5, 6, 7,
					8, 9, 10, 11,

					20, 21, 22, 23,
					24, 25, 26, 27,

					36, 37, 38, 39,
					40, 41, 42, 43,

					52, 53, 54, 55,
					56, 57, 58, 59,

					68, 69, 70, 71,
					72, 73, 74, 75,

					84, 85, 86, 87,
					88, 89, 90, 91,

					100, 101, 102, 103,
					104, 105, 106, 107,

					116, 117, 118, 119,
					120, 121, 122, 123,

					132, 133, 134, 135,
					136, 137, 138, 139,

					148, 149, 150, 151,
					152, 153, 154, 155,

					164, 165, 166, 167,
					168, 169, 170, 171,

					180, 181, 182, 183,
					184, 185, 186, 187,

					196, 197, 198, 199,
					200, 201, 202, 203,

					212, 213, 214, 215,
					216, 217, 218, 219,

					228, 229, 230, 231,
					232, 233, 234, 235,

					244, 245, 246, 247,
					248, 249, 250, 251,
				}, 4, 2, 4, 4)
			},
		},
		{
			dim: 2, low: 1, high: 3, step: 1,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					16, 17, 18, 19,
					20, 21, 22, 23,
					24, 25, 26, 27,
					28, 29, 30, 31,

					32, 33, 34, 35,
					36, 37, 38, 39,
					40, 41, 42, 43,
					44, 45, 46, 47,

					80, 81, 82, 83,
					84, 85, 86, 87,
					88, 89, 90, 91,
					92, 93, 94, 95,

					96, 97, 98, 99,
					100, 101, 102, 103,
					104, 105, 106, 107,
					108, 109, 110, 111,

					144, 145, 146, 147,
					148, 149, 150, 151,
					152, 153, 154, 155,
					156, 157, 158, 159,

					160, 161, 162, 163,
					164, 165, 166, 167,
					168, 169, 170, 171,
					172, 173, 174, 175,

					208, 209, 210, 211,
					212, 213, 214, 215,
					216, 217, 218, 219,
					220, 221, 222, 223,

					224, 225, 226, 227,
					228, 229, 230, 231,
					232, 233, 234, 235,
					236, 237, 238, 239,
				}, 4, 4, 2, 4)
			},
		},
		{
			dim: 3, low: 1, high: 3, step: 1,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					64, 65, 66, 67,
					68, 69, 70, 71,
					72, 73, 74, 75,
					76, 77, 78, 79,

					80, 81, 82, 83,
					84, 85, 86, 87,
					88, 89, 90, 91,
					92, 93, 94, 95,

					96, 97, 98, 99,
					100, 101, 102, 103,
					104, 105, 106, 107,
					108, 109, 110, 111,

					112, 113, 114, 115,
					116, 117, 118, 119,
					120, 121, 122, 123,
					124, 125, 126, 127,

					128, 129, 130, 131,
					132, 133, 134, 135,
					136, 137, 138, 139,
					140, 141, 142, 143,

					144, 145, 146, 147,
					148, 149, 150, 151,
					152, 153, 154, 155,
					156, 157, 158, 159,

					160, 161, 162, 163,
					164, 165, 166, 167,
					168, 169, 170, 171,
					172, 173, 174, 175,

					176, 177, 178, 179,
					180, 181, 182, 183,
					184, 185, 186, 187,
					188, 189, 190, 191,
				}, 4, 4, 4, 2)
			},
		},
		{
			dim: 0, low: 0, high: 4, step: 2,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					0, 2,
					4, 6,
					8, 10,
					12, 14,

					16, 18,
					20, 22,
					24, 26,
					28, 30,

					32, 34,
					36, 38,
					40, 42,
					44, 46,

					48, 50,
					52, 54,
					56, 58,
					60, 62,

					64, 66,
					68, 70,
					72, 74,
					76, 78,

					80, 82,
					84, 86,
					88, 90,
					92, 94,

					96, 98,
					100, 102,
					104, 106,
					108, 110,

					112, 114,
					116, 118,
					120, 122,
					124, 126,

					128, 130,
					132, 134,
					136, 138,
					140, 142,

					144, 146,
					148, 150,
					152, 154,
					156, 158,

					160, 162,
					164, 166,
					168, 170,
					172, 174,

					176, 178,
					180, 182,
					184, 186,
					188, 190,

					192, 194,
					196, 198,
					200, 202,
					204, 206,

					208, 210,
					212, 214,
					216, 218,
					220, 222,

					224, 226,
					228, 230,
					232, 234,
					236, 238,

					240, 242,
					244, 246,
					248, 250,
					252, 254,
				}, 2, 4, 4, 4)
			},
		},
		{
			dim: 1, low: 0, high: 4, step: 2,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					0, 1, 2, 3,
					8, 9, 10, 11,

					16, 17, 18, 19,
					24, 25, 26, 27,

					32, 33, 34, 35,
					40, 41, 42, 43,

					48, 49, 50, 51,
					56, 57, 58, 59,

					64, 65, 66, 67,
					72, 73, 74, 75,

					80, 81, 82, 83,
					88, 89, 90, 91,

					96, 97, 98, 99,
					104, 105, 106, 107,

					112, 113, 114, 115,
					120, 121, 122, 123,

					128, 129, 130, 131,
					136, 137, 138, 139,

					144, 145, 146, 147,
					152, 153, 154, 155,

					160, 161, 162, 163,
					168, 169, 170, 171,

					176, 177, 178, 179,
					184, 185, 186, 187,

					192, 193, 194, 195,
					200, 201, 202, 203,

					208, 209, 210, 211,
					216, 217, 218, 219,

					224, 225, 226, 227,
					232, 233, 234, 235,

					240, 241, 242, 243,
					248, 249, 250, 251,
				}, 4, 2, 4, 4)
			},
		},
		{
			dim: 2, low: 0, high: 4, step: 2,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					0, 1, 2, 3,
					4, 5, 6, 7,
					8, 9, 10, 11,
					12, 13, 14, 15,

					32, 33, 34, 35,
					36, 37, 38, 39,
					40, 41, 42, 43,
					44, 45, 46, 47,

					64, 65, 66, 67,
					68, 69, 70, 71,
					72, 73, 74, 75,
					76, 77, 78, 79,

					96, 97, 98, 99,
					100, 101, 102, 103,
					104, 105, 106, 107,
					108, 109, 110, 111,

					128, 129, 130, 131,
					132, 133, 134, 135,
					136, 137, 138, 139,
					140, 141, 142, 143,

					160, 161, 162, 163,
					164, 165, 166, 167,
					168, 169, 170, 171,
					172, 173, 174, 175,

					192, 193, 194, 195,
					196, 197, 198, 199,
					200, 201, 202, 203,
					204, 205, 206, 207,

					224, 225, 226, 227,
					228, 229, 230, 231,
					232, 233, 234, 235,
					236, 237, 238, 239,
				}, 4, 4, 2, 4)
			},
		},
		{
			dim: 3, low: 0, high: 4, step: 2,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 4*4*4*4, 1, ml.DTypeF32).Reshape(ctx, 4, 4, 4, 4)
			},
			want: func(ctx ml.Context) ml.Tensor {
				return ctx.FromFloats([]float32{
					0, 1, 2, 3,
					4, 5, 6, 7,
					8, 9, 10, 11,
					12, 13, 14, 15,

					16, 17, 18, 19,
					20, 21, 22, 23,
					24, 25, 26, 27,
					28, 29, 30, 31,

					32, 33, 34, 35,
					36, 37, 38, 39,
					40, 41, 42, 43,
					44, 45, 46, 47,

					48, 49, 50, 51,
					52, 53, 54, 55,
					56, 57, 58, 59,
					60, 61, 62, 63,

					128, 129, 130, 131,
					132, 133, 134, 135,
					136, 137, 138, 139,
					140, 141, 142, 143,

					144, 145, 146, 147,
					148, 149, 150, 151,
					152, 153, 154, 155,
					156, 157, 158, 159,

					160, 161, 162, 163,
					164, 165, 166, 167,
					168, 169, 170, 171,
					172, 173, 174, 175,

					176, 177, 178, 179,
					180, 181, 182, 183,
					184, 185, 186, 187,
					188, 189, 190, 191,
				}, 4, 4, 4, 2)
			},
		},
	}

	for _, tt := range cases {
		name := fmt.Sprintf("dim=%d,low=%d,high=%d,step=%d", tt.dim, tt.low, tt.high, tt.step)
		t.Run(name, func(t *testing.T) {
			ctx := setup(t)
			got := tt.input(ctx).Slice(ctx, tt.dim, tt.low, tt.high, tt.step)
			got = got.Contiguous(ctx)
			if diff := cmp.Diff(tt.want(ctx), got, EquateTensors(ctx)); diff != "" {
				t.Errorf("Slice() result mismatch (-want +got):\n%s", diff)
			}
		})
	}
}

func TestSplitSections(t *testing.T) {
	cases := []struct {
		dim      int
		sections []int
		input    func(ml.Context) ml.Tensor
		want     []func(ml.Context) ml.Tensor
	}{
		{
			dim: 0, sections: []int{1, 1, 1},
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 3, 4)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{0, 3, 6, 9}, 1, 4)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{1, 4, 7, 10}, 1, 4)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{2, 5, 8, 11}, 1, 4)
				},
			},
		},
		{
			dim: 1, sections: []int{1, 3},
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 3, 4)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{0, 1, 2}, 3, 1)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						3, 4, 5,
						6, 7, 8,
						9, 10, 11,
					}, 3, 3)
				},
			},
		},
		{
			dim: 0, sections: []int{2, 2},
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 4, 3)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						0, 1,
						4, 5,
						8, 9,
					}, 2, 3)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						2, 3,
						6, 7,
						10, 11,
					}, 2, 3)
				},
			},
		},
		{
			dim: 1, sections: []int{1, 2},
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 4, 3)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{0, 1, 2, 3}, 4, 1)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						4, 5, 6, 7,
						8, 9, 10, 11,
					}, 4, 2)
				},
			},
		},
	}

	for _, tt := range cases {
		t.Run(fmt.Sprintf("sections=%v", tt.sections), func(t *testing.T) {
			ctx := setup(t)
			got := tt.input(ctx).ChunkSections(ctx, tt.dim, tt.sections...)

			for i := range got {
				got[i] = got[i].Contiguous(ctx)
			}

			ctx.Forward(got...).Compute(got...)
			for i, want := range tt.want {
				if diff := cmp.Diff(want(ctx), got[i], EquateTensors(ctx)); diff != "" {
					t.Errorf("SplitSections() section %d mismatch (-want +got):\n%s", i, diff)
				}
			}
		})
	}
}

func TestChunk(t *testing.T) {
	cases := []struct {
		dim   int
		chunk int
		input func(ml.Context) ml.Tensor
		want  []func(ml.Context) ml.Tensor
	}{
		{
			dim: 0, chunk: 1,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 3, 4)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{0, 3, 6, 9}, 1, 4)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{1, 4, 7, 10}, 1, 4)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{2, 5, 8, 11}, 1, 4)
				},
			},
		},
		{
			dim: 1, chunk: 2,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 3, 4)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						0, 1, 2,
						3, 4, 5,
					}, 3, 2)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						6, 7, 8,
						9, 10, 11,
					}, 3, 2)
				},
			},
		},
		{
			dim: 0, chunk: 2,
			input: func(ctx ml.Context) ml.Tensor {
				return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 3, 4)
			},
			want: []func(ml.Context) ml.Tensor{
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						0, 1,
						3, 4,
						6, 7,
						9, 10,
					}, 2, 4)
				},
				func(ctx ml.Context) ml.Tensor {
					return ctx.FromFloats([]float32{
						2,
						5,
						8,
						11,
					}, 1, 4)
				},
			},
		},
	}

	for _, tt := range cases {
		t.Run(fmt.Sprintf("dim=%d,chunk=%d", tt.dim, tt.chunk), func(t *testing.T) {
			ctx := setup(t)
			got := tt.input(ctx).Chunk(ctx, tt.dim, tt.chunk)

			for i := range got {
				got[i] = got[i].Contiguous(ctx)
			}

			ctx.Forward(got...).Compute(got...)
			for i, want := range tt.want {
				if diff := cmp.Diff(want(ctx), got[i], EquateTensors(ctx)); diff != "" {
					t.Errorf("Split() section %d mismatch (-want +got):\n%s", i, diff)
				}
			}
		})
	}
}

@@ -22,10 +22,14 @@ import (
//
// Attention output with shape [d_v, heads, seq_len_q]
func Attention(ctx ml.Context, query, key, value ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
	return AttentionWithSinks(ctx, query, key, value, nil, scale, cache)
	return AttentionWithVMLA(ctx, query, key, value, nil, nil, scale, cache)
}

func AttentionWithSinks(ctx ml.Context, query, key, value, sinks ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
	return AttentionWithVMLA(ctx, query, key, value, sinks, nil, scale, cache)
}

func AttentionWithVMLA(ctx ml.Context, query, key, value, sinks ml.Tensor, vmla ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
	ctx.Forward(query)
	if key != nil && value != nil {
		if query.Dim(0) != key.Dim(0) {

@@ -56,7 +60,7 @@ func AttentionWithSinks(ctx ml.Context, query, key, value, sinks ml.Tensor, scal
	// Only use the fast SDPA implementation if we have a cache, since that's what
	// will do any expected backend-specific transformations for us
	if sdpa, ok := query.(ml.ScaledDotProductAttention); ok && cache != nil {
		return sdpa.ScaledDotProductAttention(ctx, key, value, mask, sinks, scale)
		return sdpa.ScaledDotProductAttention(ctx, key, value, mask, sinks, vmla, scale)
	} else {
		query = query.Permute(ctx, 0, 2, 1, 3)
		key = key.Permute(ctx, 0, 2, 1, 3)

@@ -71,6 +75,11 @@ func AttentionWithSinks(ctx ml.Context, query, key, value, sinks ml.Tensor, scal
		kq = kq.Softmax(ctx)

		kqv := value.Mulmat(ctx, kq)

		if vmla != nil {
			kqv = vmla.Mulmat(ctx, kqv)
		}

		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}

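For reference, the `vmla` argument threads one extra projection through attention. In the fallback path above (mirrored in the fused GGML path), the output becomes, up to the final permute, roughly the following, assuming `vmla` holds a projection matrix $W_{\text{vmla}}$ (the multi-head-latent-attention value up-projection reading is an assumption; the diff itself only shows the extra matmul):

```latex
\mathrm{Attn}(Q, K, V) = W_{\text{vmla}} \, \big( V \cdot \mathrm{softmax}(K^{\top} Q \cdot s + M) \big)
```

where $s$ is `scale` and $M$ the optional mask; with `vmla == nil` the projection is skipped and the previous behavior is unchanged.
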
@@ -32,10 +32,9 @@ func (t Type) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
		hiddenStates = hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Mean(ctx)
		return hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
	case TypeCLS:
		return hiddenStates.View(ctx, 0, hiddenStates.Dim(0))
		return hiddenStates.Slice(ctx, 1, 0, 1, 1)
	case TypeLast:
		hiddenStates = hiddenStates.View(ctx, (hiddenStates.Dim(1)-1)*hiddenStates.Stride(1), hiddenStates.Dim(0))
		return hiddenStates
		return hiddenStates.Slice(ctx, 1, hiddenStates.Dim(1)-1, hiddenStates.Dim(1), 1)
	default:
		panic("unknown pooling type")
	}
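The Slice calls replace manual offset/stride arithmetic: both select one position along dim 1, the sequence axis. A sketch of the three pooling modes over states laid out as [seq][dim]:

package pooling

// pool mirrors the pooling switch above with plain slices: CLS takes the
// first sequence position (Slice start 0), last takes the final position
// (Slice start Dim(1)-1), mean averages across the sequence.
func pool(states [][]float32, mode string) []float32 {
	switch mode {
	case "cls": // Slice(ctx, 1, 0, 1, 1)
		return states[0]
	case "last": // Slice(ctx, 1, len-1, len, 1)
		return states[len(states)-1]
	case "mean":
		out := make([]float32, len(states[0]))
		for _, s := range states {
			for j, v := range s {
				out[j] += v / float32(len(states))
			}
		}
		return out
	default:
		panic("unknown pooling type")
	}
}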
@@ -237,7 +237,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
		}
	}

	if addSpecial && len(ids) > 0 {
	if addSpecial {
		ids = bpe.vocab.addSpecials(ids)
	}
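Dropping the len(ids) > 0 guard means an empty input still receives its special tokens. A sketch of the resulting behavior, assuming addSpecials prepends BOS and appends EOS when the vocabulary is configured to do so (package and helper names hypothetical):

package tokenizer

// addSpecials stands in for the vocabulary method: with the guard removed,
// Encode("") now yields the specials instead of an empty sequence.
func addSpecials(ids []int32, bos, eos int32) []int32 {
	ids = append([]int32{bos}, ids...)
	return append(ids, eos)
}

// addSpecials(nil, 1, 2) == []int32{1, 2}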
@@ -25,12 +25,15 @@ const (

// Composite returns an image with the alpha channel removed by drawing over a white background.
func Composite(img image.Image) image.Image {
	dst := image.NewRGBA(img.Bounds())

	white := color.RGBA{255, 255, 255, 255}
	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
	return CompositeColor(img, white)
}

// CompositeColor returns an image with the alpha channel removed by drawing over the given background color.
func CompositeColor(img image.Image, color color.Color) image.Image {
	dst := image.NewRGBA(img.Bounds())
	draw.Draw(dst, dst.Bounds(), &image.Uniform{color}, image.Point{}, draw.Src)
	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
	return dst
}
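A possible use of the new entry point, flattening a transparent PNG onto mid-gray instead of the white that Composite hardwires; file names are illustrative.

package main

import (
	"image/color"
	"image/png"
	"os"

	"github.com/ollama/ollama/model/imageproc"
)

func main() {
	in, err := os.Open("input.png")
	if err != nil {
		panic(err)
	}
	defer in.Close()

	img, err := png.Decode(in)
	if err != nil {
		panic(err)
	}

	// Same drawing sequence as Composite, but with a caller-chosen color.
	flat := imageproc.CompositeColor(img, color.Gray{Y: 127})

	out, err := os.Create("flat.png")
	if err != nil {
		panic(err)
	}
	defer out.Close()
	if err := png.Encode(out, flat); err != nil {
		panic(err)
	}
}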
@@ -55,6 +58,31 @@ func Resize(img image.Image, newSize image.Point, method int) image.Image {
	return dst
}

// Pad returns an image which has been resized to fit within a new size, preserving aspect ratio, and padded with a color.
func Pad(img image.Image, newSize image.Point, color color.Color, kernel draw.Interpolator) image.Image {
	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
	draw.Draw(dst, dst.Bounds(), &image.Uniform{color}, image.Point{}, draw.Src)

	var minPoint, maxPoint image.Point
	if img.Bounds().Dx() > img.Bounds().Dy() {
		// landscape
		height := newSize.X * img.Bounds().Dy() / img.Bounds().Dx()
		minPoint = image.Point{0, (newSize.Y - height) / 2}
		maxPoint = image.Point{newSize.X, height + minPoint.Y}
	} else {
		// portrait
		width := newSize.Y * img.Bounds().Dx() / img.Bounds().Dy()
		minPoint = image.Point{(newSize.X - width) / 2, 0}
		maxPoint = image.Point{minPoint.X + width, newSize.Y}
	}

	kernel.Scale(dst, image.Rectangle{
		Min: minPoint,
		Max: maxPoint,
	}, img, img.Bounds(), draw.Over, nil)
	return dst
}

// Normalize returns a slice of float32 containing each of the r, g, b values for an image normalized around a value.
func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 {
	var pixelVals []float32
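Pad's letterbox arithmetic, worked for an illustrative landscape 800x600 source fitted into 1024x1024: the image is scaled to the full target width and centered vertically, leaving equal bands of background above and below.

package main

import "fmt"

func main() {
	srcW, srcH := 800, 600
	dstW, dstH := 1024, 1024

	height := dstW * srcH / srcW // 1024*600/800 = 768
	minY := (dstH - height) / 2  // (1024-768)/2 = 128
	fmt.Printf("draw into (0,%d)-(%d,%d)\n", minY, dstW, minY+height) // (0,128)-(1024,896)
}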
@@ -29,7 +29,7 @@ type Model struct {
// Forward implements model.Model.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
	hiddenStates = hiddenStates.Add(ctx, m.TypeEmbedding.Weight.View(ctx, 0, m.hiddenSize))
	hiddenStates = hiddenStates.Add(ctx, m.TypeEmbedding.Weight.Slice(ctx, 1, 0, 1, 1))
	hiddenStates = hiddenStates.Add(ctx, m.PositionEmbedding.Forward(ctx, ctx.Input().FromInts(batch.Positions, len(batch.Positions))))
	hiddenStates = m.TokenEmbeddingNorm.Forward(ctx, hiddenStates, m.eps)
@@ -156,6 +156,7 @@ func New(c fs.Config) (model.Model, error) {
				)),
			},
		},
		true,
	)
	default:
		return nil, model.ErrUnsupportedTokenizer
@@ -3,6 +3,7 @@ package deepseek2
// uses deepseek 2 architecture but written based on deepseek 3 model

import (
	"cmp"
	"math"

	"github.com/ollama/ollama/fs"
@@ -16,6 +17,7 @@ import (
)

type Options struct {
	isMLA          bool
	numExpertsUsed int
	numExperts     int
	normTopKProb   bool
@@ -32,8 +34,6 @@ type Options struct {
	hiddenSize,
	numHeads,
	numKVHeads,
	keyLength,
	valueLength,
	originalContextLength int

	eps,
@@ -62,6 +62,9 @@ type Attention struct {
	KVANorm *nn.RMSNorm `gguf:"attn_kv_a_norm"`
	KVB     *nn.Linear  `gguf:"attn_kv_b"`

	KB *nn.Linear `gguf:"attn_k_b"`
	VB *nn.Linear `gguf:"attn_v_b"`

	Output *nn.Linear `gguf:"attn_out,alt:attn_output"`
}
@@ -69,7 +72,7 @@ func (attn *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor
	seqLength := hiddenStates.Dim(1)

	var query ml.Tensor
	if opts.qLoraRank == 0 { // nil {
	if opts.qLoraRank == 0 {
		query = attn.Q.Forward(ctx, hiddenStates)
	} else {
		query = attn.QA.Forward(ctx, hiddenStates)
@@ -78,44 +81,45 @@ func (attn *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor
	}

	query = query.Reshape(ctx, query.Dim(0)/opts.numHeads, opts.numHeads, seqLength)

	qPass := query.View(ctx, 0,
		opts.qkNopeHeadDim, query.Stride(1),
		query.Dim(1), query.Stride(2),
		query.Dim(2))

	qRot := query.View(ctx, opts.qkNopeHeadDim*query.Stride(0),
		opts.qkRopeHeadDim, query.Stride(1),
		query.Dim(1), query.Stride(2),
		query.Dim(2))
	queryChunks := query.ChunkSections(ctx, 0, opts.qkNopeHeadDim, opts.qkRopeHeadDim)

	compressedKV := attn.KVA.Forward(ctx, hiddenStates)
	kPass := compressedKV.Slice(ctx, 0, 0, opts.kvLoraRank, 1)
	kRot := compressedKV.View(ctx,
		opts.kvLoraRank*compressedKV.Stride(0), opts.qkRopeHeadDim,
		compressedKV.Stride(1), 1,
		compressedKV.Stride(1), compressedKV.Dim(1),
	)

	kPass := compressedKV.View(ctx, 0, opts.kvLoraRank, compressedKV.Stride(1), compressedKV.Dim(1))
	kRot := compressedKV.View(ctx, opts.kvLoraRank*compressedKV.Stride(0),
		opts.qkRopeHeadDim, compressedKV.Stride(1),
		1, compressedKV.Stride(1),
		compressedKV.Dim(1))

	kPass = attn.KVANorm.Forward(ctx, kPass, opts.eps)
	kPass = attn.KVB.Forward(ctx, kPass)

	kv := kPass.Reshape(ctx, kPass.Dim(0)/opts.numKVHeads, opts.numKVHeads, seqLength)
	kPass = kv.View(ctx, 0, opts.kqNopeHeadDim, kv.Stride(1), kv.Dim(1), kv.Stride(2), kv.Dim(2))
	value := kv.View(ctx, opts.kqNopeHeadDim*kv.Stride(0),
		opts.vHeadDim, kv.Stride(1),
		kv.Dim(1), kv.Stride(2),
		kv.Dim(2)).Contiguous(ctx)

	qRot = fast.RoPE(ctx, qRot, positions, opts.qkRopeHeadDim, opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
	qRot := fast.RoPE(ctx, queryChunks[1], positions, opts.qkRopeHeadDim, opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
	kRot = fast.RoPE(ctx, kRot, positions, opts.qkRopeHeadDim, opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
	kPass = attn.KVANorm.Forward(ctx, kPass, opts.eps)

	kRot = kRot.Repeat(ctx, 1, qPass.Dim(1))
	var attention ml.Tensor

	query = qRot.Concat(ctx, qPass, 0)
	key := kRot.Concat(ctx, kPass, 0)
	if !opts.isMLA { // v3
		kPass = attn.KVB.Forward(ctx, kPass)

		kv := kPass.Reshape(ctx, kPass.Dim(0)/opts.numKVHeads, opts.numKVHeads, seqLength)
		kvChunks := kv.ChunkSections(ctx, 0, opts.kqNopeHeadDim, opts.vHeadDim)

		kRot = kRot.Repeat(ctx, 1, queryChunks[0].Dim(1))
		query = qRot.Concat(ctx, queryChunks[0], 0)
		key := kRot.Concat(ctx, kvChunks[0], 0)
		attention = nn.Attention(ctx, query, key, kvChunks[1], opts.kqScale, cache)
	} else { // v3.1
		qPass := queryChunks[0].Permute(ctx, 0, 2, 1, 3)
		qPassAbsorb := attn.KB.Forward(ctx, qPass)
		qPassAbsorb = qPassAbsorb.Permute(ctx, 0, 2, 1, 3)

		query = qRot.Concat(ctx, qPassAbsorb, 0)
		kPass = kPass.Reshape(ctx, opts.kvLoraRank, 1, seqLength)
		key := kRot.Concat(ctx, kPass, 0)
		value := kPass

		attention = nn.AttentionWithVMLA(ctx, query, key, value, nil, attn.VB.Weight, opts.kqScale, cache)
	}

	attention := nn.Attention(ctx, query, key, value, opts.kqScale, cache)
	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), seqLength)
	return attn.Output.Forward(ctx, attention)
}
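The v3.1 branch relies on the standard MLA weight-absorption identity: for the no-RoPE part of the score, q·(W·c) = (Wᵀ·q)·c, so the compressed latent c can be cached as both key and value while attn_k_b folds into the query (and attn_v_b is applied after attention through the vmla hook). A toy check of the identity with plain slices:

package main

import "fmt"

func main() {
	W := [][]float64{{1, 2, 3}, {4, 5, 6}} // head dim 2 x latent dim 3, illustrative
	q := []float64{0.5, -1}
	c := []float64{1, 0, 2}

	// Expanded form: k = W·c, score = q·k.
	k := make([]float64, len(W))
	for i := range W {
		for j := range c {
			k[i] += W[i][j] * c[j]
		}
	}
	score1 := q[0]*k[0] + q[1]*k[1]

	// Absorbed form: q' = Wᵀ·q, score = q'·c. No expanded key is ever built.
	qp := make([]float64, len(c))
	for j := range c {
		for i := range q {
			qp[j] += W[i][j] * q[i]
		}
	}
	var score2 float64
	for j := range c {
		score2 += qp[j] * c[j]
	}

	fmt.Println(score1, score2) // identical: -12.5 -12.5
}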
@@ -142,6 +146,7 @@ func (moe *sparse) Moe(ctx ml.Context, hiddenStates, topKIndices, topKWeights ml

	experts := moe.Down.Weight.MulmatID(ctx, hiddenStates, topKIndices)
	experts = experts.Mul(ctx, topKWeights)

	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
	for i := 1; i < opts.numExpertsUsed; i++ {
		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
@@ -245,6 +250,34 @@ func New(c fs.Config) (model.Model, error) {
	mScale := float32(1.0 + float64(c.Float("rope.scaling.yarn_log_multiplier"))*math.Log(float64(c.Float("rope.scaling.factor"))))
	kqScale := float64(mScale) * float64(mScale) / math.Sqrt(float64(c.Uint("attention.key_length")))

	isMLA := c.Uint("attention.key_length_mla") != 0 && c.Uint("attention.value_length_mla") != 0
	keyLength := int(cmp.Or(c.Uint("attention.key_length_mla"), c.Uint("attention.key_length")))
	valueLength := int(cmp.Or(c.Uint("attention.value_length_mla"), c.Uint("attention.value_length")))

	var pre []string
	switch c.String("tokenizer.ggml.pre") {
	case "deepseek-v3":
		pre = []string{
			// Split regex into multiple parts (according to DeepSeek3's regex)
			"\\p{N}{1,3}",
			`[一-龥぀-ゟ゠-ヿ]+`,
			"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
		}
	case "deepseek-llm":
		// TODO: these models haven't been vetted so skip for now
		// pre = []string{
		// 	"[\r\n]",
		// 	"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
		// 	"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
		// 	"\\s+$",
		// 	"[一-龥ࠀ-一가-]+",
		// 	"[0-9]",
		// }
		fallthrough
	default:
		return nil, model.ErrUnsupportedTokenizer
	}

	m := Model{
		BytePairEncoding: model.NewBytePairEncoding(
			&model.Vocabulary{
@@ -259,18 +292,14 @@ func New(c fs.Config) (model.Model, error) {
				c.Ints("tokenizer.ggml.eos_token_ids")...,
			),
		},
		// Split regex into multiple parts (according to DeepSeek3's regex)
		"\\p{N}{1,3}",
		`[一-龥぀-ゟ゠-ヿ]+`,
		"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
		pre...,
	),
	Layers: layers,
	Options: &Options{
		isMLA:      isMLA,
		hiddenSize: int(c.Uint("embedding_length")),
		numHeads:   int(c.Uint("attention.head_count")),
		numKVHeads: int(c.Uint("attention.head_count_kv")),
		keyLength:  int(c.Uint("attention.key_length")),
		valueLength: int(c.Uint("attention.value_length")),
		eps:        c.Float("attention.layer_norm_rms_epsilon"),
		ropeBase:   c.Float("rope.freq_base"),
		ropeScale:  c.Float("rope.scaling.factor", 1),
@@ -278,13 +307,13 @@ func New(c fs.Config) (model.Model, error) {
		numExpertsUsed: int(c.Uint("expert_used_count")),
		normTopKProb:   c.Bool("expert_weights_norm", true),

		qLoraRank:  int(c.Uint("attention.q_lora_rank")), //&qLoraRankVal,
		qLoraRank:  int(c.Uint("attention.q_lora_rank")),
		kvLoraRank: int(c.Uint("attention.kv_lora_rank")),
		qkHeadDim:  int(c.Uint("attention.key_length")),
		vHeadDim:   int(c.Uint("attention.value_length")),
		qkHeadDim:  keyLength,
		vHeadDim:   valueLength,
		qkRopeHeadDim: int(c.Uint("rope.dimension_count")),
		qkNopeHeadDim: int(c.Uint("attention.key_length")) - int(c.Uint("rope.dimension_count")),
		kqNopeHeadDim: int(c.Uint("attention.key_length")) - int(c.Uint("rope.dimension_count")),
		qkNopeHeadDim: keyLength - int(c.Uint("rope.dimension_count")),
		kqNopeHeadDim: keyLength - int(c.Uint("rope.dimension_count")),

		routedScalingFactor:   c.Float("expert_weights_scale"),
		originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
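The fallback here leans on cmp.Or from the standard library, which returns its first non-zero argument; that is how the MLA head sizes default back to the regular key/value lengths when the *_mla keys are absent. Numbers below are illustrative:

package main

import (
	"cmp"
	"fmt"
)

func main() {
	fmt.Println(cmp.Or(uint32(0), 128))   // 128: no MLA override in the config
	fmt.Println(cmp.Or(uint32(576), 128)) // 576: attention.key_length_mla wins
}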
@@ -0,0 +1,83 @@
package deepseekocr

import (
	"bytes"
	"image"
	"image/color"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/imageproc"
)

type ratio struct {
	x, y int
}

func ProcessImage(ctx ml.Context, bts []byte) (ml.Tensor, ml.Tensor, []int, error) {
	img, _, err := image.Decode(bytes.NewReader(bts))
	if err != nil {
		return nil, nil, nil, err
	}

	minNum, maxNum, imageSize, baseSize := 2, 9, 640, 1024
	var targetRatios []ratio
	for n := minNum; n <= maxNum; n++ {
		for i := 1; i <= n; i++ {
			for j := 1; j <= n; j++ {
				if i*j <= maxNum && i*j >= minNum && !slices.Contains(targetRatios, ratio{i, j}) {
					targetRatios = append(targetRatios, ratio{i, j})
				}
			}
		}
	}

	targetRatio := findBestAspectRatio(targetRatios, img.Bounds().Dx(), img.Bounds().Dy(), imageSize)
	targetWidth, targetHeight := imageSize*targetRatio.x, imageSize*targetRatio.y
	blocks := targetRatio.x * targetRatio.y

	mean := imageproc.ImageNetStandardMean
	std := imageproc.ImageNetStandardSTD

	var patches []float32
	resized := imageproc.Resize(img, image.Point{X: targetWidth, Y: targetHeight}, imageproc.ResizeBilinear)
	for i := range blocks {
		patch := image.NewRGBA(image.Rect(0, 0, imageSize, imageSize))
		draw.Draw(patch, patch.Bounds(), resized, image.Point{
			X: i % (targetWidth / imageSize) * imageSize,
			Y: i / (targetWidth / imageSize) * imageSize,
		}, draw.Over)

		patches = append(patches, imageproc.Normalize(patch, mean, std, true, true)...)
	}

	img = imageproc.CompositeColor(img, color.Gray{})
	img = imageproc.Pad(img, image.Point{X: baseSize, Y: baseSize}, color.Gray{127}, draw.BiLinear)

	return ctx.Input().FromFloats(patches, imageSize, imageSize, 3, blocks),
		ctx.Input().FromFloats(imageproc.Normalize(img, mean, std, true, true), baseSize, baseSize, 3),
		[]int{targetRatio.x, targetRatio.y},
		nil
}

func findBestAspectRatio(targetRatios []ratio, width, height, imageSize int) ratio {
	bestDiff := math.MaxFloat64
	best := ratio{1, 1}
	realRatio := float64(width) / float64(height)
	for _, target := range targetRatios {
		targetRatio := float64(target.x) / float64(target.y)
		diff := math.Abs(realRatio - targetRatio)
		if diff < bestDiff {
			bestDiff = diff
			best = target
		} else if diff == bestDiff {
			if float64(width*height) > 0.5*float64(imageSize*imageSize*best.x*best.y) {
				best = target
			}
		}
	}
	return best
}
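A worked example of the crop-grid logic above for an illustrative 1600x800 input: grids of 2..9 tiles are enumerated, the grid whose aspect ratio is closest to the image wins, and the resized image is cut into 640x640 tiles row-major.

package main

import (
	"fmt"
	"math"
)

func main() {
	w, h, tile := 1600, 800, 640
	best, bestDiff := [2]int{1, 1}, math.MaxFloat64
	for n := 2; n <= 9; n++ {
		for x := 1; x <= n; x++ {
			for y := 1; y <= n; y++ {
				if x*y < 2 || x*y > 9 {
					continue
				}
				if diff := math.Abs(float64(w)/float64(h) - float64(x)/float64(y)); diff < bestDiff {
					best, bestDiff = [2]int{x, y}, diff
				}
			}
		}
	}
	fmt.Println(best) // [2 1]: a 2x1 grid matches the 2:1 image exactly

	// Tile origins, row-major, as in the draw.Draw source points above.
	targetWidth := best[0] * tile // 1280
	for i := 0; i < best[0]*best[1]; i++ {
		fmt.Println(i%(targetWidth/tile)*tile, i/(targetWidth/tile)*tile) // (0,0) then (640,0)
	}
}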
@@ -0,0 +1,192 @@
package deepseekocr

import (
	"math"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
)

type Model struct {
	model.Base
	model.TextProcessor

	Sam    *samModel    `gguf:"s"`
	Vision *visionModel `gguf:"v"`
	Text   *textModel

	ImageNewline ml.Tensor `gguf:"mm.image_newline"`
	//nolint:misspell // this misspelling is upstream. fixing it breaks the model
	ViewSeperator ml.Tensor `gguf:"mm.view_seperator"`

	Projector *nn.Linear `gguf:"mm.layers"`
}

func (m *Model) EncodeMultimodal(ctx ml.Context, bts []byte) ([]input.Multimodal, error) {
	patches, original, crop, err := ProcessImage(ctx, bts)
	if err != nil {
		return nil, err
	}

	var outputs []ml.Tensor
	if true { // TODO: local features if sum(patches) != 0
		samOutputs := m.Sam.Forward(ctx, patches)
		visionOutputs := m.Vision.Forward(ctx, patches, samOutputs)

		samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
		visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
		localOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
		localOutputs = m.Projector.Forward(ctx, localOutputs)

		hw := int(math.Sqrt(float64(localOutputs.Dim(1))))
		localOutputs = localOutputs.Reshape(ctx, -1, hw, crop[0], crop[1])
		localOutputs = localOutputs.Permute(ctx, 0, 2, 1, 3)
		localOutputs = localOutputs.Contiguous(ctx, -1, crop[0]*hw, crop[1]*hw)
		localOutputs = localOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, localOutputs.Dim(2)), 1)
		localOutputs = localOutputs.Reshape(ctx, localOutputs.Dim(0), -1)

		outputs = append(outputs, localOutputs)
	}

	samOutputs := m.Sam.Forward(ctx, original)
	visionOutputs := m.Vision.Forward(ctx, original, samOutputs)

	samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
	visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
	globalOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
	globalOutputs = m.Projector.Forward(ctx, globalOutputs)

	hw := int(math.Sqrt(float64(globalOutputs.Dim(1))))
	globalOutputs = globalOutputs.Reshape(ctx, -1, hw, hw)
	globalOutputs = globalOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, globalOutputs.Dim(2)), 1)
	globalOutputs = globalOutputs.Reshape(ctx, globalOutputs.Dim(0), -1)

	outputs = append(outputs, globalOutputs, m.ViewSeperator)
	return []input.Multimodal{
		{Tensor: outputs[0].Stack(ctx, 1, outputs[1:]...)},
	}, nil
}

func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	outputs := make([]*input.Input, 0, len(inputs))
	for i := range inputs {
		if inputs[i].Multimodal == nil {
			outputs = append(outputs, inputs[i])
			continue
		}

		t := inputs[i].Multimodal[0].Tensor
		outputs = append(outputs, &input.Input{
			Token:          128815,
			Multimodal:     inputs[i].Multimodal,
			MultimodalHash: inputs[i].MultimodalHash,
			SameBatch:      t.Dim(1) - 1,
		})

		outputs = slices.Grow(outputs, t.Dim(1)-1)
		outputs = append(outputs, slices.Repeat([]*input.Input{{Token: 128815}}, t.Dim(1)-1)...)
	}
	return outputs, nil
}
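The placeholder expansion in PostTokenize is easiest to see with plain ints standing in for input.Input: an image occupying n embedding columns becomes n copies of the image token 128815, the first carrying the tensor with SameBatch = n-1 so the whole run is scheduled together.

package main

import "fmt"

func main() {
	n := 4                    // t.Dim(1) for the multimodal tensor
	tokens := []int{101, 102} // preceding text tokens, illustrative

	tokens = append(tokens, 128815) // carries Multimodal and SameBatch: n-1
	for i := 0; i < n-1; i++ {
		tokens = append(tokens, 128815) // bare placeholders
	}
	fmt.Println(tokens) // [101 102 128815 128815 128815 128815]
}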
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	inputsEmbeds := m.Text.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))

	for _, mm := range batch.Multimodal {
		t := mm.Multimodal[0].Tensor
		ctx.Forward(t.Copy(ctx, inputsEmbeds.View(ctx, mm.Index*inputsEmbeds.Stride(1), t.Dim(0)*t.Dim(1))))
	}

	hiddenStates := inputsEmbeds
	for i, block := range m.Text.Blocks {
		if m.Cache != nil {
			m.Cache.SetLayer(i)
		}

		var outputs ml.Tensor
		if i == len(m.Text.Blocks)-1 {
			outputs = batch.Outputs
		}

		hiddenStates = block.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Text.Options)
	}

	hiddenStates = m.Text.OutputNorm.Forward(ctx, hiddenStates, m.Text.Options.eps)
	return m.Text.Output.Forward(ctx, hiddenStates), nil
}

func init() {
	model.Register("deepseekocr", func(c fs.Config) (model.Model, error) {
		textBlocks := make([]textBlock, c.Uint("block_count"))
		leadingDenseBlockCount := int(c.Uint("leading_dense_block_count", 1))
		for i := range textBlocks {
			if i >= leadingDenseBlockCount {
				textBlocks[i].FeedForward = &textMoe{}
			} else {
				textBlocks[i].FeedForward = &textMLP{}
			}
		}

		m := Model{
			TextProcessor: model.NewBytePairEncoding(
				&model.Vocabulary{
					Values: c.Strings("tokenizer.ggml.tokens"),
					Types:  c.Ints("tokenizer.ggml.token_type"),
					Merges: c.Strings("tokenizer.ggml.merges"),
					AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
					BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
					AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
					EOS: append(
						[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
						c.Ints("tokenizer.ggml.eos_token_ids")...,
					),
				},
				// Split regex into multiple parts (according to DeepSeek3's regex)
				"\\p{N}{1,3}",
				`[一-龥぀-ゟ゠-ヿ]+`,
				"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
			),
			Text: &textModel{
				Blocks: textBlocks,
				Options: textOptions{
					hiddenSize:     int(c.Uint("embedding_length")),
					numHeads:       int(c.Uint("attention.head_count")),
					numKVHeads:     int(c.Uint("attention.head_count_kv")),
					numExperts:     int(c.Uint("expert_count")),
					numExpertsUsed: int(c.Uint("expert_used_count")),
					ropeBase:       c.Float("rope.freq_base", 10_000),
					ropeScale:      c.Float("rope.scaling.factor", 1.0),
					eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-6),
				},
			},
			Vision: &visionModel{
				Blocks: make([]visionBlock, c.Uint("vision.block_count")),
				Options: visionOptions{
					hiddenSize: int(c.Uint("vision.embedding_length")),
					numHeads:   int(c.Uint("vision.head_count")),
					imageSize:  int(c.Uint("vision.image_size", 224)),
					patchSize:  int(c.Uint("vision.patch_size", 14)),
					eps:        c.Float("vision.attention.layer_norm_epsilon", 1e-5),
				},
			},
			Sam: &samModel{
				Blocks: make([]samBlock, c.Uint("sam.block_count")),
				Options: samOptions{
					hiddenSize:            int(c.Uint("sam.embedding_length")),
					numHeads:              int(c.Uint("sam.head_count")),
					eps:                   c.Float("sam.attention.layer_norm_epsilon", 1e-6),
					globalAttentionLayers: c.Ints("sam.global_attention_indexes"),
				},
			},
		}

		m.Cache = kvcache.NewCausalCache(m.Text.Shift)
		return &m, nil
	})
}
@@ -0,0 +1,225 @@
package deepseekocr

import (
	"math"
	"slices"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

type samModel struct {
	PatchEmbedding    *nn.Conv2D `gguf:"patch_embd"`
	PositionEmbedding ml.Tensor  `gguf:"position_embd"`

	Blocks []samBlock `gguf:"blk"`

	Neck *samNeck   `gguf:"neck"`
	Net2 *nn.Conv2D `gguf:"net_2"`
	Net3 *nn.Conv2D `gguf:"net_3"`

	Options samOptions
}

func (m *samModel) absolutePositionEmbedding(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
	source := m.PositionEmbedding.Dim(1)
	target := hiddenStates.Dim(2)
	if source != target {
		positionEmbed := m.PositionEmbedding.Permute(ctx, 2, 0, 1, 3)
		positionEmbed = positionEmbed.Interpolate(ctx, [4]int{target, target, hiddenStates.Dim(0), 1}, ml.SamplingModeBilinear)
		return positionEmbed.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
	}

	return m.PositionEmbedding
}

func (m *samModel) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
	hiddenStates := m.PatchEmbedding.Forward(ctx, t, 16, 16, 0, 0, 1, 1)
	hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	if m.PositionEmbedding != nil {
		hiddenStates = hiddenStates.Add(ctx, m.absolutePositionEmbedding(ctx, hiddenStates))
	}

	for i, block := range m.Blocks {
		var windowSize int
		if !slices.Contains(m.Options.globalAttentionLayers, int32(i)) {
			windowSize = 14
		}

		hiddenStates = block.Forward(ctx, hiddenStates, windowSize, m.Options)
	}

	hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
	hiddenStates = m.Neck.Forward(ctx, hiddenStates, m.Options)
	hiddenStates = m.Net2.Forward(ctx, hiddenStates, 2, 2, 1, 1, 1, 1)
	hiddenStates = m.Net3.Forward(ctx, hiddenStates, 2, 2, 1, 1, 1, 1)
	return hiddenStates
}

type samOptions struct {
	hiddenSize,
	numHeads int
	eps                   float32
	globalAttentionLayers []int32
}

func (o samOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}

type samBlock struct {
	Norm1       *nn.LayerNorm `gguf:"norm1"`
	Attention   *samAttention `gguf:"attn"`
	Norm2       *nn.LayerNorm `gguf:"norm2"`
	FeedForward *samMLP       `gguf:"mlp"`
}

func (m *samBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, windowSize int, opts samOptions) ml.Tensor {
	c, w, h := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)

	residual := hiddenStates
	hiddenStates = m.Norm1.Forward(ctx, hiddenStates, opts.eps)

	var pw, ph int
	if windowSize > 0 {
		pw = (windowSize - hiddenStates.Dim(1)%windowSize) % windowSize
		ph = (windowSize - hiddenStates.Dim(2)%windowSize) % windowSize
		if pw > 0 || ph > 0 {
			hiddenStates = hiddenStates.Pad(ctx, 0, pw, ph, 0)
		}

		hiddenStates = hiddenStates.Reshape(ctx, c*windowSize, (w+pw)/windowSize, windowSize, -1)
		hiddenStates = hiddenStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, c, windowSize, windowSize, -1)
	}

	hiddenStates = m.Attention.Forward(ctx, hiddenStates, opts)

	if windowSize > 0 {
		hiddenStates = hiddenStates.Reshape(ctx, c*windowSize, windowSize, (w+pw)/windowSize, -1)
		hiddenStates = hiddenStates.Permute(ctx, 0, 2, 1, 3)
		hiddenStates = hiddenStates.Contiguous(ctx, c, w+pw, h+ph, -1)
		hiddenStates = hiddenStates.Pad(ctx, 0, -pw, -ph, 0)
	}

	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = m.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.FeedForward.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}
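The window partition pads width and height up to a multiple of the 14-patch window before reshaping, then the negative Pad afterwards crops the padding back off. The padding arithmetic for an illustrative 40x27 feature map:

package main

import "fmt"

func main() {
	w, h, window := 40, 27, 14
	pw := (window - w%window) % window // (14 - 40%14) % 14 = 2
	ph := (window - h%window) % window // (14 - 27%14) % 14 = 1
	fmt.Println(w+pw, h+ph, (w+pw)/window, (h+ph)/window) // 42 28 3 2 -> 6 windows
}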
type samAttention struct {
	QKV    *nn.Linear `gguf:"qkv"`
	Output *nn.Linear `gguf:"proj"`

	RelativePosition *struct {
		Height ml.Tensor `gguf:"h"`
		Width  ml.Tensor `gguf:"w"`
	} `gguf:",pre:rel_pos_"`
}

func relativeCoordinates(ctx ml.Context, qn, kn int) ml.Tensor {
	s := make([]int32, qn*kn)
	for i := range qn {
		for j := range kn {
			q := i * max(kn/qn, 1)
			k := j * max(qn/kn, 1)
			s[i*kn+j] = int32(q - k + (kn-1)*max(qn/kn, 1))
		}
	}
	return ctx.Input().FromInts(s, qn*kn)
}
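relativeCoordinates builds lookup indices into a relative-position table; with qn == kn, the entry for (i, j) reduces to i - j + (kn - 1), shifting the offsets to be non-negative. A toy 3x3 case:

package main

import "fmt"

func main() {
	qn, kn := 3, 3
	for i := 0; i < qn; i++ {
		for j := 0; j < kn; j++ {
			fmt.Print(i*max(kn/qn, 1)-j*max(qn/kn, 1)+(kn-1)*max(qn/kn, 1), " ")
		}
		fmt.Println()
	}
	// 2 1 0
	// 3 2 1
	// 4 3 2
}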
func relativePositions(ctx ml.Context, positions ml.Tensor, qn, kn int) ml.Tensor {
	maxRelativeDistance := 2*max(qn, kn) - 1
	if positions.Dim(1) != maxRelativeDistance {
		// linear interpolation kernel not available so approx. with bilinear interpolation
		positions = positions.Interpolate(ctx, [4]int{positions.Dim(0), maxRelativeDistance, 1, 1}, ml.SamplingModeBilinear)
	}

	rc := relativeCoordinates(ctx, qn, kn)
	return positions.Rows(ctx, rc).Reshape(ctx, positions.Dim(0), kn, qn)
}

func (m *samAttention) decomposedRelativePositions(ctx ml.Context, query ml.Tensor, qn, kn []int) (ml.Tensor, ml.Tensor) {
	qh, qw := qn[0], qn[1]
	kh, kw := kn[0], kn[1]

	rh := relativePositions(ctx, m.RelativePosition.Height, qh, kh)
	rw := relativePositions(ctx, m.RelativePosition.Width, qw, kw)

	query = query.Contiguous(ctx, query.Dim(0), qw, qh, -1)
	rh = rh.Mulmat(ctx, query).Reshape(ctx, 1, kh, qh*qw, -1)
	rw = rw.Mulmat(ctx, query.Permute(ctx, 0, 2, 1, 3)).Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, kw, 1, qh*qw, -1)
	return rh, rw
}

func (m *samAttention) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts samOptions) ml.Tensor {
	w, h, b := hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3)

	qkv := m.QKV.Forward(ctx, hiddenStates)
	qkv = qkv.Reshape(ctx, opts.headDim(), -1, w*h, b)
	chunks := qkv.Chunk(ctx, 1, opts.numHeads)
	query, key, value := chunks[0], chunks[1], chunks[2]

	ctx.Forward(query, key, value)

	query = query.Permute(ctx, 0, 2, 1, 3)
	rh, rw := m.decomposedRelativePositions(ctx, query, []int{h, w}, []int{h, w})
	mask := rh.Repeat(ctx, 0, rw.Dim(0)).Add(ctx, rw)
	mask = mask.Reshape(ctx, h*w, -1, opts.numHeads, b)

	key = key.Permute(ctx, 0, 2, 1, 3)
	scores := key.MulmatFullPrec(ctx, query)
	scores = scores.Scale(ctx, 1/math.Sqrt(float64(opts.headDim())))

	scores = scores.Add(ctx, mask)
	scores = scores.Softmax(ctx)

	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
	attention := value.Mulmat(ctx, scores)
	attention = attention.Permute(ctx, 0, 2, 1, 3)
	attention = attention.Contiguous(ctx, -1, w, h, b)
	return m.Output.Forward(ctx, attention)
}

type samMLP struct {
	Lin1 *nn.Linear `gguf:"lin1"`
	Lin2 *nn.Linear `gguf:"lin2"`
}

func (m *samMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts samOptions) ml.Tensor {
	return m.Lin2.Forward(ctx, m.Lin1.Forward(ctx, hiddenStates).GELU(ctx))
}

type LayerNorm2D struct {
	Weight ml.Tensor `gguf:"weight"`
	Bias   ml.Tensor `gguf:"bias"`
}

func (ln *LayerNorm2D) Forward(ctx ml.Context, x ml.Tensor, eps float32) ml.Tensor {
	x = x.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
	u := x.Mean(ctx)
	d := x.Sub(ctx, u)
	s := d.Sqr(ctx).Mean(ctx)
	x = d.Div(ctx, s.Add(ctx, ctx.Input().FromFloats([]float32{eps}, 1)).Sqrt(ctx))
	x = x.Mul(ctx, ln.Weight).Add(ctx, ln.Bias)
	return x.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
}
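The tensor-level LayerNorm2D above is the usual normalization written out by hand: subtract the mean, divide by sqrt(variance + eps), then scale and shift. A scalar sketch over one channel vector:

package main

import (
	"fmt"
	"math"
)

// layerNorm normalizes one position's channel vector, matching the
// Mean/Sub/Sqr/Sqrt sequence in LayerNorm2D.Forward.
func layerNorm(x, weight, bias []float64, eps float64) []float64 {
	var mean float64
	for _, v := range x {
		mean += v / float64(len(x))
	}
	var variance float64
	for _, v := range x {
		variance += (v - mean) * (v - mean) / float64(len(x))
	}
	out := make([]float64, len(x))
	for i, v := range x {
		out[i] = (v-mean)/math.Sqrt(variance+eps)*weight[i] + bias[i]
	}
	return out
}

func main() {
	fmt.Println(layerNorm([]float64{1, 2, 3}, []float64{1, 1, 1}, []float64{0, 0, 0}, 1e-6))
}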
type samNeck struct {
	C1  *nn.Conv2D   `gguf:"0"`
	LN1 *LayerNorm2D `gguf:"1"`
	C2  *nn.Conv2D   `gguf:"2"`
	LN2 *LayerNorm2D `gguf:"3"`
}

func (m *samNeck) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts samOptions) ml.Tensor {
	hiddenStates = m.C1.Forward(ctx, hiddenStates, 1, 1, 0, 0, 1, 1)
	hiddenStates = m.LN1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.C2.Forward(ctx, hiddenStates, 1, 1, 1, 1, 1, 1)
	hiddenStates = m.LN2.Forward(ctx, hiddenStates, opts.eps)
	return hiddenStates
}
@@ -0,0 +1,140 @@
package deepseekocr

import (
	"math"

	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/fast"
	"github.com/ollama/ollama/ml/nn/rope"
)

type textModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Blocks         []textBlock   `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output"`

	Options textOptions
}

func (m *textModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	return m.Options.applyRotaryPositionalEmbedding(ctx, key, shift), nil
}

type textOptions struct {
	hiddenSize,
	numHeads,
	numKVHeads,
	numExperts,
	numExpertsUsed int
	ropeBase,
	ropeScale,
	eps float32
}

func (o textOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}

func (o textOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor {
	return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/o.ropeScale, rope.WithTypeNeoX())
}

type textBlock struct {
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	Attention     *textAttention
	MLPNNorm      *nn.RMSNorm `gguf:"ffn_norm"`
	FeedForward   textFeedForward
}

func (m *textBlock) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts textOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = m.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
	if outputs != nil {
		hiddenStates = hiddenStates.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
	}

	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = m.MLPNNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.FeedForward.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}

type textAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

func (m *textAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts textOptions) ml.Tensor {
	query := m.Query.Forward(ctx, hiddenStates)
	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, -1)

	key := m.Key.Forward(ctx, hiddenStates)
	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, -1)

	value := m.Value.Forward(ctx, hiddenStates)
	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, -1)

	query = opts.applyRotaryPositionalEmbedding(ctx, query, positions)
	key = opts.applyRotaryPositionalEmbedding(ctx, key, positions)

	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
	attention = attention.Reshape(ctx, -1, attention.Dim(2))
	return m.Output.Forward(ctx, attention)
}

type textFeedForward interface {
	Forward(ml.Context, ml.Tensor, textOptions) ml.Tensor
}

type textMoe struct {
	Router        *nn.Linear      `gguf:"ffn_gate_inp"`
	Gate          *nn.LinearBatch `gguf:"ffn_gate_exps"`
	Up            *nn.LinearBatch `gguf:"ffn_up_exps"`
	Down          *nn.LinearBatch `gguf:"ffn_down_exps"`
	SharedExperts *textMLP        `gguf:",suf:_shexp"`
}

func (m *textMoe) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts textOptions) ml.Tensor {
	scores := m.Router.Forward(ctx, hiddenStates).Softmax(ctx)
	indices := scores.TopK(ctx, opts.numExpertsUsed)
	weights := scores.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, indices)

	experts := hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
	experts = m.Gate.Forward(ctx, experts, indices).SILU(ctx, m.Up.Forward(ctx, experts, indices))
	experts = m.Down.Forward(ctx, experts, indices)
	experts = experts.Mul(ctx, weights)

	expert := func(i int) ml.Tensor {
		return experts.View(
			ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2),
		)
	}

	routedStates := expert(0)
	for i := 1; i < opts.numExpertsUsed; i++ {
		routedStates = routedStates.Add(ctx, expert(i))
	}

	sharedStates := m.SharedExperts.Forward(ctx, hiddenStates, opts)
	return routedStates.Add(ctx, sharedStates)
}
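The routing in textMoe.Forward, sketched with plain slices: softmax scores per expert, keep the top-k, weight each selected expert's output by its score, and sum (the real code then adds the shared experts on top). expertOutputs stands in for the Gate/Up/Down stack.

package main

import (
	"fmt"
	"sort"
)

func main() {
	scores := []float64{0.1, 0.5, 0.3, 0.1} // already softmaxed, 4 experts
	k := 2

	idx := []int{0, 1, 2, 3}
	sort.Slice(idx, func(a, b int) bool { return scores[idx[a]] > scores[idx[b]] })

	expertOutputs := [][]float64{{1, 0}, {0, 1}, {1, 1}, {2, 2}}
	out := make([]float64, 2)
	for _, e := range idx[:k] {
		for j := range out {
			out[j] += scores[e] * expertOutputs[e][j]
		}
	}
	fmt.Println(idx[:k], out) // [1 2] [0.3 0.8]
}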
type textMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (m *textMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ textOptions) ml.Tensor {
	hiddenStates = m.Gate.Forward(ctx, hiddenStates).SILU(ctx, m.Up.Forward(ctx, hiddenStates))
	return m.Down.Forward(ctx, hiddenStates)
}
@@ -0,0 +1,117 @@
package deepseekocr

import (
	"math"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

type visionModel struct {
	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embd"`
	ClassEmbedding    ml.Tensor     `gguf:"class_embd"`
	PositionEmbedding *nn.Embedding `gguf:"position_embd"`

	PreLayerNorm *nn.LayerNorm `gguf:"pre_layrnorm"`
	Blocks       []visionBlock `gguf:"blk"`

	Options visionOptions
}

func (m *visionModel) absolutePositionEmbedding(ctx ml.Context, embeds ml.Tensor) ml.Tensor {
	numPatches := m.Options.imageSize / m.Options.patchSize * m.Options.imageSize / m.Options.patchSize
	positions := ctx.Arange(0, float32(numPatches+1), 1, ml.DTypeI32)
	positionEmbeds := m.PositionEmbedding.Forward(ctx, positions)

	source := int(math.Sqrt(float64(positionEmbeds.Dim(1) - 1)))
	target := int(math.Sqrt(float64(embeds.Dim(1) - 1)))
	if source != target {
		newPositionEmbeds := positionEmbeds.Slice(ctx, 1, 1, positionEmbeds.Dim(1), 1)
		newPositionEmbeds = newPositionEmbeds.Reshape(ctx, -1, source, source)
		newPositionEmbeds = newPositionEmbeds.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
		newPositionEmbeds = newPositionEmbeds.Interpolate(ctx, [4]int{target, target, embeds.Dim(0), 1}, ml.SamplingModeBilinear)
		newPositionEmbeds = newPositionEmbeds.Permute(ctx, 1, 2, 0, 3)
		newPositionEmbeds = newPositionEmbeds.Contiguous(ctx, -1, target*target)

		positionEmbeds = positionEmbeds.Slice(ctx, 1, 0, 1, 1).Concat(ctx, newPositionEmbeds, 1)
	}

	return positionEmbeds
}
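The grid arithmetic behind absolutePositionEmbedding: the learned table covers a (224/14)^2 = 16x16 patch grid plus one CLS slot, and any other grid size is reached by bilinear interpolation of the non-CLS rows. The target numbers below are hypothetical:

package main

import (
	"fmt"
	"math"
)

func main() {
	imageSize, patchSize := 224, 14
	numPatches := imageSize / patchSize * imageSize / patchSize
	source := int(math.Sqrt(float64(numPatches))) // 16

	embedsDim1 := 73*73 + 1 // e.g. a larger input producing a 73x73 grid + CLS
	target := int(math.Sqrt(float64(embedsDim1 - 1)))
	fmt.Println(source, target, source != target) // 16 73 true -> interpolate
}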
func (m *visionModel) Forward(ctx ml.Context, pixelValues, patchEmbeds ml.Tensor) ml.Tensor {
	if patchEmbeds == nil {
		patchEmbeds = m.PatchEmbedding.Forward(ctx, pixelValues, m.Options.patchSize, m.Options.patchSize, 0, 0, 1, 1)
	}

	patchEmbeds = patchEmbeds.Reshape(ctx, -1, patchEmbeds.Dim(2), patchEmbeds.Dim(3))
	patchEmbeds = patchEmbeds.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	classEmbeds := m.ClassEmbedding.Repeat(ctx, 2, patchEmbeds.Dim(2))
	embeds := classEmbeds.Concat(ctx, patchEmbeds, 1)
	embeds = embeds.Add(ctx, m.absolutePositionEmbedding(ctx, embeds))

	hiddenStates := m.PreLayerNorm.Forward(ctx, embeds, m.Options.eps)
	for _, block := range m.Blocks {
		hiddenStates = block.Forward(ctx, hiddenStates, m.Options)
	}

	return hiddenStates
}

type visionOptions struct {
	hiddenSize,
	numHeads int
	eps float32

	imageSize, patchSize int
}

func (o visionOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}

type visionBlock struct {
	Norm1       *nn.LayerNorm    `gguf:"layer_norm1"`
	Attention   *visionAttention `gguf:"self_attn"`
	Norm2       *nn.LayerNorm    `gguf:"layer_norm2"`
	FeedForward *visionMLP       `gguf:"mlp"`
}

func (m *visionBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts visionOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = m.Norm1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.Attention.Forward(ctx, hiddenStates, opts)
	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = m.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.FeedForward.Forward(ctx, hiddenStates)
	hiddenStates = hiddenStates.Add(ctx, residual)
	return hiddenStates
}

type visionAttention struct {
	QKV    *nn.Linear `gguf:"qkv_proj"`
	Output *nn.Linear `gguf:"out_proj"`
}

func (m *visionAttention) Forward(ctx ml.Context, t ml.Tensor, opts visionOptions) ml.Tensor {
	qkv := m.QKV.Forward(ctx, t)
	qkv = qkv.Reshape(ctx, opts.headDim(), -1, qkv.Dim(1), qkv.Dim(2))
	chunks := qkv.Chunk(ctx, 1, opts.numHeads)
	query, key, value := chunks[0], chunks[1], chunks[2]

	attention := nn.Attention(ctx, query, key, value, 1/math.Sqrt(float64(opts.headDim())), nil)
	attention = attention.Reshape(ctx, -1, attention.Dim(2), attention.Dim(3))
	return m.Output.Forward(ctx, attention)
}

type visionMLP struct {
	FC1 *nn.Linear `gguf:"fc1"`
	FC2 *nn.Linear `gguf:"fc2"`
}

func (m *visionMLP) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
	return m.FC2.Forward(ctx, m.FC1.Forward(ctx, t).QuickGELU(ctx))
}