package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync/atomic"
	"unicode"
	"unsafe"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs"
	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/logutil"
	"github.com/ollama/ollama/ml"
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
	"golang.org/x/sync/errgroup"
)
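
// devices returns all backend devices known to ggml, triggering the one-time
// native library load via ggml.OnceLoad before enumerating them.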
func devices() []*C.struct_ggml_backend_device {
	ggml.OnceLoad()
	ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
	for i := range ds {
		ds[i] = C.ggml_backend_dev_get(C.size_t(i))
	}

	return ds
}

type Backend struct {
	meta          *fsggml.GGML
	sched         *C.struct_ggml_backend_sched
	schedBackends []*C.struct_ggml_backend
	schedBufts    []*C.struct_ggml_backend_buffer_type

	tensors map[string]*C.struct_ggml_tensor

	// input is the backend used for inputs
	input *C.struct_ggml_backend_buffer_type

	// layers is the backend used for repeating layers
	layers map[int]*C.struct_ggml_backend_buffer_type

	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int
}

func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
	meta, n, err := fsggml.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

	type deviceBufferType struct {
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for _, d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}
	}

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
		}
	}

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
		})
	}

	useDefaultSplit := true
	for _, s := range params.TensorSplit {
		if s != 0 {
			useDefaultSplit = false
			break
		}
	}

	// calculate splits
	splits := make([]float32, len(gpus))
	if useDefaultSplit {
		// default: split on free memory
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
	} else {
		splits = params.TensorSplit
	}

	var sum float32
	// cumulative sum of all splits
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

	// normalize splits
	for i := range splits {
		splits[i] /= sum
	}
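
	// For example, two GPUs with 8 GiB and 4 GiB free yield splits [8, 4], then
	// [8, 12] after the cumulative sum, and [0.667, 1.0] after normalization;
	// assignLayer below compares each layer's fractional position against these
	// thresholds to pick a device.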

	// inputs always use cpu
	input := cpuDeviceBufferType

	blocks := int(meta.KV().BlockCount())

	// define a range of gpu layers. anything outside of this range is assigned to the cpu
	gpuRangeStart := max(0, blocks-params.NumGPULayers)
	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
	assignLayer := func(i int) deviceBufferType {
		if i < gpuRangeStart || i >= gpuRangeStop {
			return cpuDeviceBufferType
		}

		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
			return cpuDeviceBufferType
		}

		return gpuDeviceBufferTypes[index]
	}

	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// outputs are assigned iff allowed by splits and configured number of gpu layers
	output := assignLayer(blocks)

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	type tensor struct {
		source *fsggml.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}
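
	// contains matches whole dot-separated name components, so for example
	// contains("output_norm.weight", "output") is false while
	// contains("output.weight", "output") is true.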

	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts)
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
			}
		case contains(t.Name, "cls", "output", "output_norm"):
			createTensor(tensor{source: t}, output.bts)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts)
			}
		default:
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts)
			}
		}
	}

	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		if b == nil {
			return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
		}

		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	for bs := range maps.Values(bbs) {
		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}
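
	// Load tensor data concurrently: each goroutine opens its own file handle,
	// streams its tensor's bytes into every destination tensor that shares the
	// source data, and reports progress as bytes complete.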
	var doneBytes atomic.Uint64
	totalBytes := uint64(n) - meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range meta.Tensors().Items() {
		t := t
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
			for i := range tts {
				target := targets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
			// seeking around within an FD shared between all goroutines.
			file, err := os.Open(r.Name())
			if err != nil {
				slog.Warn("file open error", "file", r.Name(), "error", err)
				return err
			}
			defer file.Close()

			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					slog.Warn("file read error", "file", r.Name(), "error", err)
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if params.Progress != nil {
					done := doneBytes.Add(uint64(n))
					params.Progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	if err := g.Wait(); err != nil {
		return nil, err
	}

	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []*C.struct_ggml_backend
	var schedBufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		bt := C.ggml_backend_get_default_buffer_type(b)

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		flashAttention: params.FlashAttention,
		meta:           meta,
		tensors:        tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
			C._Bool(false),
		),
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
		input:         deviceBufferTypes[input.d],
		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
			m := make(map[int]*C.struct_ggml_backend_buffer_type)
			for i, layer := range layers {
				m[i] = deviceBufferTypes[layer.d]
			}
			return m
		}(),
		maxGraphNodes: maxGraphNodes,
	}, nil
}

func init() {
	ml.RegisterBackend("ggml", New)
}

func (b *Backend) Config() fs.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	var allocatedBuffers []*C.struct_ggml_backend_buffer

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
		allocatedBuffers: &allocatedBuffers,
	}
}

func (b *Backend) CacheConfig() ml.CacheConfig {
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	} else {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}
}

type Context struct {
	b *Backend

	ctx   *C.struct_ggml_context
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft *C.struct_ggml_backend_buffer_type

	// allocatedBuffers are buffers for tensors that we have allocated in this context
	// so that we can free them when we close the context
	allocatedBuffers *[]*C.struct_ggml_backend_buffer

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int
}

func (c *Context) Input() ml.Context {
	if c.b.input != nil {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             c.b.input,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
		}
	}

	return c
}

func (c *Context) Layer(i int) ml.Context {
	if buft, ok := c.b.layers[i]; ok {
		return &Context{
			b:                c.b,
			ctx:              c.ctx,
			buft:             buft,
			allocatedBuffers: c.allocatedBuffers,
			maxGraphNodes:    c.maxGraphNodes,
		}
	}

	return c
}

func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
	}

	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
}
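
// Compute schedules the built graph for asynchronous execution. Output tensors
// are given a sync callback so their data is only copied back from the backend
// the first time it is read (see Bytes and Floats).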
func (c *Context) Compute(tensors ...ml.Tensor) {
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
	C.ggml_backend_sched_reset(c.b.sched)

	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

func (c *Context) Reserve() error {
	if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
		C.ggml_backend_sched_reset(c.b.sched)
		return errors.New("failed to reserve graph")
	}

	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
	for i := range c.b.schedBackends {
		size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
			"size", format.HumanBytes2(uint64(size)))
	}

	C.ggml_backend_sched_reset(c.b.sched)

	return nil
}

func (c *Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}
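
// pad rounds length up to the next multiple of pad, e.g. pad(100, 64) == 128.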
func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}
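
// newTensor allocates a tensor of the given dtype and shape in a fresh backend
// buffer of the context's current buffer type (set via Input or Layer), and
// records the buffer so Close can free it.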
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
	if c.buft == nil {
		panic("set Input or Layer before creating tensors")
	}

	var cdtype uint32
	switch dtype {
	case ml.DTypeF32:
		cdtype = C.GGML_TYPE_F32
	case ml.DTypeF16:
		cdtype = C.GGML_TYPE_F16
	case ml.DTypeQ80:
		cdtype = C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		cdtype = C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		cdtype = C.GGML_TYPE_I32
	default:
		panic("unsupported dtype")
	}

	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
	if b == nil {
		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
	}

	*c.allocatedBuffers = append(*c.allocatedBuffers, b)

	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	return &Tensor{b: c.b, t: t}, nil
}

func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	t, err := c.newTensor(dtype, shape)
	if err != nil {
		panic(err)
	}

	return t
}

func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	t, err := c.newTensor(dtype, shape)
	if err != nil {
		panic(err)
	}

	C.ggml_set_zero(t.(*Tensor).t)
	return t
}
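
// checkShape verifies that the product of the requested dimensions equals the
// number of elements in s; an empty slice always passes.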
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	n := len(s)
	if n == 0 {
		return nil
	}

	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t, err := c.newTensor(ml.DTypeF32, shape)
	if err != nil {
		return nil, err
	}

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t, nil
}

func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t, err := c.newTensor(ml.DTypeI32, shape)
	if err != nil {
		return nil, err
	}

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t, nil
}

func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
	switch dtype {
	case ml.DTypeF32:
		// ggml_arange creates a float32 tensor
		return &Tensor{
			b: c.b,
			t: C.ggml_arange(c.ctx, C.float(start), C.float(stop), C.float(step)),
		}
	case ml.DTypeI32:
		// ggml_cast does not support float32 to int32 conversion
		arange := make([]int32, 0, int((stop-start)/step))
		for i := start; i < stop; i += step {
			arange = append(arange, int32(i))
		}

		t, err := c.Input().FromIntSlice(arange, len(arange))
		if err != nil {
			panic(err)
		}

		return t
	default:
		panic("unsupported dtype for arange")
	}
}

func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
			C.ggml_backend_buffer_free(b)
		}
		*c.allocatedBuffers = nil

		C.ggml_free(c.ctx)
	}
}

type Tensor struct {
	b *Backend
	t *C.struct_ggml_tensor

	sync func()
}

func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_neg(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}
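
// Repeat tiles t n times along dim by creating a template tensor with the
// enlarged shape and broadcasting into it with ggml_repeat.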
func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
	if dim < 0 || dim >= C.GGML_MAX_DIMS {
		panic("invalid dimension")
	}

	shape := make([]C.int64_t, C.GGML_MAX_DIMS)
	for i := range C.GGML_MAX_DIMS {
		if i == dim {
			shape[i] = C.int64_t(t.Dim(i) * n)
		} else {
			shape[i] = C.int64_t(t.Dim(i))
		}
	}

	tmpl := C.ggml_new_tensor(ctx.(*Context).ctx, t.t._type, C.int(len(shape)), unsafe.SliceData(shape))
	return &Tensor{
		b: t.b,
		t: C.ggml_repeat(ctx.(*Context).ctx, t.t, tmpl),
	}
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
		b: t.b,
		t: mul,
	}
}

func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
	}
}

func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
		if b != nil {
			tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
		}
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
	if w != nil {
		tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
	}

	return &Tensor{b: t.b, t: tt}
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sin(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sin(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Cos(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cos(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t),
	}
}
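
// View interprets its variadic arguments as alternating dimension sizes (even
// indices) and byte strides (odd indices), which is why only 1, 3, 5, or 7
// arguments are accepted.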
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

const (
	ropeTypeNorm   C.int = 0
	ropeTypeNeox   C.int = 2
	ropeTypeMrope  C.int = 8
	ropeTypeVision C.int = 24
)

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
		ropeFactors = &Tensor{b: t.b}
	}

	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
			C.int(ropeDim),
			C.int(ropeType),
			131072, // YaRN n_ctx_train
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}

func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
	}
}

func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}
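
// ScaledDotProductAttention computes softmax(QK^T * scale + mask) V, using
// ggml's fused flash attention kernel when flash attention is enabled and an
// explicit mulmat/softmax/mulmat pipeline otherwise.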
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		kq := key.MulmatFullPrec(ctx, query)
		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}

func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_dup(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
	}
}