ollama/llama/patches/0019-metal-add-mean-kernel-...

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 19 Jun 2025 08:05:21 +0300
Subject: [PATCH] metal : add mean kernel (#14267)

* metal : add mean kernel

ggml-ci

* cont : dedup implementation

ggml-ci
---
 ggml/src/ggml-metal/ggml-metal.m     | 33 ++++++++++++++++---
 ggml/src/ggml-metal/ggml-metal.metal | 48 ++++++++++++++++++++++------
 2 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index a9eeebc6..110c9ece 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_COS,
     GGML_METAL_KERNEL_TYPE_NEG,
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
+    GGML_METAL_KERNEL_TYPE_MEAN,
     GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
     GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
     GGML_METAL_KERNEL_TYPE_ARGMAX,
@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                             cos,                             true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                             neg,                             true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN,                            mean,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,                 pool_2d_avg_f32,                 true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,                 pool_2d_max_f32,                 true);
@@ -1634,6 +1636,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_LOG:
             return false; // TODO: implement
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_GROUP_NORM:
             return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
@@ -2362,11 +2365,32 @@ static bool ggml_metal_encode_node(
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
             {
                 GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
 
-                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (dst->op) {
+                    case GGML_OP_SUM_ROWS:
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+                        break;
+                    case GGML_OP_MEAN:
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline;
+                        break;
+                    default:
+                        GGML_ABORT("fatal error");
+                }
+
+                int nth = 32; // SIMD width
+
+                while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                    nth *= 2;
+                }
 
+                // the doubling above can overshoot a pipeline limit that is not a power of two
+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
+                nth = MIN(nth, ne00);
 
                 ggml_metal_kargs_sum_rows args = {
                    /*.ne00 =*/ ne00,
@@ -2396,11 +2418,12 @@ static bool ggml_metal_encode_node(
                 };
 
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                [encoder setBytes:&args length:sizeof(args) atIndex:2];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case GGML_OP_SOFT_MAX:
             {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9cfddf45..08e8d807 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -956,31 +956,61 @@ kernel void kernel_neg(
     dst[tpig] = -src0[tpig];
 }
 
+template <bool norm>
 kernel void kernel_sum_rows(
+        constant ggml_metal_kargs_sum_rows & args,
         device const float * src0,
         device       float * dst,
-        constant ggml_metal_kargs_sum_rows & args,
-        uint3 tpig[[thread_position_in_grid]]) {
-    int64_t i3 = tpig.z;
-    int64_t i2 = tpig.y;
-    int64_t i1 = tpig.x;
+        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    int64_t i3 = tgpig.z;
+    int64_t i2 = tgpig.y;
+    int64_t i1 = tgpig.x;
 
     if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
         return;
     }
 
+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
+    }
+
     device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
     device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);
 
-    float row_sum = 0;
+    float sumf = 0;
 
-    for (int64_t i0 = 0; i0 < args.ne00; i0++) {
-        row_sum += src_row[i0];
+    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+        sumf += src_row[i0];
     }
 
-    dst_row[0] = row_sum;
+    sumf = simd_sum(sumf);
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    if (tpitg.x == 0) {
+        dst_row[0] = norm ? sumf / args.ne00 : sumf;
+    }
 }
 
+typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
+
+template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
+template [[host_name("kernel_mean")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
+
 template<typename T>
 kernel void kernel_soft_max(
         device const  char * src0,
add new gemma model (#11204) * update patches * cherry pick metal mean kernel * cherry pick cuda mean kernel * gemma3n 2025-06-26 12:47:09 +08:00			`From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001`
			`From: Georgi Gerganov <ggerganov@gmail.com>`
			`Date: Thu, 19 Jun 2025 08:05:21 +0300`
			`Subject: [PATCH] metal : add mean kernel (#14267)`

			`* metal : add mean kernel`

			`ggml-ci`

			`* cont : dedup implementation`

			`ggml-ci`
			`---`
			`ggml/src/ggml-metal/ggml-metal.m \| 33 ++++++++++++++++---`
			`ggml/src/ggml-metal/ggml-metal.metal \| 48 ++++++++++++++++++++++------`
			`2 files changed, 67 insertions(+), 14 deletions(-)`

			`diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m`
Increase performance for Gemma3n models on NVGPUs by enabling CUDA Graph execution (#11525) * Enable CUDA Graphs for gemma3n. Similar to https://github.com/ggml-org/llama.cpp/pull/14741, though ollama has a slightly different model graph than llama.cpp which requires different workaround checks. * Remove residual check by reshaping differently in gemma3n model This should make the heuristics more robust 2025-07-30 03:37:06 +08:00			`index a9eeebc6..110c9ece 100644`
add new gemma model (#11204) * update patches * cherry pick metal mean kernel * cherry pick cuda mean kernel * gemma3n 2025-06-26 12:47:09 +08:00			`--- a/ggml/src/ggml-metal/ggml-metal.m`
			`+++ b/ggml/src/ggml-metal/ggml-metal.m`
gpt-oss (#11672) * bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to condittionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net> 2025-08-06 03:21:16 +08:00			`@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {`
add new gemma model (#11204) * update patches * cherry pick metal mean kernel * cherry pick cuda mean kernel * gemma3n 2025-06-26 12:47:09 +08:00			`GGML_METAL_KERNEL_TYPE_COS,`
			`GGML_METAL_KERNEL_TYPE_NEG,`
			`GGML_METAL_KERNEL_TYPE_SUM_ROWS,`
			`+ GGML_METAL_KERNEL_TYPE_MEAN,`
			`GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,`
			`GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,`
			`GGML_METAL_KERNEL_TYPE_ARGMAX,`
gpt-oss (#11672) * bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to condittionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net> 2025-08-06 03:21:16 +08:00			`@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de`
add new gemma model (#11204) * update patches * cherry pick metal mean kernel * cherry pick cuda mean kernel * gemma3n 2025-06-26 12:47:09 +08:00			`GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);`
			`GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);`
			`GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);`
			`+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true);`
			`GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);`
			`GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);`
			`GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);`
			`@@ -1634,6 +1636,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex`
			`case GGML_OP_LOG:`
			`return false; // TODO: implement`
			`case GGML_OP_SUM_ROWS:`
			`+ case GGML_OP_MEAN:`
			`case GGML_OP_SOFT_MAX:`
			`case GGML_OP_GROUP_NORM:`
			`return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);`
			`@@ -2362,11 +2365,30 @@ static bool ggml_metal_encode_node(`
			`[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];`
			`} break;`
			`case GGML_OP_SUM_ROWS:`
			`+ case GGML_OP_MEAN:`
			`{`
			`GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));`

			`- id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;`
			`+ id<MTLComputePipelineState> pipeline = nil;`
			`+`
			`+ switch (dst->op) {`
			`+ case GGML_OP_SUM_ROWS:`
			`+ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;`
			`+ break;`
			`+ case GGML_OP_MEAN:`
			`+ pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline;`
			`+ break;`
			`+ default:`
			`+ GGML_ABORT("fatal error");`
			`+ }`
			`+`
			`+ int nth = 32; // SIMD width`
			`+`
			`+ while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {`
			`+ nth *= 2;`
			`+ }`

			`+ nth = MIN(nth, ne00);`

			`ggml_metal_kargs_sum_rows args = {`
			`/*.ne00 =*/ ne00,`
			`@@ -2396,11 +2418,12 @@ static bool ggml_metal_encode_node(`
			`};`

			`[encoder setComputePipelineState:pipeline];`
			`- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];`
			`- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];`
			`- [encoder setBytes:&args length:sizeof(args) atIndex:2];`
			`+ [encoder setBytes:&args length:sizeof(args) atIndex:0];`
			`+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];`
			`+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];`
			`+ [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];`

			`- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];`
			`+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];`
			`} break;`
			`case GGML_OP_SOFT_MAX:`
			`{`
			`diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal`
			`index 9cfddf45..08e8d807 100644`
			`--- a/ggml/src/ggml-metal/ggml-metal.metal`
			`+++ b/ggml/src/ggml-metal/ggml-metal.metal`
			`@@ -956,31 +956,61 @@ kernel void kernel_neg(`
			`dst[tpig] = -src0[tpig];`
			`}`

			`+template <bool norm>`
			`kernel void kernel_sum_rows(`
			`+ constant ggml_metal_kargs_sum_rows & args,`
			`device const float * src0,`
			`device float * dst,`
			`- constant ggml_metal_kargs_sum_rows & args,`
			`- uint3 tpig[[thread_position_in_grid]]) {`
			`- int64_t i3 = tpig.z;`
			`- int64_t i2 = tpig.y;`
			`- int64_t i1 = tpig.x;`
			`+ threadgroup float * shmem_f32 [[threadgroup(0)]],`
			`+ uint3 tgpig[[threadgroup_position_in_grid]],`
			`+ ushort3 tpitg[[thread_position_in_threadgroup]],`
			`+ ushort sgitg[[simdgroup_index_in_threadgroup]],`
			`+ ushort tiisg[[thread_index_in_simdgroup]],`
			`+ ushort3 ntg[[threads_per_threadgroup]]) {`
			`+ int64_t i3 = tgpig.z;`
			`+ int64_t i2 = tgpig.y;`
			`+ int64_t i1 = tgpig.x;`

			`if (i3 >= args.ne03 \|\| i2 >= args.ne02 \|\| i1 >= args.ne01) {`
			`return;`
			`}`

			`+ if (sgitg == 0) {`
			`+ shmem_f32[tiisg] = 0.0f;`
			`+ }`
			`+`
			`device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);`
			`device float * dst_row = (device float *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3);`

			`- float row_sum = 0;`
			`+ float sumf = 0;`

			`- for (int64_t i0 = 0; i0 < args.ne00; i0++) {`
			`- row_sum += src_row[i0];`
			`+ for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {`
			`+ sumf += src_row[i0];`
			`}`

			`- dst_row[0] = row_sum;`
			`+ sumf = simd_sum(sumf);`
			`+`
			`+ threadgroup_barrier(mem_flags::mem_threadgroup);`
			`+`
			`+ if (tiisg == 0) {`
			`+ shmem_f32[sgitg] = sumf;`
			`+ }`
			`+`
			`+ threadgroup_barrier(mem_flags::mem_threadgroup);`
			`+`
			`+ sumf = shmem_f32[tiisg];`
			`+ sumf = simd_sum(sumf);`
			`+`
			`+ if (tpitg.x == 0) {`
			`+ dst_row[0] = norm ? sumf / args.ne00 : sumf;`
			`+ }`
			`}`

			`+typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;`
			`+`
			`+template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;`
			`+template [[host_name("kernel_mean")]] kernel kernel_sum_rows_t kernel_sum_rows<true>;`
			`+`
			`template<typename T>`
			`kernel void kernel_soft_max(`
			`device const char * src0,`