Merge pull request #4036 from ihb2032/opt/rvv-minmax-float

opt(RVV): Optimize max and min float functions with intrinsics

GitOrigin-RevId: d246089d9de5602aeb58e91d1169923d58ed9712
2025-12-22 10:43:07 +08:00 · 2025-12-22 10:43:07 +08:00 · 3f3ccf2e75
parent 4e47bbdb40
commit 3f3ccf2e75
2 changed files with 50 additions and 0 deletions
--- a/source/backend/cpu/riscv/rvv/MNNMaxFloat.cpp
+++ b/source/backend/cpu/riscv/rvv/MNNMaxFloat.cpp
@ -0,0 +1,25 @@
+#include <riscv_vector.h>
+#include <cfloat>
+
+#define UNIT 4
+
+/**
+ * Computes per-lane maxima over `input`, viewed as `inputCountUnit` groups of
+ * UNIT * 2 consecutive floats. Lane j (0 <= j < UNIT) covers the two floats at
+ * offsets j * 2 and j * 2 + 1 inside every group.
+ *
+ * @param input          source floats; inputCountUnit * UNIT * 2 elements are read.
+ * @param maxBuffer      receives UNIT results. Previous contents are overwritten,
+ *                       they are not folded into the result.
+ *                       NOTE(review): confirm no caller expects accumulation
+ *                       into values already stored in maxBuffer.
+ * @param inputCountUnit number of groups; zero or negative is treated as empty
+ *                       and leaves -FLT_MAX in every lane.
+ *
+ * The reduction is seeded with -FLT_MAX, so a lane whose inputs are all -inf
+ * still reports -FLT_MAX.
+ */
+void MNNMaxFloat(float *input, float *maxBuffer, int32_t inputCountUnit) {
+    // A negative count would wrap to a huge size_t and walk far past `input`.
+    const size_t groupCount = inputCountUnit > 0 ? (size_t)inputCountUnit : 0;
+    // Byte distance between the same lane position in two consecutive groups.
+    const size_t groupStride = UNIT * 2 * sizeof(float);
+
+    for (int lane = 0; lane < UNIT; ++lane) {
+        // Element 0 carries the running maximum from one chunk to the next;
+        // the reduction only ever reads element 0 of its scalar operand.
+        vfloat32m1_t running = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
+
+        for (size_t done = 0; done < groupCount;) {
+            const size_t vl = __riscv_vsetvl_e32m8(groupCount - done);
+            const float *first = input + (done * UNIT * 2) + lane * 2;
+            // Gather both floats of this lane from `vl` consecutive groups.
+            vfloat32m8_t v0 = __riscv_vlse32_v_f32m8(first, groupStride, vl);
+            vfloat32m8_t v1 = __riscv_vlse32_v_f32m8(first + 1, groupStride, vl);
+            vfloat32m8_t vmax = __riscv_vfmax_vv_f32m8(v0, v1, vl);
+            running = __riscv_vfredmax_vs_f32m8_f32m1(vmax, running, vl);
+            done += vl;
+        }
+        maxBuffer[lane] = __riscv_vfmv_f_s_f32m1_f32(running);
+    }
+}
--- a/source/backend/cpu/riscv/rvv/MNNMinFloat.cpp
+++ b/source/backend/cpu/riscv/rvv/MNNMinFloat.cpp
@ -0,0 +1,25 @@
+#include <riscv_vector.h>
+#include <cfloat>
+
+#define UNIT 4
+
+/**
+ * Computes per-lane minima over `input`, viewed as `inputCountUnit` groups of
+ * UNIT * 2 consecutive floats. Lane j (0 <= j < UNIT) covers the two floats at
+ * offsets j * 2 and j * 2 + 1 inside every group.
+ *
+ * @param input          source floats; inputCountUnit * UNIT * 2 elements are read.
+ * @param minBuffer      receives UNIT results. Previous contents are overwritten,
+ *                       they are not folded into the result.
+ *                       NOTE(review): confirm no caller expects accumulation
+ *                       into values already stored in minBuffer.
+ * @param inputCountUnit number of groups; zero or negative is treated as empty
+ *                       and leaves FLT_MAX in every lane.
+ *
+ * The reduction is seeded with FLT_MAX, so a lane whose inputs are all +inf
+ * still reports FLT_MAX.
+ */
+void MNNMinFloat(float *input, float *minBuffer, int32_t inputCountUnit) {
+    // A negative count would wrap to a huge size_t and walk far past `input`.
+    const size_t groupCount = inputCountUnit > 0 ? (size_t)inputCountUnit : 0;
+    // Byte distance between the same lane position in two consecutive groups.
+    const size_t groupStride = UNIT * 2 * sizeof(float);
+
+    for (int lane = 0; lane < UNIT; ++lane) {
+        // Element 0 carries the running minimum from one chunk to the next;
+        // the reduction only ever reads element 0 of its scalar operand.
+        vfloat32m1_t running = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
+
+        for (size_t done = 0; done < groupCount;) {
+            const size_t vl = __riscv_vsetvl_e32m8(groupCount - done);
+            const float *first = input + (done * UNIT * 2) + lane * 2;
+            // Gather both floats of this lane from `vl` consecutive groups.
+            vfloat32m8_t v0 = __riscv_vlse32_v_f32m8(first, groupStride, vl);
+            vfloat32m8_t v1 = __riscv_vlse32_v_f32m8(first + 1, groupStride, vl);
+            vfloat32m8_t vmin = __riscv_vfmin_vv_f32m8(v0, v1, vl);
+            running = __riscv_vfredmin_vs_f32m8_f32m1(vmin, running, vl);
+            done += vl;
+        }
+        minBuffer[lane] = __riscv_vfmv_f_s_f32m1_f32(running);
+    }
+}