mirror of https://github.com/alibaba/MNN.git
fix alpha div by zero bug and arm server compile bug
This commit is contained in:
parent 1a6cacc808
commit 4eb1096b9c

@@ -97,21 +97,21 @@ loop_e16:
 load_e16h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
-mov v18.4s, v16.4s
-mov v19.4s, v16.4s
-mov v20.4s, v16.4s
-mov v21.4s, v16.4s
-mov v22.4s, v16.4s
-mov v23.4s, v16.4s
-mov v24.4s, v16.4s
-mov v25.4s, v16.4s
-mov v26.4s, v16.4s
-mov v27.4s, v16.4s
-mov v28.4s, v16.4s
-mov v29.4s, v16.4s
-mov v30.4s, v16.4s
-mov v31.4s, v16.4s
+mov v17.16b, v16.16b
+mov v18.16b, v16.16b
+mov v19.16b, v16.16b
+mov v20.16b, v16.16b
+mov v21.16b, v16.16b
+mov v22.16b, v16.16b
+mov v23.16b, v16.16b
+mov v24.16b, v16.16b
+mov v25.16b, v16.16b
+mov v26.16b, v16.16b
+mov v27.16b, v16.16b
+mov v28.16b, v16.16b
+mov v29.16b, v16.16b
+mov v30.16b, v16.16b
+mov v31.16b, v16.16b
 cbz w20, loop_e16h4l1_end

 loop_e16h4l1:

@@ -382,13 +382,13 @@ beq loop_e4
 load_e8h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
-mov v18.4s, v16.4s
-mov v19.4s, v16.4s
-mov v20.4s, v16.4s
-mov v21.4s, v16.4s
-mov v22.4s, v16.4s
-mov v23.4s, v16.4s
+mov v17.16b, v16.16b
+mov v18.16b, v16.16b
+mov v19.16b, v16.16b
+mov v20.16b, v16.16b
+mov v21.16b, v16.16b
+mov v22.16b, v16.16b
+mov v23.16b, v16.16b
 cbz w20, loop_e8h4l1_end

 loop_e8h4l1:

@@ -530,9 +530,9 @@ beq loop_e2
 load_e4h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
-mov v18.4s, v16.4s
-mov v19.4s, v16.4s
+mov v17.16b, v16.16b
+mov v18.16b, v16.16b
+mov v19.16b, v16.16b
 cbz w20, loop_e4h4l1_end

 loop_e4h4l1:

@@ -644,7 +644,7 @@ beq loop_e1
 load_e2h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
+mov v17.16b, v16.16b
 cbz w20, loop_e2h4l1_end

 loop_e2h4l1:

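Context for the assembly hunks above: the AArch64 vector MOV alias (an alias of ORR) only accepts the 8B and 16B arrangements, so `mov v17.4s, v16.4s` is rejected by stricter assemblers such as those on ARM server toolchains, which appears to be the compile bug named in the commit message. Copying with the `.16b` arrangement moves the same 128 bits. A minimal C++ sketch (not MNN code) of the equivalent full-register copy via reinterpretation:

// A whole-register copy is independent of the element arrangement, which is
// why the .16b form can stand in for the rejected .4s spelling.
#include <arm_neon.h>
#include <cstdio>

int main() {
    int32x4_t v16 = {1, 2, 3, 4};
    int8x16_t asBytes = vreinterpretq_s8_s32(v16);     // view the 128 bits as 16 bytes
    int32x4_t v17     = vreinterpretq_s32_s8(asBytes); // same bit pattern back as 4 words
    printf("%d %d %d %d\n", vgetq_lane_s32(v17, 0), vgetq_lane_s32(v17, 1),
           vgetq_lane_s32(v17, 2), vgetq_lane_s32(v17, 3));
    return 0;
}
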
@@ -67,7 +67,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
             if (post != nullptr) {
                 dst_x[j] = MNNInt32ToInt8(dstTemp[j], bias_dz[j], scale_dz[j], post->maxValue, post->minValue);
             } else {
-                dst_x[j] = dstTemp[j];
+                ((float*)dst_x)[j] = (float)(dstTemp[j] + bias_dz[j]);
             }
         }
     }

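The hunk above changes the `post == nullptr` branch of the reference GEMM unit: instead of writing the raw int32 accumulator into the int8 destination, it adds the bias and stores the result as float through the reinterpreted destination pointer. A minimal sketch of that write-out pattern (hypothetical helper name, not the MNN kernel):

#include <cstdint>
#include <cstdio>

// When no post-process parameters are given, the accumulator plus bias is
// written out as float; otherwise it would be requantized to int8 (clamping
// and scaling omitted in this sketch).
static void writeUnit(int8_t* dst_x, const int32_t* dstTemp, const int32_t* bias_dz,
                      int count, bool hasPost) {
    for (int j = 0; j < count; ++j) {
        if (hasPost) {
            dst_x[j] = static_cast<int8_t>(dstTemp[j] + bias_dz[j]);
        } else {
            ((float*)dst_x)[j] = (float)(dstTemp[j] + bias_dz[j]);  // float output path
        }
    }
}

int main() {
    int32_t acc[4]  = {10, 20, 30, 40};
    int32_t bias[4] = {1, 1, 1, 1};
    float out[4];
    writeUnit(reinterpret_cast<int8_t*>(out), acc, bias, 4, false);
    printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);
    return 0;
}
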
@@ -6,6 +6,10 @@
 // Copyright © 2018, Alibaba Group Holding Limited
 //

+#if __GNUC__ == 4
+#pragma GCC optimize("-flax-vector-conversions")
+#endif
+
 #include <limits>
 #include <vector>
 #include <map>

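`-flax-vector-conversions` tells GCC to accept implicit conversions between vector types of the same width; the pragma above turns it on for GCC 4.x builds only, presumably to keep the NEON code in this file compiling on older ARM server toolchains. A small sketch (not MNN code) of the kind of conversion involved, together with the portable `vreinterpretq_*` spelling:

#include <arm_neon.h>

int32x4_t asWords(int8x16_t bytes) {
    // int32x4_t words = bytes;                     // accepted only with lax vector conversions
    int32x4_t words = vreinterpretq_s32_s8(bytes);  // portable spelling, no flag needed
    return words;
}
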
@@ -64,10 +68,10 @@ static void _sourceTransUnit4x4Pack4x4(const int8_t* srcStart, int8_t* dstStart,
     };
     for (int i = 0; i < 4; ++i) {
         auto tmp = vreinterpretq_s32_s8(m[i].value);
-        vst1q_lane_s32(dstStart + 0 * dstZStep, tmp, 0);
-        vst1q_lane_s32(dstStart + 1 * dstZStep, tmp, 1);
-        vst1q_lane_s32(dstStart + 2 * dstZStep, tmp, 2);
-        vst1q_lane_s32(dstStart + 3 * dstZStep, tmp, 3);
+        vst1q_lane_s32((int32_t*)(dstStart + 0 * dstZStep), tmp, 0);
+        vst1q_lane_s32((int32_t*)(dstStart + 1 * dstZStep), tmp, 1);
+        vst1q_lane_s32((int32_t*)(dstStart + 2 * dstZStep), tmp, 2);
+        vst1q_lane_s32((int32_t*)(dstStart + 3 * dstZStep), tmp, 3);
         dstStart += dstXStep;
     }
     dstStart -= dstXStep * 4;

@@ -151,10 +155,10 @@ static void _sourceTransUnit4x4Pack16x4(const int8_t* srcStart, int8_t* dstStart
     };
     for (int i = 0; i < 4; ++i) {
        auto tmp = vreinterpretq_s32_s8(m[i].value);
-        vst1q_lane_s32(dstStart + 0 * dstZStep, tmp, 0);
-        vst1q_lane_s32(dstStart + 1 * dstZStep, tmp, 1);
-        vst1q_lane_s32(dstStart + 2 * dstZStep, tmp, 2);
-        vst1q_lane_s32(dstStart + 3 * dstZStep, tmp, 3);
+        vst1q_lane_s32((int32_t*)(dstStart + 0 * dstZStep), tmp, 0);
+        vst1q_lane_s32((int32_t*)(dstStart + 1 * dstZStep), tmp, 1);
+        vst1q_lane_s32((int32_t*)(dstStart + 2 * dstZStep), tmp, 2);
+        vst1q_lane_s32((int32_t*)(dstStart + 3 * dstZStep), tmp, 3);
         dstStart += dstXStep;
     }
     dstStart -= dstXStep * 4;

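Both hunks above add an explicit pointer cast: `vst1q_lane_s32` takes an `int32_t*` destination, while `dstStart` is an `int8_t*`, so stricter toolchains (such as the ARM server build mentioned in the commit message) reject the uncast call. A self-contained sketch (not MNN code) of a lane store through a byte-addressed buffer:

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
    alignas(4) int8_t buffer[16] = {0};
    int32x4_t tmp = {0x04030201, 0, 0, 0};
    vst1q_lane_s32((int32_t*)(buffer + 0), tmp, 0);  // store lane 0 (4 bytes) through the cast pointer
    printf("%d %d %d %d\n", buffer[0], buffer[1], buffer[2], buffer[3]);
    return 0;
}
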
@@ -481,7 +481,10 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d,
                                               const int8_t*& weight, float*& scale, int32_t*& bias,
                                               float inputScale, float outputScale, int inputZeroPoint, int outputZeroPoint) {
     int outputCount = conv2d->common()->outputCount();
-    weight = conv2d->symmetricQuan()->weight()->data();
+    // fix xcode UndefinedBehaviorSanitizer
+    if (conv2d->symmetricQuan()->weight() != nullptr) {
+        weight = conv2d->symmetricQuan()->weight()->data();
+    }
     if (conv2d->quanParameter() && conv2d->quanParameter()->buffer()) {
         quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), false, true);
         weight = quanCommon->weight.get();

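The guard above stops `data()` being called on a null `weight()` field, which Xcode's UndefinedBehaviorSanitizer flags (per the added comment). A minimal sketch of the same pattern with hypothetical stand-in types (not the MNN/flatbuffers API):

#include <cstdint>
#include <cstdio>

struct WeightBlob {                        // stand-in for the serialized weight vector
    const int8_t* data() const { return raw; }
    const int8_t* raw = nullptr;
};

const int8_t* readWeight(const WeightBlob* blob) {
    const int8_t* weight = nullptr;
    if (blob != nullptr) {                 // only dereference when the field is present
        weight = blob->data();
    }
    return weight;
}

int main() {
    printf("%p\n", static_cast<const void*>(readWeight(nullptr)));
    return 0;
}
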
@@ -529,10 +532,14 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d,
     auto alphaData = conv2d->quanParameter()->alpha()->data();
     auto alphaScale = inputScale / outputScale;
     for (int i = 0; i < outputCount; i++) {
-        scale[i] = alphaData[i] * alphaScale;
+        auto alphaValue = alphaData[i];
+        if (fabs(alphaValue) < 1e-6) {
+            alphaValue = 1e-6;
+        }
+        scale[i] = alphaValue * alphaScale;
         // compute outputZeroPointFused in asymmetric quant
         int outputZeroPointFused = static_cast<int32_t>(outputZeroPoint / scale[i]);
-        bias[i] = static_cast<int32_t>(biasData[i] / (inputScale * alphaData[i])) - remains[i] + outputZeroPointFused;
+        bias[i] = static_cast<int32_t>(biasData[i] / (inputScale * alphaValue)) - remains[i] + outputZeroPointFused;
     }
     return true;
 }

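This is the alpha divide-by-zero fix from the commit message: a per-channel `alpha` of zero makes `biasData[i] / (inputScale * alphaData[i])` divide by zero, so the value is clamped to a magnitude of at least 1e-6 before it is used for both `scale[i]` and the bias term. A worked sketch with assumed constants (not the MNN function):

#include <cmath>
#include <cstdio>

int main() {
    const float inputScale   = 0.5f;
    const float outputScale  = 0.25f;
    const float biasData[2]  = {3.0f, 3.0f};
    const float alphaData[2] = {0.02f, 0.0f};  // second channel has a zero alpha

    for (int i = 0; i < 2; ++i) {
        float alphaValue = alphaData[i];
        if (std::fabs(alphaValue) < 1e-6f) {
            alphaValue = 1e-6f;                // clamp, as the patched loop does
        }
        float scale = alphaValue * (inputScale / outputScale);
        int   bias  = static_cast<int>(biasData[i] / (inputScale * alphaValue));
        printf("channel %d: scale=%g bias=%d\n", i, scale, bias);
    }
    return 0;
}
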