mirror of https://github.com/alibaba/MNN.git
fix alpha div by zero bug and arm server compile bug
This commit is contained in:
parent 1a6cacc808
commit 4eb1096b9c

@@ -97,21 +97,21 @@ loop_e16:
 load_e16h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
-mov v18.4s, v16.4s
-mov v19.4s, v16.4s
-mov v20.4s, v16.4s
-mov v21.4s, v16.4s
-mov v22.4s, v16.4s
-mov v23.4s, v16.4s
-mov v24.4s, v16.4s
-mov v25.4s, v16.4s
-mov v26.4s, v16.4s
-mov v27.4s, v16.4s
-mov v28.4s, v16.4s
-mov v29.4s, v16.4s
-mov v30.4s, v16.4s
-mov v31.4s, v16.4s
+mov v17.16b, v16.16b
+mov v18.16b, v16.16b
+mov v19.16b, v16.16b
+mov v20.16b, v16.16b
+mov v21.16b, v16.16b
+mov v22.16b, v16.16b
+mov v23.16b, v16.16b
+mov v24.16b, v16.16b
+mov v25.16b, v16.16b
+mov v26.16b, v16.16b
+mov v27.16b, v16.16b
+mov v28.16b, v16.16b
+mov v29.16b, v16.16b
+mov v30.16b, v16.16b
+mov v31.16b, v16.16b
 cbz w20, loop_e16h4l1_end

 loop_e16h4l1:

@@ -382,13 +382,13 @@ beq loop_e4
 load_e8h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
-mov v18.4s, v16.4s
-mov v19.4s, v16.4s
-mov v20.4s, v16.4s
-mov v21.4s, v16.4s
-mov v22.4s, v16.4s
-mov v23.4s, v16.4s
+mov v17.16b, v16.16b
+mov v18.16b, v16.16b
+mov v19.16b, v16.16b
+mov v20.16b, v16.16b
+mov v21.16b, v16.16b
+mov v22.16b, v16.16b
+mov v23.16b, v16.16b
 cbz w20, loop_e8h4l1_end

 loop_e8h4l1:

@@ -530,9 +530,9 @@ beq loop_e2
 load_e4h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
-mov v18.4s, v16.4s
-mov v19.4s, v16.4s
+mov v17.16b, v16.16b
+mov v18.16b, v16.16b
+mov v19.16b, v16.16b
 cbz w20, loop_e4h4l1_end

 loop_e4h4l1:

@@ -644,7 +644,7 @@ beq loop_e1
 load_e2h4_end:
 ldr w20, [x15], #4
-mov v17.4s, v16.4s
+mov v17.16b, v16.16b
 cbz w20, loop_e2h4l1_end

 loop_e2h4l1:

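Context for the assembly hunks above: the AArch64 vector MOV alias (an alias of ORR) only accepts the 8B and 16B arrangements, so `mov v17.4s, v16.4s` is rejected by stricter assemblers such as those on ARM server toolchains, which appears to be the compile bug named in the commit message. Copying with the `.16b` arrangement moves the same 128 bits. A minimal C++ sketch (not MNN code) of the equivalent full-register copy via reinterpretation:

// A whole-register copy is independent of the element arrangement, which is
// why the .16b form can stand in for the rejected .4s spelling.
#include <arm_neon.h>
#include <cstdio>

int main() {
    int32x4_t v16 = {1, 2, 3, 4};
    int8x16_t asBytes = vreinterpretq_s8_s32(v16);     // view the 128 bits as 16 bytes
    int32x4_t v17     = vreinterpretq_s32_s8(asBytes); // same bit pattern back as 4 words
    printf("%d %d %d %d\n", vgetq_lane_s32(v17, 0), vgetq_lane_s32(v17, 1),
           vgetq_lane_s32(v17, 2), vgetq_lane_s32(v17, 3));
    return 0;
}
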
@@ -67,7 +67,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
             if (post != nullptr) {
                 dst_x[j] = MNNInt32ToInt8(dstTemp[j], bias_dz[j], scale_dz[j], post->maxValue, post->minValue);
             } else {
-                dst_x[j] = dstTemp[j];
+                ((float*)dst_x)[j] = (float)(dstTemp[j] + bias_dz[j]);
             }
         }
     }

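The hunk above changes the `post == nullptr` branch of the reference GEMM unit: instead of writing the raw int32 accumulator into the int8 destination, it adds the bias and stores the result as float through the reinterpreted destination pointer. A minimal sketch of that write-out pattern (hypothetical helper name, not the MNN kernel):

#include <cstdint>
#include <cstdio>

// When no post-process parameters are given, the accumulator plus bias is
// written out as float; otherwise it would be requantized to int8 (clamping
// and scaling omitted in this sketch).
static void writeUnit(int8_t* dst_x, const int32_t* dstTemp, const int32_t* bias_dz,
                      int count, bool hasPost) {
    for (int j = 0; j < count; ++j) {
        if (hasPost) {
            dst_x[j] = static_cast<int8_t>(dstTemp[j] + bias_dz[j]);
        } else {
            ((float*)dst_x)[j] = (float)(dstTemp[j] + bias_dz[j]);  // float output path
        }
    }
}

int main() {
    int32_t acc[4]  = {10, 20, 30, 40};
    int32_t bias[4] = {1, 1, 1, 1};
    float out[4];
    writeUnit(reinterpret_cast<int8_t*>(out), acc, bias, 4, false);
    printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);
    return 0;
}
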
@@ -6,6 +6,10 @@
 // Copyright © 2018, Alibaba Group Holding Limited
 //

+#if __GNUC__ == 4
+#pragma GCC optimize("-flax-vector-conversions")
+#endif
+
 #include <limits>
 #include <vector>
 #include <map>

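`-flax-vector-conversions` tells GCC to accept implicit conversions between vector types of the same width; the pragma above turns it on for GCC 4.x builds only, presumably to keep the NEON code in this file compiling on older ARM server toolchains. A small sketch (not MNN code) of the kind of conversion involved, together with the portable `vreinterpretq_*` spelling:

#include <arm_neon.h>

int32x4_t asWords(int8x16_t bytes) {
    // int32x4_t words = bytes;                     // accepted only with lax vector conversions
    int32x4_t words = vreinterpretq_s32_s8(bytes);  // portable spelling, no flag needed
    return words;
}
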
@@ -64,10 +68,10 @@ static void _sourceTransUnit4x4Pack4x4(const int8_t* srcStart, int8_t* dstStart,
     };
     for (int i = 0; i < 4; ++i) {
         auto tmp = vreinterpretq_s32_s8(m[i].value);
-        vst1q_lane_s32(dstStart + 0 * dstZStep, tmp, 0);
-        vst1q_lane_s32(dstStart + 1 * dstZStep, tmp, 1);
-        vst1q_lane_s32(dstStart + 2 * dstZStep, tmp, 2);
-        vst1q_lane_s32(dstStart + 3 * dstZStep, tmp, 3);
+        vst1q_lane_s32((int32_t*)(dstStart + 0 * dstZStep), tmp, 0);
+        vst1q_lane_s32((int32_t*)(dstStart + 1 * dstZStep), tmp, 1);
+        vst1q_lane_s32((int32_t*)(dstStart + 2 * dstZStep), tmp, 2);
+        vst1q_lane_s32((int32_t*)(dstStart + 3 * dstZStep), tmp, 3);
         dstStart += dstXStep;
     }
     dstStart -= dstXStep * 4;

@@ -151,10 +155,10 @@ static void _sourceTransUnit4x4Pack16x4(const int8_t* srcStart, int8_t* dstStart
     };
     for (int i = 0; i < 4; ++i) {
        auto tmp = vreinterpretq_s32_s8(m[i].value);
-        vst1q_lane_s32(dstStart + 0 * dstZStep, tmp, 0);
-        vst1q_lane_s32(dstStart + 1 * dstZStep, tmp, 1);
-        vst1q_lane_s32(dstStart + 2 * dstZStep, tmp, 2);
-        vst1q_lane_s32(dstStart + 3 * dstZStep, tmp, 3);
+        vst1q_lane_s32((int32_t*)(dstStart + 0 * dstZStep), tmp, 0);
+        vst1q_lane_s32((int32_t*)(dstStart + 1 * dstZStep), tmp, 1);
+        vst1q_lane_s32((int32_t*)(dstStart + 2 * dstZStep), tmp, 2);
+        vst1q_lane_s32((int32_t*)(dstStart + 3 * dstZStep), tmp, 3);
         dstStart += dstXStep;
     }
     dstStart -= dstXStep * 4;

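Both hunks above add an explicit pointer cast: `vst1q_lane_s32` takes an `int32_t*` destination, while `dstStart` is an `int8_t*`, so stricter toolchains (such as the ARM server build mentioned in the commit message) reject the uncast call. A self-contained sketch (not MNN code) of a lane store through a byte-addressed buffer:

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
    alignas(4) int8_t buffer[16] = {0};
    int32x4_t tmp = {0x04030201, 0, 0, 0};
    vst1q_lane_s32((int32_t*)(buffer + 0), tmp, 0);  // store lane 0 (4 bytes) through the cast pointer
    printf("%d %d %d %d\n", buffer[0], buffer[1], buffer[2], buffer[3]);
    return 0;
}
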
@@ -481,7 +481,10 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d,
                                               const int8_t*& weight, float*& scale, int32_t*& bias,
                                               float inputScale, float outputScale, int inputZeroPoint, int outputZeroPoint) {
     int outputCount = conv2d->common()->outputCount();
-    weight = conv2d->symmetricQuan()->weight()->data();
+    // fix xcode UndefinedBehaviorSanitizer
+    if (conv2d->symmetricQuan()->weight() != nullptr) {
+        weight = conv2d->symmetricQuan()->weight()->data();
+    }
     if (conv2d->quanParameter() && conv2d->quanParameter()->buffer()) {
         quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), false, true);
         weight = quanCommon->weight.get();

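The guard above stops `data()` being called on a null `weight()` field, which Xcode's UndefinedBehaviorSanitizer flags (per the added comment). A minimal sketch of the same pattern with hypothetical stand-in types (not the MNN/flatbuffers API):

#include <cstdint>
#include <cstdio>

struct WeightBlob {                        // stand-in for the serialized weight vector
    const int8_t* data() const { return raw; }
    const int8_t* raw = nullptr;
};

const int8_t* readWeight(const WeightBlob* blob) {
    const int8_t* weight = nullptr;
    if (blob != nullptr) {                 // only dereference when the field is present
        weight = blob->data();
    }
    return weight;
}

int main() {
    printf("%p\n", static_cast<const void*>(readWeight(nullptr)));
    return 0;
}
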
@@ -529,10 +532,14 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d,
     auto alphaData = conv2d->quanParameter()->alpha()->data();
     auto alphaScale = inputScale / outputScale;
     for (int i = 0; i < outputCount; i++) {
-        scale[i] = alphaData[i] * alphaScale;
+        auto alphaValue = alphaData[i];
+        if (fabs(alphaValue) < 1e-6) {
+            alphaValue = 1e-6;
+        }
+        scale[i] = alphaValue * alphaScale;
         // compute outputZeroPointFused in asymmetric quant
         int outputZeroPointFused = static_cast<int32_t>(outputZeroPoint / scale[i]);
-        bias[i] = static_cast<int32_t>(biasData[i] / (inputScale * alphaData[i])) - remains[i] + outputZeroPointFused;
+        bias[i] = static_cast<int32_t>(biasData[i] / (inputScale * alphaValue)) - remains[i] + outputZeroPointFused;
     }
     return true;
 }

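This is the alpha divide-by-zero fix from the commit message: a per-channel `alpha` of zero makes `biasData[i] / (inputScale * alphaData[i])` divide by zero, so the value is clamped to a magnitude of at least 1e-6 before it is used for both `scale[i]` and the bias term. A worked sketch with assumed constants (not the MNN function):

#include <cmath>
#include <cstdio>

int main() {
    const float inputScale   = 0.5f;
    const float outputScale  = 0.25f;
    const float biasData[2]  = {3.0f, 3.0f};
    const float alphaData[2] = {0.02f, 0.0f};  // second channel has a zero alpha

    for (int i = 0; i < 2; ++i) {
        float alphaValue = alphaData[i];
        if (std::fabs(alphaValue) < 1e-6f) {
            alphaValue = 1e-6f;                // clamp, as the patched loop does
        }
        float scale = alphaValue * (inputScale / outputScale);
        int   bias  = static_cast<int>(biasData[i] / (inputScale * alphaValue));
        printf("channel %d: scale=%g bias=%d\n", i, scale, bias);
    }
    return 0;
}
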