2019-04-17 10:49:11 +08:00
|
|
|
//
|
|
|
|
// Int8FunctionsOpt.cpp
|
|
|
|
// MNN
|
|
|
|
//
|
|
|
|
// Created by MNN on 2018/08/15.
|
|
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
|
|
//
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
#include <math.h>
|
2020-11-05 16:41:56 +08:00
|
|
|
#include <cstring> // for memset
|
2020-02-26 09:57:17 +08:00
|
|
|
#include "Int8FunctionsOpt.h"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "core/Macro.h"
|
2021-09-18 15:52:30 +08:00
|
|
|
#include "common/CommonCompute.hpp"
|
2021-06-11 17:17:13 +08:00
|
|
|
#include "CommonOptFunction.h"
|
2023-06-16 09:42:45 +08:00
|
|
|
#include "math/Vec.hpp"
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
#ifdef MNN_USE_NEON
|
|
|
|
#include <arm_neon.h>
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
extern "C" {
|
|
|
|
void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
|
|
|
|
const QuanPostTreatParameters* post, size_t realCount);
|
|
|
|
void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
|
|
|
|
const QuanPostTreatParameters* post, size_t realCount);
|
|
|
|
void MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width,
|
2023-07-31 14:24:48 +08:00
|
|
|
size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder=nullptr);
|
2023-02-28 10:41:24 +08:00
|
|
|
void MNNMaxPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);
|
2023-02-15 10:30:27 +08:00
|
|
|
|
2023-02-28 10:41:24 +08:00
|
|
|
void MNNAvgPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);
|
2023-10-18 10:31:02 +08:00
|
|
|
void MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params);
|
2022-10-30 08:44:24 +08:00
|
|
|
#if defined(__aarch64__) // aarch32 sdot workaround
|
2021-06-11 17:17:13 +08:00
|
|
|
void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
|
2023-07-31 14:24:48 +08:00
|
|
|
const QuanPostTreatParameters* post, size_t realDstCount);
|
2022-10-30 08:44:24 +08:00
|
|
|
void MNNGemmInt8AddBiasScale_ARMV86_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
|
2023-07-31 14:24:48 +08:00
|
|
|
const QuanPostTreatParameters* post, size_t realDstCount);
|
|
|
|
void MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width,
|
|
|
|
size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder=nullptr);
|
2022-10-30 08:44:24 +08:00
|
|
|
#endif // __aarch64__
|
2021-06-11 17:17:13 +08:00
|
|
|
}
|
|
|
|
#endif // MNN_USE_NEON
|
|
|
|
|
2021-09-18 15:52:30 +08:00
|
|
|
/*
|
|
|
|
layout should be optimized for int8
|
|
|
|
source: source matrix is h x l
|
|
|
|
transpose: if false, export compressed matrix as h x l, other export as l x h.
|
|
|
|
*/
|
|
|
|
/* Pack the sparse quantized weight matrix B into a block-compressed sparse
   row (BCSR-style) layout for the sparse int8 matmul kernels.
     dest:          packed non-zero int8 weight values (output)
     NNZMap:        one entry per row block / row: count of kept columns (output)
     dataOffsetMap: one entry per kept column: relative pointer advance, in
                    units of eP, applied to packed A before the multiply (output)
     sparseBlockOC: number of output-channel rows packed together as one block
     source:        dense weights, row-major, h rows of (kernelCount * icCount)
     h:             number of output channels (rows of source)
     kernelCount:   KH*KW;  icCount: IC;  eP: e-tile size of the matmul kernel */
void MNNPackForSparseQuantMatMul_B(int8_t* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const int8_t* source, size_t h, size_t kernelCount, size_t icCount, const int eP) {
    // 1. in quant convolution, source B layout is OC x (IC * KH * KW),
    // the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (KH*KW*IC!=0), as a canceled result, just do BCSR
    // 2. IC would be switched into the last dim.

    // BCSC
    int columOffset = 0;  // accumulated offset (in eP units) since the last stored non-zero
    int i = 0;
    auto subSource = source;
    size_t l = kernelCount * icCount;  // length of one source row
    // Row blocks of sparseBlockOC: a column is kept if any of the
    // sparseBlockOC rows has a non-zero in it (checkAllZeros scans the
    // column across the block with row stride l).
    for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
        *NNZMap = 0;
        for(int ik = 0; ik < kernelCount; ik += 1) {
            // Walk columns kernel-position-major so IC becomes the innermost dim.
            auto kernelSource = subSource + ik;
            for(int ic = 0; ic < icCount; ic += 1) {
                if (!MNN::CommonCompute::checkAllZeros(kernelSource, l, sparseBlockOC, 1)) {
                    // Keep the whole sparseBlockOC-tall column of values.
                    for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
                        *dest = *(kernelSource + ioc * l);
                        dest++;
                    }
                    *NNZMap = *NNZMap + 1;
                    *dataOffsetMap = columOffset;
                    dataOffsetMap++;
                    columOffset = 0;
                }
                columOffset += eP;
                kernelSource += kernelCount;
            }
        }
        NNZMap++;
        // Rewind by a full row so the next row block's offsets are again
        // relative to column 0.
        columOffset -= l * eP;
        subSource += sparseBlockOC * l;
    }

    // Remaining rows (fewer than sparseBlockOC left): packed one row at a
    // time, keeping individual non-zero values.
    for (; i < h; i++) {
        *NNZMap = 0;
        for(int ik = 0; ik < kernelCount; ik += 1) {
            auto kernelSource = subSource + ik;
            for(int ic = 0; ic < icCount; ic += 1) {
                if (*kernelSource != 0) {
                    *dest = *kernelSource;
                    dest++;
                    *NNZMap = *NNZMap + 1;
                    *dataOffsetMap = columOffset;
                    dataOffsetMap++;
                    columOffset = 0;
                }
                columOffset += eP;
                kernelSource += kernelCount;
            }
        }
        NNZMap++;
        columOffset -= l * eP;
        subSource += l;
    }

    *dataOffsetMap = columOffset; // trailing offset: advance left over after the last non-zero

    return;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Report the packing geometry used by the sparse int8 matmul kernels.
//   eP: e-tile width consumed per main-loop iteration (architecture dependent)
//   lP: packing unit along the reduce dimension
//   hP: sparse block size along the right-matrix column (output channel)
//       dimension; for random (unstructured) sparsity it would be 1.
void MNNGetSparseQuantMatMulPackMode(int* eP, int *lP, int* hP) {
    *lP = 1;
    *hP = 4;
#if defined(__arm__) && !defined(__aarch64__)
    *eP = 8;   // armv7: smaller e-tile
#else
    *eP = 16;
#endif
}
|
|
|
|
|
2023-06-16 09:42:45 +08:00
|
|
|
// Repack int8 activation tiles from the C4 source layout into the e x l
// layout consumed by the sparse quantized matmul kernels.
//   info = {number, eReal, eDest, offset}; each el[4*n..4*n+3] describes one
//   sub-tile as {e, l, eOffset, lOffset}.
static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
    const int tileCount = info[0];
    const int eReal     = info[1];
    const int eDest     = info[2];
    const int srcStep   = info[3];
    for (int t = 0; t < tileCount; ++t) {
        const int e       = el[4 * t + 0];
        const int l       = el[4 * t + 1];
        const int eOffset = el[4 * t + 2];
        const int lOffset = el[4 * t + 3];
        int8_t* outBase      = destOrigin + lOffset * eDest + eOffset;
        const int8_t* inBase = sourceGroup[t];
        for (int row = 0; row < e; ++row) {
            const int dstRow = row % eDest;
            for (int col = 0; col < l; ++col) {
                // Source channels are packed in groups of 4: split col into
                // (group, lane) to address the C4 layout.
                const int lane  = col % 4;
                const int group = col / 4;
                outBase[col * eDest + dstRow] = inBase[group * eReal * 4 + row * 4 * srcStep + lane];
            }
        }
    }
}
|
|
|
|
|
2023-09-04 10:42:11 +08:00
|
|
|
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
|
|
|
|
#ifdef MNN_USE_SSE
|
|
|
|
uint8_t* srcPtr = (uint8_t*)src;
|
|
|
|
uint8_t* dstPtr = (uint8_t*)dst;
|
|
|
|
int offset = 128;
|
|
|
|
#else
|
|
|
|
const int8_t* srcPtr = src;
|
|
|
|
int8_t* dstPtr = dst;
|
|
|
|
int offset = 0;
|
|
|
|
#endif
|
|
|
|
int inpZero = static_cast<int>(params->inputZeroPoint[0]);
|
|
|
|
int outZero = static_cast<int>(params->outputZeroPoint[0]);
|
|
|
|
float inpScale = params->inputScale[0];
|
|
|
|
float outScale = params->outputScale[0];
|
|
|
|
float sum = 0.f;
|
|
|
|
int max_ = static_cast<int>(params->maxValue);
|
|
|
|
int min_ = static_cast<int>(params->minValue);
|
|
|
|
for (int j = 0; j < size; ++j) {
|
|
|
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
|
|
|
sum += fx;
|
|
|
|
}
|
|
|
|
float mean = sum / size;
|
|
|
|
float square_sum = 0.f;
|
|
|
|
for (int j = 0; j < size; ++j) {
|
|
|
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
|
|
|
square_sum += (fx - mean) * (fx - mean);
|
|
|
|
}
|
|
|
|
float variable = square_sum / size;
|
|
|
|
variable = 1.f / std::sqrt(variable + epsilon);
|
|
|
|
|
|
|
|
if (gamma && beta) {
|
|
|
|
for (int j = 0; j < size; ++j) {
|
|
|
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
|
|
|
float fy = (fx - mean) * variable * gamma[j] + beta[j];
|
|
|
|
int sy = fy * outScale + outZero;
|
|
|
|
sy = ALIMAX(min_, ALIMIN(sy, max_));
|
|
|
|
dstPtr[j] = sy + offset;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int j = 0; j < size; ++j) {
|
|
|
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
|
|
|
float fy = (fx - mean) * variable;
|
|
|
|
int sy = roundf(fy * outScale) + outZero;
|
|
|
|
sy = ALIMAX(min_, ALIMIN(sy, max_));
|
|
|
|
dstPtr[j] = sy + offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
#ifndef MNN_USE_NEON
|
2021-09-18 15:52:30 +08:00
|
|
|
|
|
|
|
/* Sparse quantized matmul, Ep x 1 reference kernel (compiled only when
   MNN_USE_NEON is off): C = A * B with B sparse, one scalar weight per
   stored non-zero.
   sparseQuantParam = {eSize, eP, aStride, l, h, cStride}:
     eSize:   valid e-dimension width of packed A
     eP:      e-tile width the main loop consumes per iteration
     aStride: advance of the A pointer after one full e-tile
     l:       reduce-dimension length (not read here; A advances are encoded
              in dataOffsetMap instead)
     h:       number of output channels (rows of B)
     cStride: stride between C4 row-packs of the output
   post: bias / per-channel scales / clamp bounds applied after accumulation.
   NNZMap holds the non-zero count per output row; dataOffsetMap holds one
   pointer advance for A per non-zero (the first entry is consumed before the
   first multiply) — presumably as produced by MNNPackForSparseQuantMatMul_B.
   Output is written in C4 layout: row ih lands at pack (ih>>2), lane (ih&3). */
void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {

    size_t eSize = sparseQuantParam[0];
    size_t eP = sparseQuantParam[1];
    size_t aStride = sparseQuantParam[2];
    size_t l = sparseQuantParam[3];
    size_t h = sparseQuantParam[4];
    size_t cStride = sparseQuantParam[5];

    const int32_t* bias = post->bias;
    const float* scales = post->scale;
    const int32_t maxValue = post->maxValue;
    const int32_t minValue = post->minValue;

    const int sparseBlockOC = 4;
    const int8_t * a = A;
    size_t ie = 0;
    // Main loop: full eP-wide tiles (16 lanes on the paths where eP == 16).
    for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
        const int* dataOffset = dataOffsetMap;
        const int diff = *dataOffset++;
        a += diff;
        const int8_t * w = B;
        int8_t * blockC = C + (ie << 2);
        const unsigned int* nnz = NNZMap;

        for (size_t ih = 0; ih < h; ih++) {
            // C4 output addressing: pack index and lane within the pack.
            auto ihPack = ih >> 2;
            auto ihSubIndex = ih & 0x03;
            auto c = blockC + ihPack * cStride + ihSubIndex;
            const int32_t initValue = nullptr != bias ? bias[ih] : 0;
            int32_t acc0 = initValue;
            int32_t acc1 = initValue;
            int32_t acc2 = initValue;
            int32_t acc3 = initValue;
            int32_t acc4 = initValue;
            int32_t acc5 = initValue;
            int32_t acc6 = initValue;
            int32_t acc7 = initValue;
            int32_t acc8 = initValue;
            int32_t acc9 = initValue;
            int32_t acc10 = initValue;
            int32_t acc11 = initValue;
            int32_t acc12 = initValue;
            int32_t acc13 = initValue;
            int32_t acc14 = initValue;
            int32_t acc15 = initValue;
            const int lElement = *nnz++;
            // One iteration per stored non-zero weight of row ih.
            for (auto il = 0; il < lElement; il++) {

                const int diff = *dataOffset++;
                const int8_t a0 = a[0];
                const int8_t a1 = a[1];
                const int8_t a2 = a[2];
                const int8_t a3 = a[3];
                const int8_t a4 = a[4];
                const int8_t a5 = a[5];
                const int8_t a6 = a[6];
                const int8_t a7 = a[7];
                const int8_t a8 = a[8];
                const int8_t a9 = a[9];
                const int8_t a10 = a[10];
                const int8_t a11 = a[11];
                const int8_t a12 = a[12];
                const int8_t a13 = a[13];
                const int8_t a14 = a[14];
                const int8_t a15 = a[15];

                const int8_t oneW = *w++;

                // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
                // formatMatrix(a, {16});
                // MNN_PRINT("\n");
                a = a + diff;
                acc0 += (int32_t)a0 * (int32_t)oneW;
                acc1 += (int32_t)a1 * (int32_t)oneW;
                acc2 += (int32_t)a2 * (int32_t)oneW;
                acc3 += (int32_t)a3 * (int32_t)oneW;
                acc4 += (int32_t)a4 * (int32_t)oneW;
                acc5 += (int32_t)a5 * (int32_t)oneW;
                acc6 += (int32_t)a6 * (int32_t)oneW;
                acc7 += (int32_t)a7 * (int32_t)oneW;
                acc8 += (int32_t)a8 * (int32_t)oneW;
                acc9 += (int32_t)a9 * (int32_t)oneW;
                acc10 += (int32_t)a10 * (int32_t)oneW;
                acc11 += (int32_t)a11 * (int32_t)oneW;
                acc12 += (int32_t)a12 * (int32_t)oneW;
                acc13 += (int32_t)a13 * (int32_t)oneW;
                acc14 += (int32_t)a14 * (int32_t)oneW;
                acc15 += (int32_t)a15 * (int32_t)oneW;
            }

            int8_t result0; // in assembly code, consider reusing acc0[0-8] bits
            int8_t result1;
            int8_t result2;
            int8_t result3;
            int8_t result4;
            int8_t result5;
            int8_t result6;
            int8_t result7;
            int8_t result8;
            int8_t result9;
            int8_t result10;
            int8_t result11;
            int8_t result12;
            int8_t result13;
            int8_t result14;
            int8_t result15;

            // Requantize with per-channel scale (rounded) when provided,
            // otherwise clamp the raw int32 accumulators.
            if (scales) {
                result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
                result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
                result2 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc2)), float(minValue))));
                result3 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc3)), float(minValue))));
                result4 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc4)), float(minValue))));
                result5 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc5)), float(minValue))));
                result6 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc6)), float(minValue))));
                result7 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc7)), float(minValue))));
                result8 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc8)), float(minValue))));
                result9 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc9)), float(minValue))));
                result10 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc10)), float(minValue))));
                result11 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc11)), float(minValue))));
                result12 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc12)), float(minValue))));
                result13 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc13)), float(minValue))));
                result14 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc14)), float(minValue))));
                result15 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc15)), float(minValue))));
            } else {
                result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
                result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
                result2 = static_cast<int8_t>(std::max(std::min(maxValue, acc2), minValue));
                result3 = static_cast<int8_t>(std::max(std::min(maxValue, acc3), minValue));
                result4 = static_cast<int8_t>(std::max(std::min(maxValue, acc4), minValue));
                result5 = static_cast<int8_t>(std::max(std::min(maxValue, acc5), minValue));
                result6 = static_cast<int8_t>(std::max(std::min(maxValue, acc6), minValue));
                result7 = static_cast<int8_t>(std::max(std::min(maxValue, acc7), minValue));
                result8 = static_cast<int8_t>(std::max(std::min(maxValue, acc8), minValue));
                result9 = static_cast<int8_t>(std::max(std::min(maxValue, acc9), minValue));
                result10 = static_cast<int8_t>(std::max(std::min(maxValue, acc10), minValue));
                result11 = static_cast<int8_t>(std::max(std::min(maxValue, acc11), minValue));
                result12 = static_cast<int8_t>(std::max(std::min(maxValue, acc12), minValue));
                result13 = static_cast<int8_t>(std::max(std::min(maxValue, acc13), minValue));
                result14 = static_cast<int8_t>(std::max(std::min(maxValue, acc14), minValue));
                result15 = static_cast<int8_t>(std::max(std::min(maxValue, acc15), minValue));
            }

            // how to store faster: st4 / transpose /
            // (stride 4 because C4 packs interleave 4 channels per e element)
            c[0] = result0;
            c[4] = result1;
            c[4 * 2] = result2;
            c[4 * 3] = result3;
            c[4 * 4] = result4;
            c[4 * 5] = result5;
            c[4 * 6] = result6;
            c[4 * 7] = result7;
            c[4 * 8] = result8;
            c[4 * 9] = result9;
            c[4 * 10] = result10;
            c[4 * 11] = result11;
            c[4 * 12] = result12;
            c[4 * 13] = result13;
            c[4 * 14] = result14;
            c[4 * 15] = result15;
        }
        a += aStride;
    }
    // Remainder handling: the leftover (eSize % eP) columns are processed in
    // progressively narrower 8/4/2/1-wide passes below.
    if (eSize & 0x08) {
        const int* dataOffset = dataOffsetMap;
        const int diff = *dataOffset++;
        // a = blockA + diff;
        a += diff;
        const int8_t* w = B;
        int8_t* blockC = C + (ie << 2);
        const unsigned int* nnz = NNZMap;
        for (size_t ih = 0; ih < h; ih++) {
            auto ihPack = ih >> 2;
            auto ihSubIndex = ih & 0x03;
            auto c = blockC + ihPack * cStride + ihSubIndex;
            const int32_t initValue = nullptr != bias ? bias[ih] : 0;
            int32_t acc0 = initValue;
            int32_t acc1 = initValue;
            int32_t acc2 = initValue;
            int32_t acc3 = initValue;
            int32_t acc4 = initValue;
            int32_t acc5 = initValue;
            int32_t acc6 = initValue;
            int32_t acc7 = initValue;

            const int lElement = *nnz++;
            for (auto il = 0; il < lElement; il++) {
                const int diff = *dataOffset++;
                const int8_t a0 = a[0];
                const int8_t a1 = a[1];
                const int8_t a2 = a[2];
                const int8_t a3 = a[3];
                const int8_t a4 = a[4];
                const int8_t a5 = a[5];
                const int8_t a6 = a[6];
                const int8_t a7 = a[7];
                const int8_t oneW = *w++;
                // MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%d, a value[0-7]:\n", ie, a - A, w - B - 1, c - C, oneW);
                // formatMatrix(a, {8});
                // MNN_PRINT("\n");
                a = a + diff;
                acc0 += int32_t(a0) * int32_t(oneW);
                acc1 += int32_t(a1) * int32_t(oneW);
                acc2 += int32_t(a2) * int32_t(oneW);
                acc3 += int32_t(a3) * int32_t(oneW);
                acc4 += int32_t(a4) * int32_t(oneW);
                acc5 += int32_t(a5) * int32_t(oneW);
                acc6 += int32_t(a6) * int32_t(oneW);
                acc7 += int32_t(a7) * int32_t(oneW);
            }

            int8_t result0;
            int8_t result1;
            int8_t result2;
            int8_t result3;
            int8_t result4;
            int8_t result5;
            int8_t result6;
            int8_t result7;
            if (scales) {
                result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
                result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
                result2 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc2)), float(minValue))));
                result3 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc3)), float(minValue))));
                result4 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc4)), float(minValue))));
                result5 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc5)), float(minValue))));
                result6 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc6)), float(minValue))));
                result7 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc7)), float(minValue))));

            } else {
                result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
                result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
                result2 = static_cast<int8_t>(std::max(std::min(maxValue, acc2), minValue));
                result3 = static_cast<int8_t>(std::max(std::min(maxValue, acc3), minValue));
                result4 = static_cast<int8_t>(std::max(std::min(maxValue, acc4), minValue));
                result5 = static_cast<int8_t>(std::max(std::min(maxValue, acc5), minValue));
                result6 = static_cast<int8_t>(std::max(std::min(maxValue, acc6), minValue));
                result7 = static_cast<int8_t>(std::max(std::min(maxValue, acc7), minValue));
            }

            // how to store faster: st4 / transpose /
            c[0] = result0;
            c[4] = result1;
            c[4 * 2] = result2;
            c[4 * 3] = result3;
            c[4 * 4] = result4;
            c[4 * 5] = result5;
            c[4 * 6] = result6;
            c[4 * 7] = result7;
        }
        ie += 8;
        a += 8;
    }
    if (eSize & 0x04) {
        const int* dataOffset = dataOffsetMap;
        const int diff = *dataOffset++;
        // a = blockA + diff;
        a += diff;
        const int8_t* w = B;
        int8_t* blockC = C + (ie << 2);
        const unsigned int* nnz = NNZMap;

        for (size_t ih = 0; ih < h; ih++) {
            auto ihPack = ih >> 2;
            auto ihSubIndex = ih & 0x03;
            auto c = blockC + ihPack * cStride + ihSubIndex;
            const int32_t initValue = nullptr != bias ? bias[ih] : 0;
            int32_t acc0 = initValue;
            int32_t acc1 = initValue;
            int32_t acc2 = initValue;
            int32_t acc3 = initValue;

            const int lElement = *nnz++;
            for (auto il = 0; il < lElement; il++) {
                const int diff = *dataOffset++;
                const int8_t a0 = a[0];
                const int8_t a1 = a[1];
                const int8_t a2 = a[2];
                const int8_t a3 = a[3];
                const int8_t oneW = *w++;
                // MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%d, a value[0-3]:\n", ie, a - A, w - B - 1, c - C, oneW);
                // formatMatrix(a, {4});
                // MNN_PRINT("\n");
                a = a + diff;
                acc0 += int32_t(a0) * int32_t(oneW);
                acc1 += int32_t(a1) * int32_t(oneW);
                acc2 += int32_t(a2) * int32_t(oneW);
                acc3 += int32_t(a3) * int32_t(oneW);
            }

            int8_t result0;
            int8_t result1;
            int8_t result2;
            int8_t result3;
            if (scales) {
                result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
                result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
                result2 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc2)), float(minValue))));
                result3 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc3)), float(minValue))));
            } else {
                result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
                result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
                result2 = static_cast<int8_t>(std::max(std::min(maxValue, acc2), minValue));
                result3 = static_cast<int8_t>(std::max(std::min(maxValue, acc3), minValue));
            }

            // how to store faster: st4 / transpose /
            c[0] = result0;
            c[4] = result1;
            c[4 * 2] = result2;
            c[4 * 3] = result3;
        }
        ie += 4;
        a += 4;
    }
    if (eSize & 0x02) {
        const int* dataOffset = dataOffsetMap;
        const int diff = *dataOffset++;
        // a = blockA + diff;
        a += diff;
        const int8_t* w = B;
        int8_t* blockC = C + (ie << 2);
        const unsigned int* nnz = NNZMap;
        for (size_t ih = 0; ih < h; ih++) {
            auto ihPack = ih >> 2;
            auto ihSubIndex = ih & 0x03;
            auto c = blockC + ihPack * cStride + ihSubIndex;
            const int32_t initValue = nullptr != bias ? bias[ih] : 0;
            int32_t acc0 = initValue;
            int32_t acc1 = initValue;

            const int lElement = *nnz++;
            for (auto il = 0; il < lElement; il++) {
                const int diff = *dataOffset++;
                const int8_t a0 = a[0];
                const int8_t a1 = a[1];
                const int8_t oneW = *w++;
                // MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%d, a value[0-1]:\n", ie, a - A, w - B - 1, c - C, oneW);
                // formatMatrix(a, {2});
                // MNN_PRINT("\n");
                a = a + diff;
                acc0 += int32_t(a0) * int32_t(oneW);
                acc1 += int32_t(a1) * int32_t(oneW);
            }

            int8_t result0;
            int8_t result1;
            if (scales) {
                result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
                result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
            } else {
                result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
                result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
            }

            // how to store faster: st4 / transpose /
            c[0] = result0;
            c[4] = result1;
        }
        ie += 2;
        a += 2;
    }
    if (eSize & 0x01) {
        const int* dataOffset = dataOffsetMap;
        const int diff = *dataOffset++;
        // const float* a = blockA + diff;
        a += diff;
        const int8_t * w = B;
        int8_t * blockC = C + (ie << 2);
        const unsigned int* nnz = NNZMap;
        for (size_t ih = 0; ih < h; ih++) {
            auto ihPack = ih >> 2;
            auto ihSubIndex = ih & 0x03;
            auto c = blockC + ihPack * cStride + ihSubIndex;
            const int32_t initValue = nullptr != bias ? bias[ih] : 0;
            int32_t acc0 = initValue;

            const int lElement = *nnz++;
            for (auto il = 0; il < lElement; il++) {
                const int diff = *dataOffset++;
                const int8_t a0 = a[0];
                const int8_t oneW = *w++;

                // MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%d, a value[0]:\n", ie, a - A, w - B - 1, c - C, oneW);
                // formatMatrix(a, {1});
                // MNN_PRINT("\n");
                a = a + diff;
                acc0 += int32_t(a0) * int32_t(oneW);
            }
            int8_t result0;
            if (scales) {
                result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
            } else {
                result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
            }
            // how to store faster: st4 / transpose /
            c[0] = result0;
        }
        ie += 1;
        // a += 1;
    }

}
|
|
|
|
|
|
|
|
void MNNPackedSparseQuantMatMulEpx4(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {
|
|
|
|
|
|
|
|
size_t eSize = sparseQuantParam[0];
|
|
|
|
size_t eP = sparseQuantParam[1];
|
|
|
|
size_t aStride = sparseQuantParam[2];
|
|
|
|
size_t l = sparseQuantParam[3];
|
|
|
|
size_t h = sparseQuantParam[4];
|
|
|
|
size_t cStride = sparseQuantParam[5];
|
|
|
|
|
|
|
|
const int32_t* bias = post->bias;
|
|
|
|
const float* scales = post->scale;
|
|
|
|
const int32_t maxValue = post->maxValue;
|
|
|
|
const int32_t minValue = post->minValue;
|
|
|
|
|
|
|
|
const int sparseBlockOC = 4;
|
|
|
|
const int8_t * a = A;
|
|
|
|
size_t ie = 0;
|
|
|
|
for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
|
|
|
|
const int* dataOffset = dataOffsetMap;
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
a += diff;
|
|
|
|
const int8_t * w = B;
|
|
|
|
int8_t * blockC = C + (ie << 2);
|
|
|
|
const unsigned int* nnz = NNZMap;
|
|
|
|
|
|
|
|
size_t ih = 0;
|
|
|
|
for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
|
|
|
|
auto ihPack = ih >> 2;
|
|
|
|
auto c = blockC + ihPack * cStride;
|
|
|
|
|
|
|
|
int32_t initValue[4] = {0, 0, 0, 0};
|
|
|
|
if (nullptr != bias) {
|
|
|
|
memcpy(initValue, bias + ih, 4 * sizeof(int32_t));
|
|
|
|
}
|
|
|
|
int32_t acc0[4];
|
|
|
|
int32_t acc1[4];
|
|
|
|
int32_t acc2[4];
|
|
|
|
int32_t acc3[4];
|
|
|
|
int32_t acc4[4];
|
|
|
|
int32_t acc5[4];
|
|
|
|
int32_t acc6[4];
|
|
|
|
int32_t acc7[4];
|
|
|
|
int32_t acc8[4];
|
|
|
|
int32_t acc9[4];
|
|
|
|
int32_t acc10[4];
|
|
|
|
int32_t acc11[4];
|
|
|
|
int32_t acc12[4];
|
|
|
|
int32_t acc13[4];
|
|
|
|
int32_t acc14[4];
|
|
|
|
int32_t acc15[4];
|
|
|
|
|
|
|
|
memcpy(acc0, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc1, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc2, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc3, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc4, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc5, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc6, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc7, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc8, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc9, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc10, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc11, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc12, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc13, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc14, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc15, initValue, 4 * sizeof(int32_t));
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t a2 = a[2];
|
|
|
|
const int8_t a3 = a[3];
|
|
|
|
const int8_t a4 = a[4];
|
|
|
|
const int8_t a5 = a[5];
|
|
|
|
const int8_t a6 = a[6];
|
|
|
|
const int8_t a7 = a[7];
|
|
|
|
const int8_t a8 = a[8];
|
|
|
|
const int8_t a9 = a[9];
|
|
|
|
const int8_t a10 = a[10];
|
|
|
|
const int8_t a11 = a[11];
|
|
|
|
const int8_t a12 = a[12];
|
|
|
|
const int8_t a13 = a[13];
|
|
|
|
const int8_t a14 = a[14];
|
|
|
|
const int8_t a15 = a[15];
|
|
|
|
|
|
|
|
const int8_t wv[4] = {*w++, *w++, *w++, *w++};
|
|
|
|
|
|
|
|
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
|
|
|
|
// formatMatrix(a, {16});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
acc0[lane] += (int32_t)a0 * (int32_t)wv[lane];
|
|
|
|
acc1[lane] += (int32_t)a1 * (int32_t)wv[lane];
|
|
|
|
acc2[lane] += (int32_t)a2 * (int32_t)wv[lane];
|
|
|
|
acc3[lane] += (int32_t)a3 * (int32_t)wv[lane];
|
|
|
|
acc4[lane] += (int32_t)a4 * (int32_t)wv[lane];
|
|
|
|
acc5[lane] += (int32_t)a5 * (int32_t)wv[lane];
|
|
|
|
acc6[lane] += (int32_t)a6 * (int32_t)wv[lane];
|
|
|
|
acc7[lane] += (int32_t)a7 * (int32_t)wv[lane];
|
|
|
|
acc8[lane] += (int32_t)a8 * (int32_t)wv[lane];
|
|
|
|
acc9[lane] += (int32_t)a9 * (int32_t)wv[lane];
|
|
|
|
acc10[lane] += (int32_t)a10 * (int32_t)wv[lane];
|
|
|
|
acc11[lane] += (int32_t)a11 * (int32_t)wv[lane];
|
|
|
|
acc12[lane] += (int32_t)a12 * (int32_t)wv[lane];
|
|
|
|
acc13[lane] += (int32_t)a13 * (int32_t)wv[lane];
|
|
|
|
acc14[lane] += (int32_t)a14 * (int32_t)wv[lane];
|
|
|
|
acc15[lane] += (int32_t)a15 * (int32_t)wv[lane];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0[4];
|
|
|
|
int8_t result1[4];
|
|
|
|
int8_t result2[4];
|
|
|
|
int8_t result3[4];
|
|
|
|
int8_t result4[4];
|
|
|
|
int8_t result5[4];
|
|
|
|
int8_t result6[4];
|
|
|
|
int8_t result7[4];
|
|
|
|
int8_t result8[4];
|
|
|
|
int8_t result9[4];
|
|
|
|
int8_t result10[4];
|
|
|
|
int8_t result11[4];
|
|
|
|
int8_t result12[4];
|
|
|
|
int8_t result13[4];
|
|
|
|
int8_t result14[4];
|
|
|
|
int8_t result15[4];
|
|
|
|
|
|
|
|
if (scales) {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc0[lane])), float(minValue))));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc1[lane])), float(minValue))));
|
|
|
|
result2[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc2[lane])), float(minValue))));
|
|
|
|
result3[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc3[lane])), float(minValue))));
|
|
|
|
result4[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc4[lane])), float(minValue))));
|
|
|
|
result5[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc5[lane])), float(minValue))));
|
|
|
|
result6[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc6[lane])), float(minValue))));
|
|
|
|
result7[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc7[lane])), float(minValue))));
|
|
|
|
result8[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc8[lane])), float(minValue))));
|
|
|
|
result9[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc9[lane])), float(minValue))));
|
|
|
|
result10[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc10[lane])), float(minValue))));
|
|
|
|
result11[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc11[lane])), float(minValue))));
|
|
|
|
result12[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc12[lane])), float(minValue))));
|
|
|
|
result13[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc13[lane])), float(minValue))));
|
|
|
|
result14[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc14[lane])), float(minValue))));
|
|
|
|
result15[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc15[lane])), float(minValue))));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc0[lane]), minValue)));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc1[lane]), minValue)));
|
|
|
|
result2[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc2[lane]), minValue)));
|
|
|
|
result3[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc3[lane]), minValue)));
|
|
|
|
result4[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc4[lane]), minValue)));
|
|
|
|
result5[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc5[lane]), minValue)));
|
|
|
|
result6[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc6[lane]), minValue)));
|
|
|
|
result7[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc7[lane]), minValue)));
|
|
|
|
result8[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc8[lane]), minValue)));
|
|
|
|
result9[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc9[lane]), minValue)));
|
|
|
|
result10[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc10[lane]), minValue)));
|
|
|
|
result11[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc11[lane]), minValue)));
|
|
|
|
result12[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc12[lane]), minValue)));
|
|
|
|
result13[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc13[lane]), minValue)));
|
|
|
|
result14[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc14[lane]), minValue)));
|
|
|
|
result15[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc15[lane]), minValue)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(c , result0, 4 * sizeof(int8_t)); // store continuous c
|
|
|
|
memcpy(c + 4 , result1, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 2 , result2, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 3 , result3, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 4 , result4, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 5 , result5, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 6 , result6, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 7 , result7, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 8 , result8, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 9 , result9, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 10, result10, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 11, result11, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 12, result12, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 13, result13, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 14, result14, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 15, result15, 4 * sizeof(int8_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
blockC += (h >> 2) * cStride;
|
|
|
|
for (; ih < h; ih++) {
|
|
|
|
auto ihSubIndex = ih & 0x03;
|
|
|
|
auto c = blockC + ihSubIndex;
|
|
|
|
const int32_t initValue = nullptr != bias ? bias[ih] : 0;
|
|
|
|
int32_t acc0 = initValue;
|
|
|
|
int32_t acc1 = initValue;
|
|
|
|
int32_t acc2 = initValue;
|
|
|
|
int32_t acc3 = initValue;
|
|
|
|
int32_t acc4 = initValue;
|
|
|
|
int32_t acc5 = initValue;
|
|
|
|
int32_t acc6 = initValue;
|
|
|
|
int32_t acc7 = initValue;
|
|
|
|
int32_t acc8 = initValue;
|
|
|
|
int32_t acc9 = initValue;
|
|
|
|
int32_t acc10 = initValue;
|
|
|
|
int32_t acc11 = initValue;
|
|
|
|
int32_t acc12 = initValue;
|
|
|
|
int32_t acc13 = initValue;
|
|
|
|
int32_t acc14 = initValue;
|
|
|
|
int32_t acc15 = initValue;
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t a2 = a[2];
|
|
|
|
const int8_t a3 = a[3];
|
|
|
|
const int8_t a4 = a[4];
|
|
|
|
const int8_t a5 = a[5];
|
|
|
|
const int8_t a6 = a[6];
|
|
|
|
const int8_t a7 = a[7];
|
|
|
|
const int8_t a8 = a[8];
|
|
|
|
const int8_t a9 = a[9];
|
|
|
|
const int8_t a10 = a[10];
|
|
|
|
const int8_t a11 = a[11];
|
|
|
|
const int8_t a12 = a[12];
|
|
|
|
const int8_t a13 = a[13];
|
|
|
|
const int8_t a14 = a[14];
|
|
|
|
const int8_t a15 = a[15];
|
|
|
|
|
|
|
|
const int8_t oneW = *w++;
|
|
|
|
|
|
|
|
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
|
|
|
|
// formatMatrix(a, {16});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
acc0 += (int32_t)a0 * (int32_t)oneW;
|
|
|
|
acc1 += (int32_t)a1 * (int32_t)oneW;
|
|
|
|
acc2 += (int32_t)a2 * (int32_t)oneW;
|
|
|
|
acc3 += (int32_t)a3 * (int32_t)oneW;
|
|
|
|
acc4 += (int32_t)a4 * (int32_t)oneW;
|
|
|
|
acc5 += (int32_t)a5 * (int32_t)oneW;
|
|
|
|
acc6 += (int32_t)a6 * (int32_t)oneW;
|
|
|
|
acc7 += (int32_t)a7 * (int32_t)oneW;
|
|
|
|
acc8 += (int32_t)a8 * (int32_t)oneW;
|
|
|
|
acc9 += (int32_t)a9 * (int32_t)oneW;
|
|
|
|
acc10 += (int32_t)a10 * (int32_t)oneW;
|
|
|
|
acc11 += (int32_t)a11 * (int32_t)oneW;
|
|
|
|
acc12 += (int32_t)a12 * (int32_t)oneW;
|
|
|
|
acc13 += (int32_t)a13 * (int32_t)oneW;
|
|
|
|
acc14 += (int32_t)a14 * (int32_t)oneW;
|
|
|
|
acc15 += (int32_t)a15 * (int32_t)oneW;
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0; // in assemmbly code, consider reuse acc0[0-8] bit
|
|
|
|
int8_t result1;
|
|
|
|
int8_t result2;
|
|
|
|
int8_t result3;
|
|
|
|
int8_t result4;
|
|
|
|
int8_t result5;
|
|
|
|
int8_t result6;
|
|
|
|
int8_t result7;
|
|
|
|
int8_t result8;
|
|
|
|
int8_t result9;
|
|
|
|
int8_t result10;
|
|
|
|
int8_t result11;
|
|
|
|
int8_t result12;
|
|
|
|
int8_t result13;
|
|
|
|
int8_t result14;
|
|
|
|
int8_t result15;
|
|
|
|
|
|
|
|
if (scales) {
|
|
|
|
result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
|
|
|
|
result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
|
|
|
|
result2 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc2)), float(minValue))));
|
|
|
|
result3 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc3)), float(minValue))));
|
|
|
|
result4 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc4)), float(minValue))));
|
|
|
|
result5 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc5)), float(minValue))));
|
|
|
|
result6 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc6)), float(minValue))));
|
|
|
|
result7 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc7)), float(minValue))));
|
|
|
|
result8 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc8)), float(minValue))));
|
|
|
|
result9 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc9)), float(minValue))));
|
|
|
|
result10 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc10)), float(minValue))));
|
|
|
|
result11 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc11)), float(minValue))));
|
|
|
|
result12 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc12)), float(minValue))));
|
|
|
|
result13 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc13)), float(minValue))));
|
|
|
|
result14 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc14)), float(minValue))));
|
|
|
|
result15 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc15)), float(minValue))));
|
|
|
|
} else {
|
|
|
|
result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
|
|
|
|
result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
|
|
|
|
result2 = static_cast<int8_t>(std::max(std::min(maxValue, acc2), minValue));
|
|
|
|
result3 = static_cast<int8_t>(std::max(std::min(maxValue, acc3), minValue));
|
|
|
|
result4 = static_cast<int8_t>(std::max(std::min(maxValue, acc4), minValue));
|
|
|
|
result5 = static_cast<int8_t>(std::max(std::min(maxValue, acc5), minValue));
|
|
|
|
result6 = static_cast<int8_t>(std::max(std::min(maxValue, acc6), minValue));
|
|
|
|
result7 = static_cast<int8_t>(std::max(std::min(maxValue, acc7), minValue));
|
|
|
|
result8 = static_cast<int8_t>(std::max(std::min(maxValue, acc8), minValue));
|
|
|
|
result9 = static_cast<int8_t>(std::max(std::min(maxValue, acc9), minValue));
|
|
|
|
result10 = static_cast<int8_t>(std::max(std::min(maxValue, acc10), minValue));
|
|
|
|
result11 = static_cast<int8_t>(std::max(std::min(maxValue, acc11), minValue));
|
|
|
|
result12 = static_cast<int8_t>(std::max(std::min(maxValue, acc12), minValue));
|
|
|
|
result13 = static_cast<int8_t>(std::max(std::min(maxValue, acc13), minValue));
|
|
|
|
result14 = static_cast<int8_t>(std::max(std::min(maxValue, acc14), minValue));
|
|
|
|
result15 = static_cast<int8_t>(std::max(std::min(maxValue, acc15), minValue));
|
|
|
|
}
|
|
|
|
|
|
|
|
// how to store faster: st4 / transpose /
|
|
|
|
c[0] = result0;
|
|
|
|
c[4] = result1;
|
|
|
|
c[4 * 2] = result2;
|
|
|
|
c[4 * 3] = result3;
|
|
|
|
c[4 * 4] = result4;
|
|
|
|
c[4 * 5] = result5;
|
|
|
|
c[4 * 6] = result6;
|
|
|
|
c[4 * 7] = result7;
|
|
|
|
c[4 * 8] = result8;
|
|
|
|
c[4 * 9] = result9;
|
|
|
|
c[4 * 10] = result10;
|
|
|
|
c[4 * 11] = result11;
|
|
|
|
c[4 * 12] = result12;
|
|
|
|
c[4 * 13] = result13;
|
|
|
|
c[4 * 14] = result14;
|
|
|
|
c[4 * 15] = result15;
|
|
|
|
}
|
|
|
|
a += aStride;
|
|
|
|
}
|
|
|
|
if (eSize & 0x08) {
|
|
|
|
const int* dataOffset = dataOffsetMap;
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
// a = blockA + diff;
|
|
|
|
a += diff;
|
|
|
|
const int8_t* w = B;
|
|
|
|
int8_t* blockC = C + (ie << 2);
|
|
|
|
const unsigned int* nnz = NNZMap;
|
|
|
|
|
|
|
|
size_t ih = 0;
|
|
|
|
for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
|
|
|
|
auto ihPack = ih >> 2;
|
|
|
|
auto c = blockC + ihPack * cStride;
|
|
|
|
int32_t initValue[4] = {0, 0, 0, 0};
|
|
|
|
if (nullptr != bias) {
|
|
|
|
memcpy(initValue, bias + ih, 4 * sizeof(int32_t));
|
|
|
|
}
|
|
|
|
int32_t acc0[4];
|
|
|
|
int32_t acc1[4];
|
|
|
|
int32_t acc2[4];
|
|
|
|
int32_t acc3[4];
|
|
|
|
int32_t acc4[4];
|
|
|
|
int32_t acc5[4];
|
|
|
|
int32_t acc6[4];
|
|
|
|
int32_t acc7[4];
|
|
|
|
|
|
|
|
memcpy(acc0, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc1, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc2, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc3, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc4, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc5, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc6, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc7, initValue, 4 * sizeof(int32_t));
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t a2 = a[2];
|
|
|
|
const int8_t a3 = a[3];
|
|
|
|
const int8_t a4 = a[4];
|
|
|
|
const int8_t a5 = a[5];
|
|
|
|
const int8_t a6 = a[6];
|
|
|
|
const int8_t a7 = a[7];
|
|
|
|
const int8_t wv[4] = {*w++, *w++, *w++, *w++};
|
|
|
|
// MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value[0-3]:, a value[0-7]:\n", ie, a - A, w - B - 1, c - C);
|
|
|
|
// formatMatrix(wv, {4});
|
|
|
|
// formatMatrix(a, {8});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
acc0[lane] += int32_t(a0) * int32_t(wv[lane]);
|
|
|
|
acc1[lane] += int32_t(a1) * int32_t(wv[lane]);
|
|
|
|
acc2[lane] += int32_t(a2) * int32_t(wv[lane]);
|
|
|
|
acc3[lane] += int32_t(a3) * int32_t(wv[lane]);
|
|
|
|
acc4[lane] += int32_t(a4) * int32_t(wv[lane]);
|
|
|
|
acc5[lane] += int32_t(a5) * int32_t(wv[lane]);
|
|
|
|
acc6[lane] += int32_t(a6) * int32_t(wv[lane]);
|
|
|
|
acc7[lane] += int32_t(a7) * int32_t(wv[lane]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0[4];
|
|
|
|
int8_t result1[4];
|
|
|
|
int8_t result2[4];
|
|
|
|
int8_t result3[4];
|
|
|
|
int8_t result4[4];
|
|
|
|
int8_t result5[4];
|
|
|
|
int8_t result6[4];
|
|
|
|
int8_t result7[4];
|
|
|
|
|
|
|
|
if (scales) {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc0[lane])), float(minValue))));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc1[lane])), float(minValue))));
|
|
|
|
result2[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc2[lane])), float(minValue))));
|
|
|
|
result3[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc3[lane])), float(minValue))));
|
|
|
|
result4[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc4[lane])), float(minValue))));
|
|
|
|
result5[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc5[lane])), float(minValue))));
|
|
|
|
result6[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc6[lane])), float(minValue))));
|
|
|
|
result7[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc7[lane])), float(minValue))));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc0[lane]), minValue)));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc1[lane]), minValue)));
|
|
|
|
result2[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc2[lane]), minValue)));
|
|
|
|
result3[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc3[lane]), minValue)));
|
|
|
|
result4[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc4[lane]), minValue)));
|
|
|
|
result5[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc5[lane]), minValue)));
|
|
|
|
result6[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc6[lane]), minValue)));
|
|
|
|
result7[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc7[lane]), minValue)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(c , result0, 4 * sizeof(int8_t)); // store continuous c
|
|
|
|
memcpy(c + 4 , result1, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 2 , result2, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 3 , result3, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 4 , result4, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 5 , result5, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 6 , result6, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 7 , result7, 4 * sizeof(int8_t));
|
|
|
|
|
|
|
|
}
|
|
|
|
blockC += (ih >> 2) * cStride;
|
|
|
|
for (; ih < h; ih++) {
|
|
|
|
auto ihSubIndex = ih & 0x03;
|
|
|
|
auto c = blockC + ihSubIndex;
|
|
|
|
const int32_t initValue = nullptr != bias ? bias[ih] : 0;
|
|
|
|
int32_t acc0 = initValue;
|
|
|
|
int32_t acc1 = initValue;
|
|
|
|
int32_t acc2 = initValue;
|
|
|
|
int32_t acc3 = initValue;
|
|
|
|
int32_t acc4 = initValue;
|
|
|
|
int32_t acc5 = initValue;
|
|
|
|
int32_t acc6 = initValue;
|
|
|
|
int32_t acc7 = initValue;
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t a2 = a[2];
|
|
|
|
const int8_t a3 = a[3];
|
|
|
|
const int8_t a4 = a[4];
|
|
|
|
const int8_t a5 = a[5];
|
|
|
|
const int8_t a6 = a[6];
|
|
|
|
const int8_t a7 = a[7];
|
|
|
|
const int8_t oneW = *w++;
|
|
|
|
// MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%d, a value[0-7]:\n", ie, a - A, w - B - 1, c - C, oneW);
|
|
|
|
// formatMatrix(a, {8});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
acc0 += int32_t(a0) * int32_t(oneW);
|
|
|
|
acc1 += int32_t(a1) * int32_t(oneW);
|
|
|
|
acc2 += int32_t(a2) * int32_t(oneW);
|
|
|
|
acc3 += int32_t(a3) * int32_t(oneW);
|
|
|
|
acc4 += int32_t(a4) * int32_t(oneW);
|
|
|
|
acc5 += int32_t(a5) * int32_t(oneW);
|
|
|
|
acc6 += int32_t(a6) * int32_t(oneW);
|
|
|
|
acc7 += int32_t(a7) * int32_t(oneW);
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0;
|
|
|
|
int8_t result1;
|
|
|
|
int8_t result2;
|
|
|
|
int8_t result3;
|
|
|
|
int8_t result4;
|
|
|
|
int8_t result5;
|
|
|
|
int8_t result6;
|
|
|
|
int8_t result7;
|
|
|
|
if (scales) {
|
|
|
|
result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
|
|
|
|
result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
|
|
|
|
result2 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc2)), float(minValue))));
|
|
|
|
result3 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc3)), float(minValue))));
|
|
|
|
result4 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc4)), float(minValue))));
|
|
|
|
result5 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc5)), float(minValue))));
|
|
|
|
result6 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc6)), float(minValue))));
|
|
|
|
result7 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc7)), float(minValue))));
|
|
|
|
|
|
|
|
} else {
|
|
|
|
result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
|
|
|
|
result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
|
|
|
|
result2 = static_cast<int8_t>(std::max(std::min(maxValue, acc2), minValue));
|
|
|
|
result3 = static_cast<int8_t>(std::max(std::min(maxValue, acc3), minValue));
|
|
|
|
result4 = static_cast<int8_t>(std::max(std::min(maxValue, acc4), minValue));
|
|
|
|
result5 = static_cast<int8_t>(std::max(std::min(maxValue, acc5), minValue));
|
|
|
|
result6 = static_cast<int8_t>(std::max(std::min(maxValue, acc6), minValue));
|
|
|
|
result7 = static_cast<int8_t>(std::max(std::min(maxValue, acc7), minValue));
|
|
|
|
}
|
|
|
|
|
|
|
|
// how to store faster: st4 / transpose /
|
|
|
|
c[0] = result0;
|
|
|
|
c[4] = result1;
|
|
|
|
c[4 * 2] = result2;
|
|
|
|
c[4 * 3] = result3;
|
|
|
|
c[4 * 4] = result4;
|
|
|
|
c[4 * 5] = result5;
|
|
|
|
c[4 * 6] = result6;
|
|
|
|
c[4 * 7] = result7;
|
|
|
|
}
|
|
|
|
ie += 8;
|
|
|
|
a += 8;
|
|
|
|
}
|
|
|
|
if (eSize & 0x04) {
|
|
|
|
const int* dataOffset = dataOffsetMap;
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
// a = blockA + diff;
|
|
|
|
a += diff;
|
|
|
|
const int8_t* w = B;
|
|
|
|
int8_t* blockC = C + (ie << 2);
|
|
|
|
const unsigned int* nnz = NNZMap;
|
|
|
|
|
|
|
|
size_t ih = 0;
|
|
|
|
for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
|
|
|
|
auto ihPack = ih >> 2;
|
|
|
|
auto c = blockC + ihPack * cStride;
|
|
|
|
int32_t initValue[4] = {0, 0, 0, 0};
|
|
|
|
if (nullptr != bias) {
|
|
|
|
memcpy(initValue, bias + ih, 4 * sizeof(int32_t));
|
|
|
|
}
|
|
|
|
int32_t acc0[4];
|
|
|
|
int32_t acc1[4];
|
|
|
|
int32_t acc2[4];
|
|
|
|
int32_t acc3[4];
|
|
|
|
|
|
|
|
memcpy(acc0, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc1, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc2, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc3, initValue, 4 * sizeof(int32_t));
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t a2 = a[2];
|
|
|
|
const int8_t a3 = a[3];
|
|
|
|
const int8_t wv[4] = {*w++, *w++, *w++, *w++};
|
|
|
|
// MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:, a value[0-3]:\n", ie, a - A, w - B - 1, c - C);
|
|
|
|
// formatMatrix(wv, {4});
|
|
|
|
// formatMatrix(a, {4});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
acc0[lane] += int32_t(a0) * int32_t(wv[lane]);
|
|
|
|
acc1[lane] += int32_t(a1) * int32_t(wv[lane]);
|
|
|
|
acc2[lane] += int32_t(a2) * int32_t(wv[lane]);
|
|
|
|
acc3[lane] += int32_t(a3) * int32_t(wv[lane]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0[4];
|
|
|
|
int8_t result1[4];
|
|
|
|
int8_t result2[4];
|
|
|
|
int8_t result3[4];
|
|
|
|
|
|
|
|
if (scales) {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc0[lane])), float(minValue))));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc1[lane])), float(minValue))));
|
|
|
|
result2[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc2[lane])), float(minValue))));
|
|
|
|
result3[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc3[lane])), float(minValue))));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc0[lane]), minValue)));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc1[lane]), minValue)));
|
|
|
|
result2[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc2[lane]), minValue)));
|
|
|
|
result3[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc3[lane]), minValue)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(c , result0, 4 * sizeof(int8_t)); // store continuous c
|
|
|
|
memcpy(c + 4 , result1, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 2 , result2, 4 * sizeof(int8_t));
|
|
|
|
memcpy(c + 4 * 3 , result3, 4 * sizeof(int8_t));
|
|
|
|
|
|
|
|
}
|
|
|
|
blockC += (ih >> 2) * cStride;
|
|
|
|
for (; ih < h; ih++) {
|
|
|
|
auto ihSubIndex = ih & 0x03;
|
|
|
|
auto c = blockC + ihSubIndex;
|
|
|
|
const int32_t initValue = nullptr != bias ? bias[ih] : 0;
|
|
|
|
int32_t acc0 = initValue;
|
|
|
|
int32_t acc1 = initValue;
|
|
|
|
int32_t acc2 = initValue;
|
|
|
|
int32_t acc3 = initValue;
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t a2 = a[2];
|
|
|
|
const int8_t a3 = a[3];
|
|
|
|
const int8_t oneW = *w++;
|
|
|
|
// MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%d, a value[0-3]:\n", ie, a - A, w - B - 1, c - C, oneW);
|
|
|
|
// formatMatrix(a, {4});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
acc0 += int32_t(a0) * int32_t(oneW);
|
|
|
|
acc1 += int32_t(a1) * int32_t(oneW);
|
|
|
|
acc2 += int32_t(a2) * int32_t(oneW);
|
|
|
|
acc3 += int32_t(a3) * int32_t(oneW);
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0;
|
|
|
|
int8_t result1;
|
|
|
|
int8_t result2;
|
|
|
|
int8_t result3;
|
|
|
|
if (scales) {
|
|
|
|
result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
|
|
|
|
result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
|
|
|
|
result2 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc2)), float(minValue))));
|
|
|
|
result3 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc3)), float(minValue))));
|
|
|
|
} else {
|
|
|
|
result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
|
|
|
|
result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
|
|
|
|
result2 = static_cast<int8_t>(std::max(std::min(maxValue, acc2), minValue));
|
|
|
|
result3 = static_cast<int8_t>(std::max(std::min(maxValue, acc3), minValue));
|
|
|
|
}
|
|
|
|
|
|
|
|
// how to store faster: st4 / transpose /
|
|
|
|
c[0] = result0;
|
|
|
|
c[4] = result1;
|
|
|
|
c[4 * 2] = result2;
|
|
|
|
c[4 * 3] = result3;
|
|
|
|
}
|
|
|
|
ie += 4;
|
|
|
|
a += 4;
|
|
|
|
}
|
|
|
|
if (eSize & 0x02) {
|
|
|
|
const int* dataOffset = dataOffsetMap;
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
// a = blockA + diff;
|
|
|
|
a += diff;
|
|
|
|
const int8_t* w = B;
|
|
|
|
int8_t* blockC = C + (ie << 2);
|
|
|
|
const unsigned int* nnz = NNZMap;
|
|
|
|
|
|
|
|
size_t ih = 0;
|
|
|
|
for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
|
|
|
|
auto ihPack = ih >> 2;
|
|
|
|
auto c = blockC + ihPack * cStride;
|
|
|
|
int32_t initValue[4] = {0, 0, 0, 0};
|
|
|
|
if (nullptr != bias) {
|
|
|
|
memcpy(initValue, bias + ih, 4 * sizeof(int32_t));
|
|
|
|
}
|
|
|
|
int32_t acc0[4];
|
|
|
|
int32_t acc1[4];
|
|
|
|
memcpy(acc0, initValue, 4 * sizeof(int32_t));
|
|
|
|
memcpy(acc1, initValue, 4 * sizeof(int32_t));
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t wv[4] = {*w++, *w++, *w++, *w++};
|
|
|
|
// MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:, a value[0-1]:\n", ie, a - A, w - B - 1, c - C);
|
|
|
|
// formatMatrix(wv, {4});
|
|
|
|
// formatMatrix(a, {2});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
acc0[lane] += int32_t(a0) * int32_t(wv[lane]);
|
|
|
|
acc1[lane] += int32_t(a1) * int32_t(wv[lane]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0[4];
|
|
|
|
int8_t result1[4];
|
|
|
|
if (scales) {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc0[lane])), float(minValue))));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc1[lane])), float(minValue))));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc0[lane]), minValue)));
|
|
|
|
result1[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc1[lane]), minValue)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(c , result0, 4 * sizeof(int8_t)); // store continuous c
|
|
|
|
memcpy(c + 4 , result1, 4 * sizeof(int8_t));
|
|
|
|
}
|
|
|
|
blockC += (ih >> 2) * cStride;
|
|
|
|
for (; ih < h; ih++) {
|
|
|
|
auto ihSubIndex = ih & 0x03;
|
|
|
|
auto c = blockC + ihSubIndex;
|
|
|
|
const int32_t initValue = nullptr != bias ? bias[ih] : 0;
|
|
|
|
int32_t acc0 = initValue;
|
|
|
|
int32_t acc1 = initValue;
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t a1 = a[1];
|
|
|
|
const int8_t oneW = *w++;
|
|
|
|
// MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%d, a value[0-1]:\n", ie, a - A, w - B - 1, c - C, oneW);
|
|
|
|
// formatMatrix(a, {2});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
acc0 += int32_t(a0) * int32_t(oneW);
|
|
|
|
acc1 += int32_t(a1) * int32_t(oneW);
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0;
|
|
|
|
int8_t result1;
|
|
|
|
if (scales) {
|
|
|
|
result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
|
|
|
|
result1 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc1)), float(minValue))));
|
|
|
|
} else {
|
|
|
|
result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
|
|
|
|
result1 = static_cast<int8_t>(std::max(std::min(maxValue, acc1), minValue));
|
|
|
|
}
|
|
|
|
|
|
|
|
// how to store faster: st4 / transpose /
|
|
|
|
c[0] = result0;
|
|
|
|
c[4] = result1;
|
|
|
|
}
|
|
|
|
ie += 2;
|
|
|
|
a += 2;
|
|
|
|
}
|
|
|
|
if (eSize & 0x01) {
|
|
|
|
const int* dataOffset = dataOffsetMap;
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
// const float* a = blockA + diff;
|
|
|
|
a += diff;
|
|
|
|
const int8_t * w = B;
|
|
|
|
int8_t * blockC = C + (ie << 2);
|
|
|
|
const unsigned int* nnz = NNZMap;
|
|
|
|
|
|
|
|
size_t ih = 0;
|
|
|
|
for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
|
|
|
|
auto ihPack = ih >> 2;
|
|
|
|
auto c = blockC + ihPack * cStride;
|
|
|
|
int32_t initValue[4] = {0, 0, 0, 0};
|
|
|
|
if (nullptr != bias) {
|
|
|
|
memcpy(initValue, bias + ih, 4 * sizeof(int32_t));
|
|
|
|
}
|
|
|
|
int32_t acc0[4];
|
|
|
|
memcpy(acc0, initValue, 4 * sizeof(int32_t));
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t wv[4] = {*w++, *w++, *w++, *w++};
|
|
|
|
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:, a value[0-1]:\n", ie, a - A, w - B - 1, c - C);
|
|
|
|
// formatMatrix(wv, {4});
|
|
|
|
// formatMatrix(a, {16});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
acc0[lane] += int32_t(a0) * int32_t(wv[lane]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int8_t result0[4];
|
|
|
|
if (scales) {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih + lane] * float(acc0[lane])), float(minValue))));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int lane = 0; lane < 4; lane++) {
|
|
|
|
result0[lane] = static_cast<int8_t>(roundf(std::max(std::min(maxValue, acc0[lane]), minValue)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
memcpy(c, result0, 4 * sizeof(int8_t)); // store continuous c
|
|
|
|
}
|
|
|
|
blockC += (ih >> 2) * cStride;
|
|
|
|
for (; ih < h; ih++) {
|
|
|
|
auto ihSubIndex = ih & 0x03;
|
|
|
|
auto c = blockC + ihSubIndex;
|
|
|
|
const int32_t initValue = nullptr != bias ? bias[ih] : 0;
|
|
|
|
int32_t acc0 = initValue;
|
|
|
|
|
|
|
|
const int lElement = *nnz++;
|
|
|
|
for (auto il = 0; il < lElement; il++) {
|
|
|
|
const int diff = *dataOffset++;
|
|
|
|
const int8_t a0 = a[0];
|
|
|
|
const int8_t oneW = *w++;
|
|
|
|
|
|
|
|
// MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%d, a value[0]:\n", ie, a - A, w - B - 1, c - C, oneW);
|
|
|
|
// formatMatrix(a, {1});
|
|
|
|
// MNN_PRINT("\n");
|
|
|
|
a = a + diff;
|
|
|
|
acc0 += int32_t(a0) * int32_t(oneW);
|
|
|
|
}
|
|
|
|
int8_t result0;
|
|
|
|
if (scales) {
|
|
|
|
result0 = static_cast<int8_t>(roundf(std::max(std::min(float(maxValue), scales[ih] * float(acc0)), float(minValue))));
|
|
|
|
} else {
|
|
|
|
result0 = static_cast<int8_t>(std::max(std::min(maxValue, acc0), minValue));
|
|
|
|
}
|
|
|
|
// how to store faster: st4 / transpose /
|
|
|
|
c[0] = result0;
|
|
|
|
}
|
|
|
|
ie += 1;
|
|
|
|
// a += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
static int8_t MNNInt32ToInt8(int data, int bias, float scale, float maxValue, float minValue)
|
2020-11-05 16:41:56 +08:00
|
|
|
{
|
2020-02-26 09:57:17 +08:00
|
|
|
float value = (float)(data + bias) * scale;
|
2020-11-05 16:41:56 +08:00
|
|
|
value = ALIMAX(value, minValue);
|
|
|
|
value = ALIMIN(value, maxValue);
|
2020-02-26 09:57:17 +08:00
|
|
|
return static_cast<int8_t>(roundf(value));
|
2019-04-17 10:49:11 +08:00
|
|
|
}
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
|
2020-12-10 12:47:38 +08:00
|
|
|
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) {
|
2023-04-18 18:54:46 +08:00
|
|
|
const int bytes = ((post->useInt8 == 1) ? 1 : 4);
|
2020-02-26 09:57:17 +08:00
|
|
|
for (int dz = 0; dz < dst_depth_quad; ++dz) {
|
|
|
|
const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
|
2020-11-05 16:41:56 +08:00
|
|
|
const auto bias_dz = post->bias + dz * GEMM_INT8_UNIT;
|
[MNN:Sync] Sync internal github
Commits:
8148ae75c 弗人 bugfix
14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose
476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG.
5e26b9fd3 雁行 [Test:Feature] Add android test.
37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv.
144c185f5 tianbu.xsw hangxing fix hiai
b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size
d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model
398cc5ab6 tianhang.yth refactor demo
736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch
b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix
94b95bfed ghz [BugFix]1.Better method for fast pack valid check
6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
5f77ae889 tianhang.yth numThread bugfix
a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1
03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu
91fa7267b ghz [BugFix]1.fix the error in eP check
bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug.
1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
feb7ecc4c 弗人 modify log of python offline quant
040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
609f37db8 弗人 add log for python quant, python convert
5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
a93ff9280 tianhang.yth add tf.Unique op support
9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute.
ef8c369e3 弗人 catch exception
07c2dd670 弗人 add dependence to setup, base64 encode url, add time log
177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool
40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert.
d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu
83a198ed7 杭行 update
d0dd3e09b 杭行 update
99540202e xiaying [Converter:Optimize] Opt the tensor convert insert
333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
db5994672 杭行 merge
6293de7b8 tianbu.xsw fix pymnn updateCacheFile
5c2e11cb1 tianbu.xsw do updateCache in createSession
6e7641ff4 tianbu.xsw do not limit cacheFile for a model
5287a65e4 tianbu.xsw bugfix
52ba53a91 tianbu.xsw revert pymnn api
60284d830 tianbu.xsw bugfix
6d8077490 tianbu.xsw rename updateCacheFile api params
3cb172710 tianhang.yth updateCacheFile API size default value is 0
c5b69aabf tianbu.xsw updateCacheFile python api fix
5d5da7aa5 tianbu.xsw reflector code
5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm.
2a211825c tianbu.xsw reflector code for updateCacheFile
76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache
b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
e68bfa495 雁行 [Converter:Feature] Add UUID when model convert.
a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit
019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G)
d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find
604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu
4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment
82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary
e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error
1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx
6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump
968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop
3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model
d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
8b68be45c xiaying [MNN:Feature] Add segment
8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print
025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support
43900251e tianbu.xsw enable setCacheFile python API
ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task
9665c0a79 弗人 add check for path in json file
c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support
42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
83966d043 xiaying [Test:Feature] Add test for static module
42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
9067531c3 xiaying [Converter:Refractor] formatLicence
99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow
4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type
c6b219bce xiaying [Converter:Feature] Turn torch converter to object
dd4e68a37 xiaying [Converter:Feature] Support dump supported ops
80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed
015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info
23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution
b02b0d4de xiaying Fix bug for multi-input for conv1d
254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d
d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4
357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1
55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d
8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
846266b42 tianbu.xsw return when program and tune both nullptr
fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel
be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced
51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution
1ccdfdeb5 tianbu.xsw redefine svm macro name
31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper
d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op
24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
7b142978e xiaying [AVX512:Speed] Optimize for e <= 8
5f6febe7b tianbu.xsw code refactor
998d91b57 xiaying [Express:Speed] Merge submodule for speed
22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug
8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing
4a28f603e xiaying [Express:Speed] Shared Const for All Submodule
c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule
2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size
72f04008c xiaying [MNN:Refractor] Delete unuseful const op
1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen
4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode
1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain
f947a5f01 xiaying [Test:Feature] Add testTrain
dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp
cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad
91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512
742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge
12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios
3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8
c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx
d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx
b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto
426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack
7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize
4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin
412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul
319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test
7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit
d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32
bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16
42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float
a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch.
7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8
8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method
b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta.
3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a
2b7908ec7 tianbu.xsw modify workItemSize
3cee0d413 xiaying [MNN:Bugfix] test wrong clear
9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly
7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup
2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample
f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm
a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16
b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone.
d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn
e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize.
03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd
2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax
44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
09a5069c7 xiaying [MNN:Speed] Add offset for src and dst
6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model
cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
|
|
|
const float* scale_dz = nullptr;
|
2023-04-18 18:54:46 +08:00
|
|
|
scale_dz = post->scale + dz * GEMM_INT8_UNIT;
|
[MNN:Sync] Sync internal github
Commits:
8148ae75c 弗人 bugfix
14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose
476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG.
5e26b9fd3 雁行 [Test:Feature] Add android test.
37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv.
144c185f5 tianbu.xsw hangxing fix hiai
b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size
d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model
398cc5ab6 tianhang.yth refactor demo
736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch
b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix
94b95bfed ghz [BugFix]1.Better method for fast pack valid check
6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
5f77ae889 tianhang.yth numThread bugfix
a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1
03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu
91fa7267b ghz [BugFix]1.fix the error in eP check
bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug.
1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
feb7ecc4c 弗人 modify log of python offline quant
040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
609f37db8 弗人 add log for python quant, python convert
5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
a93ff9280 tianhang.yth add tf.Unique op support
9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute.
ef8c369e3 弗人 catch exception
07c2dd670 弗人 add dependence to setup, base64 encode url, add time log
177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool
40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert.
d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu
83a198ed7 杭行 update
d0dd3e09b 杭行 update
99540202e xiaying [Converter:Optimize] Opt the tensor convert insert
333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
db5994672 杭行 merge
6293de7b8 tianbu.xsw fix pymnn updateCacheFile
5c2e11cb1 tianbu.xsw do updateCache in createSession
6e7641ff4 tianbu.xsw do not limit cacheFile for a model
5287a65e4 tianbu.xsw bugfix
52ba53a91 tianbu.xsw revert pymnn api
60284d830 tianbu.xsw bugfix
6d8077490 tianbu.xsw rename updateCacheFile api params
3cb172710 tianhang.yth updateCacheFile API size default value is 0
c5b69aabf tianbu.xsw updateCacheFile python api fix
5d5da7aa5 tianbu.xsw reflector code
5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm.
2a211825c tianbu.xsw reflector code for updateCacheFile
76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache
b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
e68bfa495 雁行 [Converter:Feature] Add UUID when model convert.
a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit
019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G)
d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find
604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu
4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment
82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary
e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error
1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx
6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump
968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop
3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model
d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
8b68be45c xiaying [MNN:Feature] Add segment
8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print
025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support
43900251e tianbu.xsw enable setCacheFile python API
ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task
9665c0a79 弗人 add check for path in json file
c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support
42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
83966d043 xiaying [Test:Feature] Add test for static module
42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
9067531c3 xiaying [Converter:Refractor] formatLicence
99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow
4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type
c6b219bce xiaying [Converter:Feature] Turn torch converter to object
dd4e68a37 xiaying [Converter:Feature] Support dump supported ops
80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed
015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info
23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution
b02b0d4de xiaying Fix bug for multi-input for conv1d
254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d
d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4
357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1
55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d
8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
846266b42 tianbu.xsw return when program and tune both nullptr
fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel
be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced
51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution
1ccdfdeb5 tianbu.xsw redefine svm macro name
31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper
d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op
24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
7b142978e xiaying [AVX512:Speed] Optimize for e <= 8
5f6febe7b tianbu.xsw code refactor
998d91b57 xiaying [Express:Speed] Merge submodule for speed
22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug
8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing
4a28f603e xiaying [Express:Speed] Shared Const for All Submodule
c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule
2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size
72f04008c xiaying [MNN:Refractor] Delete unuseful const op
1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen
4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode
1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain
f947a5f01 xiaying [Test:Feature] Add testTrain
dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp
cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad
91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512
742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge
12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios
3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8
c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx
d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx
b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto
426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack
7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize
4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin
412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul
319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test
7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit
d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32
bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16
42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float
a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch.
7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8
8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method
b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta.
3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a
2b7908ec7 tianbu.xsw modify workItemSize
3cee0d413 xiaying [MNN:Bugfix] test wrong clear
9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly
7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup
2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample
f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm
a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16
b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone.
d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn
e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize.
03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd
2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax
44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
09a5069c7 xiaying [MNN:Speed] Add offset for src and dst
6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model
cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
|
|
|
auto dst_z = dst + dz * dst_step;
|
2020-12-10 12:47:38 +08:00
|
|
|
for (int w = 0; w < realCount; ++w) {
|
2020-02-26 09:57:17 +08:00
|
|
|
const auto src_x = src + w * GEMM_INT8_SRC_UNIT;
|
[MNN:Sync] Sync internal github
Commits:
8148ae75c 弗人 bugfix
14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose
476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG.
5e26b9fd3 雁行 [Test:Feature] Add android test.
37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv.
144c185f5 tianbu.xsw hangxing fix hiai
b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size
d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model
398cc5ab6 tianhang.yth refactor demo
736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch
b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix
94b95bfed ghz [BugFix]1.Better method for fast pack valid check
6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
5f77ae889 tianhang.yth numThread bugfix
a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1
03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu
91fa7267b ghz [BugFix]1.fix the error in eP check
bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug.
1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
feb7ecc4c 弗人 modify log of python offline quant
040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
609f37db8 弗人 add log for python quant, python convert
5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
a93ff9280 tianhang.yth add tf.Unique op support
9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute.
ef8c369e3 弗人 catch exception
07c2dd670 弗人 add dependence to setup, base64 encode url, add time log
177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool
40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert.
d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu
83a198ed7 杭行 update
d0dd3e09b 杭行 update
99540202e xiaying [Converter:Optimize] Opt the tensor convert insert
333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
db5994672 杭行 merge
6293de7b8 tianbu.xsw fix pymnn updateCacheFile
5c2e11cb1 tianbu.xsw do updateCache in createSession
6e7641ff4 tianbu.xsw do not limit cacheFile for a model
5287a65e4 tianbu.xsw bugfix
52ba53a91 tianbu.xsw revert pymnn api
60284d830 tianbu.xsw bugfix
6d8077490 tianbu.xsw rename updateCacheFile api params
3cb172710 tianhang.yth updateCacheFile API size default value is 0
c5b69aabf tianbu.xsw updateCacheFile python api fix
5d5da7aa5 tianbu.xsw reflector code
5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm.
2a211825c tianbu.xsw reflector code for updateCacheFile
76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache
b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
e68bfa495 雁行 [Converter:Feature] Add UUID when model convert.
a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit
019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G)
d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find
604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu
4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment
82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary
e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error
1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx
6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump
968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop
3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model
d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
8b68be45c xiaying [MNN:Feature] Add segment
8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print
025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support
43900251e tianbu.xsw enable setCacheFile python API
ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task
9665c0a79 弗人 add check for path in json file
c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support
42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
83966d043 xiaying [Test:Feature] Add test for static module
42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
9067531c3 xiaying [Converter:Refractor] formatLicence
99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow
4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type
c6b219bce xiaying [Converter:Feature] Turn torch converter to object
dd4e68a37 xiaying [Converter:Feature] Support dump supported ops
80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed
015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info
23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution
b02b0d4de xiaying Fix bug for multi-input for conv1d
254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d
d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4
357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1
55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d
8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
846266b42 tianbu.xsw return when program and tune both nullptr
fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel
be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced
51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution
1ccdfdeb5 tianbu.xsw redefine svm macro name
31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper
d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op
24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
7b142978e xiaying [AVX512:Speed] Optimize for e <= 8
5f6febe7b tianbu.xsw code refactor
998d91b57 xiaying [Express:Speed] Merge submodule for speed
22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug
8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing
4a28f603e xiaying [Express:Speed] Shared Const for All Submodule
c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule
2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size
72f04008c xiaying [MNN:Refractor] Delete unuseful const op
1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen
4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode
1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain
f947a5f01 xiaying [Test:Feature] Add testTrain
dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp
cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad
91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512
742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge
12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios
3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8
c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx
d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx
b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto
426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack
7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize
4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin
412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul
319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test
7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit
d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32
bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16
42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float
a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch.
7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8
8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method
b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta.
3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a
2b7908ec7 tianbu.xsw modify workItemSize
3cee0d413 xiaying [MNN:Bugfix] test wrong clear
9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly
7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup
2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample
f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm
a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16
b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone.
d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn
e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize.
03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd
2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax
44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
09a5069c7 xiaying [MNN:Speed] Add offset for src and dst
6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model
cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
|
|
|
auto dst_x = dst_z + w * GEMM_INT8_UNIT * bytes;
|
2020-02-26 09:57:17 +08:00
|
|
|
int32_t dstTemp[4] = {0, 0, 0, 0};
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2020-02-26 09:57:17 +08:00
|
|
|
for (int sz = 0; sz < src_depth_quad; ++sz) {
|
|
|
|
const auto weight_sz = weight_dz + (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT) * sz;
|
|
|
|
const auto src_z = src_x + sz * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT;
|
|
|
|
|
|
|
|
for (int j = 0; j < GEMM_INT8_UNIT; ++j) {
|
|
|
|
const auto weight_j = weight_sz + j * GEMM_INT8_SRC_UNIT;
|
|
|
|
for (int i = 0; i < GEMM_INT8_SRC_UNIT; ++i) {
|
|
|
|
dstTemp[j] += (int32_t)src_z[i] * (int32_t)weight_j[i];
|
2019-04-17 10:49:11 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[MNN:Sync] Sync internal github
Commits:
8148ae75c 弗人 bugfix
14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose
476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG.
5e26b9fd3 雁行 [Test:Feature] Add android test.
37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv.
144c185f5 tianbu.xsw hangxing fix hiai
b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size
d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model
398cc5ab6 tianhang.yth refactor demo
736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch
b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix
94b95bfed ghz [BugFix]1.Better method for fast pack valid check
6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
5f77ae889 tianhang.yth numThread bugfix
a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1
03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu
91fa7267b ghz [BugFix]1.fix the error in eP check
bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug.
1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
feb7ecc4c 弗人 modify log of python offline quant
040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
609f37db8 弗人 add log for python quant, python convert
5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
a93ff9280 tianhang.yth add tf.Unique op support
9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute.
ef8c369e3 弗人 catch exception
07c2dd670 弗人 add dependence to setup, base64 encode url, add time log
177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool
40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert.
d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu
83a198ed7 杭行 update
d0dd3e09b 杭行 update
99540202e xiaying [Converter:Optimize] Opt the tensor convert insert
333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
db5994672 杭行 merge
6293de7b8 tianbu.xsw fix pymnn updateCacheFile
5c2e11cb1 tianbu.xsw do updateCache in createSession
6e7641ff4 tianbu.xsw do not limit cacheFile for a model
5287a65e4 tianbu.xsw bugfix
52ba53a91 tianbu.xsw revert pymnn api
60284d830 tianbu.xsw bugfix
6d8077490 tianbu.xsw rename updateCacheFile api params
3cb172710 tianhang.yth updateCacheFile API size default value is 0
c5b69aabf tianbu.xsw updateCacheFile python api fix
5d5da7aa5 tianbu.xsw reflector code
5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm.
2a211825c tianbu.xsw reflector code for updateCacheFile
76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache
b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction.
e68bfa495 雁行 [Converter:Feature] Add UUID when model convert.
a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit
019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G)
d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find
604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu
4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment
82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary
e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error
1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx
6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump
968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop
3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model
d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
8b68be45c xiaying [MNN:Feature] Add segment
8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print
025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support
43900251e tianbu.xsw enable setCacheFile python API
ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task
9665c0a79 弗人 add check for path in json file
c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support
42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
83966d043 xiaying [Test:Feature] Add test for static module
42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
9067531c3 xiaying [Converter:Refractor] formatLicence
99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow
4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type
c6b219bce xiaying [Converter:Feature] Turn torch converter to object
dd4e68a37 xiaying [Converter:Feature] Support dump supported ops
80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed
015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info
23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution
b02b0d4de xiaying Fix bug for multi-input for conv1d
254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d
d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4
357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1
55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d
8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
846266b42 tianbu.xsw return when program and tune both nullptr
fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel
be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced
51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution
1ccdfdeb5 tianbu.xsw redefine svm macro name
31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper
d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op
24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
7b142978e xiaying [AVX512:Speed] Optimize for e <= 8
5f6febe7b tianbu.xsw code refactor
998d91b57 xiaying [Express:Speed] Merge submodule for speed
22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug
8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing
4a28f603e xiaying [Express:Speed] Shared Const for All Submodule
c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule
2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size
72f04008c xiaying [MNN:Refractor] Delete unuseful const op
1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen
4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode
1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain
f947a5f01 xiaying [Test:Feature] Add testTrain
dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp
cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad
91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512
742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge
12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios
3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8
c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx
d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx
b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto
426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack
7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize
4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin
412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul
319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test
7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit
d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32
bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case
29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16
42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float
a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch.
7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8
8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method
b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta.
3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a
2b7908ec7 tianbu.xsw modify workItemSize
3cee0d413 xiaying [MNN:Bugfix] test wrong clear
9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14
eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly
7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup
2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample
f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm
a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16
b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone.
d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn
e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize.
03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd
2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax
44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
09a5069c7 xiaying [MNN:Speed] Add offset for src and dst
6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model
cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
|
|
|
for (int j = 0; j < GEMM_INT8_UNIT; ++j) {
|
2023-04-18 18:54:46 +08:00
|
|
|
if (!post->scale) {
|
|
|
|
((float*)dst_x)[j] = (float)(dstTemp[j] + bias_dz[j]);
|
|
|
|
} else if (post->useInt8 == 1) {
|
2021-06-11 17:17:13 +08:00
|
|
|
dst_x[j] = MNNInt32ToInt8(dstTemp[j], bias_dz[j], scale_dz[j], post->maxValue, post->minValue);
|
|
|
|
} else {
|
2023-04-18 18:54:46 +08:00
|
|
|
float value = (float)(dstTemp[j] + bias_dz[j]) * scale_dz[j];
|
|
|
|
((float*)dst_x)[j] = value;
|
2021-06-11 17:17:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-10-18 10:31:02 +08:00
|
|
|
static void MNNReluWithSlopeChannelInt8(int8_t* dst, const int8_t* src, const float* slope, size_t planeNumber, size_t depthQuad, QuanPrePostParameters *params) {
|
|
|
|
float mulVal = 0.f;
|
|
|
|
float inputScale = params->inputScale[0];
|
|
|
|
float outputScale = params->outputScale[0];
|
|
|
|
int32_t inputZero = static_cast<int32_t>(params->inputZeroPoint[0]);
|
|
|
|
int32_t outputZero = static_cast<int32_t>(params->outputZeroPoint[0]);
|
|
|
|
for (int j = 0;j < depthQuad; ++j) {
|
|
|
|
const float* slopeZ = slope + 4 * j;
|
|
|
|
const int8_t* srcZ = src + 4 * j * planeNumber;
|
|
|
|
int8_t* dstZ = dst + 4 * j * planeNumber;
|
|
|
|
for (int i = 0; i < planeNumber; ++i) {
|
|
|
|
for (int c = 0; c < 4; ++c) {
|
|
|
|
if (srcZ[4 * i + c] < 0) {
|
|
|
|
mulVal = (srcZ[4 * i + c] - inputZero) * slopeZ[c];
|
|
|
|
dstZ[4 * i + c] = ALIMIN(ALIMAX(static_cast<int32_t>(roundf(mulVal)) + outputZero, params->minValue), params->maxValue);
|
|
|
|
} else {
|
|
|
|
dstZ[4 * i + c] = srcZ[4 * i + c];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
// "FAST" variant of the 16x4 int8 GEMM kernel. In this generic (non-NEON)
// build there is no specialized fast path, so it simply delegates to the
// reference MNNGemmInt8AddBiasScale_16x4_Unit with identical arguments.
static void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) {
    return MNNGemmInt8AddBiasScale_16x4_Unit(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, post, realCount);
}
|
|
|
|
|
|
|
|
// Depthwise int8 convolution over one output line, 16 channels (pack) at a
// time: for each of `width` output pixels, accumulate fw*fh kernel taps into
// 32-bit sums, then apply per-channel bias and float scale, round, clamp to
// [minValue, maxValue] and store.
//   src_w_step:   input step (in elements) between adjacent output pixels
//   dilateX/Y_step: input steps (in elements) between kernel taps
//   idxOrder:     unused by this implementation (kept for ABI compatibility
//                 with the assembly kernels' signature)
static void MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters,
                                                 size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step,
                                                 size_t dilateY_step, int8_t* idxOrder) {
#ifdef MNN_USE_SSE
    // SSE build: src/weight buffers hold int16 elements and dst holds uint8
    // with a +128 bias, so all step arithmetic below is in those element
    // units — presumably the caller pre-expanded the data; TODO confirm.
    int offset = 128;
    uint8_t* dstPtr = (uint8_t*)dst;
    const int16_t* srcPtr = (int16_t*)src;
    const int16_t* weightPtr = (int16_t*)weight;
#else
    // Generic build: plain signed int8 data, no store offset.
    int offset = 0;
    int8_t* dstPtr = dst;
    const int8_t* srcPtr = src;
    const int8_t* weightPtr = weight;
#endif
    int pack = 16;  // channels processed per output pixel
    auto bias_z = parameters->bias;
    auto scale_z = parameters->scale;
    int dx, fx, fy;
    for (dx = 0; dx < width; ++dx) {
        auto dst_x = dstPtr + dx * pack;
        int32_t dstInt32[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
        const auto src_z = srcPtr + src_w_step * dx;
        // Accumulate the kernel window into 32-bit per-channel sums.
        for (fy = 0; fy < fh; ++fy) {
            const auto src_y = src_z + fy * dilateY_step;
            const auto weight_y = weightPtr + fy * fw * pack;
            for (fx = 0; fx < fw; ++fx) {
                const auto src_x = src_y + fx * dilateX_step;
                const auto weight_x = weight_y + pack * fx;
                for (int j = 0; j < pack; ++j) {
                    dstInt32[j] += static_cast<int32_t>(src_x[j]) * static_cast<int32_t>(weight_x[j]);
                }
            }
        }
        // Post-treat: bias add, float requant scale, round, clamp, store.
        // The clamp bounds and the stored value are shifted by `offset` so the
        // SSE path clamps/stores in unsigned space.
        for (int i = 0; i < pack; ++i) {
            float val = (dstInt32[i] + bias_z[i]) * scale_z[i];
            int valOut = roundf(val) + offset;
            if (valOut > parameters->maxValue + offset) {
                valOut = parameters->maxValue + offset;
            }
            if (valOut < parameters->minValue + offset) {
                valOut = parameters->minValue + offset;
            }
            dst_x[i] = static_cast<int>(valOut);
        }
    }
}
|
2023-07-31 14:24:48 +08:00
|
|
|
|
|
|
|
// 3x3-specialized entry point for the depthwise int8 kernel. The generic
// C implementation has no dedicated 3x3 fast path, so it forwards all
// arguments to MNNLineDepthWiseInt8AddBiasScaleUnit unchanged.
static void MNNLineDepthWiseInt8AddBiasScaleUnit3x3(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters,
                                                    size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder) {
    MNNLineDepthWiseInt8AddBiasScaleUnit(dst, src, weight, parameters, width, src_w_step, fw, fh, dilateX_step, dilateY_step, idxOrder);
}
|
2021-06-11 17:17:13 +08:00
|
|
|
#endif
|
2021-01-06 16:29:37 +08:00
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
#ifndef MNN_USE_NEON
|
2020-02-26 09:57:17 +08:00
|
|
|
// Quantize float data to int8, 4 channels at a time (C4 layout):
//   dst[4i+j] = clamp(round(src[4i+j] * scalep[j]) + zeroPoint, [minValue, maxValue])
//   src:      sizeQuad * 4 floats
//   dst:      sizeQuad * 4 int8 values
//   scalep:   4 per-channel quantization scales (reciprocal of dequant scale)
//   minValue/maxValue: inclusive clamp range
//   zeroPoint: quantization zero point added after rounding
void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue,
                   ssize_t maxValue, ssize_t zeroPoint) {
    // size_t index: sizeQuad is size_t, so avoid a signed/unsigned comparison
    // (and overflow for very large tensors) in the loop condition.
    for (size_t i = 0; i < sizeQuad; ++i) {
        for (int j = 0; j < 4; ++j) {
            int v = (int)roundf(src[4 * i + j] * scalep[j]) + zeroPoint;
            if (v > maxValue) {
                v = maxValue;
            }
            if (v < minValue) {
                v = minValue;
            }
            dst[4 * i + j] = v;
        }
    }
}
|
2021-06-11 17:17:13 +08:00
|
|
|
|
2021-01-06 16:29:37 +08:00
|
|
|
// Dequantize int8 data to float, 4 channels at a time (C4 layout):
//   dst[4q+c] = (src[4q+c] - zeroPoint) * scale[c]
//   size:      number of 4-element quads
//   scale:     4 per-channel dequantization scales
//   zeroPoint: quantization zero point subtracted before scaling
void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) {
    for (int quad = 0; quad < size; ++quad) {
        const int8_t* in = src + 4 * quad;
        float* out = dst + 4 * quad;
        for (int c = 0; c < 4; ++c) {
            out[c] = static_cast<float>(in[c] - zeroPoint) * scale[c];
        }
    }
}
|
2023-02-15 10:30:27 +08:00
|
|
|
|
2023-02-28 10:41:24 +08:00
|
|
|
// Average pooling over int8 data packed 16 channels deep (NC/16HW16 style).
// For each output column, sums the kernelx*kernely window per channel and
// rescales with a Q24 fixed-point reciprocal: (sum * factor) >> 24, where
// factor is expected to encode (1<<24)/windowSize.
// NOTE(review): paddingx is accepted but never read in this scalar path —
// padding appears to be resolved by the caller; confirm before relying on it.
void MNNAvgPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor) {
    constexpr int kPack = 16;
    int8_t* outPtr = dst;
    const int8_t* inPtr = src;
    for (int ox = 0; ox < outputWidth; ++ox) {
        int acc[kPack] = {0};
        for (int ky = 0; ky < kernely; ++ky) {
            for (int kx = 0; kx < kernelx; ++kx) {
                // Window element (kx, ky): rows are inputWidth packs apart.
                const int8_t* win = inPtr + kPack * (kx + inputWidth * ky);
                for (int c = 0; c < kPack; ++c) {
                    acc[c] += win[c];
                }
            }
        }
        for (int c = 0; c < kPack; ++c) {
            outPtr[c] = static_cast<int8_t>((acc[c] * factor) >> 24);
        }
        outPtr += kPack;
        inPtr += kPack * stridesx;
    }
}
|
|
|
|
|
2023-02-28 10:41:24 +08:00
|
|
|
// Max pooling over int8 data packed 16 channels deep. For each output
// column, takes the per-channel maximum over the kernelx*kernely window,
// starting from INT8_MIN so any real sample wins.
void MNNMaxPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx) {
    constexpr int kPack = 16;
    int8_t* outPtr = dst;
    const int8_t* inPtr = src;
    for (int ox = 0; ox < outputWidth; ++ox) {
        int8_t best[kPack];
        for (int c = 0; c < kPack; ++c) {
            best[c] = INT8_MIN;
        }
        for (int ky = 0; ky < kernely; ++ky) {
            for (int kx = 0; kx < kernelx; ++kx) {
                const int8_t* win = inPtr + kPack * (kx + inputWidth * ky);
                for (int c = 0; c < kPack; ++c) {
                    if (win[c] > best[c]) {
                        best[c] = win[c];
                    }
                }
            }
        }
        for (int c = 0; c < kPack; ++c) {
            outPtr[c] = best[c];
        }
        outPtr += kPack;
        inPtr += kPack * stridesx;
    }
}
|
|
|
|
|
2023-07-31 14:24:48 +08:00
|
|
|
void MNNBinaryAddInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast) {
|
2023-04-27 15:11:05 +08:00
|
|
|
float sum = 0;
|
|
|
|
#ifdef MNN_USE_SSE
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 128;
|
2023-05-18 19:11:50 +08:00
|
|
|
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
|
|
|
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
|
|
|
uint8_t* outputData = (uint8_t*)outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#else
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 0;
|
2023-05-18 19:11:50 +08:00
|
|
|
const int8_t* inputData0 = inputRaw0;
|
|
|
|
const int8_t* inputData1 = inputRaw1;
|
|
|
|
int8_t* outputData = outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#endif
|
2023-07-31 14:24:48 +08:00
|
|
|
const int maxValue = static_cast<int32_t>(params->maxValue) + offset;
|
|
|
|
const int minValue = static_cast<int32_t>(params->minValue) + offset;
|
2023-04-27 15:11:05 +08:00
|
|
|
for (int i = 0; i < elementSize; ++i) {
|
|
|
|
if (needBroadcast == 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[0] - offset - (int32_t)params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - (int32_t)params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-05-18 19:11:50 +08:00
|
|
|
sum = inp0 + inp1;
|
2023-04-27 15:11:05 +08:00
|
|
|
} else if (needBroadcast == 1) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - (int32_t)params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[0] - offset - (int32_t)params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-05-18 19:11:50 +08:00
|
|
|
sum = inp0 + inp1;
|
2023-04-27 15:11:05 +08:00
|
|
|
} else {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - (int32_t)params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - (int32_t)params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-05-18 19:11:50 +08:00
|
|
|
sum = inp0 + inp1;
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
int value = (sum + (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
if (sum < 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
value = (sum - (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
}
|
2023-05-18 19:11:50 +08:00
|
|
|
if (value > maxValue) {
|
|
|
|
value = maxValue;
|
|
|
|
}
|
|
|
|
if (value < minValue) {
|
|
|
|
value = minValue;
|
|
|
|
}
|
|
|
|
outputData[i] = value;
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-31 14:24:48 +08:00
|
|
|
void MNNBinarySubInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast) {
|
2023-04-27 15:11:05 +08:00
|
|
|
float res = 0;
|
|
|
|
#ifdef MNN_USE_SSE
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 128;
|
2023-05-18 19:11:50 +08:00
|
|
|
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
|
|
|
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
|
|
|
uint8_t* outputData = (uint8_t*)outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#else
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 0;
|
2023-05-18 19:11:50 +08:00
|
|
|
const int8_t* inputData0 = inputRaw0;
|
|
|
|
const int8_t* inputData1 = inputRaw1;
|
|
|
|
int8_t* outputData = outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#endif
|
2023-07-31 14:24:48 +08:00
|
|
|
const int maxValue = static_cast<int32_t>(params->maxValue) + offset;
|
|
|
|
const int minValue = static_cast<int32_t>(params->minValue) + offset;
|
2023-04-27 15:11:05 +08:00
|
|
|
for (int i = 0; i < elementSize; ++i) {
|
|
|
|
if (needBroadcast == 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[0] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-05-18 19:11:50 +08:00
|
|
|
res = inp0 - inp1;
|
2023-04-27 15:11:05 +08:00
|
|
|
} else if (needBroadcast == 1) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[0] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-05-18 19:11:50 +08:00
|
|
|
res = inp0 - inp1;
|
2023-04-27 15:11:05 +08:00
|
|
|
} else {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-05-18 19:11:50 +08:00
|
|
|
res = inp0 - inp1;
|
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
int value = (res + (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
if (res < 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
value = (res - (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
}
|
2023-05-18 19:11:50 +08:00
|
|
|
if (value > maxValue) {
|
|
|
|
value = maxValue;
|
|
|
|
}
|
|
|
|
if (value < minValue) {
|
|
|
|
value = minValue;
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
2023-05-18 19:11:50 +08:00
|
|
|
outputData[i] = value;
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-31 14:24:48 +08:00
|
|
|
void MNNBinaryMulInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast) {
|
2023-04-27 15:11:05 +08:00
|
|
|
float res = 0;
|
|
|
|
#ifdef MNN_USE_SSE
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 128;
|
2023-05-18 19:11:50 +08:00
|
|
|
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
|
|
|
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
|
|
|
uint8_t* outputData = (uint8_t*)outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#else
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 0;
|
2023-05-18 19:11:50 +08:00
|
|
|
const int8_t* inputData0 = inputRaw0;
|
|
|
|
const int8_t* inputData1 = inputRaw1;
|
|
|
|
int8_t* outputData = outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#endif
|
2023-07-31 14:24:48 +08:00
|
|
|
const int maxValue = static_cast<int32_t>(params->maxValue) + offset;
|
|
|
|
const int minValue = static_cast<int32_t>(params->minValue) + offset;
|
2023-07-05 11:44:25 +08:00
|
|
|
for (int i = 0; i < elementSize; ++i) {
|
|
|
|
if (needBroadcast == 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
float inp0 = (inputData0[0] - offset - params->inputZeroPoint[0]) * inputScalesFp32[0];
|
|
|
|
float inp1 = (inputData1[i] - offset - params->inputZeroPoint[1]) * inputScalesFp32[1];
|
2023-07-05 11:44:25 +08:00
|
|
|
res = inp0 * inp1;
|
|
|
|
} else if (needBroadcast == 1) {
|
2023-07-31 14:24:48 +08:00
|
|
|
float inp0 = (inputData0[i] - offset - params->inputZeroPoint[0]) * inputScalesFp32[0];
|
|
|
|
float inp1 = (inputData1[0] - offset - params->inputZeroPoint[1]) * inputScalesFp32[1];
|
2023-07-05 11:44:25 +08:00
|
|
|
res = inp0 * inp1;
|
|
|
|
} else {
|
2023-07-31 14:24:48 +08:00
|
|
|
float inp0 = (inputData0[i] - offset - params->inputZeroPoint[0]) * inputScalesFp32[0];
|
|
|
|
float inp1 = (inputData1[i] - offset - params->inputZeroPoint[1]) * inputScalesFp32[1];
|
2023-07-05 11:44:25 +08:00
|
|
|
res = inp0 * inp1;
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
int value = (int)roundf(res * inputScalesFp32[2]) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
if (value > maxValue) {
|
|
|
|
value = maxValue;
|
|
|
|
}
|
|
|
|
if (value < minValue) {
|
|
|
|
value = minValue;
|
|
|
|
}
|
|
|
|
outputData[i] = value;
|
|
|
|
}
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
|
|
|
|
2023-07-31 14:24:48 +08:00
|
|
|
void MNNBinaryMinInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast) {
|
2023-07-05 11:44:25 +08:00
|
|
|
int res = 0;
|
2023-04-27 15:11:05 +08:00
|
|
|
#ifdef MNN_USE_SSE
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 128;
|
2023-05-18 19:11:50 +08:00
|
|
|
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
|
|
|
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
|
|
|
uint8_t* outputData = (uint8_t*)outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#else
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 0;
|
2023-05-18 19:11:50 +08:00
|
|
|
const int8_t* inputData0 = inputRaw0;
|
|
|
|
const int8_t* inputData1 = inputRaw1;
|
|
|
|
int8_t* outputData = outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#endif
|
2023-07-31 14:24:48 +08:00
|
|
|
const int maxValue = static_cast<int32_t>(params->maxValue) + offset;
|
|
|
|
const int minValue = static_cast<int32_t>(params->minValue) + offset;
|
2023-07-05 11:44:25 +08:00
|
|
|
for (int i = 0; i < elementSize; ++i) {
|
|
|
|
if (needBroadcast == 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[0] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-07-05 11:44:25 +08:00
|
|
|
res = std::min(inp0, inp1);
|
|
|
|
} else if (needBroadcast == 1) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[0] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-07-05 11:44:25 +08:00
|
|
|
res = std::min(inp0, inp1);
|
|
|
|
} else {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-07-05 11:44:25 +08:00
|
|
|
res = std::min(inp0, inp1);
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
int value = (res + (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
if (res < 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
value = (res - (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
}
|
|
|
|
if (value > maxValue) {
|
|
|
|
value = maxValue;
|
|
|
|
}
|
|
|
|
if (value < minValue) {
|
|
|
|
value = minValue;
|
|
|
|
}
|
|
|
|
outputData[i] = value;
|
|
|
|
}
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
|
|
|
|
2023-07-31 14:24:48 +08:00
|
|
|
void MNNBinaryMaxInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast) {
|
|
|
|
int res = 0;
|
2023-04-27 15:11:05 +08:00
|
|
|
#ifdef MNN_USE_SSE
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 128;
|
2023-05-18 19:11:50 +08:00
|
|
|
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
|
|
|
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
|
|
|
uint8_t* outputData = (uint8_t*)outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#else
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 0;
|
2023-05-18 19:11:50 +08:00
|
|
|
const int8_t* inputData0 = inputRaw0;
|
|
|
|
const int8_t* inputData1 = inputRaw1;
|
|
|
|
int8_t* outputData = outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#endif
|
2023-07-31 14:24:48 +08:00
|
|
|
const int maxValue = static_cast<int32_t>(params->maxValue) + offset;
|
|
|
|
const int minValue = static_cast<int32_t>(params->minValue) + offset;
|
2023-07-05 11:44:25 +08:00
|
|
|
for (int i = 0; i < elementSize; ++i) {
|
|
|
|
if (needBroadcast == 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[0] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-07-05 11:44:25 +08:00
|
|
|
res = std::max(inp0, inp1);
|
|
|
|
} else if (needBroadcast == 1) {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[0] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-07-05 11:44:25 +08:00
|
|
|
res = std::max(inp0, inp1);
|
|
|
|
} else {
|
2023-07-31 14:24:48 +08:00
|
|
|
int32_t inp0 = static_cast<int32_t>(inputData0[i] - offset - params->inputZeroPoint[0]) * static_cast<int32_t>(inputScalesInt32[0]);
|
|
|
|
int32_t inp1 = static_cast<int32_t>(inputData1[i] - offset - params->inputZeroPoint[1]) * static_cast<int32_t>(inputScalesInt32[1]);
|
2023-07-05 11:44:25 +08:00
|
|
|
res = std::max(inp0, inp1);
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
int value = (res + (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
if (res < 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
value = (res - (1<<15)) / (1 << 16) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-07-05 11:44:25 +08:00
|
|
|
}
|
|
|
|
if (value > maxValue) {
|
|
|
|
value = maxValue;
|
|
|
|
}
|
|
|
|
if (value < minValue) {
|
|
|
|
value = minValue;
|
|
|
|
}
|
|
|
|
outputData[i] = value;
|
|
|
|
}
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
void MNNBinarySqdInt8 (int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast) {
|
2023-05-18 19:11:50 +08:00
|
|
|
float res = 0;
|
2023-04-27 15:11:05 +08:00
|
|
|
#ifdef MNN_USE_SSE
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 128;
|
2023-05-18 19:11:50 +08:00
|
|
|
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
|
|
|
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
|
|
|
uint8_t* outputData = (uint8_t*)outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#else
|
2023-07-31 14:24:48 +08:00
|
|
|
const int offset = 0;
|
2023-05-18 19:11:50 +08:00
|
|
|
const int8_t* inputData0 = inputRaw0;
|
|
|
|
const int8_t* inputData1 = inputRaw1;
|
|
|
|
int8_t* outputData = outputRaw;
|
2023-04-27 15:11:05 +08:00
|
|
|
#endif
|
2023-07-31 14:24:48 +08:00
|
|
|
const int maxValue = static_cast<int32_t>(params->maxValue) + offset;
|
|
|
|
const int minValue = static_cast<int32_t>(params->minValue) + offset;
|
2023-04-27 15:11:05 +08:00
|
|
|
for (int i = 0; i < elementSize; ++i) {
|
|
|
|
if (needBroadcast == 0) {
|
2023-07-31 14:24:48 +08:00
|
|
|
float inp0 = (inputData0[0] - offset - params->inputZeroPoint[0]) * inputScalesFp32[0];
|
|
|
|
float inp1 = (inputData1[i] - offset - params->inputZeroPoint[1]) * inputScalesFp32[1];
|
2023-04-27 15:11:05 +08:00
|
|
|
res = (inp0 - inp1) * (inp0 - inp1);
|
|
|
|
} else if (needBroadcast == 1) {
|
2023-07-31 14:24:48 +08:00
|
|
|
float inp0 = (inputData0[i] - offset - params->inputZeroPoint[0]) * inputScalesFp32[0];
|
|
|
|
float inp1 = (inputData1[0] - offset - params->inputZeroPoint[1]) * inputScalesFp32[1];
|
2023-04-27 15:11:05 +08:00
|
|
|
res = (inp0 - inp1) * (inp0 - inp1);
|
|
|
|
} else {
|
2023-07-31 14:24:48 +08:00
|
|
|
float inp0 = (inputData0[i] - offset - params->inputZeroPoint[0]) * inputScalesFp32[0];
|
|
|
|
float inp1 = (inputData1[i] - offset - params->inputZeroPoint[1]) * inputScalesFp32[1];
|
2023-04-27 15:11:05 +08:00
|
|
|
res = (inp0 - inp1) * (inp0 - inp1);
|
|
|
|
}
|
2023-07-31 14:24:48 +08:00
|
|
|
int value = (int)roundf(res * inputScalesFp32[2]) + offset + static_cast<int32_t>(params->outputZeroPoint[0]);
|
2023-05-18 19:11:50 +08:00
|
|
|
if (value > maxValue) {
|
|
|
|
value = maxValue;
|
|
|
|
}
|
|
|
|
if (value < minValue) {
|
|
|
|
value = minValue;
|
|
|
|
}
|
|
|
|
outputData[i] = value;
|
2023-04-27 15:11:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-05 11:44:25 +08:00
|
|
|
// Per-channel fixed-point scale + bias on int8 data laid out as
// [biasNumber][planeNumber][pack]:
//   out = clamp(round((in - inZero) * alpha + bias, 2^-mShiftBits) + outZero)
// Rounding is half-away-from-zero: +/- 2^(mShiftBits-1) is added before the
// truncating division by 2^mShiftBits.
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack) {
#ifdef MNN_USE_SSE
    // SSE builds store int8 tensors biased by +128 as uint8; mirror that here.
    const uint8_t* srcPtr = (uint8_t*)src;
    uint8_t* dstPtr = (uint8_t*)dst;
    int offset = 128;
#else
    const int8_t* srcPtr = src;
    int8_t* dstPtr = dst;
    int offset = 0;
#endif
    const int inZero  = *inputZeroPoint + offset;
    const int outZero = *outputZeroPoint + offset;
    const int half = 1 << (mShiftBits - 1); // 0.5 in the shifted domain
    const int upper = static_cast<int>(maxValue) + offset;
    const int lower = static_cast<int>(minValue) + offset;
    for (int z = 0; z < biasNumber; ++z) {
        // Per-channel parameters for this group of `pack` channels.
        const int32_t* biasZ  = bias + pack * z;
        const int32_t* alphaZ = alpha + pack * z;
        auto dstZ = dstPtr + planeNumber * pack * z;
        auto srcZ = srcPtr + planeNumber * pack * z;
        for (int p = 0; p < planeNumber; ++p) {
            auto dstX = dstZ + pack * p;
            auto srcX = srcZ + pack * p;
            for (int i = 0; i < pack; ++i) {
                const int32_t val = static_cast<int32_t>(srcX[i] - inZero) * alphaZ[i] + biasZ[i];
                int out = ((val >= 0) ? (val + half) : (val - half)) / (1 << mShiftBits) + outZero;
                if (out > upper) {
                    out = upper;
                }
                if (out < lower) {
                    out = lower;
                }
                dstX[i] = out;
            }
        }
    }
}
|
2023-04-27 15:11:05 +08:00
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
#endif // #ifndef MNN_USE_NEON
|
2021-09-18 15:52:30 +08:00
|
|
|
#ifndef MNN_USE_SSE
|
2023-02-15 10:30:27 +08:00
|
|
|
|
2021-09-18 15:52:30 +08:00
|
|
|
// Platform hook called at the end of MNNCoreInt8FunctionInit. On builds
// without MNN_USE_SSE there is nothing extra to register; the SSE build
// supplies its own implementation of this function.
void MNNInt8FunctionInit() {
    // do nothing
}
|
2021-06-11 17:17:13 +08:00
|
|
|
#endif // #ifndef MNN_USE_SSE
|
2020-12-10 17:53:24 +08:00
|
|
|
|
2023-06-16 09:42:45 +08:00
|
|
|
// Packs im2col'ed int8 A-matrix data into the tiled layout the int8 GEMM
// kernels expect. Template parameters: EP = destination tile width (columns
// per GEMM call), LP = packed depth in bytes, HP = output channel pack
// (unused here; kept so all pack variants share one signature).
//
// Data is moved 4 bytes (one int32 lane) at a time. `info` carries:
//   info[0] = number of (e,l) regions, info[1] = eReal source stride unit,
//   info[2] = byte stride between destination e-tiles, info[3] = source
//   column step. `el` holds per-region quadruples {e, l, eOffset, lOffset}.
// NOTE(review): the layout math below (lC/lR/eC/eR splitting) mirrors the
// assembly kernels' expectations; statement order is load-bearing, so this
// block is documented rather than restructured.
template<int EP, int LP, int HP>
static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
    int number = info[0];
    int eReal = info[1];
    int eOutsideStride = info[2] / sizeof(float);
    int eDest = EP;
    int offset = info[3];
    const int LUNIT = LP / sizeof(float);  // depth step measured in int32 lanes
    for (int n=0; n<number; ++n) {
        int e = el[4 * n + 0];
        int l = el[4 * n + 1];
        int eOffset = el[4 * n + 2];
        int lOffset = el[4 * n + 3];
        // Split the l/e offsets into tile index and remainder within a tile.
        int lC = lOffset / LP;
        int lR = lOffset % LP;
        int eC = eOffset / eDest;
        int eR = eOffset % eDest;
        auto dest = (int32_t*)(destOrigin + lC * eDest * LP + lR + eC * info[2] + eR * LP);
        auto source = (int32_t*)sourceGroup[n];
        int lRemain = l / 4;          // remaining depth, in int32 lanes
        int lR4 = lR / LUNIT;
        int lS = LUNIT - lR4;
        int eS = eDest - eR;
        // Step for start: finish a partially-filled depth unit first.
        if (lR4 > 0) {
            int step = ALIMIN(lS, lRemain);
            for (int x=0; x<step; ++x) {
                int eRemain = e;
                auto d = dest + x;
                auto s = source + x * eReal;
                // Head: complete the partially-filled e-tile.
                if (eR > 0) {
                    int eStep = ALIMIN(eRemain, eS);
                    for (int yi=0; yi<eStep; ++yi) {
                        d[yi * LUNIT] = s[yi * offset];
                    }
                    eRemain-=eStep;
                    d += (eOutsideStride - eR * LUNIT);
                    s += eS * offset;
                }
                // Body: whole e-tiles (last one may be partial).
                while (eRemain > 0) {
                    int eStep = ALIMIN(eDest, eRemain);
                    for (int yi=0; yi<eStep; ++yi) {
                        d[yi * LUNIT] = s[yi * offset];
                    }
                    eRemain-=eStep;
                    d+= eOutsideStride;
                    s+= eStep * offset;
                }
            }
            lRemain -= step;
            dest += step;
            source += eReal * step;
        }
        // Remaining depth, LUNIT lanes at a time.
        while (lRemain > 0) {
            int step = ALIMIN(lRemain, LUNIT);
            for (int x=0; x<step; ++x) {
                int eRemain = e;
                auto d = dest + x;
                auto s = source + x * eReal;
                // Head: complete the partially-filled e-tile.
                if (eR > 0) {
                    int eStep = ALIMIN(eRemain, eS);
                    for (int yi=0; yi<eStep; ++yi) {
                        d[yi * LUNIT] = s[yi * offset];
                    }
                    eRemain-=eStep;
                    d += (eOutsideStride - eR * LUNIT);
                    s += eS * offset;
                }
                // Body: whole e-tiles (last one may be partial).
                while (eRemain > 0) {
                    int eStep = ALIMIN(eDest, eRemain);
                    for (int yi=0; yi<eStep; ++yi) {
                        d[yi * LUNIT] = s[yi * offset];
                    }
                    eRemain-=eStep;
                    d+= eOutsideStride;
                    s+= eStep * offset;
                }
            }
            lRemain -= step;
            dest += eDest * LUNIT;
            source += eReal * step;
        }
    }
}
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
// Reports the tiling of the default 16x4 int8 GEMM kernel: output-channel
// unit, packed source depth, and destination columns handled per kernel
// call. Values come from the GEMM_INT8_* macros defined in the project
// headers.
static void MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
    *UNIT = GEMM_INT8_UNIT;
    *SRC_UNIT = GEMM_INT8_SRC_UNIT;
    *DST_XUNIT = GEMM_INT8_DST_XUNIT;
}
|
2021-01-06 16:29:37 +08:00
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
// Tiling used by the ARMv8.2 sdot GEMM kernels: output-channel unit 4,
// packed source depth 4, 12 destination columns per kernel call.
static void MNNGetGemmUnitSdot(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
    DST_XUNIT[0] = 12;
    SRC_UNIT[0] = 4;
    UNIT[0] = 4;
}
|
2021-01-06 16:29:37 +08:00
|
|
|
|
2023-06-16 09:42:45 +08:00
|
|
|
// Tiling used by the ARMv8.6 i8mm GEMM kernels: output-channel unit 4,
// packed source depth 8, 20 destination columns per kernel call.
static void MNNGetGemmUnitI8mm(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) {
    DST_XUNIT[0] = 20;
    SRC_UNIT[0] = 8;
    UNIT[0] = 4;
}
|
|
|
|
|
2023-06-16 09:42:45 +08:00
|
|
|
// Depth-4 specialization of the A-matrix packer used by the sdot kernels:
// the packed depth is fixed at LP = 4 bytes, so every depth step moves one
// int32 lane and the contiguous case can use memcpy. Template parameters:
// EP = destination tile width, HP = output channel pack (unused here; kept
// for a uniform signature across pack variants).
// `info`/`el` have the same meaning as in _ArmBasicMNNPackC4ForMatMul_A.
// NOTE(review): statement order and pointer math mirror the assembly
// kernels' layout expectations, so this block is documented, not restyled.
template<int EP, int HP>
static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) {
    int number = info[0];
    int eReal = info[1];
    int eDest = EP;
    int offset = info[3];
    const int LP = 4;
    int eOutsideStride = info[2] / sizeof(float);
    for (int n=0; n<number; ++n) {
        int e = el[4 * n + 0];
        int l = el[4 * n + 1];
        int eOffset = el[4 * n + 2];
        int lOffset = el[4 * n + 3];
        // Split the e offset into tile index and remainder within a tile.
        int eC = eOffset / eDest;
        int eR = eOffset % eDest;
        auto dest = (int32_t*)(destOrigin + lOffset * eDest + eC * info[2] + eR * LP);
        int eS = eDest - eR;
        auto source = (int32_t*)sourceGroup[n];
        int lRemain = l / sizeof(float);  // depth in int32 lanes
        for (int x=0; x<lRemain; ++x) {
            int eRemain = e;
            auto d = dest;
            auto s = source;
            if (1 == offset) {
                // Contiguous source columns: bulk-copy whole runs.
                if (eR > 0) {
                    int eStep = ALIMIN(eRemain, eS);
                    ::memcpy(d, s, eStep * sizeof(int32_t));
                    eRemain-=eStep;
                    d += (eOutsideStride - eR);
                    s += eS * offset;
                }
                while (eRemain > 0) {
                    int eStep = ALIMIN(eDest, eRemain);
                    ::memcpy(d, s, eStep * sizeof(int32_t));
                    eRemain-=eStep;
                    d+= eOutsideStride;
                    s+= eStep * offset;
                }
            } else {
                // Strided source columns: copy lane by lane.
                if (eR > 0) {
                    int eStep = ALIMIN(eRemain, eS);
                    for (int yi=0; yi<eStep; ++yi) {
                        d[yi] = s[yi * offset];
                    }
                    eRemain-=eStep;
                    d += (eOutsideStride - eR);
                    s += eS * offset;
                }
                while (eRemain > 0) {
                    int eStep = ALIMIN(eDest, eRemain);
                    for (int yi=0; yi<eStep; ++yi) {
                        d[yi] = s[yi * offset];
                    }
                    eRemain-=eStep;
                    d+= eOutsideStride;
                    s+= eStep * offset;
                }
            }
            dest += eDest;
            source += eReal;
        }
    }
}
|
|
|
|
|
2021-06-11 17:17:13 +08:00
|
|
|
namespace MNN {

// Process-wide registry of the int8 kernel implementations selected at
// startup. Allocated once in MNNCoreInt8FunctionInit and intentionally never
// freed (lives for the whole process).
static CoreInt8Functions* gCoreFunc = nullptr;

// Populates gCoreFunc with the portable/NEON baseline kernels, then — on
// aarch64 — upgrades the GEMM/pack entries when the CPU reports sdot
// (ARMv8.2) or i8mm (ARMv8.6) support. Must run before
// MNNGetInt8CoreFunctions is used.
void MNNCoreInt8FunctionInit() {
    /* CoreInt8Functions without sdot */
    gCoreFunc = new CoreInt8Functions;

    // MatMul
    gCoreFunc->Int8GemmKernel = MNNGemmInt8AddBiasScale_16x4_Unit;
    gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_16x4_Unit_FAST;
    gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnit;

    // Im2Col
    gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<GEMM_INT8_DST_XUNIT, GEMM_INT8_SRC_UNIT, GEMM_INT8_UNIT>;
    // conv depthwise
    gCoreFunc->ConvDepthwiseLineInt8 = MNNLineDepthWiseInt8AddBiasScaleUnit;
    gCoreFunc->MNNFloat2Int8 = MNNFloat2Int8;
    gCoreFunc->MNNInt8ScaleToFloat = MNNInt8ScaleToFloat;

    // sparse
    gCoreFunc->MNNGetSparseQuantMatMulPackMode = MNNGetSparseQuantMatMulPackMode;
    gCoreFunc->MNNPackForSparseQuantMatMul_B = MNNPackForSparseQuantMatMul_B;
    gCoreFunc->MNNPackedSparseQuantMatMulEpx1 = MNNPackedSparseQuantMatMulEpx1;
    gCoreFunc->MNNPackedSparseQuantMatMulEpx4 = MNNPackedSparseQuantMatMulEpx4;
    gCoreFunc->MNNPackC4Int8ForMatMul_ASparse = _MNNPackC4Int8ForMatMul_ASparse;

    // pooling
    gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
    gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8;

    // Norm
    gCoreFunc->MNNNormInt8 = MNNNormInt8;

    // ReluWithSlopeChannel
    gCoreFunc->MNNReluWithSlopeChannelInt8 = MNNReluWithSlopeChannelInt8;

#if defined(__aarch64__)
    auto core = MNNGetCoreFunctions();
    if (core->supportSDot) {
        // MatMul
        gCoreFunc->Int8GemmKernel = MNNGemmInt8AddBiasScale_ARMV82_Unit;
        gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV82_Unit;
        gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitSdot;
        // Im2Col (sdot kernels use the fixed depth-4 packer, EP = 12)
        gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>;
        // ConvDepthwise
        gCoreFunc->ConvDepthwise3x3LineInt8_ARM82 = MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3;
    }
    if (core->supportI8mm) {
        // MatMul (i8mm takes priority over sdot when both are present)
        gCoreFunc->Int8GemmKernel = MNNGemmInt8AddBiasScale_ARMV86_Unit;
        gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV86_Unit;
        gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitI8mm;
        // Im2Col (EP = 20, LP = 8)
        gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<20, 8, 4>;
    }
#endif
    MNNInt8FunctionInit();
}

// Accessor for the registry above; returns nullptr if
// MNNCoreInt8FunctionInit has not run yet.
CoreInt8Functions* MNNGetInt8CoreFunctions() {
    return gCoreFunc;
}
};
|