2019-04-17 10:49:11 +08:00
//
// CommonOptFunction.cpp
// MNN
//
// Created by MNN on 2018/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
2020-01-15 13:33:47 +08:00
# include "CommonOptFunction.h"
2021-04-08 15:34:23 +08:00
# include "ConvOpt.h"
# include "WinogradOptFunction.hpp"
2019-04-17 10:49:11 +08:00
# include <string.h>
# include <algorithm>
2019-06-17 20:10:35 +08:00
# include <math.h>
2020-11-05 16:41:56 +08:00
# include "math/Vec.hpp"
2020-07-04 01:21:30 +08:00
# include <vector>
2021-06-11 17:17:13 +08:00
# include "../CPURuntime.hpp"
# include "core/MemoryFormater.h"
# include "core/OpCommonUtils.hpp"
// TODO: Find better way to optimize it
# include "../CPUBinary.hpp"
# include "../CPUUnary.hpp"
# include "../CPUPool.hpp"
# ifndef MNN_USE_SSE
/**
 * Stub for MNNInt8ToInt16, compiled only when MNN_USE_SSE is not defined.
 *
 * This build is not expected to call the function, so the body does nothing
 * except trip an assertion.
 *
 * @param dest   unused.
 * @param source unused.
 * @param count  unused.
 */
void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) {
    // None of the arguments are consumed by this stub.
    (void)dest;
    (void)source;
    (void)count;
    // Should not be called
    MNN_ASSERT(false);
}
# endif
/*
source : source matrix is h x l
transpose : if false , export compressed matrix as h x l , other export as l x h .
*/
void MNNPackForSparseMatMul_B ( float * dest , unsigned int * NNZMap , int * dataOffsetMap , int sparseBlockOC , const float * source , size_t h , size_t l , const int eP , bool transpose ) {
// 1. in convolution, source B layout is OC x (IC * KH * KW),
// the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (IC*KH*KW!=0), as a canceled result, just do BCSR, transpose should be false.
// 2. in ordinary sparse MatMul, transpose is corresponding to BCSR or BCSC
// BCSR
if ( transpose ) {
int rowOffset = 0 ;
for ( int i = 0 ; i < l ; i + = 1 ) {
* NNZMap = 0 ;
for ( int j = 0 ; j < h ; j + = sparseBlockOC ) {
if ( ! MNN : : OpCommonUtils : : checkAllZeros ( source + j * l + i , l , sparseBlockOC , 1 ) ) {
* dest = * ( source + j * l + l ) ;
dest + + ;
* NNZMap = * NNZMap + 1 ;
* dataOffsetMap = rowOffset ;
dataOffsetMap + + ;
rowOffset = 0 ;
}
rowOffset + = eP ;
}
NNZMap + + ;
rowOffset - = h * eP ;
}
} else { // BCSC
int columOffset = 0 ;
int i = 0 ;
for ( ; i + sparseBlockOC < = h ; i + = sparseBlockOC ) {
* NNZMap = 0 ;
for ( int j = 0 ; j < l ; j + = 1 ) {
if ( ! MNN : : OpCommonUtils : : checkAllZeros ( source , l , sparseBlockOC , 1 ) ) {
for ( int ioc = 0 ; ioc < sparseBlockOC ; ioc + + ) {
* dest = * ( source + ioc * l ) ;
dest + + ;
}
* NNZMap = * NNZMap + 1 ;
* dataOffsetMap = columOffset ;
dataOffsetMap + + ;
columOffset = 0 ;
}
columOffset + = eP ;
source + + ;
}
NNZMap + + ;
source + = l * ( sparseBlockOC - 1 ) ;
columOffset - = l * eP ;
}
for ( ; i < h ; i + + ) {
* NNZMap = 0 ;
for ( int j = 0 ; j < l ; j + + ) {
if ( * source ! = 0.0f ) {
* dest = * source ;
dest + + ;
* NNZMap = * NNZMap + 1 ;
* dataOffsetMap = columOffset ;
dataOffsetMap + + ;
columOffset = 0 ;
}
columOffset + = eP ;
source + + ;
}
NNZMap + + ;
columOffset - = l * eP ;
}
* dataOffsetMap = columOffset ; //
}
return ;
}
2021-04-08 15:34:23 +08:00
# ifndef MNN_USE_NEON
/**
 * Report the packing tile sizes for the dense packed MatMul.
 *
 * @param eP receives 16.
 * @param lP receives 1.
 * @param hP receives 4.
 *
 * The three outputs are written in the order eP, lP, hP.
 */
void MNNGetMatMulPackMode(int* eP, int* lP, int* hP) {
    constexpr int kTileE = 16;
    constexpr int kTileL = 1;
    constexpr int kTileH = 4;
    *eP = kTileE;
    *lP = kTileL;
    *hP = kTileH;
}
2021-06-11 17:17:13 +08:00
/**
 * Report the packing tile sizes for the sparse packed MatMul.
 *
 * @param eP receives 16.
 * @param lP receives 1.
 * @param hP receives 4.
 *
 * Per the original author's note, hP corresponds to the sparse block size along
 * the column dimension of the right-hand matrix, and that block size is 1 for
 * random (unstructured) sparsity.
 * NOTE(review): the value written here is always 4; confirm how callers handle
 * the block-size-1 case.
 */
void MNNGetSparseMatMulPackMode(int* eP, int* lP, int* hP) {
    static const int kPackMode[3] = {16, 1, 4};
    *eP = kPackMode[0];
    *lP = kPackMode[1];
    *hP = kPackMode[2];
}
2021-04-21 15:54:01 +08:00
void MNNPackForMatMul_B ( float * dest , const float * source , size_t h , size_t l , bool transpose ) {
2021-04-08 15:34:23 +08:00
auto hP = h / 4 ;
auto hR = hP * 4 ;
if ( hR ! = h ) {
2021-04-21 15:54:01 +08:00
: : memset ( dest , 0 , UP_DIV ( h , 4 ) * 4 * l * sizeof ( float ) ) ;
2021-04-08 15:34:23 +08:00
}
if ( ! transpose ) {
for ( int y = 0 ; y < hP ; + + y ) {
auto destY = dest + y * 4 * l ;
auto sourceY = source + y * 4 ;
for ( int x = 0 ; x < l ; + + x ) {
2021-04-21 15:54:01 +08:00
: : memcpy ( destY + 4 * x , sourceY + x * h , 4 * sizeof ( float ) ) ;
2021-04-08 15:34:23 +08:00
}
}
auto hRemain = h - hR ;
if ( hRemain > 0 ) {
auto destY = dest + hP * 4 * l ;
auto sourceY = source + hP * 4 ;
for ( int x = 0 ; x < l ; + + x ) {
2021-04-21 15:54:01 +08:00
: : memcpy ( destY + 4 * x , sourceY + x * h , hRemain * sizeof ( float ) ) ;
2021-04-08 15:34:23 +08:00
}
}
return ;
}
MNNPackC4 ( dest , source , l , h ) ;
}
2021-06-11 17:17:13 +08:00
/**
 * Reference packed MatMul for eSize columns of A.
 *
 * For every x in [0, eSize), y in [0, UP_DIV(h, 4)) and v in [0, 4):
 *     C[y * cStride + 4 * x + v] =
 *         clamp(bias[4 * y + v] + sum_z B[y * bStride + 4 * z + v] * A[x + z * aStride])
 * where the bias term is 0 when bias is null and clamp limits the value to
 * [minValue, maxValue].
 *
 * @param C              output, written in groups of 4 floats.
 * @param A              left operand; consecutive z-steps are aStride floats apart.
 * @param B              right operand, 4 floats per z-step.
 * @param eSize          number of columns of A to process.
 * @param parameter      [1] = l, [2] = h, [3] = stride of C in bytes,
 *                       [5] = extra stride of B in bytes (added to l * 4 floats).
 *                       Other entries are not read here.
 * @param postParameters optional; [2] is the lower clamp bound and [3] the upper
 *                       one. Entries [0] and [1] are not used by this routine.
 *                       When null, the bounds are -/+ the largest finite float.
 * @param bias           optional, indexed by 4 * y + v.
 * @param aStride        distance in floats between consecutive z-steps of A.
 *
 * Every output element is assigned exactly once below, so C is not cleared first.
 */
static void _MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, int aStride) {
    const auto h       = parameter[2];
    const auto l       = parameter[1];
    const auto cStride = parameter[3] / sizeof(float);
    const auto bStride = parameter[5] / sizeof(float) + l * 4;
    const auto hC4     = UP_DIV(h, 4);

    float minValue = -std::numeric_limits<float>().max();
    float maxValue = std::numeric_limits<float>().max();
    if (nullptr != postParameters) {
        minValue = postParameters[2];
        maxValue = postParameters[3];
    }

    for (size_t x = 0; x < eSize; ++x) {
        float* dst       = C + 4 * x;
        const float* src = A + x;
        for (int y = 0; y < hC4; ++y) {
            float* dstY         = dst + y * cStride;
            const float* weight = B + y * bStride;

            // Start each lane from its bias, or from zero when there is none.
            float summer[4] = {0.0f, 0.0f, 0.0f, 0.0f};
            if (nullptr != bias) {
                for (int v = 0; v < 4; ++v) {
                    summer[v] = bias[4 * y + v];
                }
            }

            // One scalar of A is multiplied into four weights per z-step.
            const float* aZ = src;
            const float* wZ = weight;
            for (size_t z = 0; z < l; ++z) {
                const float aValue = aZ[0];
                summer[0] += wZ[0] * aValue;
                summer[1] += wZ[1] * aValue;
                summer[2] += wZ[2] * aValue;
                summer[3] += wZ[3] * aValue;
                aZ += aStride;
                wZ += 4;
            }

            // Upper bound first, then lower bound (argument order kept as-is).
            for (int v = 0; v < 4; ++v) {
                auto dstValue = std::min(summer[v], maxValue);
                dstValue      = std::max(dstValue, minValue);
                dstY[v]       = dstValue;
            }
        }
    }
}
2021-06-11 17:17:13 +08:00
/**
 * Packed MatMul for a full tile: forwards to _MNNPackedMatMulRemain with
 * eSize = 16 and an A stride of 16 floats.
 *
 * @param C              output buffer.
 * @param A              left operand.
 * @param B              right operand.
 * @param parameter      size/stride table, passed through unchanged.
 * @param postParameters optional clamp parameters, passed through unchanged.
 * @param bias           optional bias, passed through unchanged.
 */
void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias) {
    const size_t fullTileE  = 16;
    const int packedAStride = 16;
    _MNNPackedMatMulRemain(C, A, B, fullTileE, parameter, postParameters, bias, packedAStride);
}
/**
 * Packed MatMul for a partial tile of eSize columns.
 *
 * The A stride is taken from parameter[0], which is given in bytes and converted
 * to a float count before forwarding to _MNNPackedMatMulRemain.
 *
 * @param C              output buffer.
 * @param A              left operand.
 * @param B              right operand.
 * @param eSize          number of columns of A to process.
 * @param parameter      size/stride table; [0] is the A stride in bytes.
 * @param postParameters optional clamp parameters, passed through unchanged.
 * @param bias           optional bias, passed through unchanged.
 */
void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) {
    const size_t aStrideInFloats = parameter[0] / sizeof(float);
    _MNNPackedMatMulRemain(C, A, B, eSize, parameter, postParameters, bias, static_cast<int>(aStrideInFloats));
}
2021-04-08 15:34:23 +08:00
/**
 * Gather several 4-packed source regions into the packed A buffer.
 *
 * info = { number of regions, eReal, eDest, offset }.
 * el holds four ints per region: { e, l, eOffset, lOffset }.
 *
 * For region n, every y in [0, e) and x in [0, l):
 *     dest[x * eDest + (y % eDest)] =
 *         sourceGroup[n][(x / 4) * eReal * 4 + y * 4 * offset + (x % 4)]
 * with dest = destOrigin + lOffset * eDest + eOffset.
 *
 * @param destOrigin  start of the destination buffer.
 * @param sourceGroup one source pointer per region.
 * @param info        four global ints, see above.
 * @param el          four ints per region, see above.
 */
void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) {
    const int regionCount = info[0];
    const int srcPlaneE   = info[1];
    const int dstRowE     = info[2];
    const int srcStepE    = info[3];

    for (int region = 0; region < regionCount; ++region) {
        const int32_t* desc = el + 4 * region;
        const int eCount    = desc[0];
        const int lCount    = desc[1];
        float* out          = destOrigin + desc[3] * dstRowE + desc[2];
        const float* in     = sourceGroup[region];

        for (int row = 0; row < eCount; ++row) {
            // Destination column wraps around within one row of dstRowE entries.
            float* outColumn   = out + row % dstRowE;
            const float* inRow = in + row * 4 * srcStepE;
            for (int k = 0; k < lCount; ++k) {
                // k / 4 selects the 4-channel plane, k % 4 the lane inside it.
                outColumn[k * dstRowE] = inRow[(k / 4) * srcPlaneE * 4 + (k % 4)];
            }
        }
    }
}
2021-06-11 17:17:13 +08:00
/**
 * Compute one tile of kWidth output columns for all h output rows.
 *
 * The sparse streams are consumed from their start: the first entry of
 * dataOffsetMap positions `a`, then for each row NNZMap gives the number of
 * stored weights, and each stored weight is followed by one more offset that
 * moves `a` to the next position.
 *
 * @return the position of `a` after the last offset was applied, so the caller
 *         can continue from there.
 */
template <int kWidth>
static const float* MNNSparseEpx1Tile(float* blockC, const float* a, const float* B, size_t h, size_t cStride,
                                      const float* bias, const unsigned int* NNZMap, const int* dataOffsetMap,
                                      float minValue, float maxValue) {
    const int* dataOffset   = dataOffsetMap;
    a += *dataOffset++;
    const float* w          = B;
    const unsigned int* nnz = NNZMap;

    for (size_t ih = 0; ih < h; ++ih) {
        // Row ih is stored in 4-packed group ih / 4, at lane ih % 4.
        float* c = blockC + (ih >> 2) * cStride + (ih & 0x03);
        const float initValue = (nullptr != bias) ? bias[ih] : 0;

        float acc[kWidth];
        for (int k = 0; k < kWidth; ++k) {
            acc[k] = initValue;
        }

        const int lElement = *nnz++;
        for (int il = 0; il < lElement; ++il) {
            const int diff   = *dataOffset++;
            const float oneW = *w++;
            for (int k = 0; k < kWidth; ++k) {
                acc[k] += a[k] * oneW;
            }
            // Move only after all kWidth inputs of this step have been used.
            a += diff;
        }

        // Upper bound first, then lower bound (argument order kept as-is);
        // neighbouring columns are 4 floats apart in C.
        for (int k = 0; k < kWidth; ++k) {
            c[4 * k] = std::max(std::min(maxValue, acc[k]), minValue);
        }
    }
    return a;
}

/**
 * Sparse packed MatMul with a sparse block size of 1 (one weight per entry).
 *
 * The e dimension is processed in tiles: 16-wide tiles while the loop condition
 * below holds, then at most one tile each of width 8, 4, 2 and 1, chosen by the
 * low bits of eSize. Each tile walks the complete sparse weight stream.
 *
 * @param C              output; column e of row ih is at
 *                       C[(ih / 4) * cStride + 4 * e + (ih % 4)].
 * @param A              packed left operand.
 * @param B              stored (non-zero) weights, read sequentially.
 * @param eSize          number of columns to compute.
 * @param parameter      [0] = eP in bytes, [1] = l, [2] = h, [3] = stride of C in
 *                       bytes. Other entries are not read here.
 * @param postParameters optional; [2] is the lower clamp bound, [3] the upper one.
 *                       When null, the bounds are -/+ the largest finite float.
 * @param bias           optional, one value per output row.
 * @param NNZMap         number of stored weights for each output row.
 * @param dataOffsetMap  offsets applied to the A pointer: one leading entry, then
 *                       one entry per stored weight.
 */
void MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap) {
    const size_t eP = parameter[0] / sizeof(float);
    MNN_ASSERT((eP & 0x03) == 0); // In sparse calculate, eP should be evenly divided by 4
    const size_t h       = parameter[2];
    const size_t l       = parameter[1];
    const size_t cStride = parameter[3] / sizeof(float);
    const size_t aStride = eP * l;

    float minValue = -std::numeric_limits<float>().max();
    float maxValue = std::numeric_limits<float>().max();
    if (nullptr != postParameters) {
        minValue = postParameters[2];
        maxValue = postParameters[3];
    }

    const float* a = A;
    size_t ie      = 0;

    // Full tiles. The tile body is always 16 wide, independent of eP.
    for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
        a = MNNSparseEpx1Tile<16>(C + (ie << 2), a, B, h, cStride, bias, NNZMap, dataOffsetMap, minValue, maxValue);
        a += aStride;
    }

    // Remainder tiles; `a` and `ie` carry over from one tile to the next.
    if (eSize & 0x08) {
        a = MNNSparseEpx1Tile<8>(C + (ie << 2), a, B, h, cStride, bias, NNZMap, dataOffsetMap, minValue, maxValue);
        ie += 8;
        a += 8;
    }
    if (eSize & 0x04) {
        a = MNNSparseEpx1Tile<4>(C + (ie << 2), a, B, h, cStride, bias, NNZMap, dataOffsetMap, minValue, maxValue);
        ie += 4;
        a += 4;
    }
    if (eSize & 0x02) {
        a = MNNSparseEpx1Tile<2>(C + (ie << 2), a, B, h, cStride, bias, NNZMap, dataOffsetMap, minValue, maxValue);
        ie += 2;
        a += 2;
    }
    if (eSize & 0x01) {
        // Last tile: nothing follows, so the returned position is not needed.
        MNNSparseEpx1Tile<1>(C + (ie << 2), a, B, h, cStride, bias, NNZMap, dataOffsetMap, minValue, maxValue);
    }
}
void MNNPackedSparseMatMulEpx4 ( float * C , const float * A , const float * B , size_t eSize , const size_t * parameter , const float * postParameters , const float * bias , unsigned int * NNZMap , int * dataOffsetMap ) {
auto eP = parameter [ 0 ] / sizeof ( float ) ;
MNN_ASSERT ( ( eP & 0x03 ) = = 0 ) ; // In sparse calculate, eP should be evenly divided by 4
auto h = parameter [ 2 ] ;
auto l = parameter [ 1 ] ;
auto cStride = parameter [ 3 ] / sizeof ( float ) ;
auto aStride = eP * l ;
auto hRemain = parameter [ 4 ] ;
auto bExtraStride = parameter [ 5 ] / sizeof ( float ) ;
auto bStride = bExtraStride + l * 4 ;
auto hC4 = UP_DIV ( h , 4 ) ;
float minValue = - std : : numeric_limits < float > ( ) . max ( ) ;
float maxValue = std : : numeric_limits < float > ( ) . max ( ) ;
if ( nullptr ! = postParameters ) {
minValue = postParameters [ 2 ] ;
maxValue = postParameters [ 3 ] ;
}
// MNN_PRINT("MNNPackedSparseMatMul 16x4 eP:%lu, eSize:%lu, l:%lu, h:%lu, cStride:%lu, aStride:%lu\n", eP, eSize, l, h, cStride, aStride);
const int sparseBlockOC = 4 ;
const float * a = A ;
size_t ie = 0 ;
for ( ie = 0 ; ie < eSize & & eP < = eSize ; ie + = eP ) {
const int * dataOffset = dataOffsetMap ;
const int diff = * dataOffset + + ;
a + = diff ;
const float * w = B ;
float * blockC = C + ( ie < < 2 ) ;
const unsigned int * nnz = NNZMap ;
size_t ih = 0 ;
for ( ; ih < ( h & ( ~ 0x03 ) ) ; ih + = sparseBlockOC ) {
auto ihPack = ih > > 2 ;
auto c = blockC + ihPack * cStride ;
float initValue [ 4 ] = { 0 , 0 , 0 , 0 } ;
if ( nullptr ! = bias ) {
memcpy ( initValue , bias + ih , 4 * sizeof ( float ) ) ;
}
float acc0 [ 4 ] ;
float acc1 [ 4 ] ;
float acc2 [ 4 ] ;
float acc3 [ 4 ] ;
float acc4 [ 4 ] ;
float acc5 [ 4 ] ;
float acc6 [ 4 ] ;
float acc7 [ 4 ] ;
float acc8 [ 4 ] ;
float acc9 [ 4 ] ;
float acc10 [ 4 ] ;
float acc11 [ 4 ] ;
float acc12 [ 4 ] ;
float acc13 [ 4 ] ;
float acc14 [ 4 ] ;
float acc15 [ 4 ] ;
memcpy ( acc0 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc1 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc2 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc3 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc4 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc5 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc6 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc7 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc8 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc9 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc10 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc11 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc12 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc13 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc14 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc15 , initValue , 4 * sizeof ( float ) ) ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float a2 = a [ 2 ] ;
const float a3 = a [ 3 ] ;
const float a4 = a [ 4 ] ;
const float a5 = a [ 5 ] ;
const float a6 = a [ 6 ] ;
const float a7 = a [ 7 ] ;
const float a8 = a [ 8 ] ;
const float a9 = a [ 9 ] ;
const float a10 = a [ 10 ] ;
const float a11 = a [ 11 ] ;
const float a12 = a [ 12 ] ;
const float a13 = a [ 13 ] ;
const float a14 = a [ 14 ] ;
const float a15 = a [ 15 ] ;
const float wv [ 4 ] = { * w + + , * w + + , * w + + , * w + + } ;
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {16});
// MNN_PRINT("\n");
a = a + diff ;
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] + = a0 * wv [ lane ] ;
acc1 [ lane ] + = a1 * wv [ lane ] ;
acc2 [ lane ] + = a2 * wv [ lane ] ;
acc3 [ lane ] + = a3 * wv [ lane ] ;
acc4 [ lane ] + = a4 * wv [ lane ] ;
acc5 [ lane ] + = a5 * wv [ lane ] ;
acc6 [ lane ] + = a6 * wv [ lane ] ;
acc7 [ lane ] + = a7 * wv [ lane ] ;
acc8 [ lane ] + = a8 * wv [ lane ] ;
acc9 [ lane ] + = a9 * wv [ lane ] ;
acc10 [ lane ] + = a10 * wv [ lane ] ;
acc11 [ lane ] + = a11 * wv [ lane ] ;
acc12 [ lane ] + = a12 * wv [ lane ] ;
acc13 [ lane ] + = a13 * wv [ lane ] ;
acc14 [ lane ] + = a14 * wv [ lane ] ;
acc15 [ lane ] + = a15 * wv [ lane ] ;
}
}
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] = std : : max ( std : : min ( maxValue , acc0 [ lane ] ) , minValue ) ;
acc1 [ lane ] = std : : max ( std : : min ( maxValue , acc1 [ lane ] ) , minValue ) ;
acc2 [ lane ] = std : : max ( std : : min ( maxValue , acc2 [ lane ] ) , minValue ) ;
acc3 [ lane ] = std : : max ( std : : min ( maxValue , acc3 [ lane ] ) , minValue ) ;
acc4 [ lane ] = std : : max ( std : : min ( maxValue , acc4 [ lane ] ) , minValue ) ;
acc5 [ lane ] = std : : max ( std : : min ( maxValue , acc5 [ lane ] ) , minValue ) ;
acc6 [ lane ] = std : : max ( std : : min ( maxValue , acc6 [ lane ] ) , minValue ) ;
acc7 [ lane ] = std : : max ( std : : min ( maxValue , acc7 [ lane ] ) , minValue ) ;
acc8 [ lane ] = std : : max ( std : : min ( maxValue , acc8 [ lane ] ) , minValue ) ;
acc9 [ lane ] = std : : max ( std : : min ( maxValue , acc9 [ lane ] ) , minValue ) ;
acc10 [ lane ] = std : : max ( std : : min ( maxValue , acc10 [ lane ] ) , minValue ) ;
acc11 [ lane ] = std : : max ( std : : min ( maxValue , acc11 [ lane ] ) , minValue ) ;
acc12 [ lane ] = std : : max ( std : : min ( maxValue , acc12 [ lane ] ) , minValue ) ;
acc13 [ lane ] = std : : max ( std : : min ( maxValue , acc13 [ lane ] ) , minValue ) ;
acc14 [ lane ] = std : : max ( std : : min ( maxValue , acc14 [ lane ] ) , minValue ) ;
acc15 [ lane ] = std : : max ( std : : min ( maxValue , acc15 [ lane ] ) , minValue ) ;
}
memcpy ( c , acc0 , 4 * sizeof ( float ) ) ; // store continuous c
memcpy ( c + 4 , acc1 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 2 , acc2 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 3 , acc3 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 4 , acc4 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 5 , acc5 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 6 , acc6 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 7 , acc7 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 8 , acc8 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 9 , acc9 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 10 , acc10 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 11 , acc11 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 12 , acc12 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 13 , acc13 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 14 , acc14 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 15 , acc15 , 4 * sizeof ( float ) ) ;
}
blockC + = ( h > > 2 ) * cStride ;
for ( ; ih < h ; ih + + ) {
auto ihSubIndex = ih & 0x03 ;
auto c = blockC + ihSubIndex ;
const float initValue = nullptr ! = bias ? bias [ ih ] : 0 ;
float acc0 = initValue ;
float acc1 = initValue ;
float acc2 = initValue ;
float acc3 = initValue ;
float acc4 = initValue ;
float acc5 = initValue ;
float acc6 = initValue ;
float acc7 = initValue ;
float acc8 = initValue ;
float acc9 = initValue ;
float acc10 = initValue ;
float acc11 = initValue ;
float acc12 = initValue ;
float acc13 = initValue ;
float acc14 = initValue ;
float acc15 = initValue ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float a2 = a [ 2 ] ;
const float a3 = a [ 3 ] ;
const float a4 = a [ 4 ] ;
const float a5 = a [ 5 ] ;
const float a6 = a [ 6 ] ;
const float a7 = a [ 7 ] ;
const float a8 = a [ 8 ] ;
const float a9 = a [ 9 ] ;
const float a10 = a [ 10 ] ;
const float a11 = a [ 11 ] ;
const float a12 = a [ 12 ] ;
const float a13 = a [ 13 ] ;
const float a14 = a [ 14 ] ;
const float a15 = a [ 15 ] ;
const float oneW = * w + + ;
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {16});
// MNN_PRINT("\n");
a = a + diff ;
acc0 + = a0 * oneW ;
acc1 + = a1 * oneW ;
acc2 + = a2 * oneW ;
acc3 + = a3 * oneW ;
acc4 + = a4 * oneW ;
acc5 + = a5 * oneW ;
acc6 + = a6 * oneW ;
acc7 + = a7 * oneW ;
acc8 + = a8 * oneW ;
acc9 + = a9 * oneW ;
acc10 + = a10 * oneW ;
acc11 + = a11 * oneW ;
acc12 + = a12 * oneW ;
acc13 + = a13 * oneW ;
acc14 + = a14 * oneW ;
acc15 + = a15 * oneW ;
}
acc0 = std : : max ( std : : min ( maxValue , acc0 ) , minValue ) ;
acc1 = std : : max ( std : : min ( maxValue , acc1 ) , minValue ) ;
acc2 = std : : max ( std : : min ( maxValue , acc2 ) , minValue ) ;
acc3 = std : : max ( std : : min ( maxValue , acc3 ) , minValue ) ;
acc4 = std : : max ( std : : min ( maxValue , acc4 ) , minValue ) ;
acc5 = std : : max ( std : : min ( maxValue , acc5 ) , minValue ) ;
acc6 = std : : max ( std : : min ( maxValue , acc6 ) , minValue ) ;
acc7 = std : : max ( std : : min ( maxValue , acc7 ) , minValue ) ;
acc8 = std : : max ( std : : min ( maxValue , acc8 ) , minValue ) ;
acc9 = std : : max ( std : : min ( maxValue , acc9 ) , minValue ) ;
acc10 = std : : max ( std : : min ( maxValue , acc10 ) , minValue ) ;
acc11 = std : : max ( std : : min ( maxValue , acc11 ) , minValue ) ;
acc12 = std : : max ( std : : min ( maxValue , acc12 ) , minValue ) ;
acc13 = std : : max ( std : : min ( maxValue , acc13 ) , minValue ) ;
acc14 = std : : max ( std : : min ( maxValue , acc14 ) , minValue ) ;
acc15 = std : : max ( std : : min ( maxValue , acc15 ) , minValue ) ;
// how to store faster: st4 / transpose /
c [ 0 ] = acc0 ;
c [ 4 ] = acc1 ;
c [ 4 * 2 ] = acc2 ;
c [ 4 * 3 ] = acc3 ;
c [ 4 * 4 ] = acc4 ;
c [ 4 * 5 ] = acc5 ;
c [ 4 * 6 ] = acc6 ;
c [ 4 * 7 ] = acc7 ;
c [ 4 * 8 ] = acc8 ;
c [ 4 * 9 ] = acc9 ;
c [ 4 * 10 ] = acc10 ;
c [ 4 * 11 ] = acc11 ;
c [ 4 * 12 ] = acc12 ;
c [ 4 * 13 ] = acc13 ;
c [ 4 * 14 ] = acc14 ;
c [ 4 * 15 ] = acc15 ;
}
a + = aStride ;
}
// const float* blockA = A + ie * l;
if ( eSize & 0x08 ) {
const int * dataOffset = dataOffsetMap ;
const int diff = * dataOffset + + ;
// a = blockA + diff;
a + = diff ;
const float * w = B ;
float * blockC = C + ( ie < < 2 ) ;
const unsigned int * nnz = NNZMap ;
size_t ih = 0 ;
for ( ; ih < ( h & ( ~ 0x03 ) ) ; ih + = sparseBlockOC ) {
auto ihPack = ih > > 2 ;
auto c = blockC + ihPack * cStride ;
float initValue [ 4 ] = { 0 , 0 , 0 , 0 } ;
if ( nullptr ! = bias ) {
memcpy ( initValue , bias + ih , 4 * sizeof ( float ) ) ;
}
float acc0 [ 4 ] ;
float acc1 [ 4 ] ;
float acc2 [ 4 ] ;
float acc3 [ 4 ] ;
float acc4 [ 4 ] ;
float acc5 [ 4 ] ;
float acc6 [ 4 ] ;
float acc7 [ 4 ] ;
memcpy ( acc0 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc1 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc2 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc3 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc4 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc5 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc6 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc7 , initValue , 4 * sizeof ( float ) ) ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float a2 = a [ 2 ] ;
const float a3 = a [ 3 ] ;
const float a4 = a [ 4 ] ;
const float a5 = a [ 5 ] ;
const float a6 = a [ 6 ] ;
const float a7 = a [ 7 ] ;
const float wv [ 4 ] = { * w + + , * w + + , * w + + , * w + + } ;
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {16});
// MNN_PRINT("\n");
a = a + diff ;
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] + = a0 * wv [ lane ] ;
acc1 [ lane ] + = a1 * wv [ lane ] ;
acc2 [ lane ] + = a2 * wv [ lane ] ;
acc3 [ lane ] + = a3 * wv [ lane ] ;
acc4 [ lane ] + = a4 * wv [ lane ] ;
acc5 [ lane ] + = a5 * wv [ lane ] ;
acc6 [ lane ] + = a6 * wv [ lane ] ;
acc7 [ lane ] + = a7 * wv [ lane ] ;
}
}
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] = std : : max ( std : : min ( maxValue , acc0 [ lane ] ) , minValue ) ;
acc1 [ lane ] = std : : max ( std : : min ( maxValue , acc1 [ lane ] ) , minValue ) ;
acc2 [ lane ] = std : : max ( std : : min ( maxValue , acc2 [ lane ] ) , minValue ) ;
acc3 [ lane ] = std : : max ( std : : min ( maxValue , acc3 [ lane ] ) , minValue ) ;
acc4 [ lane ] = std : : max ( std : : min ( maxValue , acc4 [ lane ] ) , minValue ) ;
acc5 [ lane ] = std : : max ( std : : min ( maxValue , acc5 [ lane ] ) , minValue ) ;
acc6 [ lane ] = std : : max ( std : : min ( maxValue , acc6 [ lane ] ) , minValue ) ;
acc7 [ lane ] = std : : max ( std : : min ( maxValue , acc7 [ lane ] ) , minValue ) ;
}
memcpy ( c , acc0 , 4 * sizeof ( float ) ) ; // store continuous c
memcpy ( c + 4 , acc1 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 2 , acc2 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 3 , acc3 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 4 , acc4 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 5 , acc5 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 6 , acc6 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 7 , acc7 , 4 * sizeof ( float ) ) ;
}
blockC + = ( ih > > 2 ) * cStride ;
for ( ; ih < h ; ih + + ) {
auto ihSubIndex = ih & 0x03 ;
auto c = blockC + ihSubIndex ;
const float initValue = nullptr ! = bias ? bias [ ih ] : 0 ;
float acc0 = initValue ;
float acc1 = initValue ;
float acc2 = initValue ;
float acc3 = initValue ;
float acc4 = initValue ;
float acc5 = initValue ;
float acc6 = initValue ;
float acc7 = initValue ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float a2 = a [ 2 ] ;
const float a3 = a [ 3 ] ;
const float a4 = a [ 4 ] ;
const float a5 = a [ 5 ] ;
const float a6 = a [ 6 ] ;
const float a7 = a [ 7 ] ;
const float oneW = * w + + ;
// MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-7]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {8});
// MNN_PRINT("\n");
a = a + diff ;
acc0 + = a0 * oneW ;
acc1 + = a1 * oneW ;
acc2 + = a2 * oneW ;
acc3 + = a3 * oneW ;
acc4 + = a4 * oneW ;
acc5 + = a5 * oneW ;
acc6 + = a6 * oneW ;
acc7 + = a7 * oneW ;
}
acc0 = std : : max ( std : : min ( maxValue , acc0 ) , minValue ) ;
acc1 = std : : max ( std : : min ( maxValue , acc1 ) , minValue ) ;
acc2 = std : : max ( std : : min ( maxValue , acc2 ) , minValue ) ;
acc3 = std : : max ( std : : min ( maxValue , acc3 ) , minValue ) ;
acc4 = std : : max ( std : : min ( maxValue , acc4 ) , minValue ) ;
acc5 = std : : max ( std : : min ( maxValue , acc5 ) , minValue ) ;
acc6 = std : : max ( std : : min ( maxValue , acc6 ) , minValue ) ;
acc7 = std : : max ( std : : min ( maxValue , acc7 ) , minValue ) ;
// how to store faster: st4 / transpose /
c [ 0 ] = acc0 ;
c [ 4 ] = acc1 ;
c [ 4 * 2 ] = acc2 ;
c [ 4 * 3 ] = acc3 ;
c [ 4 * 4 ] = acc4 ;
c [ 4 * 5 ] = acc5 ;
c [ 4 * 6 ] = acc6 ;
c [ 4 * 7 ] = acc7 ;
}
ie + = 8 ;
a + = 8 ;
}
if ( eSize & 0x04 ) {
const int * dataOffset = dataOffsetMap ;
const int diff = * dataOffset + + ;
// const float* a = blockA + diff;
a + = diff ;
const float * w = B ;
float * blockC = C + ( ie < < 2 ) ;
const unsigned int * nnz = NNZMap ;
size_t ih = 0 ;
for ( ; ih < ( h & ( ~ 0x03 ) ) ; ih + = sparseBlockOC ) {
auto ihPack = ih > > 2 ;
auto c = blockC + ihPack * cStride ;
float initValue [ 4 ] = { 0 , 0 , 0 , 0 } ;
if ( nullptr ! = bias ) {
memcpy ( initValue , bias + ih , 4 * sizeof ( float ) ) ;
}
float acc0 [ 4 ] ;
float acc1 [ 4 ] ;
float acc2 [ 4 ] ;
float acc3 [ 4 ] ;
memcpy ( acc0 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc1 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc2 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc3 , initValue , 4 * sizeof ( float ) ) ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float a2 = a [ 2 ] ;
const float a3 = a [ 3 ] ;
const float wv [ 4 ] = { * w + + , * w + + , * w + + , * w + + } ;
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {16});
// MNN_PRINT("\n");
a = a + diff ;
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] + = a0 * wv [ lane ] ;
acc1 [ lane ] + = a1 * wv [ lane ] ;
acc2 [ lane ] + = a2 * wv [ lane ] ;
acc3 [ lane ] + = a3 * wv [ lane ] ;
}
}
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] = std : : max ( std : : min ( maxValue , acc0 [ lane ] ) , minValue ) ;
acc1 [ lane ] = std : : max ( std : : min ( maxValue , acc1 [ lane ] ) , minValue ) ;
acc2 [ lane ] = std : : max ( std : : min ( maxValue , acc2 [ lane ] ) , minValue ) ;
acc3 [ lane ] = std : : max ( std : : min ( maxValue , acc3 [ lane ] ) , minValue ) ;
}
memcpy ( c , acc0 , 4 * sizeof ( float ) ) ; // store continuous c
memcpy ( c + 4 , acc1 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 2 , acc2 , 4 * sizeof ( float ) ) ;
memcpy ( c + 4 * 3 , acc3 , 4 * sizeof ( float ) ) ;
}
blockC + = ( ih > > 2 ) * cStride ;
for ( ; ih < h ; ih + + ) {
auto ihSubIndex = ih & 0x03 ;
auto c = blockC + ihSubIndex ;
const float initValue = nullptr ! = bias ? bias [ ih ] : 0 ;
float acc0 = initValue ;
float acc1 = initValue ;
float acc2 = initValue ;
float acc3 = initValue ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float a2 = a [ 2 ] ;
const float a3 = a [ 3 ] ;
const float oneW = * w + + ;
// MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-3]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {4});
// MNN_PRINT("\n");
a = a + diff ;
acc0 + = a0 * oneW ;
acc1 + = a1 * oneW ;
acc2 + = a2 * oneW ;
acc3 + = a3 * oneW ;
}
acc0 = std : : max ( std : : min ( maxValue , acc0 ) , minValue ) ;
acc1 = std : : max ( std : : min ( maxValue , acc1 ) , minValue ) ;
acc2 = std : : max ( std : : min ( maxValue , acc2 ) , minValue ) ;
acc3 = std : : max ( std : : min ( maxValue , acc3 ) , minValue ) ;
// how to store faster: st4 / transpose /
c [ 0 ] = acc0 ;
c [ 4 ] = acc1 ;
c [ 4 * 2 ] = acc2 ;
c [ 4 * 3 ] = acc3 ;
}
ie + = 4 ;
a + = 4 ;
}
if ( eSize & 0x02 ) {
const int * dataOffset = dataOffsetMap ;
const int diff = * dataOffset + + ;
// const float* a = blockA + diff;
a + = diff ;
const float * w = B ;
float * blockC = C + ( ie < < 2 ) ;
const unsigned int * nnz = NNZMap ;
size_t ih = 0 ;
for ( ; ih < ( h & ( ~ 0x03 ) ) ; ih + = sparseBlockOC ) {
auto ihPack = ih > > 2 ;
auto c = blockC + ihPack * cStride ;
float initValue [ 4 ] = { 0 , 0 , 0 , 0 } ;
if ( nullptr ! = bias ) {
memcpy ( initValue , bias + ih , 4 * sizeof ( float ) ) ;
}
float acc0 [ 4 ] ;
float acc1 [ 4 ] ;
memcpy ( acc0 , initValue , 4 * sizeof ( float ) ) ;
memcpy ( acc1 , initValue , 4 * sizeof ( float ) ) ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float wv [ 4 ] = { * w + + , * w + + , * w + + , * w + + } ;
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {16});
// MNN_PRINT("\n");
a = a + diff ;
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] + = a0 * wv [ lane ] ;
acc1 [ lane ] + = a1 * wv [ lane ] ;
}
}
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] = std : : max ( std : : min ( maxValue , acc0 [ lane ] ) , minValue ) ;
acc1 [ lane ] = std : : max ( std : : min ( maxValue , acc1 [ lane ] ) , minValue ) ;
}
memcpy ( c , acc0 , 4 * sizeof ( float ) ) ; // store continuous c
memcpy ( c + 4 , acc1 , 4 * sizeof ( float ) ) ;
}
blockC + = ( ih > > 2 ) * cStride ;
for ( ; ih < h ; ih + + ) {
auto ihPack = ih > > 2 ;
auto ihSubIndex = ih & 0x03 ;
auto c = blockC + ihSubIndex ;
const float initValue = nullptr ! = bias ? bias [ ih ] : 0 ;
float acc0 = initValue ;
float acc1 = initValue ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float a1 = a [ 1 ] ;
const float oneW = * w + + ;
// MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-1]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {2});
// MNN_PRINT("\n");
a = a + diff ;
acc0 + = a0 * oneW ;
acc1 + = a1 * oneW ;
}
acc0 = std : : max ( std : : min ( maxValue , acc0 ) , minValue ) ;
acc1 = std : : max ( std : : min ( maxValue , acc1 ) , minValue ) ;
// how to store faster: st4 / transpose /
c [ 0 ] = acc0 ;
c [ 4 ] = acc1 ;
}
ie + = 2 ;
a + = 2 ;
}
if ( eSize & 0x01 ) {
const int * dataOffset = dataOffsetMap ;
const int diff = * dataOffset + + ;
// const float* a = blockA + diff;
a + = diff ;
const float * w = B ;
float * blockC = C + ( ie < < 2 ) ;
const unsigned int * nnz = NNZMap ;
size_t ih = 0 ;
for ( ; ih < ( h & ( ~ 0x03 ) ) ; ih + = sparseBlockOC ) {
auto ihPack = ih > > 2 ;
auto c = blockC + ihPack * cStride ;
float initValue [ 4 ] = { 0 , 0 , 0 , 0 } ;
if ( nullptr ! = bias ) {
memcpy ( initValue , bias + ih , 4 * sizeof ( float ) ) ;
}
float acc0 [ 4 ] ;
memcpy ( acc0 , initValue , 4 * sizeof ( float ) ) ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float wv [ 4 ] = { * w + + , * w + + , * w + + , * w + + } ;
// MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {16});
// MNN_PRINT("\n");
a = a + diff ;
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] + = a0 * wv [ lane ] ;
}
}
for ( int lane = 0 ; lane < 4 ; lane + + ) {
acc0 [ lane ] = std : : max ( std : : min ( maxValue , acc0 [ lane ] ) , minValue ) ;
}
memcpy ( c , acc0 , 4 * sizeof ( float ) ) ; // store continuous c
}
blockC + = ( ih > > 2 ) * cStride ;
for ( ; ih < h ; ih + + ) {
auto ihSubIndex = ih & 0x03 ;
auto c = blockC + ihSubIndex ;
const float initValue = nullptr ! = bias ? bias [ ih ] : 0 ;
float acc0 = initValue ;
const int lElement = * nnz + + ;
for ( auto il = 0 ; il < lElement ; il + + ) {
const int diff = * dataOffset + + ;
const float a0 = a [ 0 ] ;
const float oneW = * w + + ;
// MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%f, a value[0]:", ie, a - A, w - B - 1, c - C, oneW);
// formatMatrix(a, {1});
// MNN_PRINT("\n");
a = a + diff ;
acc0 + = a0 * oneW ;
}
acc0 = std : : max ( std : : min ( maxValue , acc0 ) , minValue ) ;
// how to store faster: st4 / transpose /
c [ 0 ] = acc0 ;
}
ie + = 1 ;
// a += 1;
}
return ;
}
2021-04-08 15:34:23 +08:00
# endif
2020-07-04 01:21:30 +08:00
2020-04-10 14:44:01 +08:00
# ifndef MNN_USE_SSE
2020-11-05 16:41:56 +08:00
# ifndef MNN_USE_NEON
// Transposes a matrix of 32-bit elements.
// dim holds {w, h, srcStride, dstStride}. For every row in [0, h) and col in [0, w):
//     dstO[row * dstStride + col] = srcO[col * srcStride + row]
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
    const int cols    = dim[0];
    const int rows    = dim[1];
    const int srcStep = dim[2];
    const int dstStep = dim[3];
    for (int row = 0; row < rows; ++row) {
        // One destination row is gathered from one source column.
        int32_t* dstRow       = dstO + row * dstStep;
        const int32_t* srcCol = srcO + row;
        for (int col = 0; col < cols; ++col) {
            dstRow[col] = srcCol[col * srcStep];
        }
    }
}
# endif
2020-07-04 01:21:30 +08:00
// Initialization hook; this definition is only compiled when MNN_USE_SSE is not defined
// and has no work to perform.
void MNNFunctionInit() {
    // Intentionally empty.
}
2020-04-10 14:44:01 +08:00
# endif
2019-04-17 10:49:11 +08:00
# ifdef MNN_USE_NEON
# include <arm_neon.h>
# endif
2020-03-22 20:16:29 +08:00
2019-04-17 10:49:11 +08:00
# define UNIT 4
2020-11-05 16:41:56 +08:00
using Vec4 = MNN : : Math : : Vec < float , 4 > ;
2019-04-17 10:49:11 +08:00
# ifndef MNN_USE_NEON
# ifndef MNN_USE_SSE
// Copies `count` groups of 4 consecutive floats.
// Group n is read from source + n * srcStride and written to dest + n * dstStride
// (strides are expressed in floats).
void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
    for (int n = 0; n < count; ++n) {
        const float* from = source + n * srcStride;
        float* to         = dest + n * dstStride;
        to[0] = from[0];
        to[1] = from[1];
        to[2] = from[2];
        to[3] = from[3];
    }
}
// Accumulates `count` groups of 4 consecutive floats into the destination.
// Group n is read from source + n * srcStride and added onto dest + n * dstStride
// (strides are expressed in floats).
void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
    for (int n = 0; n < count; ++n) {
        const float* from = source + n * srcStride;
        float* to         = dest + n * dstStride;
        to[0] += from[0];
        to[1] += from[1];
        to[2] += from[2];
        to[3] += from[3];
    }
}
2020-02-26 09:57:17 +08:00
// Leaky-ReLU with one slope per channel lane.
// The data is processed as depthQuad slices of sizeQuad groups of 4 floats; slice z uses the
// 4 slopes at slope + 4 * z. Negative inputs are multiplied by the lane's slope, all other
// values are copied unchanged.
void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) {
    for (int z = 0; z < depthQuad; ++z) {
        const float* laneSlope = slope + 4 * z;
        const float* in        = src + 4 * z * sizeQuad;
        float* out             = dst + 4 * z * sizeQuad;
        for (int q = 0; q < sizeQuad; ++q) {
            for (int lane = 0; lane < 4; ++lane) {
                const float value = in[4 * q + lane];
                out[4 * q + lane] = (value < 0) ? value * laneSlope[lane] : value;
            }
        }
    }
}
2021-06-11 17:17:13 +08:00
2020-11-05 16:41:56 +08:00
// Interleaves planar channel data into groups of 4 channels.
// src holds `depth` planes of `area` floats each. For every complete group of 4 planes the
// output stores, per position, the 4 channel values back to back. A trailing partial group
// (depth % 4 channels) is written the same way with the missing lanes set to 0.
void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
    const int fullGroups   = depth / 4;
    const int tailChannels = depth - fullGroups * 4;
    const float* group     = src;
    float* out             = dst;
    for (int g = 0; g < fullGroups; ++g) {
        for (int pos = 0; pos < area; ++pos) {
            for (int ch = 0; ch < 4; ++ch) {
                *out++ = group[area * ch + pos];
            }
        }
        group += area * 4;
    }
    if (tailChannels > 0) {
        for (int pos = 0; pos < area; ++pos) {
            int ch = 0;
            for (; ch < tailChannels; ++ch) {
                *out++ = group[area * ch + pos];
            }
            // Pad the unused lanes of the last group.
            for (; ch < 4; ++ch) {
                *out++ = 0;
            }
        }
    }
}
// De-interleaves data stored in groups of 4 channels back into planar channel order.
// For each group, lane `ch` of every position is gathered into a contiguous plane of `area`
// floats. Only the first depth % 4 lanes of a trailing partial group are emitted.
void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
    const int fullGroups   = depth / 4;
    const int tailChannels = depth - fullGroups * 4;
    const float* group     = src;
    float* out             = dst;
    for (int g = 0; g < fullGroups; ++g) {
        for (int ch = 0; ch < 4; ++ch) {
            for (int pos = 0; pos < area; ++pos) {
                *out++ = group[4 * pos + ch];
            }
        }
        group += area * 4;
    }
    if (tailChannels > 0) {
        for (int ch = 0; ch < tailChannels; ++ch) {
            for (int pos = 0; pos < area; ++pos) {
                *out++ = group[4 * pos + ch];
            }
        }
    }
}
// Approximates dest[i] = exp(-source[i]) for countC8 * 8 elements.
// parameters[0] is the step used for range reduction and parameters[1] its reciprocal
// (MNNExp passes ln2 and 1/ln2); parameters[2..7] are the polynomial coefficients,
// lowest order first.
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
    auto total      = countC8 * 8;
    auto step       = parameters[0];
    float clampEdge = 87;
    for (int idx = 0; idx < total; ++idx) {
        // Negate and clamp the argument to [-87, 87].
        auto x = -source[idx];
        x      = ALIMAX(x, -clampEdge);
        x      = ALIMIN(x, clampEdge);
        // Split x into an integer multiple of `step` plus a remainder.
        int exponent     = (x * parameters[1]);
        int exponentBits = (exponent + 127) << 23;
        auto t           = x - exponent * step;
        // Reinterpret the biased exponent as a float, i.e. 2^exponent.
        float powerOfTwo = *(float*)(&exponentBits);
        // Horner evaluation of the degree-5 polynomial in t.
        auto series = parameters[7];
        for (int k = 6; k >= 2; --k) {
            series = series * t + parameters[k];
        }
        dest[idx] = powerOfTwo * series;
    }
}
2021-04-08 15:34:23 +08:00
// ReLU on int8 data: negative values become 0, everything else is copied unchanged.
void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) {
    for (int idx = 0; idx < size; ++idx) {
        const int8_t value = src[idx];
        dst[idx]           = (value < 0) ? 0 : value;
    }
}
2020-03-22 20:16:29 +08:00
# endif // no MNN_USE_SSE
2019-04-17 10:49:11 +08:00
// Running per-lane maximum.
// input is read as inputCountUnit blocks of UNIT * 2 floats; within a block, lane j owns the
// two consecutive values at offset j * 2. maxBuffer[j] is updated in place, so the caller must
// initialize it beforehand.
void MNNMaxFloat(float* input, float* maxBuffer, int32_t inputCountUnit) {
    for (int unit = 0; unit < inputCountUnit; ++unit) {
        const float* block = input + unit * UNIT * 2;
        for (int lane = 0; lane < UNIT; ++lane) {
            const float* pair = block + lane * 2;
            maxBuffer[lane]   = std::max(pair[0], maxBuffer[lane]);
            maxBuffer[lane]   = std::max(pair[1], maxBuffer[lane]);
        }
    }
}
// Running per-lane minimum.
// input is read as inputCountUnit blocks of UNIT * 2 floats; within a block, lane j owns the
// two consecutive values at offset j * 2. minBuffer[j] is updated in place, so the caller must
// initialize it beforehand.
void MNNMinFloat(float* input, float* minBuffer, int32_t inputCountUnit) {
    for (int unit = 0; unit < inputCountUnit; ++unit) {
        const float* block = input + unit * UNIT * 2;
        for (int lane = 0; lane < UNIT; ++lane) {
            const float* pair = block + lane * 2;
            minBuffer[lane]   = std::min(pair[0], minBuffer[lane]);
            minBuffer[lane]   = std::min(pair[1], minBuffer[lane]);
        }
    }
}
void MNNScaleAndAddBias ( float * dst , const float * src , const float * bias , const float * alpha , size_t planeNumber ,
size_t biasNumber ) {
for ( int z = 0 ; z < biasNumber ; + + z ) {
float * dstZ = dst + planeNumber * 4 * z ;
const float * srcZ = src + planeNumber * 4 * z ;
2020-01-15 13:33:47 +08:00
auto biasZ = Vec4 : : load ( bias + 4 * z ) ;
auto alphaZ = Vec4 : : load ( alpha + 4 * z ) ;
2019-04-17 10:49:11 +08:00
for ( int p = 0 ; p < planeNumber ; + + p ) {
float * dstX = dstZ + 4 * p ;
const float * srcX = srcZ + 4 * p ;
2020-01-15 13:33:47 +08:00
Vec4 : : save ( dstX , ( Vec4 : : load ( srcX ) * alphaZ ) + biasZ ) ;
2019-04-17 10:49:11 +08:00
}
}
}
2019-07-11 13:56:52 +08:00
2019-04-17 10:49:11 +08:00
// Widens uint8 values to int16 while subtracting zeroPoint.
// Converts sizeQuad groups of 4 values. dstStride and srcStride are given in bytes and are
// converted to element steps before use.
void MNNUInt8ToInt16WithOffsetC4Common(int16_t* dst, const uint8_t* src, size_t zeroPoint, size_t sizeQuad,
                                       size_t dstStride, size_t srcStride) {
    const size_t dstStep = dstStride / sizeof(int16_t);
    const size_t srcStep = srcStride / sizeof(uint8_t);
    const int32_t offset = (int32_t)zeroPoint;
    for (int q = 0; q < sizeQuad; ++q) {
        int16_t* out      = dst + dstStep * q;
        const uint8_t* in = src + srcStep * q;
        for (int lane = 0; lane < 4; ++lane) {
            out[lane] = (int16_t)((int32_t)in[lane] - offset);
        }
    }
}
// Runs MNNUInt8ToInt16WithOffsetC4Common over depthQuad slices.
// dstZStep and srcZStep are the byte distances between consecutive slices; within a slice the
// 4-element groups are treated as tightly packed.
void MNNUInt8ToInt16WithOffsetC4Fast(int16_t* colAddr, const uint8_t* srcStart, size_t zeroPoint, size_t sizeQuad,
                                     size_t depthQuad, size_t dstZStep, size_t srcZStep) {
    const size_t dstStep = dstZStep / sizeof(int16_t);
    const size_t srcStep = srcZStep / sizeof(uint8_t);
    int16_t* out         = colAddr;
    const uint8_t* in    = srcStart;
    for (int slice = 0; slice < depthQuad; ++slice) {
        MNNUInt8ToInt16WithOffsetC4Common(out, in, zeroPoint, sizeQuad, 4 * sizeof(int16_t), 4 * sizeof(uint8_t));
        out += dstStep;
        in += srcStep;
    }
}
2019-05-24 11:26:54 +08:00
// Element-wise power approximation over countC8 * 8 values.
// For each input x the result is built from three factors:
//   1. (1 / x) multiplied betaInt times,
//   2. powfParam[6] once for every division of x by 1.5 needed to bring it below 1.25,
//   3. a degree-5 polynomial in (reduced x - 1) with coefficients powfParam[0..5].
void MNNPowC8(float* dest, const float* source, const float* powfParam, size_t betaInt, size_t countC8) {
    const int total        = countC8 * 8;
    const float stepFactor = powfParam[6];
    for (int idx = 0; idx < total; ++idx) {
        float acc     = 1;
        float inverse = 1 / source[idx];
        for (int k = 0; k < betaInt; ++k) {
            acc *= inverse;
        }
        // Range reduction: shrink x by 1.5 per step and compensate in the accumulator.
        float reduced = source[idx];
        while (reduced >= 1.25) {
            reduced /= 1.5;
            acc *= stepFactor;
        }
        const float t = reduced - 1;
        const float poly =
            powfParam[0] +
            t * (powfParam[1] + t * (powfParam[2] + t * (powfParam[3] + t * (powfParam[4] + t * powfParam[5]))));
        acc *= poly;
        dest[idx] = acc;
    }
}
2020-05-18 11:07:41 +08:00
2020-03-22 20:16:29 +08:00
# endif // no MNN_USE_NEON
2021-06-23 14:10:31 +08:00
// Converts grid values into input-space sampling coordinates.
// src holds outH rows (row pitch `stride` floats) of outW (x, y) pairs; dst receives
// outH * outW (x, y) pairs, tightly packed. Each component is mapped as
//     ((1 + v) * (size - a) - b) * 0.5
// with (a, b) = (1, 0) when alignCorners is set and (0, 1) otherwise; x uses inW, y uses inH.
void MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, size_t inW, size_t outH, size_t outW, size_t stride, bool alignCorners) {
    float shrink;
    float shift;
    if (alignCorners) {
        shrink = 1.0f;
        shift  = 0.0f;
    } else {
        shrink = 0.0f;
        shift  = 1.0f;
    }
    for (auto row = 0; row < outH; ++row) {
        const float* gridRow = src + row * stride;
        float* cordRow       = dst + row * outW * 2;
        for (auto col = 0; col < outW; ++col) {
            const float gx       = gridRow[2 * col + 0];
            const float gy       = gridRow[2 * col + 1];
            cordRow[2 * col + 0] = ((1 + gx) * (inW - shrink) - shift) * 0.5f;
            cordRow[2 * col + 1] = ((1 + gy) * (inH - shrink) - shift) * 0.5f;
        }
    }
}
// Loads the 4-float sample at (h, w) from a height x width buffer of 4-float groups.
// Out-of-range coordinates yield zeros when padMode is true (zero padding); otherwise they are
// clamped to the nearest valid position. Clamping also serves the reflection mode, because the
// remaining out-of-range cases after reflection degrade to border behavior.
Vec4 MNNGridSampleLoadSample(int h, int w, const float* buffer, int height, int width, bool padMode) {
    const bool inside = (h >= 0) && (h < height) && (w >= 0) && (w < width);
    if (!inside) {
        if (padMode) {
            return 0.0f;
        }
        if (h < 0) {
            h = 0;
        } else if (h > (height - 1)) {
            h = height - 1;
        }
        if (w < 0) {
            w = 0;
        } else if (w > (width - 1)) {
            w = width - 1;
        }
    }
    return Vec4::load(buffer + h * width * 4 + w * 4);
}
void MNNGridSampleInterp ( float * outputPtr , const float * inputPtr , const float * cordPtr , size_t inH , size_t inW , size_t outW , bool sampleMode , bool padMode ) {
for ( auto ow = 0 ; ow < outW ; + + ow ) {
auto w = cordPtr [ 2 * ow + 0 ] ;
auto h = cordPtr [ 2 * ow + 1 ] ;
Vec4 interp ;
if ( sampleMode = = true ) { //sampleMode == SampleMode_NEAREST
int nh = : : floor ( h + 0.5f ) ;
int nw = : : floor ( w + 0.5f ) ;
interp = MNNGridSampleLoadSample ( nh , nw , inputPtr , inH , inW , padMode ) ;
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_h = : : floor ( h ) ;
int w0_w = : : floor ( w ) ;
int w1_h = : : ceil ( h ) ;
int w1_w = : : ceil ( w ) ;
auto oneV = Vec4 ( 1.0f ) ;
Vec4 i00 = MNNGridSampleLoadSample ( w0_h , w0_w , inputPtr , inH , inW , padMode ) ;
Vec4 i01 = MNNGridSampleLoadSample ( w0_h , w1_w , inputPtr , inH , inW , padMode ) ;
Vec4 i10 = MNNGridSampleLoadSample ( w1_h , w0_w , inputPtr , inH , inW , padMode ) ;
Vec4 i11 = MNNGridSampleLoadSample ( w1_h , w1_w , inputPtr , inH , inW , padMode ) ;
auto f0 = Vec4 ( ( float ) w1_w - w ) ;
auto f1 = oneV - f0 ;
auto h0 = Vec4 ( ( float ) w1_h - h ) ;
auto h1 = oneV - h0 ;
Vec4 i0 = i00 * f0 + i01 * f1 ;
Vec4 i1 = i10 * f0 + i11 * f1 ;
interp = i0 * h0 + i1 * h1 ;
}
Vec4 : : save ( outputPtr + 4 * ow , interp ) ;
}
}
2019-04-17 10:49:11 +08:00
2019-07-11 13:56:52 +08:00
// Interleaves planar uint8 channel data into groups of 4 channels.
// The destination (area * UP_DIV(depth, 4) * 4 bytes) is zeroed first, so lanes of a trailing
// partial group stay 0. Channel z is then written to lane z % 4 of group z / 4.
void MNNPackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
    memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(uint8_t));
    int readIndex = 0;
    for (int channel = 0; channel < depth; ++channel) {
        const int group = channel / 4;
        const int lane  = channel % 4;
        uint8_t* target = group * area * 4 + dst;
        for (int pos = 0; pos < area; ++pos) {
            target[4 * pos + lane] = src[readIndex++];
        }
    }
}
// De-interleaves uint8 data stored in groups of 4 channels back into planar channel order.
// Channel z is read from lane z % 4 of group z / 4 and written as a contiguous run of `area`
// bytes.
void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
    int writeIndex = 0;
    for (int channel = 0; channel < depth; ++channel) {
        const int group       = channel / 4;
        const int lane        = channel % 4;
        const uint8_t* origin = group * area * 4 + src;
        for (int pos = 0; pos < area; ++pos) {
            dst[writeIndex++] = origin[4 * pos + lane];
        }
    }
}
2020-05-18 06:53:03 +08:00
// Converts channel-interleaved uint8 data (`depth` bytes per position) into groups of 4
// channels: for channel group g and position p, 4 bytes are written at dst + (g * area + p) * 4.
// Lanes of a trailing partial group are zero-filled.
//   dst   : receives area * 4 bytes per channel group (partial groups included)
//   src   : area positions with depth consecutive bytes each
//   area  : number of positions
//   depth : number of channels per position
void MNNUnpackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
    if (depth == 4) {
        // Both layouts coincide when there is exactly one full group.
        ::memcpy(dst, src, area * depth * sizeof(uint8_t));
        return;
    }
#ifdef MNN_USE_NEON
    if (depth == 3) {
        uint8x16x4_t rgba;
        rgba.val[3] = vdupq_n_u8(0);
        int sta     = 0;
        int staC16  = (int)area / 16;
        for (int i = 0; i < staC16; sta += 16, ++i) {
            auto rgb    = vld3q_u8(src + sta * 3);
            rgba.val[0] = rgb.val[0];
            rgba.val[1] = rgb.val[1];
            rgba.val[2] = rgb.val[2];
            vst4q_u8(dst + 4 * sta, rgba);
        }
        sta = staC16 * 16;

        for (; sta < area; ++sta) {
            auto s = src + sta * 3;
            auto d = dst + sta * 4;
            d[0]   = s[0];
            d[1]   = s[1];
            d[2]   = s[2];
            d[3]   = 0;
        }

        return;
    }
    if (depth == 1) {
        uint8x16x4_t rgba;
        rgba.val[1] = vdupq_n_u8(0);
        rgba.val[2] = vdupq_n_u8(0);
        rgba.val[3] = vdupq_n_u8(0);
        int sta     = 0;
        // Only whole 16-position chunks may use the vector path; bounding the loop by
        // `sta < area` would read and write past the buffers when area is not a multiple of 16
        // and would leave the scalar tail below unreachable.
        int staC16 = (int)area / 16;
        for (int i = 0; i < staC16; sta += 16, ++i) {
            rgba.val[0] = vld1q_u8(src + sta);
            vst4q_u8(dst + 4 * sta, rgba);
        }

        for (; sta < area; ++sta) {
            auto s = src + sta;
            auto d = dst + sta * 4;
            d[0]   = s[0];
            d[1]   = 0;
            d[2]   = 0;
            d[3]   = 0;
        }

        return;
    }
#endif
    int c      = (int)depth;
    int cDiv4  = c / 4;
    int cAlign = cDiv4 * 4;
    // Complete groups of 4 channels.
    for (int hi = 0; hi < area; ++hi) {
        auto srcHeight = (src + hi * c);
        auto dstHeight = (dst + hi * 4);
        for (int ci = 0; ci < cDiv4; ++ci) {
            for (int i = 0; i < 4; ++i) {
                dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i];
            }
        }
    }

    if (cAlign == c) {
        return;
    }

    // Trailing partial group: zero the 4 lanes, then copy the remaining channels.
    int cReamin   = c - cAlign;
    auto srcAlign = src + cAlign;
    auto dstAlign = dst + area * cAlign;

    for (int hi = 0; hi < area; ++hi) {
        auto srcHeight = srcAlign + hi * c;
        auto dstHeight = dstAlign + hi * 4;
        for (int i = 0; i < 4; ++i) {
            dstHeight[i] = 0;
        }
        for (int ci = 0; ci < cReamin; ++ci) {
            dstHeight[ci] = srcHeight[ci];
        }
    }
}
2020-05-18 06:53:03 +08:00
void MNNUnpackTranspose ( float * dst , const float * src , size_t area , size_t depth ) {
2019-04-17 10:49:11 +08:00
# ifdef MNN_USE_NEON
if ( 1 = = depth ) {
auto zeroValue = vmovq_n_f32 ( 0.0f ) ;
int areaC4 = ( int ) area / 4 ;
int remain = areaC4 * 4 ;
for ( int i = 0 ; i < areaC4 ; + + i ) {
auto srcCur = src + 4 * i ;
auto dstCur = dst + 16 * i ;
auto srcValue = vld1q_f32 ( srcCur ) ;
float32x4x4_t dstValue ;
dstValue . val [ 0 ] = srcValue ;
dstValue . val [ 1 ] = zeroValue ;
dstValue . val [ 2 ] = zeroValue ;
dstValue . val [ 3 ] = zeroValue ;
vst4q_f32 ( dstCur , dstValue ) ;
}
for ( int i = remain ; i < area ; + + i ) {
dst [ 4 * i + 0 ] = src [ i ] ;
dst [ 4 * i + 1 ] = 0.0f ;
dst [ 4 * i + 2 ] = 0.0f ;
dst [ 4 * i + 3 ] = 0.0f ;
}
return ;
}
if ( 3 = = depth ) {
auto zeroValue = vmovq_n_f32 ( 0.0f ) ;
int areaC4 = ( int ) area / 4 ;
int remain = areaC4 * 4 ;
for ( int i = 0 ; i < areaC4 ; + + i ) {
auto srcCur = src + 12 * i ;
auto dstCur = dst + 16 * i ;
auto srcValue = vld3q_f32 ( srcCur ) ;
float32x4x4_t dstValue ;
dstValue . val [ 0 ] = srcValue . val [ 0 ] ;
dstValue . val [ 1 ] = srcValue . val [ 1 ] ;
dstValue . val [ 2 ] = srcValue . val [ 2 ] ;
dstValue . val [ 3 ] = zeroValue ;
vst4q_f32 ( dstCur , dstValue ) ;
}
for ( int i = remain ; i < area ; + + i ) {
dst [ 4 * i + 0 ] = src [ 3 * i + 0 ] ;
dst [ 4 * i + 1 ] = src [ 3 * i + 1 ] ;
dst [ 4 * i + 2 ] = src [ 3 * i + 2 ] ;
dst [ 4 * i + 3 ] = 0.0f ;
}
return ;
}
# endif
int c = ( int ) depth ;
int cDiv4 = c / 4 ;
int cAlign = cDiv4 * 4 ;
for ( int hi = 0 ; hi < area ; + + hi ) {
const float * srcHeight = src + hi * c ;
float * dstHeight = dst + hi * 4 ;
for ( int ci = 0 ; ci < cDiv4 ; + + ci ) {
2020-05-17 23:25:21 +08:00
Vec4 : : save ( dstHeight + 4 * ci * area , Vec4 : : load ( srcHeight + 4 * ci ) ) ;
2019-04-17 10:49:11 +08:00
}
}
if ( cAlign = = c ) {
return ;
}
int cReamin = c - cAlign ;
auto srcAlign = src + cAlign ;
auto dstAlign = dst + area * cAlign ;
# ifdef MNN_USE_NEON
auto zeroVector = vdupq_n_f32 ( 0.0f ) ;
# endif
for ( int hi = 0 ; hi < area ; + + hi ) {
const float * srcHeight = srcAlign + hi * c ;
float * dstHeight = dstAlign + hi * 4 ;
# ifdef MNN_USE_NEON
vst1q_f32 ( dstHeight , zeroVector ) ;
# else
for ( int i = 0 ; i < 4 ; + + i ) {
dstHeight [ i ] = 0 ;
}
# endif
for ( int ci = 0 ; ci < cReamin ; + + ci ) {
dstHeight [ ci ] = srcHeight [ ci ] ;
}
}
}
2020-05-18 06:53:03 +08:00
// Converts uint8 data stored in groups of 4 channels (for channel group g and position p the
// 4 bytes live at src + (g * area + p) * 4) into channel-interleaved order with `depth`
// consecutive bytes per position.
//   dst   : receives area * depth bytes
//   src   : grouped input; lanes beyond `depth` in a trailing partial group are ignored
//   area  : number of positions
//   depth : number of channels per position
// src and dst must not overlap.
void MNNPackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
    if (1 == area) {
        // With a single position the grouped layout already is the interleaved layout.
        ::memcpy(dst, src, depth * sizeof(uint8_t));
        return;
    }
    int c      = (int)depth;
    int cDiv4  = c / 4;
    int cAlign = cDiv4 * 4;
    // Complete groups: move 4 bytes at a time. memcpy is used instead of reinterpreting the
    // byte buffers as int32_t so that unaligned buffers are handled without undefined behavior.
    for (int hi = 0; hi < area; ++hi) {
        auto srcHeight = src + hi * 4;
        auto dstHeight = dst + hi * c;
        for (int ci = 0; ci < cDiv4; ++ci) {
            ::memcpy(dstHeight + ci * 4, srcHeight + 4 * ci * area, 4 * sizeof(uint8_t));
        }
    }
    if (cAlign == c) {
        return;
    }

    // Trailing partial group: copy only the channels that exist.
    int cReamin   = c - cAlign;
    auto srcAlign = src + area * cAlign;
    auto dstAlign = dst + cAlign;

    for (int hi = 0; hi < area; ++hi) {
        auto srcHeight = srcAlign + hi * 4;
        auto dstHeight = dstAlign + hi * c;

        for (int ci = 0; ci < cReamin; ++ci) {
            dstHeight[ci] = srcHeight[ci];
        }
    }
}
2020-05-18 06:53:03 +08:00
void MNNPackTranspose ( float * dst , const float * src , size_t area , size_t depth ) {
2019-04-17 10:49:11 +08:00
int c = ( int ) depth ;
int cDiv4 = c / 4 ;
int cAlign = cDiv4 * 4 ;
for ( int hi = 0 ; hi < area ; + + hi ) {
const float * srcHeight = src + hi * 4 ;
float * dstHeight = dst + hi * c ;
for ( int ci = 0 ; ci < cDiv4 ; + + ci ) {
2020-05-17 23:25:21 +08:00
Vec4 : : save ( dstHeight + 4 * ci , Vec4 : : load ( srcHeight + 4 * ci * area ) ) ;
2019-04-17 10:49:11 +08:00
}
}
if ( cAlign = = c ) {
return ;
}
int cReamin = c - cAlign ;
auto srcAlign = src + area * cAlign ;
auto dstAlign = dst + cAlign ;
for ( int hi = 0 ; hi < area ; + + hi ) {
const float * srcHeight = srcAlign + hi * 4 ;
float * dstHeight = dstAlign + hi * c ;
for ( int ci = 0 ; ci < cReamin ; + + ci ) {
dstHeight [ ci ] = srcHeight [ ci ] ;
}
}
}
2019-06-17 20:10:35 +08:00
void MNNExp ( float * dst , const float * src , size_t dataSize ) {
2019-06-24 11:32:41 +08:00
int countC8 = ( int ) dataSize / 8 ;
2019-06-17 20:10:35 +08:00
if ( countC8 > 0 ) {
// Align to eight so asm is easier to write
static float parameters [ ] = {
( float ) log ( 2.0f ) , 1.0f / ( float ) log ( 2.0f ) , 1.0f , 1.0f , 0.5f , 1.0f / 6.0f , 1.0f / 24.0f , 1.0f / 120.0f } ;
MNNExpC8 ( dst , src , parameters , countC8 ) ;
}
int remain = countC8 * 8 ;
auto param = log ( 2.0f ) ;
2019-07-02 18:01:08 +08:00
float xLimit = 87 ;
2019-06-17 20:10:35 +08:00
for ( int i = remain ; i < dataSize ; i + + ) {
/*Origin Function*/
//dst[i] = expf(-src[i]);
/*Approciate Function*/
2021-04-08 15:34:23 +08:00
2019-06-17 20:10:35 +08:00
auto x = - src [ i ] ;
2019-07-02 18:01:08 +08:00
x = ALIMAX ( x , - xLimit ) ;
x = ALIMIN ( x , xLimit ) ;
2021-04-08 15:34:23 +08:00
2019-06-17 20:10:35 +08:00
int div = ( x / param ) ;
2019-07-02 18:01:08 +08:00
int div2 = ( div + 127 ) < < 23 ;
2019-06-17 20:10:35 +08:00
auto xReamin = x - div * param ;
2019-07-02 18:01:08 +08:00
float expBasic = * ( float * ) ( & div2 ) ;
2021-04-08 15:34:23 +08:00
2019-06-17 20:10:35 +08:00
auto t = xReamin ;
auto expRemain = ( ( ( ( 1.0f / 120 * t + 1.0f / 24 ) * t + 1.0f / 6 ) * t + 0.5f ) * t + 1.0f ) * t + 1.0f ;
dst [ i ] = expBasic * expRemain ;
}
}
2019-08-22 20:13:46 +08:00
// Lambert's series with 7 divisions
// reference from
// https://varietyofsound.wordpress.com/2011/02/14/efficient-tanh-computation-using-lamberts-continued-fraction/
// Rational approximation of tanh(value); inputs above 5 saturate to 1 and inputs at or
// below -5 saturate to -1.
inline float tanhf_poly(float value) {
    if (value > 5.0) {
        return 1.0;
    }
    if (value <= -5.0) {
        return -1.0;
    }
    const float sq          = value * value;
    const float numerator   = value * (135135.0f + sq * (17325.0f + sq * (378.0f + sq)));
    const float denominator = 135135.0f + sq * (62370.0f + sq * (3150.0f + sq * 28.0f));
    return numerator / denominator;
}
// Computes dst[i] = tanh(src[i]) through the identity
//     tanh(x) = (1 - e^(-2x)) / (1 + e^(-2x)).
// dst is used as scratch: it first holds 2x, then MNNExp (which evaluates exp of the negated
// input) turns it into e^(-2x) in place. This replaces an earlier per-element tanhf_poly call.
void MNNTanh(float* dst, const float* src, size_t dataSize) {
    for (int idx = 0; idx < dataSize; ++idx) {
        const float value = src[idx];
        dst[idx]          = value + value;
    }
    MNNExp(dst, dst, dataSize);
    for (int idx = 0; idx < dataSize; idx++) {
        const float expNeg2x = dst[idx];
        dst[idx]             = (1.0f - expNeg2x) / (1.0f + expNeg2x);
    }
}
2019-12-27 22:16:57 +08:00
// Leaky-ReLU with a single slope over sizeQuad groups of 4 floats; implemented by broadcasting
// the slope to all 4 lanes and running MNNReluWithSlopeChannel on one slice.
void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope) {
    float broadcast[4] = {slope, slope, slope, slope};
    MNNReluWithSlopeChannel(dst, src, broadcast, sizeQuad, 1);
}
2021-04-08 15:34:23 +08:00
2020-02-26 09:57:17 +08:00
// Leaky-ReLU with a single slope over `size` floats.
// Whole groups of 4 go through MNNReluWithSlope; the remaining (size % 4) elements are handled
// one by one.
void MNNReluWithSlopeCommon(float* dst, const float* src, size_t size, float slope) {
    int quads     = size / 4;
    int tailStart = 0;
    if (quads > 0) {
        MNNReluWithSlope(dst, src, quads, slope);
        tailStart = quads * 4;
    }
    for (int idx = tailStart; idx < size; idx++) {
        const float value = src[idx];
        dst[idx]          = (value < 0) ? value * slope : value;
    }
}
2020-01-15 13:33:47 +08:00
2021-04-08 15:34:23 +08:00
// Hard-swish over `size` floats: 0 for x <= -3, x for x >= 3, x * (x + 3) / 6 in between.
// With SSE or NEON, whole groups of 4 are vectorized (SSE via MNNHardSwish); everything left
// over is computed by the scalar loop at the end.
void MNNHardSwishCommon(float* dst, const float* src, size_t size) {
    int sizeQuad = size / 4;
    int start    = 0;
#ifdef MNN_USE_SSE
    if (sizeQuad > 0) {
        MNNHardSwish(dst, src, sizeQuad);
        start = sizeQuad * 4;
    }
#endif
#ifdef MNN_USE_NEON
    float32x4_t zero   = vdupq_n_f32(0.f);
    float32x4_t three  = vdupq_n_f32(3.f);
    float32x4_t six    = vdupq_n_f32(6.f);
    float32x4_t divsix = vdupq_n_f32(1.0f / 6.f);
    for (int q = 0; q < sizeQuad; q++) {
        auto x = vld1q_f32(src + 4 * q);
        // gate = clamp(x + 3, 0, 6)
        auto gate = vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six);
        auto y    = vmulq_f32(vmulq_f32(x, gate), divsix);
        vst1q_f32(dst + 4 * q, y);
    }
    start = sizeQuad * 4;
#endif
    for (int idx = start; idx < size; idx++) {
        const float x = src[idx];
        float y;
        if (x <= -3) {
            y = 0;
        } else if (x >= 3) {
            y = x;
        } else {
            y = x * (x + 3) / 6.f;
        }
        dst[idx] = y;
    }
}
2021-06-11 17:17:13 +08:00
// GELU (tanh formulation) over `size` floats:
//     dst = 0.5 * x * (1 + tanh(0.79788458 * (x + 0.044715 * x^3)))
// With SSE, whole blocks of 8 go through MNNGelu; the rest uses a rational tanh approximation
// that saturates to +/-1 outside (-5, 5].
void MNNGeluCommon(float* dst, const float* src, size_t size) {
    int sizeQuad = size / 8;
    int start    = 0;
#ifdef MNN_USE_SSE
    if (sizeQuad > 0) {
        MNNGelu(dst, src, sizeQuad);
        start = sizeQuad * 8;
    }
#endif
    auto approxTanh = [](float value) -> float {
        if (value > 5.0f) {
            return 1.0f;
        }
        if (value <= -5.0f) {
            return -1.0f;
        }
        float sq          = value * value;
        float numerator   = value * (135135.0f + sq * (17325.0f + sq * (378.0f + sq)));
        float denominator = 135135.0f + sq * (62370.0f + sq * (3150.0f + sq * 28.0f));
        return numerator / denominator;
    };
    for (int idx = start; idx < size; idx++) {
        const float x = src[idx];
        float inner   = 0.044715f * x * x * x;
        inner         = 0.79788458f * (inner + x);
        dst[idx]      = (1.0f + approxTanh(inner)) * x * 0.5f;
    }
}
2020-01-15 13:33:47 +08:00
// Computes dst[i] = src[i] * alpha + bias for `number` floats.
// Whole groups of 4 are handed to MNNScaleAndAddBias with alpha and bias broadcast to 4 lanes;
// the remaining (number % 4) elements are computed one by one.
void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) {
    int quads     = (int)number / 4;
    int tailStart = 0;
    if (quads > 0) {
        float biasC4[4];
        float alphaC4[4];
        for (int lane = 0; lane < 4; ++lane) {
            biasC4[lane]  = bias;
            alphaC4[lane] = alpha;
        }
        MNNScaleAndAddBias(dst, src, biasC4, alphaC4, quads, 1);
        tailStart = quads * 4;
    }
    for (int idx = tailStart; idx < number; ++idx) {
        dst[idx] = src[idx] * alpha + bias;
    }
}
2020-07-04 01:21:30 +08:00
void MNNAxByClamp ( float * C , const float * A , const float * B , size_t width , size_t cStride , size_t aStride , size_t bStride , size_t height , const float * parameters ) {
int widthC4 = ( int ) width / 4 ;
if ( widthC4 > 0 ) {
auto minF = Vec4 ( parameters [ 2 ] ) ;
auto maxF = Vec4 ( parameters [ 3 ] ) ;
auto alpha = Vec4 ( parameters [ 0 ] ) ;
auto beta = Vec4 ( parameters [ 1 ] ) ;
for ( int y = 0 ; y < height ; + + y ) {
auto a = A + aStride * y ;
auto b = B + bStride * y ;
auto c = C + cStride * y ;
for ( int x = 0 ; x < width ; + + x ) {
auto av = Vec4 : : load ( a + 4 * x ) ;
auto bv = Vec4 : : load ( b + 4 * x ) ;
auto cv = av * alpha + bv * beta ;
cv = Vec4 : : min ( cv , maxF ) ;
cv = Vec4 : : max ( cv , minF ) ;
Vec4 : : save ( c + 4 * x , cv ) ;
}
}
width = width - 4 * widthC4 ;
C = C + widthC4 * 4 ;
A = A + widthC4 * 4 ;
B = B + widthC4 * 4 ;
}
if ( width > 0 ) {
auto minF = parameters [ 2 ] ;
auto maxF = parameters [ 3 ] ;
auto alpha = parameters [ 0 ] ;
auto beta = parameters [ 1 ] ;
for ( int y = 0 ; y < height ; + + y ) {
auto a = A + aStride * y ;
auto b = B + bStride * y ;
auto c = C + cStride * y ;
for ( int x = 0 ; x < width ; + + x ) {
auto av = a [ x ] ;
auto bv = b [ x ] ;
auto cv = av * alpha + bv * beta ;
cv = std : : min ( cv , maxF ) ;
cv = std : : max ( cv , minF ) ;
c [ x ] = cv ;
}
}
}
}
# ifndef MNN_USE_NEON
2021-04-08 15:34:23 +08:00
void MNNAxByClampBroadcastUnit ( float * C , const float * A , const float * B , size_t width , size_t cStride , size_t aStride , size_t height , const float * parameters ) {
2020-07-04 01:21:30 +08:00
auto minF = Vec4 ( parameters [ 2 ] ) ;
auto maxF = Vec4 ( parameters [ 3 ] ) ;
auto beta = Vec4 ( parameters [ 1 ] ) ;
for ( int y = 0 ; y < height ; + + y ) {
auto a = A + aStride * y ;
auto b = B + 4 * y ;
auto bv = Vec4 : : load ( b ) ;
auto c = C + cStride * y ;
for ( int x = 0 ; x < width ; + + x ) {
auto av = Vec4 : : load ( a + 4 * x ) ;
auto cv = av + bv * beta ;
cv = Vec4 : : min ( cv , maxF ) ;
cv = Vec4 : : max ( cv , minF ) ;
Vec4 : : save ( c + 4 * x , cv ) ;
}
}
}
2020-12-15 14:12:35 +08:00
// Finds the largest float among inputCountUnit * UNIT consecutive values.
// Writes the value to maxValue[0] and its flat index to maxIndex[0]; on ties the
// earliest position is kept because only a strictly greater value replaces it.
void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
    float best  = input[0];
    int bestPos = 0;
    for (int unitIdx = 0; unitIdx < inputCountUnit; unitIdx++) {
        const int base     = unitIdx * UNIT;
        const float* group = input + base;
        for (int lane = 0; lane < UNIT; lane++) {
            const float candidate = group[lane];
            if (candidate > best) {
                best    = candidate;
                bestPos = base + lane;
            }
        }
    }
    *maxValue = best;
    *maxIndex = bestPos;
}
// Integer counterpart of the float top-1 search: scans inputCountUnit * UNIT
// consecutive int32 values, stores the largest in maxValue[0] and its flat index
// in maxIndex[0]. The first occurrence wins on ties (strict '>' comparison).
void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
    int32_t best = input[0];
    int bestPos  = 0;
    for (int unitIdx = 0; unitIdx < inputCountUnit; unitIdx++) {
        const int base       = unitIdx * UNIT;
        const int32_t* group = input + base;
        for (int lane = 0; lane < UNIT; lane++) {
            const int32_t candidate = group[lane];
            if (candidate > best) {
                best    = candidate;
                bestPos = base + lane;
            }
        }
    }
    *maxValue = best;
    *maxIndex = bestPos;
}
2020-07-04 01:21:30 +08:00
# endif
2021-01-06 16:29:37 +08:00
void MNNComputeMatMulForE_1 ( const float * A , const float * B , float * C , const float * biasPtr , const MatMulParam * param , size_t tId ) {
auto l = param - > l ;
auto h = param - > h ;
auto numberThread = param - > numberThread ;
auto lC4 = l / 4 ;
auto lR = lC4 * 4 ;
if ( param - > BTranspose ) {
for ( int y = tId ; y < h ; y + = numberThread ) {
Vec4 sumValue = Vec4 ( 0.0f ) ;
auto by = B + y * l ;
for ( int x = 0 ; x < lC4 ; + + x ) {
sumValue = sumValue + Vec4 : : load ( A + x * 4 ) * Vec4 : : load ( by + x * 4 ) ;
}
float sumRemain = 0.0f ;
for ( int x = lR ; x < l ; + + x ) {
sumRemain = sumRemain + A [ x ] * by [ x ] ;
}
if ( nullptr ! = biasPtr ) {
sumRemain + = biasPtr [ y ] ;
}
C [ y ] = sumRemain + sumValue [ 0 ] + sumValue [ 1 ] + sumValue [ 2 ] + sumValue [ 3 ] ;
}
} else {
auto hC4 = h / 4 ;
auto hR = hC4 * 4 ;
for ( int y = tId ; y < hC4 ; y + = numberThread ) {
auto bs = B + 4 * y ;
Vec4 sumValue = Vec4 ( 0.0f ) ;
if ( biasPtr ! = nullptr ) {
sumValue = Vec4 : : load ( biasPtr + 4 * y ) ;
}
auto srcY = A + y * l ;
for ( int x = 0 ; x < l ; + + x ) {
sumValue = sumValue + Vec4 ( A [ x ] ) * Vec4 : : load ( bs + h * x ) ;
}
Vec4 : : save ( C + 4 * y , sumValue ) ;
}
2021-02-07 10:45:07 +08:00
for ( int y = hR + tId ; y < h ; y + = numberThread ) {
2021-01-06 16:29:37 +08:00
auto bs = B + y ;
float sumValue = 0.0f ;
if ( biasPtr ! = nullptr ) {
sumValue = biasPtr [ y ] ;
}
auto srcY = A + y * l ;
for ( int x = 0 ; x < l ; + + x ) {
sumValue = sumValue + A [ x ] * bs [ h * x ] ;
}
C [ y ] = sumValue ;
}
}
}
2021-06-11 17:17:13 +08:00
void MNNComputeMatMulForH_1 ( const float * A , const float * B , float * C , const float * biasPtr , const MatMulParam * param , size_t tId ) {
int e = param - > e ;
int l = param - > l ;
int numberThread = param - > numberThread ;
if ( param - > ATranspose ) {
float biasValue = 0.0f ;
if ( nullptr ! = biasPtr ) {
biasValue = * biasPtr ;
}
auto eC4 = e / 4 ;
auto eR = eC4 * 4 ;
for ( int y = tId ; y < eC4 ; y + = numberThread ) {
Vec4 sumValue = Vec4 ( biasValue ) ;
auto srcY = A + y * 4 ;
for ( int x = 0 ; x < l ; + + x ) {
sumValue = sumValue + Vec4 : : load ( srcY + x * e ) * Vec4 ( B [ x ] ) ;
}
Vec4 : : save ( C + 4 * y , sumValue ) ;
}
if ( 0 = = tId ) {
for ( int y = eR ; y < e ; + + y ) {
float sumValue = biasValue ;
auto srcY = A + y ;
for ( int x = 0 ; x < l ; + + x ) {
sumValue = sumValue + srcY [ x * e ] * B [ x ] ;
}
C [ y ] = sumValue ;
}
}
return ;
}
float biasValue = 0.0f ;
if ( nullptr ! = biasPtr ) {
biasValue = * biasPtr ;
}
auto lC4 = l / 4 ;
auto lR = lC4 * 4 ;
for ( int y = tId ; y < e ; y + = numberThread ) {
Vec4 sumValue = Vec4 ( biasValue ) ;
auto srcY = A + y * l ;
for ( int x = 0 ; x < lC4 ; + + x ) {
sumValue = sumValue + Vec4 : : load ( srcY + 4 * x ) * Vec4 : : load ( B + 4 * x ) ;
}
float sumSingle = sumValue [ 0 ] + sumValue [ 1 ] + sumValue [ 2 ] + sumValue [ 3 ] ;
for ( int x = lR ; x < l ; + + x ) {
sumSingle + = srcY [ x ] * B [ x ] ;
}
C [ y ] = sumSingle ;
}
}
2021-04-08 15:34:23 +08:00
// Repacks int16 data from planar order (src[z * area + x]) into groups of four
// channels: element (z, x) lands at dst[(z / 4) * area * 4 + 4 * x + z % 4].
// dst is zeroed first over area * UP_DIV(depth, 4) * 4 elements, so the channel
// slots that have no source data stay 0.
void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
    memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(int16_t));
    int readPos = 0;
    for (int channel = 0; channel < depth; ++channel) {
        // First slot of this channel inside its 4-channel plane.
        int16_t* lane = dst + (channel / 4) * area * 4 + channel % 4;
        for (int pixel = 0; pixel < area; ++pixel) {
            lane[4 * pixel] = src[readPos++];
        }
    }
}
// Inverse of the 4-channel packing: reads element (z, x) from
// src[(z / 4) * area * 4 + 4 * x + z % 4] and writes it in planar order,
// dst[z * area + x], for z in [0, depth) and x in [0, area).
void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
    int writePos = 0;
    for (int channel = 0; channel < depth; ++channel) {
        // First slot of this channel inside its 4-channel plane.
        const int16_t* lane = src + (channel / 4) * area * 4 + channel % 4;
        for (int pixel = 0; pixel < area; ++pixel) {
            dst[writePos++] = lane[4 * pixel];
        }
    }
}
// Converts int16 data stored channel-last (src[pos * depth + ch]) into groups of
// four channels: dst[(ch / 4) * area * 4 + pos * 4 + ch % 4].
// When depth is not a multiple of four, the unused slots of the last group are
// written as 0. depth == 4 is already in the target order and is copied as-is.
void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
    if (depth == 4) {
        ::memcpy(dst, src, area * depth * sizeof(int16_t));
        return;
    }
    const int channels        = (int)depth;
    const int fullQuads       = channels / 4;
    const int alignedChannels = fullQuads * 4;
    // Complete groups of four channels.
    for (int pos = 0; pos < area; ++pos) {
        const int16_t* srcRow = src + pos * channels;
        int16_t* dstRow       = dst + pos * 4;
        for (int quad = 0; quad < fullQuads; ++quad) {
            const int16_t* from = srcRow + 4 * quad;
            int16_t* to         = dstRow + quad * area * 4;
            for (int lane = 0; lane < 4; ++lane) {
                to[lane] = from[lane];
            }
        }
    }
    const int leftover = channels - alignedChannels;
    if (leftover == 0) {
        return;
    }
    // Partial last group: copy what exists, zero the rest.
    const int16_t* srcTail = src + alignedChannels;
    int16_t* dstTail       = dst + area * alignedChannels;
    for (int pos = 0; pos < area; ++pos) {
        const int16_t* from = srcTail + pos * channels;
        int16_t* to         = dstTail + pos * 4;
        for (int lane = 0; lane < 4; ++lane) {
            to[lane] = 0;
        }
        for (int lane = 0; lane < leftover; ++lane) {
            to[lane] = from[lane];
        }
    }
}
// Converts int16 data stored in groups of four channels
// (src[(ch / 4) * area * 4 + pos * 4 + ch % 4]) into channel-last order
// (dst[pos * depth + ch]), for pos in [0, area) and ch in [0, depth).
// With area == 1 the first `depth` elements are already in the target order and
// are copied as-is.
//
// Groups of four are moved with memcpy instead of being read and written
// through int64_t pointers: the buffers hold int16_t objects, so that pointer
// cast broke the aliasing rules and required 8-byte alignment that an int16_t
// buffer does not guarantee. The same loop serves depths that are and are not
// multiples of four.
void MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
    if (1 == area) {
        ::memcpy(dst, src, depth * sizeof(int16_t));
        return;
    }
    const int c      = (int)depth;
    const int cDiv4  = c / 4;
    const int cAlign = cDiv4 * 4;
    // Complete groups of four channels.
    for (size_t hi = 0; hi < area; ++hi) {
        const int16_t* srcHeight = src + hi * 4;
        int16_t* dstHeight       = dst + hi * c;
        for (int ci = 0; ci < cDiv4; ++ci) {
            ::memcpy(dstHeight + ci * 4, srcHeight + 4 * ci * area, 4 * sizeof(int16_t));
        }
    }
    const int cRemain = c - cAlign;
    if (cRemain == 0) {
        return;
    }
    // Partial last group: only the first cRemain slots of each unit carry data.
    const int16_t* srcAlign = src + area * cAlign;
    int16_t* dstAlign       = dst + cAlign;
    for (size_t hi = 0; hi < area; ++hi) {
        const int16_t* srcHeight = srcAlign + hi * 4;
        int16_t* dstHeight       = dstAlign + hi * c;
        for (int ci = 0; ci < cRemain; ++ci) {
            dstHeight[ci] = srcHeight[ci];
        }
    }
}
// Copies `count` units of four int16 values (8 bytes each). The float pointers
// are only used as opaque addresses; srcStride and dstStride are the distances
// between consecutive units, measured in int16 elements.
//
// Each unit is moved with memcpy through a local buffer rather than through an
// int64_t pointer: that cast broke the aliasing rules and needed 8-byte
// alignment the strides do not guarantee. Going through the buffer keeps the
// read-then-write behaviour of the original, so a unit may overlap itself.
void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count) {
    auto source = reinterpret_cast<const int16_t*>(sourceF);
    auto dest   = reinterpret_cast<int16_t*>(destF);
    for (size_t i = 0; i < count; ++i) {
        int16_t unit[4];
        ::memcpy(unit, source + i * srcStride, sizeof(unit));
        ::memcpy(dest + i * dstStride, unit, sizeof(unit));
    }
}
// Element-wise sine: dst[i] = sinf(src[i]) for i in [0, dataSize).
void MNNSin(float* dst, const float* src, size_t dataSize) {
    const float* in = src;
    float* out      = dst;
    for (int remaining = 0; remaining < dataSize; remaining++, ++in, ++out) {
        *out = sinf(*in);
    }
}
// Two passes over dataSize floats: MNNExp fills dst from src, then every dst[i]
// is replaced by 1 / (1 + dst[i]).
// NOTE(review): this is the logistic function only if MNNExp stores exp(-src[i]);
// MNNExp is defined elsewhere - confirm.
void MNNSigmoid(float* dst, const float* src, size_t dataSize) {
    MNNExp(dst, src, dataSize);
    float* out = dst;
    for (int idx = 0; idx < dataSize; ++idx, ++out) {
        *out = 1.0f / (1.0f + *out);
    }
}
/**
 Modified from https://github.com/alibaba/MNN/pull/1359
 Thanks for https://github.com/hroken
 */
// Lower-precision variant of the sigmoid pass: MNNExp fills dst from src, then
// every dst[i] becomes 1 / (1 + dst[i]). With NEON, whole groups of four use the
// reciprocal *estimate* instruction (vrecpeq_f32) instead of an exact division;
// the remaining elements use the exact scalar formula.
// NOTE(review): this is the logistic function only if MNNExp stores exp(-src[i]);
// MNNExp is defined elsewhere - confirm.
void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
    MNNExp(dst, src, dataSize);
#ifdef MNN_USE_NEON
    int dataC4 = (int)dataSize / 4;
    if (dataC4 > 0) {
        // NEON path for whole groups of four.
        float32x4_t value = vdupq_n_f32(1.0f);
        for (int i = 0; i < dataC4; ++i) {
            float32x4_t out = vld1q_f32(dst);
            out             = vrecpeq_f32(vaddq_f32(value, out));
            vst1q_f32(dst, out);
            // Advance past EVERY processed group, including the last one, so the
            // scalar loop below starts at the first unprocessed element instead of
            // transforming already-finished values a second time.
            dst += 4;
        }
        dataSize = dataSize - 4 * dataC4;
    }
#endif
    for (int i = 0; i < dataSize; ++i) {
        dst[i] = 1.0f / (1.0f + dst[i]);
    }
}
2021-06-11 17:17:13 +08:00
void MNNMultiAndDestTransformCommon23 ( float * * cacheLine , const float * weigth , float * dest , int cacheLineSize , int ow , const float * bias , const float * parameters ) {
2021-04-08 15:34:23 +08:00
int unit = ow / 2 ;
MNN_ASSERT ( cacheLineSize > = 1 ) ;
2021-06-11 17:17:13 +08:00
auto biasF = Vec4 : : load ( bias ) ;
auto minF = Vec4 ( parameters [ 2 ] ) ;
auto maxF = Vec4 ( parameters [ 3 ] ) ;
2021-04-08 15:34:23 +08:00
for ( int x = 0 ; x < unit ; + + x ) {
auto offset = 4 * 4 * x ;
int i = 0 ;
Vec4 m0 = Vec4 : : load ( weigth + i * 16 + 4 * 0 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 0 ) ;
Vec4 m1 = Vec4 : : load ( weigth + i * 16 + 4 * 1 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 1 ) ;
Vec4 m2 = Vec4 : : load ( weigth + i * 16 + 4 * 2 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 2 ) ;
Vec4 m3 = Vec4 : : load ( weigth + i * 16 + 4 * 3 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 3 ) ;
for ( i = 1 ; i < cacheLineSize ; + + i ) {
m0 = m0 + Vec4 : : load ( weigth + i * 16 + 4 * 0 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 0 ) ;
m1 = m1 + Vec4 : : load ( weigth + i * 16 + 4 * 1 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 1 ) ;
m2 = m2 + Vec4 : : load ( weigth + i * 16 + 4 * 2 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 2 ) ;
m3 = m3 + Vec4 : : load ( weigth + i * 16 + 4 * 3 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 3 ) ;
}
2021-06-11 17:17:13 +08:00
auto o0 = m0 + m1 + m2 + biasF ;
auto o1 = m1 - m2 + m3 + biasF ;
o0 = Vec4 : : min ( maxF , o0 ) ;
o1 = Vec4 : : min ( maxF , o1 ) ;
o0 = Vec4 : : max ( minF , o0 ) ;
o1 = Vec4 : : max ( minF , o1 ) ;
2021-04-08 15:34:23 +08:00
Vec4 : : save ( dest + 8 * x + 0 * 4 , o0 ) ;
Vec4 : : save ( dest + 8 * x + 1 * 4 , o1 ) ;
}
if ( unit * 2 < ow ) {
auto offset = 4 * 4 * unit ;
int i = 0 ;
Vec4 m0 = Vec4 : : load ( weigth + i * 16 + 4 * 0 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 0 ) ;
Vec4 m1 = Vec4 : : load ( weigth + i * 16 + 4 * 1 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 1 ) ;
Vec4 m2 = Vec4 : : load ( weigth + i * 16 + 4 * 2 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 2 ) ;
for ( i = 1 ; i < cacheLineSize ; + + i ) {
m0 = m0 + Vec4 : : load ( weigth + i * 16 + 4 * 0 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 0 ) ;
m1 = m1 + Vec4 : : load ( weigth + i * 16 + 4 * 1 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 1 ) ;
m2 = m2 + Vec4 : : load ( weigth + i * 16 + 4 * 2 ) * Vec4 : : load ( cacheLine [ i ] + offset + 4 * 2 ) ;
}
2021-06-11 17:17:13 +08:00
auto o0 = m0 + m1 + m2 + biasF ;
o0 = Vec4 : : min ( maxF , o0 ) ;
o0 = Vec4 : : max ( minF , o0 ) ;
2021-04-08 15:34:23 +08:00
Vec4 : : save ( dest + 8 * unit + 0 * 4 , o0 ) ;
}
}
extern " C " {
void MNNConvDwF23SourceTransUnit ( const float * source , float * dest , size_t unit ) ;
}
void MNNSourceTransformCommonF23 ( const float * source , float * dest , int unit , int iw , int pad , int su , int eu ) {
for ( int x = 0 ; x < su ; + + x ) {
auto dstX = dest + 4 * 4 * x ;
auto sx = x * 2 - ( int ) pad ;
auto ex = sx + 4 ;
auto clampSx = std : : max ( sx , 0 ) ;
auto clampEx = std : : min ( ex , ( int ) iw ) ;
Vec4 v [ 4 ] = { 0.0f , 0.0f , 0.0f , 0.0f } ;
for ( int i = clampSx ; i < clampEx ; + + i ) {
v [ i - sx ] = Vec4 : : load ( source + 4 * i ) ;
}
auto m0 = v [ 0 ] - v [ 2 ] ;
auto m1 = v [ 1 ] + v [ 2 ] ;
auto m2 = v [ 2 ] - v [ 1 ] ;
auto m3 = v [ 3 ] - v [ 1 ] ;
Vec4 : : save ( dstX + 4 * 0 , m0 ) ;
Vec4 : : save ( dstX + 4 * 1 , m1 ) ;
Vec4 : : save ( dstX + 4 * 2 , m2 ) ;
Vec4 : : save ( dstX + 4 * 3 , m3 ) ;
}
MNNConvDwF23SourceTransUnit ( source + 4 * ( su * 2 - pad ) , dest + 4 * 4 * su , eu - su ) ;
for ( int x = eu ; x < unit ; + + x ) {
auto dstX = dest + 4 * 4 * x ;
auto sx = x * 2 - ( int ) pad ;
auto ex = sx + 4 ;
auto clampSx = std : : max ( sx , 0 ) ;
auto clampEx = std : : min ( ex , ( int ) iw ) ;
Vec4 v [ 4 ] = { 0.0f , 0.0f , 0.0f , 0.0f } ;
for ( int i = clampSx ; i < clampEx ; + + i ) {
v [ i - sx ] = Vec4 : : load ( source + 4 * i ) ;
}
auto m0 = v [ 0 ] - v [ 2 ] ;
auto m1 = v [ 1 ] + v [ 2 ] ;
auto m2 = v [ 2 ] - v [ 1 ] ;
auto m3 = v [ 3 ] - v [ 1 ] ;
Vec4 : : save ( dstX + 4 * 0 , m0 ) ;
Vec4 : : save ( dstX + 4 * 1 , m1 ) ;
Vec4 : : save ( dstX + 4 * 2 , m2 ) ;
Vec4 : : save ( dstX + 4 * 3 , m3 ) ;
}
}
# ifndef MNN_USE_NEON
2021-06-11 17:17:13 +08:00
void MNNConvDwF23MulTransUnit ( float * * cacheLine , const float * weigth , float * dest , size_t ow , const float * bias , const float * parameters ) {
2021-04-08 15:34:23 +08:00
int unit = ow / 2 ;
auto w00 = Vec4 : : load ( weigth + 0 * 16 + 4 * 0 ) ;
auto w01 = Vec4 : : load ( weigth + 0 * 16 + 4 * 1 ) ;
auto w02 = Vec4 : : load ( weigth + 0 * 16 + 4 * 2 ) ;
auto w03 = Vec4 : : load ( weigth + 0 * 16 + 4 * 3 ) ;
auto w10 = Vec4 : : load ( weigth + 1 * 16 + 4 * 0 ) ;
auto w11 = Vec4 : : load ( weigth + 1 * 16 + 4 * 1 ) ;
auto w12 = Vec4 : : load ( weigth + 1 * 16 + 4 * 2 ) ;
auto w13 = Vec4 : : load ( weigth + 1 * 16 + 4 * 3 ) ;
auto w20 = Vec4 : : load ( weigth + 2 * 16 + 4 * 0 ) ;
auto w21 = Vec4 : : load ( weigth + 2 * 16 + 4 * 1 ) ;
auto w22 = Vec4 : : load ( weigth + 2 * 16 + 4 * 2 ) ;
auto w23 = Vec4 : : load ( weigth + 2 * 16 + 4 * 3 ) ;
2021-06-11 17:17:13 +08:00
auto biasF = Vec4 : : load ( bias ) ;
auto minF = Vec4 ( parameters [ 2 ] ) ;
auto maxF = Vec4 ( parameters [ 3 ] ) ;
2021-04-08 15:34:23 +08:00
for ( int x = 0 ; x < unit ; + + x ) {
auto offset = 4 * 4 * x ;
int i = 0 ;
Vec4 m0 = w00 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 0 ) ;
Vec4 m1 = w01 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 1 ) ;
Vec4 m2 = w02 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 2 ) ;
Vec4 m3 = w03 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 3 ) ;
m0 = m0 + w10 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 0 ) ;
m1 = m1 + w11 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 1 ) ;
m2 = m2 + w12 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 2 ) ;
m3 = m3 + w13 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 3 ) ;
m0 = m0 + w20 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 0 ) ;
m1 = m1 + w21 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 1 ) ;
m2 = m2 + w22 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 2 ) ;
m3 = m3 + w23 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 3 ) ;
2021-06-11 17:17:13 +08:00
auto o0 = m0 + m1 + m2 + biasF ;
auto o1 = m1 - m2 + m3 + biasF ;
o0 = Vec4 : : min ( maxF , o0 ) ;
o1 = Vec4 : : min ( maxF , o1 ) ;
o0 = Vec4 : : max ( minF , o0 ) ;
o1 = Vec4 : : max ( minF , o1 ) ;
2021-04-08 15:34:23 +08:00
Vec4 : : save ( dest + 8 * x + 0 * 4 , o0 ) ;
Vec4 : : save ( dest + 8 * x + 1 * 4 , o1 ) ;
}
if ( unit * 2 < ow ) {
auto offset = 4 * 4 * unit ;
Vec4 m0 = w00 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 0 ) ;
Vec4 m1 = w01 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 1 ) ;
Vec4 m2 = w02 * Vec4 : : load ( cacheLine [ 0 ] + offset + 4 * 2 ) ;
m0 = m0 + w10 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 0 ) ;
m1 = m1 + w11 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 1 ) ;
m2 = m2 + w12 * Vec4 : : load ( cacheLine [ 1 ] + offset + 4 * 2 ) ;
m0 = m0 + w20 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 0 ) ;
m1 = m1 + w21 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 1 ) ;
m2 = m2 + w22 * Vec4 : : load ( cacheLine [ 2 ] + offset + 4 * 2 ) ;
2021-06-11 17:17:13 +08:00
auto o0 = m0 + m1 + m2 + biasF ;
o0 = Vec4 : : min ( maxF , o0 ) ;
o0 = Vec4 : : max ( minF , o0 ) ;
2021-04-08 15:34:23 +08:00
Vec4 : : save ( dest + 8 * unit + 0 * 4 , o0 ) ;
}
}
void MNNConvDwF23SourceTransUnit ( const float * source , float * dest , size_t unit ) {
if ( unit < = 0 ) {
return ;
}
Vec4 v0 = Vec4 : : load ( source + 4 * 0 ) ;
Vec4 v1 = Vec4 : : load ( source + 4 * 1 ) ;
Vec4 v2 ;
Vec4 v3 ;
source + = 8 ;
for ( int x = 0 ; x < unit ; + + x ) {
v2 = Vec4 : : load ( source + 0 * 4 ) ;
v3 = Vec4 : : load ( source + 1 * 4 ) ;
auto m0 = v0 - v2 ;
auto m1 = v1 + v2 ;
auto m2 = v2 - v1 ;
auto m3 = v3 - v1 ;
Vec4 : : save ( dest + 4 * 0 , m0 ) ;
Vec4 : : save ( dest + 4 * 1 , m1 ) ;
Vec4 : : save ( dest + 4 * 2 , m2 ) ;
Vec4 : : save ( dest + 4 * 3 , m3 ) ;
source + = 8 ;
dest + = 16 ;
v0 = v2 ;
v1 = v3 ;
}
}
# endif
namespace MNN {
static CoreFunctions * gCoreFunction = nullptr ;
void MNNCoreFunctionInit ( ) {
gCoreFunction = new CoreFunctions ;
// MatMul
gCoreFunction - > MNNGetMatMulPackMode = MNNGetMatMulPackMode ;
2021-06-11 17:17:13 +08:00
gCoreFunction - > MNNGetSparseMatMulPackMode = MNNGetSparseMatMulPackMode ;
2021-04-08 15:34:23 +08:00
gCoreFunction - > MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A ;
gCoreFunction - > MNNPackForMatMul_B = MNNPackForMatMul_B ;
gCoreFunction - > MNNPackedMatMul = MNNPackedMatMul ;
gCoreFunction - > MNNPackedMatMulRemain = MNNPackedMatMulRemain ;
2021-06-11 17:17:13 +08:00
gCoreFunction - > MNNPackForSparseMatMul_B = MNNPackForSparseMatMul_B ; // sparse packing B
gCoreFunction - > MNNPackedSparseMatMulEpx4 = MNNPackedSparseMatMulEpx4 ;
gCoreFunction - > MNNPackedSparseMatMulEpx1 = MNNPackedSparseMatMulEpx1 ;
gCoreFunction - > MNNComputeMatMulForE_1 = MNNComputeMatMulForE_1 ;
gCoreFunction - > MNNComputeMatMulForH_1 = MNNComputeMatMulForH_1 ;
2021-04-08 15:34:23 +08:00
// Lowp
gCoreFunction - > MNNFp32ToLowp = nullptr ;
gCoreFunction - > MNNLowpToFp32 = nullptr ;
gCoreFunction - > bytes = 4 ; // sizeof(float)
// Packed Function
gCoreFunction - > pack = 4 ;
gCoreFunction - > MNNPackCUnit = MNNPackC4 ;
gCoreFunction - > MNNUnpackCUnit = MNNUnpackC4 ;
2021-06-11 17:17:13 +08:00
2021-04-08 15:34:23 +08:00
// FIXME: MNNPackTranspose and MNNUnpackTranspose is reverted
gCoreFunction - > MNNUnpackCUnitTranspose = MNNPackTranspose ;
gCoreFunction - > MNNPackCUnitTranspose = MNNUnpackTranspose ;
gCoreFunction - > MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit ;
gCoreFunction - > MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise ;
gCoreFunction - > MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise ;
gCoreFunction - > MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23 ;
gCoreFunction - > MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit ;
gCoreFunction - > MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23 ;
gCoreFunction - > MNNMatrixAdd = MNNMatrixAdd ;
gCoreFunction - > MNNMatrixSub = MNNMatrixSub ;
gCoreFunction - > MNNStrassenMergeCFunction = MNNStrassenMergeCFunction ;
gCoreFunction - > penalty = 1.5f ;
gCoreFunction - > MNNScaleAndAddBias = MNNScaleAndAddBias ;
2021-06-23 14:10:31 +08:00
gCoreFunction - > MNNGridSampleComputeCord = MNNGridSampleComputeCord ;
gCoreFunction - > MNNGridSampleInterp = MNNGridSampleInterp ;
2021-04-08 15:34:23 +08:00
gCoreFunction - > MNNAddC4WithStride = MNNAddC4WithStride ;
gCoreFunction - > MNNCopyC4WithStride = MNNCopyC4WithStride ;
2021-06-11 17:17:13 +08:00
2021-04-08 15:34:23 +08:00
gCoreFunction - > chooseWinoSourceTransform = WinogradFunction : : chooseSourceTransform ;
gCoreFunction - > chooseWinoDestTransform = WinogradFunction : : chooseDestTransform ;
gCoreFunction - > MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise ;
gCoreFunction - > MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise ;
2021-06-11 17:17:13 +08:00
gCoreFunction - > MNNSelectBinaryFunctionForFloat = CPUBinary : : selectForFloat ;
gCoreFunction - > MNNSelectUnaryFunctionForFloat = CPUUnary : : selectForFloat ;
gCoreFunction - > MNNReluWithSlopeChannel = MNNReluWithSlopeChannel ;
gCoreFunction - > MNNPoolingAvg = ( decltype ( gCoreFunction - > MNNPoolingAvg ) ) ( poolingAvg < float , Vec4 , 4 > ) ;
// Set min value as 1 << 24
gCoreFunction - > MNNPoolingMax = ( decltype ( gCoreFunction - > MNNPoolingMax ) ) ( poolingMax < float , Vec4 , 4 , - 16777216 > ) ;
# ifdef MNN_USE_ARMV82
cpuinfo_arm_isa gCPUInfo ;
cpuinfo_arm_init ( & gCPUInfo ) ;
gCoreFunction - > supportFp16arith = gCPUInfo . fp16arith ;
gCoreFunction - > supportSDot = gCPUInfo . dot ;
# endif
2021-04-08 15:34:23 +08:00
MNNFunctionInit ( ) ;
}
CoreFunctions * MNNGetCoreFunctions ( ) {
return gCoreFunction ;
}
} ;