//
// ConvolutionTiledExecutor.cpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "ConvolutionTiledExecutor.hpp"
#include <MNN/AutoTime.hpp>
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "ConvOpt.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "math/Vec.hpp"
#include "core/BufferAllocator.hpp"

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
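
// Repack convolution weights for the tiled GEMM path:
//  1. transpose each output channel's [ic, k] block to [k, ic], matching the l = k * ic
//     ordering produced by the im2col gather in onResize,
//  2. optionally narrow fp32 weights to the backend's low-precision type,
//  3. pack the result as matrix B for MNNPackedMatMul.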
static void _initWeight(float* dest, const float* source, float* cache, int depth, int outputCount, int kernelSize,
                        const CoreFunctions* function) {
    // Swap k, ic
    int dims[4] = {
        depth,
        kernelSize,
        kernelSize,
        depth,
    };
    for (int o = 0; o < outputCount; ++o) {
        auto dO = cache + o * depth * kernelSize;
        auto sO = source + o * depth * kernelSize;
        MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]);
    }
    if (function->bytes < 4) {
        // Lowp
        function->MNNFp32ToLowp((float*)cache, (int16_t*)cache, outputCount * kernelSize * depth);
    }
    function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
}
ConvolutionTiledExecutor::ConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize)
    : MNN::Execution(b) {
    auto outputCount = (int)biasSize;
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    int eP, lP, hP;
    auto core = static_cast<CPUBackend*>(b)->functions();
    int bytes = core->bytes;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
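    // eP/lP/hP are the e (row tile), l (reduction) and h (output channel) pack units of the
    // backend's packed matmul kernels; the weight buffer below is padded up to lP and hP.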
    // Don't use common->inputCount: for old models, common->inputCount is zero
    auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
    auto lSize    = srcCount * common->kernelX() * common->kernelY();
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
        {UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes}));
    std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>(
        {outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
    mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC) &&
             backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    _initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount,
                common->kernelX() * common->kernelY(), core);
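    // The fp32 staging cache is only needed while packing weights, so it is released right away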
    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
    mValid = mResource->copyBiasAlign(bias, biasSize);
    if (!mValid) {
        return;
    }
    mProxy.reset(new ConvolutionTiledExecutorBasic(common, b));
}

ConvolutionTiledExecutor::ConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res,
                                                   const Convolution2DCommon* common, Backend* b)
    : Execution(b) {
    mResource = res;
    mProxy.reset(new ConvolutionTiledExecutorBasic(common, b));
}
ConvolutionTiledExecutor::~ConvolutionTiledExecutor() {
    // Do nothing
}
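
// Clones share the immutable packed weight/bias resource; only the per-execution proxy is rebuilt.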
bool ConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    *dst = new ConvolutionTiledExecutor(mResource, op->main_as_Convolution2D()->common(), bn);
    return true;
}

ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector<Tensor*>& inputs,
                                                  const std::vector<Tensor*>& outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto input   = inputs[0];
    auto weight  = inputs[1];
    Tensor* bias = nullptr;
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    int unit  = core->pack;
    auto packA        = core->MNNPackC4ForMatMul_A;
    auto matmulUnit   = core->MNNPackedMatMul;
    auto matmulRemain = core->MNNPackedMatMulRemain;
    int eP, lP, hP;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
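    // bytes is the size of the backend's compute element and unit its channel pack. packA gathers
    // input patches into the packed-A (im2col) layout; matmulUnit/matmulRemain run full and partial tiles.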
    const float* biasPtr = nullptr;
    if (inputs.size() > 2) {
        bias    = inputs[2];
        biasPtr = bias->host<float>();
    }
    auto output = outputs[0];
    auto width  = output->width();
    auto height = output->height();
    int threadNumber = ((CPUBackend*)backend())->threadNumber();
    auto weightPtr   = weight->host<float>();
    auto src_width  = input->width();
    auto src_height = input->height();
    int src_z_step  = input->width() * input->height() * unit;
    auto CONVOLUTION_TILED_NUMBER = eP;
    auto icC4 = UP_DIV(input->channel(), unit);
    auto ic   = input->channel();
    auto L    = ic * mCommon->kernelY() * mCommon->kernelX();
    auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
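    // L is the GEMM reduction length: each output point is a dot product over ic * kh * kw values.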

    mTempBufferTranspose.buffer().type          = halide_type_of<uint8_t>();
    mTempBufferTranspose.buffer().dimensions    = 2;
    mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
    mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * CONVOLUTION_TILED_NUMBER * bytes;
    TensorUtils::setLinearLayout(&mTempBufferTranspose);
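    // One packed-A staging tile per thread: eP output points by UP_DIV(L, lP) * lP reduction bytes.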
    int tileCount = UP_DIV(width * height, CONVOLUTION_TILED_NUMBER);
    int plane     = width * height;

    bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }
    auto outputChannel = output->channel();
    auto oC4           = UP_DIV(outputChannel, unit);
    auto bufferAlloc   = static_cast<CPUBackend*>(backend())->getBufferAllocator();
    auto maxLine       = UP_DIV(CONVOLUTION_TILED_NUMBER, width) + 1;
    auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float*)));
    if (nullptr == tempPtr.first) {
        return OUT_OF_MEMORY;
    }
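    // tempPtr holds, per thread, up to kernelSize * maxLine gather blocks: one source pointer plus
    // one {e, l, eOffset, lOffset} int32 quadruple each, consumed by MNNPackC4ForMatMul_A below.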
    backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    bufferAlloc->free(tempPtr);
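    // Acquire/release inside onResize follows the CPU backend's dynamic-allocation protocol:
    // the memory plan is fixed here, and the addresses stay valid for use inside onExecute.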
2020-07-04 01:21:30 +08:00
std : : vector < size_t > parameters ( 6 ) ;
2021-04-08 15:34:23 +08:00
parameters [ 0 ] = eP * bytes ;
2020-07-04 01:21:30 +08:00
parameters [ 1 ] = L ;
parameters [ 2 ] = outputChannel ;
2021-04-08 15:34:23 +08:00
parameters [ 3 ] = plane * unit * bytes ;
2020-07-04 01:21:30 +08:00
parameters [ 4 ] = 0 ;
parameters [ 5 ] = 0 ;
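    // Packed matmul parameters: [0] byte stride of one packed-A row block, [1] reduction length L,
    // [2] output channel count, [3] byte stride between output channel-unit planes; [4]/[5] unused here.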
    auto threadNumberFirst = std::min(threadNumber, tileCount);
    auto postParameters    = getPostParameters();
    mFunction.first = threadNumberFirst;
    auto strideX = mCommon->strideX();
    auto strideY = mCommon->strideY();
    auto dilateX = mCommon->dilateX();
    auto dilateY = mCommon->dilateY();
    auto padY = mPadY;
    auto padX = mPadX;
    auto kernel_width  = mCommon->kernelX();
    auto kernel_height = mCommon->kernelY();
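    // Degenerate 1-wide output column: swap the x and y axes so each eP-point tile maps onto one
    // output line instead of eP separate lines, keeping the per-tile gather block count small.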
    if (src_width == 1 && width == 1 && height > 1) {
        // Swap x, y
        width      = height;
        height     = 1;
        padX       = mPadY;
        padY       = mPadX;
        strideX    = strideY;
        strideY    = 1; // With height == 1 after the swap, strideY is never used
        src_width  = src_height;
        src_height = 1;
        dilateX    = dilateY;
        dilateY    = 1;
        kernel_width  = kernel_height;
        kernel_height = 1;
    }
    auto outputBatchStride = width * height * oC4 * unit;
    auto inputBatchStride  = src_width * src_height * icC4 * unit;
    mFunction.second = [=](int tId) {
        auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
        auto srcPtr = (float const**)((uint8_t*)tempPtr.first + tempPtr.second +
                                      tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float*)));
        auto el = (int32_t*)(srcPtr + kernelSize * maxLine);
        int32_t info[4];
        info[1] = src_width * src_height;
        info[2] = eP;
        info[3] = strideX;
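        // info layout for MNNPackC4ForMatMul_A: info[0] = block count (set per tile below),
        // info[1] = source plane size, info[2] = destination row count eP, info[3] = x stride.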
        for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) {
            auto dstOrigin = output->host<uint8_t>() + batchIndex * outputBatchStride * bytes;
            auto srcOrigin = input->host<uint8_t>() + batchIndex * inputBatchStride * bytes;
            for (int x = (int)tId; x < tileCount; x += threadNumberFirst) {
                int start  = (int)x * CONVOLUTION_TILED_NUMBER;
                int remain = plane - start;
                int xC     = remain > CONVOLUTION_TILED_NUMBER ? CONVOLUTION_TILED_NUMBER : remain;
                // Compute pack positions: which output lines this tile of xC points covers
                int oyBegin = start / width;
                int oxBegin = start % width;
                int oyEnd   = (start + xC - 1) / width;
                remain      = xC;
                int number    = 0;
                bool needZero = false;
                int eStart    = 0;
                for (int oy = oyBegin; oy <= oyEnd; ++oy) {
                    int step = std::min(width - oxBegin, remain);
                    int sySta   = oy * strideY - padY;
                    int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
                    int kyEnd   = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
                    if (kyEnd - kyStart < kernel_height) {
                        needZero = true;
                    }
                    for (int ky = kyStart; ky < kyEnd; ++ky) {
                        auto lKYOffset = ky * kernel_width * ic;
                        auto srcKy     = srcOrigin + (sySta + ky * dilateY) * src_width * bytes * unit;
                        for (int kx = 0; kx < kernel_width; ++kx) {
                            // Compute x range:
                            // 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width
                            // 0 <= x < step
                            int end = std::min(step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
                            int sta = std::max(0, UP_DIV((padX - oxBegin * strideX - dilateX * kx), strideX));
                            if (end - sta < step) {
                                needZero = true;
                            }
                            if (end > sta) {
                                auto lOffset = lKYOffset + (kx * ic);
                                auto srcKx   = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
                                srcPtr[number]     = (const float*)srcKx;
                                el[4 * number + 0] = end - sta;
                                el[4 * number + 1] = ic;
                                el[4 * number + 2] = eStart + sta;
                                el[4 * number + 3] = lOffset;
                                number++;
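                                // Each block records one horizontal run: e = end - sta points of
                                // depth l = ic, starting at eStart + sta within the tile, written
                                // at row offset lOffset of the im2col matrix.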
                            }
                        }
                    }
                    oxBegin = 0;
                    remain -= step;
                    eStart += step;
                }
                info[0] = number;
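                // Blocks clipped by padding (needZero) or lP-alignment gaps leave holes in the
                // packed tile, so it is zero-filled before the gather.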
                if (needZero || lP != 1) {
                    ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0));
                }
                if (number > 0) {
                    packA((float*)gemmBuffer, srcPtr, info, el);
                }
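                // Full tiles run the fixed-size eP kernel; the last partial tile (xC < eP) falls
                // back to the variable-size remainder kernel.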
                // GEMM
                if (xC == CONVOLUTION_TILED_NUMBER) {
                    matmulUnit((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr,
                               parameters.data(), postParameters.data(), biasPtr);
                } else {
                    matmulRemain((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC,
                                 parameters.data(), postParameters.data(), biasPtr);
                }
            }
        }
    };
    return NO_ERROR;
}
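
// All shape-dependent work was captured in mFunction during onResize; onExecute just fans the
// lambda out across mFunction.first threads.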
ErrorCode ConvolutionTiledExecutorBasic::onExecute(const std::vector<Tensor*>& inputs,
                                                   const std::vector<Tensor*>& outputs) {
    MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
        mFunction.second((int)tId);
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN