MNN/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp

272 lines
12 KiB
C++
Raw Normal View History

2019-04-17 10:49:11 +08:00
//
// ConvolutionTiledExecutor.cpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
2020-11-05 16:41:56 +08:00
#include "ConvolutionTiledExecutor.hpp"
2019-12-27 22:16:57 +08:00
#include <MNN/AutoTime.hpp>
#include "backend/cpu/CPUBackend.hpp"
2020-11-05 16:41:56 +08:00
#include "CommonOptFunction.h"
2019-12-27 22:16:57 +08:00
#include "core/Concurrency.h"
2020-11-05 16:41:56 +08:00
#include "ConvOpt.h"
2019-12-27 22:16:57 +08:00
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
2020-11-05 16:41:56 +08:00
#include "math/Vec.hpp"
2021-04-08 15:34:23 +08:00
#include "core/BufferAllocator.hpp"
2019-04-17 10:49:11 +08:00
2020-11-05 16:41:56 +08:00
using Vec4 = MNN::Math::Vec<float, 4>;
2019-04-17 10:49:11 +08:00
namespace MNN {
2021-04-08 15:34:23 +08:00
static void _initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function) {
2020-07-04 01:21:30 +08:00
// Swap k, ic
2020-11-05 16:41:56 +08:00
int dims[4] = {
depth,
kernelSize,
kernelSize,
depth
};
2020-07-04 01:21:30 +08:00
for (int o=0; o<outputCount; ++o) {
auto dO = cache + o * depth * kernelSize;
2020-07-04 01:21:30 +08:00
auto sO = source + o * depth * kernelSize;
2020-11-05 16:41:56 +08:00
MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]);
2020-07-04 01:21:30 +08:00
}
2021-04-08 15:34:23 +08:00
if (function->bytes < 4) {
// Lowp
function->MNNFp32ToLowp((float*)cache, (int16_t*)cache, outputCount * kernelSize * depth);
}
function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
2020-07-04 01:21:30 +08:00
}
2019-04-17 10:49:11 +08:00
// Build a tiled-convolution executor from raw fp32 weights/bias.
// Packs the weights once into the backend's matmul-B layout (stored in
// mResource->mWeight, STATIC memory) via a temporary fp32 cache buffer,
// then aligns/copies the bias. On any allocation failure mValid is left
// false and construction aborts early.
ConvolutionTiledExecutor::ConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize)
    : MNN::Execution(b) {
    // Bias has one element per output channel, so its size gives outputCount.
    auto outputCount = (int)biasSize;
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    // eP/lP/hP: matmul tile sizes for A-rows / inner L / B-rows of this backend.
    int eP, lP, hP;
    auto core = static_cast<CPUBackend*>(b)->functions();
    int bytes = core->bytes;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
    // Don't use common->inputCount for old model common->inputCount is zero
    auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
    auto lSize = srcCount * common->kernelX() * common->kernelY();
    // Packed weight is padded up to multiples of hP (output) and lP (inner dim).
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
        {UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes}));
    std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
    mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC) && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        // NOTE(review): if the first acquire succeeded and the second failed,
        // the first STATIC buffer is not explicitly released here — presumably
        // the backend reclaims STATIC memory on teardown; verify.
        return;
    }
    // Transpose + (optional) lowp-convert + pack weights into mWeight.
    _initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
    // The fp32 cache is only needed during packing; release it immediately.
    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
    mValid = mResource->copyBiasAlign(bias, biasSize);
    if (!mValid) {
        return;
    }
    // The proxy performs the actual resize/execute work.
    mProxy.reset(new ConvolutionTiledExecutorBasic(common, b));
}
2021-01-06 16:29:37 +08:00
ConvolutionTiledExecutor::ConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon* common, Backend* b) : Execution(b) {
mResource = res;
mProxy.reset(new ConvolutionTiledExecutorBasic(common, b));
}
2019-04-17 10:49:11 +08:00
// Members (mResource, mProxy) clean themselves up; nothing manual to do.
ConvolutionTiledExecutor::~ConvolutionTiledExecutor() = default;
// Clone this executor onto backend `bn`, sharing the packed-weight resource.
// Passing dst == nullptr is a capability query: only report whether a clone
// is possible, without building one.
bool ConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false; // nothing valid to clone
    }
    if (dst != nullptr) {
        auto common = op->main_as_Convolution2D()->common();
        *dst = new ConvolutionTiledExecutor(mResource, common, bn);
    }
    return true;
}
2021-01-06 16:29:37 +08:00
// Prepare the tiled im2col + GEMM pipeline for the current shapes.
// Allocates per-thread transpose buffers, precomputes GEMM parameters, and
// builds the per-thread closure stored in mFunction that onExecute runs.
// inputs:  [0] feature map, [1] packed weight, [2] optional bias
// outputs: [0] result feature map
// Returns OUT_OF_MEMORY if any scratch allocation fails, else NO_ERROR.
ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector<Tensor*>& inputs,
                                                  const std::vector<Tensor*>& outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto input = inputs[0];
    auto weight = inputs[1];
    Tensor* bias = nullptr;
    // Backend-specific kernels: element size, channel pack unit, packing and
    // matmul entry points, and the matmul tile sizes eP/lP/hP.
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    int unit = core->pack;
    auto packA = core->MNNPackC4ForMatMul_A;
    auto matmulUnit = core->MNNPackedMatMul;
    auto matmulRemain = core->MNNPackedMatMulRemain;
    int eP, lP, hP;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
    const float* biasPtr = nullptr;
    if (inputs.size() > 2) {
        bias = inputs[2];
        biasPtr = bias->host<float>();
    }
    auto output = outputs[0];
    auto width = output->width();
    auto height = output->height();
    int threadNumber = ((CPUBackend*)backend())->threadNumber();
    auto weightPtr = weight->host<float>();
    auto src_width = input->width();
    auto src_height = input->height();
    int src_z_step = input->width() * input->height() * unit;
    // One GEMM tile covers eP output pixels.
    auto CONVOLUTION_TILED_NUMBER = eP;
    auto icC4 = UP_DIV(input->channel(), unit);
    auto ic = input->channel();
    // L = inner GEMM dimension = ic * kernel area.
    auto L = ic * mCommon->kernelY() * mCommon->kernelX();
    auto kernelSize = mCommon->kernelX() * mCommon->kernelY();

    // Per-thread im2col scratch: one row of UP_DIV(L,lP)*lP*eP elements each.
    mTempBufferTranspose.buffer().type = halide_type_of<uint8_t>();
    mTempBufferTranspose.buffer().dimensions = 2;
    mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
    mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * CONVOLUTION_TILED_NUMBER * bytes;
    TensorUtils::setLinearLayout(&mTempBufferTranspose);
    int tileCount = UP_DIV(width*height, CONVOLUTION_TILED_NUMBER);
    int plane = width * height;

    bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }
    auto outputChannel = output->channel();
    auto oC4 = UP_DIV(outputChannel, unit);
    auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
    // A tile of eP pixels spans at most maxLine output rows; each (kernel tap,
    // line) pair needs one source pointer plus 4 int32 descriptors.
    auto maxLine = UP_DIV(CONVOLUTION_TILED_NUMBER, width) + 1;
    auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float*)));
    if (nullptr == tempPtr.first) {
        return OUT_OF_MEMORY;
    }
    // DYNAMIC release/free here only marks the memory reusable after resize;
    // the lambda below still addresses it during onExecute (MNN allocator contract).
    backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    bufferAlloc->free(tempPtr);
    // Packed-matmul parameter block: [0] eP stride in bytes, [1] L, [2] output
    // channels, [3] output plane stride in bytes, [4]/[5] reserved (zero).
    std::vector<size_t> parameters(6);
    parameters[0] = eP * bytes;
    parameters[1] = L;
    parameters[2] = outputChannel;
    parameters[3] = plane * unit * bytes;
    parameters[4] = 0;
    parameters[5] = 0;
    auto threadNumberFirst = std::min(threadNumber, tileCount);
    auto postParameters = getPostParameters();
    mFunction.first = threadNumberFirst;
    auto strideX = mCommon->strideX();
    auto strideY = mCommon->strideY();
    auto dilateX = mCommon->dilateX();
    auto dilateY = mCommon->dilateY();
    auto padY = mPadY;
    auto padX = mPadX;
    auto kernel_width = mCommon->kernelX();
    auto kernel_height = mCommon->kernelY();
    // Degenerate width-1 case: transpose the problem so the inner loop runs
    // along the long axis (swap x/y roles entirely).
    if (src_width == 1 && width == 1 && height > 1) {
        // Swap x, y
        width = height;
        height = 1;
        padX = mPadY;
        padY = mPadX;
        strideX = strideY;
        strideY = 1;// Don't need stride
        src_width = src_height;
        src_height = 1;
        dilateX = dilateY;
        dilateY = 1;
        kernel_width = kernel_height;
        kernel_height = 1;
    }
    auto outputBatchStride = width * height * oC4 * unit;
    auto inputBatchStride = src_width * src_height * icC4 * unit;
    // Per-thread worker: for each tile of eP output pixels, gather the valid
    // (in-bounds) source segments for every kernel tap, pack them (im2col),
    // then run the packed matmul (full-tile or remainder variant).
    mFunction.second = [=](int tId) {
        auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
        // This thread's slice of the pointer/descriptor scratch.
        auto srcPtr = (float const**)((uint8_t*)tempPtr.first + tempPtr.second + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float*)));
        auto el = (int32_t*)(srcPtr + kernelSize * maxLine);
        // info for packA: [0] segment count, [1] source plane size, [2] eP, [3] strideX.
        int32_t info[4];
        info[1] = src_width * src_height;
        info[2] = eP;
        info[3] = strideX;
        for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) {
            auto dstOrigin = output->host<uint8_t>() + batchIndex * outputBatchStride * bytes;
            auto srcOrigin = input->host<uint8_t>() + batchIndex * inputBatchStride * bytes;

            // Tiles are distributed round-robin across threads.
            for (int x = (int)tId; x < tileCount; x += threadNumberFirst) {
                int start = (int)x * CONVOLUTION_TILED_NUMBER;
                int remain = plane - start;
                // xC: actual pixel count in this tile (last tile may be short).
                int xC = remain > CONVOLUTION_TILED_NUMBER ? CONVOLUTION_TILED_NUMBER : remain;
                // Compute Pack position
                int oyBegin = start / width;
                int oxBegin = start % width;
                int oyEnd = (start + xC-1) / width;
                remain = xC;
                int number = 0;       // segments collected for packA
                bool needZero = false; // true if any tap falls outside -> zero-fill buffer
                int eStart = 0;       // pixel offset of current row inside the tile
                for (int oy=oyBegin; oy <= oyEnd; ++oy) {
                    // Pixels of this tile that lie on output row oy.
                    int step = std::min(width - oxBegin, remain);
                    int sySta = oy * strideY - padY;
                    // Clip kernel rows to the valid source-height range.
                    int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
                    int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
                    if (kyEnd - kyStart < kernel_height) {
                        needZero = true;
                    }
                    for (int ky=kyStart; ky < kyEnd; ++ky) {
                        auto lKYOffset = ky * kernel_width * ic;
                        auto srcKy = srcOrigin + (sySta + ky * dilateY) * src_width * bytes * unit;
                        for (int kx=0; kx<kernel_width;++kx) {
                            // Compute x range:
                            // 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width
                            // 0 <= x <= step
                            int end = std::min(step, (src_width - oxBegin * strideX - dilateX * kx + padX + strideX - 1) / strideX);
                            int sta = std::max(0, UP_DIV((padX - oxBegin*strideX - dilateX * kx), strideX));
                            if (end - sta < step) {
                                needZero = true;
                            }
                            if (end > sta) {
                                // Record one contiguous segment: source pointer plus
                                // {pixel count, channel count, dest pixel offset, dest L offset}.
                                auto lOffset = lKYOffset + (kx * ic);
                                auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit;
                                srcPtr[number] = (const float*)srcKx;
                                el[4 * number + 0] = end - sta;
                                el[4 * number + 1] = ic;
                                el[4 * number + 2] = eStart + sta;
                                el[4 * number + 3] = lOffset;
                                number++;
                            }
                        }
                    }
                    oxBegin = 0;
                    remain -= step;
                    eStart += step;
                }
                info[0] = number;
                // Zero the pack buffer when padding taps were skipped, or when
                // lP > 1 (tail of L must be zero-padded for the packed matmul).
                if (needZero || lP != 1) {
                    ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0));
                }
                if (number > 0) {
                    packA((float*)gemmBuffer, srcPtr, info, el);
                }
                // GEMM
                if (xC == CONVOLUTION_TILED_NUMBER) {
                    matmulUnit((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, parameters.data(), postParameters.data(), biasPtr);
                } else {
                    matmulRemain((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters.data(), postParameters.data(), biasPtr);
                }
            }
        }
    };
    return NO_ERROR;
}
// Run the per-thread closure prepared by onResize across mFunction.first
// threads via the MNN concurrency macros. Inputs/outputs were already bound
// into the closure at resize time, so the arguments are unused here.
ErrorCode ConvolutionTiledExecutorBasic::onExecute(const std::vector<Tensor*>& inputs,
                                                   const std::vector<Tensor*>& outputs) {
    MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
        mFunction.second((int)tId);
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN