2019-04-17 10:49:11 +08:00
|
|
|
//
|
|
|
|
// ConvolutionTiledExecutor.cpp
|
|
|
|
// MNN
|
|
|
|
//
|
|
|
|
// Created by MNN on 2018/07/16.
|
|
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
|
|
//
|
|
|
|
|
2020-11-05 16:41:56 +08:00
|
|
|
#include "ConvolutionTiledExecutor.hpp"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include <MNN/AutoTime.hpp>
|
|
|
|
#include "backend/cpu/CPUBackend.hpp"
|
2020-11-05 16:41:56 +08:00
|
|
|
#include "CommonOptFunction.h"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "core/Concurrency.h"
|
2020-11-05 16:41:56 +08:00
|
|
|
#include "ConvOpt.h"
|
2019-12-27 22:16:57 +08:00
|
|
|
#include "core/Macro.h"
|
|
|
|
#include "core/TensorUtils.hpp"
|
2020-11-05 16:41:56 +08:00
|
|
|
#include "math/Vec.hpp"
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2020-11-05 16:41:56 +08:00
|
|
|
using Vec4 = MNN::Math::Vec<float, 4>;
|
2019-04-17 10:49:11 +08:00
|
|
|
namespace MNN {
|
2020-07-07 19:31:31 +08:00
|
|
|
static void _initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize) {
|
2020-07-04 01:21:30 +08:00
|
|
|
// Swap k, ic
|
2020-11-05 16:41:56 +08:00
|
|
|
int dims[4] = {
|
|
|
|
depth,
|
|
|
|
kernelSize,
|
|
|
|
kernelSize,
|
|
|
|
depth
|
|
|
|
};
|
2020-07-04 01:21:30 +08:00
|
|
|
for (int o=0; o<outputCount; ++o) {
|
2020-07-07 19:31:31 +08:00
|
|
|
auto dO = cache + o * depth * kernelSize;
|
2020-07-04 01:21:30 +08:00
|
|
|
auto sO = source + o * depth * kernelSize;
|
2020-11-05 16:41:56 +08:00
|
|
|
MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]);
|
2020-07-04 01:21:30 +08:00
|
|
|
}
|
2020-07-07 19:31:31 +08:00
|
|
|
MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
|
2020-07-04 01:21:30 +08:00
|
|
|
}
|
2019-06-17 20:10:35 +08:00
|
|
|
/**
 * Runs the convolution whose weight (and optionally bias) arrive as input tensors.
 *
 * Refreshes the temporary bias copy when one is in use, repacks the current
 * weight tensor into mTempWeight, and then forwards to the proxy executor.
 *
 * @param inputs  inputs[0] is the feature map, inputs[1] the weight tensor,
 *                inputs[2] (optional) the bias tensor.
 * @param outputs Output tensors handed straight to the proxy executor.
 * @return The error code reported by the proxy executor.
 */
ErrorCode ConvolutionTiledExecutorMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    auto weightTensor     = inputs[1];
    const int depth       = weightTensor->channel();
    const int outputCount = weightTensor->batch();

    if (nullptr != mTempBias) {
        // Zero the whole temporary bias first so any tail beyond the real bias stays 0.
        ::memset(mTempBias->host<float>(), 0, mTempBias->size());
        if (inputs.size() > 2) {
            auto biasTensor = inputs[2];
            ::memcpy(mTempBias->host<float>(), biasTensor->host<float>(), biasTensor->size());
        }
    }

    // Weights can change between runs, so they are repacked on every execution.
    const int kernelSize = weightTensor->width() * weightTensor->height();
    _initWeight(mTempWeight->host<float>(), weightTensor->host<float>(), mTempWeightCache->host<float>(), depth,
                outputCount, kernelSize);
    return mProxy->onExecute(mInputs, outputs);
}
|
|
|
|
/**
 * Allocates the temporary buffers needed by the variable-weight convolution
 * and resizes the proxy executor against them.
 *
 * Buffers acquired here (packed weight, transpose cache, optional padded bias)
 * are DYNAMIC and are all released again once the proxy has been resized.
 *
 * @param inputs  inputs[0] is the feature map, inputs[1] the weight tensor,
 *                inputs[2] (optional) the bias tensor.
 * @param outputs outputs[0] provides the output channel count.
 * @return OUT_OF_MEMORY if any temporary buffer cannot be acquired, otherwise
 *         the error code reported by the proxy's onResize.
 */
ErrorCode ConvolutionTiledExecutorMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                       const std::vector<Tensor*>& outputs) {
    int depth       = inputs[1]->channel();
    int outputCount = outputs[0]->channel();
    int eP, lP, hP;
    MNNGetMatMulPackMode(&eP, &lP, &hP);

    // Length of one output channel's weights: input channels * kernel width * kernel height.
    const int weightLength = depth * inputs[1]->width() * inputs[1]->height();
    mTempWeight.reset(Tensor::createDevice<float>({UP_DIV(outputCount, hP), weightLength, hP}));
    mTempWeightCache.reset(Tensor::createDevice<float>({weightLength, outputCount}));
    auto res = backend()->onAcquireBuffer(mTempWeight.get(), Backend::DYNAMIC) &&
               backend()->onAcquireBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }

    mTempBias.reset();
    if (inputs.size() > 2 && inputs[2]->elementSize() % 4 == 0) {
        // Bias length is already a multiple of 4: use the input tensor directly.
        mInputs = {inputs[0], mTempWeight.get(), inputs[2]};
    } else if (inputs.size() > 2) {
        // Bias length is not a multiple of 4: stage it in a padded temporary copy.
        mTempBias.reset(Tensor::createDevice<float>({ALIGN_UP4(outputCount)}));
        if (!backend()->onAcquireBuffer(mTempBias.get(), Backend::DYNAMIC)) {
            return OUT_OF_MEMORY;
        }
        mInputs = {inputs[0], mTempWeight.get(), mTempBias.get()};
    } else {
        mInputs = {inputs[0], mTempWeight.get()};
    }

    auto errorCode = mProxy->onResize(mInputs, outputs);

    // Release every DYNAMIC buffer acquired above, including the transpose cache.
    backend()->onReleaseBuffer(mTempWeight.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    if (nullptr != mTempBias) {
        backend()->onReleaseBuffer(mTempBias.get(), Backend::DYNAMIC);
    }
    return errorCode;
}
|
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
/**
 * Builds the executor from constant weights and bias.
 *
 * Packs `originWeight` into a STATIC weight tensor (using a temporary STATIC
 * cache that is released afterwards), copies `bias` into a zero-padded STATIC
 * tensor, and creates the proxy executor. On any allocation failure mValid is
 * left false and construction stops early.
 *
 * @param common           Convolution parameters (kernel size etc.).
 * @param b                Backend used for buffer allocation.
 * @param originWeight     Raw weights, originWeightSize floats.
 * @param originWeightSize Number of floats in originWeight.
 * @param bias             Bias values, biasSize floats.
 * @param biasSize         Number of bias values; also used as the output channel count.
 */
ConvolutionTiledExecutor::ConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize)
    : MNN::Execution(b) {
    const auto outputCount = (int)biasSize;
    int eP, lP, hP;
    MNNGetMatMulPackMode(&eP, &lP, &hP);

    const auto kernelX = common->kernelX();
    const auto kernelY = common->kernelY();
    // Don't use common->inputCount for old model common->inputCount is zero
    const auto srcCount = (int)originWeightSize / outputCount / kernelX / kernelY;

    mWeight.reset(Tensor::createDevice<float>(
        {UP_DIV(outputCount, hP), UP_DIV(srcCount, 4), (int)kernelX, kernelY, 4 * hP}));
    std::shared_ptr<Tensor> cache(Tensor::createDevice<float>({outputCount, srcCount * kernelX * kernelY}));

    // The cache is only acquired when the weight buffer itself was obtained.
    const bool weightReady = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC);
    mValid                 = weightReady && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    _initWeight(mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, kernelX * kernelY);
    // The cache is scratch space for packing only.
    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);

    mBias.reset(Tensor::createDevice<float>({ALIGN_UP4(outputCount)}));
    mValid = backend()->onAcquireBuffer(mBias.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    // Zero-fill first so the padded tail past biasSize is 0.
    auto biasDst = mBias->host<float>();
    ::memset(biasDst, 0, mBias->size());
    ::memcpy(biasDst, bias, biasSize * sizeof(float));

    mProxy.reset(new ConvolutionTiledExecutorBasic(common, b));
}
|
2020-12-15 18:14:15 +08:00
|
|
|
|
|
|
|
/**
 * Builds the executor, optionally borrowing weights that were rearranged ahead of time.
 *
 * When `rearranged_params` is null or its type is RearrangedType_RT_NONE, this
 * falls back to the origin-weight constructor. Otherwise the weight tensor
 * points directly at the rearranged data (no copy, flagged via
 * mBorrowedWeight) and only the bias is allocated and copied.
 *
 * @param common            Convolution parameters.
 * @param rearranged_params Pre-rearranged weights, may be null.
 * @param b                 Backend used for buffer allocation.
 * @param originWeight      Raw weights, used only on the fallback path.
 * @param originWeightSize  Number of floats in originWeight.
 * @param bias              Bias values, biasSize floats.
 * @param biasSize          Number of bias values.
 */
ConvolutionTiledExecutor::ConvolutionTiledExecutor(const Convolution2DCommon* common,
                                                   const RearrangedWeightParam* rearranged_params,
                                                   Backend* b, const float* originWeight,
                                                   size_t originWeightSize, const float* bias, size_t biasSize)
    : MNN::Execution(b) {
    const bool useOriginWeight =
        (nullptr == rearranged_params) || (rearranged_params->type() == RearrangedType_RT_NONE);
    if (useOriginWeight) {
        // NOTE(review): re-constructs this object in place through the origin-weight
        // constructor; confirm no member holds resources at this point.
        new (this) ConvolutionTiledExecutor(common, b, originWeight, originWeightSize, bias, biasSize);
        return;
    }

    MNN_CHECK(b->type() == rearranged_params->backend(), "Backend types are not match.");
    MNN_CHECK(rearranged_params->weight(), "Rearranged weight is empty.");

    int outputChannels = common->outputCount();
    int inputChannels  = common->inputCount();
    int eP, lP, hP;
    MNNGetMatMulPackMode(&eP, &lP, &hP);

    // The weight memory belongs to rearranged_params; the destructor must not release it.
    mBorrowedWeight = true;
    mWeight.reset(Tensor::createDevice<float>(
        {UP_DIV(outputChannels, hP), UP_DIV(inputChannels, 4), common->kernelX(), common->kernelY(), 4 * hP}));
    size_t expectedCount = mWeight->elementSize();
    MNN_CHECK(expectedCount == rearranged_params->weight()->size(), "Rearranged weight size is incorrect.");
    // Should make sure that the rearranged weight will not be released.
    mWeight->buffer().host = (uint8_t*)(rearranged_params->weight()->data());

    mBias.reset(Tensor::createDevice<float>({ALIGN_UP4((int)biasSize)}));
    mValid = backend()->onAcquireBuffer(mBias.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    // Zero-fill first so the padded tail past biasSize is 0.
    auto biasDst = mBias->host<float>();
    ::memset(biasDst, 0, mBias->size());
    ::memcpy(biasDst, bias, biasSize * sizeof(float));

    mProxy.reset(new ConvolutionTiledExecutorBasic(common, b));
}
|
|
|
|
|
2019-04-17 10:49:11 +08:00
|
|
|
/**
 * Returns the STATIC buffers held by this executor to the backend.
 *
 * The weight buffer is released only when it is not borrowed from
 * pre-rearranged weight data.
 */
ConvolutionTiledExecutor::~ConvolutionTiledExecutor() {
    if (mBias != nullptr) {
        backend()->onReleaseBuffer(mBias.get(), Backend::STATIC);
    }
    const bool ownsWeight = !mBorrowedWeight;
    if (ownsWeight && mWeight != nullptr) {
        backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC);
    }
}
|
2019-06-17 20:10:35 +08:00
|
|
|
ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector<Tensor*>& inputs,
|
|
|
|
const std::vector<Tensor*>& outputs) {
|
2019-04-17 10:49:11 +08:00
|
|
|
CPUConvolution::onResize(inputs, outputs);
|
|
|
|
auto input = inputs[0];
|
2019-06-17 20:10:35 +08:00
|
|
|
auto weight = inputs[1];
|
2020-07-23 10:35:12 +08:00
|
|
|
Tensor* bias = nullptr;
|
|
|
|
const float* biasPtr = nullptr;
|
|
|
|
if (inputs.size() > 2) {
|
|
|
|
bias = inputs[2];
|
|
|
|
biasPtr = bias->host<float>();
|
|
|
|
}
|
2019-04-17 10:49:11 +08:00
|
|
|
auto output = outputs[0];
|
2020-07-23 10:35:12 +08:00
|
|
|
auto width = output->width();
|
|
|
|
auto height = output->height();
|
2019-06-17 20:10:35 +08:00
|
|
|
int threadNumber = ((CPUBackend*)backend())->threadNumber();
|
|
|
|
auto weightPtr = weight->host<float>();
|
2020-07-23 10:35:12 +08:00
|
|
|
auto src_width = input->width();
|
|
|
|
auto src_height = input->height();
|
2019-06-17 20:10:35 +08:00
|
|
|
int src_z_step = input->width() * input->height() * 4;
|
2020-07-04 01:21:30 +08:00
|
|
|
int eP, lP, hP;
|
|
|
|
MNNGetMatMulPackMode(&eP, &lP, &hP);
|
|
|
|
auto CONVOLUTION_TILED_NUMBER = eP;
|
2019-04-17 10:49:11 +08:00
|
|
|
auto& tempBuffer = mTempBuffer.buffer();
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
auto icC4 = UP_DIV(input->channel(), 4);
|
2020-07-04 01:21:30 +08:00
|
|
|
auto ic = input->channel();
|
|
|
|
auto L = input->channel() * mCommon->kernelY() * mCommon->kernelX();
|
|
|
|
auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
|
2019-04-17 10:49:11 +08:00
|
|
|
|
|
|
|
tempBuffer.dim[0].extent = threadNumber;
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
tempBuffer.dim[1].extent = CONVOLUTION_TILED_NUMBER;
|
2020-07-04 01:21:30 +08:00
|
|
|
tempBuffer.dim[2].extent = icC4 * mCommon->kernelY() * mCommon->kernelX(); // srcCount * kx*ky
|
2019-04-17 10:49:11 +08:00
|
|
|
tempBuffer.dim[3].extent = 4;
|
|
|
|
TensorUtils::setLinearLayout(&mTempBuffer);
|
2020-07-23 10:35:12 +08:00
|
|
|
|
2020-07-04 01:21:30 +08:00
|
|
|
mTempBufferTranspose.buffer().dimensions = 2;
|
|
|
|
mTempBufferTranspose.buffer().dim[0].extent = threadNumber;
|
|
|
|
mTempBufferTranspose.buffer().dim[1].extent = L * CONVOLUTION_TILED_NUMBER;
|
|
|
|
TensorUtils::setLinearLayout(&mTempBufferTranspose);
|
|
|
|
|
|
|
|
int count = UP_DIV(width*height, CONVOLUTION_TILED_NUMBER);
|
|
|
|
int plane = width * height;
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2020-07-04 01:21:30 +08:00
|
|
|
bool success = backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC) && backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
2019-04-17 10:49:11 +08:00
|
|
|
if (!success) {
|
|
|
|
return OUT_OF_MEMORY;
|
|
|
|
}
|
2020-07-04 01:21:30 +08:00
|
|
|
auto hDiv = MNNGetC4DivNumber(hP);
|
|
|
|
auto outputChannel = output->channel();
|
|
|
|
auto oC4 = UP_DIV(outputChannel, 4);
|
|
|
|
std::shared_ptr<Tensor> cache;
|
|
|
|
if (hP % 4 != 0) {
|
|
|
|
cache.reset(Tensor::createDevice<float>({threadNumber, 4 * hDiv * eP + oC4 * 4 * eP}));
|
|
|
|
success = backend()->onAcquireBuffer(cache.get(), Backend::DYNAMIC);
|
|
|
|
if (!success) {
|
|
|
|
return OUT_OF_MEMORY;
|
|
|
|
}
|
|
|
|
backend()->onReleaseBuffer(cache.get(), Backend::DYNAMIC);
|
|
|
|
}
|
2019-04-17 10:49:11 +08:00
|
|
|
|
2020-07-04 01:21:30 +08:00
|
|
|
backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC);
|
|
|
|
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
|
|
|
std::vector<size_t> parameters(6);
|
|
|
|
parameters[0] = eP * sizeof(float);
|
|
|
|
parameters[1] = L;
|
|
|
|
parameters[2] = outputChannel;
|
|
|
|
parameters[3] = plane * 4 * sizeof(float);
|
|
|
|
parameters[4] = 0;
|
|
|
|
parameters[5] = 0;
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
auto threadNumberFirst = std::min(threadNumber, count);
|
2020-07-04 01:21:30 +08:00
|
|
|
auto postParameters = getPostParameters();
|
2020-07-23 10:35:12 +08:00
|
|
|
mFunction.first = threadNumberFirst;
|
|
|
|
auto strideX = mCommon->strideX();
|
|
|
|
auto strideY = mCommon->strideY();
|
|
|
|
auto dilateX = mCommon->dilateX();
|
|
|
|
auto dilateY = mCommon->dilateY();
|
|
|
|
auto padY = mPadY;
|
|
|
|
auto padX = mPadX;
|
|
|
|
auto kernel_width = mCommon->kernelX();
|
|
|
|
auto kernel_height = mCommon->kernelY();
|
|
|
|
mFunction.second = [=](int tId) {
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
auto colBuffer = mTempBuffer.host<float>() + mTempBuffer.stride(0) * tId;
|
2020-07-04 01:21:30 +08:00
|
|
|
auto gemmBuffer = mTempBufferTranspose.host<float>() + mTempBufferTranspose.stride(0) * tId;
|
|
|
|
float* cachePtr = nullptr;
|
|
|
|
if (nullptr != cache) {
|
|
|
|
cachePtr = cache->host<float>() + tId * cache->stride(0);
|
|
|
|
}
|
2019-04-17 10:49:11 +08:00
|
|
|
for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) {
|
|
|
|
auto dstOrigin = output->host<float>() + batchIndex * output->stride(0);
|
|
|
|
auto srcOrigin = input->host<float>() + batchIndex * input->stride(0);
|
|
|
|
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
for (int x = (int)tId; x < count; x += threadNumberFirst) {
|
|
|
|
int start = (int)x * CONVOLUTION_TILED_NUMBER;
|
|
|
|
int remain = plane - start;
|
|
|
|
int xC = remain > CONVOLUTION_TILED_NUMBER ? CONVOLUTION_TILED_NUMBER : remain;
|
|
|
|
// Im2Col
|
|
|
|
::memset(colBuffer, 0, mTempBuffer.stride(0) * sizeof(float));
|
2020-07-04 01:21:30 +08:00
|
|
|
int oyBegin = start / width;
|
|
|
|
int oxBegin = start % width;
|
|
|
|
int oyEnd = (start + xC-1) / width;
|
|
|
|
remain = xC;
|
|
|
|
auto colIndex = colBuffer;
|
|
|
|
for (int oy=oyBegin; oy <= oyEnd; ++oy) {
|
|
|
|
int step = std::min(width - oxBegin, remain);
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
int sySta = oy * strideY - padY;
|
2020-07-04 01:21:30 +08:00
|
|
|
int kyStart = std::max(0, UP_DIV(-sySta, dilateY));
|
|
|
|
int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY));
|
|
|
|
for (int i=0; i<step; ++i) {
|
|
|
|
int ox = i + oxBegin;
|
|
|
|
int sxSta = ox * strideX - padX;
|
|
|
|
int kxStart = std::max(0, UP_DIV(-sxSta, dilateX));
|
|
|
|
int kxEnd = std::min(kernel_width, UP_DIV(src_width - sxSta, dilateX));
|
|
|
|
// ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uConstant.dilate)));
|
|
|
|
// ivec2 efxy = min(uConstant.kernelSize, UP_DIV(inputSize.xy-s0, uConstant.dilate));
|
|
|
|
auto srcStart = srcOrigin + sxSta * 4 + sySta * 4 * src_width;
|
|
|
|
auto dstStart = colIndex + 4 * i;
|
|
|
|
for (int sz=0; sz<icC4; ++sz) {
|
|
|
|
auto srcZ = srcStart + src_z_step * sz;
|
|
|
|
auto dstZ = dstStart + 4 * CONVOLUTION_TILED_NUMBER * kernel_height * kernel_width * sz;
|
|
|
|
for (int ky=kyStart; ky<kyEnd; ++ky) {
|
|
|
|
auto sy = ky * dilateY;
|
|
|
|
auto srcY = srcZ + sy * 4 * src_width;
|
|
|
|
auto dstY = dstZ + 4 * CONVOLUTION_TILED_NUMBER * (ky*kernel_width);
|
|
|
|
for (int kx=kxStart; kx<kxEnd; ++kx) {
|
|
|
|
auto sx = kx * dilateX;
|
|
|
|
auto srcX = srcY + sx * 4;
|
|
|
|
auto dstX = dstY + 4 * CONVOLUTION_TILED_NUMBER * kx;
|
2020-11-05 16:41:56 +08:00
|
|
|
Vec4::save(dstX, Vec4::load(srcX));
|
2020-07-04 01:21:30 +08:00
|
|
|
}
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
}
|
2019-04-17 10:49:11 +08:00
|
|
|
}
|
|
|
|
}
|
2020-07-04 01:21:30 +08:00
|
|
|
oxBegin = 0;
|
|
|
|
remain -= step;
|
|
|
|
colIndex += 4 * step;
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
}
|
2020-07-04 01:21:30 +08:00
|
|
|
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
// GEMM
|
2020-07-04 01:21:30 +08:00
|
|
|
MNNPackC4ForMatMul_A(gemmBuffer, colBuffer, CONVOLUTION_TILED_NUMBER * kernelSize, ic, CONVOLUTION_TILED_NUMBER * kernelSize);
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
if (xC == CONVOLUTION_TILED_NUMBER) {
|
2020-07-04 01:21:30 +08:00
|
|
|
MNNPackedMatMul(dstOrigin + start * 4, gemmBuffer, weightPtr, parameters.data(), cachePtr, postParameters.data(), biasPtr);
|
- build:
- unify schema building in core and converter;
- add more build script for android;
- add linux build script for python;
- ops impl:
- add floor mod support in binary;
- use eltwise impl in add/max/sub/mul binary for optimization;
- remove fake double support in cast;
- fix 5d support for concat;
- add adjX and adjY support for batch matmul;
- optimize conv2d back prop filter;
- add pad mode support for conv3d;
- fix bug in conv2d & conv depthwise with very small feature map;
- optimize binary without broadcast;
- add data types support for gather;
- add gather ND support;
- use uint8 data type in gather v2;
- add transpose support for matmul;
- add matrix band part;
- add dim != 4 support for padding, reshape & tensor convert;
- add pad type support for pool3d;
- make ops based on TensorFlow Lite quantization optional;
- add all & any support for reduction;
- use type in parameter as output type in reduction;
- add int support for unary;
- add variable weight support for conv2d;
- fix conv2d depthwise weights initialization;
- fix type support for transpose;
- fix grad outputs count for reduce grad and reshape grad;
- fix priorbox & detection output;
- fix metal softmax error;
- python:
- add runSessionWithCallBackInfo interface;
- add max nodes limit (1400) for visualization tool;
- fix save error in python3;
- align default dim;
- convert:
- add extra design for optimization;
- add more post converting optimizers;
- add caffe v1 weights blob support;
- add cast, unary, conv transpose support for onnx model;
- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
- add cos/sin/atan/tan support for unary for tensorflow model;
- add any/all support for reduction for tensorflow model;
- add elu, conv3d, pool3d support for tensorflow model;
- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
- fix size computer lock;
- fix thread pool deadlock;
- add express & parameters in express;
- rewrite blitter chooser without static map;
- add tests for expr;
2019-10-29 13:37:26 +08:00
|
|
|
} else {
|
2020-07-04 01:21:30 +08:00
|
|
|
MNNPackedMatMulRemain(dstOrigin + start * 4, gemmBuffer, weightPtr, xC, parameters.data(), cachePtr, postParameters.data(), biasPtr);
|
2019-04-17 10:49:11 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
return NO_ERROR;
|
|
|
|
}
|
|
|
|
|
2019-06-17 20:10:35 +08:00
|
|
|
// Executes the stored work item across the concurrency region and reports success.
//
// mFunction is used as a pair: mFunction.first is handed to
// MNN_CONCURRENCY_BEGIN together with the loop variable tId, and
// mFunction.second is a callable invoked once per tId.
// NOTE(review): presumably mFunction.first is the number of parallel tasks and
// the pair is prepared during resize — confirm against the class definition.
//
// `inputs` and `outputs` are not referenced in this body.
// Always returns NO_ERROR.
ErrorCode ConvolutionTiledExecutorBasic::onExecute(const std::vector<Tensor*>& inputs,
|
|
|
|
const std::vector<Tensor*>& outputs) {
|
2020-07-23 10:35:12 +08:00
|
|
|
MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
|
|
|
|
// tId is explicitly cast to int before being passed to the stored callable.
mFunction.second((int)tId);
|
2019-04-17 10:49:11 +08:00
|
|
|
}
|
2020-07-23 10:35:12 +08:00
|
|
|
|
// Closes the region opened by MNN_CONCURRENCY_BEGIN above; the two macros must stay paired.
MNN_CONCURRENCY_END();
|
2019-04-17 10:49:11 +08:00
|
|
|
|
return NO_ERROR;
|
|
|
|
|
}
|
|
|
|
} // namespace MNN
|