//
// ConvolutionDepthwise3x3.cpp
// MNN
//
// Created by MNN on 2019/4/3.
// Copyright © 2018, Alibaba Group Holding Limited
//

#include "ConvolutionDepthwise3x3.hpp"
#include "CPUBackend.hpp"
#include "Concurrency.h"
#include "Macro.h"
#include "Vec4.hpp"

using namespace MNN::Math;
extern "C" {
void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow);
void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
}

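/* Multiply the Winograd-transformed source tiles with the transformed weights and apply the F(2,3)
   output transform along the width: for each 2-wide output tile, o0 = m0 + m1 + m2 and
   o1 = m1 - m2 + m3, where each m_k accumulates the element-wise products over the cacheLineSize
   kernel rows. The tail branch handles an odd output width, where only the first of the two tile
   outputs is valid. */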
static void _multiAndDestTransformCommon(float **cacheLine, const float *weigth, float *dest, int cacheLineSize,
                                         int ow) {
    int unit = ow / 2;
    for (int x = 0; x < unit; ++x) {
        auto offset = 4 * 4 * x;
        Vec4 m0 = 0.0f;
        Vec4 m1 = 0.0f;
        Vec4 m2 = 0.0f;
        Vec4 m3 = 0.0f;

        for (int i = 0; i < cacheLineSize; ++i) {
            m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
            m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
            m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
            m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
        }

        auto o0 = m0 + m1 + m2;
        auto o1 = m1 - m2 + m3;
        Vec4::save(dest + 8 * x + 0 * 4, o0);
        Vec4::save(dest + 8 * x + 1 * 4, o1);
    }
    if (unit * 2 < ow) {
        auto offset = 4 * 4 * unit;
        Vec4 m0 = 0.0f;
        Vec4 m1 = 0.0f;
        Vec4 m2 = 0.0f;

        for (int i = 0; i < cacheLineSize; ++i) {
            m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
            m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
            m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
        }

        auto o0 = m0 + m1 + m2;
        Vec4::save(dest + 8 * unit + 0 * 4, o0);
    }
}

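/* Apply the F(2,3) source transform to one input row: each 2-output tile reads a 4-wide window
   v0..v3 (one Vec4 of 4 channels per column) and produces m0 = v0 - v2, m1 = v1 + v2,
   m2 = v2 - v1, m3 = v3 - v1. Tiles in [0, su) may read left of the row and tiles in [eu, unit)
   may read past its end, so those are clamped and zero-padded here; the middle tiles [su, eu)
   need no clamping and go through the fast MNNConvDwF23SourceTransUnit path. */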
static void _sourceTransformCommon(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
    for (int x = 0; x < su; ++x) {
        auto dstX = dest + 4 * 4 * x;
        auto sx = x * 2 - (int)pad;
        auto ex = sx + 4;

        auto clampSx = std::max(sx, 0);
        auto clampEx = std::min(ex, (int)iw);

        Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int i = clampSx; i < clampEx; ++i) {
            v[i - sx] = Vec4::load(source + 4 * i);
        }
        auto m0 = v[0] - v[2];
        auto m1 = v[1] + v[2];
        auto m2 = v[2] - v[1];
        auto m3 = v[3] - v[1];

        Vec4::save(dstX + 4 * 0, m0);
        Vec4::save(dstX + 4 * 1, m1);
        Vec4::save(dstX + 4 * 2, m2);
        Vec4::save(dstX + 4 * 3, m3);
    }
    MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);

    for (int x = eu; x < unit; ++x) {
        auto dstX = dest + 4 * 4 * x;
        auto sx = x * 2 - (int)pad;
        auto ex = sx + 4;

        auto clampSx = std::max(sx, 0);
        auto clampEx = std::min(ex, (int)iw);

        Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int i = clampSx; i < clampEx; ++i) {
            v[i - sx] = Vec4::load(source + 4 * i);
        }
        auto m0 = v[0] - v[2];
        auto m1 = v[1] + v[2];
        auto m2 = v[2] - v[1];
        auto m3 = v[3] - v[1];

        Vec4::save(dstX + 4 * 0, m0);
        Vec4::save(dstX + 4 * 1, m1);
        Vec4::save(dstX + 4 * 2, m2);
        Vec4::save(dstX + 4 * 3, m3);
    }
}

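/* Generic C++ fallbacks for the two kernels declared above; when MNN_USE_NEON is defined they are
   expected to come from platform-specific (NEON) implementations instead. */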
#ifndef MNN_USE_NEON
void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow) {
    _multiAndDestTransformCommon(cacheLine, weigth, dest, 3, ow);
}
void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
    for (int x = 0; x < unit; ++x) {
        auto dstX = dest + 4 * 4 * x;
        auto sx = x * 2;
        Vec4 v[4];
        for (int i = 0; i < 4; ++i) {
            v[i] = Vec4::load(source + 4 * sx + 4 * i);
        }
        auto m0 = v[0] - v[2];
        auto m1 = v[1] + v[2];
        auto m2 = v[2] - v[1];
        auto m3 = v[3] - v[1];

        Vec4::save(dstX + 4 * 0, m0);
        Vec4::save(dstX + 4 * 1, m1);
        Vec4::save(dstX + 4 * 2, m2);
        Vec4::save(dstX + 4 * 3, m3);
    }
}
#endif

namespace MNN {
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
                                                 const float *originWeight, size_t originWeightSize,
                                                 const float *bias, size_t biasSize)
    : CPUConvolution(common, b) {
    MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
    MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
    MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
    mBias.reset(Tensor::createDevice<float>({(int)ALIGN_UP4(biasSize)}));
    mValid = backend()->onAcquireBuffer(mBias.get(), Backend::STATIC);
    if (!mValid) {
        MNN_ERROR("Error for alloc memory in ConvolutionDepthwise3x3\n");
        return;
    }
    ::memset(mBias->host<float>(), 0, mBias->size());
    ::memcpy(mBias->host<float>(), bias, biasSize * sizeof(float));
    auto channel = common->outputCount();
    auto channelC4 = UP_DIV(channel, 4);
    mWeight.reset(Tensor::createDevice<float>({channelC4, 3, 4, 4}));
    mValid = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC);
    if (!mValid) {
        MNN_ERROR("Error for alloc memory in ConvolutionDepthwise3x3\n");
        return;
    }
    auto weightHost = mWeight->host<float>();
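    // mWeight holds channelC4 * 3 * 4 * 4 floats; zero it first so the channels in the padded
    // tail of the last 4-channel group stay 0.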
    ::memset(weightHost, 0, mWeight->size());
    /* 1D-Winograd F(2,3) and tiling */
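    /* Transform each 3x3 kernel row with the F(2,3) weight matrix
       G = {{1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1}}:
       row (k0, k1, k2) becomes (k0, 0.5*(k0+k1+k2), 0.5*(k0-k1+k2), k2).
       The result is stored as channelC4 x 3 x 4 x 4, i.e. per kernel row y, 4 transformed taps,
       each packed over 4 channels. */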
    for (int c = 0; c < channel; ++c) {
        auto cIndex = c / 4;
        auto cRemain = c % 4;
        auto weightDstZ = weightHost + cIndex * 4 * 4 * 3 + cRemain;
        auto weightSrcZ = originWeight + c * 9;
        for (int y = 0; y < 3; ++y) {
            auto k0 = weightSrcZ[3 * y + 0];
            auto k1 = weightSrcZ[3 * y + 1];
            auto k2 = weightSrcZ[3 * y + 2];

            auto m0 = k0;
            auto m1 = 0.5f * (k0 + k1 + k2);
            auto m2 = 0.5f * (k0 - k1 + k2);
            auto m3 = k2;

            weightDstZ[y * 16 + 4 * 0] = m0;
            weightDstZ[y * 16 + 4 * 1] = m1;
            weightDstZ[y * 16 + 4 * 2] = m2;
            weightDstZ[y * 16 + 4 * 3] = m3;
        }
    }
}

ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
    if (nullptr != mBias) {
        backend()->onReleaseBuffer(mBias.get(), Backend::STATIC);
    }
    if (nullptr != mWeight) {
        backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC);
    }
}

ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto output = outputs[0];
    auto owUnit = UP_DIV(output->width(), 2);
    // 3 cache lines per thread, 4 is the unit of the transform
    mCacheLine.reset(Tensor::createDevice<float>({numberThread, 3, owUnit * 4, 4}));
    auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
    if (!valid) {
        return OUT_OF_MEMORY;
    }
    backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
    auto iw = inputs[0]->width();
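    // Output tiles in [mSourceStartX, mSourceEndX) read a 4-wide source window that lies fully
    // inside [0, iw), so the source transform can skip the padding clamp for them
    // (see _sourceTransformCommon).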
    mSourceStartX = UP_DIV(mPadX, 2);
    mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX);

    // auto rate = (float)(mSourceEndX - mSourceStartX) / (float)owUnit;
    // FUNC_PRINT_ALL(rate, f);
    return NO_ERROR;
}

ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
                                             const std::vector<Tensor *> &outputs) {
    auto input = inputs[0];
    auto output = outputs[0];
    int channelC4 = UP_DIV(input->channel(), 4);
    int initSize = std::min(input->height(), 2);
    int batch = input->batch();
    int ow = output->width();
    int oh = output->height();
    int owUnit = UP_DIV(ow, 2);

    auto iw = input->width();
    auto ih = input->height();
    auto kernelOrigin = mWeight->host<float>();

    /* oy - mPadY >= 0 */
    int middelYStart = mPadY;

    /* oy - mPadY + 3 - 1 < ih */
    int middelYEnd = std::max(ih - 2 + mPadY, middelYStart);

    int threadNumber = ((CPUBackend *)backend())->threadNumber();
    auto maxKernelH = std::min(mPadY + ih, 3);

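    /* The convolution is computed as 1D-Winograd F(2,3) along the width. Each output row y needs
       the source transform of up to three input rows (y - mPadY .. y - mPadY + 2); those live in
       the three rolling cache lines. Top and bottom rows, where part of the 3-row window falls
       outside the input, reuse the generic transform with a reduced cacheLineSize (and, for the
       top rows, a kernel pointer shifted past the rows that fall above the input). */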
    for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
        auto inputOrigin = input->host<float>() + batchIndex * input->stride(0);
        auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
        MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
            auto cacheLineStart = mCacheLine->host<float>() + tId * mCacheLine->stride(0);
            for (int z = (int)tId; z < channelC4; z += threadNumber) {
                auto inputZ = inputOrigin + 4 * z * iw * ih;
                auto outputZ = outputOrigin + 4 * z * ow * oh;
                auto kernelZ = kernelOrigin + z * mWeight->stride(0);
                auto cacheLine0 = cacheLineStart + 16 * owUnit * 0;
                auto cacheLine1 = cacheLineStart + 16 * owUnit * 1;
                auto cacheLine2 = cacheLineStart + 16 * owUnit * 2;

                float *cacheLine[3] = {cacheLine0, cacheLine1, cacheLine2};

                // Init
                for (int i = 0; i < initSize; ++i) {
                    _sourceTransformCommon(inputZ + i * iw * 4, cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
                                           mSourceEndX);
                }

                // Compute Top
                for (int y = 0; y < middelYStart; ++y) {
                    auto outputY = outputZ + y * 4 * ow;
                    int cacheLineSize = y - mPadY + maxKernelH;
                    if (cacheLineSize <= 0) {
                        ::memset(outputY, 0, 4 * ow * sizeof(float));
                        continue;
                    }
                    auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 16;
                    _multiAndDestTransformCommon(cacheLine, kernelPtr, outputY, cacheLineSize, ow);
                }

                // Compute Mid
                for (int y = middelYStart; y < middelYEnd; ++y) {
                    auto outputY = outputZ + y * 4 * ow;
                    auto iy = y - mPadY + 2;
                    _sourceTransformCommon(inputZ + 4 * iy * iw, cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
                                           mSourceEndX);
                    // FUNC_PRINT(ow);
                    MNNConvDwF23MulTransUnit(cacheLine, kernelZ, outputY, ow);
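                    // Rotate the cache window down one input row; the next iteration overwrites cacheLine[2].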
                    auto temp = cacheLine[0];
                    cacheLine[0] = cacheLine[1];
                    cacheLine[1] = cacheLine[2];
                    cacheLine[2] = temp;
                }

                // Compute Bottom
                for (int y = middelYEnd; y < oh; ++y) {
                    auto outputY = outputZ + y * 4 * ow;
                    int cacheLineSize = (ih - y + mPadY);
                    if (cacheLineSize <= 0) {
                        ::memset(outputY, 0, 4 * ow * sizeof(float));
                        continue;
                    }
                    _multiAndDestTransformCommon(cacheLine, kernelZ, outputY, cacheLineSize, ow);
                    cacheLine[0] = cacheLine[1];
                    cacheLine[1] = cacheLine[2];
                }
                mPostFunction(outputZ, mBias->host<float>() + 4 * z, ow * oh, 1);
            }
        }
        MNN_CONCURRENCY_END();
    }
    return NO_ERROR;
}
} // namespace MNN