MNN/source/backend/cpu/CPUDeconvolutionDepthwise.cpp

//
//  CPUDeconvolutionDepthwise.cpp
//  MNN
//
//  Created by MNN on 2018/07/23.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/CPUDeconvolutionDepthwise.hpp"
#include <string.h>
#include "backend/cpu/CPUBackend.hpp"
#include "MNN_generated.h"
#include "core/Macro.h"
#include "backend/cpu/compute/ConvOpt.h"
#include "core/Concurrency.h"


namespace MNN {
// Single-input depthwise deconvolution: the kernel is stored in the op itself and is
// repacked once, here, into a channel-quad layout (4 channels interleaved per kernel tap).
//
// input  - forwarded to the base class and to the inner CPUDeconvolutionDepthwiseBasic.
// convOp - op whose main table is read as a Convolution2D (kernel size, output count, weights).
// b      - backend used to allocate the STATIC weight buffer.
//
// On failure (buffer not acquired, or weight data missing / shorter than
// outputCount * kh * kw values) mValid is set to false and mOrigin is left unset.
CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const Op* convOp, Backend* b)
    : MNN::CPUDeconvolutionCommon(input, convOp, b) {
    auto conv             = convOp->main_as_Convolution2D();
    auto layer            = conv->common();
    const int kw          = layer->kernelX();
    const int kh          = layer->kernelY();
    const int outputCount = layer->outputCount();
    const int depthQuad   = UP_DIV(outputCount, 4);
    const int kernelArea  = kw * kh;
    const int planeStride = kernelArea * 4;

    const float* tempWeight = nullptr;
    int tempWeightSize      = 0;
    // quanCommon must outlive the repack loop: tempWeight may point into storage it owns.
    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
    ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize);

    // Reorder weight from whc -> pwhc4
    const int kernelSize = depthQuad * planeStride;
    mWeight.reset(Tensor::createDevice<float>(std::vector<int>{kernelSize}));
    const bool success = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC);
    if (!success) {
        mValid = false;
        return;
    }
    // The loop below reads outputCount * kh * kw source values; refuse to run it when the
    // op did not provide that many. Checked after the buffer is acquired so the release in
    // the destructor stays matched with an acquire.
    if (nullptr == tempWeight || tempWeightSize < outputCount * kernelArea) {
        mValid = false;
        return;
    }
    // Zero first: lanes of the last quad beyond outputCount are never written by the loop.
    ::memset(mWeight->host<float>(), 0, mWeight->size());
    float* weight = mWeight->host<float>();
    for (int c = 0; c < outputCount; ++c) {
        // Channel c lives in quad c / 4, lane c % 4; consecutive taps are 4 floats apart.
        float* dstChannel       = weight + (c / 4) * planeStride + (c % 4);
        const float* srcChannel = tempWeight + c * kernelArea;
        for (int tap = 0; tap < kernelArea; ++tap) {
            dstChannel[tap * 4] = srcChannel[tap];
        }
    }
    mOrigin.reset(new CPUDeconvolutionDepthwiseBasic(input, convOp, b));
}

// Hands the STATIC weight buffer requested in the constructor back to the backend.
CPUDeconvolutionDepthwise::~CPUDeconvolutionDepthwise() {
    auto* packedWeight = mWeight.get();
    backend()->onReleaseBuffer(packedWeight, Backend::STATIC);
}

// Resize for the variant whose kernel (and optional bias) arrive as runtime inputs.
//
// Allocates DYNAMIC scratch tensors for the repacked kernel and the bias, then lets
// CPUDeconvolutionDepthwiseBasic::onResize build its kernel against
// {inputs[0], mWeight, mBias}. Both scratch buffers are released again before returning.
// NOTE(review): acquire-then-release inside onResize presumably follows the backend's
// resize-time memory planning convention for DYNAMIC storage - confirm against Backend docs.
//
// Returns OUT_OF_MEMORY when either scratch buffer cannot be acquired, otherwise the
// code produced by the Basic resize.
ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    const int kw          = mCommon->kernelX();
    const int kh          = mCommon->kernelY();
    const int channelQuad = UP_DIV(inputs[0]->channel(), 4);
    mWeight.reset(Tensor::createDevice<float>({channelQuad, kh, kw, 4}));
    mBias.reset(Tensor::createDevice<float>({channelQuad, 4}));

    // Previously the acquire results were ignored, so a failed allocation surfaced only
    // later when onExecute wrote through the tensors' host pointers.
    if (!backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC)) {
        return OUT_OF_MEMORY;
    }
    if (!backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC)) {
        backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);
        return OUT_OF_MEMORY;
    }

    mInputs   = {inputs[0], mWeight.get(), mBias.get()};
    auto code = CPUDeconvolutionDepthwiseBasic::onResize(mInputs, outputs);
    backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC);
    return code;
}

// Execute for the variant whose kernel (inputs[1]) and optional bias (inputs[2]) are
// runtime inputs: refreshes mBias and the repacked mWeight from the current input
// contents, then runs the Basic kernel on the tensor list prepared in onResize.
ErrorCode CPUDeconvolutionDepthwiseMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                         const std::vector<Tensor*>& outputs) {
    // Bias defaults to zero; a third input, when present, overrides it.
    ::memset(mBias->host<float>(), 0, mBias->size());
    if (inputs.size() > 2) {
        // Never copy more than the scratch bias holds, whatever size the input reports.
        const auto biasBytes = ALIMIN(inputs[2]->size(), mBias->size());
        ::memcpy(mBias->host<float>(), inputs[2]->host<float>(), biasBytes);
    }

    // Zero first: lanes of the last quad beyond the channel count are never written below.
    ::memset(mWeight->host<float>(), 0, mWeight->size());
    float* weight           = mWeight->host<float>();
    const float* tempWeight = inputs[1]->host<float>();
    const int outputCount   = inputs[0]->channel();
    // mWeight was shaped {quad, kh, kw, 4} in onResize.
    const int kh          = mWeight->length(1);
    const int kw          = mWeight->length(2);
    const int kernelArea  = kw * kh;
    const int planeStride = kernelArea * 4;
    for (int c = 0; c < outputCount; ++c) {
        // Channel c lives in quad c / 4, lane c % 4; consecutive taps are 4 floats apart.
        float* dstChannel       = weight + (c / 4) * planeStride + (c % 4);
        const float* srcChannel = tempWeight + c * kernelArea;
        for (int tap = 0; tap < kernelArea; ++tap) {
            dstChannel[tap * 4] = srcChannel[tap];
        }
    }
    return CPUDeconvolutionDepthwiseBasic::onExecute(mInputs, outputs);
}

// Builds mFunction, the per-thread kernel that onExecute dispatches.
//
// inputs  - {data, weight, bias}; all three entries are read, so the list must hold
//           at least three tensors. weight is indexed as quads of kh * kw * 4 floats,
//           bias as 4 floats per quad.
// outputs - outputs[0] is the tensor the kernel writes.
//
// Naming: the kernel is written as the reverse of a convolution, so the "src_*"
// variables describe outputs[0] (zeroed, then accumulated into) and the "dst_*"
// variables describe inputs[0] (read only).
//
// The read tensor is split into a middle rectangle [l, r) x [t, b), where every kernel
// window lies fully inside the written tensor and whole rows can be handed to
// MNNDeconvRunForLineDepthwise, and four border strips handled point by point by
// RUN_BASIC with the kernel range clipped to [sfx, efx) x [sfy, efy).
// RUN_BASIC(L, T, R, B) visits dx in [L, R) and dy in [T, B); it relies on dst_z, src_z
// and weight_dz being in scope at the expansion site.
ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector<Tensor*>& inputs,
                                                   const std::vector<Tensor*>& outputs) {
    CPUDeconvolutionBasic::onResize(inputs, outputs);
    auto layer         = mCommon;
    auto inputTensor   = outputs[0];
    auto outputTensor  = inputs[0];
    int src_width      = inputTensor->width();
    int src_height     = inputTensor->height();
    int dst_width      = outputTensor->width();
    int dst_height     = outputTensor->height();
    int dst_depth_quad = UP_DIV(layer->outputCount(), 4);
    int dst_z_step     = dst_width * dst_height * 4;
    int src_z_step     = src_width * src_height * 4;
    int dst_y_step     = dst_width * 4;
    int src_y_step     = src_width * 4;
    int strideY        = layer->strideY();
    int strideX        = layer->strideX();
    int dilateX        = layer->dilateX();
    int dilateY        = layer->dilateY();
    int dilateY_step   = dilateY * src_width * 4;
    int dilateX_step   = dilateX * 4;
    int kernel_height  = layer->kernelY();
    int kernel_width   = layer->kernelX();
    int padX           = mPadX;
    int padY           = mPadY;
    int weight_z_step  = kernel_height * kernel_width * 4;
    // Compute Mid Rect
    // l and t are clamped to the read tensor's extent: without the clamp, padding larger
    // than dst_width * strideX (or dst_height * strideY) pushes them past the edge and the
    // border passes below would index outside the read tensor.
    int l = 0, t = 0, r = dst_width, b = dst_height;
    for (; l * strideX - padX < 0 && l < dst_width; l++) {
        // do nothing
    }
    for (; t * strideY - padY < 0 && t < dst_height; t++) {
        // do nothing
    }
    for (; (r - 1) * strideX - padX + kernel_width * dilateX > src_width && r > l; r--) {
        // do nothing
    }
    for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) {
        // do nothing
    }

    auto postFunction = getPostFunction();
#define RUN_BASIC(L, T, R, B)                                                                              \
    for (int dy = T; dy < B; ++dy) {                                                                       \
        const float* dst_y = dst_z + dy * dst_y_step;                                                      \
        int srcStartY      = dy * strideY - padY;                                                          \
        float* src_dy      = src_z + srcStartY * src_y_step;                                               \
        int sfy            = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));                                     \
        int efy            = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));               \
        for (int dx = L; dx < R; ++dx) {                                                                   \
            const float* dst_x = dst_y + 4 * dx;                                                           \
            int srcStartX      = dx * strideX - padX;                                                      \
            float* src_dx      = src_dy + srcStartX * 4;                                                   \
            int sfx            = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));                                 \
            int efx            = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX));             \
            MNNDeconvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4,  \
                                         weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, \
                                         4 * kernel_width, dilateX_step, dilateY_step);                    \
        }                                                                                                  \
    }
    auto weight = inputs[1];
    auto bias   = inputs[2];
    int batch = inputs[0]->batch();
    int totalSize = batch * dst_depth_quad;
    int numberThread = static_cast<CPUBackend*>(backend())->threadNumber();

    // Everything is captured by value; weight and bias are captured as tensor pointers,
    // so their host data is looked up each time the kernel runs.
    mFunction = [=](const float* dstOrigin, float* srcOrigin, int tId) {
        // Thread tId handles (batch, quad) planes tId, tId + numberThread, ...
        for (int dz = tId; dz < totalSize; dz+=numberThread) {
            // Planes are laid out batch-major, so the quad index selects weight and bias.
            auto zPos = dz % dst_depth_quad;
            const float* dst_z     = dstOrigin + dst_z_step * dz;
            float* src_z           = srcOrigin + src_z_step * dz;
            const float* weight_dz = weight->host<float>() + zPos * weight_z_step;
            // The written plane is cleared before any contribution is added to it.
            ::memset(src_z, 0, 4 * src_width * src_height * sizeof(float));

            // Top and bottom strips (full width).
            RUN_BASIC(0, 0, dst_width, t);
            RUN_BASIC(0, b, dst_width, dst_height);

            // Left and right strips (rows of the middle band only).
            RUN_BASIC(0, t, l, b);
            RUN_BASIC(r, t, dst_width, b);

            // Middle rectangle: unclipped, one call per row.
            if (r > l) {
                for (int dy = t; dy < b; ++dy) {
                    const float* dst_y = dst_z + dy * dst_y_step;
                    int srcStartY      = dy * strideY - padY;
                    float* src_dy      = src_z + srcStartY * src_y_step;
                    MNNDeconvRunForLineDepthwise(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,
                                                 strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step);
                }
            }
            // NOTE(review): postFunction presumably applies bias and activation in place - confirm.
            postFunction(src_z, bias->host<float>() + zPos * 4, src_width * src_height, 1);
        }
    };
#undef RUN_BASIC

    return NO_ERROR;
}

// Runs the kernel prepared by onResize, once per worker thread.
// The kernel's "source" is the tensor being written (outputs[0]) and its
// "destination" is the tensor being read (inputs[0]); both are passed from
// their base host address.
ErrorCode CPUDeconvolutionDepthwiseBasic::onExecute(const std::vector<Tensor*>& inputs,
                                                    const std::vector<Tensor*>& outputs) {
    float* writeBase      = outputs[0]->host<float>();
    const float* readBase = inputs[0]->host<float>();
    const int workerCount = static_cast<CPUBackend*>(backend())->threadNumber();
    MNN_CONCURRENCY_BEGIN(worker, workerCount) {
        mFunction(readBase, writeBase, worker);
    };
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}

// Factory for depthwise deconvolution executions on the CPU backend.
// With a single input the kernel is taken from the op (CPUDeconvolutionDepthwise);
// with more than one input it is taken from the extra inputs at run time
// (CPUDeconvolutionDepthwiseMultiInput).
class CPUDeconvolutionDepthwiseCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        const bool kernelInOp = inputs.size() <= 1;
        if (kernelInOp) {
            return new CPUDeconvolutionDepthwise(inputs[0], op, backend);
        }
        return new CPUDeconvolutionDepthwiseMultiInput(inputs[0], op, backend);
    }
};

REGISTER_CPU_OP_CREATOR(CPUDeconvolutionDepthwiseCreator, OpType_DeconvolutionDepthwise);

} // namespace MNN
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// CPUDeconvolutionDepthwise.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2018/07/23.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

Update 2019-12-27 22:16:57 +08:00			`#include "backend/cpu/CPUDeconvolutionDepthwise.hpp"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`#include <string.h>`
Update 2019-12-27 22:16:57 +08:00			`#include "backend/cpu/CPUBackend.hpp"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`#include "MNN_generated.h"`
Update 2019-12-27 22:16:57 +08:00			`#include "core/Macro.h"`
			`#include "backend/cpu/compute/ConvOpt.h"`
Update 2020-02-26 09:57:17 +08:00			`#include "core/Concurrency.h"`

beta 0.1.0 2019-04-17 10:49:11 +08:00
			`namespace MNN {`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const Op* convOp, Backend* b)`
			`: MNN::CPUDeconvolutionCommon(input, convOp, b) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`auto conv = convOp->main_as_Convolution2D();`
			`auto layer = convOp->main_as_Convolution2D()->common();`
			`int kw = layer->kernelX();`
			`int kh = layer->kernelY();`
			`int outputCount = layer->outputCount();`
			`int depthQuad = UP_DIV(outputCount, 4);`
			`int planeStride = kw * kh * 4;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
			`const float* tempWeight = nullptr;`
			`int tempWeightSize = 0;`
			`std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;`
			`ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize);`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`// Reorder weight from whc -> pwhc4`
			`int kernelSize = depthQuad * 4 * kw * kh;`
			`mWeight.reset(Tensor::createDevice<float>(std::vector<int>{kernelSize}));`
			`auto sucess = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC);`
			`if (!sucess) {`
			`mValid = false;`
			`return;`
			`}`
			`::memset(mWeight->host<float>(), 0, mWeight->size());`
			`auto weight = mWeight->host<float>();`
			`int cur = 0;`
			`for (int c = 0; c < outputCount; ++c) {`
			`int plane = c / 4;`
			`int offset = c % 4;`
			`for (int y = 0; y < kh; ++y) {`
			`for (int x = 0; x < kw; ++x) {`
			`float* dst = weight + offset + (x + y * kw) * 4 + planeStride * plane;`
			`*dst = tempWeight[cur++];`
			`}`
			`}`
			`}`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`mOrigin.reset(new CPUDeconvolutionDepthwiseBasic(input, convOp, b));`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00
beta 0.1.0 2019-04-17 10:49:11 +08:00			`CPUDeconvolutionDepthwise::~CPUDeconvolutionDepthwise() {`
			`backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC);`
			`}`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00
			`ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector<Tensor*>& inputs,`
			`const std::vector<Tensor*>& outputs) {`
			`auto kw = mCommon->kernelX();`
			`auto kh = mCommon->kernelY();`
			`mWeight.reset(Tensor::createDevice<float>({UP_DIV(inputs[0]->channel(), 4), kh, kw, 4}));`
			`mBias.reset(Tensor::createDevice<float>({UP_DIV(inputs[0]->channel(), 4), 4}));`
			`backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC);`
			`backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC);`
			`mInputs = {inputs[0], mWeight.get(), mBias.get()};`
			`auto code = CPUDeconvolutionDepthwiseBasic::onResize(mInputs, outputs);`
			`backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);`
			`backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC);`
			`return code;`
			`}`

			`ErrorCode CPUDeconvolutionDepthwiseMultiInput::onExecute(const std::vector<Tensor*>& inputs,`
			`const std::vector<Tensor*>& outputs) {`
			`::memset(mBias->host<float>(), 0, mBias->size());`
Update 2019-12-27 22:16:57 +08:00			`if (inputs.size() > 2) {`
			`::memcpy(mBias->host<float>(), inputs[2]->host<float>(), inputs[2]->size());`
			`}`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`::memset(mWeight->host<float>(), 0, mWeight->size());`
			`auto weight = mWeight->host<float>();`
			`auto outputCount = inputs[0]->channel();`
			`auto kh = mWeight->length(1);`
			`auto kw = mWeight->length(2);`
			`auto tempWeight = inputs[1]->host<float>();`
			`auto planeStride = kw * kh * 4;`
			`int cur = 0;`
			`for (int c = 0; c < outputCount; ++c) {`
			`int plane = c / 4;`
			`int offset = c % 4;`
			`for (int y = 0; y < kh; ++y) {`
			`for (int x = 0; x < kw; ++x) {`
			`float* dst = weight + offset + (x + y * kw) * 4 + planeStride * plane;`
			`*dst = tempWeight[cur++];`
			`}`
			`}`
			`}`
			`return CPUDeconvolutionDepthwiseBasic::onExecute(mInputs, outputs);`
			`}`

			`ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector<Tensor*>& inputs,`
			`const std::vector<Tensor*>& outputs) {`
			`CPUDeconvolutionBasic::onResize(inputs, outputs);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`auto layer = mCommon;`
			`auto inputTensor = outputs[0];`
			`auto outputTensor = inputs[0];`
			`int src_width = inputTensor->width();`
			`int src_height = inputTensor->height();`
			`int dst_width = outputTensor->width();`
			`int dst_height = outputTensor->height();`
			`int dst_depth_quad = UP_DIV(layer->outputCount(), 4);`
			`int dst_z_step = dst_width * dst_height * 4;`
			`int src_z_step = src_width * src_height * 4;`
			`int dst_y_step = dst_width * 4;`
			`int src_y_step = src_width * 4;`
			`int strideY = layer->strideY();`
			`int strideX = layer->strideX();`
			`int dilateX = layer->dilateX();`
			`int dilateY = layer->dilateY();`
			`int dilateY_step = dilateY * src_width * 4;`
			`int dilateX_step = dilateX * 4;`
			`int kernel_height = layer->kernelY();`
			`int kernel_width = layer->kernelX();`
			`int padX = mPadX;`
			`int padY = mPadY;`
			`int weight_z_step = kernel_height * kernel_width * 4;`
			`// Compute Mid Rect`
			`int l = 0, t = 0, r = dst_width, b = dst_height;`
			`for (; l * strideX - padX < 0; l++) {`
			`// do nothing`
			`}`
			`for (; t * strideY - padY < 0; t++) {`
			`// do nothing`
			`}`
			`for (; (r - 1) * strideX - padX + kernel_width * dilateX > src_width && r > l; r--) {`
			`// do nothing`
			`}`
			`for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) {`
			`// do nothing`
			`}`

			`auto postFunction = getPostFunction();`
			`#define RUN_BASIC(L, T, R, B) \`
			`for (int dy = T; dy < B; ++dy) { \`
			`const float* dst_y = dst_z + dy * dst_y_step; \`
			`int srcStartY = dy * strideY - padY; \`
			`float* src_dy = src_z + srcStartY * src_y_step; \`
			`int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); \`
			`int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); \`
			`for (int dx = L; dx < R; ++dx) { \`
			`const float* dst_x = dst_y + 4 * dx; \`
			`int srcStartX = dx * strideX - padX; \`
			`float* src_dx = src_dy + srcStartX * 4; \`
			`int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); \`
			`int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); \`
			`MNNDeconvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, \`
			`weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, \`
			`4 * kernel_width, dilateX_step, dilateY_step); \`
			`} \`
			`}`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`auto weight = inputs[1];`
			`auto bias = inputs[2];`
Update 2020-02-26 09:57:17 +08:00			`int batch = inputs[0]->batch();`
			`int totalSize = batch * dst_depth_quad;`
			`int numberThread = ((CPUBackend*)backend())->threadNumber();`
beta 0.1.0 2019-04-17 10:49:11 +08:00
Update 2020-02-26 09:57:17 +08:00			`mFunction = [=](const float* dstOrigin, float* srcOrigin, int tId) {`
			`for (int dz = tId; dz < totalSize; dz+=numberThread) {`
			`auto zPos = dz % dst_depth_quad;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`const float* dst_z = dstOrigin + dst_z_step * dz;`
			`float* src_z = srcOrigin + src_z_step * dz;`
Update 2020-02-26 09:57:17 +08:00			`const float* weight_dz = weight->host<float>() + zPos * weight_z_step;`
			`::memset(src_z, 0, 4 * src_width * src_height * sizeof(float));`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`RUN_BASIC(0, 0, dst_width, t);`
			`RUN_BASIC(0, b, dst_width, dst_height);`

			`RUN_BASIC(0, t, l, b);`
			`RUN_BASIC(r, t, dst_width, b);`

			`if (r > l) {`
			`for (int dy = t; dy < b; ++dy) {`
			`const float* dst_y = dst_z + dy * dst_y_step;`
			`int srcStartY = dy * strideY - padY;`
			`float* src_dy = src_z + srcStartY * src_y_step;`
			`MNNDeconvRunForLineDepthwise(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l,`
			`strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step);`
			`}`
			`}`
Update 2020-02-26 09:57:17 +08:00			`postFunction(src_z, bias->host<float>() + zPos * 4, src_width * src_height, 1);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
			`};`
			`#undef RUN_BASIC`

			`return NO_ERROR;`
			`}`

beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`ErrorCode CPUDeconvolutionDepthwiseBasic::onExecute(const std::vector<Tensor*>& inputs,`
			`const std::vector<Tensor*>& outputs) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`// Revert input and output, do deconvolution`
			`auto inputTensor = outputs[0];`
			`auto outputTensor = inputs[0];`
Update 2020-02-26 09:57:17 +08:00			`int numberThread = ((CPUBackend*)backend())->threadNumber();`
			`float* srcOrigin = inputTensor->host<float>() + 0 * inputTensor->stride(0);`
			`const float* dstOrigin = outputTensor->host<float>() + 0 * outputTensor->stride(0);`
			`MNN_CONCURRENCY_BEGIN(tId, numberThread) {`
			`mFunction(dstOrigin, srcOrigin, tId);`
			`};`
			`MNN_CONCURRENCY_END();`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return NO_ERROR;`
			`}`

			`class CPUDeconvolutionDepthwiseCreator : public CPUBackend::Creator {`
			`public:`
			`virtual Execution* onCreate(const std::vector<Tensor>& inputs, const std::vector<Tensor>& outputs,`
			`const MNN::Op* op, Backend* backend) const {`
Update 2019-12-27 22:16:57 +08:00			`if (1 < inputs.size()) {`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`return new CPUDeconvolutionDepthwiseMultiInput(inputs[0], op, backend);`
			`}`
			`return new CPUDeconvolutionDepthwise(inputs[0], op, backend);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
			`};`

			`REGISTER_CPU_OP_CREATOR(CPUDeconvolutionDepthwiseCreator, OpType_DeconvolutionDepthwise);`

			`} // namespace MNN`