MNN/source/backend/cuda/execution/DeconvSingleInputExecution.cu

//
//  DeconvSingleInputExecution.cpp
//  MNN
//
//  Created by MNN on 2020/08/22.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "DeconvSingleInputExecution.hpp"

namespace MNN {
namespace CUDA {

__global__ void DeconvInputRerange(const int count,
        const InputReorderParameter* param,
        const float* Inp,
        __half* InpRe
        ) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        int l = param->l_size;
        int h = param->h_size;
        int lIndex = i % l;
        int hIndex = i / l;
        int lU = lIndex / 16;
        int lR = lIndex % 16;
        int hU = hIndex / 16;
        int hR = hIndex % 16;

        int bIndex = hIndex / param->hw_size;
        int hwIndex = hIndex % param->hw_size;

        float value = Inp[bIndex * param->ib_stride + lIndex * param->ic_stride + hwIndex];
        //inpRe[lIndex * param->oc_stride + bIndex * param->ob_stride + hwIndex] = value;

        //__half* dst = InpRe + lU * param->hpack_size * 16 * 16 + hU * 16 * 16 + hR + lR * 16;
        __half* dst = InpRe + hU * param->lpack_size * 16 * 16 + lU * 16 * 16 + lR + hR * 16;
        dst[0] = value;
    }
}

template <typename Dtype>
__global__ void Col2Im(const int n, const Dtype* data_col,
    const int batch, const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    const Dtype* bias, Dtype* data_im) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (n); index += blockDim.x * gridDim.x) {
        Dtype val = 0;
        const int b_im = index / (channels * width * height);
        const int chw  = index % (channels * width * height);
        const int w_im = chw % width + pad_w;
        const int h_im = (chw / width) % height + pad_h;
        const int c_im = chw / (width * height);
        int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
        int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
        // compute the start and end of the output
        const int w_col_start =
            (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
        const int w_col_end = min(w_im / stride_w + 1, width_col);
        const int h_col_start =
            (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
        const int h_col_end = min(h_im / stride_h + 1, height_col);
        // TODO: use LCM of stride and dilation to avoid unnecessary loops
        for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
            for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
                int h_k = (h_im - h_col * stride_h);
                int w_k = (w_im - w_col * stride_w);
                if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
                    h_k /= dilation_h;
                    w_k /= dilation_w;
                    int data_col_index = ((((c_im * kernel_h + h_k) * kernel_w + w_k) * batch + b_im) *
                                            height_col + h_col) * width_col + w_col;
                    val += data_col[data_col_index];
                }
            }
        }

        if(nullptr != bias) {
            val += bias[c_im];
        }
        data_im[index] = val;
    }
}


DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
    mBackend = bn;
    auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
    
    auto conv       = op->main_as_Convolution2D();
    auto common     = conv->common();
    mKernelInfo.kernelX        = common->kernelX();
    mKernelInfo.kernelY        = common->kernelY();
    mKernelInfo.groups         = common->group();
    mKernelInfo.strideX        = common->strideX();
    mKernelInfo.strideY        = common->strideY();
    mKernelInfo.dilateX        = common->dilateX();
    mKernelInfo.dilateY        = common->dilateY();
    mKernelInfo.activationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);

    //weight host->device
    const float* filterDataPtr = nullptr;
    int weightSize = 0;
    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
    ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
    mKernelInfo.kernelN = common->outputCount();
    mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;

    MatMulParam param;
    int e = mKernelInfo.kernelN * mKernelInfo.kernelX * mKernelInfo.kernelY;
    int l = mKernelInfo.kernelC;
    int h = 0;
    param.elh[0] = e;
    param.elh[1] = l;
    param.elh[2] = h;
    param.elhPack[0] = UP_DIV(e, 16);
    param.elhPack[1] = UP_DIV(l, 16);
    param.elhPack[2] = UP_DIV(h, 16);

    param.aStride[0] = 1;
    param.aStride[1] = e;
    param.aStride[2] = 0;

    auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam));
    auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
    float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
    runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
    runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
    
    // Reorder weight
    weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[0] * param.elhPack[1] * (MATMULPACK * MATMULPACK)}));
    bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
    mFilter = (void *)weightTensor.get()->buffer().device;
    GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), cacheWeight, (__half*)mFilter, nullptr, nullptr, 4);
    static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
    static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam);

    // Copy Bias
    int biasSize = conv->bias()->size();
    biasTensor.reset(Tensor::createDevice<float>({biasSize}));
    bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
    mBias = (void *)biasTensor.get()->buffer().device;
    cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
    
}

DeconvSingleInputExecution::Resource::~Resource() {
    // Do nothing
}
DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res) : Execution(backend), mOp(op) {
    mResource = res;
    auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
    auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
    mGpuMatMulParam = staticPool->alloc(sizeof(MatMulParam));
    mGpuCol2ImParam = staticPool->alloc(sizeof(Col2ImParameter));
    mGpuInpReorderParam = staticPool->alloc(sizeof(InputReorderParameter));
}

DeconvSingleInputExecution::~DeconvSingleInputExecution() {
    auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
    staticPool->free(mGpuMatMulParam);
    staticPool->free(mGpuCol2ImParam);
    staticPool->free(mGpuInpReorderParam);
}
bool DeconvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    auto dstExe = new DeconvSingleInputExecution(bn, op, mResource);
    *dst = dstExe;
    return true;
}


ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    auto input = inputs[0], output = outputs[0];
    const int UNIT = 1;
    auto convCommon = mOp->main_as_Convolution2D()->common();

    // Input Rerange Param
    mInpReorderParameter.hw_size = input->height() * input->width();
    mInpReorderParameter.ic_stride = mInpReorderParameter.hw_size;
    mInpReorderParameter.ib_stride = mInpReorderParameter.hw_size * input->channel();
    mInpReorderParameter.oc_stride = mInpReorderParameter.ib_stride;
    mInpReorderParameter.ob_stride = mInpReorderParameter.hw_size;
    mInpReorderParameter.l_size    = input->channel();
    mInpReorderParameter.h_size    = input->batch() * mInpReorderParameter.hw_size;
    mInpReorderParameter.lpack_size = UP_DIV(mInpReorderParameter.l_size, 16);
    mInpReorderParameter.hpack_size = UP_DIV(mInpReorderParameter.h_size, 16);

    runtime->memcpy((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second, &mInpReorderParameter, sizeof(InputReorderParameter), MNNMemcpyHostToDevice);

    // Col2Im Param
    auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common());
    mCol2ImParamter.dilateX         = convCommon->dilateX();
    mCol2ImParamter.dilateY         = convCommon->dilateY();
    mCol2ImParamter.strideX         = convCommon->strideX();
    mCol2ImParamter.strideY         = convCommon->strideY();
    mCol2ImParamter.ic              = input->channel();
    mCol2ImParamter.oc              = output->channel();
    mCol2ImParamter.kernelX         = convCommon->kernelX();
    mCol2ImParamter.kernelY         = convCommon->kernelY();
    mCol2ImParamter.padX = pad.first;
    mCol2ImParamter.padY = pad.second;

    mCol2ImParamter.ih = input->height();
    mCol2ImParamter.iw = input->width();
    mCol2ImParamter.oh = output->height();
    mCol2ImParamter.ow = output->width();
    mCol2ImParamter.ob = output->batch();

    runtime->memcpy((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second, &mCol2ImParamter, sizeof(Col2ImParameter), MNNMemcpyHostToDevice);

    // Matmul Param
    int e = output->channel() * mCol2ImParamter.kernelX * mCol2ImParamter.kernelY;
    int l = input->channel();
    int h = input->height() * input->width() * output->batch();

    mMatMulParam.elh[0] = e;
    mMatMulParam.elh[1] = l;
    mMatMulParam.elh[2] = h;
    mMatMulParam.elhPack[0] = UP_DIV(e, 16);
    mMatMulParam.elhPack[1] = UP_DIV(l, 16);
    mMatMulParam.elhPack[2] = UP_DIV(h, 16);

    mMatMulParam.bStride[0] = 0;
    mMatMulParam.bStride[1] = input->height() * input->width();
    mMatMulParam.bStride[2] = 1;

    mMatMulParam.cStride[0] = h;
    mMatMulParam.cStride[1] = 1;
    mMatMulParam.cStride[2] = 1;
    if (convCommon->relu()) {
        mMatMulParam.minValue = 0.0f;
    }
    if (convCommon->relu6()) {
        mMatMulParam.minValue = 0.0f;
        mMatMulParam.maxValue = 6.0f;
    }
    runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice);

    
    // Alloc temp cuda memory
    auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
    auto buffer1 = pool->alloc(sizeof(float) * mMatMulParam.elh[0] * mMatMulParam.elh[2]);
    auto buffer2 = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[1] * mMatMulParam.elhPack[2] * MATMULPACK * MATMULPACK);

    mIm2ColBuffer = (float*)((uint8_t*)buffer1.first + buffer1.second);
    mInputBuffer = (__half*)((uint8_t*)buffer2.first + buffer2.second);

    pool->free(buffer2);
    pool->free(buffer1);

    return NO_ERROR;
}

ErrorCode DeconvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
    //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);
    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);

    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    const void *input_addr = (const void*)inputs[0]->deviceId();
    const void *filter_addr = mResource->mFilter;
    const void *bias_addr = mResource->mBias;
    void *output_addr = (void*)outputs[0]->deviceId();

    auto gpuInpReorder = (const InputReorderParameter*)((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second);
    auto gpuCol2Im = (const Col2ImParameter*)((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second);
    auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);

    const int rerangeCount = mInpReorderParameter.ib_stride * inputs[0]->batch();
    int inp_block_num = runtime->blocks_num(rerangeCount);
    int inp_thread_num = runtime->threads_num();

    // Do input Rerange
    runtime->memset(mInputBuffer, 0, mMatMulParam.elhPack[2] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK * sizeof(__half));
    DeconvInputRerange<<<inp_block_num, inp_thread_num>>>(rerangeCount, gpuInpReorder, (const float*)input_addr, mInputBuffer);

    // Do Gemm operation 
    GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)mIm2ColBuffer, (const half*)filter_addr, (const half*)mInputBuffer, nullptr, bytes, false, false);

    // Do Col2Im trans
    int height_col = mCol2ImParamter.ih;
    int width_col = mCol2ImParamter.iw;
    int num_kernels = mCol2ImParamter.ob * mCol2ImParamter.oc * mCol2ImParamter.oh * mCol2ImParamter.ow;

    int col2im_block_num = runtime->blocks_num(num_kernels);
    int col2im_thread_num = runtime->threads_num();

    // printf("col2im:%d, %d-%d-%d-%d-%d-%d\n %d-%d-%d-%d-%d-%d\n %d-%d\n", mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, \
    //     mCol2ImParamter.ih, mCol2ImParamter.iw, mCol2ImParamter.ic, \
    //     mCol2ImParamter.padX, mCol2ImParamter.padY, mCol2ImParamter.kernelX, mCol2ImParamter.kernelY, mCol2ImParamter.strideX, mCol2ImParamter.strideY, \
    //     col2im_block_num, col2im_thread_num);
    
    Col2Im<float><<<col2im_block_num, col2im_thread_num>>>(
        num_kernels, (const float*)mIm2ColBuffer, mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, 
        mCol2ImParamter.kernelY, mCol2ImParamter.kernelX, mCol2ImParamter.padY, mCol2ImParamter.padX, 
        mCol2ImParamter.strideY, mCol2ImParamter.strideX, mCol2ImParamter.dilateY, mCol2ImParamter.dilateX,
        height_col, width_col, (const float*)bias_addr, (float *)output_addr);

    return NO_ERROR;
}

class CUDADeconvolutionCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, 
            const MNN::Op* op, Backend* backend) const override {
        if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
            auto quan = op->main_as_Convolution2D()->quanParameter();
            if (1 == quan->type() || 2 == quan->type()) {
                MNN_PRINT("cuda Deconv quant type 1 or 2 not support\n");
                return nullptr;
            }
        }

        if(inputs.size() == 3) {
            MNN_PRINT("Deconv inputs size:3 not support\n");
            return nullptr;
        } else if(inputs.size() == 1) {
            std::shared_ptr<DeconvSingleInputExecution::Resource> resource(new DeconvSingleInputExecution::Resource(backend, op));
            return new DeconvSingleInputExecution(backend, op, resource);
        } else {
            MNN_PRINT("Deconv inputs size:%d not support", (int)inputs.size());
            return nullptr;
        }
    }
};

//CUDACreatorRegister<CUDADeconvolutionCreator> __DeConvExecution(OpType_Deconvolution);

}// namespace CUDA
}// namespace MNN
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`//`
			`// DeconvSingleInputExecution.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2020/08/22.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

			`#include "DeconvSingleInputExecution.hpp"`

			`namespace MNN {`
			`namespace CUDA {`

[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`__global__ void DeconvInputRerange(const int count,`
			`const InputReorderParameter* param,`
			`const float* Inp,`
			`__half* InpRe`
			`) {`
			`for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {`
			`int l = param->l_size;`
			`int h = param->h_size;`
			`int lIndex = i % l;`
			`int hIndex = i / l;`
			`int lU = lIndex / 16;`
			`int lR = lIndex % 16;`
			`int hU = hIndex / 16;`
			`int hR = hIndex % 16;`

			`int bIndex = hIndex / param->hw_size;`
			`int hwIndex = hIndex % param->hw_size;`

			`float value = Inp[bIndex * param->ib_stride + lIndex * param->ic_stride + hwIndex];`
			`//inpRe[lIndex * param->oc_stride + bIndex * param->ob_stride + hwIndex] = value;`

			`//__half* dst = InpRe + lU * param->hpack_size * 16 * 16 + hU * 16 * 16 + hR + lR * 16;`
			`__half* dst = InpRe + hU * param->lpack_size * 16 * 16 + lU * 16 * 16 + lR + hR * 16;`
			`dst[0] = value;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
			`}`

[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`template <typename Dtype>`
			`__global__ void Col2Im(const int n, const Dtype* data_col,`
			`const int batch, const int height, const int width, const int channels,`
			`const int kernel_h, const int kernel_w,`
			`const int pad_h, const int pad_w,`
			`const int stride_h, const int stride_w,`
			`const int dilation_h, const int dilation_w,`
			`const int height_col, const int width_col,`
			`const Dtype* bias, Dtype* data_im) {`
			`for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (n); index += blockDim.x * gridDim.x) {`
			`Dtype val = 0;`
			`const int b_im = index / (channels * width * height);`
			`const int chw = index % (channels * width * height);`
			`const int w_im = chw % width + pad_w;`
			`const int h_im = (chw / width) % height + pad_h;`
			`const int c_im = chw / (width * height);`
			`int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;`
			`int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;`
			`// compute the start and end of the output`
			`const int w_col_start =`
			`(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;`
			`const int w_col_end = min(w_im / stride_w + 1, width_col);`
			`const int h_col_start =`
			`(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;`
			`const int h_col_end = min(h_im / stride_h + 1, height_col);`
			`// TODO: use LCM of stride and dilation to avoid unnecessary loops`
			`for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {`
			`for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {`
			`int h_k = (h_im - h_col * stride_h);`
			`int w_k = (w_im - w_col * stride_w);`
			`if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {`
			`h_k /= dilation_h;`
			`w_k /= dilation_w;`
			`int data_col_index = ((((c_im * kernel_h + h_k) * kernel_w + w_k) * batch + b_im) *`
			`height_col + h_col) * width_col + w_col;`
			`val += data_col[data_col_index];`
			`}`
			`}`
			`}`

			`if(nullptr != bias) {`
			`val += bias[c_im];`
			`}`
			`data_im[index] = val;`
			`}`
			`}`


			`DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {`
			`mBackend = bn;`
			`auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();`

Github release 1.1.0 2020-11-05 16:41:56 +08:00			`auto conv = op->main_as_Convolution2D();`
			`auto common = conv->common();`
			`mKernelInfo.kernelX = common->kernelX();`
			`mKernelInfo.kernelY = common->kernelY();`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`mKernelInfo.groups = common->group();`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`mKernelInfo.strideX = common->strideX();`
			`mKernelInfo.strideY = common->strideY();`
			`mKernelInfo.dilateX = common->dilateX();`
			`mKernelInfo.dilateY = common->dilateY();`
			`mKernelInfo.activationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);`

			`//weight host->device`
			`const float* filterDataPtr = nullptr;`
			`int weightSize = 0;`
			`std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;`
			`ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`mKernelInfo.kernelN = common->outputCount();`
			`mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;`

			`MatMulParam param;`
			`int e = mKernelInfo.kernelN * mKernelInfo.kernelX * mKernelInfo.kernelY;`
			`int l = mKernelInfo.kernelC;`
			`int h = 0;`
			`param.elh[0] = e;`
			`param.elh[1] = l;`
			`param.elh[2] = h;`
			`param.elhPack[0] = UP_DIV(e, 16);`
			`param.elhPack[1] = UP_DIV(l, 16);`
			`param.elhPack[2] = UP_DIV(h, 16);`

			`param.aStride[0] = 1;`
			`param.aStride[1] = e;`
			`param.aStride[2] = 0;`

			`auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam));`
			`auto tempCacheBuffer = static_cast<CUDABackend>(bn)->getStaticBufferPool()->alloc(weightSize sizeof(float));`
			`float* cacheWeight = (float)((uint8_t)tempCacheBuffer.first + tempCacheBuffer.second);`
			`runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);`
			`runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);`

			`// Reorder weight`
			`weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[0] * param.elhPack[1] * (MATMULPACK * MATMULPACK)}));`
			`bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`mFilter = (void *)weightTensor.get()->buffer().device;`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`GemmPrepareRerange(runtime, &param, (const MatMulParam)((uint8_t)gpuParam.first + gpuParam.second), cacheWeight, (__half*)mFilter, nullptr, nullptr, 4);`
			`static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);`
			`static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam);`

			`// Copy Bias`
			`int biasSize = conv->bias()->size();`
			`biasTensor.reset(Tensor::createDevice<float>({biasSize}));`
			`bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);`
			`mBias = (void *)biasTensor.get()->buffer().device;`
			`cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));`

			`}`

			`DeconvSingleInputExecution::Resource::~Resource() {`
			`// Do nothing`
			`}`
			`DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res) : Execution(backend), mOp(op) {`
			`mResource = res;`
			`auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();`
			`auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();`
			`mGpuMatMulParam = staticPool->alloc(sizeof(MatMulParam));`
			`mGpuCol2ImParam = staticPool->alloc(sizeof(Col2ImParameter));`
			`mGpuInpReorderParam = staticPool->alloc(sizeof(InputReorderParameter));`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`

			`DeconvSingleInputExecution::~DeconvSingleInputExecution() {`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();`
			`staticPool->free(mGpuMatMulParam);`
			`staticPool->free(mGpuCol2ImParam);`
			`staticPool->free(mGpuInpReorderParam);`
			`}`
			`bool DeconvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** dst) {`
			`if (!mValid) {`
			`return false;`
			`}`
			`if (nullptr == dst) {`
			`return true;`
			`}`
			`auto dstExe = new DeconvSingleInputExecution(bn, op, mResource);`
			`*dst = dstExe;`
			`return true;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`

[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs) {`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`auto input = inputs[0], output = outputs[0];`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`const int UNIT = 1;`
			`auto convCommon = mOp->main_as_Convolution2D()->common();`

			`// Input Rerange Param`
			`mInpReorderParameter.hw_size = input->height() * input->width();`
			`mInpReorderParameter.ic_stride = mInpReorderParameter.hw_size;`
			`mInpReorderParameter.ib_stride = mInpReorderParameter.hw_size * input->channel();`
			`mInpReorderParameter.oc_stride = mInpReorderParameter.ib_stride;`
			`mInpReorderParameter.ob_stride = mInpReorderParameter.hw_size;`
			`mInpReorderParameter.l_size = input->channel();`
			`mInpReorderParameter.h_size = input->batch() * mInpReorderParameter.hw_size;`
			`mInpReorderParameter.lpack_size = UP_DIV(mInpReorderParameter.l_size, 16);`
			`mInpReorderParameter.hpack_size = UP_DIV(mInpReorderParameter.h_size, 16);`

			`runtime->memcpy((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second, &mInpReorderParameter, sizeof(InputReorderParameter), MNNMemcpyHostToDevice);`

			`// Col2Im Param`
			`auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common());`
			`mCol2ImParamter.dilateX = convCommon->dilateX();`
			`mCol2ImParamter.dilateY = convCommon->dilateY();`
			`mCol2ImParamter.strideX = convCommon->strideX();`
			`mCol2ImParamter.strideY = convCommon->strideY();`
			`mCol2ImParamter.ic = input->channel();`
			`mCol2ImParamter.oc = output->channel();`
			`mCol2ImParamter.kernelX = convCommon->kernelX();`
			`mCol2ImParamter.kernelY = convCommon->kernelY();`
			`mCol2ImParamter.padX = pad.first;`
			`mCol2ImParamter.padY = pad.second;`

			`mCol2ImParamter.ih = input->height();`
			`mCol2ImParamter.iw = input->width();`
			`mCol2ImParamter.oh = output->height();`
			`mCol2ImParamter.ow = output->width();`
			`mCol2ImParamter.ob = output->batch();`

			`runtime->memcpy((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second, &mCol2ImParamter, sizeof(Col2ImParameter), MNNMemcpyHostToDevice);`

			`// Matmul Param`
			`int e = output->channel() * mCol2ImParamter.kernelX * mCol2ImParamter.kernelY;`
			`int l = input->channel();`
			`int h = input->height() * input->width() * output->batch();`

			`mMatMulParam.elh[0] = e;`
			`mMatMulParam.elh[1] = l;`
			`mMatMulParam.elh[2] = h;`
			`mMatMulParam.elhPack[0] = UP_DIV(e, 16);`
			`mMatMulParam.elhPack[1] = UP_DIV(l, 16);`
			`mMatMulParam.elhPack[2] = UP_DIV(h, 16);`

			`mMatMulParam.bStride[0] = 0;`
			`mMatMulParam.bStride[1] = input->height() * input->width();`
			`mMatMulParam.bStride[2] = 1;`

			`mMatMulParam.cStride[0] = h;`
			`mMatMulParam.cStride[1] = 1;`
			`mMatMulParam.cStride[2] = 1;`
			`if (convCommon->relu()) {`
			`mMatMulParam.minValue = 0.0f;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`if (convCommon->relu6()) {`
			`mMatMulParam.minValue = 0.0f;`
			`mMatMulParam.maxValue = 6.0f;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00

[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`// Alloc temp cuda memory`
			`auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();`
			`auto buffer1 = pool->alloc(sizeof(float) * mMatMulParam.elh[0] * mMatMulParam.elh[2]);`
			`auto buffer2 = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[1] * mMatMulParam.elhPack[2] * MATMULPACK * MATMULPACK);`

			`mIm2ColBuffer = (float)((uint8_t)buffer1.first + buffer1.second);`
			`mInputBuffer = (__half)((uint8_t)buffer2.first + buffer2.second);`

			`pool->free(buffer2);`
			`pool->free(buffer1);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
			`return NO_ERROR;`
			`}`

			`ErrorCode DeconvSingleInputExecution::onExecute(const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs) {`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`//MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_ASSERT(inputs.size() == 1);`
			`MNN_ASSERT(outputs.size() == 1);`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
			`auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();`
			`const void input_addr = (const void)inputs[0]->deviceId();`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`const void *filter_addr = mResource->mFilter;`
			`const void *bias_addr = mResource->mBias;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`void output_addr = (void)outputs[0]->deviceId();`

[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`auto gpuInpReorder = (const InputReorderParameter)((uint8_t)mGpuInpReorderParam.first + mGpuInpReorderParam.second);`
			`auto gpuCol2Im = (const Col2ImParameter)((uint8_t)mGpuCol2ImParam.first + mGpuCol2ImParam.second);`
			`auto gpuMatMul = (const MatMulParam)((uint8_t)mGpuMatMulParam.first + mGpuMatMulParam.second);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`const int rerangeCount = mInpReorderParameter.ib_stride * inputs[0]->batch();`
			`int inp_block_num = runtime->blocks_num(rerangeCount);`
			`int inp_thread_num = runtime->threads_num();`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`// Do input Rerange`
			`runtime->memset(mInputBuffer, 0, mMatMulParam.elhPack[2] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK * sizeof(__half));`
			`DeconvInputRerange<<<inp_block_num, inp_thread_num>>>(rerangeCount, gpuInpReorder, (const float*)input_addr, mInputBuffer);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`// Do Gemm operation`
			`GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float)mIm2ColBuffer, (const half)filter_addr, (const half*)mInputBuffer, nullptr, bytes, false, false);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`// Do Col2Im trans`
			`int height_col = mCol2ImParamter.ih;`
			`int width_col = mCol2ImParamter.iw;`
			`int num_kernels = mCol2ImParamter.ob * mCol2ImParamter.oc * mCol2ImParamter.oh * mCol2ImParamter.ow;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`int col2im_block_num = runtime->blocks_num(num_kernels);`
			`int col2im_thread_num = runtime->threads_num();`

			`// printf("col2im:%d, %d-%d-%d-%d-%d-%d\n %d-%d-%d-%d-%d-%d\n %d-%d\n", mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, \`
			`// mCol2ImParamter.ih, mCol2ImParamter.iw, mCol2ImParamter.ic, \`
			`// mCol2ImParamter.padX, mCol2ImParamter.padY, mCol2ImParamter.kernelX, mCol2ImParamter.kernelY, mCol2ImParamter.strideX, mCol2ImParamter.strideY, \`
			`// col2im_block_num, col2im_thread_num);`

			`Col2Im<float><<<col2im_block_num, col2im_thread_num>>>(`
			`num_kernels, (const float*)mIm2ColBuffer, mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc,`
			`mCol2ImParamter.kernelY, mCol2ImParamter.kernelX, mCol2ImParamter.padY, mCol2ImParamter.padX,`
			`mCol2ImParamter.strideY, mCol2ImParamter.strideX, mCol2ImParamter.dilateY, mCol2ImParamter.dilateX,`
			`height_col, width_col, (const float)bias_addr, (float )output_addr);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
			`return NO_ERROR;`
			`}`

			`class CUDADeconvolutionCreator : public CUDABackend::Creator {`
			`public:`
			`virtual Execution* onCreate(const std::vector<Tensor>& inputs, const std::vector<Tensor>& outputs,`
			`const MNN::Op* op, Backend* backend) const override {`
			`if (nullptr != op->main_as_Convolution2D()->quanParameter()) {`
			`auto quan = op->main_as_Convolution2D()->quanParameter();`
			`if (1 == quan->type() \|\| 2 == quan->type()) {`
			`MNN_PRINT("cuda Deconv quant type 1 or 2 not support\n");`
			`return nullptr;`
			`}`
			`}`

			`if(inputs.size() == 3) {`
			`MNN_PRINT("Deconv inputs size:3 not support\n");`
			`return nullptr;`
			`} else if(inputs.size() == 1) {`
[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`std::shared_ptr<DeconvSingleInputExecution::Resource> resource(new DeconvSingleInputExecution::Resource(backend, op));`
			`return new DeconvSingleInputExecution(backend, op, resource);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`} else {`
			`MNN_PRINT("Deconv inputs size:%d not support", (int)inputs.size());`
			`return nullptr;`
			`}`
			`}`
			`};`

[Sync] Sync internal Gitlab 2022-02-18 11:30:27 +08:00			`//CUDACreatorRegister<CUDADeconvolutionCreator> __DeConvExecution(OpType_Deconvolution);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00
			`}// namespace CUDA`
[MNN:Sync] Sync internal gitlab Main Feature: 1. Add OpenCV API and Numpy API Support 2. Protobuf move into MNN 3. Add more op for torchscript convert 4. Add recompute to speed up geometry compute 5. Add ModuleBasic Test 2021-11-30 10:10:53 +08:00			`}// namespace MNN`