MNN/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp

//
//  ConvolutionTiledExecutor.cpp
//  MNN
//
//  Created by MNN on 2018/07/16.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "ConvolutionTiledExecutor.hpp"
#include <MNN/AutoTime.hpp>
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "ConvOpt.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "math/Vec.hpp"
#include "core/BufferAllocator.hpp"
#include "common/MemoryFormater.h"

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

/**
 * Reorder convolution weights into `cache` and, if needed, convert them to
 * the backend's low-precision format.
 *
 * For every output channel a block of `depth * kernelSize` 32-bit values is
 * transposed from `source` into `cache` via MNNTranspose32Bit (the original
 * author describes this as swapping the kernel and input-channel axes).
 * When `function->bytes < 4` the whole cache is then converted in place with
 * `function->MNNFp32ToLowp`.
 *
 * @param source      weights to read; outputCount * depth * kernelSize floats are accessed.
 * @param cache       destination buffer of the same element count.
 * @param depth       presumably the input channel count -- confirm with callers.
 * @param outputCount number of per-output-channel blocks to process.
 * @param kernelSize  presumably kernelX * kernelY -- confirm with callers.
 * @param function    core function table supplying `bytes` and `MNNFp32ToLowp`.
 */
void ConvolutionTiledExecutor::initWeight(const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function) {
    // Number of 32-bit elements belonging to one output channel.
    const int blockSize = depth * kernelSize;

    // Dimension descriptor handed to MNNTranspose32Bit for each block.
    int transposeDims[4];
    transposeDims[0] = depth;
    transposeDims[1] = kernelSize;
    transposeDims[2] = kernelSize;
    transposeDims[3] = depth;

    for (int oc = 0; oc < outputCount; ++oc) {
        float* dstBlock       = cache + oc * blockSize;
        const float* srcBlock = source + oc * blockSize;
        MNNTranspose32Bit(reinterpret_cast<int32_t*>(dstBlock), reinterpret_cast<const int32_t*>(srcBlock), transposeDims);
    }

    if (function->bytes >= 4) {
        // Full-precision backend: weights stay as fp32.
        return;
    }
    // Low precision: convert in place, source and destination are both `cache`.
    function->MNNFp32ToLowp(cache, reinterpret_cast<int16_t*>(cache), outputCount * blockSize);
}

/**
 * Build an executor that owns a freshly allocated convolution resource.
 *
 * The resource is bound to backend `b` and the bias is copied into it via
 * `copyBiasAlign`; the result of that call is stored in `mValid`.
 *
 * @param b        backend the execution and its resource belong to.
 * @param bias     bias values forwarded to `copyBiasAlign`.
 * @param biasSize element count forwarded to `copyBiasAlign`.
 */
ConvolutionTiledExecutor::ConvolutionTiledExecutor(Backend* b, const float* bias, size_t biasSize)
    : MNN::Execution(b) {
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    // Nothing else is set up after this point, so a failed bias copy simply
    // leaves the executor flagged as invalid.
    mValid = mResource->copyBiasAlign(bias, biasSize);
}

/**
 * Build an executor that shares an already prepared resource (used by onClone).
 *
 * @param res resource shared with the originating executor.
 * @param b   backend the new execution runs on.
 */
ConvolutionTiledExecutor::ConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, Backend* b)
    // The base class is always initialized before members; list it first so
    // the written order matches the actual order.
    : Execution(b), mResource(res) {
}

// No manual cleanup is performed here; members release themselves.
ConvolutionTiledExecutor::~ConvolutionTiledExecutor() = default;
/**
 * Create a copy of this executor for backend `bn` that shares `mResource`.
 *
 * @param bn  backend for the cloned execution.
 * @param op  not used by this implementation.
 * @param dst receives the newly allocated executor; may be nullptr, in which
 *            case nothing is allocated and the call only reports whether
 *            cloning is possible.
 * @return false when this executor is invalid, true otherwise.
 */
bool ConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr != dst) {
        // The clone shares the resource instead of copying it.
        *dst = new ConvolutionTiledExecutor(mResource, bn);
    }
    return true;
}

// Resize hook of the tiled implementation: does no work here and always
// reports success. Neither `inputs` nor `outputs` is read.
ErrorCode ConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs,
                                         const std::vector<Tensor*>& outputs) {
    return NO_ERROR;
}

// Run the prepared work function stored in `mFunction`.
// `mFunction.first` is handed to MNN_CONCURRENCY_BEGIN as the task count and
// `mFunction.second` is invoked with the task id `tId` cast to int.
// NOTE(review): presumably the macro runs the body once per tId in
// [0, mFunction.first), possibly on several threads -- confirm in core/Concurrency.h.
// `inputs` and `outputs` are not read here. Always returns NO_ERROR.
ErrorCode ConvolutionTiledImpl::onExecute(const std::vector<Tensor*>& inputs,
                                          const std::vector<Tensor*>& outputs) {

    MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
        mFunction.second((int)tId);
    }
    MNN_CONCURRENCY_END();

    return NO_ERROR;
}

/**
 * Compute the size of the scratch area that holds blit descriptors.
 *
 * One descriptor entry consists of four int32_t values plus one float pointer.
 * Each thread is given room for kernelSize * (UP_DIV(eP, ow) + 1) entries.
 *
 * @param eP           tile size along the output-position axis.
 * @param ow           output width; must be non-zero because it is a divisor.
 * @param kernelSize   number of kernel taps per output line.
 * @param threadNumber number of threads that need their own region.
 * @return {total bytes for all threads, {bytes per thread, entries per thread}}.
 */
std::pair<size_t, std::pair<size_t, size_t>> ConvolutionTiledExecutor::computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber) {
    // A tile of eP positions can straddle one more output line than UP_DIV(eP, ow).
    const int lineCount  = UP_DIV(eP, ow) + 1;
    const int entryCount = kernelSize * lineCount;

    const size_t bytesPerEntry  = 4 * sizeof(int32_t) + sizeof(float *);
    const size_t bytesPerThread = static_cast<size_t>(entryCount) * bytesPerEntry;
    const size_t bytesTotal     = static_cast<size_t>(threadNumber) * bytesPerThread;

    return std::make_pair(bytesTotal, std::make_pair(bytesPerThread, entryCount));
}

/**
 * Fill an Im2ColParameter from the convolution description and tensor shapes.
 *
 * @param dstIm2ColParamter parameter block to fill.
 * @param convCommon        source of kernel, stride and dilation values.
 * @param input             input tensor; channel/height/width/batch/strides are read.
 * @param output            output tensor; height/width are read.
 * @param padX, padY        padding values, stored as given.
 * @param floatCore         supplies the channel pack size (`pack`); must not be null.
 * @param int8Core          optional; when non-null, `kernelCountUnit` is computed and
 *                          `ic` is rounded up according to the int8 GEMM source unit.
 *                          When null, `kernelCountUnit` is left untouched.
 */
void ConvolutionTiledExecutor::setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core) {
    auto& param = dstIm2ColParamter;

    // FIXME: Set int8 and float's pack as diff
    const int pack         = floatCore->pack;
    const int inputChannel = input->channel();
    const auto kernelW     = convCommon->kernelX();
    const auto kernelH     = convCommon->kernelY();
    const auto kernelCount = kernelW * kernelH;

    // Kernel geometry.
    param.kernelX = kernelW;
    param.kernelY = kernelH;
    param.strideX = convCommon->strideX();
    param.strideY = convCommon->strideY();
    param.dilateX = convCommon->dilateX();
    param.dilateY = convCommon->dilateY();
    param.padX    = padX;
    param.padY    = padY;

    // Tensor geometry.
    param.ih = input->height();
    param.iw = input->width();
    param.oh = output->height();
    param.ow = output->width();

    // Channel packing and source strides.
    param.packCUnit = pack;
    param.ic        = inputChannel;
    param.icDiv4    = UP_DIV(inputChannel, pack);
    param.srcZStep  = input->stride(1) * pack * input->batch();
    param.srcYStep  = input->stride(2) * pack;

    if (nullptr != int8Core) {
        // Compute Int8 Info and align ic
        int UNIT, SRC_UNIT, DynamicDestUnit;
        int8Core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit);
        if (SRC_UNIT <= pack) {
            // Channels are grouped by SRC_UNIT for every kernel tap.
            const auto channelUnits = UP_DIV(inputChannel, SRC_UNIT);
            param.kernelCountUnit   = channelUnits * kernelCount;
            param.ic                = channelUnits * SRC_UNIT;
        } else {
            // Several packs are merged into one SRC_UNIT.
            const auto channelUnits = UP_DIV(inputChannel, pack);
            param.kernelCountUnit   = UP_DIV(channelUnits * kernelCount, SRC_UNIT / pack);
            param.ic                = param.icDiv4 * pack;
        }
    }

    const bool heightOnly = param.iw == 1 && param.ow == 1 && param.oh > 1 && param.kernelX == 1 && param.padX == 0;
    if (!heightOnly) {
        return;
    }

    /* Convolution only work for Height. Swap x, y*/
    param.ow      = param.oh;
    param.oh      = 1;
    param.iw      = param.ih;
    param.ih      = 1;
    param.kernelX = param.kernelY;
    param.kernelY = 1;
    param.padX    = param.padY;
    param.padY    = 0;
    param.strideX = param.strideY;
    param.strideY = 1; /* Don't need stride */
    param.dilateX = param.dilateY;
    param.dilateY = 1;
}
/**
 * Translate an im2col tile into a list of copy ("blit") descriptors.
 *
 * The tile covers the flattened output positions [start, start + xC). A
 * flattened position is split into a row index (position / ow) and a column
 * (position % ow); the row index is further split into batch (row / oh) and
 * output y (row % oh). For every covered row and every kernel tap (ky, kx)
 * whose source coordinates fall inside the input, one descriptor is written:
 *
 *   srcPtr[i]     = address of the first valid source element
 *   el[4 * i + 0] = number of consecutive output positions covered
 *   el[4 * i + 1] = p.ic
 *   el[4 * i + 2] = offset of the first covered position inside the tile
 *   el[4 * i + 3] = (ky * p.kernelX + kx) * p.ic
 *
 * @param srcPtr    output array of source addresses, one per descriptor.
 * @param el        output array, four int32 values per descriptor.
 * @param start     first flattened output position of the tile.
 * @param xC        number of output positions in the tile.
 * @param p         im2col parameters (see setIm2ColParameter).
 * @param srcOrigin base address of the source data.
 * @param bytes     multiplied with p.packCUnit to turn a spatial offset into a
 *                  byte offset -- presumably the size of one element; confirm with callers.
 * @return {number of descriptors written, needZero}. needZero is true when at
 *         least one kernel tap was clipped by the input border.
 *         NOTE(review): presumably the caller zero-fills the destination in
 *         that case -- confirm at the call sites.
 *
 * The caller must provide enough room in srcPtr / el; this function does not
 * bound-check them.
 */
std::pair<int, bool> ConvolutionTiledExecutor::turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& p, const uint8_t* srcOrigin, int bytes) {
    /* Compute Pack position */
    int oyBegin   = start / p.ow;
    int oxBegin   = start % p.ow;
    int oyEnd     = (start + xC - 1) / p.ow;
    int remain    = xC;
    int number    = 0;
    bool needZero = false;
    int eStart    = 0;
    auto unit = p.packCUnit;

    // Walk the output rows touched by the tile; only the first row can start
    // at a non-zero column (oxBegin is reset to 0 at the end of each iteration).
    for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {
        // Positions of this row that belong to the tile.
        int step    = std::min(p.ow - oxBegin, remain);
        int oy      = oyb % p.oh;
        int ob      = oyb / p.oh;
        // Source y of kernel row 0; negative when it lies in the top padding.
        int sySta   = oy * p.strideY - p.padY;
        // Kernel rows [kyStart, kyEnd) are the ones landing inside [0, ih).
        int kyStart = std::max(0, UP_DIV(-sySta, p.dilateY));
        int kyEnd   = std::min(p.kernelY, UP_DIV(p.ih - sySta, p.dilateY));
        if (kyEnd - kyStart < p.kernelY) {
            needZero = true;
        }
        // Address of source row sySta in batch ob; it may point outside the
        // buffer when sySta < 0 and is only used after adding a valid ky offset.
        auto srcStart = srcOrigin + ((ob * p.ih + sySta) * p.iw) * bytes * unit;
        for (int ky = kyStart; ky < kyEnd; ++ky) {
            auto lKYOffset = ky * p.kernelX * p.ic;
            auto srcKy     = srcStart + ky * p.dilateY * p.iw * bytes * unit;
            for (int kx = 0; kx < p.kernelX; ++kx) {
                /* Compute x range:*/
                /* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/
                /* 0 <= x <= step*/
                /* The valid x offsets form the half-open range [sta, end). */
                int end = std::min(
                    step, (p.iw - oxBegin * p.strideX - p.dilateX * kx + p.padX + p.strideX - 1) / p.strideX);
                int sta = std::max(0, UP_DIV((p.padX - oxBegin * p.strideX - p.dilateX * kx), p.strideX));
                if (end - sta < step) {
                    needZero = true;
                }
                // Emit a descriptor only when at least one position is valid.
                if (end > sta) {
                    auto lOffset = lKYOffset + (kx * p.ic);
                    auto srcKx   = srcKy + ((oxBegin + sta) * p.strideX + p.dilateX * kx - p.padX) * bytes * unit;
                    srcPtr[number]     = (const float*)srcKx;
                    el[4 * number + 0] = end - sta;
                    el[4 * number + 1] = p.ic;
                    el[4 * number + 2] = eStart + sta;
                    el[4 * number + 3] = lOffset;
                    number++;
                }
            }
        }
        // Following rows start at column 0; advance the in-tile offset.
        oxBegin = 0;
        remain -= step;
        eStart += step;
    }
    return std::make_pair(number, needZero);
}

} // namespace MNN
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// ConvolutionTiledExecutor.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2018/07/16.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

Github release 1.1.0 2020-11-05 16:41:56 +08:00			`#include "ConvolutionTiledExecutor.hpp"`
Update 2019-12-27 22:16:57 +08:00			`#include <MNN/AutoTime.hpp>`
			`#include "backend/cpu/CPUBackend.hpp"`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`#include "CommonOptFunction.h"`
Update 2019-12-27 22:16:57 +08:00			`#include "core/Concurrency.h"`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`#include "ConvOpt.h"`
Update 2019-12-27 22:16:57 +08:00			`#include "core/Macro.h"`
			`#include "core/TensorUtils.hpp"`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`#include "math/Vec.hpp"`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`#include "core/BufferAllocator.hpp"`
[MNN:Sync] Sync internal Gitlab 2021-09-18 15:52:30 +08:00			`#include "common/MemoryFormater.h"`
beta 0.1.0 2019-04-17 10:49:11 +08:00
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`using Vec4 = MNN::Math::Vec<float, 4>;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`namespace MNN {`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00
			`void ConvolutionTiledExecutor::initWeight(const float source, float cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function) {`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`// Swap k, ic`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`int dims[4] = {`
			`depth,`
			`kernelSize,`
			`kernelSize,`
			`depth`
			`};`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`for (int o=0; o<outputCount; ++o) {`
Reduce memory alloc / release in ConvolutionTiledExecutor 2020-07-07 19:31:31 +08:00			`auto dO = cache + o * depth * kernelSize;`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`auto sO = source + o * depth * kernelSize;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNNTranspose32Bit((int32_t)dO, (const int32_t)sO, &dims[0]);`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`if (function->bytes < 4) {`
			`// Lowp`
			`function->MNNFp32ToLowp((float)cache, (int16_t)cache, outputCount * kernelSize * depth);`
			`}`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`}`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00
			`ConvolutionTiledExecutor::ConvolutionTiledExecutor(Backend* b, const float* bias, size_t biasSize)`
beta 0.2.0.0 - replace FreeImage with stb_image - warn unicode error in Windows compiling - separate clang/gcc build script for android - add default values in fbs - optimize CPU conv / conv depthwise / deconv / deconv depthwise / lstm / sigmoid - add sub support in eltwise - add reciprocal / log1p / log in unary - add zero like / select / set diff 1d - add batch support for permute - add training codes - fix metal error in dynamic separate storage type handling 2019-06-17 20:10:35 +08:00			`: MNN::Execution(b) {`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`mResource.reset(new CPUConvolution::Resource);`
			`mResource->backend = b;`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`mValid = mResource->copyBiasAlign(bias, biasSize);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`if (!mValid) {`
			`return;`
			`}`
			`}`
[PATCH 214/350] [MNN::Refine] Rearrange weights for 1x1 and generic convolution. 2020-12-15 18:14:15 +08:00
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`ConvolutionTiledExecutor::ConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, Backend* b) : mResource(res), Execution(b) {`
[PATCH 214/350] [MNN::Refine] Rearrange weights for 1x1 and generic convolution. 2020-12-15 18:14:15 +08:00			`}`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`ConvolutionTiledExecutor::~ConvolutionTiledExecutor() {`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`// Do nothing`
			`}`
			`bool ConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {`
			`if (!mValid) {`
			`return false;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`if (nullptr == dst) {`
			`return true;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`*dst = new ConvolutionTiledExecutor(mResource, bn);`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`return true;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`ErrorCode ConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs,`
			`const std::vector<Tensor*>& outputs) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return NO_ERROR;`
			`}`

Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`ErrorCode ConvolutionTiledImpl::onExecute(const std::vector<Tensor*>& inputs,`
			`const std::vector<Tensor*>& outputs) {`
[Sync] Sync internal gitlab 2022-05-06 19:51:20 +08:00
[MNN:Sync] Sync internal github 2020-07-23 10:35:12 +08:00			`MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {`
			`mFunction.second((int)tId);`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
[MNN:Sync] Sync internal github 2020-07-23 10:35:12 +08:00			`MNN_CONCURRENCY_END();`
[Sync] Sync internal gitlab 2022-05-06 19:51:20 +08:00
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return NO_ERROR;`
			`}`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00
[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00			`std::pair<size_t, std::pair<size_t, size_t>> ConvolutionTiledExecutor::computeBlitInfoSize(int eP, int ow, int kernelSize, int threadNumber) {`
			`auto maxLine = UP_DIV(eP, ow) + 1;`
			`auto stride = kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *));`
			`auto total = threadNumber * stride;`
			`return std::make_pair(total, std::make_pair(stride, kernelSize * maxLine));`
			`}`

			`void ConvolutionTiledExecutor:: setIm2ColParameter(ConvolutionCommon::Im2ColParameter& dstIm2ColParamter, const Convolution2DCommon* convCommon, Tensor* input, Tensor* output, int padX, int padY, const CoreFunctions* floatCore, const CoreInt8Functions* int8Core) {`
			`// FIXME: Set int8 and float's pack as diff`
			`int pack = floatCore->pack;`
			`const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();`

			`dstIm2ColParamter.dilateX = convCommon->dilateX();`
			`dstIm2ColParamter.dilateY = convCommon->dilateY();`
			`dstIm2ColParamter.strideX = convCommon->strideX();`
			`dstIm2ColParamter.strideY = convCommon->strideY();`
			`dstIm2ColParamter.icDiv4 = UP_DIV(input->channel(), pack);;`
			`dstIm2ColParamter.kernelX = convCommon->kernelX();`
			`dstIm2ColParamter.kernelY = convCommon->kernelY();`
			`dstIm2ColParamter.padX = padX;`
			`dstIm2ColParamter.padY = padY;`

			`dstIm2ColParamter.ih = input->height();`
			`dstIm2ColParamter.iw = input->width();`
			`dstIm2ColParamter.oh = output->height();`
			`dstIm2ColParamter.ow = output->width();`
			`dstIm2ColParamter.srcZStep = input->stride(1) * pack * input->batch();`
			`dstIm2ColParamter.srcYStep = input->stride(2) * pack;`
			`dstIm2ColParamter.packCUnit = pack;`
			`dstIm2ColParamter.ic = input->channel();`
			`if (nullptr != int8Core) {`
			`// Compute Int8 Info and align ic`
			`int UNIT, SRC_UNIT, DynamicDestUnit;`
			`auto core = int8Core;`
			`core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DynamicDestUnit);`
			`if (SRC_UNIT > pack) {`
			`const auto srcCountUnit = UP_DIV(input->channel(), pack);`
			`dstIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / pack);`
			`dstIm2ColParamter.ic = dstIm2ColParamter.icDiv4 * pack;`
			`} else {`
			`const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);`
			`dstIm2ColParamter.kernelCountUnit = srcCountUnit * kernelCount;`
			`dstIm2ColParamter.ic = srcCountUnit * SRC_UNIT;`
			`}`
			`}`
			`if (dstIm2ColParamter.iw == 1 && dstIm2ColParamter.ow == 1 && dstIm2ColParamter.oh > 1 && dstIm2ColParamter.kernelX == 1 && dstIm2ColParamter.padX == 0) {`
			`/* Convolution only work for Height. Swap x, y*/`
			`dstIm2ColParamter.ow = dstIm2ColParamter.oh;`
			`dstIm2ColParamter.oh = 1;`
			`dstIm2ColParamter.padX = dstIm2ColParamter.padY;`
			`dstIm2ColParamter.padY = 0;`
			`dstIm2ColParamter.strideX = dstIm2ColParamter.strideY;`
			`dstIm2ColParamter.strideY = 1; /* Don't need stride */`
			`dstIm2ColParamter.iw = dstIm2ColParamter.ih;`
			`dstIm2ColParamter.ih = 1;`
			`dstIm2ColParamter.dilateX = dstIm2ColParamter.dilateY;`
			`dstIm2ColParamter.dilateY = 1;`
			`dstIm2ColParamter.kernelX = dstIm2ColParamter.kernelY;`
			`dstIm2ColParamter.kernelY = 1;`
			`}`
			`}`
			`std::pair<int, bool> ConvolutionTiledExecutor::turnIm2ColToBlitInfo(float const ** srcPtr, int32_t* el, int start, int xC, const ConvolutionCommon::Im2ColParameter& p, const uint8_t* srcOrigin, int bytes) {`
			`/* Compute Pack position */`
			`int oyBegin = start / p.ow;`
			`int oxBegin = start % p.ow;`
			`int oyEnd = (start + xC - 1) / p.ow;`
			`int remain = xC;`
			`int number = 0;`
			`bool needZero = false;`
			`int eStart = 0;`
			`auto unit = p.packCUnit;`

			`for (int oyb = oyBegin; oyb <= oyEnd; ++oyb) {`
			`int step = std::min(p.ow - oxBegin, remain);`
			`int oy = oyb % p.oh;`
			`int ob = oyb / p.oh;`
			`int sySta = oy * p.strideY - p.padY;`
			`int kyStart = std::max(0, UP_DIV(-sySta, p.dilateY));`
			`int kyEnd = std::min(p.kernelY, UP_DIV(p.ih - sySta, p.dilateY));`
			`if (kyEnd - kyStart < p.kernelY) {`
			`needZero = true;`
			`}`
			`auto srcStart = srcOrigin + ((ob * p.ih + sySta) * p.iw) * bytes * unit;`
			`for (int ky = kyStart; ky < kyEnd; ++ky) {`
			`auto lKYOffset = ky * p.kernelX * p.ic;`
			`auto srcKy = srcStart + ky * p.dilateY * p.iw * bytes * unit;`
			`for (int kx = 0; kx < p.kernelX; ++kx) {`
			`/* Compute x range:*/`
			`/* 0 <= (oxBegin + x) * strideX - padX + dilateX * kx < src_width*/`
			`/* 0 <= x <= step*/`
			`int end = std::min(`
			`step, (p.iw - oxBegin * p.strideX - p.dilateX * kx + p.padX + p.strideX - 1) / p.strideX);`
			`int sta = std::max(0, UP_DIV((p.padX - oxBegin * p.strideX - p.dilateX * kx), p.strideX));`
			`if (end - sta < step) {`
			`needZero = true;`
			`}`
			`if (end > sta) {`
			`auto lOffset = lKYOffset + (kx * p.ic);`
			`auto srcKx = srcKy + ((oxBegin + sta) * p.strideX + p.dilateX * kx - p.padX) * bytes * unit;`
			`srcPtr[number] = (const float*)srcKx;`
			`el[4 * number + 0] = end - sta;`
			`el[4 * number + 1] = p.ic;`
			`el[4 * number + 2] = eStart + sta;`
			`el[4 * number + 3] = lOffset;`
			`number++;`
			`}`
			`}`
			`}`
			`oxBegin = 0;`
			`remain -= step;`
			`eStart += step;`
			`}`
			`return std::make_pair(number, needZero);`
			`}`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`} // namespace MNN`