MNN/source/backend/cpu/compute/Convolution1x1Strassen.cpp

//
//  Convolution1x1Strassen.cpp
//  MNN
//
//  Created by MNN on 2019/02/12.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "Convolution1x1Strassen.hpp"
#include <string.h>
#include "core/BufferAllocator.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h"
#include "ConvOpt.h"
#include "core/Macro.h"
#include "CommonOptFunction.h"

namespace MNN {
/**
 * Builds the execution from raw fp32 weights and bias.
 *
 * The bias is copied through Resource::copyBiasAlign and the weights are packed with
 * MNNPackForMatMul_B into a STATIC buffer owned by mResource. When the backend works with
 * fewer than 4 bytes per element, the fp32 weights are first converted with MNNFp32ToLowp
 * into a temporary buffer and packed from there.
 *
 * On any failure mValid is set to false and the constructor returns early.
 *
 * @param common           convolution attributes, forwarded to CPUConvolution.
 * @param b                backend used for buffer allocation and compute functions.
 * @param originWeight     fp32 weights, originWeightSize elements.
 * @param originWeightSize number of weight elements.
 * @param bias             bias values, biasSize elements.
 * @param biasSize         number of bias elements; used as the output channel count.
 */
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                                               size_t originWeightSize, const float *bias, size_t biasSize)
    : CPUConvolution(common, b) {
    auto outputCount = (int)biasSize;
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    if (outputCount <= 0) {
        // The input channel count is derived by dividing by outputCount; refuse to divide by zero.
        MNN_ERROR("Convolution1x1Strassen: invalid bias size\n");
        mValid = false;
        return;
    }
    // Input channel count: total weight elements divided by output channels.
    auto srcCount = (int)originWeightSize / outputCount;
    if (!mResource->copyBiasAlign(bias, biasSize)) {
        MNN_ERROR("Not Enough Memory\n");
        mValid = false;
        return;
    }
    auto core = static_cast<CPUBackend*>(b)->functions();
    int ePack, lPack, hPack;
    core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    // Packed weight layout: output channels grouped by hPack, input channels rounded up to lPack.
    mResource->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputCount, hPack), UP_DIV(srcCount, lPack) * lPack, hPack}));
    mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        MNN_ERROR("Not Enough Memory\n");
        return;
    }
    if (core->bytes < 4) {
        // Low precision backend: convert the fp32 weights first, then pack the converted copy.
        AutoRelease<Tensor> tempTensor(Tensor::createDevice<float>({outputCount * srcCount}));
        mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC);
        if (!mValid) {
            MNN_ERROR("Not Enough Memory\n");
            return;
        }
        core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * srcCount);
        core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, srcCount, true);
        b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
    } else {
        core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, srcCount, true);
    }
}
/**
 * Builds an execution on backend `b` that reuses an existing resource object
 * (stored in mResource) instead of creating a new one; used by onClone.
 */
Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource,
                                               const Convolution2DCommon *common, Backend* b)
    : CPUConvolution(common, b) {
    this->mResource = resource;
}

// Nothing to clean up explicitly here; members are destroyed by their own destructors.
Convolution1x1Strassen::~Convolution1x1Strassen() = default;

/**
 * Creates a copy of this execution for backend `bn` that shares mResource.
 *
 * @param bn  backend the clone is created for.
 * @param op  op whose Convolution2D common attributes are given to the clone.
 * @param dst receives the new execution; when null nothing is created and the call
 *            only reports whether cloning is possible.
 * @return false when this execution is not valid, true otherwise.
 */
bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (mValid) {
        if (dst != nullptr) {
            auto convCommon = op->main_as_Convolution2D()->common();
            *dst = new Convolution1x1Strassen(mResource, convCommon, bn);
        }
        return true;
    }
    return false;
}

/**
 * Rebuilds the work units (mUnits) for the current input/output shapes.
 *
 * Steps:
 *  1. If the batch is larger than 1, or padding / stride differ from (0, 0, 1, 1), temporary
 *     input and output buffers are acquired and mPretreatFunction is installed; it repacks
 *     the input into the temporary input buffer at execute time.
 *  2. The matrix multiply is split either along the plane dimension (matrixSizeE) or along
 *     output-channel blocks, and one StrassenMatrixComputor per unit is encoded.
 *
 * The temporary buffers are released again before returning (via __autoFunction), on both
 * the success and the error paths.
 *
 * @param inputs  inputs[0] is the input tensor.
 * @param outputs outputs[0] is the output tensor.
 * @return NO_ERROR on success, OUT_OF_MEMORY when a temporary buffer cannot be acquired,
 *         otherwise the error code reported by CPUConvolution::onResize or by onEncode.
 */
ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    // Propagate a failure of the base-class resize instead of silently ignoring it.
    auto baseCode = CPUConvolution::onResize(inputs, outputs);
    if (NO_ERROR != baseCode) {
        return baseCode;
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int ePack, lPack, hPack;
    core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    int bytes = core->bytes;
    auto CONVOLUTION_TILED_NUMBER = ePack;
    auto input       = inputs[0];
    auto output      = outputs[0];
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto ic = input->channel();
    auto oc = output->channel();
    auto icC4        = UP_DIV(ic, core->pack);
    auto ocC4        = UP_DIV(oc, core->pack);
    auto batch       = input->batch();
    // E dimension of the matrix multiply: all output pixels of all batches.
    auto matrixSizeE = output->height() * output->width() * input->batch();
    auto outputPlane = output->height() * output->width();
    mUnits.clear();
    auto inputPtr  = input->host<uint8_t>();
    auto outputPtr = output->host<uint8_t>();
    mTempOutputBatch.reset();
    mTempInputBatch.reset();
    // Scope guard: its deleter releases the temporary buffers when this function returns.
    std::shared_ptr<char> __autoFunction;
    auto padY     = mPadY;
    auto padX     = mPadX;
    auto strideX  = mCommon->strideX();
    auto strideY  = mCommon->strideY();
    mNeedPretreat = input->batch() > 1 || (!(padX == 0 && padY == 0 && strideY == 1 && strideX == 1));
    auto postParameters = getPostParameters();
    if (mNeedPretreat) {
        mTempInputBatch.reset(Tensor::createDevice<float>(std::vector<int>{icC4, matrixSizeE, core->pack}));
        mTempOutputBatch.reset(Tensor::createDevice<float>(std::vector<int>{ocC4, matrixSizeE, core->pack}));
        if (!backend()->onAcquireBuffer(mTempOutputBatch.get(), Backend::DYNAMIC)) {
            return OUT_OF_MEMORY;
        }
        if (!backend()->onAcquireBuffer(mTempInputBatch.get(), Backend::DYNAMIC)) {
            // Give back the buffer that was already acquired before reporting the failure.
            backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);
            return OUT_OF_MEMORY;
        }
        // From here on the matmul reads from / writes to the temporary buffers.
        inputPtr       = mTempInputBatch->host<uint8_t>();
        outputPtr      = mTempOutputBatch->host<uint8_t>();
        __autoFunction = std::shared_ptr<char>(nullptr, [this](void *ptr) {
            backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);
            backend()->onReleaseBuffer(mTempInputBatch.get(), Backend::DYNAMIC);
        });
        auto ow        = output->width();
        auto oh        = output->height();
        auto iw        = input->width();
        auto ih        = input->height();
        if (padX == 0 && padY == 0 && strideY == 1 && strideX == 1) {
            // Only the batch needs regrouping: for every channel block, copy the plane of each
            // batch so that all batches of one channel block become contiguous in dst.
            mPretreatFunction = [outputPlane, icC4, batch, numberThread, this, core](const uint8_t *srcBatch, uint8_t *dstBatch) {
                MNN_CONCURRENCY_BEGIN(y, icC4) {
                    auto srcY = srcBatch + outputPlane * y * core->pack * core->bytes;
                    auto dstY = dstBatch + y * outputPlane * batch * core->pack * core->bytes;
                    for (int x = 0; x < batch; ++x) {
                        auto srcX = srcY + x * outputPlane * icC4 * core->pack * core->bytes;
                        auto dstX = dstY + x * outputPlane * core->pack * core->bytes;
                        ::memcpy(dstX, srcX, outputPlane * core->pack * core->bytes);
                    }
                }
                MNN_CONCURRENCY_END();
            };
        } else if (strideY == 1 && strideX == 1) {
            // Padding with unit stride: zero the destination, then copy each input row to its
            // position shifted by (padY, padX) inside the output-sized plane.
            mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this, core](const uint8_t *srcOrigin,
                                                                                                    uint8_t *dstOrigin) {
                auto unitBytes = core->bytes * core->pack;
                ::memset(dstOrigin, 0, outputPlane * batch * unitBytes * icC4);
                MNN_CONCURRENCY_BEGIN(z, icC4) {
                    auto srcZ = srcOrigin + z * iw * ih * unitBytes;
                    auto dstZ = dstOrigin + z * ow * oh * batch * unitBytes;
                    for (int b = 0; b < batch; ++b) {
                        auto srcBatch = srcZ + b * iw * ih * icC4 * unitBytes;
                        auto dstBatch = dstZ + b * ow * oh * unitBytes;
                        for (int y = 0; y < ih; ++y) {
                            auto src = srcBatch + iw * y * unitBytes;
                            auto dst = dstBatch + (ow * (y + padY) + padX) * unitBytes;
                            ::memcpy(dst, src, iw * unitBytes);
                        }
                    }
                }
                MNN_CONCURRENCY_END();
            };
        } else {
            // General stride: find the range of output rows / columns whose source coordinate
            // (o * stride - pad) lies inside the input; everything else stays zero.
            int oyStart, oyEnd, oxStart, oxEnd;
            for (oyStart = 0; oyStart * strideY - padY < 0; ++oyStart) {
                // do nothing
            }
            for (oyEnd = oh - 1; oyEnd * strideY - padY >= ih; --oyEnd) {
                // do nothing
            }
            for (oxStart = 0; oxStart * strideX - padX < 0; ++oxStart) {
                // do nothing
            }
            for (oxEnd = ow - 1; oxEnd * strideX - padX >= iw; --oxEnd) {
                // do nothing
            }
            int oyCount       = oyEnd - oyStart + 1;
            int oxCount       = oxEnd - oxStart + 1;
            mPretreatFunction = [outputPlane, padY, padX, strideX, strideY, ow, oh, iw, ih, icC4, oxStart, oyStart,
                                 oxCount, oyCount, batch, this, core](const uint8_t *srcOrigin, uint8_t *dstOrigin) {
                ::memset(dstOrigin, 0, outputPlane * batch * core->bytes * core->pack * icC4);
                // Source coordinate that corresponds to the first valid output row / column.
                int syStart    = oyStart * strideY - padY;
                int sxStart    = oxStart * strideX - padX;
                MNN_CONCURRENCY_BEGIN(z, icC4) {
                    auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * core->bytes * core->pack;
                    auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * core->bytes * core->pack;
                    for (int b = 0; b < batch; ++b) {
                        auto srcBatch = srcZ + b * iw * ih * icC4 * core->bytes * core->pack;
                        auto dstBatch = dstZ + b * ow * oh * core->bytes * core->pack;
                        for (int y = 0; y < oyCount; ++y) {
                            auto dstY = dstBatch + y * ow * core->bytes * core->pack;
                            auto srcY = srcBatch + y * strideY * iw * core->bytes * core->pack;
                            core->MNNCopyC4WithStride((const float*)(srcY), (float*)(dstY), strideX * core->pack, core->pack, oxCount);
                        }
                    }
                }
                MNN_CONCURRENCY_END();
            };
        }
    }
    auto memoryPool = ((CPUBackend *)backend())->getBufferAllocator();
    memoryPool->barrierBegin();
    // Scope guard: barrierEnd() is called on every return path below.
    std::shared_ptr<void> __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); });
    int maxDepth = 5;
    auto icAlign = UP_DIV(ic, lPack) * lPack;
    auto weightTensor = mResource->mWeight.get();
    AutoRelease<Tensor> tempWeight;
    if (icAlign != ic) {
        // ic is not a multiple of lPack: wrap the packed weight memory in a tensor whose
        // declared shape uses the real ic instead of the aligned one.
        tempWeight.reset(Tensor::create<float>(std::vector<int>{oc, ic, hPack}, mResource->mWeight->host<uint8_t>()));
        weightTensor = tempWeight.get();
    }
    if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
        // Divide in plane, in this case the divide equal numberThread
        int divideStep = UP_DIV(matrixSizeE, numberThread);
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int planeStart = i * divideStep;
            int planeEnd   = std::min(planeStart + divideStep, matrixSizeE);
            int planeSize  = planeEnd - planeStart;
            Unit &unit     = mUnits[i];
            if (planeSize <= 0) {
                unit.mValid = false;
                continue;
            }
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));
            // Views on a slice [planeStart, planeEnd) of the plane; stride 0 still spans the full plane.
            // NOTE(review): mTempInput / mTempOutput are scoped to this iteration while their raw
            // pointers stay in unit.mTemp*Vector; confirm nothing reads those vectors after onEncode.
            AutoRelease<Tensor> mTempInput(
                Tensor::create<float>(std::vector<int>{icC4, planeSize, core->pack}, inputPtr + core->pack * planeStart * bytes));
            mTempInput->setStride(0, matrixSizeE * core->pack);
            AutoRelease<Tensor> mTempOutput(
                Tensor::create<float>(std::vector<int>{ocC4, planeSize, core->pack}, outputPtr + core->pack * planeStart * bytes));
            mTempOutput->setStride(0, matrixSizeE * core->pack);
            unit.mTempInputVector  = std::vector<Tensor *>{mTempInput.get(), weightTensor, mResource->mBias.get()};
            unit.mTempOutputVector = std::vector<Tensor *>{mTempOutput.get()};
            memoryPool->beginGroup();
            auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters, ic, oc);
            memoryPool->endGroup();
            if (NO_ERROR != code) {
                return code;
            }
        }
    } else {
        // Divide in ocC4
        auto hDiv = 1;
        if (hPack > core->pack) {
            // Keep every unit aligned to whole hPack groups of the packed weight.
            hDiv = hPack / core->pack;
        }
        auto ocDiv = UP_DIV(ocC4, hDiv);
        // At least one unit, so the division below never divides by zero.
        numberThread   = std::max(1, std::min(numberThread, ocDiv));
        int divideStep = (ocDiv / numberThread) * hDiv;
        mUnits.resize(numberThread);
        for (int i = 0; i < numberThread; ++i) {
            int ocStart = i * divideStep;
            int ocSize  = divideStep;
            if (i == numberThread - 1) {
                // The last unit takes whatever remains.
                ocSize = ocC4 - i * divideStep;
            }
            Unit &unit  = mUnits[i];
            if (ocSize <= 0) {
                unit.mValid = false;
                continue;
            }
            // Translate the channel-block range (units of core->pack) into weight groups (units of hPack).
            auto ocStartWeight = (ocStart * core->pack) / hPack;
            auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight);
            unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));
            // NOTE(review): these tensors are scoped to this iteration while their raw pointers stay
            // in unit.mTemp*Vector; confirm nothing reads those vectors after onEncode.
            AutoRelease<Tensor> mTempInput(Tensor::create<float>(std::vector<int>{icC4, matrixSizeE, core->pack}, inputPtr));
            AutoRelease<Tensor> mTempBias(Tensor::create<float>({ocSize, 1, core->pack}, mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes));
            AutoRelease<Tensor> mTempOutput(
                Tensor::create<float>(std::vector<int>{ocSize, matrixSizeE, core->pack}, outputPtr + core->pack * matrixSizeE * ocStart * bytes));
            AutoRelease<Tensor> mTempWeight(Tensor::create<float>(std::vector<int>{ocWeightSize, ic, hPack},
                                                         mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes));
            unit.mTempInputVector  = std::vector<Tensor *>{mTempInput.get(), mTempWeight.get(), mTempBias.get()};
            unit.mTempOutputVector = std::vector<Tensor *>{mTempOutput.get()};
            memoryPool->beginGroup();
            auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters, ic);
            memoryPool->endGroup();
            if (NO_ERROR != code) {
                return code;
            }
        }
    }
    return NO_ERROR;
}

/**
 * Executes all valid work units prepared by onResize.
 *
 * When mNeedPretreat is set, the input is first repacked into mTempInputBatch by
 * mPretreatFunction, and after the units have run the result is copied from
 * mTempOutputBatch into the output tensor, one plane per (channel block, batch).
 *
 * @param inputs  inputs[0] is the input tensor.
 * @param outputs outputs[0] is the output tensor.
 * @return always NO_ERROR.
 */
ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto srcTensor = inputs[0];
    auto dstTensor = outputs[0];
    auto functions = static_cast<CPUBackend*>(backend())->functions();
    auto unitCount = mUnits.size();

    // Repack the input into the temporary buffer when resize decided it is required.
    if (mNeedPretreat) {
        mPretreatFunction(srcTensor->host<uint8_t>(), mTempInputBatch->host<uint8_t>());
    }

    MNN_CONCURRENCY_BEGIN(unitIndex, unitCount) {
        auto &current = mUnits[unitIndex];
        if (current.mValid) {
            current.mStracssenComputor->onExecute();
        }
    }
    MNN_CONCURRENCY_END();

    // Without pretreat the units worked on the real tensors, so there is nothing left to do.
    if (!mNeedPretreat) {
        return NO_ERROR;
    }

    int elementBytes   = functions->bytes;
    auto batchCount    = srcTensor->batch();
    auto planeSize     = dstTensor->height() * dstTensor->width();
    auto channelBlocks = UP_DIV(dstTensor->channel(), functions->pack);
    MNN_CONCURRENCY_BEGIN(block, channelBlocks) {
        // Temp layout keeps all batches of a channel block together; the output tensor
        // keeps all channel blocks of a batch together.
        auto blockSrc = mTempOutputBatch->host<uint8_t>() + planeSize * block * functions->pack * batchCount * elementBytes;
        auto blockDst = dstTensor->host<uint8_t>() + block * planeSize * functions->pack * elementBytes;
        int n = 0;
        while (n < batchCount) {
            auto from = blockSrc + n * planeSize * functions->pack * elementBytes;
            auto to   = blockDst + n * planeSize * channelBlocks * functions->pack * elementBytes;
            ::memcpy(to, from, planeSize * functions->pack * elementBytes);
            ++n;
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// Convolution1x1Strassen.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2019/02/12.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

Update 2020-02-26 09:57:17 +08:00			`#include "Convolution1x1Strassen.hpp"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`#include <string.h>`
Update 2019-12-27 22:16:57 +08:00			`#include "core/BufferAllocator.hpp"`
			`#include "backend/cpu/CPUBackend.hpp"`
			`#include "core/Concurrency.h"`
Update 2020-02-26 09:57:17 +08:00			`#include "ConvOpt.h"`
Update 2019-12-27 22:16:57 +08:00			`#include "core/Macro.h"`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`#include "CommonOptFunction.h"`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`namespace MNN {`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon common, Backend b, const float *originWeight,`
			`size_t originWeightSize, const float *bias, size_t biasSize)`
			`: CPUConvolution(common, b) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`auto outputCount = (int)biasSize;`
			`auto mSrcCount = (int)originWeightSize / outputCount;`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`mResource.reset(new CPUConvolution::Resource);`
			`mResource->backend = b;`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`if (!mResource->copyBiasAlign(bias, biasSize)) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`MNN_ERROR("Not Enough Memory\n");`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`mValid = false;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return;`
			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto core = static_cast<CPUBackend*>(b)->functions();`
			`int ePack, lPack, hPack;`
			`core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);`
			`mResource->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputCount, hPack), UP_DIV(mSrcCount, lPack) * lPack, hPack}));`
			`mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);`
			`if (!mValid) {`
[PATCH 214/350] [MNN::Refine] Rearrange weights for 1x1 and generic convolution. 2020-12-15 18:14:15 +08:00			`MNN_ERROR("Not Enough Memory\n");`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`return;`
[PATCH 214/350] [MNN::Refine] Rearrange weights for 1x1 and generic convolution. 2020-12-15 18:14:15 +08:00			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`if (core->bytes < 4) {`
			`AutoRelease<Tensor> tempTensor(Tensor::createDevice<float>({outputCount * mSrcCount}));`
			`mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC);`
			`if (!mValid) {`
			`MNN_ERROR("Not Enough Memory\n");`
			`return;`
			`}`
			`core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount);`
			`core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);`
			`b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);`
			`} else {`
			`core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`}`
			`Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon common, Backend b) : CPUConvolution(common, b) {`
			`mResource = resource;`
Use strassen for Convolution1x1Strassen 2020-05-17 23:09:45 +08:00			`}`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`Convolution1x1Strassen::~Convolution1x1Strassen() {`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`// Do nothing`
			`}`

			`bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) {`
			`if (!mValid) {`
			`return false;`
			`}`
			`if (nullptr == dst) {`
			`return true;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
[MNN:Sync] Sync internal git 2021-01-06 16:29:37 +08:00			`*dst = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn);`
			`return true;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`

			`ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor > &inputs, const std::vector<Tensor > &outputs) {`
			`CPUConvolution::onResize(inputs, outputs);`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto core = static_cast<CPUBackend*>(backend())->functions();`
Use strassen for Convolution1x1Strassen 2020-05-17 23:09:45 +08:00			`int ePack, lPack, hPack;`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);`
			`int bytes = core->bytes;`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`auto CONVOLUTION_TILED_NUMBER = ePack;`
			`auto input = inputs[0];`
			`auto output = outputs[0];`
			`int numberThread = ((CPUBackend *)backend())->threadNumber();`
			`auto ic = input->channel();`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto oc = output->channel();`
			`auto icC4 = UP_DIV(ic, core->pack);`
			`auto ocC4 = UP_DIV(oc, core->pack);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`auto batch = input->batch();`
			`auto matrixSizeE = output->height() * output->width() * input->batch();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`auto outputPlane = output->height() * output->width();`
			`mUnits.clear();`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto inputPtr = input->host<uint8_t>();`
			`auto outputPtr = output->host<uint8_t>();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`mTempOutputBatch.reset();`
			`mTempInputBatch.reset();`
			`std::shared_ptr<char> __autoFunction;`
			`auto padY = mPadY;`
			`auto padX = mPadX;`
			`auto strideX = mCommon->strideX();`
			`auto strideY = mCommon->strideY();`
			`mNeedPretreat = input->batch() > 1 \|\| (!(padX == 0 && padY == 0 && strideY == 1 && strideX == 1));`
			`auto postParameters = getPostParameters();`
			`if (mNeedPretreat) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`mTempInputBatch.reset(Tensor::createDevice<float>(std::vector<int>{icC4, matrixSizeE, core->pack}));`
			`mTempOutputBatch.reset(Tensor::createDevice<float>(std::vector<int>{ocC4, matrixSizeE, core->pack}));`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`bool success = backend()->onAcquireBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);`
			`success = success && backend()->onAcquireBuffer(mTempInputBatch.get(), Backend::DYNAMIC);`
			`if (!success) {`
			`return OUT_OF_MEMORY;`
			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`inputPtr = mTempInputBatch->host<uint8_t>();`
			`outputPtr = mTempOutputBatch->host<uint8_t>();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`__autoFunction = std::shared_ptr<char>(nullptr, [this](void *ptr) {`
			`backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC);`
			`backend()->onReleaseBuffer(mTempInputBatch.get(), Backend::DYNAMIC);`
			`});`
			`auto ow = output->width();`
			`auto oh = output->height();`
			`auto iw = input->width();`
			`auto ih = input->height();`
			`if (padX == 0 && padY == 0 && strideY == 1 && strideX == 1) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`mPretreatFunction = [outputPlane, icC4, batch, numberThread, this, core](const uint8_t srcBatch, uint8_t dstBatch) {`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_BEGIN(y, icC4) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcY = srcBatch + outputPlane * y * core->pack * core->bytes;`
			`auto dstY = dstBatch + y * outputPlane * batch * core->pack * core->bytes;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`for (int x = 0; x < batch; ++x) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcX = srcY + x * outputPlane * icC4 * core->pack * core->bytes;`
			`auto dstX = dstY + x * outputPlane * core->pack * core->bytes;`
			`::memcpy(dstX, srcX, outputPlane * core->pack * core->bytes);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
			`}`
			`MNN_CONCURRENCY_END();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`};`
			`} else if (strideY == 1 && strideX == 1) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this, core](const uint8_t *srcOrigin,`
			`uint8_t *dstOrigin) {`
			`auto unitBytes = core->bytes * core->pack;`
			`::memset(dstOrigin, 0, outputPlane * batch * unitBytes * icC4);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_BEGIN(z, icC4) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcZ = srcOrigin + z * iw * ih * unitBytes;`
			`auto dstZ = dstOrigin + z * ow * oh * batch * unitBytes;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`for (int b = 0; b < batch; ++b) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcBatch = srcZ + b * iw * ih * icC4 * unitBytes;`
			`auto dstBatch = dstZ + b * ow * oh * unitBytes;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`for (int y = 0; y < ih; ++y) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto src = srcBatch + iw * y * unitBytes;`
			`auto dst = dstBatch + (ow * (y + padY) + padX) * unitBytes;`
			`::memcpy(dst, src, iw * unitBytes);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`}`
			`}`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_END();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`};`
			`} else {`
			`int oyStart, oyEnd, oxStart, oxEnd;`
			`for (oyStart = 0; oyStart * strideY - padY < 0; ++oyStart) {`
			`// do nothing`
			`}`
			`for (oyEnd = oh - 1; oyEnd * strideY - padY >= ih; --oyEnd) {`
			`// do nothing`
			`}`
			`for (oxStart = 0; oxStart * strideX - padX < 0; ++oxStart) {`
			`// do nothing`
			`}`
			`for (oxEnd = ow - 1; oxEnd * strideX - padX >= iw; --oxEnd) {`
			`// do nothing`
			`}`
			`int oyCount = oyEnd - oyStart + 1;`
			`int oxCount = oxEnd - oxStart + 1;`
			`mPretreatFunction = [outputPlane, padY, padX, strideX, strideY, ow, oh, iw, ih, icC4, oxStart, oyStart,`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`oxCount, oyCount, batch, this, core](const uint8_t srcOrigin, uint8_t dstOrigin) {`
			`::memset(dstOrigin, 0, outputPlane * batch * core->bytes * core->pack * icC4);`
			`auto srcStride = strideX;`
			`auto dstStride = 1;`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`int syStart = oyStart * strideY - padY;`
			`int sxStart = oxStart * strideX - padX;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_BEGIN(z, icC4) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * core->bytes * core->pack;`
			`auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * core->bytes * core->pack;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`for (int b = 0; b < batch; ++b) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcBatch = srcZ + b * iw * ih * icC4 * core->bytes * core->pack;`
			`auto dstBatch = dstZ + b * ow * oh * core->bytes * core->pack;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`for (int y = 0; y < oyCount; ++y) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto dstY = dstBatch + y * ow * core->bytes * core->pack;`
			`auto srcY = srcBatch + y * strideY * iw * core->bytes * core->pack;`
			`core->MNNCopyC4WithStride((const float)(srcY), (float)(dstY), strideX * core->pack, core->pack, oxCount);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`}`
			`}`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_END();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`};`
			`}`
			`}`
			`auto memoryPool = ((CPUBackend *)backend())->getBufferAllocator();`
			`memoryPool->barrierBegin();`
			`std::shared_ptr<void> __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); });`
			`int maxDepth = 5;`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto icAlign = UP_DIV(ic, lPack) * lPack;`
			`auto weightTensor = mResource->mWeight.get();`
			`AutoRelease<Tensor> tempWeight;`
			`if (icAlign != ic) {`
			`tempWeight.reset(Tensor::create<float>(std::vector<int>{oc, ic, hPack}, mResource->mWeight->host<uint8_t>()));`
			`weightTensor = tempWeight.get();`
			`}`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`// Divide in plane, in this case the divide equal numberThread`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`int divideStep = UP_DIV(matrixSizeE, numberThread);`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`mUnits.resize(numberThread);`
			`for (int i = 0; i < numberThread; ++i) {`
			`int planeStart = i * divideStep;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`int planeEnd = std::min(planeStart + divideStep, matrixSizeE);`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`int planeSize = planeEnd - planeStart;`
			`Unit &unit = mUnits[i];`
			`if (planeSize <= 0) {`
			`unit.mValid = false;`
			`continue;`
			`}`
			`unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`AutoRelease<Tensor> mTempInput(`
			`Tensor::create<float>(std::vector<int>{icC4, planeSize, core->pack}, inputPtr + core->pack * planeStart * bytes));`
			`mTempInput->setStride(0, matrixSizeE * core->pack);`
			`AutoRelease<Tensor> mTempOutput(`
			`Tensor::create<float>(std::vector<int>{ocC4, planeSize, core->pack}, outputPtr + core->pack * planeStart * bytes));`
			`mTempOutput->setStride(0, matrixSizeE * core->pack);`
			`unit.mTempInputVector = std::vector<Tensor *>{mTempInput.get(), weightTensor, mResource->mBias.get()};`
			`unit.mTempOutputVector = std::vector<Tensor *>{mTempOutput.get()};`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`memoryPool->beginGroup();`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters, ic, oc);`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`if (NO_ERROR != code) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`memoryPool->endGroup();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`return code;`
			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`memoryPool->endGroup();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`}`
			`} else {`
			`// Divide in ocC4`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`auto hDiv = 1;`
			`if (hPack > core->pack) {`
			`hDiv = hPack / core->pack;`
			`}`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`auto ocDiv = UP_DIV(ocC4, hDiv);`
			`numberThread = std::min(numberThread, ocDiv);`
			`int divideStep = (ocDiv / numberThread) * hDiv;`
			`mUnits.resize(numberThread);`
			`for (int i = 0; i < numberThread; ++i) {`
			`int ocStart = i * divideStep;`
			`int ocSize = divideStep;`
			`if (i == numberThread - 1) {`
			`ocSize = ocC4 - i * divideStep;`
			`}`
			`Unit &unit = mUnits[i];`
			`if (ocSize <= 0) {`
			`unit.mValid = false;`
			`continue;`
			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto ocStartWeight = (ocStart * core->pack) / hPack;`
			`auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight);`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth));`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`AutoRelease<Tensor> mTempInput(Tensor::create<float>(std::vector<int>{icC4, matrixSizeE, core->pack}, inputPtr));`
			`AutoRelease<Tensor> mTempBias(Tensor::create<float>({ocSize, 1, core->pack}, mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes));`
			`AutoRelease<Tensor> mTempOutput(`
			`Tensor::create<float>(std::vector<int>{ocSize, matrixSizeE, core->pack}, outputPtr + core->pack * matrixSizeE * ocStart * bytes));`
			`AutoRelease<Tensor> mTempWeight(Tensor::create<float>(std::vector<int>{ocWeightSize, ic, hPack},`
			`mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes));`
			`unit.mTempInputVector = std::vector<Tensor *>{mTempInput.get(), mTempWeight.get(), mTempBias.get()};`
			`unit.mTempOutputVector = std::vector<Tensor *>{mTempOutput.get()};`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`memoryPool->beginGroup();`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters, ic);`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`if (NO_ERROR != code) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`memoryPool->endGroup();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`return code;`
			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`memoryPool->endGroup();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`}`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
			`return NO_ERROR;`
			`}`

			`ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor > &inputs, const std::vector<Tensor > &outputs) {`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`auto size = mUnits.size();`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`auto input = inputs[0];`
			`auto output = outputs[0];`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto core = static_cast<CPUBackend*>(backend())->functions();`

[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`if (!mNeedPretreat) {`
			`MNN_CONCURRENCY_BEGIN(tId, size) {`
			`auto &unit = mUnits[tId];`
			`if (unit.mValid) {`
			`unit.mStracssenComputor->onExecute();`
Support multi-thread for 1x1 convolution 2020-05-19 13:40:35 +08:00			`}`
			`}`
			`MNN_CONCURRENCY_END();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00			`return NO_ERROR;`
			`}`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`int bytes = core->bytes;`
			`mPretreatFunction(input->host<uint8_t>(), mTempInputBatch->host<uint8_t>());`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_BEGIN(tId, size) {`
			`auto &unit = mUnits[tId];`
			`if (unit.mValid) {`
			`unit.mStracssenComputor->onExecute();`
Support multi-thread for 1x1 convolution 2020-05-19 13:40:35 +08:00			`}`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
			`MNN_CONCURRENCY_END();`
[MNN:Sync] Sync Internal Github 2020-07-04 01:21:30 +08:00
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`auto batch = input->batch();`
			`auto outputPlane = output->height() * output->width();`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto ocC4 = UP_DIV(output->channel(), core->pack);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_BEGIN(y, ocC4) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcY = mTempOutputBatch->host<uint8_t>() + outputPlane * y * core->pack * batch * bytes;`
			`auto dstY = output->host<uint8_t>() + y * outputPlane * core->pack * bytes;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`for (int x = 0; x < batch; ++x) {`
[MNN:Sync] Sync internal Gitlab 2021-04-08 15:34:23 +08:00			`auto srcX = srcY + x * outputPlane * core->pack * bytes;`
			`auto dstX = dstY + x * outputPlane * ocC4 * core->pack * bytes;`
			`::memcpy(dstX, srcX, outputPlane * core->pack * bytes);`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`}`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`MNN_CONCURRENCY_END();`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return NO_ERROR;`
			`}`
			`} // namespace MNN`