MNN/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp

//
//  ConvInt8TiledExecutor.cpp
//  MNN
//
//  Created by MNN on 2019/5/17.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp"
#include "core/Macro.h"

#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include <math.h>
#ifdef MNN_USE_SSE
extern "C" {
void MNNInt8ToUInt8(void* ptr, int count);
}
#endif
namespace MNN {

/**
 * Repack int8 convolution weights into the tiled layout consumed by the int8 GEMM kernels.
 *
 * Layout change (UNIT / SRC_UNIT come from MNNGetGemmUnit):
 *   [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)]
 *
 * @param bn           Backend used to allocate the packed tensor (must be a CPUBackend).
 * @param common       Convolution description; supplies output/input channel counts and kernel size.
 * @param weightOrigin Source weights, read as a flat int8 array indexed [oc][ic][kernelY * kernelX].
 * @param weight       Receives a newly allocated STATIC tensor holding the packed weights.
 * @return true on success; false if the source weight is missing or the allocation failed.
 */
static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,
                          const std::shared_ptr<Tensor>& weightOrigin,
                          std::shared_ptr<Tensor>& weight) {
    // Fail gracefully instead of dereferencing a missing source weight below.
    if (nullptr == weightOrigin || nullptr == weightOrigin->host<int8_t>()) {
        MNN_ERROR("Origin weight is empty\n");
        return false;
    }
    auto core = static_cast<CPUBackend*>(bn)->int8Functions();
    int UNIT, SRC_UNIT, DST_XUNIT;
    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);

    const int oc          = common->outputCount();
    const int ic          = common->inputCount();
    const int kernelCount = common->kernelX() * common->kernelY();
    const int icUnitCount = UP_DIV(ic, UNIT);   // number of UNIT-wide input channel groups
    const int unitPerSrc  = SRC_UNIT / UNIT;    // input channel groups packed into one SRC_UNIT
    std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(icUnitCount * kernelCount, unitPerSrc), UNIT, SRC_UNIT};

    weight.reset(Tensor::createDevice<int8_t>(shape));

    bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC);
    if (!succ) {
        MNN_ERROR("Memory not enough\n");
        return false;
    }
    auto weightSrc = weightOrigin->host<int8_t>();
    auto weightDst = weight->host<int8_t>();
    // Zero the whole buffer first so every padded position (oc / ic rounded up) stays 0.
    ::memset(weightDst, 0, weight->size());

    // Loop-invariant strides, hoisted out of the copy loops.
    const int dstOcStride = weight->stride(0);
    const int dstKStride  = weight->stride(1);
    const int srcOcStride = kernelCount * ic;

    for (int k = 0; k < kernelCount; ++k) {
        const auto srcK = weightSrc + k;
        for (int y = 0; y < ic; ++y) {
            const int yOutSide    = y / UNIT;
            const int yInSide     = y % UNIT;
            // Position of this (kernel, channel group) pair in the flattened reduce dimension.
            const int yIndex      = yOutSide + k * icUnitCount;
            const int ySubOutSide = yIndex / unitPerSrc;
            const int ySubInSide  = yIndex % unitPerSrc;

            auto dstY       = weightDst + ySubOutSide * dstKStride + ySubInSide * UNIT + yInSide;
            const auto srcY = srcK + y * kernelCount;
            for (int x = 0; x < oc; ++x) {
                const int xOutSide = x / UNIT;
                const int xInSide  = x % UNIT;
                dstY[xOutSide * dstOcStride + xInSide * SRC_UNIT] = srcY[x * srcOcStride];
            }
        }
    }
    return true;
}

/**
 * Build an executor from a quantized Convolution2D op and an already prepared int8 resource.
 *
 * The weight tensor stored in `res` is taken out and replaced by its GEMM-tiled repacking,
 * then the int8 GEMM kernel variant is selected from the op's quantization parameters.
 * On repacking failure the executor is marked invalid and no kernel is chosen.
 */
ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res)
    : CPUConvolution(convOp->common(), backend), mResource(res) {
    // Detach the original weight so the resource slot can receive the packed tensor.
    std::shared_ptr<Tensor> originWeight = std::move(mResource->mWeightInt8);
    mValid = reorderWeight(backend, convOp->common(), originWeight, mResource->mWeightInt8);
    if (mValid) {
        const auto int8Core = static_cast<CPUBackend*>(backend)->int8Functions();
        // Decide whether the fast GEMM variant may be used.
#ifdef MNN_USE_SSE
        const bool useFastKernel = convOp->symmetricQuan()->nbits() <= 7;
#else
        const bool useFastKernel = convOp->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE;
#endif
        mGemmKernel = useFastKernel ? int8Core->Int8GemmKernelFast : int8Core->Int8GemmKernel;
    }
}

/**
 * Build an executor directly from raw int8 weights, without quantized post-processing.
 *
 * Allocates a fresh ResourceInt8 with a zeroed int32 bias of ROUND_UP(oc, UNIT) entries,
 * repacks `weight` for the GEMM kernels and disables post-processing (mDoPostProcess = false).
 * With MNN_USE_SSE each bias entry is set to -128 * sum(weights of that output channel).
 *
 * @param fastgemm Select Int8GemmKernelFast instead of Int8GemmKernel.
 */
ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, std::shared_ptr<Tensor> weight, bool fastgemm)
    : CPUConvolution(common, backend) {
    const auto int8Core = static_cast<CPUBackend*>(backend)->int8Functions();
    int UNIT, SRC_UNIT, DST_XUNIT;
    int8Core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    int oc     = common->outputCount();
    int ic     = common->inputCount();
    int kernel = common->kernelY() * common->kernelX();

    mResource.reset(new ResourceInt8);
    mResource->backend = backend;
    mResource->mBiasInt32.reset(Tensor::createDevice<int32_t>({ROUND_UP(oc, UNIT)}));
    mValid = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC);
    if (!mValid) {
        MNN_ERROR("Memory not enough\n");
        return;
    }
    auto biasPtr = mResource->mBiasInt32->host<int32_t>();
    ::memset(biasPtr, 0, mResource->mBiasInt32->size());
#ifdef MNN_USE_SSE
    // Per output channel: accumulate weight * (-128) over all input channels and kernel taps.
    const int weightPerOc = ic * kernel;
    const auto rawWeight  = weight->host<int8_t>();
    for (int oz = 0; oz < oc; ++oz) {
        const auto channelWeight = rawWeight + oz * weightPerOc;
        int32_t offset           = 0;
        for (int i = 0; i < weightPerOc; ++i) {
            offset += static_cast<int32_t>(channelWeight[i]) * (-128);
        }
        biasPtr[oz] = offset;
    }
#endif
    mValid = reorderWeight(backend, common, weight, mResource->mWeightInt8);
    if (!mValid) {
        MNN_ERROR("Memory not enough\n");
        return;
    }
    // Pick the int8 GEMM kernel requested by the caller.
    mGemmKernel    = fastgemm ? int8Core->Int8GemmKernelFast : int8Core->Int8GemmKernel;
    mDoPostProcess = false;
}

/**
 * Clone-style constructor: creates an executor on `backend` that reuses the GEMM kernel
 * choice, the post-process flag and the (shared) int8 resource of `exe`.
 * No weights are repacked here; per-shape state is left for onResize to fill.
 */
ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common,
                                             const ConvInt8TiledExecutor& exe)
    : CPUConvolution(common, backend),
      mGemmKernel(exe.mGemmKernel),
      mDoPostProcess(exe.mDoPostProcess),
      mResource(exe.mResource) {
    // Everything is set up by the initializer list.
}

// Nothing to release by hand: members clean up after themselves.
ConvInt8TiledExecutor::~ConvInt8TiledExecutor() = default;

/**
 * Create a copy of this executor bound to another backend, sharing the packed int8 resource.
 *
 * @param bn  Backend the clone will run on.
 * @param op  Op being cloned; its Convolution2D common parameters are passed to the clone.
 * @param dst Receives the new executor. When nullptr, the call only reports that cloning
 *            is supported and nothing is allocated.
 * @return true if cloning is supported / succeeded, false if the clone is not valid.
 */
bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    auto exe = new ConvInt8TiledExecutor(bn, op->main_as_Convolution2D()->common(), *this);
    if (!exe->valid()) {
        // Ownership was never handed to the caller, so free the failed clone here.
        delete exe;
        return false;
    }
    *dst = exe;
    return true;
}

/**
 * Prepare shape-dependent state: quantization scales, im2col parameters, tiling / thread
 * counts and the per-thread im2col scratch buffer.
 *
 * @param inputs  inputs[0] is the convolution input tensor.
 * @param outputs outputs[0] is the convolution output tensor.
 * @return NO_ERROR on success, the base class error if its resize fails, or OUT_OF_MEMORY
 *         if the scratch buffer cannot be acquired.
 */
ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    if (mDoPostProcess) {
        mResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
    } else {
        mResource->mInputZeroPoint = 0;
    }
    // mPadX / mPadY are read below; presumably the base class resize computes them.
    const auto baseCode = CPUConvolution::onResize(inputs, outputs);
    if (NO_ERROR != baseCode) {
        return baseCode;
    }
    auto input  = inputs[0];
    auto output = outputs[0];

    auto core = static_cast<CPUBackend*>(backend())->int8Functions();
    int UNIT, SRC_UNIT, DST_XUNIT;
    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    auto convCommon = mCommon;
    const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();
    const auto srcCountUnit = UP_DIV(input->channel(), UNIT);
    const auto totalKernelCountD8Div2 = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / UNIT);

    mIm2ColParamter.dilateX         = convCommon->dilateX();
    mIm2ColParamter.dilateY         = convCommon->dilateY();
    mIm2ColParamter.strideX         = convCommon->strideX();
    mIm2ColParamter.strideY         = convCommon->strideY();
    // Padding comes from mPadX / mPadY rather than from the op's raw padX() / padY().
    mIm2ColParamter.padX            = mPadX;
    mIm2ColParamter.padY            = mPadY;
    mIm2ColParamter.icDiv4          = srcCountUnit;
    mIm2ColParamter.kernelX         = convCommon->kernelX();
    mIm2ColParamter.kernelY         = convCommon->kernelY();
    mIm2ColParamter.kernelCountUnit = totalKernelCountD8Div2;

    mIm2ColParamter.ih = input->height();
    mIm2ColParamter.iw = input->width();
    mIm2ColParamter.oh = output->height();
    mIm2ColParamter.ow = output->width();
    mIm2ColParamter.srcZStep = input->stride(1) * UNIT;
    mIm2ColParamter.srcYStep = input->stride(2) * UNIT;

    // One tile covers DST_XUNIT output positions; never use more threads than tiles.
    mTileCount        = UP_DIV(output->height() * output->width(), DST_XUNIT);
    const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
    mThreadNums       = std::min(threads, mTileCount);

    // set im2col tensor info: one [DST_XUNIT, kernelCountUnit * SRC_UNIT] slab per thread
    mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, mResource->mWeightInt8->length(1) * SRC_UNIT}));
    bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }
    // NOTE(review): released right after acquire; presumably the DYNAMIC allocator keeps the
    // address valid for onExecute while letting later ops reuse the space - confirm.
    backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

/**
 * Run the int8 convolution: for every batch, each worker thread walks its tiles, expands
 * the tile with im2col into its private scratch slab and feeds it to the selected GEMM kernel.
 *
 * Output elements are written as 1 byte each when post-processing is enabled and as
 * 4 bytes each otherwise.
 *
 * @param inputs  inputs[0] is the int8 input tensor.
 * @param outputs outputs[0] is the destination tensor.
 * @return NO_ERROR.
 */
ErrorCode ConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    const auto src = inputs[0];
    auto dst       = outputs[0];
    auto int8Core  = static_cast<CPUBackend*>(backend())->int8Functions();

    int UNIT, SRC_UNIT, DST_XUNIT;
    int8Core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);

    auto im2ColFunc = int8Core->chooseIm2Col(&mIm2ColParamter, src->channel());

    // Geometry shared by all batches and tiles.
    const int planeSize        = dst->height() * dst->width();
    const int batchCount       = src->batch();
    const int ocUnitCount      = UP_DIV(dst->channel(), UNIT);
    const auto kernelCountUnit = mIm2ColParamter.kernelCountUnit;
    const int elementBytes     = (mDoPostProcess ? 1 : 4); // int8_t or float
    const int dstChannelStep   = planeSize * UNIT * elementBytes;

    const auto srcBase    = src->host<int8_t>();
    const auto packedData = mResource->mWeightInt8->host<int8_t>();
    auto scratchBase      = mTempIm2ColBuffer->host<int8_t>();
    auto dstBase          = dst->host<int8_t>();

    // Post-treatment parameters handed to the GEMM kernel.
    QuanPostTreatParameters postParam;
    postParam.bias  = mResource->mBiasInt32->host<int32_t>();
    postParam.scale = nullptr;
    if (mDoPostProcess) {
        postParam.scale    = mResource->mScaleFloat->host<float>();
        postParam.maxValue = mResource->mClampMax;
        // With relu the lower clamp is the output zero point instead of the clamp minimum.
        postParam.minValue = mResource->mRelu ? mResource->mOutputZeroPoint : mResource->mClampMin;
    }

    for (int b = 0; b < batchCount; ++b) {
        const auto batchSrc = srcBase + b * src->stride(0);
        auto batchDst       = dstBase + b * dst->stride(0) * elementBytes;

        auto runTiles = [&](int tId) {
            // Each thread owns one slab of the scratch buffer.
            auto scratch = scratchBase + tId * mTempIm2ColBuffer->stride(0);

            for (int tile = tId; tile < mTileCount; tile += mThreadNums) {
                const int firstPos = tile * DST_XUNIT;
                const int posCount = ALIMIN(planeSize - firstPos, DST_XUNIT);
                // im2col
                im2ColFunc(scratch, batchSrc, mResource->mInputZeroPoint, &mIm2ColParamter, firstPos, posCount);
#ifdef MNN_USE_SSE
                const int scratchLen = kernelCountUnit * DST_XUNIT * SRC_UNIT;
                MNNInt8ToUInt8(scratch, scratchLen);
#endif
                auto tileDst = batchDst + firstPos * UNIT * elementBytes;
                mGemmKernel(tileDst, scratch, packedData, kernelCountUnit, dstChannelStep, ocUnitCount, &postParam, posCount);
            }
        };

        MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {
            runTiles((int)tId);
        }
        MNN_CONCURRENCY_END();
    }

    return NO_ERROR;
}

} // namespace MNN
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`//`
			`// ConvInt8TiledExecutor.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2019/5/17.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

			`#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp"`
			`#include "core/Macro.h"`

			`#include <math.h>`
			`#include "backend/cpu/CPUBackend.hpp"`
			`#include "backend/cpu/compute/CommonOptFunction.h"`
			`#include "core/Concurrency.h"`
			`#include "core/TensorUtils.hpp"`
			`#include <math.h>`
			`#ifdef MNN_USE_SSE`
			`extern "C" {`
			`void MNNInt8ToUInt8(void* ptr, int count);`
			`}`
			`#endif`
			`namespace MNN {`

			`static bool reorderWeight(Backend* bn, const Convolution2DCommon* common,`
			`const std::shared_ptr<Tensor>& weightOrigin,`
			`std::shared_ptr<Tensor>& weight) {`
			`auto core = static_cast<CPUBackend*>(bn)->int8Functions();`
			`int UNIT, SRC_UNIT, DST_XUNIT;`
			`core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);`
			`// reorder weight, [oc, ic, k^2] => [oc/unit, ((ic/unit)*k^2)/(src_unit/unit), unit(oc), (src_unit/unit), unit(ic)]`
			`int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();`
			`std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};`

			`weight.reset(Tensor::createDevice<int8_t>(shape));`

			`bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC);`
			`if (!succ) {`
			`MNN_ERROR("Memory not enough");`
			`return false;`
			`}`
			`auto weightSrc = weightOrigin->host<int8_t>();`
			`auto weightDst = weight->host<int8_t>();`
			`memset(weightDst, 0, weight->size());`
			`for (int k = 0; k < kernelCount; ++k) {`
			`const auto srcK = weightSrc + k;`
			`for (int y = 0; y < ic; ++y) {`
			`const int yOutSide = y / UNIT;`
			`const int yInSide = y % UNIT;`
			`const int yIndex = yOutSide + k * UP_DIV(ic, UNIT);`
			`const int ySubOutSide = yIndex / (SRC_UNIT / UNIT);`
			`const int ySubInSide = yIndex % (SRC_UNIT / UNIT);`

			`auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * UNIT + yInSide;`
			`const auto srcY = srcK + y * kernelCount;`
			`for (int x = 0; x < oc; ++x) {`
			`const int xOutSide = x / UNIT;`
			`const int xInSide = x % UNIT;`
			`const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;`
			`const int srcIndex = x * kernelCount * ic;`
			`dstY[dstIndex] = srcY[srcIndex];`
			`}`
			`}`
			`}`
			`return true;`
			`}`

			`ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res): CPUConvolution(convOp->common(), backend), mResource(res) {`
			`std::shared_ptr<Tensor> weightOrigin;`
			`weightOrigin.swap(mResource->mWeightInt8);`
			`mValid = reorderWeight(backend, convOp->common(), weightOrigin, mResource->mWeightInt8);`
			`if(!mValid) {`
			`return;`
			`}`
			`// choose int8 gemm kernel`
			`auto core = static_cast<CPUBackend*>(backend)->int8Functions();`
			`mGemmKernel = core->Int8GemmKernel;`
			`#ifdef MNN_USE_SSE`
			`int actBits = convOp->symmetricQuan()->nbits();`
			`if (actBits <= 7) {`
			`mGemmKernel = core->Int8GemmKernelFast;`
			`}`
			`#else`
			`if(convOp->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){`
			`mGemmKernel = core->Int8GemmKernelFast;`
			`}`
			`#endif`
			`}`

			`ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, std::shared_ptr<Tensor> weight, bool fastgemm)`
			`: CPUConvolution(common, backend) {`
			`auto core = static_cast<CPUBackend*>(backend)->int8Functions();`
			`int UNIT, SRC_UNIT, DST_XUNIT;`
			`core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);`
			`int oc = common->outputCount(), ic = common->inputCount(), kernel = common->kernelY() * common->kernelX();`
			`mResource.reset(new ResourceInt8);`
			`mResource->backend = backend;`
			`mResource->mBiasInt32.reset(Tensor::createDevice<int32_t>({ROUND_UP(oc, UNIT)}));`
			`mValid = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC);`
			`if (!mValid) {`
			`MNN_ERROR("Memory not enough\n");`
			`return;`
			`}`
			`::memset(mResource->mBiasInt32->host<int32_t>(), 0, mResource->mBiasInt32->size());`
			`#ifdef MNN_USE_SSE`
			`for (int oz = 0; oz < oc; ++oz) {`
			`int32_t offset = 0;`
			`for (int i = 0; i < ic * kernel; ++i) {`
			`offset += (int32_t)(weight->host<int8_t>()[oz * ic * kernel + i]) * (-128);`
			`}`
			`mResource->mBiasInt32->host<int32_t>()[oz] = offset;`
			`}`
			`#endif`
			`mValid = reorderWeight(backend, common, weight, mResource->mWeightInt8);`
			`if(!mValid) {`
			`MNN_ERROR("Memory not enough\n");`
			`return;`
			`}`
			`// choose int8 gemm kernel`
			`mGemmKernel = core->Int8GemmKernel;`
			`if (fastgemm) {`
			`mGemmKernel = core->Int8GemmKernelFast;`
			`}`
			`mDoPostProcess = false;`
			`}`

			`ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2DCommon* common, const ConvInt8TiledExecutor& exe)`
			`: CPUConvolution(common, backend), mGemmKernel(exe.mGemmKernel),`
			`mDoPostProcess(exe.mDoPostProcess), mResource(exe.mResource) {`

			`}`

			`ConvInt8TiledExecutor::~ConvInt8TiledExecutor() {`
			`// Do nothing`
			`}`

			`bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {`
			`if (nullptr == dst) {`
			`return true;`
			`}`
[MNN:Bugfix] Fix bug for ConvInt8TiledExecutor onClone 2021-06-16 16:20:42 +08:00			`auto exe = new ConvInt8TiledExecutor(bn, op->main_as_Convolution2D()->common(), *this);`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`if (!exe->valid()) {`
			`return false;`
			`}`
			`*dst = exe;`
			`return true;`
			`}`

			`ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {`
			`if (mDoPostProcess) {`
			`mResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));`
			`} else {`
			`mResource->mInputZeroPoint = 0;`
			`}`
			`CPUConvolution::onResize(inputs, outputs);`
			`auto input = inputs[0];`
			`auto output = outputs[0];`

			`auto core = static_cast<CPUBackend*>(backend())->int8Functions();`
			`int UNIT, SRC_UNIT, DST_XUNIT;`
			`core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);`
			`auto convCommon = mCommon;`
			`const auto kernelCount = convCommon->kernelX() * convCommon->kernelY();`
			`const auto srcCountUnit = UP_DIV(input->channel(), UNIT);`
			`const auto totalKernelCountD8Div2 = UP_DIV(srcCountUnit * kernelCount, SRC_UNIT / UNIT);`

			`mIm2ColParamter.dilateX = convCommon->dilateX();`
			`mIm2ColParamter.dilateY = convCommon->dilateY();`
			`mIm2ColParamter.strideX = convCommon->strideX();`
			`mIm2ColParamter.strideY = convCommon->strideY();`
			`mIm2ColParamter.padX = convCommon->padX();`
			`mIm2ColParamter.padY = convCommon->padY();`
			`mIm2ColParamter.icDiv4 = srcCountUnit;`
			`mIm2ColParamter.kernelX = convCommon->kernelX();`
			`mIm2ColParamter.kernelY = convCommon->kernelY();`
			`mIm2ColParamter.kernelCountUnit = totalKernelCountD8Div2;`
			`mIm2ColParamter.padX = mPadX;`
			`mIm2ColParamter.padY = mPadY;`

			`mIm2ColParamter.ih = input->height();`
			`mIm2ColParamter.iw = input->width();`
			`mIm2ColParamter.oh = output->height();`
			`mIm2ColParamter.ow = output->width();`
			`mIm2ColParamter.srcZStep = input->stride(1) * UNIT;`
			`mIm2ColParamter.srcYStep = input->stride(2) * UNIT;`

			`mTileCount = UP_DIV(output->height() * output->width(), DST_XUNIT);`
			`const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);`
			`mThreadNums = std::min(threads, mTileCount);`

			`// set im2col tensor info`
			`mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT, mResource->mWeightInt8->length(1) * SRC_UNIT}));`
			`bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);`
			`if (!success) {`
			`return OUT_OF_MEMORY;`
			`}`
			`backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);`
			`return NO_ERROR;`
			`}`

			`ErrorCode ConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {`
			`const auto input = inputs[0];`
			`auto output = outputs[0];`
			`auto core = static_cast<CPUBackend*>(backend())->int8Functions();`

			`int UNIT, SRC_UNIT, DST_XUNIT;`
			`core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);`

			`auto im2ColProcess = core->chooseIm2Col(&mIm2ColParamter, input->channel());`

			`const int outputPlaneLen = output->height() * output->width();`
			`const int dstZStep = outputPlaneLen * UNIT;`

			`const int batch = input->batch();`
			`const int ocDiv4 = UP_DIV(output->channel(), UNIT);`
			`const auto kernelCountUnitDouble = mIm2ColParamter.kernelCountUnit;`
			`//auto remain = outputPlaneLen % GEMM_INT8_DST_XUNIT;`
			`//FUNC_PRINT(remain);`

			`const auto inputDataPtr = input->host<int8_t>();`
			`const auto weightDataPtr = mResource->mWeightInt8->host<int8_t>();`

			`auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();`
			`auto outputDataPtr = output->host<int8_t>();`
			`QuanPostTreatParameters quanParam;`
			`quanParam.bias = mResource->mBiasInt32->host<int32_t>();`
			`if (mDoPostProcess) {`
			`quanParam.scale = mResource->mScaleFloat->host<float>();`
			`quanParam.maxValue = mResource->mClampMax;`
			`if (mResource->mRelu) {`
			`quanParam.minValue = mResource->mOutputZeroPoint;`
			`} else {`
			`quanParam.minValue = mResource->mClampMin;`
			`}`
			`} else {`
			`quanParam.scale = nullptr;`
			`}`
			`//MNN_PRINT("max: %d, min: %d\n", quanParam.maxValue, quanParam.minValue);`

			`const int bytes = (mDoPostProcess ? 1 : 4); // int8_t or float`
			`for (int bIndex = 0; bIndex < batch; ++bIndex) {`
			`const auto srcPtr = inputDataPtr + bIndex * input->stride(0);`
			`auto dstPtr = outputDataPtr + bIndex * output->stride(0) * bytes;`

			`auto threadFunction = [&](int tId) {`
			`auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);`

			`for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {`
			`const int xIndexStart = tIndex * DST_XUNIT;`
			`const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT);`
			`// im2col`
			`im2ColProcess(colAddr, srcPtr, mResource->mInputZeroPoint, &mIm2ColParamter, xIndexStart, realDstCount);`
			`#ifdef MNN_USE_SSE`
			`const int col_buffer_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT;`
			`MNNInt8ToUInt8(colAddr, col_buffer_size);`
			`#endif`
			`auto outputInTilePtr = dstPtr + xIndexStart * UNIT * bytes;`
			`mGemmKernel(outputInTilePtr, colAddr, weightDataPtr, kernelCountUnitDouble, dstZStep * bytes, ocDiv4, &quanParam, realDstCount);`
			`}`
			`};`

			`MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {`
			`threadFunction((int)tId);`
			`}`
			`MNN_CONCURRENCY_END();`
			`}`

			`return NO_ERROR;`
			`}`

			`} // namespace MNN`