// MNN/source/backend/cpu/compute/ConvInt8Winograd.cpp

#include "ConvInt8Winograd.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include "ConvOpt.h"
#include "Int8FunctionsOpt.h"
#include "CommonOptFunction.h"
#include "MNN/AutoTime.hpp"
#include "math/Vec.hpp"
#include "math/WingoradGenerater.hpp"
#include <map>
#include <numeric>
#include <cmath>
#include <string>
#include <memory>
#include <vector>
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
#ifndef MNN_REDUCE_SIZE

namespace MNN {

// Builds the WinoResource (merged int8 weight + quant info, offsets, scales, input scales and
// input zero points) for a single winograd unit.
//
// originWeight : int8 weights, indexed below as [(oc * ic), kernelY, kernelX]; the caller may pass a
//                pointer already shifted to the unit's first kernel tap.
// scaleFloat   : one float scale per output channel (indexed by c / ic), used to dequantize originWeight.
// attr         : unit attributes. attr[2..5] are kySize, kxSize, unitY, unitX; after the first 6 entries
//                follow alpha2 input scales (float), alpha2 input zero points (int32) and
//                alpha2 * oc weight scales (float), all read through reinterpreting casts.
// backend      : backend used for all STATIC allocations; also stored in the returned resource.
// oc, ic       : output / input channel counts.
// kernelY/X    : dimensions of the full kernel that originWeight belongs to (used as source strides).
// Returns nullptr (after logging) when a buffer cannot be acquired.
std::shared_ptr<ConvInt8Winograd::WinoResource> ConvInt8Winograd::makeWinoResource(const int8_t* originWeight, std::shared_ptr<Tensor> scaleFloat, const int32_t* attr, Backend* backend, int oc, int ic, int kernelY, int kernelX) {
    auto core = static_cast<CPUBackend*>(backend)->int8Functions();
    auto gcore = static_cast<CPUBackend*>(backend)->functions();
    int UNIT, SRC_UNIT, DST_XUNIT;
    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    int pack = gcore->pack;
    // ocDivUnit / ic4 follow the int8 gemm blocking, oc4 follows the float pack size.
    int ocDivUnit = UP_DIV(oc, UNIT), ic4 = UP_DIV(ic, SRC_UNIT);
    int oc4 = UP_DIV(oc, pack);
    int kySize = attr[2], kxSize = attr[3], unitY = attr[4], unitX = attr[5]; attr += 6;
    int alphaY = kySize + unitY - 1, alphaX = kxSize + unitX - 1, alpha2 = alphaY * alphaX;

    // `weight` and `scales` hold the data of ONE transform position at a time; they are refilled for
    // every `a` below and merged into the matching row of `mergeInfo`.
    std::shared_ptr<Tensor> weight, offsets, scales, inputScales, mergeInfo;
    weight.reset(Tensor::createDevice<int8_t>({1, ocDivUnit, ic4, UNIT, SRC_UNIT}));
    offsets.reset(Tensor::createDevice<float>({alpha2, oc4, pack}));
    scales.reset(Tensor::createDevice<float>({1, 2 * oc4 * pack}));
    inputScales.reset(Tensor::createDevice<float>({alpha2, pack}));
    mergeInfo.reset(Tensor::createDevice<int8_t>({alpha2, weight->stride(0) + scales->size()}));

    // Acquires every tensor as STATIC memory; all acquisitions are attempted even after a failure.
    auto allocTensors = [=](std::vector<std::shared_ptr<Tensor>> tensors) -> bool {
        bool success = true;
        for (const auto& t : tensors) {
            success &= backend->onAcquireBuffer(t.get(), Backend::STATIC);
        }
        return success;
    };

    if (!allocTensors({offsets, scales, inputScales, mergeInfo})) {
        MNN_ERROR("Memory not enough\n");
        return nullptr;
    }
    // Temporaries that are released again before returning.
    std::shared_ptr<Tensor> originWeightFloat, weightFloat;
    originWeightFloat.reset(Tensor::createDevice<float>({oc, ic, kySize, kxSize}));
    weightFloat.reset(Tensor::createDevice<float>({alpha2, oc, ic, 1, 1}));
    if (!allocTensors({weight, originWeightFloat, weightFloat})) {
        MNN_ERROR("Memory not enough\n");
        return nullptr;
    }
    // Zero once so that the padded lanes (oc/ic rounded up to the block size) stay zero.
    ::memset(weight->host<int8_t>(), 0, weight->size());
    ::memset(offsets->host<float>(), 0, offsets->size());
    ::memset(scales->host<float>(), 0, scales->size());
    auto inputScaleData = (const float*)attr; attr += alpha2;
    auto inputPointData = (const int32_t*)attr; attr += alpha2;
    auto weightScaleData = (const float*)attr; attr += alpha2 * oc;
    // Store the reciprocal of each input scale, replicated across the `pack` lanes.
    for (int i = 0; i < alpha2; ++i) {
        auto scale = 1.0f / inputScaleData[i];
        for (int u = 0; u < pack; ++u) {
            inputScales->host<float>()[i * pack + u] = scale;
        }
    }
    // Dequantize the kySize x kxSize sub-kernel of every (oc, ic) pair into float.
    for (int c = 0; c < oc * ic; ++c) {
        for (int h = 0; h < kySize; ++h) {
            for (int w = 0; w < kxSize; ++w) {
                auto srcInt8 = originWeight[(c * kernelY + h) * kernelX + w];
                auto scale = scaleFloat->host<float>()[c / ic];
                originWeightFloat->host<float>()[(c * kySize + h) * kxSize + w] = srcInt8 * scale;
            }
        }
    }
    // Winograd-transform the float weights into weightFloat, indexed below as [alpha2, oc, ic].
    Math::WinogradGenerater generator({unitY, unitX}, {kySize, kxSize}, 1, true);
    generator.transformWeight(weightFloat.get(), originWeightFloat.get(), true);
    auto scalePtr = scales->host<float>();
    for (int a = 0; a < alpha2; ++a) {
        for (int oz = 0; oz < oc; ++oz) {
            int oz4 = oz / UNIT, ozRemain = oz % UNIT;
            int offset_int32 = 0;
            float offset = 0.f;
            float scale = weightScaleData[a * oc + oz];
            for (int sz = 0; sz < ic; ++sz) {
                int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT;
                // Position inside the [ocDivUnit, ic4, UNIT, SRC_UNIT] blocked weight.
                int index = ((oz4 * ic4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain;
                float srcData = weightFloat->host<float>()[(a * oc + oz) * ic + sz];
                // -ffast-math may cause inexact input then wrong rounded result, add eps to avoid this
                float eps = ((srcData/scale) > 0 ? 1 : -1) * 1e-6;
                // Re-quantize the transformed weight, clamped to [-127, 127].
                auto quanData = (int8_t)ALIMIN(ALIMAX(roundf(srcData / scale + eps), -127), 127);
                weight->host<int8_t>()[index] = quanData;
                // Accumulate the input-zero-point correction term for this output channel.
                offset += quanData * (-inputPointData[a]);
#ifdef MNN_USE_SSE
                // Extra -128 term on SSE builds (the padding value used for SSE elsewhere in this file is 128).
                offset += quanData * (-128);
#endif
            }

            offsets->host<float>()[a * oc4 * pack + oz] = offset * scale * inputScaleData[a];
            scalePtr[oz] = scale * inputScaleData[a];
        }
        // Merge this position's packed weight and scales into row `a` of mergeInfo.
        int32_t params[6] = {1, ocDivUnit, ic4, UNIT, SRC_UNIT, oc4 * pack};
        ConvInt8TiledExecutor::packWeightAndQuantInfo(mergeInfo->host<int8_t>() + a * mergeInfo->stride(0), weight->host<int8_t>(), scales->host<int8_t>(), params);
    }

    std::shared_ptr<WinoResource> resource(new WinoResource);
    resource->weight = mergeInfo;
    resource->offsets = offsets;
    resource->scales = scales;
    resource->transInputScales = inputScales;
    std::vector<int32_t> inputZeroPoints(inputPointData, inputPointData + alpha2);
    resource->transInputZeroPoints = inputZeroPoints;
    resource->backend = backend;

    // The per-position weight and the float temporaries are no longer needed.
    backend->onReleaseBuffer(weight.get(), Backend::STATIC);
    backend->onReleaseBuffer(originWeightFloat.get(), Backend::STATIC);
    backend->onReleaseBuffer(weightFloat.get(), Backend::STATIC);
    return resource;
}

// Builds the int8 winograd convolution from the serialized winograd attributes of `convOp`.
//
// Attribute layout read here: [version, unitNum] followed, for each unit, by [unitSize] and
// `unitSize` int32 values whose first six entries are kyStart, kxStart, kySize, kxSize, unitY, unitX.
// One WinoResource / WinoExecution pair is created per unit. On any failure mValid is set to false
// and construction stops early.
//
// b      : backend that owns all allocations.
// convOp : convolution op; must carry symmetricQuan()->winogradAttr() (see mustUse()).
// res    : shared int8 resource; its mWeightInt8 is released once all units are built.
ConvInt8Winograd::ConvInt8Winograd(Backend *b, const Convolution2D *convOp, std::shared_ptr<ResourceInt8> res) : CPUConvolution(convOp->common(), b), mResource(res) {
    int oc = mCommon->outputCount(), ic = mCommon->inputCount();
    int kernelY = mCommon->kernelY(), kernelX = mCommon->kernelX();

    auto attr = convOp->symmetricQuan()->winogradAttr()->data();
    int version = *(attr++), unitNum = *(attr++);
    if (version != 0) {
        // Only attribute version 0 is understood by this implementation.
        MNN_ERROR("ConvInt8 winograd attr proto version must be 0\n");
        mValid = false;
        return;
    }
    //FUNC_PRINT(convOp->symmetricQuan()->winogradAttr()->size());
    auto weightData = res->mWeightInt8->host<int8_t>();
    for (int i = 0; i < unitNum; ++i) {
        int unitSize = *(attr++);
        int kyStart = attr[0], kxStart = attr[1], kySize = attr[2], kxSize = attr[3], unitY = attr[4], unitX = attr[5];
        int alphaY = kySize + unitY - 1, alphaX = kxSize + unitX - 1;
        // TODO: support alphaY != alphaX
        if (alphaY != alphaX) {
            MNN_ERROR("ConvInt8 winograd only support ky==kx && unitY==unitX\n");
            mValid = false;
            return;
        }
        std::shared_ptr<Tensor> tempInput, tempOutput;
        // makeWinoResource indexes the weight as [(oc * ic), kernelY, kernelX], so one kernel row
        // spans kernelX elements: the first tap of this unit sits at kyStart * kernelX + kxStart.
        auto winoRes = makeWinoResource(weightData + kyStart * kernelX + kxStart, mResource->mOriginScale, attr, b, oc, ic, kernelY, kernelX);
        if (nullptr == winoRes) {
            // Allocation failed (already logged); WinoExecution would dereference the null resource.
            mValid = false;
            return;
        }
        attr += unitSize;
        std::shared_ptr<WinoExecution> exe(new WinoExecution(winoRes, kySize, kxSize, unitY, unitX, oc, ic));
        mUnits.push_back({kyStart, kxStart, tempInput, tempOutput, exe});
    }
    // The original int8 weight has been repacked into the per-unit resources.
    mResource->mWeightInt8.reset((Tensor*)nullptr);
}
// Clone constructor: shares the int8 resource of `exe` and creates, for every unit of `exe`,
// a new runner bound to `backend`. Per-unit input/output tensors start empty and are created
// later in onResize.
ConvInt8Winograd::ConvInt8Winograd(Backend* backend, const Convolution2DCommon* common, const ConvInt8Winograd& exe)
    : CPUConvolution(common, backend) {
    mResource = exe.mResource;
    for (auto srcUnit = exe.mUnits.begin(); srcUnit != exe.mUnits.end(); ++srcUnit) {
        std::shared_ptr<Tensor> emptyInput;
        std::shared_ptr<Tensor> emptyOutput;
        std::shared_ptr<WinoExecution> clonedRunner;
        clonedRunner.reset(new WinoExecution(backend, *(srcUnit->runner)));
        mUnits.push_back({srcUnit->kyStart, srcUnit->kxStart, emptyInput, emptyOutput, clonedRunner});
    }
}
// Nothing to release explicitly here.
ConvInt8Winograd::~ConvInt8Winograd() = default;
// Clones this execution onto backend `bn`.
//
// bn  : backend the clone will run on.
// op  : op whose Convolution2D common parameters are used for the clone.
// dst : receives the newly allocated clone; when nullptr the call only reports that cloning
//       is supported.
// Returns true when cloning is supported / succeeded, false when the clone is not valid
// (in which case nothing is written to *dst and the temporary clone is destroyed).
bool ConvInt8Winograd::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    // Own the clone until it is handed over, so the invalid path does not leak it.
    std::unique_ptr<ConvInt8Winograd> dstExe(new ConvInt8Winograd(bn, op->main_as_Convolution2D()->common(), *this));
    if (!dstExe->valid()) {
        return false;
    }
    *dst = dstExe.release();
    return true;
}
// Plans all buffers for the current input/output shapes.
//
// Steps: (1) acquire the float copy of the int8 input, (2) build the fused float bias,
// (3) acquire one float output per winograd unit, (4) give every unit an input view that aliases
// the float input (shifted by the unit's kernel start minus padding) and resize its runner,
// (5) release the DYNAMIC buffers again so the planner can reuse them after this op.
// Returns OUT_OF_MEMORY when an acquisition fails, or the error reported by a unit runner.
ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);

    // Float buffer with the same shape as the int8 input; filled in onExecute.
    mInputFloat.reset(Tensor::createDevice<float>(inputs[0]->shape(), Tensor::CAFFE_C4));
    mValid = backend()->onAcquireBuffer(mInputFloat.get(), Backend::DYNAMIC);
    if (!mValid) {
        return OUT_OF_MEMORY;
    }

    auto core = static_cast<CPUBackend*>(backend())->int8Functions();
    auto gcore = static_cast<CPUBackend*>(backend())->functions();
    int UNIT, SRC_UNIT, DST_XUNIT;
    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    UNIT = gcore->pack;
    int pack = gcore->pack;

    // Fused bias, padded to a multiple of `pack` and zero-filled beyond the real channels:
    // bias / outputScale + outputZeroPoint.
    mFusedBias.reset(Tensor::createDevice<float>({ROUND_UP(outputs[0]->channel(), pack)}));
    mValid &= backend()->onAcquireBuffer(mFusedBias.get(), Backend::STATIC);
    if (!mValid) {
        return OUT_OF_MEMORY;
    }
    auto fusedBiasPtr = mFusedBias->host<float>();
    ::memset(fusedBiasPtr, 0, mFusedBias->size());
    for (int i = 0; i < outputs[0]->channel(); ++i) {
        fusedBiasPtr[i] = mResource->mOriginBias->host<float>()[i] / mResource->mOutputScale + static_cast<float>(mResource->mOutputZeroPoint);
    }

    auto input = mInputFloat.get(), output = outputs[0];
    int batch = input->batch(), ic = input->channel(), oc = output->channel();
    int ih = input->height(), iw = input->width();
    // Every unit gets its own float output with the full output shape; they are summed in onExecute.
    for (auto& unit : mUnits) {
        unit.output.reset(Tensor::createDevice<float>(output->shape(), Tensor::CAFFE_C4));
        mValid = backend()->onAcquireBuffer(unit.output.get(), Backend::DYNAMIC);
        if (!mValid) {
            return OUT_OF_MEMORY;
        }
    }
    for (auto& unit : mUnits) {
        // Kernel start beyond the padding shifts the input window; the remainder stays as unit padding.
        int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
        // NOTE(review): the offset added to the chunk is (sy * iw + sx) * pack; confirm the unit that
        // chunk offsets are expressed in — if it is bytes, a sizeof(float) factor would be expected
        // for this float tensor. TODO confirm.
        auto srcChunk = TensorUtils::getDescribeOrigin(input)->mem->chunk() + (sy * iw + sx) * pack;
        unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
        // The unit input does not own memory: it points into mInputFloat's chunk.
        TensorUtils::getDescribeOrigin(unit.input.get())->mem = (new CPUMemObj(nullptr, srcChunk, 0));
        // Keep the strides of the full input so rows/planes of the cropped view address correctly.
        for (int i = 0; i < input->dimensions(); ++i) {
            unit.input->setStride(i, input->stride(i));
        }
        unit.runner->mPadY = ALIMAX(mPadY - unit.kyStart, 0);
        unit.runner->mPadX = ALIMAX(mPadX - unit.kxStart, 0);
        auto res = unit.runner->onResize({unit.input.get()}, {unit.output.get()});
        if (res != NO_ERROR) {
            mValid = false;
            return res;
        }
    }
    // Release in the planning sense only; the buffers stay valid for this op's onExecute.
    for (auto& unit : mUnits) {
        backend()->onReleaseBuffer(unit.output.get(), Backend::DYNAMIC);
    }
    backend()->onReleaseBuffer(mInputFloat.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

// Sums all per-unit float results into inputs[0] (in place) and quantizes the sum to int8 in `output`.
//
// inputs    : float tensors of identical layout; inputs[0] is used as the accumulator.
// output    : int8 destination tensor.
// quanParam : provides scale, minValue, maxValue and biasFloat (one value per channel, advanced
//             by `pack` for every channel block).
// cpuBn     : backend providing the float and int8 core functions.
// zeroPoint : not used by this function.
static void mergeAddBiasScaleQuantize(const std::vector<Tensor*>& inputs, Tensor* output, const QuanPostTreatParameters* quanParam, CPUBackend* cpuBn, int zeroPoint) {
    auto floatFuncs = cpuBn->functions();
    auto int8Funcs = cpuBn->int8Functions();
    int gemmUnit, gemmSrcUnit, gemmDstUnit;
    int8Funcs->MNNGetGemmUnit(&gemmUnit, &gemmSrcUnit, &gemmDstUnit);
    int pack = floatFuncs->pack;

    int channelBlocks = UP_DIV(output->channel(), pack);
    int plane = output->height() * output->width() * output->batch();

    // Accumulate every further unit output into the first one.
    float* accum = inputs[0]->host<float>();
    int index = 1;
    while (index < inputs.size()) {
        floatFuncs->MNNMatrixAdd(accum, accum, inputs[index]->host<float>(), plane * channelBlocks, 0, 0, 0, 1);
        ++index;
    }

    // Quantize one channel block at a time so each block sees its own slice of the bias.
    auto perChannelBias = quanParam->biasFloat;
    for (int block = 0; block < channelBlocks; ++block) {
        float* srcBlock = accum + block * plane * pack;
        int8_t* dstBlock = output->host<int8_t>() + block * plane * pack;
        int8Funcs->MNNFloat2Int8(srcBlock, dstBlock, plane, quanParam->scale, quanParam->minValue, quanParam->maxValue, perChannelBias + block * pack, 2);
    }
}

// Re-blocks channel-packed data from blocks of `uFrom` channels to blocks of `uTo` channels.
// AVX: 8 -> 16, arm32/64: 4 -> 16, AVX512: 16 -> 16, arm82: 4 -> 4
//
// The caller in this file passes int8 buffers reinterpreted as float (one float == 4 int8), with
// `depth`, `uFrom` and `uTo` already divided by sizeof(float).
//
// dst        : destination, laid out as [UP_DIV(depth, uTo)][areaOffset[1]][uTo].
// src        : source, laid out as [UP_DIV(depth, uFrom)][areaOffset[0]][uFrom].
// area       : number of spatial positions to convert.
// depth      : number of channels (in float elements).
// areaOffset : {source area stride, destination area stride}.
// Channels of the last, partially filled destination block are padded byte-wise with 0
// (128 when MNN_USE_SSE is defined).
static void _reorderCommon(float* dst, const float* src, size_t area, size_t depth, int* areaOffset, int uFrom, int uTo) {
    if (uFrom == 1 && uTo == 4) {
        MNNPackC4((float*)dst, (const float*)src, area, depth, areaOffset);
        return;
    }
    if (uFrom == 1 && uTo == 2) {
        MNNPackInt8C2((float*)dst, (const float*)src, area, depth, areaOffset);
        return;
    }
    size_t srcOffset = areaOffset[0], dstOffset = areaOffset[1];
    int z = 0;
    if (uFrom == 2 && uTo == 4) {
        // Fast path: merge two complete source blocks of 2 into one destination block of 4.
        for (; z + 3 < depth; z += 4) {
            auto srcZ = src + z * srcOffset;
            auto dstZ = dst + z * dstOffset;
            for (int i = 0; i < area; ++i) {
                dstZ[i * 4] = srcZ[i * 2];
                dstZ[i * 4 + 1] = srcZ[i * 2 + 1];
                dstZ[i * 4 + 2] = srcZ[srcOffset * 2 + i * 2];
                dstZ[i * 4 + 3] = srcZ[srcOffset * 2 + i * 2 + 1];
            }
        }
    }
    // Other UNIT != SRC_UNIT case if exist, and remain
    for (; z < depth; ++z) {
        auto dstZ = dst + (z / uTo) * dstOffset * uTo + (z % uTo);
        auto srcZ = src + (z / uFrom) * srcOffset * uFrom + (z % uFrom);
        for (int i = 0; i < area; ++i) {
            dstZ[i * uTo] = srcZ[i * uFrom];
        }
    }
    int depthLast = depth % uTo;
    if (depthLast != 0) {
        int zero = 0;
#ifdef MNN_USE_SSE
        zero = 128;
#endif
        // The last block starts at (depth / uTo) * dstOffset * uTo, exactly as in the copy loop
        // above; omitting the uTo factor would pad inside an earlier block instead.
        auto dstZ = dst + (depth / uTo) * dstOffset * uTo + depthLast;
        for (int i = 0; i < area; ++i) {
            ::memset(dstZ + i * uTo, zero, (uTo - depthLast) * sizeof(float));
        }
    }
}

// Runs the int8 winograd convolution.
//
// The int8 input is dequantized into mInputFloat, every winograd unit computes a float partial
// result from its view of that buffer, and the partial results are summed, biased and quantized
// back to int8 into outputs[0].
// Returns the first error reported by a unit runner, otherwise NO_ERROR.
ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto bn = static_cast<CPUBackend*>(backend());
    auto core = bn->int8Functions();
    const int pack = bn->functions()->pack;
    // Quant info layout: scale, zero, min, max
    auto inputQuant = TensorUtils::getQuantInfo(inputs[0]);
    auto outputQuant = TensorUtils::getQuantInfo(outputs[0]);
    if (TensorUtils::getDescribe(inputs[0])->quantAttr.get() == nullptr) {
        // No quant attribute on the tensor: fall back to the values stored in the resource.
        inputQuant = {(float)mResource->mInputScale,
            (float)mResource->mInputZeroPoint,
            (float)mResource->mClampMin,
            (float)mResource->mClampMax,
        };
        outputQuant = {(float)mResource->mOutputScale,
            (float)mResource->mOutputZeroPoint,
            (float)mResource->mClampMin,
            (float)mResource->mClampMax,
        };
    }

    // Dequantize the whole input with a single scale / zero point.
    int size = bn->getTensorSize(mInputFloat.get());
    core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), &inputQuant[0], size / pack, &inputQuant[1], 0);

    std::vector<Tensor*> tmp_outputs;
    tmp_outputs.reserve(mUnits.size());
    for (auto& unit : mUnits) {
        // Refresh the host pointer of the aliasing input view from its memory chunk.
        unit.input->buffer().host = TensorUtils::getDescribeOrigin(unit.input.get())->mem->chunk().ptr();
        auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});
        if (ret != NO_ERROR) {
            return ret;
        }
        tmp_outputs.push_back(unit.output.get());
    }

    QuanPostTreatParameters quanParam;
    float outputdequantScale = 1.0 / mResource->mOutputScale;
    quanParam.scale = &outputdequantScale;
    // For winograd Int8, will not treat origin bias to int32, use float directly
    // quanParam.biasFloat = mResource->mOriginBias->host<float>();
    quanParam.biasFloat = mFusedBias->host<float>();
    quanParam.maxValue = outputQuant[3];
    if (mResource->mRelu) {
        // With relu the lower clamp is the output zero point.
        quanParam.minValue = outputQuant[1];
    } else {
        quanParam.minValue = outputQuant[2];
    }
    mergeAddBiasScaleQuantize(tmp_outputs, outputs[0], &quanParam, bn, outputQuant[1]);
    return NO_ERROR;
}

// Creates the runner of one winograd unit and describes (without allocating) its per-thread
// scratch tensors: the int8 gemm input, the float gemm output and three transform buffers.
ConvInt8Winograd::WinoExecution::WinoExecution(std::shared_ptr<WinoResource> res, int kernelY, int kernelX, int unitY, int unitX, int outputCount, int inputCount)
: Execution(res->backend), mWinoResource(res), mUnitY(unitY), mUnitX(unitX), mKernelY(kernelY), mKernelX(kernelX) {
    auto int8Funcs = static_cast<CPUBackend*>(res->backend)->int8Functions();
    auto floatFuncs = static_cast<CPUBackend*>(res->backend)->functions();

    int gemmUnit, gemmSrcUnit, gemmDstUnit;
    int8Funcs->MNNGetGemmUnit(&gemmUnit, &gemmSrcUnit, &gemmDstUnit);
    int pack = floatFuncs->pack;

    int threads = ((CPUBackend *)backend())->threadNumber();
    int tileY = mUnitY + mKernelY - 1;
    int tileX = mUnitX + mKernelX - 1;
    int tileArea = tileY * tileX;
    int icBlocks = UP_DIV(inputCount, gemmSrcUnit);
    int ocBlocks = UP_DIV(outputCount, pack);

    mTempInputBuffer.reset(Tensor::createDevice<int8_t>({threads, tileArea, icBlocks, gemmDstUnit * gemmSrcUnit}));
    mTempOutputBuffer.reset(Tensor::createDevice<float>({threads, tileArea, ocBlocks, gemmDstUnit * pack}));

    // Each transform buffer must hold a full tile for the larger of padded input / output channels.
    int transformSize = tileArea * gemmDstUnit * ALIMAX(ROUND_UP(inputCount, pack), ocBlocks * pack);
    mTransformMidBuffer.reset(Tensor::createDevice<float>({threads, 3, transformSize}));
}
// Clone constructor: shares the resource of `exe`, copies its geometry and padding, and describes
// fresh scratch tensors with the same shapes for backend `bn`.
ConvInt8Winograd::WinoExecution::WinoExecution(Backend* bn, const WinoExecution& exe)
    : Execution(bn), mWinoResource(exe.mWinoResource),
    mUnitY(exe.mUnitY), mUnitX(exe.mUnitX), mKernelY(exe.mKernelY), mKernelX(exe.mKernelX),
    mPadY(exe.mPadY), mPadX(exe.mPadX) {
    auto gemmInputShape = exe.mTempInputBuffer->shape();
    mTempInputBuffer.reset(Tensor::createDevice<int8_t>(gemmInputShape));

    auto gemmOutputShape = exe.mTempOutputBuffer->shape();
    mTempOutputBuffer.reset(Tensor::createDevice<float>(gemmOutputShape));

    auto transformShape = exe.mTransformMidBuffer->shape();
    mTransformMidBuffer.reset(Tensor::createDevice<float>(transformShape));
}
// Reserves the three scratch tensors as DYNAMIC memory and immediately releases them again.
// All three acquisitions are attempted; OUT_OF_MEMORY is returned if any of them failed.
ErrorCode ConvInt8Winograd::WinoExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    Tensor* scratch[] = {mTempInputBuffer.get(), mTempOutputBuffer.get(), mTransformMidBuffer.get()};

    bool allAcquired = true;
    for (Tensor* tensor : scratch) {
        allAcquired &= backend()->onAcquireBuffer(tensor, Backend::DYNAMIC);
    }
    if (!allAcquired) {
        return OUT_OF_MEMORY;
    }

    for (Tensor* tensor : scratch) {
        backend()->onReleaseBuffer(tensor, Backend::DYNAMIC);
    }
    return NO_ERROR;
}
// Executes one winograd unit: float input (inputs[0]) -> float output (outputs[0]).
//
// The output is split into units of mUnitY x mUnitX pixels; up to DST_XUNIT consecutive units form
// a tile. For every tile a thread (1) applies the source transform into its transform buffer,
// (2) for each of the alpha2 transform positions quantizes the transformed data to int8 and runs
// the int8 gemm kernel against the packed weights, and (3) applies the destination transform and
// writes the result into the output tensor. Tiles are assigned to threads round-robin.
ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto core = static_cast<CPUBackend*>(backend())->functions();
    auto coreInt8 = static_cast<CPUBackend*>(backend())->int8Functions();
    auto input = inputs[0], output = outputs[0];

    int alphaY = mKernelY + mUnitY - 1, alphaX = mKernelX + mUnitX - 1, alpha2 = alphaY * alphaX;
    bool conv1d = (alphaY == 1 || alphaX == 1);
    int UNIT, SRC_UNIT, DST_XUNIT;
    coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    // From here on UNIT equals the float pack size rather than the gemm output unit.
    UNIT = core->pack;
    int pack = core->pack;

    auto gemmFunc = coreInt8->Int8GemmKernel;
    // Source transforms are only looked up for kernel dimensions different from 1.
    CoreFunctions::WinoUnrollTransFunc srcTransXFunc = nullptr, srcTransYFunc = nullptr;
    if (mKernelX != 1) {
        srcTransXFunc = core->chooseWinoSourceUnrollTransform(alphaX, alphaX);
    }
    if (mKernelY != 1) {
        srcTransYFunc = core->chooseWinoSourceUnrollTransform(alphaY, alphaY);
    }

#define MAX_UNIT 8
    // Destination transform tables; below they are indexed by alphaX (Y pass) and by the number
    // of valid output rows `ey` (X pass).
    CoreFunctions::WinoUnrollDestTransFunc dstTransXFunc[MAX_UNIT + 1], dstTransYFunc[MAX_UNIT + 1];
    if (mKernelX != 1) {
        core->chooseWinoDestUnrollTransform(dstTransXFunc, MAX_UNIT + 1, alphaX, mUnitX);
    }
    if (mKernelY != 1) {
        core->chooseWinoDestUnrollTransform(dstTransYFunc, MAX_UNIT + 1, alphaY, mUnitY);
    }

    int ow = output->width(), oh = output->height();
    int iw = input->width(), ih = input->height();
    int ic = input->channel(), ic_4 = UP_DIV(ic, pack);
    int dc_4 = UP_DIV(output->channel(), pack);

    int padY = mPadY, padX = mPadX;
    // Number of output units per row / column.
    auto wUnit = UP_DIV(ow, mUnitX), hUnit = UP_DIV(oh, mUnitY);
    int batch = output->batch();

    auto totalCount   = wUnit * hUnit * batch;
    // MNN_PRINT("ow=%d, oh=%d\n", ow, oh);
    int threadNumber = std::max(((CPUBackend *)backend())->threadNumber(), 1);
    int tileCount    = UP_DIV(totalCount, DST_XUNIT);
    // Never start more threads than there are tiles.
    threadNumber     = std::min(threadNumber, tileCount);

    // Source transform of `xC` consecutive units starting at flat unit index `xIndex`.
    // dstOrigin receives the transformed data, `buffer` provides two scratch areas
    // (midBuffer0 for padded copies of border windows, midBuffer1 between the X and Y pass).
    auto src_trans_func = [&](float* dstOrigin, const float* srcOrigin, float* buffer, int xIndex, int xC) {
        int bufSize = mTransformMidBuffer->stride(1);
        auto midBuffer0 = buffer, midBuffer1 = midBuffer0 + bufSize;
        // Range of (batch, unit row) indices touched by this tile and the first column in the first row.
        int oybBegin = xIndex / wUnit;
        int oxBegin = xIndex % wUnit;
        int oybEnd = (xIndex + xC-1) / wUnit;
        int remain = xC;
        for (int hbIndex=oybBegin; hbIndex <= oybEnd; ++hbIndex) {
            auto hIndex = hbIndex % hUnit;
            auto bIndex = hbIndex / hUnit;
            auto bOffset = iw * ih * pack * bIndex;
            auto srcBatch = srcOrigin + bOffset;
            int dstZStep = DST_XUNIT * pack, unitStep = dstZStep * ic_4;
            // Units of this row that belong to the tile.
            int step = std::min(wUnit - oxBegin, remain);
            int srcY  = hIndex * mUnitY - padY;
            // [sy, ey) is the part of the alphaY window that lies inside the input.
            int ey    = ALIMIN(srcY + alphaY, ih) - srcY;
            int sy    = ALIMAX(0, srcY) - srcY;

            // [sBegin, sEnd) is the first run of units whose window lies fully inside the input;
            // that run is processed in one go, every other unit one at a time.
            int sBegin = step, sEnd = step;
            if (ey - sy == alphaY) {
                for (int si = 0; si < step; ++si) {
                    auto wIndex = si + oxBegin;
                    int srcX  = wIndex * mUnitX - padX;
                    int sx    = ALIMAX(0, srcX) - srcX;
                    int ex    = ALIMIN(srcX + alphaX, iw) - srcX;
                    if (sBegin == step && ex - sx == alphaX) {
                        sBegin = si;
                    } else if (sBegin < step && ex - sx != alphaX) {
                        sEnd = si;
                        break;
                    }
                }
            }
            for (int si=0; si<step;) {
                int sStep = (si == sBegin ? sEnd - sBegin : 1);
                auto wIndex = si + oxBegin;
                int srcX  = wIndex * mUnitX - padX;
                int sx    = ALIMAX(0, srcX) - srcX;
                int ex    = ALIMIN(srcX + alphaX, iw) - srcX;
                auto dst_x = dstOrigin + si * pack;

                int sourceZStep = iw * ih * pack * batch, sourceYStep = iw * pack;
                auto srcStart = srcBatch + srcY * sourceYStep + srcX * pack;
                // when input window exceed limit (so need pad value), copy from src to midbuffer0
                if (ex - sx != alphaX || ey - sy != alphaY) {
                    ::memset(midBuffer0, 0, alpha2 * ic_4 * pack * sizeof(float));
                    int count = pack * (ex - sx);
                    for (int z = 0; count > 0 && z < ic_4; ++z) {
                        for (int yy = sy; yy < ey; ++yy) {
                            auto dst_yy = midBuffer0 + ((z * alphaY + yy) * alphaX + sx) * pack;
                            auto src_yy = srcStart + z * sourceZStep + yy * sourceYStep + sx * pack;
                            ::memcpy(dst_yy, src_yy, count * sizeof(float));
                        }
                    }
                    // Read from the zero-padded copy with its dense strides instead.
                    srcStart = midBuffer0;
                    sourceZStep = alpha2 * pack;
                    sourceYStep = alphaX * pack;
                }
                for (int sz = 0; sz < ic_4; ++sz) {
                    for (int s = 0; s < sStep; ++s) {
                        auto dst = dst_x + sz * dstZStep + s * pack;
                        auto src = srcStart + sz * sourceZStep + s * mUnitX * pack;
                        // X pass into midBuffer1, then Y pass into the tile buffer.
                        srcTransXFunc(src, midBuffer1, sourceYStep, alphaX * pack, pack, pack);
                        srcTransYFunc(midBuffer1, dst, pack, unitStep, alphaX * pack, alphaX * unitStep);
                    }
                }
                si += sStep;
            }
            // Following rows start at column 0.
            oxBegin = 0;
            remain -= step;
            dstOrigin += pack * step;
        }

    };

        auto srcOrigin = input->host<float>();
        auto dstOrigin = output->host<float>();

        auto weight    = mWinoResource->weight->host<int8_t>();
        // Constant helper arrays handed to the gemm kernel through quanParam (shared read-only by all threads).
        std::vector<float> xkernelSum(DST_XUNIT, 0);
        std::vector<float> wKernelSum(dc_4 * pack, 0);
        std::vector<float> fakeInputScale(DST_XUNIT, 1.f);
        std::vector<float> reluThred = {-std::numeric_limits<float>().max(), std::numeric_limits<float>().max()};

        // Work of thread `tId`: processes tiles tId, tId + threadNumber, ...
        auto tFunction = [&](int tId) {
            // Per-thread slices of the int8 gemm input and float gemm output scratch tensors.
            auto _srcOrigin = mTempInputBuffer->host<int8_t>() + tId * mTempInputBuffer->stride(0);
            auto _dstOrigin = mTempOutputBuffer->host<float>() + tId * mTempOutputBuffer->stride(0);
            QuanPostTreatParameters quanParam;
            quanParam.useInt8 = 0;
            quanParam.srcKernelSum = xkernelSum.data();
            quanParam.weightKernelSum = wKernelSum.data();
            quanParam.fp32minmax = reluThred.data();
            quanParam.inputScale = nullptr;

            for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) {
                int xIndex  = (int)tIndex * DST_XUNIT;
                int xReamin = totalCount - xIndex;
                // Number of units in this tile (the last tile may be partial).
                int xC      = xReamin > DST_XUNIT ? DST_XUNIT : xReamin;

                int bufSize = mTransformMidBuffer->stride(1);
                // Three consecutive per-thread transform buffers.
                auto buffer0 = mTransformMidBuffer->host<float>() + tId * mTransformMidBuffer->stride(0);
                auto buffer1 = buffer0 + bufSize, buffer2 = buffer1 + bufSize;
    #ifndef MNN_WINO_TRANFORM_TEST_CLOSE
                // Source-transformed float data of the tile ends up in buffer2.
                src_trans_func(buffer2, srcOrigin, buffer0, xIndex, xC);
    #endif
                ::memset(buffer1, 0, dc_4 * pack * sizeof(float));
                // Multi
                for (int i = 0; i < alpha2; ++i) {
                    auto _srcInt8Ptr = _srcOrigin + i * mTempInputBuffer->stride(1);

                    auto scaleVec = mWinoResource->transInputScales->host<float>() + i * pack;
                    float zeroPoint = static_cast<float>(mWinoResource->transInputZeroPoints[i]);
                    // Quantize position i; when pack != SRC_UNIT go through buffer0 and re-block afterwards.
                    coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * pack, (pack == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, &zeroPoint, 0);
                    if (pack != SRC_UNIT) {
                        // The int8 data is handled as floats (4 int8 each), hence the division by `byte`.
                        int areaOffset[] = {DST_XUNIT, DST_XUNIT}, byte = sizeof(float);
                        _reorderCommon((float*)_srcInt8Ptr, buffer0, DST_XUNIT, UP_DIV(ic, byte), areaOffset, pack / byte, SRC_UNIT / byte);
                    }

                    auto _dstFloatPtr = _dstOrigin + i * dc_4 * xC * pack;
                    auto _weightInt8Ptr = weight + i * mWinoResource->weight->stride(0);

                    quanParam.biasFloat = (mWinoResource->offsets->host<float>() + i * mWinoResource->offsets->stride(0));
                    quanParam.scale = mWinoResource->scales->host<float>() + i * dc_4 * pack;
                    quanParam.inputScale = fakeInputScale.data();
                    quanParam.bias = nullptr;
                    quanParam.blockNum = 1;
                    // Float results (useInt8 == 0) for position i, written with a row stride of xC * pack floats.
                    gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * pack * sizeof(float), dc_4, &quanParam, DST_XUNIT);
                }
    #ifndef MNN_WINO_TRANFORM_TEST_CLOSE
                {
                    // Destination transform: mirror of the tile walk in src_trans_func, on the output side.
                    auto midBuffer0 = buffer0;
                    auto midBuffer1 = (float*)((int8_t*)midBuffer0 + mTransformMidBuffer->stride(1));
                    int srcZStep = xC * pack;
                    int unitStep = dc_4 * xC * pack;
                    int oybBegin = xIndex / wUnit;
                    int oxBegin = xIndex % wUnit;
                    int oybEnd = (xIndex + xC-1) / wUnit;
                    int remain = xC;
                    auto dstS = _dstOrigin;
                    for (int hbIndex=oybBegin; hbIndex <= oybEnd; ++hbIndex) {
                        int hIndex = hbIndex % hUnit;
                        int bIndex = hbIndex / hUnit;
                        int step = std::min(wUnit - oxBegin, remain);
                        int dstY = hIndex * mUnitY;
                        // Valid output rows of this unit row.
                        int ey = ALIMIN(dstY + mUnitY, oh) - dstY;

                        // [sBegin, sEnd) is the first run of units that are complete in X.
                        int sBegin = step, sEnd = step;
                        if (alphaX != 1 || ey == mUnitY) {
                            for (int si = 0; si < step; ++si) {
                                auto wIndex = si + oxBegin;
                                int dstX = wIndex * mUnitX;
                                int ex = ALIMIN(dstX + mUnitX, ow) - dstX;
                                if (sBegin == step && ex == mUnitX) {
                                    sBegin = si;
                                } else if (sBegin < step && ex != mUnitX) {
                                    sEnd = si;
                                    break;
                                }
                            }
                        }
                        for (int si=0; si<step;) {
                            int sStep = (si == sBegin ? sEnd - sBegin : 1);
                            auto wIndex = si + oxBegin;
                            auto srcXi = dstS + pack * si;
                            int dstX = wIndex * mUnitX;
                            auto dstStart = dstOrigin + (dstX + dstY * ow + bIndex * ow * oh) * pack;
                            int ex = ALIMIN(dstX + mUnitX, ow) - dstX;
                            int count = ex * pack;

                            auto _dstStart = dstStart;
                            int dstZStep = oh * ow * batch * pack, dstYStep = ow * pack;
                            // Partial units are transformed into midBuffer1 first and copied out below.
                            if (ex != mUnitX || (alphaX == 1 && ey != mUnitY)) {
                                dstZStep = mUnitY * mUnitX * pack;
                                dstYStep = mUnitX * pack;
                                _dstStart = midBuffer1;
                            }
                            for (int z = 0; z < dc_4; ++z) {
                                for (int x = 0; x < sStep; ++x) {
                                    auto srcXiZ = srcXi + z * srcZStep + x * pack;
                                    auto _dstStartZ = _dstStart + z * dstZStep + x * mUnitX * pack;
                                    dstTransYFunc[alphaX](srcXiZ, midBuffer0, nullptr, nullptr, unitStep, pack, alphaX * unitStep, alphaX * pack);
                                    dstTransXFunc[ey](midBuffer0, _dstStartZ, nullptr, nullptr, alphaX * pack, dstYStep, pack, pack);
                                }
                            }
                            if (ex != mUnitX || (alphaX == 1 && ey != mUnitY)) {
                                // Copy only the valid ey rows x ex columns into the real output.
                                for (int z = 0; z < dc_4; ++z) {
                                    for (int yy = 0; yy < ey; ++yy) {
                                        auto srcYAddr = _dstStart + (z * mUnitY + yy) * mUnitX * pack;
                                        auto dstYAddr = dstStart + z * ow * oh * batch * pack + yy * ow * pack;
                                        ::memcpy(dstYAddr, srcYAddr, count * sizeof(float));
                                    }
                                }
                            }
                            si += sStep;
                        }
                        oxBegin = 0;
                        remain -= step;
                        dstS += pack * step;
                    }
                }
#endif
        }
    };

    MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
        tFunction((int)tId);
    }
    MNN_CONCURRENCY_END();

    return NO_ERROR;
}

// Returns true when `convOp` carries symmetric quantization data with winograd attributes.
bool ConvInt8Winograd::mustUse(const Convolution2D *convOp) {
    const auto quan = convOp->symmetricQuan();
    return quan != nullptr && quan->winogradAttr() != nullptr;
}

} /* MNN */
#endif