//
// ConvolutionTest.cpp
// MNNTests
//
// Created by MNN on 2019/01/15.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/Optimizer.hpp>
#include <MNN/AutoTime.hpp>
#include <vector>
#include "MNNTestSuite.h"
#include "MNN_generated.h"
#include "CommonOpCreator.hpp"
#include "core/Session.hpp"
#include "core/TensorUtils.hpp"
#include "core/MemoryFormater.h"
#include "core/CommonCompute.hpp"
#define TEST_RANDOM_SEED 100
using namespace MNN;
using namespace MNN::Express;
static void reference_conv2d(const std::vector<float>& input, const std::vector<float>& weight,
const std::vector<float>& bias, std::vector<float>& output, std::vector<float>& outputDataSeparateBias, int batch, int ic, int oc,
int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, int kw, int stride,
int dilation, int group, ConvertFP32 functor) {
int oh, ow;
if (mode == PadMode_SAME) {
oh = (ih + stride - 1) / stride; // oh = ceil(ih / stride)
ow = (iw + stride - 1) / stride; // ow = ceil(iw / stride)
pad_h = ((oh - 1) * stride + (kh - 1) * dilation + 1 - ih) / 2;
pad_w = ((ow - 1) * stride + (kw - 1) * dilation + 1 - iw) / 2;
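// Worked example: ih = 7, stride = 2, kh = 3, dilation = 1 -> oh = 4, pad_h = ((4 - 1) * 2 + (3 - 1) * 1 + 1 - 7) / 2 = 1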
} else {
if (mode == PadMode_VALID) {
pad_h = pad_w = 0;
}
oh = (ih + 2 * pad_h - (kh - 1) * dilation - 1) / stride + 1;
ow = (iw + 2 * pad_w - (kw - 1) * dilation - 1) / stride + 1;
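// Worked example: ih = 28, pad_h = 2, kh = 5, dilation = 1, stride = 1 -> oh = (28 + 4 - 4 - 1) / 1 + 1 = 28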
}
MNN_ASSERT(oc % group == 0 && ic % group == 0);
if (oh <= 0 || ow <= 0) {
output.clear();
return;
}
output.resize(batch * oh * ow * oc);
/*
In CPUConvolutionDepthwise, the bias function 'MNNAxByClampBroadcastUnit' is called separately from MNNConvRunForLineDepthwise,
which affects precision when using bf16 or fp16. Winograd convolution does the same.
We keep both results for checking.
*/
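// For example, with an fp16-like rounding functor f, f(f(sum) + f(bias)) may differ from f(sum + f(bias)) in the last bit.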
outputDataSeparateBias.resize(batch * oh * ow * oc);
int ocGroup = oc / group, icGroup = ic / group;
for (int b = 0; b < batch; ++b) {
for (int oz = 0; oz < oc; ++oz) {
int gId = oz / ocGroup;
for (int oy = 0; oy < oh; ++oy) {
for (int ox = 0; ox < ow; ++ox) {
float sum = 0;
auto destOffset = ((b * oc + oz) * oh + oy) * ow + ox;
for (int sz = gId * icGroup; sz < (gId + 1) * icGroup; ++sz) {
for (int ky = 0; ky < kh; ++ky) {
for (int kx = 0; kx < kw; ++kx) {
int ix = ox * stride + kx * dilation - pad_w, iy = oy * stride + ky * dilation - pad_h;
float xValue = 0.0f;
if (ix >= 0 && ix < iw && iy >= 0 && iy < ih) {
xValue = input[(((b * ic + sz) * ih + iy) * iw + ix)];
}
float convertX = functor(xValue);
float convertW = functor(weight[(((gId * ocGroup + oz % ocGroup) * icGroup + sz % icGroup) * kh + ky) * kw + kx]);
sum += convertX * convertW;
}
}
}
output[destOffset] = functor(sum + functor(bias[oz]));
outputDataSeparateBias[destOffset] = functor(functor(sum) + functor(bias[oz]));
}
}
}
}
}
VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1},
int group = 1, INTS pads = {0, 0}, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM, int sparseBlockOC = 1, bool sparse = false) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Convolution;
auto shape = weight->getInfo();
if (NHWC == shape->order) {
weight = _Transpose(weight, {0, 3, 1, 2});
shape = weight->getInfo();
}
auto channel = std::vector<int>{shape->dim[0], shape->dim[1]};
auto kernelSize = std::vector<int>{shape->dim[3], shape->dim[2]};
if (1 == channel[1] && channel[0] == group) {
convOp->type = OpType_ConvolutionDepthwise;
channel[1] = group;
}
convOp->main.type = OpParameter_Convolution2D;
convOp->main.value = new Convolution2DT;
auto conv2D = convOp->main.AsConvolution2D();
conv2D->common.reset(new Convolution2DCommonT);
if (pads.size() == 2) {
conv2D->common->padX = pads[0];
conv2D->common->padY = pads[1];
} else {
conv2D->common->pads = std::move(pads);
}
conv2D->common->padMode = _convertPadMode(pad);
conv2D->common->strideX = stride[0];
conv2D->common->strideY = stride[1];
conv2D->common->group = group;
conv2D->common->outputCount = channel[0];
conv2D->common->inputCount = channel[1];
conv2D->common->dilateX = dilate[0];
conv2D->common->dilateY = dilate[1];
conv2D->common->kernelX = kernelSize[0];
conv2D->common->kernelY = kernelSize[1];
if (sparse) {
size_t weightNNZElement = 0, weightBlockNumber = 0;
int weightSize = weight->getInfo()->size;
int biasSize = bias->getInfo()->size;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, weight->readMap<float>(), biasSize, weightSize / biasSize, sparseBlockOC);
std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
std::unique_ptr<MNN::AttributeT> arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
std::unique_ptr<MNN::AttributeT> arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
std::unique_ptr<MNN::AttributeT> arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1.get());
auto sparseArg2 = MNN::CreateAttribute(builder, arg2.get());
auto sparseArg3 = MNN::CreateAttribute(builder, arg3.get());
auto sparseArg4 = MNN::CreateAttribute(builder, arg4.get());
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
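// CreateVectorOfSortedTables sorts the attributes by their key field so the runtime can binary-search them later.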
auto sparseCom = MNN::CreateSparseCommon(builder, sparseAlgo, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
conv2D->sparseParameter.reset(sparseComPtr);
}
if (nullptr == bias) {
return (Variable::create(Expr::create(convOp.get(), {x, weight})));
}
return (Variable::create(Expr::create(convOp.get(), {x, weight, bias})));
}
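// Usage sketch (mirrors the call in ConvolutionCommonTest::test below): given an OIHW weight VARP and an NC4HW4 input,
// auto y = _Conv(weightVar, biasVar, _Convert(x, NC4HW4), CAFFE, {1, 1}, {1, 1}, 1, {1, 1});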
VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0},
bool relu = false, bool relu6 = false, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM, int sparseBlockOC = 1, bool sparse = false) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Convolution;
if (channel[0] == channel[1] && channel[0] == group) {
convOp->type = OpType_ConvolutionDepthwise;
}
convOp->main.type = OpParameter_Convolution2D;
convOp->main.value = new Convolution2DT;
auto conv2D = convOp->main.AsConvolution2D();
conv2D->common.reset(new Convolution2DCommonT);
conv2D->common->padMode = _convertPadMode(pad);
if (pads.size() == 2) {
conv2D->common->padX = pads[0];
conv2D->common->padY = pads[1];
} else {
conv2D->common->pads = std::move(pads);
}
conv2D->common->strideX = stride[0];
conv2D->common->strideY = stride[1];
conv2D->common->group = group;
conv2D->common->outputCount = channel[1];
conv2D->common->inputCount = channel[0];
conv2D->common->dilateX = dilate[0];
conv2D->common->dilateY = dilate[1];
conv2D->common->kernelX = kernelSize[0];
conv2D->common->kernelY = kernelSize[1];
conv2D->common->relu6 = relu6;
conv2D->common->relu = relu;
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
if (sparse) {
size_t weightNNZElement = 0, weightBlockNumber = 0;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, conv2D->weight.data(), conv2D->bias.size(), conv2D->weight.size() / conv2D->bias.size(), sparseBlockOC);
std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
std::unique_ptr<MNN::AttributeT> arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
std::unique_ptr<MNN::AttributeT> arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
std::unique_ptr<MNN::AttributeT> arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1.get());
auto sparseArg2 = MNN::CreateAttribute(builder, arg2.get());
auto sparseArg3 = MNN::CreateAttribute(builder, arg3.get());
auto sparseArg4 = MNN::CreateAttribute(builder, arg4.get());
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
auto sparseCom = MNN::CreateSparseCommon(builder, sparseAlgo, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
conv2D->sparseParameter.reset(sparseComPtr);
CommonCompute::compressFloatWeightToSparse(convOp.get());
}
return (Variable::create(Expr::create(convOp.get(), {x})));
}
VARP _Conv(float weight, float bias, VARP x, INTS channel, INTS kernelSize, PaddingMode pad = VALID,
INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM, int sparseBlockOC = 1, bool sparse = false) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Convolution;
if (channel[0] == channel[1] && channel[0] == group) {
convOp->type = OpType_ConvolutionDepthwise;
}
convOp->main.type = OpParameter_Convolution2D;
convOp->main.value = new Convolution2DT;
auto conv2D = convOp->main.AsConvolution2D();
conv2D->common.reset(new Convolution2DCommonT);
conv2D->common->padMode = _convertPadMode(pad);
conv2D->common->strideX = stride[0];
conv2D->common->strideY = stride[1];
conv2D->common->group = group;
conv2D->common->outputCount = channel[1];
conv2D->common->inputCount = channel[0];
conv2D->common->dilateX = dilate[0];
conv2D->common->dilateY = dilate[1];
conv2D->common->kernelX = kernelSize[0];
conv2D->common->kernelY = kernelSize[1];
conv2D->weight.resize(channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
std::fill(conv2D->weight.begin(), conv2D->weight.end(), weight);
conv2D->bias.resize(channel[1]);
std::fill(conv2D->bias.begin(), conv2D->bias.end(), bias);
if (sparse) {
size_t weightNNZElement = 0, weightBlockNumber = 0;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, conv2D->weight.data(), conv2D->bias.size(), conv2D->weight.size() / conv2D->bias.size(), sparseBlockOC);
std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
std::unique_ptr<MNN::AttributeT> arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
std::unique_ptr<MNN::AttributeT> arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
std::unique_ptr<MNN::AttributeT> arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1.get());
auto sparseArg2 = MNN::CreateAttribute(builder, arg2.get());
auto sparseArg3 = MNN::CreateAttribute(builder, arg3.get());
auto sparseArg4 = MNN::CreateAttribute(builder, arg4.get());
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
auto sparseCom = MNN::CreateSparseCommon(builder, sparseAlgo, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
conv2D->sparseParameter.reset(sparseComPtr);
}
return (Variable::create(Expr::create(convOp.get(), {x})));
}
class ConvolutionCommonTest : public MNNTestCase {
protected:
bool mSparse = false;
bool mBenchSpeed = false;
public:
virtual ~ConvolutionCommonTest() = default;
virtual bool run(int precision) {
return true;
}
public:
virtual void generateWeight(std::vector<float>& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) {
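// Deterministic pseudo-random weights with small magnitude (at most ~1e-3) so fp16/bf16 accumulation stays in range.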
for (int i = 0; i < group * (oc / group) * (ic / group) * kw * kh; i++) {
auto data = ((((i / kw)% 1317) * ((i / kh) % 1317)) % 1317 + i / ic + i / oc + (((oc - i) % 1317) * ic) % 1317 + i * ((oc - i) % 1317)) % 1317;
auto floatData = (float)(data % 255) / 255.0f / 1000.0f;
weightData.push_back(floatData);
}
}
ConvolutionCommonTest& speed() {
mBenchSpeed = true;
return *this;
}
bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, int batch,
int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, int kw, int stride,
int dilation, int group, int precision, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM, int sparseBlockOC = 1, bool debug = false) {
using namespace MNN::Express;
std::map<PadMode, Express::PaddingMode> padMap = {
{PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}};
std::vector<float> weightData, biasData;
generateWeight(weightData, ic, oc, kh, kw, dilation, group, sparseBlockOC);
for (int i = 0; i < oc; i++) {
auto data = (((i / kw) % 1317) * ((i / kh) % 1317) + i / ic + i / oc + (oc - i) * ic + i * (oc - i)) % 1317;
auto floatData = (float)(data % 255) / 255.0f;
data = data * data;
biasData.push_back(floatData);
// biasData.push_back(0.0f);
}
std::vector<float> inputData, outputData, outputDataSeparateBias;
for (int i = 0; i < ih * iw * ic * batch; ++i) {
auto data = ((i / kw) % 1317) * ((i / kh) % 1317) + ((i / ic)% 1317) * ((i / oc) % 1317) + ((oc - i) % 1317) * ic + (i % 1317) * ((oc - i) % 1317);
data = data % 1317;
data = (data * data) % 1317;
auto floatData = (float)(data % 255) / 255.0f;
inputData.push_back(floatData);
}
if (debug) {
std::vector<float> printCache(inputData.size());
for (int i = 0; i < inputData.size(); ++i) {
printCache[i] = FP32Converter[precision](inputData[i]);
}
MNN_PRINT("input:");
formatMatrix(printCache.data(), {batch, ic, ih, iw});
printCache.resize(weightData.size());
for (int i = 0; i < weightData.size(); ++i) {
printCache[i] = FP32Converter[precision](weightData[i]);
}
MNN_PRINT("weight:");
formatMatrix(printCache.data(), {oc, ic, kh, kw});
printCache.resize(biasData.size());
for (int i = 0; i < biasData.size(); ++i) {
printCache[i] = FP32Converter[precision](biasData[i]);
}
MNN_PRINT("bias:");
formatMatrix(printCache.data(), {oc});
}
reference_conv2d(inputData, weightData, biasData, outputData, outputDataSeparateBias, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw,
stride, dilation, group, FP32Converter[precision]);
if (outputData.size() == 0) {
return true;
}
auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of<float>());
::memcpy(input->writeMap<float>(), inputData.data(), inputData.size() * sizeof(float));
// Multi Conv
if (group == 1 || (group == ic && ic == oc)) {
VARP weightVar;
if (group == 1) {
weightVar = _Const(weightData.data(), {oc, ic, kh, kw}, NCHW, halide_type_of<float>());
} else {
weightVar = _Const(weightData.data(), {oc, ic / group, kh, kw}, NCHW, halide_type_of<float>());
}
auto biasVar = _Const(biasData.data(), {oc}, NCHW, halide_type_of<float>());
auto out = _Conv(weightVar, biasVar, _Convert(input, NC4HW4), padMap[mode], {stride, stride}, {dilation, dilation}, group,
{pad_w, pad_h}, sparseAlgo, sparseBlockOC, mSparse);
out = _Convert(out, NCHW);
auto outputPtr = out->readMap<float>();
if (!checkVectorByRelativeError<float>(outputPtr, outputData.data(), outputData.size(), 0.05)) {
MNN_PRINT("multi expect:\t real:\n");
for (int i = 0; i < outputData.size(); ++i)
{
MNN_PRINT("%f\t, %f\n", outputData[i], outputPtr[i]);
}
MNN_ERROR("%s(%s) multi test failed!\n", test_op_name.c_str(), device_name.c_str());
return false;
}
}
// Single Conv
auto output = _Conv(std::move(weightData), std::move(biasData), input, {ic, oc}, {kw, kh}, padMap[mode],
{stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, sparseAlgo, sparseBlockOC, mSparse);
// A difference below 0.1% relative error (scaled up for low precision, see errorScale below) is considered correct.
auto outputPtr = output->readMap<float>();
if (debug) {
MNN_PRINT("\ndata NCHW shape:");
printDims(input->getInfo()->dim);
MNN_PRINT("\nweight OIHW shape:");
printDims({oc, ic, kh, kw});
MNN_PRINT("\noutput NCHW shape:");
printDims(output->getInfo()->dim);
MNN_PRINT("\nexpected output:");
formatMatrix(outputData.data(), output->getInfo()->dim);
MNN_PRINT("\nexpected output 2:");
formatMatrix(outputDataSeparateBias.data(), output->getInfo()->dim);
MNN_PRINT("\nreal output:");
formatMatrix(outputPtr, output->getInfo()->dim);
}
// With low precision, the relative error of im2col or Strassen convolution against the reference is about 1e-4; winograd has a larger error.
float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 100; // winograd error in 16-bit floats is relatively large
if (!checkVectorByRelativeError<float>(outputPtr, outputData.data(), outputDataSeparateBias.data(), outputData.size(), 0.001 * errorScale)) {
MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision);
for (int i = 0; i < outputData.size(); ++i)
{
MNN_PRINT("%f\t, %f\t, %f\n", outputData[i],outputDataSeparateBias[i], outputPtr[i]);
}
MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str());
return false;
}
if (mBenchSpeed) {
int oh = output->getInfo()->dim[2], ow = output->getInfo()->dim[3];
input.fix(VARP::INPUT);
MNN::Timer _t;
const int LOOP = 20;
for (int i = 0; i < LOOP; ++i) {
input->writeMap<float>();
output->readMap<float>();
}
auto time = (float)_t.durationInUs() / 1000.0f;
MNN_PRINT("kernel=(%dx%d) input=(1x%dx%dx%d) output=(1x%dx%dx%d) stride=(%dx%d), avg time = %f\n",
kh, kw, ic, ih, iw, oc, oh, ow, stride, stride, 1.0 * time / LOOP);
}
return true;
}
};
class SparseConvolutionCommonTest : public ConvolutionCommonTest {
public:
SparseConvolutionCommonTest() {
mSparse = true;
}
virtual void generateWeight(std::vector<float>& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) {
assert(sparseBlockOC);
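// Fixed ~25% density: only positions with index % 4 == 0 keep a non-zero value, and each sparseBlockOC-sized
// block of output channels repeats the same zero pattern.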
int ocEven = (group * (oc / group) / sparseBlockOC) * sparseBlockOC;
int reduceDimLength = (ic / group) * kw * kh;
weightData.resize(group * (oc / group) * reduceDimLength);
size_t ioc = 0;
size_t index = 0;
for (; ioc < ocEven; ioc += sparseBlockOC) {
for (size_t i = 0; i < reduceDimLength; i++) {
index = ioc * reduceDimLength + i;
bool isZero = index % 4 != 0;
for (int iblock = 0; iblock < sparseBlockOC; iblock++) {
if(isZero) {
weightData[index] = 0;
} else {
auto data = (index / kw) * (index / kh) + index / ic + index / oc + (oc - index) * ic + index * (oc - index);
weightData[index] = (float)(data % 255) / 255.0f / 1000.0f;
}
index += reduceDimLength;
}
}
}
for (; ioc < oc; ioc++) {
for (size_t i = 0; i < reduceDimLength; i++) {
index = ioc * reduceDimLength + i;
bool isZero = index % 4 != 0;
if(isZero) {
weightData[index] = 0;
} else {
auto data = (index / kw) * (index / kh) + index / ic + index / oc + (oc - index) * ic + index * (oc - index);
weightData[index] = (float)(data % 255) / 255.0f;
}
}
}
return;
}
};
class ConvolutionInt8CommonTest : public ConvolutionCommonTest {
public:
virtual ~ConvolutionInt8CommonTest() = default;
virtual bool run(int precision) {
return true;
}
public:
virtual void generateWeight(std::vector<float>& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) {
auto numbers = group * (oc / group) * (ic / group) * kw * kh;
weightData.resize(numbers);
float rate = 1.0f;
if (numbers > 10000) {
// Avoid exceeding the fp16 range
rate = 0.01f;
}
for (int ri = 0; ri < numbers; ri++) {
int i = numbers - ri;
auto data = ((((i / kw)% 1317) * ((i / kh) % 1317)) % 1317 + i / ic + i / oc + (((oc - i) % 1317) * ic) % 1317 + i * ((oc - i) % 1317)) % 1317;
auto floatData = (float)(data % 255) / 255.0f / 1000.0f * rate;
weightData[ri] = floatData;
}
}
ConvolutionInt8CommonTest& speed() {
mBenchSpeed = true;
return *this;
}
bool testUnit(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, int batch,
int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, int kw, int stride,
int dilation, int group, int precision, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM, int sparseBlockOC = 1, bool debug = false, int nbit = 8, bool async = false) {
using namespace MNN::Express;
std::map<PadMode, Express::PaddingMode> padMap = {
{PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}};
std::vector<float> weightData, biasData;
generateWeight(weightData, ic, oc, kh, kw, dilation, group, sparseBlockOC);
for (int i = 0; i < oc; i++) {
auto data = (((i / kw) % 1317) * ((i / kh) % 1317) + i / ic + i / oc + (oc - i) * ic + i * (oc - i)) % 1317;
auto floatData = (float)(data % 255) / 255.0f;
data = data * data;
biasData.push_back(floatData);
// biasData.push_back(0.0f);
}
std::vector<float> inputData, outputData, outputDataSeparateBias;
float rate = 1.0f;
if (ih * iw * ic * batch > 10000) {
// Avoid exceeding the fp16 range
rate = 0.01f;
}
for (int i = 0; i < ih * iw * ic * batch; ++i) {
auto data = ((i / kw) % 1317) * ((i / kh) % 1317) + ((i / ic)% 1317) * ((i / oc) % 1317) + ((oc - i) % 1317) * ic + (i % 1317) * ((oc - i) % 1317);
data = data % 1317;
data = (data * data) % 1317;
auto floatData = (float)(data % 255) / 255.0f * rate;
inputData.push_back(floatData);
}
float threshold = (float)(1 << (nbit - 1)) - 1.0f;
float clampMin = -threshold;
if (async) {
clampMin = -threshold - 1;
}
int kernel_size = ic * kw * kh;
std::vector<int8_t> quantWeight(oc*ic*kw*kh);
std::vector<float> wScale;
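// Quantization sketch: in the asymmetric case (async == true) each output channel stores (min, scale) with
// q = round((x - min) / scale) + clampMin and dequantization x' = (q - clampMin) * scale + min;
// in the symmetric case a single per-channel scale = absMax / threshold is used with q = round(x / scale).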
if (async) {
wScale.resize(2 * oc);
for (int k = 0; k < oc; ++k) {
int beginIndex = k * kernel_size;
auto minMax = findMinMax(weightData.data() + beginIndex, kernel_size);
auto minValue = minMax.first;
wScale[2*k] = minMax.first;
auto absMax = minMax.second - minMax.first;
wScale[2*k+1] = absMax / (threshold - clampMin);
float scale = 0.0f;
if (absMax >= 0.000001f) {
scale = 1.0f / wScale[2*k+1];
}
float* ptr = weightData.data() + beginIndex;
for (int i = 0; i < kernel_size; ++i) {
int8_t quantValue = int8_t(std::round((ptr[i] - minValue) * scale + clampMin));
float floatValue = ((float)quantValue - clampMin) * wScale[2*k+1] + minValue;
quantWeight[k * kernel_size + i] = quantValue;
ptr[i] = floatValue;
}
}
} else {
wScale.resize(oc);
for (int k = 0; k < oc; ++k) {
int beginIndex = k * kernel_size;
auto absMax = findAbsMax(weightData.data() + beginIndex, kernel_size);
wScale[k] = absMax / threshold;
float* ptr = weightData.data() + beginIndex;
for (int i = 0; i < kernel_size; ++i) {
int8_t quantVal = (int8_t)(fmax(fmin(round(ptr[i] / wScale[k]), threshold), clampMin));
quantWeight[k * kernel_size + i] = quantVal;
ptr[i] = (float)quantVal * wScale[k];
}
}
}
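// At this point weightData holds the dequantized weights, so the fp32 reference below sees the same
// quantization error as the hybrid convolution under test.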
reference_conv2d(inputData, weightData, biasData, outputData, outputDataSeparateBias, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw,
stride, dilation, group, FP32Converter[precision]);
if (outputData.size() == 0) {
return true;
}
auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of<float>());
::memcpy(input->writeMap<float>(), inputData.data(), inputData.size() * sizeof(float));
// Single Conv
auto weightLength = weightData.size();
auto output = _HybridConv(weightData, std::move(biasData), std::move(wScale), input,
{ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, nbit, async);
// A difference below 0.1% relative error (scaled up for low precision, see errorScale below) is considered correct.
auto outputPtr = output->readMap<float>();
if (debug) {
MNN_PRINT("\ndata NCHW shape:");
printDims(input->getInfo()->dim);
MNN_PRINT("\nweight OIHW shape:");
printDims({oc, ic, kh, kw});
MNN_PRINT("\noutput NCHW shape:");
printDims(output->getInfo()->dim);
MNN_PRINT("\nexpected output:");
formatMatrix(outputData.data(), output->getInfo()->dim);
MNN_PRINT("\nexpected output 2:");
formatMatrix(outputDataSeparateBias.data(), output->getInfo()->dim);
MNN_PRINT("\nreal output:");
formatMatrix(outputPtr, output->getInfo()->dim);
}
// With low precision, the relative error of im2col or Strassen convolution against the reference is about 1e-4; winograd has a larger error.
float errorScale = 1.0f;
if (nbit == 4 && weightLength > 10000) {
errorScale = 50.0f;
}
if (precision > MNN::BackendConfig::Precision_High) {
errorScale = 100.0f;
}
if (!checkVectorByRelativeError<float>(outputPtr, outputData.data(), outputDataSeparateBias.data(), outputData.size(), 0.001 * errorScale)) {
MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision);
for (int i = 0; i < outputData.size(); ++i)
{
MNN_PRINT("%f\t, %f\t, %f\n", outputData[i],outputDataSeparateBias[i], outputPtr[i]);
}
MNN_ERROR("%s(%s) test failed for %d bits, async=%d !\n", test_op_name.c_str(), device_name.c_str(), nbit, async);
return false;
}
if (mBenchSpeed) {
int oh = output->getInfo()->dim[2], ow = output->getInfo()->dim[3];
input.fix(VARP::INPUT);
MNN::Timer _t;
const int LOOP = 20;
for (int i = 0; i < LOOP; ++i) {
input->writeMap<float>();
output->readMap<float>();
}
auto time = (float)_t.durationInUs() / 1000.0f;
MNN_PRINT("ConvInt8Weight kernel=(%dx%d) input=(1x%dx%dx%d) output=(1x%dx%dx%d) stride=(%dx%d), avg time = %f\n",
kh, kw, ic, ih, iw, oc, oh, ow, stride, stride, 1.0 * time / LOOP);
}
return true;
}
bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, int batch,
int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, int kw, int stride,
int dilation, int group, int precision, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM, int sparseBlockOC = 1, bool debug = false) {
auto res = testUnit(type, device_name, test_op_name, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group, precision, sparseAlgo, sparseBlockOC, debug, 8, true);
if (!res) {
FUNC_PRINT(1);
return res;
}
res = res && testUnit(type, device_name, test_op_name, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group, precision, sparseAlgo, sparseBlockOC, debug, 8, false);
if (!res) {
FUNC_PRINT(1);
return res;
}
res = res && testUnit(type, device_name, test_op_name, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group, precision, sparseAlgo, sparseBlockOC, debug, 4, true);
if (!res) {
FUNC_PRINT(1);
return res;
}
res = res && testUnit(type, device_name, test_op_name, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group, precision, sparseAlgo, sparseBlockOC, debug, 4, false);
if (!res) {
FUNC_PRINT(1);
return res;
}
return res;
}
};
template <typename ConvolutionType>
class ConvolutionSpeedTest : public ConvolutionType {
public:
virtual ~ConvolutionSpeedTest() = default;
protected:
static bool test(MNNForwardType type, const std::string& device_name, int precision, MNN::SparseAlgo sparseAlgo, int MaxBlock) {
int padW = 1, padH = 1, iw = 28, ih = 28, ic = 128, oc = 128;
std::vector<std::vector<int>> kernels = {
{1, 1}, {3, 3}, {5, 5}, {7, 1}, {1, 7} // {w, h}
};
std::vector<std::string> titles = {"1x1", "3x3", "5x5", "7x1", "1x7"};
for (int i = 0; i < kernels.size(); ++i) {
auto res = ConvolutionType().speed().test(type, device_name, "Conv2D Speed",
1, ic, oc, ih, iw, PadMode_CAFFE, padH, padW, kernels[i][1], kernels[i][0], 1, 1, 1, precision);
if (!res) {
MNN_ERROR("Error for test kernel %s for ConvolutionSpeedTest\n", titles[i].c_str());
return false;
}
}
return true;
}
};
template <typename ConvolutionType>
class ConvolutionTest : public ConvolutionType {
public:
virtual ~ConvolutionTest() = default;
protected:
static bool test(MNNForwardType type, const std::string& device_name, int precision, MNN::SparseAlgo sparseAlgo, std::vector<int> blocks) {
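// Sweep batch, output/input channels, spatial size, kernel size, dilation, stride and sparse block;
// each combination is run with CAFFE, VALID and SAME padding.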
for (int b = 1; b <= 2; b++) {
for (int oc = 1; oc <= 17; oc += 4) {
for (int ic = 1; ic <= 18; ic += 5) {
for (int is = 1; is <= 17; is += 3) {
for (int kw = 1; kw <= 3 && kw <= is; kw+=2) {
for (int kh = 1; kh <= 3 && kh <= is; kh+=3) {
for (int d = 1; d <= 2; d++) {
if (d > is || d * (kw - 1) + 1 > is || d * (kh - 1) + 1 > is)
continue;
for (int s = 1; s <= 2; s++) {
for (auto block : blocks) {
for (int p = 0; p <= 1; p++) {
bool succ =
ConvolutionType().test(type, device_name, "Conv2D", b, ic, oc, is,
is, PadMode_CAFFE, p, p, kh, kw, s, d, 1, precision, sparseAlgo, block, false);
if (!succ) {
MNN_ERROR(
"Error for conv b=%d, oc=%d, ic=%d, ih=%d, "
"iw=%d,kw=%d,kh=%d,d=%d,s=%d,p=%d, block=%d\n",
b, oc, ic, is, is, kw, kh, d, s, p, block);
return false;
}
}
{
bool succ =
ConvolutionType().test(type, device_name, "Conv2D", b, ic, oc, is,
is, PadMode_VALID, 0, 0, kh, kw, s, d, 1, precision, sparseAlgo, block, false);
if (!succ) {
MNN_ERROR(
"Error for conv b=%d, oc=%d, ic=%d, is=%d, is=%d, kw=%d,kh=%d,d=%d,s=%d, block=%d, "
"valid pad\n",
b, oc, ic, is, is, kw, kh, d, s, block);
return false;
}
}
{
bool succ =
ConvolutionType().test(type, device_name, "Conv2D", b, ic, oc, is,
is, PadMode_SAME, 0, 0, kh, kw, s, d, 1, precision, sparseAlgo, block, false);
if (!succ) {
MNN_ERROR(
"Error for conv b=%d, oc=%d, ic=%d, is=%d, is=%d, kw=%d, kh=%d, d=%d, s=%d, block=%d, "
"same pad\n",
b, oc, ic, is, is, kw, kh, d, s, block);
return false;
}
}
}
}
}
}
}
}
}
}
}
// Check Long convolution
bool succ =
ConvolutionType().test(type, device_name, "Conv2D", 1, 256, 256, 24, 24, PadMode_SAME, 0, 0, 3, 3, 1, 1, 1, precision, sparseAlgo, 4, false);
if (!succ) {
MNN_ERROR("Error for long conv\n");
return false;
}
// Previously uncovered, easy-to-break cases.
succ =
ConvolutionType().test(type, device_name, "Conv2D", 1, 3, 16, 256, 256, PadMode_CAFFE, 1, 1, 3, 3, 1, 1, 1, precision, sparseAlgo, 4, false);
if (!succ) {
MNN_ERROR("Error in pick up case 1.\n");
return false;
}
succ =
ConvolutionType().test(type, device_name, "Conv2D", 1, 1, 8, 28, 28, PadMode_CAFFE, 2, 2, 5, 5, 1, 1, 1, precision, sparseAlgo, 1, false);
if (!succ) {
MNN_ERROR("Error in pick up case 2.\n");
return false;
}
succ =
ConvolutionType().test(type, device_name, "Conv2D", 1, 1, 8, 14, 14, PadMode_CAFFE, 2, 2, 5, 5, 1, 1, 1, precision, sparseAlgo, 1, false);
if (!succ) {
MNN_ERROR("Error in pick up case 3.\n");
return false;
}
return true;
}
};
using DenseConvolutionTest = ConvolutionTest<ConvolutionCommonTest>;
class ConvolutionTestOnCPU : public DenseConvolutionTest {
public:
~ConvolutionTestOnCPU() = default;
virtual bool run(int precision) {
return DenseConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, {1});
}
};
using DenseConvolutionInt8Test = ConvolutionTest<ConvolutionInt8CommonTest>;
class ConvolutionInt8Test : public DenseConvolutionInt8Test {
public:
~ConvolutionInt8Test() = default;
virtual bool run(int precision) {
return DenseConvolutionInt8Test::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, {1});
}
};
using DenseConvolutionSpeedTest = ConvolutionSpeedTest<ConvolutionCommonTest>;
class ConvolutionSpeedTestOnCPU : public DenseConvolutionSpeedTest {
public:
~ConvolutionSpeedTestOnCPU() = default;
virtual bool run(int precision) {
return DenseConvolutionSpeedTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, 1);
}
};
using SparseConvolutionTest = ConvolutionTest<SparseConvolutionCommonTest>;
class SparseConvolutionTestOnCPU : public SparseConvolutionTest {
public:
~SparseConvolutionTestOnCPU() = default;
virtual bool run(int precision) {
std::vector<int> blocks = {1, 4, 8};
return SparseConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_SIMD_OC, blocks);
}
};
class DepthwiseConvolutionTest : public ConvolutionCommonTest {
public:
virtual ~DepthwiseConvolutionTest() = default;
protected:
virtual bool run(int precision) {
return test(MNN_FORWARD_CPU, "CPU", precision);
}
static bool test(MNNForwardType type, const std::string& device_name, int precision) {
srand(TEST_RANDOM_SEED);
// correctness unit tests
for (int b = 1; b <= 2; b++) {
for (int oc = 4; oc <= 16; oc *= 2) {
for (int ic = oc; ic <= oc; ic++) {
for (int isw = 1; isw <= 8; ++isw) {
for (int ish = 1; ish <= 8; ++ish) {
for (int kw = 1; kw <= 4; kw++) {
for (int kh = 1; kh <= 4; kh++) {
for (int d = 1; d <= 2; d++) {
for (int s = 1; s <= 2; s++) {
for (int p = 0; p <= std::min(kw, kh); p++) {
// depthwise <==> group == outputChannel
bool succ = ConvolutionCommonTest().test(
type, device_name, "DepthwiseConv2D", b, ic, oc, ish, isw, PadMode_CAFFE,
p, p, kh, kw, s, d, oc, precision);
if (!succ) {
MNN_ERROR(
"Error for dw oc=%d, ic=%d, ih=%d, iw = %d, kw=%d,kh=%d,d=%d,s=%d,p=%d\n", oc,
ic, ish, isw, kw, kh, d, s, p);
#ifdef DEBUG
// Rerun the failing case to make debugging easier
ConvolutionCommonTest().test(
type, device_name, "DepthwiseConv2D", b, ic, oc, ish, isw, PadMode_CAFFE,
p, p, kh, kw, s, d, oc, precision);
#endif
return false;
}
}
}
}
}
}
}
}
}
}
}
// memory leak unit test
int b = 1, oc = 4, ic = oc, group = oc, is = 2, p = 1, kh = 3, kw = 3, s = 2, d = 1;
return ConvolutionCommonTest().test(type, device_name, "DepthwiseConv2D", b, ic, oc, is, is,
PadMode_CAFFE, p, p, kh, kw, s, d, group, precision);
}
};
class GroupConvolutionTest : public ConvolutionCommonTest {
public:
virtual ~GroupConvolutionTest() = default;
protected:
static bool test(MNNForwardType type, const std::string& device_name, int precision) {
srand(TEST_RANDOM_SEED);
bool succ = ConvolutionCommonTest().test(
type, device_name, "GroupConv2D", 2, 8, 16, 1, 1, PadMode_CAFFE,
0, 0, 1, 1, 1, 1, 2, precision, MNN::SparseAlgo_RANDOM, 1, false);
if (!succ) {
return false;
}
for (int b = 1; b <= 2; b++) {
for (int g = 2; g <= 4; g *= 2) {
for (int oc = g * 4; oc <= 4 * g * 4; oc += g * 4) {
for (int ic = g * 4; ic <= 4 * g * 4; ic += g * 4) {
for (int is = 1; is <= 8; is *= 2) {
for (int kw = 1; kw <= 3 && kw <= is; kw++) {
for (int kh = 1; kh <= 3 && kh <= is; kh++) {
for (int d = 1; d <= 2; d++) {
if (d > std::min(kw, kh) || d * (std::max(kw, kh) - 1) + 1 > is)
continue;
for (int s = 1; s <= 2; s++) {
for (int p = 0; p <= 1; p++) {
bool debug = false;
bool succ = ConvolutionCommonTest().test(
type, device_name, "GroupConv2D", b, ic, oc, is, is, PadMode_CAFFE,
p, p, kh, kw, s, d, g, precision, MNN::SparseAlgo_RANDOM, 1, debug);
if (!succ) {
MNN_PRINT("convolution group b=%d, oc=%d, ic=%d, is=%d,kw=%d,kh=%d,d=%d,s=%d,g=%d,p=%d\n", b, oc,
ic, is, kw, kh, d, s, g, p);
return false;
}
}
}
}
}
}
}
}
}
}
}
return true;
}
};
class GroupConvolutionTestOnCPU : public GroupConvolutionTest {
public:
virtual ~GroupConvolutionTestOnCPU() = default;
virtual bool run(int precision) {
return GroupConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision);
}
};
MNNTestSuiteRegister(ConvolutionTestOnCPU, "op/convolution/conv2d");
MNNTestSuiteRegister(ConvolutionInt8Test, "op/convolution/weighti8i4conv2d");
MNNTestSuiteRegister(ConvolutionSpeedTestOnCPU, "speed/convolution/conv2d");
MNNTestSuiteRegister(SparseConvolutionTestOnCPU, "op/convolution/sparse_conv2d");
MNNTestSuiteRegister(DepthwiseConvolutionTest, "op/convolution/depthwise_conv");
MNNTestSuiteRegister(GroupConvolutionTestOnCPU, "op/convolution/conv_group");