//
//  ConvInt8Test.cpp
//  MNNTests
//
//  Created by MNN on 2020/02/19.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <math.h>
#include <random>
#include <MNN/expr/ExprCreator.hpp>
#include "MNN_generated.h"
#include "MNNTestSuite.h"
#include "TestUtils.h"
#include "core/CommonCompute.hpp"
#include "core/MemoryFormater.h"
#include "core/WinogradInt8Attr.hpp"
#include "math/WingoradGenerater.hpp"
#include <MNN/AutoTime.hpp>

using namespace MNN::Express;
using namespace MNN;
static PadMode _convertPadMode(PaddingMode mode) {
    switch (mode) {
        case CAFFE:
            return PadMode_CAFFE;
        case VALID:
            return PadMode_VALID;
        case SAME:
            return PadMode_SAME;
        default:
            break;
    }
    return PadMode_CAFFE;
}

inline int8_t int32ToInt8(int data, int bias, float scale) {
    float value = roundf((float)(data + bias) * scale);
    value = std::max(value, -127.0f);
    value = std::min(value, 127.0f);
    return static_cast<int8_t>(value);
}
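
// int32ToInt8 above implements the requantization step
//   y = clamp(round((acc + bias) * scale), -127, 127).
// Worked example (values chosen here purely for illustration): acc = 190,
// bias = 10, scale = 0.05f gives round(200 * 0.05) = 10; an acc of 10000
// would saturate to 127.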

VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel,
           INTS kernelSize, PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu,
           int8_t inputZeroPoint, int8_t outputZeroPoint, int8_t minValue, int8_t maxValue, bool accumulateToInt16,
           MNN::SparseAlgo sparseAlgo, int sparseBlockOC) {
    std::unique_ptr<OpT> convOp(new OpT);
    convOp->type = OpType_ConvInt8;
    if (channel[0] == channel[1] && channel[0] == group) {
        convOp->type = OpType_DepthwiseConvInt8;
    }
    convOp->main.type  = OpParameter_Convolution2D;
    convOp->main.value = new Convolution2DT;
    auto conv2D        = convOp->main.AsConvolution2D();
    conv2D->common.reset(new Convolution2DCommonT);
    conv2D->common->padMode = _convertPadMode(pad);
    if (pads.size() == 2) {
        conv2D->common->padX = pads[0];
        conv2D->common->padY = pads[1];
    } else {
        conv2D->common->pads = std::move(pads);
    }
    conv2D->common->strideX     = stride[0];
    conv2D->common->strideY     = stride[1];
    conv2D->common->group       = group;
    conv2D->common->outputCount = channel[1];
    conv2D->common->inputCount  = channel[0];
    conv2D->common->dilateX     = dilate[0];
    conv2D->common->dilateY     = dilate[1];
    conv2D->common->kernelX     = kernelSize[0];
    conv2D->common->kernelY     = kernelSize[1];
    conv2D->common->relu        = relu;
    MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
    conv2D->symmetricQuan.reset(new QuantizedFloatParamT);

    if (sparseAlgo == MNN::SparseAlgo_RANDOM || sparseAlgo == MNN::SparseAlgo_SIMD_OC) {
        size_t weightNNZElement = 0, weightBlockNumber = 0;
        CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, weight.data(), bias.size(), weight.size() / bias.size(), sparseBlockOC);
        std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
        arg1->key = "sparseBlockOC";
        arg1->i   = sparseBlockOC;

        std::unique_ptr<MNN::AttributeT> arg2(new MNN::AttributeT);
        arg2->key = "sparseBlockKernel";
        arg2->i   = 1;

        std::unique_ptr<MNN::AttributeT> arg3(new MNN::AttributeT);
        arg3->key = "NNZElement";
        arg3->i   = static_cast<int32_t>(weightNNZElement);

        std::unique_ptr<MNN::AttributeT> arg4(new MNN::AttributeT);
        arg4->key = "blockNumber";
        arg4->i   = static_cast<int32_t>(weightBlockNumber);

        flatbuffers::FlatBufferBuilder builder;
        std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
        auto sparseArg1 = MNN::CreateAttribute(builder, arg1.get());
        auto sparseArg2 = MNN::CreateAttribute(builder, arg2.get());
        auto sparseArg3 = MNN::CreateAttribute(builder, arg3.get());
        auto sparseArg4 = MNN::CreateAttribute(builder, arg4.get());

        argsVector.emplace_back(sparseArg1);
        argsVector.emplace_back(sparseArg2);
        argsVector.emplace_back(sparseArg3);
        argsVector.emplace_back(sparseArg4);

        auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
        auto sparseCom  = MNN::CreateSparseCommon(builder, sparseAlgo, sparseArgs);
        builder.Finish(sparseCom);
        auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();

        conv2D->sparseParameter.reset(sparseComPtr);
    }
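    // The SparseCommon table built above carries the sparsity statistics
    // (non-zero element count and block count from statisticWeightSparsity)
    // that the sparse int8 backend reads back at load time;
    // CreateVectorOfSortedTables stores the attributes sorted by key so they
    // can be found by binary search.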

    if (bias.size() == 0) {
        bias.resize(channel[1]);
        std::fill(bias.begin(), bias.end(), 0);
    }
    conv2D->symmetricQuan->bias            = std::move(bias);
    conv2D->symmetricQuan->scale           = std::move(scale);
    conv2D->symmetricQuan->zeroPoint       = inputZeroPoint;
    conv2D->symmetricQuan->outputZeroPoint = outputZeroPoint;
    MNN_ASSERT(maxValue > minValue);
    conv2D->symmetricQuan->clampMin = minValue;
    conv2D->symmetricQuan->clampMax = maxValue;
    conv2D->symmetricQuan->weight   = std::move(weight);

    if (accumulateToInt16) {
        conv2D->symmetricQuan->method = MNN::QuantizeAlgo::QuantizeAlgo_OVERFLOW_AWARE;
    }

    return (Variable::create(Expr::create(convOp.get(), {x})));
}
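
// Minimal usage sketch for _Conv above (names and sizes chosen purely for
// illustration): a dense symmetric-quant 3x3 int8 conv, 4 -> 4 channels,
// over an NC4HW4 int8 input xC4:
//   std::vector<int8_t> w(4 * 4 * 3 * 3, 1);
//   std::vector<int>    b(4, 0);
//   std::vector<float>  s(4, 0.05f);
//   auto y = _Conv(std::move(w), std::move(b), std::move(s), xC4, {4, 4}, {3, 3},
//                  PaddingMode::CAFFE, {1, 1}, {1, 1}, 1, {1, 1}, false,
//                  0, 0, -127, 127, false, MNN::SparseAlgo_RANDOM, 1);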

// y = Conv(x, w); x and y are in C4-ordered format, weight is in [oc, ic, kh, kw] raw format.
// For grouped convolution the weight is interpreted as [group, ocGroup, icGroup, kh, kw].
static std::vector<int8_t> naiveConvInt8(const int8_t* x, const int8_t* weight, const int* bias, const float* scale,
    int ow, int oh, int iw, int ih, int ic, int oc, int kw, int kh, int padX, int padY, int group, int padValue = 0,
    int strideX = 1, int strideY = 1, int dilateX = 1, int dilateY = 1, int batch = 1) {
    int ocGroup = oc / group, icGroup = ic / group;
    std::vector<int8_t> yCorrect(batch * oc * oh * ow, 0);
    for (int b = 0; b < batch; ++b) {
        for (int oz = 0; oz < oc; ++oz) {
            int gId = oz / ocGroup;
            for (int oy = 0; oy < oh; ++oy) {
                for (int ox = 0; ox < ow; ++ox) {
                    int32_t yInt32  = 0;
                    auto destOffset = ((b * oc + oz) * oh + oy) * ow + ox;
                    for (int sz = gId * icGroup; sz < (gId + 1) * icGroup; ++sz) {
                        for (int ky = 0; ky < kh; ++ky) {
                            for (int kx = 0; kx < kw; ++kx) {
                                int ix = ox * strideX + kx * dilateX - padX, iy = oy * strideY + ky * dilateY - padY;
                                int8_t xValue = padValue;
                                if (ix >= 0 && ix < iw && iy >= 0 && iy < ih) {
                                    xValue = x[(((b * ic + sz) * ih + iy) * iw + ix)];
                                }
                                yInt32 += xValue * weight[(((gId * ocGroup + oz % ocGroup) * icGroup + sz % icGroup) * kh + ky) * kw + kx];
                            }
                        }
                    }
                    yCorrect[destOffset] = int32ToInt8(yInt32, bias[oz], scale[oz]);
                }
            }
        }
    }
    return yCorrect;
}
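
// Per output element, the reference above computes
//   y[b, oz, oy, ox] = int32ToInt8( sum over sz, ky, kx of
//                                   x[b, sz, iy, ix] * w[oz, sz, ky, kx],
//                                   bias[oz], scale[oz] )
// with iy = oy * strideY + ky * dilateY - padY and
//      ix = ox * strideX + kx * dilateX - padX; out-of-bounds taps read padValue.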

class ConvInt8TestCommon : public MNNTestCase {
protected:
    virtual void generateWeight(std::vector<int8_t>& weight, int ic, int oc, int kh, int kw, int group, int xMax, int xMin, int sparseBlockOC) {
        for (int i = 0; i < oc / group; ++i) {
            for (int j = 0; j < ic; ++j) {
                auto weightCurrent = weight.data() + (i * ic + j) * kw * kh;
                for (int k = 0; k < kw * kh; ++k) {
                    weightCurrent[k] = ((i * i + j * j + k * k) % (xMax - xMin + 1)) + xMin; // w in [xMin, xMax]
                }
            }
        }
    }

    bool testKernel(INTS inputShape, INTS kernel, INTS channel, INTS pad, INTS strides, INTS dilate, int nbit = 8,
                    bool overflow = false, int group = 1, int batch = 1, MNN::SparseAlgo sparseAlgo = MNN::SparseAlgo_RANDOM,
                    int sparseBlockOC = 1, bool debug = false, bool speed = false) {
        std::vector<int> bias(channel[1]);
        std::vector<float> scale(channel[1]);
        std::vector<int8_t> weight(channel[1] * channel[0] / group * kernel[0] * kernel[1]);
        int iw = inputShape[0], ih = inputShape[1];
        VARP x     = _Input({batch, channel[0], ih, iw}, NCHW, halide_type_of<int8_t>());
        auto xInfo = x->getInfo();
        auto xPtr  = x->writeMap<int8_t>();
        int8_t xMin = -(1 << (nbit - 1)) + 1, xMax = (1 << (nbit - 1)) - 1;
        for (int i = 0; i < xInfo->size; ++i) {
            xPtr[i] = (i % (xMax - xMin + 1)) + xMin; // x in [xMin, xMax]
        }

        for (int i = 0; i < bias.size(); ++i) {
            // bias[i] = 0;
            // scale[i] = 1;
            bias[i]  = (10000 + i * i * 10 - i * i * i) % 12580;
            scale[i] = ((127 - i) * i % 128) / 20000.0f;
        }

        generateWeight(weight, channel[0], channel[1], kernel[1], kernel[0], group, xMax, xMin, sparseBlockOC);

        if (debug) {
            MNN_PRINT("\nxPtr data :\n");
            formatMatrix(xPtr, {batch, channel[0], ih, iw});
            MNN_PRINT("\nweight data:\n");
            formatMatrix(weight.data(), {channel[1], channel[0], kernel[0], kernel[1]});
            MNN_PRINT("\nscale data:\n");
            formatMatrix(scale.data(), {static_cast<int>(scale.size())});
            MNN_PRINT("\nbias data:\n");
            formatMatrix(bias.data(), {static_cast<int>(bias.size())});
        }

        auto saveWeight = weight;
        auto saveBias   = bias;
        auto saveScale  = scale;

        VARP y;
        auto xC4 = _Convert(x, NC4HW4);
        // For SSE we use uint8 instead of int8; use FloatToInt8 to hide that detail
        xC4 = _FloatToInt8(_Cast<float>(xC4), _Scalar<float>(1.0f), -127, 127);
        if (overflow) {
            y = _Conv(std::vector<int8_t>(weight), std::vector<int>(bias), std::vector<float>(scale), xC4,
                      channel, kernel, PaddingMode::CAFFE, strides, dilate, group, pad, false, 0, 0, -127, 127, true, sparseAlgo, sparseBlockOC);
        } else {
            y = _Conv(std::vector<int8_t>(weight), std::vector<int>(bias), std::vector<float>(scale), xC4,
                      channel, kernel, PaddingMode::CAFFE, strides, dilate, group, pad, false, 0, 0, -127, 127, false, sparseAlgo, sparseBlockOC);
        }
        bool testDepthwise = false;
        if (channel[0] == channel[1] && channel[0] == group) {
            testDepthwise = true;
        }
        y = _Int8ToFloat(y, _Scalar<float>(1.0f));
        y = _Cast<int8_t>(y);
        y = _Convert(y, NCHW);
        auto yInfo = y->getInfo();
        auto ow = yInfo->dim[3], oh = yInfo->dim[2];
        auto targetValues = naiveConvInt8(xPtr, saveWeight.data(), saveBias.data(), saveScale.data(),
                                          ow, oh, iw, ih, channel[0], channel[1], kernel[0], kernel[1], pad[0], pad[1], group, 0, strides[0], strides[1], dilate[0], dilate[1], batch);
        auto yPtr = y->readMap<int8_t>();
        if (debug) {
            MNN_PRINT("\ndebug expected output nchw");
            formatMatrix(targetValues.data(), {yInfo->dim[0], yInfo->dim[1] / 4, yInfo->dim[2], yInfo->dim[3], 4});
            MNN_PRINT("\nreal output:");
            formatMatrix(yPtr, {yInfo->dim[0], yInfo->dim[1] / 4, yInfo->dim[2], yInfo->dim[3], 4});
        }
        for (int i = 0; i < targetValues.size(); ++i) {
            int8_t targetValue = targetValues[i], computeResult = yPtr[i];
            // Rounding implementations differ slightly across ARM / x86 and may cause a +/-1 diff; tolerate that error
            auto error = (int32_t)targetValue - (int32_t)computeResult;
            if (error * error > 1) {
                MNN_PRINT("%d x %d, ConvInt8 result %d Error: %d -> %d\n", ow, oh, i, targetValue, computeResult);
#ifdef DEBUG
                x->writeMap<int8_t>();
                auto ptr = y->readMap<int8_t>();
                FUNC_PRINT_ALL(ptr, p);
#endif
                return false;
            }
        }
        if (speed) {
            x.fix(VARP::INPUT);
            // warm up, do onResize first for shapeDirty
            x->writeMap<float>();
            y->readMap<float>();

            MNN::Timer _t;
            const int LOOP = 100;
            for (int i = 0; i < LOOP; ++i) {
                x->writeMap<float>();
                y->readMap<float>();
            }
            auto time = (float)_t.durationInUs() / 1000.0f;
            MNN_PRINT("DepthwiseConvInt8 Speed: input = (1x%dx%dx%d), kernel=(%dx%dx%d), avg time=%f\n",
                      channel[0], ih, iw, channel[0], kernel[0], kernel[1], time / LOOP);
        }
        return true;
    }
};

class ConvInt8Im2colGemmTest : public ConvInt8TestCommon {
public:
    virtual bool run(int precision) {
        std::vector<std::vector<int>> kernels = {
            {4, 2}, {1, 5}, {7, 1}
        };
        int iw = 34;
        int ih = 23;
        std::vector<std::string> titles = {"4x2", "1x5", "7x1"};
        for (int sx = 1; sx < 2; ++sx) {
            for (int sy = 1; sy < 2; ++sy) {
                for (int dx = 1; dx < 2; ++dx) {
                    for (int dy = 1; dy < 2; ++dy) {
                        for (int px = 2; px < 4; ++px) {
                            for (int py = 3; py < 4; ++py) {
                                for (int ic = 1; ic <= 64; ic *= 8) {
                                    for (int oc = 1; oc <= 64; oc *= 8) {
                                        INTS strides = {sx, sy}, dilate = {dx, dy}, pad = {px, py}, inputShape = {iw, ih};
                                        INTS channel = {ic, oc};
                                        for (int i = 0; i < kernels.size(); ++i) {
                                            auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 2, MNN::SparseAlgo_RANDOM, 1, false);
                                            if (!res) {
                                                MNN_ERROR("Error for test kernel %s for convint8 batch=2 (im2col + gemm)\n", titles[i].c_str());
                                                return false;
                                            }
                                        }
                                        for (int i = 0; i < kernels.size(); ++i) {
                                            auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 3, MNN::SparseAlgo_RANDOM, 1, false);
                                            if (!res) {
                                                MNN_ERROR("Error for test kernel %s for convint8 batch=3 (im2col + gemm + overflow aware)\n", titles[i].c_str());
                                                return false;
                                            }
                                        }
                                        for (int i = 0; i < kernels.size(); ++i) {
                                            auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 5, MNN::SparseAlgo_RANDOM, 1, false);
                                            if (!res) {
                                                MNN_ERROR("Error for test kernel %s for convint8 batch=5 (im2col + gemm)\n", titles[i].c_str());
                                                return false;
                                            }
                                        }
                                        for (int i = 0; i < kernels.size(); ++i) {
                                            auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 2, MNN::SparseAlgo_RANDOM, 1, false);
                                            if (!res) {
                                                MNN_ERROR("Error for test kernel %s for convint8 batch=2 (im2col + gemm + overflow aware)\n", titles[i].c_str());
                                                return false;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        return true;
    }
};

class SparseConvInt8Im2colGemmTest : public ConvInt8TestCommon {
public:
    virtual void generateWeight(std::vector<int8_t>& weight, int ic, int oc, int kh, int kw, int group, int xMax, int xMin, int sparseBlockOC) {
        assert(sparseBlockOC);
        int ocEven          = (group * (oc / group) / sparseBlockOC) * sparseBlockOC;
        int reduceDimLength = (ic / group) * kw * kh;
        weight.resize(group * (oc / group) * reduceDimLength);
        size_t ioc   = 0;
        size_t index = 0;
        for (; ioc < ocEven; ioc += sparseBlockOC) {
            for (size_t i = 0; i < reduceDimLength; i++) {
                index       = ioc * reduceDimLength + i;
                bool isZero = index % 4 != 0;
                for (int iblock = 0; iblock < sparseBlockOC; iblock++) {
                    if (isZero) {
                        weight[index] = 0;
                    } else {
                        auto data     = (index / kw) * (index / kh) + index / ic + index / oc + (oc - index) * ic + index * (oc - index);
                        weight[index] = (data % (xMax - xMin + 1)) + xMin;
                    }
                    index += reduceDimLength;
                }
            }
        }
        for (; ioc < oc; ioc++) {
            for (size_t i = 0; i < reduceDimLength; i++) {
                index       = ioc * reduceDimLength + i;
                bool isZero = index % 4 != 0;
                if (isZero) {
                    weight[index] = 0;
                } else {
                    auto data     = (index / kw) * (index / kh) + index / ic + index / oc + (oc - index) * ic + index * (oc - index);
                    weight[index] = (data % (xMax - xMin + 1)) + xMin;
                }
            }
        }
        return;
    }
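
    // The pattern above zeroes every element whose flat index is not a
    // multiple of 4 (roughly 75% sparsity) and repeats the same zero/non-zero
    // mask across each block of sparseBlockOC consecutive output channels, so
    // the SIMD_OC block-sparse path has whole-block zeros to exploit.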

    virtual bool run(int precision) {
        std::vector<std::pair<MNN::SparseAlgo, int>> SparseList = {{SparseAlgo_RANDOM, 1}, {MNN::SparseAlgo_SIMD_OC, 4}};

        for (int is = 0; is < SparseList.size(); ++is) {
            // INTS strides = {1, 1}, dilate = {1, 1}, pad = {3, 4}, inputShape = {215, 204}; // {w, h}
            // INTS channel = {64, 64}; // {ci, co}
            // std::vector<std::vector<int>> kernels = {
            //     {4, 2}, {1, 5}, {7, 1}
            // };

            INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {6, 6}; // {w, h}
            INTS channel = {8, 8};                                                     // {ci, co}
            std::vector<std::vector<int>> kernels = {
                {3, 3}, {1, 5}, {7, 1}
            };

            std::vector<std::string> titles = {"3x3", "1x5", "7x1"};
            for (int i = 0; i < kernels.size(); ++i) {
                auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 2, SparseList[is].first, SparseList[is].second, false);
                if (!res) {
                    MNN_ERROR("Error for test kernel %s for convint8 6, 6 (im2col + spmm)\n", titles[i].c_str());
                    return false;
                }
            }
            for (int i = 0; i < kernels.size(); ++i) {
                auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 3, SparseList[is].first, SparseList[is].second, false);
                if (!res) {
                    MNN_ERROR("Error for test kernel %s for convint8 6, 6 (im2col + spmm + overflow aware)\n", titles[i].c_str());
                    return false;
                }
            }
            inputShape = {215, 201};
            for (int i = 0; i < kernels.size(); ++i) {
                auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 8, false, 1, 5, SparseList[is].first, SparseList[is].second, false);
                if (!res) {
                    MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + spmm)\n", titles[i].c_str());
                    return false;
                }
            }
            for (int i = 0; i < kernels.size(); ++i) {
                auto res = testKernel(inputShape, kernels[i], channel, pad, strides, dilate, 3, true, 1, 2, SparseList[is].first, SparseList[is].second, false);
                if (!res) {
                    MNN_ERROR("Error for test kernel %s for convint8 215, 201 (im2col + spmm + overflow aware)\n", titles[i].c_str());
                    return false;
                }
            }
        }
        return true;
    }
};

class ConvInt8WinogradTestCommon : public MNNTestCase {
public:
    static VARP referenceWinograd(const VARP xInt, const std::vector<int8_t>& weight, const std::vector<float>& wScale, const std::vector<float>& bias, INTS kernel, INTS channel, INTS pads, const WinogradInt8Attr::Attr& attr, float xScale, float yScale, int8_t xZeroPoint, int8_t yZeroPoint, bool relu) {
        auto clamp = [](VARP x) { return _Maximum(_Minimum(x, _Scalar<float>(127)), _Scalar<float>(-127)); };

        //auto round = [](VARP x) { return _Round(x); };
        auto roundWithEps = [](VARP x) { return _Round(x + _Sign(x) * _Scalar<float>(1e-6)); };

        auto inDims = xInt->getInfo()->dim;
        int batch = inDims[0], inH = inDims[2], inW = inDims[3];
        int outChannel = channel[1], inChannel = channel[0], kernelH = kernel[1], kernelW = kernel[0];
        int padW = pads[0], padH = pads[1];
        int outH = inH + 2 * padH - kernelH + 1, outW = inW + 2 * padW - kernelW + 1;
        int unitH = attr.unitY, unitW = attr.unitX, unitNumH = UP_DIV(outH, unitH), unitNumW = UP_DIV(outW, unitW);
        int alphaH = unitH + kernelH - 1, alphaW = unitW + kernelW - 1;

        int needH = unitNumH * unitH + kernelH - 1, needW = unitNumW * unitW + kernelW - 1;
        int paddings[] = {0, 0, 0, 0, padH, needH - inH - padH, padW, needW - inW - padW};

        auto xx = _Int8ToFloat(xInt, _Scalar<float>(xScale), xZeroPoint);
        xx = _Convert(xx, NCHW);
        xx = _Pad(xx, _Const(paddings, {8}, NCHW, halide_type_of<int32_t>()));
        // [ic * alphaH * alphaW, N * h_unit_num * w_unit_num]
        xx = _Im2Col(xx, {alphaW, alphaH}, {1, 1}, {0, 0}, {unitW, unitH});
        // [N * h_unit_num * w_unit_num, ic, alphaH, alphaW]
        xx = _Transpose(_Reshape(xx, {inChannel, alphaH, alphaW, -1}), {3, 0, 1, 2});
        Math::WinogradGenerater genH(unitH, kernelH, 1, true), genW(unitW, kernelW, 1, true);
        auto srcTransH = _Const(genH.B()->host<void>(), {alphaH, alphaH}, NCHW);
        auto srcTransW = _Const(genW.B()->host<void>(), {alphaW, alphaW}, NCHW);
        xx = _MatMul(_MatMul(_Transpose(srcTransH, {1, 0}), xx), srcTransW);
        // [alphaH * alphaW, ic, N * h_unit_num * w_unit_num]
        xx = _Reshape(_Transpose(xx, {2, 3, 1, 0}), {alphaH * alphaW, inChannel, -1});

        // simulate input asym quant
        auto xxScale = _Const(attr.inputScales.data(), {alphaH * alphaW, 1, 1}, NCHW);
        auto xxZeroPoint = _Cast<float>(_Const(attr.inputZeroPoints.data(), {alphaH * alphaW, 1, 1}, NCHW, halide_type_of<int>()));
        xx = (clamp(_Round(xx / xxScale + xxZeroPoint)) - xxZeroPoint) * xxScale;

        auto w = _Const(weight.data(), {outChannel, inChannel, kernelH, kernelW}, NCHW, halide_type_of<int8_t>());
        w = _Cast<float>(w) * _Const(wScale.data(), {outChannel, 1, 1, 1}, NCHW);
        auto wTransH = _Const(genH.G()->host<void>(), {alphaH, kernelH}, NCHW);
        auto wTransW = _Const(genW.G()->host<void>(), {alphaW, kernelW}, NCHW);
        // [oc, ic, alphaH, alphaW]
        auto ww = _MatMul(_MatMul(wTransH, w), _Transpose(wTransW, {1, 0}));
        // [alphaH * alphaW, oc, ic]
        ww = _Transpose(_Reshape(ww, {outChannel, inChannel, -1}), {2, 0, 1});
        // simulate weight quant
        auto wwScale = _Const(attr.weightScales.data(), {alphaH * alphaW, outChannel, 1}, NCHW);
        ww = clamp(roundWithEps(ww / wwScale));
        ww = ww * wwScale;

        // [alphaH * alphaW, oc, N * h_unit_num * w_unit_num]
        auto yy = _MatMul(ww, xx);
        // [oc, N * h_unit_num * w_unit_num, alphaH, alphaW]
        yy = _Reshape(_Transpose(yy, {1, 2, 0}), {outChannel, -1, alphaH, alphaW});
        auto dstTransH = _Const(genH.A()->host<void>(), {alphaH, unitH}, NCHW);
        auto dstTransW = _Const(genW.A()->host<void>(), {alphaW, unitW}, NCHW);
        // [oc, N * h_unit_num * w_unit_num, unitH, unitW]
        yy = _MatMul(_MatMul(_Transpose(dstTransH, {1, 0}), yy), dstTransW);
        // [N, oc, h_unit_num * unitH, w_unit_num * unitW]
        yy = _Reshape(_Transpose(_Reshape(yy, {outChannel, batch, unitNumH, unitNumW, unitH, unitW}), {1, 0, 2, 4, 3, 5}), {batch, outChannel, unitNumH * unitH, unitNumW * unitW});
        int sliceStartData[] = {0, 0, 0, 0}, sliceEndData[] = {-1, -1, outH, outW};
        yy = _Slice(yy, _Const(sliceStartData, {4}, NCHW), _Const(sliceEndData, {4}, NCHW));
        // TODO: add operator!= to VARP
        if (!bias.empty()) {
            yy = yy + _Const(bias.data(), {1, outChannel, 1, 1}, NCHW);
        }
        if (relu) {
            yy = _Maximum(yy, _Scalar<float>(0));
        }
        yy = _Convert(yy, NC4HW4);
        yy = _FloatToInt8(yy, _Scalar<float>(1.0 / yScale), -127, 127, yZeroPoint);
        return yy;
    }
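
    // The reference above follows the standard Winograd form
    //   Y = A^T * [ (G * W * G^T) (.) (B^T * X * B) ] * A
    // applied separably along H and W; for multi-channel convolution the
    // elementwise product (.) becomes, per transform position, a matmul that
    // also reduces over input channels. Fake-quant rounds on the transformed
    // input and weight mimic what the int8 Winograd kernel does.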

    static bool testKernel(INTS inputShape, INTS kernel, INTS channel, INTS pads, INTS alphas, bool speed, std::string title, bool relu = true) {
        int ic = channel[0], oc = channel[1], iw = inputShape[0], ih = inputShape[1], kx = kernel[0], ky = kernel[1], alpha2 = alphas[0] * alphas[1];
        for (int batchSize = 1; batchSize <= 3; ++batchSize) {
            VARP x = _Input({batchSize, ic, ih, iw}, NCHW);
            auto xPtr = x->writeMap<float>();
            float xMin = std::numeric_limits<float>::max(), xMax = std::numeric_limits<float>::lowest();
            for (int i = 0; i < x->getInfo()->size; ++i) {
                xPtr[i] = i % 128; // x in [0, 127], same as relu output, test asym quant
                xMin = std::min(xMin, xPtr[i]);
                xMax = std::max(xMax, xPtr[i]);
            }
            float xScale = (xMax - xMin) / (2.0 * 127), yScale = 0.5;
            int8_t xZeroPoint = roundf((0 - xMin) / xScale - 127), yZeroPoint = 1;

            int wMin = -3, wMax = 3;
            std::vector<float> wScale(oc), bias(oc);
            std::vector<int8_t> weight(oc * ic * ky * kx);
            for (int oz = 0; oz < oc; ++oz) {
                wScale[oz] = (oz % 11) * 0.1 + 0.5; // wScale in [0.5, 1.5]
                bias[oz]   = (oz % 5) * 0.5 - 1;    // bias in [-1, 1]
                for (int sz = 0; sz < ic; ++sz) {
                    for (int k = 0; k < ky * kx; ++k) {
                        weight[(oz * ic + sz) * ky * kx + k] = ((oz * ic + sz) * ky * kx + k) % (wMax - wMin + 1) + wMin;
                        //weight[(oz * ic + sz) * ky * kx + k] = (oz * oz + sz * sz + k * k) % (wMax - wMin + 1) + wMin; // w in [wMin, wMax]
                    }
                }
            }

            x = _Convert(x, NC4HW4);
            // For SSE we use uint8 instead of int8; use FloatToInt8 to hide that detail
            x = _FloatToInt8(x, _Scalar<float>(1.0 / xScale), -127, 127, xZeroPoint);

            WinogradInt8Attr attrs;
            std::vector<float> transInputScales(alpha2, 0.9), transWeightScales(alpha2 * oc, 1.1);
            std::vector<int> transInputZeroPoint(alpha2, 1);
            attrs.add(0, 0, ky, kx, alphas[1] - ky + 1, alphas[0] - kx + 1, transInputScales, transWeightScales, transInputZeroPoint);
            auto yTarget = referenceWinograd(x, weight, wScale, bias, kernel, channel, pads, attrs.attrs[0], xScale, yScale, xZeroPoint, yZeroPoint, relu);
            auto y = _Conv(std::move(weight), std::move(bias), std::move(wScale), x, channel,
                           kernel, CAFFE, {1, 1}, {1, 1}, 1, pads, relu, xScale, yScale, xZeroPoint, yZeroPoint,
                           -127, 127, 127, false);
            y = attrs.turnToWinogradConv(y);

            yTarget = _Convert(_Cast<int>(_Int8ToFloat(yTarget, _Scalar<float>(1.0))), NCHW);
            y       = _Convert(_Cast<int>(_Int8ToFloat(y, _Scalar<float>(1.0))), NCHW);
            auto yTargetInfo = yTarget->getInfo(), yInfo = y->getInfo();
            if (yTargetInfo == nullptr || yInfo == nullptr || yTargetInfo->size != yInfo->size) {
                MNN_ERROR("[ConvInt8WinogradTestCommon] getInfo not match\n");
                return false;
            }
            auto yTargetPtr = yTarget->readMap<int>(), yPtr = y->readMap<int>();
            if (yTargetPtr == nullptr || yPtr == nullptr) {
                MNN_ERROR("[ConvInt8WinogradTestCommon] result is nullptr\n");
                return false;
            }
            if (!checkVector<int>(yPtr, yTargetPtr, yInfo->size, 1)) {
                MNN_ERROR("[ConvInt8WinogradTestCommon] result error for batchSize = %d\n", batchSize);
                return false;
            }
            if (speed) {
                x.fix(VARP::INPUT);
                // warm up, do onResize first for shapeDirty
                x->writeMap<float>();
                y->readMap<float>();

                MNN::Timer _t;
                const int LOOP = 20;
                for (int i = 0; i < LOOP; ++i) {
                    x->writeMap<float>();
                    y->readMap<float>();
                }
                auto time = (float)_t.durationInUs() / 1000.0f;
                MNN_PRINT("ConvInt8 Winograd %s input=(%dx%dx%dx%d) kernel=(%dx%dx%dx%d) avg time = %.2f\n",
                          title.c_str(), batchSize, ic, ih, iw, oc, ic, ky, kx, 1.0 * time / LOOP);
            }
        }
        return true;
    }
};

class ConvInt8WinogradTest : public ConvInt8WinogradTestCommon {
    virtual bool run(int precision) {
        INTS pad = {1, 1}, inputShape = {128, 128}; // {w, h}
        INTS channel = {32, 32};                    // {ci, co}

        std::vector<std::vector<int>> kernels = {
            {3, 3}//, {3, 2}, {2, 3}, {2, 2}//, {4, 4}, {7, 1}, {1, 7} // {w, h}
        };
        std::vector<std::string> titles = {
            "3x3", "3x2", "2x3", "2x2", "4x4", "7x1", "1x7"
        };
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], channel, pad, {4, 4}, false, titles[i] + ",alpha=4");
            if (!res) {
                MNN_ERROR("Error for test kernel %s for convint8 (winograd)\n", titles[i].c_str());
                return false;
            }
            /*res = testKernel(inputShape, kernels[i], channel, pad, {6, 6}, false, titles[i] + ",alpha=6");
            if (!res) {
                MNN_ERROR("Error for test kernel %s for convint8 (winograd)\n", titles[i].c_str());
                return false;
            }*/
        }
        return true;
    }
};

class ConvSpeedInt8WinogradTest : public ConvInt8WinogradTestCommon {
public:
    virtual bool run(int precision) {
        INTS pad = {1, 1}, inputShape = {28, 28}; // {w, h}
        INTS channel = {128, 128};
        std::vector<INTS> kernels = {
            {3, 3}//, {5, 5}, {7, 1}, {1, 7} // {w, h}
        };

        std::vector<std::string> titles = {"3x3", "5x5", "7x1", "1x7"};
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], channel, pad, {4, 4}, true, titles[i] + ",alpha=4");
            if (!res) {
                MNN_ERROR("Error for test kernel %s for convint8 (winograd)\n", titles[i].c_str());
                return false;
            }
            res = testKernel(inputShape, kernels[i], channel, pad, {6, 6}, true, titles[i] + ",alpha=6");
            if (!res) {
                MNN_ERROR("Error for test kernel %s for convint8 (winograd)\n", titles[i].c_str());
                return false;
            }
        }
        return true;
    }
};

class DepthwiseConvInt8Test : public ConvInt8TestCommon {
public:
    virtual bool run(int precision) {
        INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {21, 13}; // {w, h}
        int channel = 64;
        std::vector<std::vector<int>> kernels = {
            {3, 3}
        };
        std::vector<std::string> titles = {
            "3x3"
        };
        printf("Test strides=1\n");
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], {channel, channel}, pad, strides, dilate, 8, false, channel, 4, MNN::SparseAlgo_RANDOM, 1, false);
            if (!res) {
                FUNC_PRINT(1);
                return false;
            }
        }
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], {channel, channel}, pad, strides, dilate, 3, true, channel, 1, MNN::SparseAlgo_RANDOM, 1, false);
            if (!res) {
                FUNC_PRINT(1);
                return false;
            }
        }
        printf("Test strides=2\n");
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], {channel, channel}, pad, {2, 2}, dilate, 8, true, channel, 1, MNN::SparseAlgo_RANDOM, 1, false);
            if (!res) {
                FUNC_PRINT(1);
                return false;
            }
        }
        return true;
    }
};

class DepthwiseConvSpeedInt8Test : public ConvInt8TestCommon {
public:
    virtual bool run(int precision) {
        INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {112, 144}; // {w, h}
        int channel = 16;
        std::vector<std::vector<int>> kernels = {
            {3, 3}
        };
        std::vector<std::string> titles = {
            "3x3"
        };
        printf("Depthwise Speed Test strides=1\n");
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], {channel, channel}, pad, strides, dilate, 8, false, channel, 4, MNN::SparseAlgo_RANDOM, 1, false, true);
            if (!res) {
                FUNC_PRINT(1);
                return false;
            }
        }
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], {channel, channel}, pad, strides, dilate, 3, true, channel, 1, MNN::SparseAlgo_RANDOM, 1, false, true);
            if (!res) {
                FUNC_PRINT(1);
                return false;
            }
        }
        printf("Depthwise Speed Test strides=2\n");
        for (int i = 0; i < kernels.size(); ++i) {
            auto res = testKernel(inputShape, kernels[i], {channel, channel}, pad, {2, 2}, dilate, 8, true, channel, 1, MNN::SparseAlgo_RANDOM, 1, false, true);
            if (!res) {
                FUNC_PRINT(1);
                return false;
            }
        }
        return true;
    }
};

MNNTestSuiteRegister(ConvInt8Im2colGemmTest, "op/ConvInt8/im2col_gemm");
#if defined(__arm__) || defined(__aarch64__) // arm32 or arm64
MNNTestSuiteRegister(SparseConvInt8Im2colGemmTest, "op/ConvInt8/im2col_spmm");
#endif
MNNTestSuiteRegister(ConvInt8WinogradTest, "op/ConvInt8/winograd");
MNNTestSuiteRegister(ConvSpeedInt8WinogradTest, "speed/ConvInt8/winograd");
MNNTestSuiteRegister(DepthwiseConvInt8Test, "op/ConvInt8/depthwise");
MNNTestSuiteRegister(DepthwiseConvSpeedInt8Test, "speed/ConvInt8/depthwise");