//
//  ConvolutionHybrid.cpp
//  MNN
//
//  Created by MNN on 2023/10/26.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "ConvolutionHybrid.hpp"
|
|
#include <string.h>
|
|
#include "core/BufferAllocator.hpp"
|
|
#include "backend/cpu/CPUBackend.hpp"
|
|
#include "core/Concurrency.h"
|
|
#include "ConvOpt.h"
|
|
#include "core/Macro.h"
|
|
#include "CommonOptFunction.h"
|
|
#include "core/TensorUtils.hpp"
|
|
#include <math.h>
|
|
#include "backend/cpu/compute/DenseConvolutionTiledExecutor.hpp"
|
|
|
|
namespace MNN {

bool ConvolutionHybrid::initQuantizeResource(std::shared_ptr<ConvolutionCommon::Int8Common> int8Info, std::shared_ptr<CPUConvolution::Resource> resource, int hU, int hP, int lU, int lP, int outputCount, int srcChannel, int kernelSize, int bytes) {
    int weightLength = hU * lU * hP * lP;
    resource->mWeight.reset(Tensor::createDevice<uint8_t>({weightLength}));
    auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
    if (!res) {
        return false;
    }
    resource->mDequantize.bits = 8;
    resource->hU = hU;
    resource->lU = lU;
    resource->hP = hP;
    resource->lP = lP;
    // Reorder weight
    auto dstWInt8 = resource->mWeight->host<int8_t>();
    auto srcWInt8 = int8Info->weight.get();
    // oc, ic -> oc/hP, ic/lP, hP, lP
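    // Each hP x lP tile of the [oc, ic] weight matrix is stored contiguously:
    // element (k, l) of tile (i, j) comes from weight (oc = i * hP + k, ic = j * lP + l).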
    for (int i = 0; i < hU; i++) {
        for (int j = 0; j < lU; j++) {
            for (int k = 0; k < hP; k++) {
                for (int l = 0; l < lP; l++) {
                    dstWInt8[i * srcChannel * hP + j * hP * lP + k * lP + l] = srcWInt8[(i * hP + k) * srcChannel + (j * lP + l)];
                }
            }
        }
    }
    // Save scale bias
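    // mScaleBias layout: the first hU * hP entries hold the per-output-channel dequantize
    // scales, the next hU * hP hold the dequantize biases (zero for symmetric quantization);
    // each entry occupies `bytes` (fp32 or fp16).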
    resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice<float>({hU * hP * 2}));
    res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
    if (!res) {
        return false;
    }
    auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
    auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + hU * hP * bytes);
    ::memset(alphaPtr, 0, 2 * hU * hP * bytes);
    int h = int8Info->alpha.size();
    if (bytes == 2) {
        auto core = static_cast<CPUBackend*>(resource->backend)->functions();
        if (int8Info->asymmetric) {
            std::unique_ptr<int16_t[]> tmp(new int16_t[h]);
            core->MNNFp32ToLowp(int8Info->alpha.get(), tmp.get(), h);
            for (int i = 0; i < h / 2; ++i) {
                reinterpret_cast<int16_t*>(alphaPtr)[i] = tmp[2 * i + 1];
                reinterpret_cast<int16_t*>(biasPtr)[i]  = tmp[2 * i];
            }
        } else {
            core->MNNFp32ToLowp(int8Info->alpha.get(), reinterpret_cast<int16_t*>(alphaPtr), h);
        }
    } else {
        if (int8Info->asymmetric) {
            h = h / 2;
            for (int i = 0; i < h; ++i) {
                alphaPtr[i] = int8Info->alpha.get()[2 * i + 1];
                biasPtr[i]  = int8Info->alpha.get()[2 * i];
            }
        } else {
            for (int i = 0; i < h; ++i) {
                alphaPtr[i] = int8Info->alpha.get()[i];
                biasPtr[i]  = 0.f;
            }
        }
    }
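    // Optional int4 repack: each pair of int8 weights in [-8, 7] is biased by +8 into
    // [0, 15] and packed into one byte (high nibble = even index, low nibble = odd index).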
    if (int8Info->canUseInt4) {
        MNN_ASSERT(weightLength % 2 == 0);
        weightLength = UP_DIV(weightLength, 2);
        resource->mDequantize.bits = 4;
        std::shared_ptr<MNN::Tensor> weightLow(Tensor::createDevice<uint8_t>({weightLength}));
        auto res = resource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
        if (!res) {
            return false;
        }
        auto srcPtr = resource->mWeight->host<int8_t>();
        auto dstPtr = weightLow->host<uint8_t>();
        for (int i = 0; i < weightLength; ++i) {
            int s0 = srcPtr[2 * i + 0];
            int s1 = srcPtr[2 * i + 1];
            int d  = (s0 + 8) * 16 + (s1 + 8);
            dstPtr[i] = d;
        }
        resource->mWeight = weightLow;
    }
    return true;
}

ConvolutionHybrid::ConvolutionHybrid(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                                     size_t originWeightSize, const float *bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> quantInfo)
    : CPUConvolution(common, b) {
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    if (!mResource->copyBiasAlign(bias, (int)biasSize)) {
        MNN_ERROR("Not Enough Memory\n");
        mValid = false;
        return;
    }
    MNN_ASSERT(nullptr != quantInfo.get());
    originWeightSize = quantInfo->weight.size();
    auto outputCount = (int)biasSize;
    int inputCount = (int)originWeightSize / (int)biasSize * common->kernelX() * common->kernelY();
    auto core = static_cast<CPUBackend*>(b)->functions();
    auto int8_core = static_cast<CPUBackend*>(backend())->int8Functions();
    int unit = core->pack;
    int ePack, lPack, hPack;
    core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
    // printf("ePack, lPack, hPack = %d, %d, %d\n", ePack, lPack, hPack);
    // printf("UNIT, SRC_UNIT, DST_XUNIT = %d, %d, %d\n", UNIT, SRC_UNIT, DST_XUNIT);
    hPack = unit;
    lPack = unit;
    // [oc, ic] => [oc/unit, ic/src_unit, unit, src_unit]
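    // With i8mm (ARMv8.6 smmla) the fp32 pack size is 4, but the hybrid kernels work on
    // 8-wide tiles, so the weight pack is raised to 8 to match them.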
if (unit == 4 && core->supportI8mm) { // Low Memory: use fp32 and smmla.
|
|
hPack = 8;
|
|
lPack = 8;
|
|
}
|
|
auto hU = UP_DIV(outputCount, hPack);
|
|
auto lU = UP_DIV(inputCount, lPack);
|
|
ConvolutionHybrid::initQuantizeResource(quantInfo, mResource, hU, hPack, lU, lPack, outputCount, (int)originWeightSize / (int)biasSize, common->kernelX() * common->kernelY(), core->bytes);
|
|
}
|
|
|
|
ConvolutionHybrid::ConvolutionHybrid(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) {
    mResource = resource;
}

ConvolutionHybrid::~ConvolutionHybrid() {
    // Do nothing
}

bool ConvolutionHybrid::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    *dst = new ConvolutionHybrid(mResource, op->main_as_Convolution2D()->common(), bn);
    return true;
}

ErrorCode ConvolutionHybrid::allocTensor(Tensor* tensor, size_t size) {
    tensor->buffer().type = halide_type_of<int8_t>();
    tensor->buffer().dimensions = 1;
    tensor->buffer().dim[0].extent = size;
    bool success = backend()->onAcquireBuffer(tensor, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }
    return NO_ERROR;
}

ErrorCode ConvolutionHybrid::allocDynamicQuantInfo(int thread, int batch, int ic, int oc, int bytes) {
    // quant_info layout:
    //   absmax:        thread * batch * bytes
    //   sum:           thread * batch * sizeof(int)
    //   dequant_scale: batch * bytes
    //   quant_scale:   batch * bytes
    auto code = allocTensor(&mQuantInfo.quant_info, (thread + 2) * batch * bytes + thread * batch * sizeof(int));
    if (NO_ERROR != code) {
        return code;
    }
    if (ANeedToPack8) {
        int ic8 = UP_DIV(ic, 8) * 8;
        int oc8 = UP_DIV(oc, 8) * 8;
        mInputTemp.reset(Tensor::createDevice<int8_t>({batch, 1, 1, ic8}));
        mOutputTemp.reset(Tensor::createDevice<float>({batch, 1, 1, oc8}));
        bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
        allocSucc = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
        if (!allocSucc) {
            return OUT_OF_MEMORY;
        }
        code = allocTensor(&mQuantInfo.quant_buffer, batch * ic8);
        if (NO_ERROR != code) {
            return code;
        }
        backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
        backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
    } else {
        code = allocTensor(&mQuantInfo.quant_buffer, batch * ic);
        if (NO_ERROR != code) {
            return code;
        }
    }
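    // The buffers are released back to the DYNAMIC pool right away: their sizes are fixed
    // after onResize, so the allocator may reuse the memory while the plan stays valid for onExecute.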
backend()->onReleaseBuffer(&mQuantInfo.quant_info, Backend::DYNAMIC);
|
|
backend()->onReleaseBuffer(&mQuantInfo.quant_buffer, Backend::DYNAMIC);
|
|
return NO_ERROR;
|
|
}
|
|
|
|
ErrorCode ConvolutionHybrid::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();
    auto int8_core = static_cast<CPUBackend*>(backend())->int8Functions();
    // "batch" counts every output position (N * H * W); each one becomes a gemm row.
    auto batch = output->batch() * output->height() * output->width();
    int ic = input->channel();
    int oc = output->channel();
    int bytes = core->bytes;
    int unit = core->pack;
    int eP, lP, hP;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
    int UNIT, SRC_UNIT, DST_XUNIT;
    int8_core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    hP = unit;
    lP = unit;
    int tileC = std::max(unit, hP);
    // Int8 weights take one byte per value; int4 weights pack two values per byte.
    LowMemoryGemmFuncWithInt8Weight gemmKernel = core->MNNGemmHybridInt8;
    float weightBytes = 1;
    if (mResource->mDequantize.bits == 4) {
        weightBytes = 0.5;
        gemmKernel = core->MNNGemmHybridInt4;
    }
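    // Hybrid path: activations are quantized to int8 at execution time (dynamic quantization),
    // multiplied against the stored int8/int4 weights, and dequantized back to floating point
    // using the per-output-channel scale/bias saved in initQuantizeResource.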
    const uint8_t* dequantAlpha = mResource->mDequantize.mScaleBias->host<uint8_t>();
    const uint8_t* dequantBias  = dequantAlpha + mResource->hU * mResource->hP * bytes;
    int threadNumber = ((CPUBackend *)backend())->threadNumber();
    auto oC4 = UP_DIV(oc, tileC);
    int iC4  = UP_DIV(ic, unit);
    if (iC4 < threadNumber || oC4 < threadNumber) {
        threadNumber = std::min(oC4, iC4);
    }
    int tileCount  = UP_DIV(oC4, threadNumber);
    int iTileCount = UP_DIV(iC4, threadNumber);
    if (unit == 4 && core->supportI8mm) { // Low Memory: use fp32 and smmla.
        ANeedToPack8 = true;
    }
    int8_t order[32] = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31, 8, 9, 10, 11, 4, 5, 6, 7, 24, 25, 26, 27, 20, 21, 22, 23};
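    // Permutation table handed to the gemm kernel via param[5]; kernels that interleave
    // output lanes (the i8mm/smmla variants) use it to restore the natural channel order.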
    auto code = allocDynamicQuantInfo(threadNumber, batch, ic, oc, bytes);
    if (NO_ERROR != code) {
        return code;
    }
    mDynamicQuant = [=]() {
        auto maxPtr     = mQuantInfo.quant_info.host<uint8_t>();
        auto sumPtr     = maxPtr + threadNumber * batch * bytes;
        auto dequantPtr = sumPtr + threadNumber * batch * sizeof(int);
        auto quantPtr   = dequantPtr + batch * bytes;
        // compute sum and absmax
        MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
            int workCount = iTileCount;
            if (tId == threadNumber - 1) {
                workCount = iC4 - tId * iTileCount;
            }
            int icIndex = tId * iTileCount;
            auto input_ptr = reinterpret_cast<const float*>(input->host<uint8_t>() + icIndex * batch * unit * bytes);
            auto max_ptr   = reinterpret_cast<float*>(maxPtr + tId * batch * bytes);
            core->MNNAbsMax(input_ptr, max_ptr, workCount, batch, unit);
        }
        MNN_CONCURRENCY_END();
        // compute scale
        core->MNNQuantScale((float*)maxPtr, (float*)quantPtr, (float*)dequantPtr, threadNumber, batch);
        // quant
        MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
            int workCount = iTileCount;
            if (tId == threadNumber - 1) {
                workCount = iC4 - tId * iTileCount;
            }
            int icIndex = tId * iTileCount;
            auto input_ptr = reinterpret_cast<float*>(input->host<uint8_t>() + icIndex * batch * unit * bytes);
            auto quant_ptr = mQuantInfo.quant_buffer.host<int8_t>() + icIndex * batch * unit;
            auto scale_ptr = reinterpret_cast<float*>(quantPtr);
            auto sum_ptr   = reinterpret_cast<float*>(sumPtr + tId * batch * sizeof(int));
            core->MNNDynamicQuant(input_ptr, quant_ptr, scale_ptr, sum_ptr, workCount, batch, unit);
        }
        MNN_CONCURRENCY_END();
        // compute quant sum
        core->MNNQuantSum((float*)sumPtr, (float*)dequantPtr, threadNumber, batch);
    };
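    // Gemm worker: thread tId owns the output-channel tiles
    // [tId * tileCount, tId * tileCount + workCount); the last thread takes the remainder.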
    mFunction.first = threadNumber;
    mFunction.second = [=](int tId) {
        int workCount = tileCount;
        if (tId == threadNumber - 1) {
            workCount = oC4 - tId * tileCount;
        }
        int unit_ = unit;
        int tileCount_ = tileCount;
        if (ANeedToPack8) {
            int oC8 = UP_DIV(oc, 8);
            tileCount_ = UP_DIV(oC8, threadNumber);
            workCount = tileCount_;
            if (tId == threadNumber - 1) {
                workCount = oC8 - tId * tileCount_;
            }
            unit_ = 8;
        }

        int ocIndex = tId * tileCount_ * unit_;
        const int8_t* input_ptr = mQuantInfo.quant_buffer.host<int8_t>();
        auto weight_ptr = mResource->mWeight->host<int8_t>() + static_cast<int>(ocIndex * ic * weightBytes);
        auto output_ptr = reinterpret_cast<float*>(outputs[0]->host<uint8_t>() + ocIndex * batch * bytes);
        if (ANeedToPack8 && batch > 1) {
            // Read the C8-repacked input and write to the C8 temp output instead.
            input_ptr  = mInputTemp->host<int8_t>();
            output_ptr = reinterpret_cast<float*>(mOutputTemp->host<uint8_t>() + ocIndex * batch * bytes);
        }
        auto bias_ptr  = reinterpret_cast<const float*>(mResource->mBias->host<uint8_t>() + ocIndex * bytes);
        auto alpha_ptr = reinterpret_cast<const float*>(dequantAlpha + ocIndex * bytes);
        auto zero_ptr  = reinterpret_cast<const float*>(dequantBias + ocIndex * bytes);
        const uint8_t* max_ptr   = mQuantInfo.quant_info.host<uint8_t>();
        const float* sums_ptr    = reinterpret_cast<const float*>(max_ptr + threadNumber * batch * bytes);
        const float* scale_ptr   = reinterpret_cast<const float*>(max_ptr + threadNumber * batch * (bytes + sizeof(int)));
        size_t dst_depth_quad = workCount;
        size_t src_depth_quad = UP_DIV(ic, unit_);
        size_t dst_step       = batch * unit_ * bytes;
        size_t realSize       = batch;
        // param: [0] dequant scale, [1] dequant bias/zero point, [2] conv bias,
        //        [3] per-row input sums, [4] per-row quant scale, [5] lane-reorder table.
        const float* param[6];
        param[0] = alpha_ptr;
        param[1] = zero_ptr;
        param[2] = bias_ptr;
        param[3] = sums_ptr;
        param[4] = scale_ptr;
        param[5] = (float*)order;
        gemmKernel(output_ptr, input_ptr, weight_ptr, src_depth_quad, dst_step, dst_depth_quad, realSize, param);
    };
    return NO_ERROR;
}

ErrorCode ConvolutionHybrid::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    mDynamicQuant();
    if (ANeedToPack8 && inputs[0]->batch() > 1) {
        auto core = static_cast<CPUBackend*>(backend())->functions();
        auto plane_in     = inputs[0]->width() * inputs[0]->height() * inputs[0]->batch();
        auto plane_out    = outputs[0]->width() * outputs[0]->height() * outputs[0]->batch();
        auto depth        = UP_DIV(inputs[0]->channel(), core->pack);
        auto output_depth = UP_DIV(outputs[0]->channel(), core->pack);
        int areaOffset[2] = {plane_out, plane_out};
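        // smmla path with multiple rows: repack the quantized input from C4 to C8, run the
        // tiled gemm into the C8 temp output, then unpack back to the C4 output layout.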
        MNNPackInt8C2Origin(mInputTemp.get()->host<float>(), mQuantInfo.quant_buffer.host<float>(), plane_in, depth, plane_in);
        MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
            mFunction.second((int)tId);
        }
        MNN_CONCURRENCY_END();
        MNNUnpackC2Float(outputs[0]->host<float>(), mOutputTemp.get()->host<float>(), plane_out, output_depth, areaOffset, core->pack);
        return NO_ERROR;
    }

    MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
        mFunction.second((int)tId);
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}

} // namespace MNN