// Source: mirror of https://github.com/alibaba/MNN.git
// KleidiAI (Arm SME/SME2) dense convolution backend for MNN (fp32 only).
#if MNN_KLEIDIAI_ENABLED

#include "KleidiAIDenseConvolution.hpp"

#include <numeric>

#include "CommonOptFunction.h"
#include "MNN/ErrorCode.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"

#include "kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
namespace MNN {
|
|
template <typename T>
|
|
static void initWeight(const T* weight, const T* bias, T* cache, T* output, const std::vector<int>& shape,
|
|
const int bytes) {
|
|
::memset(cache, 0, sizeof(T) * std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
|
|
ConvertOIHWToHWIO(cache, weight, shape);
|
|
auto outputCount = shape[0];
|
|
auto srcCount = shape[1];
|
|
auto kh = shape[2];
|
|
auto kw = shape[3];
|
|
if (bytes == 4) {
|
|
kai_run_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(outputCount, kh * kw, srcCount, outputCount * sizeof(T),
|
|
cache, bias, output);
|
|
} else {
|
|
MNN_ERROR("Not fp32, should not be called here\n");
|
|
abort();
|
|
}
|
|
}
|
|
|
|
// Builds the execution resource for a dense convolution with constant weight:
// allocates the packed-RHS weight buffer and bias, reorders/pack the weight
// via initWeight(), then creates the proxy implementation.
// Note: int8Info is accepted for interface parity but unused — only the fp32
// path is supported here. (Fix: removed dead locals bytes/matmulBytes that
// were computed but never read.)
KleidiAIDenseConvolution::KleidiAIDenseConvolution(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize,
                                                   std::shared_ptr<ConvolutionCommon::Int8Common> int8Info)
    : ConvolutionTiledExecutor(b, bias, biasSize) {
    auto outputCount = (int)biasSize;
    auto core = static_cast<CPUBackend*>(b)->functions();
    auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();

    // Size of the packed RHS (weight + folded bias) required by the KleidiAI
    // indirect-matmul kernel. Only fp32 is supported.
    int kai_rhs_packed_size = 0;
    if (core->bytes == 4) {
        kai_rhs_packed_size = (int)kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
            outputCount, common->kernelY() * common->kernelX(), srcCount);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({kai_rhs_packed_size}));
    mResource->mBias.reset(Tensor::createDevice<uint8_t>({outputCount * core->bytes}));

    mValid = mValid && backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }

    // Scratch tensor holding the HWIO-reordered weight during packing;
    // released again at the end of the constructor.
    std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>(
        {outputCount, srcCount * common->kernelX() * common->kernelY(), (int)sizeof(float)})); // cache must be float
    mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }

    std::vector<int> oihwShape = {outputCount, srcCount, common->kernelY(), common->kernelX()};
    if (core->bytes == 4) {
        MNN::initWeight(originWeight, bias, cache->host<float>(), mResource->mWeight->host<float>(), oihwShape,
                        core->bytes);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }

    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
    mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b, mResource.get()));
}
// Clone constructor: shares an already-built packed weight/bias resource and
// creates a fresh proxy implementation bound to the new backend.
KleidiAIDenseConvolution::KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res,
                                                   const Convolution2DCommon* common, Backend* b)
    : ConvolutionTiledExecutor(res, b) {
    mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b, mResource.get()));
}
KleidiAIDenseConvolution::~KleidiAIDenseConvolution() {
    // Do nothing: mProxy and the shared resource release themselves.
}
// Clones this execution onto backend |bn|, sharing the packed weight resource.
// A null |dst| is a capability probe: report clonability without allocating.
bool KleidiAIDenseConvolution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        // Caller only asks whether cloning is supported.
        return true;
    }
    auto clone = new KleidiAIDenseConvolution(mResource, op->main_as_Convolution2D()->common(), bn);
    clone->mProxy->mConvPerfconfig = mProxy->mConvPerfconfig;
    *dst = clone;
    return true;
}
// Delegates to the proxy. Note: |inputs| is intentionally ignored — mInputs
// (input + packed weight + bias) was assembled in onResize().
ErrorCode KleidiAIDenseConvolution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    return mProxy->onExecute(mInputs, outputs);
}
// Rebuilds the proxy's input list (activation + pre-packed weight + bias) and
// forwards the resize to the proxy implementation.
ErrorCode KleidiAIDenseConvolution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
    return mProxy->onResize(mInputs, outputs);
}
// Execute path for convolutions whose weight (and possibly bias) arrive as
// runtime inputs: stage the bias if needed, re-pack the weight, then run.
ErrorCode KleidiAIDenseConvolutionMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    auto core = static_cast<CPUBackend*>(backend())->functions();
    if (mTempBias != nullptr) {
        // Zero-fill the staging bias, then copy the provided bias (if any)
        // over the front; the pack-alignment tail stays zero.
        ::memset(mTempBias->host<float>(), 0, mTempBias->elementSize() * core->bytes);
        if (inputs.size() > 2) {
            ::memcpy(mTempBias->host<float>(), inputs[2]->host<float>(), inputs[2]->elementSize() * core->bytes);
        }
    }
    if (core->bytes != 4) {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    // Re-pack the runtime weight into the KleidiAI RHS layout. mInputs[2] is
    // whichever bias tensor onResize() selected (given bias or staging one).
    initWeight(inputs[1]->host<float>(), mInputs[2]->host<float>(), mTempWeightCache->host<float>(),
               mTempWeight->host<float>(), inputs[1]->shape(), core->bytes);
    return mProxy->onExecute(mInputs, outputs);
}
// Resize path for runtime-weight convolutions: allocates the packed-weight
// buffer, the re-pack scratch, and (when the given bias is absent or not
// pack-aligned) a zero-padded staging bias, then resizes the proxy.
// Fix: the mTempBias onAcquireBuffer() result was previously ignored, so an
// allocation failure went undetected; it now returns OUT_OF_MEMORY.
ErrorCode KleidiAIDenseConvolutionMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                       const std::vector<Tensor*>& outputs) {
    int depth = inputs[1]->channel();
    int outputCount = outputs[0]->channel();
    auto function = static_cast<CPUBackend*>(backend())->functions();
    if (function->bytes == 4) {
        // Packed RHS size for the indirect matmul; inputs[1]->stride(1) is the
        // per-output-channel tap count (kernelH * kernelW).
        int kai_rhs_packed_size = (int)kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
            outputCount, inputs[1]->stride(1), depth);
        mTempWeight.reset(Tensor::createDevice<uint8_t>({kai_rhs_packed_size}));
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    // Scratch used by initWeight() for the HWIO reorder before packing.
    mTempWeightCache.reset(Tensor::createDevice<float>(
        {inputs[1]->height(), inputs[1]->width(), inputs[1]->channel(), inputs[1]->batch()}));
    auto res = backend()->onAcquireBuffer(mTempWeight.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    mTempBias.reset();
    if (!res) {
        return OUT_OF_MEMORY;
    }
    if (inputs.size() > 2 && inputs[2]->elementSize() % function->pack == 0) {
        // Provided bias is already pack-aligned: use it directly.
        mInputs = {inputs[0], mTempWeight.get(), inputs[2]};
    } else {
        // Bias missing or unaligned: stage it in a zero-padded buffer sized to
        // a multiple of the pack unit.
        mTempBias.reset(Tensor::createDevice<float>({UP_DIV(outputCount, function->pack) * function->pack}));
        if (!backend()->onAcquireBuffer(mTempBias.get(), Backend::DYNAMIC)) {
            return OUT_OF_MEMORY;
        }
        mInputs = {inputs[0], mTempWeight.get(), mTempBias.get()};
    }
    backend()->onReleaseBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    auto errorCode = mProxy->onResize(mInputs, outputs);
    // Buffers are released after planning so the allocator can reuse the
    // memory once this op has executed.
    backend()->onReleaseBuffer(mTempWeight.get(), Backend::DYNAMIC);
    if (nullptr != mTempBias) {
        backend()->onReleaseBuffer(mTempBias.get(), Backend::DYNAMIC);
    }
    return errorCode;
}
// Plans the whole convolution: allocates the NHWC staging tensors and the
// packed-LHS scratch, then records the convert -> LHS-pack -> SME matmul ->
// convert-back pipeline into mFunction for onExecute() to run.
// inputs: {activation, packed RHS weight, optional bias}. fp32 only.
ErrorCode KleidiAIDenseConvolutionImpl::onResize(const std::vector<Tensor*>& inputs,
                                                 const std::vector<Tensor*>& outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto input = inputs[0];
    auto weight = inputs[1];
    // NOTE(review): bias is resolved here but never read below — presumably
    // it was already folded into the packed RHS; confirm against initWeight().
    Tensor* bias = nullptr;
    if (inputs.size() > 2) {
        bias = inputs[2];
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    // NOTE(review): matmulBytes is computed but not referenced afterwards.
    int matmulBytes = bytes;
    if (core->matmulBytes != 0) {
        matmulBytes = core->matmulBytes;
    }
    auto ic = input->channel();
    auto output = outputs[0];
    auto batch = output->batch();

    auto outputChannel = output->channel();
    auto kernelSize = mCommon->kernelX() * mCommon->kernelY();

    // 1-D scratch holding the packed LHS; its byte size comes from the
    // KleidiAI sizing helper.
    mTempBufferTranspose.buffer().type = halide_type_of<uint8_t>();
    mTempBufferTranspose.buffer().dimensions = 1;
    int outputNhwSize = batch * output->height() * output->width();
    if (core->bytes == 4) {
        mTempBufferTranspose.buffer().dim[0].extent =
            kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(outputNhwSize, kernelSize, ic);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    TensorUtils::setLinearLayout(&mTempBufferTranspose);

    bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    // NHWC staging tensor the kernel writes into; converted back to the
    // backend layout at the end of the recorded pipeline.
    TensorUtils::getDescribe(&mOutputNHWC)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mOutputNHWC.buffer().dimensions = 4;
    mOutputNHWC.buffer().dim[0].extent = output->batch();
    mOutputNHWC.buffer().dim[1].extent = output->height();
    mOutputNHWC.buffer().dim[2].extent = output->width();
    mOutputNHWC.buffer().dim[3].extent = output->channel();
    mOutputNHWC.buffer().type = output->getType();
    success = backend()->onAcquireBuffer(&mOutputNHWC, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    // NHWC staging tensor the activation is converted into before packing.
    TensorUtils::getDescribe(&mInputNHWC)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mInputNHWC.buffer().dimensions = 4;
    mInputNHWC.buffer().dim[0].extent = input->batch();
    mInputNHWC.buffer().dim[1].extent = input->height();
    mInputNHWC.buffer().dim[2].extent = input->width();
    mInputNHWC.buffer().dim[3].extent = input->channel();
    mInputNHWC.buffer().type = input->getType();
    success = backend()->onAcquireBuffer(&mInputNHWC, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    // One zero-filled channel row: the indirection table points out-of-image
    // kernel taps at this buffer so padded reads return zeros.
    TensorUtils::getDescribe(&mPadBuffer)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mPadBuffer.buffer().dimensions = 1;
    mPadBuffer.buffer().dim[0].extent = input->channel();
    mPadBuffer.buffer().type = input->getType();
    TensorUtils::setLinearLayout(&mPadBuffer);
    success = backend()->onAcquireBuffer(&mPadBuffer, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    // Acquire-then-release pattern: the planned memory remains usable by this
    // op's execute while allowing the dynamic allocator to reuse it later.
    backend()->onReleaseBuffer(&mOutputNHWC, Backend::DYNAMIC);
    backend()->onReleaseBuffer(&mInputNHWC, Backend::DYNAMIC);
    backend()->onReleaseBuffer(&mPadBuffer, Backend::DYNAMIC);

    backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);

    auto postParameters = getPostParameters();
    mFunction.first = ((CPUBackend*)backend())->threadNumber();

    // Per the field assignments below, padFull tuple order is
    // (left, top, right, bottom).
    auto padFull = ConvolutionCommon::convolutionPadFull(input, output, mCommon);
    ConvParams params{
        .inputChannel = ic,
        .outputChannel = outputChannel,
        .kernelHeight = mCommon->kernelY(),
        .kernelWidth = mCommon->kernelX(),
        .strideHeight = mCommon->strideY(),
        .strideWidth = mCommon->strideX(),
        .padTop = std::get<1>(padFull),
        .padBottom = std::get<3>(padFull),
        .padLeft = std::get<0>(padFull),
        .padRight = std::get<2>(padFull),
        .dilatedHeight = mCommon->dilateY(),
        .dilatedWidth = mCommon->dilateX(),
    };

    mFunction.second = [=](int tid) {
        // Convert NC4HW4 to NHWC
        auto inputShape = input->shape(); // TODO check for NC4HW4, should be the NCHW
        CPUTensorConverter::convert(input, &mInputNHWC, core);
        // Lhs packing
        if (bytes == 4) {
            int blockSize = kai_get_m_step_lhs_imatmul_pack_x32p2vlx1_x32p_sme();
            ::memset(mPadBuffer.host<float>(), 0, params.inputChannel * sizeof(float));
            // Build the indirection table (row pointers into the NHWC input,
            // or into the zero pad row) consumed by the LHS packing kernel.
            auto table = IndirectionTable<float>(mInputNHWC.shape(), params, mInputNHWC.host<float>(),
                                                 mPadBuffer.host<float>(), blockSize);
            kai_run_lhs_imatmul_pack_x32p2vlx1_x32p_sme(outputNhwSize, kernelSize, ic, table.data.data(), 0,
                                                        mPadBuffer.host<uint8_t>(),
                                                        mTempBufferTranspose.host<uint8_t>());
        } else {
            MNN_ERROR("Not fp32, should not be called here\n");
            abort();
        }

        // Run Matmul
        if (bytes == 4) {
            // postParameters[2]/[3] are the clamp bounds passed to the fused
            // clamp kernel (relu/relu6 post-op).
            kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa(
                outputNhwSize, outputChannel, kernelSize, ic, mTempBufferTranspose.host<uint8_t>(),
                weight->host<uint8_t>(), mOutputNHWC.host<uint8_t>(), outputChannel * sizeof(float), postParameters[2],
                postParameters[3]);
        } else {
            MNN_ERROR("Not fp32, should not be called here\n");
            abort();
        }

        // Convert NHWC to NC4HW4
        CPUTensorConverter::convert(&mOutputNHWC, output, core);
    };
    return NO_ERROR;
}
// Runs the pipeline recorded in onResize(). The tensors were captured by the
// lambda, so |inputs|/|outputs| are unused; it is invoked single-threaded
// with tid 0 despite mFunction.first holding the thread count.
ErrorCode KleidiAIDenseConvolutionImpl::onExecute(const std::vector<Tensor*>& inputs,
                                                  const std::vector<Tensor*>& outputs) {
    mFunction.second(0);
    return NO_ERROR;
}
} // namespace MNN
|
|
#endif
|