//
//  CPULayerNorm.cpp
//  MNN
//
//  Created by MNN on 2020/07/15.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <cmath>
#include <cstring>
#include "backend/cpu/CPULayerNorm.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CPUCast.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Execution.hpp"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include "MNN_generated.h"

namespace MNN {

CPULayerNorm::CPULayerNorm(std::shared_ptr<Resource> res, Backend* backend) : Execution(backend) {
    mResource = res;
}

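// makeResource parses the LayerNorm parameters from the op (number of
// normalized axes, group count, epsilon, RMSNorm flag) into a Resource that
// executions can share. If gamma/beta are provided, either inline in the
// flatbuffer or as external weights, static buffers are acquired for them;
// nullptr is returned when that allocation fails.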
std::shared_ptr<CPULayerNorm::Resource> CPULayerNorm::makeResource(const MNN::Op* op, Backend* backend) {
    const auto* layer_norm_param = op->main_as_LayerNorm();
    std::shared_ptr<CPULayerNorm::Resource> res(new Resource);
    res->mAxis = 0;
    if (nullptr != layer_norm_param->axis()) {
        res->mAxis = layer_norm_param->axis()->size();
    }
    res->mGroup = layer_norm_param->group();
    res->mEpsilon = layer_norm_param->epsilon();
    res->mRMSNorm = layer_norm_param->useRMSNorm();
    bool hasGammaBeta = (layer_norm_param->gamma() && layer_norm_param->beta());
    int gammasize = 0;
    if (hasGammaBeta) {
        MNN_ASSERT(layer_norm_param->gamma()->size() == layer_norm_param->beta()->size());
        gammasize = layer_norm_param->gamma()->size();
    }
    hasGammaBeta = hasGammaBeta || (layer_norm_param->external() && layer_norm_param->external()->size() > 1 && layer_norm_param->external()->data()[1] > 0);
    if (hasGammaBeta && gammasize == 0) {
        gammasize = layer_norm_param->external()->data()[1] / sizeof(float);
    }
    if (hasGammaBeta) {
        res->mIniGammaBeta = true;
        // Use uint8_t so a lowp backend does not shrink the float byte size.
        res->mGamma.reset(Tensor::createDevice<uint8_t>({gammasize * 4}));
        res->mBeta.reset(Tensor::createDevice<uint8_t>({gammasize * 4}));
        auto status = backend->onAcquireBuffer(res->mGamma.get(), Backend::STATIC) && backend->onAcquireBuffer(res->mBeta.get(), Backend::STATIC);
        if (!status) {
            MNN_ERROR("Out of memory when gamma is acquired in CPULayerNorm.\n");
            return nullptr;
        }
        bool useCachedMmap = backend->getRuntime()->hint().useCachedMmap > 1;
        if (useCachedMmap) {
            return res;
        }

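        // Copy the inline gamma/beta data from the flatbuffer into the static
        // buffers acquired above (the useCachedMmap path returns before this copy).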
        const float* gamma_data = layer_norm_param->gamma()->data();
        memcpy(res->mGamma->host<float>(), gamma_data, gammasize * sizeof(float));
        const float* beta_data = layer_norm_param->beta()->data();
        memcpy(res->mBeta->host<float>(), beta_data, gammasize * sizeof(float));
    }
    return res;
}

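// onExecute normalizes mOutterSize rows of mInnerSize elements each, with the
// rows split across threads. When the data is not 4-byte float (int8 or the
// backend's low-precision format), each row is converted into a per-thread
// fp32 scratch buffer, normalized with MNNNorm, then converted back.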
ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
                                  const std::vector<Tensor*> &outputs) {
    const float* gamma = mResource->mIniGammaBeta ? mResource->mGamma->host<float>() : nullptr;
    const float* beta = mResource->mIniGammaBeta ? mResource->mBeta->host<float>() : nullptr;
    auto input = inputs[0]->host<uint8_t>();
    auto output = outputs[0]->host<uint8_t>();
    auto bn = static_cast<CPUBackend*>(backend());
    auto core = bn->functions();
    auto threadNumber = bn->threadNumber();
    threadNumber = ALIMIN(threadNumber, mOutterSize);
    auto int8core = bn->int8Functions();
    int bytes = core->bytes;
    auto inputQuan = TensorUtils::getDescribe(inputs[0])->quantAttr.get();
    auto outputQuan = TensorUtils::getDescribe(outputs[0])->quantAttr.get();

    if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
        bytes = 1;
    }

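    // Each thread processes rows ttId, ttId + threadNumber, ... and uses its
    // own mInnerSize-element slice of the scratch buffers reserved in onResize.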
    MNN_CONCURRENCY_BEGIN(ttId, threadNumber) {
        for (int tId = ttId; tId < mOutterSize; tId += threadNumber) {
            const float* inner_input = (const float*)(input + tId * mInnerSize * bytes);
            float* inner_output = (float*)(output + tId * mInnerSize * bytes);
            if (bytes != 4) {
                auto tmpInput = (float*)(mTmpInputFloat.ptr() + ttId * mInnerSize * sizeof(float));
                auto tmpOutput = (float*)(mTmpOutputFloat.ptr() + ttId * mInnerSize * sizeof(float));
                if (bytes == 1) {
                    CPUCastCreator::cast(inner_input, tmpInput, CPUCastCreator::INT8_TO_FlOAT, mInnerSize, inputQuan->scale, inputQuan->zero, inputQuan->min, inputQuan->max, bn);
                } else {
                    core->MNNLowpToFp32((const int16_t*)inner_input, tmpInput, mInnerSize);
                }
                MNNNorm(tmpOutput, tmpInput, gamma, beta, mResource->mEpsilon, mInnerSize, mResource->mRMSNorm);
                if (bytes == 1) {
                    CPUCastCreator::cast(tmpOutput, inner_output, CPUCastCreator::FlOAT_TO_INT8, mInnerSize, outputQuan->scale, outputQuan->zero, outputQuan->min, outputQuan->max, bn);
                } else {
                    core->MNNFp32ToLowp(tmpOutput, (int16_t*)inner_output, mInnerSize);
                }
            } else {
                MNNNorm(inner_output, inner_input, gamma, beta, mResource->mEpsilon, mInnerSize, mResource->mRMSNorm);
            }
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}

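// onResize flattens the input into (outer, inner): with group > 1 the outer
// size is the batch length times the group count, otherwise the trailing
// mAxis dimensions form the inner (normalized) size. It also reserves fp32
// scratch space when the stored data is not 4-byte float.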
ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
                                 const std::vector<Tensor*> &outputs) {
    mOutterSize = 1;
    mInnerSize = 1;
    do {
        // Compute the outer (mOutterSize) and inner (mInnerSize) sizes.
        int rank = inputs.at(0)->dimensions();
        if (mResource->mGroup > 1) {
            mOutterSize = inputs.at(0)->length(0) * mResource->mGroup;
            for (int i = 1; i < rank; i++) {
                mInnerSize *= inputs.at(0)->length(i);
            }
            mInnerSize /= mResource->mGroup;
            if (mResource->mIniGammaBeta) {
                MNN_ASSERT(mResource->mGamma->size() == mInnerSize * sizeof(float));
            }
            break;
        }
        for (int i = 0; i < rank - mResource->mAxis; ++i) {
            mOutterSize *= inputs.at(0)->length(i);
        }
        for (int i = rank - mResource->mAxis; i < rank; ++i) {
            mInnerSize *= inputs.at(0)->length(i);
        }
        if (mResource->mIniGammaBeta) {
            MNN_ASSERT(mResource->mGamma->size() == mInnerSize * sizeof(float));
        }
    } while (false);
    auto bn = static_cast<CPUBackend*>(backend());
    auto threadNumber = ALIMIN(bn->threadNumber(), mOutterSize);
    auto buf = bn->getBufferAllocator();

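    // Reserve per-thread fp32 scratch buffers for the cast-to-float path.
    // Allocating and then freeing inside onResize follows MNN's usual pattern
    // for claiming dynamic memory that onExecute reuses via the saved chunks.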
    if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1 || bn->functions()->bytes != 4) {
        mTmpInputFloat = buf->alloc(threadNumber * mInnerSize * sizeof(float));
        mTmpOutputFloat = buf->alloc(threadNumber * mInnerSize * sizeof(float));
        buf->free(mTmpInputFloat);
        buf->free(mTmpOutputFloat);
    }
    return NO_ERROR;
}

CPULayerNorm::~CPULayerNorm() {
    // Do nothing
}

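// onClone shares mResource, so cloned executions reuse the same static
// gamma/beta buffers instead of re-acquiring them.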
bool CPULayerNorm::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    *dst = new CPULayerNorm(mResource, bn);
    return true;
}

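// Creator registered for OpType_LayerNorm: it builds the shared Resource and
// returns nullptr (creation failure) when the resource cannot be built.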
class CPULayerNormCreator : public CPUBackend::Creator {
public:
    Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
        auto res = CPULayerNorm::makeResource(op, backend);
        if (nullptr == res.get()) {
            return nullptr;
        }
        return new CPULayerNorm(res, backend);
    }
};

REGISTER_CPU_OP_CREATOR(CPULayerNormCreator, OpType_LayerNorm);

} // namespace MNN