//
//  BF16Backend.cpp
//  MNN
//
//  Created by MNN on 2020/01/26.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <algorithm>
#include "BF16Functions.hpp"
#include "BF16Backend.hpp"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include "core/OpCommonUtils.hpp"
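
// Background (illustration only): bfloat16 keeps the sign bit, the 8 exponent
// bits and the top 7 mantissa bits of an IEEE-754 float, i.e. it is just the
// high half of the fp32 bit pattern. A scalar truncating conversion looks
// roughly like the hypothetical helper below (not part of this file); the
// real, vectorized kernels live in BF16Functions:
//
//     static inline int16_t fp32_to_bf16(float x) {
//         int32_t bits;
//         memcpy(&bits, &x, sizeof(bits));  // reinterpret the float bits
//         return (int16_t)(bits >> 16);     // keep the high 16 bits
//     }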

namespace MNN {

void registerBF16Ops();
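
// gInstance is the registry that maps an OpType to the creator of its
// BF16-specific Execution. It is filled in by registerBF16Ops() through
// BF16Backend::addBF16Creator() and queried in BF16Backend::onCreate().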
static std::map<OpType, BF16Backend::BF16Creator*>* gInstance = nullptr;
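
// How a user normally reaches this backend (a rough sketch, not part of this
// file; it assumes the public MNN Interpreter / ScheduleConfig API, and
// 'interpreter' is an already-created MNN::Interpreter):
//
//     MNN::ScheduleConfig config;
//     MNN::BackendConfig backendConfig;
//     backendConfig.precision = MNN::BackendConfig::Precision_Low; // request lowp (bf16) CPU
//     config.backendConfig    = &backendConfig;
//     auto session = interpreter->createSession(config);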

// This function is called once while MNN initializes its backends.
extern void registerBF16Backend() {
    gInstance = new std::map<OpType, BF16Backend::BF16Creator*>;
    bool success = BF16Functions::init();
    if (success) {
        registerBF16Ops();
    }
}

bool BF16Backend::addBF16Creator(OpType t, BF16Creator* ct) {
    auto creatorContainer = gInstance;
    if (creatorContainer->find(t) == creatorContainer->end()) {
        creatorContainer->insert(std::make_pair(t, ct));
    }
    return true;
}
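
// Op implementations register themselves through the map above, typically from
// registerBF16Ops(). The creator class in the sketch below is purely
// illustrative -- it does not exist in this file:
//
//     BF16Backend::addBF16Creator(OpType_Convolution, new MyBF16ConvCreator);
//
// The constructor that follows plugs the bf16 core-function table into the
// shared CPU infrastructure and reports itself as MNN_FORWARD_CPU_EXTENSION.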
BF16Backend::BF16Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) {
    mCoreFunctions = BF16Functions::get();
}

BF16Backend::~BF16Backend() {
    // nothing to do
}
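
// Choose an Execution for one op. Returning nullptr tells the engine that this
// backend cannot handle the op, so the session falls back to the regular fp32
// CPU path for it:
//   - ops with non-float outputs or quantized inputs are rejected here,
//   - ops whose shared CPU implementation already supports lowp core functions
//     are created through CPUBackend::onCreate,
//   - everything else is looked up in the gInstance creator registry.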
Execution* BF16Backend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op) {
    for (auto t : outputs) {
        if (t->getType().code != halide_type_float) {
            return nullptr;
        }
    }
    auto quantInfo = OpCommonUtils::getQuantInfo(inputs);
    if (quantInfo.first) {
        return nullptr;
    }
    bool originCreate = OpCommonUtils::opCompabilityForLowp(op);
    if (originCreate) {
        return CPUBackend::onCreate(inputs, outputs, op);
    }
    auto creatorContainer = gInstance;
    auto iter = creatorContainer->find(op->type());

    if (iter == creatorContainer->end()) {
        return nullptr;
    }
    auto exe = iter->second->onCreate(inputs, outputs, op, this);
    if (exe == nullptr) {
        return nullptr;
    }
    return exe;
}
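
// Worked example of the size computation below: an NC4HW4 float tensor of
// shape [1, 3, 224, 224] is stored on this backend as bf16, so it needs
// 1 * ALIGN_UP4(3) * 224 * 224 * sizeof(int16_t) = 401408 bytes.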
static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
    // The default data type of an input tensor on the CPU backend is FLOAT32,
    // but the BF16 backend stores float tensors as BFLOAT16 (2 bytes each),
    // so the allocation size is computed with sizeof(int16_t) per element.
    int size = sizeof(int16_t);
    const int dimensions = buffer.dimensions;
    for (int i = 0; i < dimensions; i++) {
        int currentDimSize = buffer.dim[i].extent;
        if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
            currentDimSize = ALIGN_UP4(currentDimSize);
        }
        size *= currentDimSize;
    }
    return size;
}
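
// Allocate storage for a tensor owned by this backend. Float tensors get a
// bf16-sized (2 bytes per element) buffer; non-float tensors are handled by
// the default CPU allocator.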
bool BF16Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    // Float tensors are stored as bf16 on this backend.
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    if (buffer.type != halide_type_of<float>()) {
        return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
    }
    auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType);
    if (!res) {
        return false;
    }
    // Mark the buffer as belonging to this backend so onCopyBuffer can tell
    // bf16 tensors (device != 0) apart from plain fp32 CPU tensors.
    buffer.device = 1;
    return true;
}
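
// Copy/convert between tensors. Three cases:
//   1. Source and destination are on the same side (both fp32 CPU tensors or
//      both bf16 tensors of this backend): only the data layout may differ,
//      so CPUTensorConverter does the work.
//   2. Different sides with different layouts: the layout is first normalized
//      on the fp32 side through a temporary tensor, then case 3 applies.
//   3. Different sides with the same layout: an element-wise fp32 <-> bf16
//      conversion via BF16Functions.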
void BF16Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto& ib = srcTensor->buffer();
    auto& ob = dstTensor->buffer();
    if (ib.type.code != halide_type_float) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    if (srcType == dstType) {
        ErrorCode code = ErrorCode::NO_ERROR;
        auto tup = CPUTensorConverter::splitDimensions(srcTensor->buffer(), source);
        int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup);
        if (srcType == MNN_FORWARD_CPU) {
            code = CPUTensorConverter::convert(srcTensor->host<void>(), dstTensor->host<void>(), source, dest, batch, area, channel, 4, MNNGetCoreFunctions());
        } else {
            code = CPUTensorConverter::convert(srcTensor->host<void>(), dstTensor->host<void>(), source, dest, batch, area, channel, 2, mCoreFunctions);
        }
        MNN_ASSERT(code == ErrorCode::NO_ERROR);
        return;
    }
    // The two tensors live on different sides. If their layouts differ as well,
    // first convert the layout on the fp32 side through a temporary tensor.
    std::shared_ptr<Tensor> tempTensor;
    if (source != dest) {
        if (srcType == MNN_FORWARD_CPU) {
            tempTensor.reset(Tensor::create<float>(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor)));
            MNNCPUCopyBuffer(srcTensor, tempTensor.get());
            srcTensor = tempTensor.get();
            source = dest;
        } else {
            tempTensor.reset(Tensor::create<float>(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) {
                auto tempT = (Tensor*)ptr;
                MNNCPUCopyBuffer(tempT, dstTensor);
                delete tempT;
            });
            dstTensor = tempTensor.get();
            dest = source;
        }
    }
    //MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
    // At this point the layouts match; only an element-wise fp32 <-> bf16
    // conversion is left.
    const int elementSize = srcTensor->elementSize();
    if (srcType == MNN_FORWARD_CPU) {
        // fp32 host tensor -> bf16 tensor of this backend
        const auto src = srcTensor->host<float>();
        auto dst = dstTensor->host<int16_t>();
        BF16Functions::get()->MNNFp32ToLowp(src, dst, elementSize);
        return;
    }
    if (srcType == MNN_FORWARD_CPU_EXTENSION) {
        // bf16 tensor of this backend -> fp32 host tensor
        const auto src = srcTensor->host<int16_t>();
        auto dst = dstTensor->host<float>();
        BF16Functions::get()->MNNLowpToFp32(src, dst, elementSize);
        return;
    }
    return;
}

} // namespace MNN