Fix bug for Arm82Backend's onCopyBuffer

xiaying 2021-01-07 13:47:43 +08:00
parent 160f0ec778
commit 04530e55ba
6 changed files with 157 additions and 131 deletions

View File

@@ -68,13 +68,7 @@ Execution* Arm82Backend::onCreate(const std::vector<Tensor*>& inputs, const std:
return exe;
}
bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
// arm82 backend tensor data type is fp16 default
auto tensor = const_cast<Tensor*>(nativeTensor);
auto& buffer = tensor->buffer();
if (buffer.type != halide_type_of<float>()) {
return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
}
static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
// Tensors handed to the arm82 backend default to FLOAT32, but Arm82Backend stores data as FLOAT16,
// so the size is computed with a 2-byte element; for NC4HW4 the channel axis (dim 1)
// is additionally aligned up to a multiple of 8.
@@ -82,12 +76,22 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
const int dimensions = buffer.dimensions;
for (int i = 0; i < dimensions; i++) {
int currentDimSize = buffer.dim[i].extent;
if (TensorUtils::getDescribe(tensor)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
currentDimSize = ALIGN_UP8(currentDimSize);
}
size *= currentDimSize;
}
auto res = allocBuffer(size, (Tensor*)nativeTensor, storageType);
return size;
}
bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
// arm82 backend tensor data type is fp16 default
auto tensor = const_cast<Tensor*>(nativeTensor);
auto& buffer = tensor->buffer();
if (buffer.type != halide_type_of<float>()) {
return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
}
auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType);
if (!res) {
return false;
}
@@ -95,56 +99,13 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
buffer.device = 1;
return true;
}
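For reference, the aligned-size arithmetic that _getAliginSize performs for onAcquireBuffer above can be checked with a small standalone sketch: an fp16 element is 2 bytes, and for NC4HW4 the channel axis (dim 1) is padded up to a multiple of 8. The helper names below (alignUp8, fp16AlignedSize) are illustrative stand-ins, not part of this commit; the real function reads its extents from a halide_buffer_t.

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the ALIGN_UP8 macro used above.
static int alignUp8(int x) { return (x + 7) / 8 * 8; }

// Byte size of an fp16 buffer; for NC4HW4 the channel axis (dim 1) is padded to 8.
static int fp16AlignedSize(const std::vector<int>& dims, bool nc4hw4) {
    int size = static_cast<int>(sizeof(int16_t)); // fp16 element size, matching the backend's storage type
    for (size_t i = 0; i < dims.size(); ++i) {
        int extent = dims[i];
        if (nc4hw4 && i == 1) {
            extent = alignUp8(extent);
        }
        size *= extent;
    }
    return size;
}

int main() {
    // Example: a [1, 3, 56, 56] NC4HW4 tensor -> 1 * 8 * 56 * 56 * 2 = 50176 bytes.
    std::vector<int> dims = {1, 3, 56, 56};
    printf("%d\n", fp16AlignedSize(dims, true));
    return 0;
}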
void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
auto ib = srcTensor->buffer();
auto ob = dstTensor->buffer();
if (ib.type.code != halide_type_float) {
CPUBackend::onCopyBuffer(srcTensor, dstTensor);
return;
}
auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto srcType = MNN_FORWARD_CPU;
if (ib.device != 0) {
srcType = MNN_FORWARD_CPU_EXTENSION;
}
auto dstType = MNN_FORWARD_CPU;
if (ob.device != 0) {
dstType = MNN_FORWARD_CPU_EXTENSION;
}
//MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
auto fastMode = source == dest && (source == MNN_DATA_FORMAT_NCHW || source == MNN_DATA_FORMAT_NHWC);
//MNN_PRINT("%d -> %d, %d\n", source, dest, fastMode);
if (ib.dimensions <= 1 || fastMode) {
const int elemenSize = srcTensor->elementSize();
// if not float, just copy data
if(ib.type != halide_type_of<float>()){
memcpy(dstTensor->host<char>(), srcTensor->host<char>(), srcTensor->size());
return;
}
// copy and quantize/dequantize data
// cpu -> arm82 copy
if (srcType == MNN_FORWARD_CPU || dstType == MNN_FORWARD_CPU_EXTENSION) {
const auto src = srcTensor->host<float>();
auto dst = dstTensor->host<FLOAT16>();
MNNQuantizeFP16(dst, src, elemenSize);
return;
}
// arm82 -> cpu copy
if (srcType == MNN_FORWARD_CPU_EXTENSION || dstType == MNN_FORWARD_CPU) {
const auto src = srcTensor->host<half_float::half>();
auto dst = dstTensor->host<float>();
for (int i = 0; i < elemenSize; ++i) {
dst[i] = float(src[i]);
}
return;
}
MNN_ASSERT(false);
}
static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& ob, MNN_DATA_FORMAT source, MNN_DATA_FORMAT dest) {
int area = 1;
int channel = 0;
if (source == dest) {
::memcpy(ob.host, ib.host, _getAliginSize(ib, source));
return;
}
if (source == MNN_DATA_FORMAT_NC4HW4 || source == MNN_DATA_FORMAT_NCHW) {
channel = ib.dim[1].extent;
for (int axis = 2; axis < ib.dimensions; ++axis) {
@@ -166,16 +127,9 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor
const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
const int outBatchStide = channel * area;
if(srcType == MNN_FORWARD_CPU_EXTENSION && dstType == MNN_FORWARD_CPU_EXTENSION){
for (int i = 0; i < batch; ++i) {
MNNNC8HW8TONCHW_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
channel);
}
}else{
for (int i = 0; i < batch; ++i) {
MNNNC8HW8TONCHW((float*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
channel);
}
for (int i = 0; i < batch; ++i) {
MNNNC8HW8TONCHW_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
channel);
}
return;
}
@@ -183,54 +137,99 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor
if (source == MNN_DATA_FORMAT_NCHW && dest == MNN_DATA_FORMAT_NC4HW4) {
const int inbatchStride = channel * area;
const int outBatchStide = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
if(dstType == MNN_FORWARD_CPU_EXTENSION && srcType == MNN_FORWARD_CPU_EXTENSION){
for (int i = 0; i < batch; ++i) {
MNNNCHWTONC8HW8_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
channel);
}
}else{
for (int i = 0; i < batch; ++i) {
MNNNCHWTONC8HW8((uint16_t*)ob.host + outBatchStide * i, (const float*)ib.host + inbatchStride * i, area,
channel);
}
}
return;
}
if (source == MNN_DATA_FORMAT_NC4HW4 && dest == MNN_DATA_FORMAT_NHWC) {
const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
const int outBatchStide = channel * area;
for (int i = 0; i < batch; ++i) {
MNNNC8HW8TONHWC((float*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
MNNNCHWTONC8HW8_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
channel);
}
return;
}
// internal use
// copy between CPUBackend and Arm82Backend
// Arm82Backend -> CPUBackend(Arm82Backend has not supported op, callback to CPUBackend)
MNN_ASSERT(source == dest && source == MNN_DATA_FORMAT_NC4HW4);
if (srcType == MNN_FORWARD_CPU_EXTENSION || dstType == MNN_FORWARD_CPU) {
const int inbatchStride = ROUND_UP(channel, ARMV82_CHANNEL_UNIT) * area;
const int outBatchStide = ob.dim[0].stride;
for (int i = 0; i < batch; ++i) {
MNNNC8HW8TONC4HW4((float*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
channel);
MNN_ERROR("Invalide format %d - %d copy for intenal Arm82 Backend\n", source, dest);
}
void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
auto& ib = srcTensor->buffer();
auto& ob = dstTensor->buffer();
if (ib.type.code != halide_type_float) {
CPUBackend::onCopyBuffer(srcTensor, dstTensor);
return;
}
auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto srcType = MNN_FORWARD_CPU;
if (ib.device != 0) {
srcType = MNN_FORWARD_CPU_EXTENSION;
}
auto dstType = MNN_FORWARD_CPU;
if (ob.device != 0) {
dstType = MNN_FORWARD_CPU_EXTENSION;
}
if (srcType == dstType) {
if (srcType == MNN_FORWARD_CPU) {
MNNCPUCopyBuffer(srcTensor, dstTensor);
} else {
_convertFp16Inside(ib, ob, source, dest);
}
return;
}
if (srcType == MNN_FORWARD_CPU || dstType == MNN_FORWARD_CPU_EXTENSION) {
const int inbatchStride = ib.dim[0].stride;
const int outBatchStide = ROUND_UP(channel, ARMV82_CHANNEL_UNIT) * area;
for (int i = 0; i < batch; ++i) {
MNNNC4HW4TONC8HW8((uint16_t*)ob.host + outBatchStide * i, (const float*)ib.host + inbatchStride * i, area,
channel);
// Use a CPU-side copy through a staging tensor so source and destination end up in the same format
std::shared_ptr<Tensor> tempTensor;
if (srcType == MNN_FORWARD_CPU) {
tempTensor.reset(Tensor::create<float>(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor)));
MNNCPUCopyBuffer(srcTensor, tempTensor.get());
srcTensor = tempTensor.get();
source = dest;
} else {
tempTensor.reset(Tensor::create<float>(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) {
auto tempT = (Tensor*)ptr;
MNNCPUCopyBuffer(tempT, dstTensor);
delete tempT;
});
dstTensor = tempTensor.get();
dest = source;
}
if (source == MNN_DATA_FORMAT_NC4HW4) {
// NC4HW4 <-> NC8HW8
int area = 1;
int channel = srcTensor->length(1);
for (int axis = 2; axis < ib.dimensions; ++axis) {
area *= srcTensor->length(axis);
}
const int batch = srcTensor->length(0);
if (srcType == MNN_FORWARD_CPU) {
const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
const int inbatchStride = UP_DIV(channel, 4) * area * 4;
for (int i = 0; i < batch; ++i) {
MNNNC4HW4TONC8HW8(dstTensor->host<uint16_t>() + outBatchStride * i, srcTensor->host<float>() + inbatchStride * i, area,
channel);
}
} else {
const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
const int outBatchStide = UP_DIV(channel, 4) * area * 4;
for (int i = 0; i < batch; ++i) {
MNNNC8HW8TONC4HW4(dstTensor->host<float>() + outBatchStide * i, srcTensor->host<uint16_t>() + inbatchStride * i, area,
channel);
}
}
return;
}
//MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
// The formats now match, so only an fp32 <-> fp16 conversion is needed
const int elemenSize = srcTensor->elementSize();
// copy and quantize/dequantize data
// cpu -> arm82 copy
if (srcType == MNN_FORWARD_CPU) {
const auto src = srcTensor->host<float>();
auto dst = dstTensor->host<FLOAT16>();
MNNQuantizeFP16(dst, src, elemenSize);
return;
}
// arm82 -> cpu copy
if (srcType == MNN_FORWARD_CPU_EXTENSION) {
const auto src = srcTensor->host<int16_t>();
auto dst = dstTensor->host<float>();
MNNDequantizeFP16(dst, src, elemenSize);
return;
}
MNN_ERROR("Invalide copy for intenal Arm82 Backend\n");
return;
}
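The heart of the fix is the staging tensor with a custom deleter: when the destination is the plain CPU backend, the fp16 data is first converted into a temporary fp32 tensor, and the shared_ptr deleter runs MNNCPUCopyBuffer into the real destination once the temporary goes out of scope. A minimal self-contained sketch of that pattern follows; it uses plain std::vector buffers and hypothetical convert/copy helpers instead of MNN tensors, so it only illustrates the ownership trick, not the actual layout conversion.

#include <cstdio>
#include <memory>
#include <vector>

// Hypothetical stand-ins for the fp16 -> fp32 conversion and the CPU-side copy.
static void convertFp16ToFp32(const std::vector<int>& src, std::vector<float>& dst) {
    for (size_t i = 0; i < src.size(); ++i) {
        dst[i] = static_cast<float>(src[i]); // placeholder for MNNDequantizeFP16
    }
}
static void cpuCopy(const std::vector<float>& src, std::vector<float>& dst) {
    dst = src; // placeholder for MNNCPUCopyBuffer (which would also reorder the layout)
}

int main() {
    std::vector<int>   arm82Src = {1, 2, 3, 4}; // pretend fp16 payload on the arm82 side
    std::vector<float> realDst(arm82Src.size());

    {
        // Staging buffer: its deleter forwards the staged data to the real
        // destination after the conversion below has filled it in.
        std::shared_ptr<std::vector<float>> staging(
            new std::vector<float>(arm82Src.size()),
            [&realDst](std::vector<float>* p) {
                cpuCopy(*p, realDst);
                delete p;
            });
        convertFp16ToFp32(arm82Src, *staging);
    } // staging destroyed here -> cpuCopy runs

    printf("%g %g %g %g\n", realDst[0], realDst[1], realDst[2], realDst[3]);
    return 0;
}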

View File

@@ -9,12 +9,8 @@
#include "backend/arm82/Arm82OptFunc.hpp"
#include "core/Macro.h"
#include "half.hpp"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size) {
#ifdef MNN_USE_NEON
int sizeDiv4 = size / 4;
int remain = size - sizeDiv4 * 4;
@@ -27,12 +23,28 @@ void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size) {
dst[i] = half_float::half(src[i]);
}
}
}
#else
for (int i = 0; i < size; ++i) {
dst[i] = half_float::half(src[i]);
void MNNDequantizeFP16(float* dst, const int16_t* srcint, int size) {
auto src = (const FLOAT16*)srcint;
int sizeDiv4 = size / 4;
int remain = size - sizeDiv4 * 4;
for (int i = 0; i < sizeDiv4; ++i) {
auto S = vld1_f16(src);
auto D = vcvt_f32_f16(S);
vst1q_f32(dst, D);
dst += 4;
src += 4;
}
if (remain > 0) {
FLOAT16 tempSrc[4];
float tempDst[4];
::memcpy(tempSrc, src, remain * sizeof(int16_t));
auto S = vld1_f16(tempSrc);
auto D = vcvt_f32_f16(S);
vst1q_f32(tempDst, D);
::memcpy(dst, tempDst, remain * sizeof(float));
}
#endif
}
void MNNNC4HW4TONC8HW8(uint16_t* dst, const float* source, size_t plane, size_t channel) {
@@ -157,4 +169,4 @@ void MNNNC8HW8TONCHW_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plan
MNNUnpackUNIT<uint16_t, uint16_t, 8>(dest, source, plane, channel);
}
#endif
#endif
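The MNNDequantizeFP16 added above uses the NEON intrinsics vld1_f16/vcvt_f32_f16 directly. As a reference for what it computes, here is a scalar, platform-independent sketch that widens IEEE-754 binary16 bits to float; the helper names are hypothetical and not part of this commit.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen one IEEE-754 binary16 value (raw bits) to float.
static float halfBitsToFloat(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    uint32_t exp  = (h >> 10) & 0x1Fu;
    uint32_t mant = h & 0x3FFu;
    uint32_t bits;
    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                                   // signed zero
        } else {
            int e = 1;                                     // subnormal: renormalize
            while ((mant & 0x400u) == 0) {
                mant <<= 1;
                --e;
            }
            mant &= 0x3FFu;
            bits = sign | ((uint32_t)(e + 112) << 23) | (mant << 13);
        }
    } else if (exp == 31) {
        bits = sign | 0x7F800000u | (mant << 13);          // inf / nan
    } else {
        bits = sign | ((exp + 112u) << 23) | (mant << 13); // normal: rebias 15 -> 127
    }
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}

// Scalar counterpart of MNNDequantizeFP16(float* dst, const int16_t* src, int size).
static void dequantizeFP16Scalar(float* dst, const int16_t* src, int size) {
    for (int i = 0; i < size; ++i) {
        dst[i] = halfBitsToFloat((uint16_t)src[i]);
    }
}

int main() {
    const int16_t src[4] = {(int16_t)0x3C00, (int16_t)0xC000, (int16_t)0x0001, 0};
    float dst[4];
    dequantizeFP16Scalar(dst, src, 4);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 1 -2 5.96046e-08 0
    return 0;
}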

View File

@@ -23,6 +23,7 @@ void MNNGemmFP16C8_UNIT(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight,
void MNNShuffleChannelC8(FLOAT16* dst, const FLOAT16* src, size_t size, size_t halfFlag);
void MNNQuantizeFP16_UNIT4(FLOAT16* dst, const float* src, int size);
void MNNDequantizeFP16(float* dst, const int16_t* src, int size);
#ifdef __cplusplus
}
@@ -76,4 +77,4 @@ void MNNUnpackUNIT(TOUT* dst, const TIN* src, size_t area, size_t depth) {
#endif
#endif
#endif

View File

@@ -502,4 +502,19 @@ void TensorUtils::adjustTensorForCompability(Tensor* newTensor) {
}
}
Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) {
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
switch (format) {
case MNN_DATA_FORMAT_NCHW:
return Tensor::CAFFE;
case MNN_DATA_FORMAT_NC4HW4:
return Tensor::CAFFE_C4;
case MNN_DATA_FORMAT_NHWC:
return Tensor::TENSORFLOW;
default:
break;
}
return Tensor::TENSORFLOW;
}
} // namespace MNN
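getDimType was added so the new onCopyBuffer can build a CPU staging tensor whose dimension type matches the tensor it stands in for. A short usage sketch, assuming it is compiled inside the MNN source tree; the makeStaging helper is only illustrative and not part of this commit.

#include <memory>

#include <MNN/Tensor.hpp>
#include "core/TensorUtils.hpp"

// Create a float staging tensor mirroring t's shape and dimension type,
// as the fixed onCopyBuffer does before converting between fp16 and fp32.
static std::shared_ptr<MNN::Tensor> makeStaging(const MNN::Tensor* t) {
    auto dimType = MNN::TensorUtils::getDimType(t);
    return std::shared_ptr<MNN::Tensor>(
        MNN::Tensor::create<float>(t->shape(), nullptr, dimType));
}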

View File

@@ -138,6 +138,7 @@ public:
static bool reshapeSlice(Tensor::InsideDescribe::Region& slice, int outside, int inside, int axis);
static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg);
static void adjustTensorForCompability(Tensor* t);
static Tensor::DimensionType getDimType(const Tensor* t);
};
} // namespace MNN

View File

@@ -446,24 +446,22 @@ public:
if (nullptr == creator) {
continue;
}
MNN::Backend::Info info;
info.type = type;
BackendConfig user;
user.precision = MNN::BackendConfig::Precision_High;
info.user = &user;
std::shared_ptr<Runtime> runtime(creator->onCreate(info));
MNN_PRINT("Test %d Backend\n", type);
std::shared_ptr<Backend> bn(runtime->onCreate());
// uint8
// auto res = nhwc_2_nhwc_uint8(bn);
auto res = NC4HW4_2_NC4HW4_float(bn);
res = res && nhwc_2_NC4HW4_2_nhwc_float(bn);
if (!res) {
MNN_ERROR("Error for %d bn\n", i);
return false;
for (int p = 0; p < 3; ++p) {
MNN::Backend::Info info;
info.type = type;
BackendConfig user;
user.precision = (MNN::BackendConfig::PrecisionMode)p;
info.user = &user;
std::shared_ptr<Runtime> runtime(creator->onCreate(info));
MNN_PRINT("Test %d Backend for %d \n", type, user.precision);
std::shared_ptr<Backend> bn(runtime->onCreate());
auto res = NC4HW4_2_NC4HW4_float(bn);
res = res && nhwc_2_NC4HW4_2_nhwc_float(bn);
if (!res) {
MNN_ERROR("Error for %d bn\n", i);
return false;
}
}
// NC4HW4_2_NC4HW4_uint8(bn);
}
return true;
}