mirror of https://github.com/alibaba/MNN.git
Fix bug for Arm82Backend's onCopyBuffer
This commit is contained in:
parent 160f0ec778
commit 04530e55ba
@@ -68,13 +68,7 @@ Execution* Arm82Backend::onCreate(const std::vector<Tensor*>& inputs, const std:
    return exe;
}

bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    // arm82 backend tensor data type is fp16 default
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    if (buffer.type != halide_type_of<float>()) {
        return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
    }
static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
    // The default data type of an input tensor for the arm82 backend is FLOAT32.
    // However, Arm82Backend's default data type is FLOAT16, so check whether the data type is FLOAT32,
    // and then divide the size by 2.
@@ -82,12 +76,22 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
    const int dimensions = buffer.dimensions;
    for (int i = 0; i < dimensions; i++) {
        int currentDimSize = buffer.dim[i].extent;
        if (TensorUtils::getDescribe(tensor)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
        if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
            currentDimSize = ALIGN_UP8(currentDimSize);
        }
        size *= currentDimSize;
    }
    auto res = allocBuffer(size, (Tensor*)nativeTensor, storageType);
    return size;
}

bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    // arm82 backend tensor data type is fp16 default
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    if (buffer.type != halide_type_of<float>()) {
        return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
    }
    auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType);
    if (!res) {
        return false;
    }
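For reference, _getAliginSize above computes the byte size of an Arm82 buffer: the channel dimension (index 1) of an NC4HW4 tensor is padded up to a multiple of 8 because activations are stored as NC8HW8, and each element is presumably 2 bytes (fp16). A minimal standalone sketch of that computation, using hypothetical names rather than the repository's code:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Sketch only: mirrors the size computation performed by _getAliginSize.
    static size_t alignedFp16Bytes(const std::vector<int>& dims, bool isNC4HW4) {
        size_t size = sizeof(uint16_t);        // one fp16 element, 2 bytes
        for (size_t i = 0; i < dims.size(); ++i) {
            size_t extent = (size_t)dims[i];
            if (isNC4HW4 && i == 1) {
                extent = (extent + 7) / 8 * 8; // ALIGN_UP8: pad channels to a multiple of 8
            }
            size *= extent;
        }
        return size;
    }

For example, a 1x3x224x224 NC4HW4 tensor would come out as 1 * 8 * 224 * 224 * 2 bytes, since the 3 channels are padded to 8.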
@@ -95,56 +99,13 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
    buffer.device = 1;
    return true;
}

void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto ib = srcTensor->buffer();
    auto ob = dstTensor->buffer();
    if (ib.type.code != halide_type_float) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    //MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
    auto fastMode = source == dest && (source == MNN_DATA_FORMAT_NCHW || source == MNN_DATA_FORMAT_NHWC);
    //MNN_PRINT("%d -> %d, %d\n", source, dest, fastMode);
    if (ib.dimensions <= 1 || fastMode) {
        const int elemenSize = srcTensor->elementSize();
        // if not float, just copy data
        if(ib.type != halide_type_of<float>()){
            memcpy(dstTensor->host<char>(), srcTensor->host<char>(), srcTensor->size());
            return;
        }
        // copy and quantize/dequantize data
        // cpu -> arm82 copy
        if (srcType == MNN_FORWARD_CPU || dstType == MNN_FORWARD_CPU_EXTENSION) {
            const auto src = srcTensor->host<float>();
            auto dst = dstTensor->host<FLOAT16>();
            MNNQuantizeFP16(dst, src, elemenSize);
            return;
        }
        // arm82 -> cpu copy
        if (srcType == MNN_FORWARD_CPU_EXTENSION || dstType == MNN_FORWARD_CPU) {
            const auto src = srcTensor->host<half_float::half>();
            auto dst = dstTensor->host<float>();
            for (int i = 0; i < elemenSize; ++i) {
                dst[i] = float(src[i]);
            }
            return;
        }
        MNN_ASSERT(false);
    }

static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& ob, MNN_DATA_FORMAT source, MNN_DATA_FORMAT dest) {
    int area = 1;
    int channel = 0;
    if (source == dest) {
        ::memcpy(ob.host, ib.host, _getAliginSize(ib, source));
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4 || source == MNN_DATA_FORMAT_NCHW) {
        channel = ib.dim[1].extent;
        for (int axis = 2; axis < ib.dimensions; ++axis) {
@@ -166,16 +127,9 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor
        const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        const int outBatchStide = channel * area;

        if(srcType == MNN_FORWARD_CPU_EXTENSION && dstType == MNN_FORWARD_CPU_EXTENSION){
            for (int i = 0; i < batch; ++i) {
                MNNNC8HW8TONCHW_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
                                        channel);
            }
        }else{
            for (int i = 0; i < batch; ++i) {
                MNNNC8HW8TONCHW((float*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
                                channel);
            }
        for (int i = 0; i < batch; ++i) {
            MNNNC8HW8TONCHW_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
                                    channel);
        }
        return;
    }
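The MNNNC8HW8TONCHW family used in this hunk converts between NCHW and the NC8HW8 packing, which (assuming MNN's usual channel-blocking convention, as suggested by MNNUnpackUNIT<uint16_t, uint16_t, 8> later in this diff) interleaves channels in blocks of 8 per spatial position. A rough scalar sketch of the unpack direction for a single batch, with hypothetical names:

    #include <cstddef>
    #include <cstdint>

    // Sketch only: scalar unpack of one batch from NC8HW8 back to NCHW.
    // The real MNNNC8HW8TONCHW additionally widens fp16 to fp32.
    static void unpackNC8HW8ToNCHW(uint16_t* dst, const uint16_t* src, size_t area, size_t channel) {
        for (size_t c = 0; c < channel; ++c) {
            const size_t block = c / 8;
            const size_t lane  = c % 8;
            for (size_t p = 0; p < area; ++p) {
                dst[c * area + p] = src[(block * area + p) * 8 + lane];
            }
        }
    }

Judging from the call sites, the _NO_TYPE variant keeps the data as raw 16-bit values, while the typed variant also converts fp16 to fp32 while unpacking.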
@@ -183,54 +137,99 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor
    if (source == MNN_DATA_FORMAT_NCHW && dest == MNN_DATA_FORMAT_NC4HW4) {
        const int inbatchStride = channel * area;
        const int outBatchStide = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        if(dstType == MNN_FORWARD_CPU_EXTENSION && srcType == MNN_FORWARD_CPU_EXTENSION){
            for (int i = 0; i < batch; ++i) {
                MNNNCHWTONC8HW8_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
                                        channel);
            }
        }else{
            for (int i = 0; i < batch; ++i) {
                MNNNCHWTONC8HW8((uint16_t*)ob.host + outBatchStide * i, (const float*)ib.host + inbatchStride * i, area,
                                channel);
            }
        }
        return;
    }

    if (source == MNN_DATA_FORMAT_NC4HW4 && dest == MNN_DATA_FORMAT_NHWC) {
        const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        const int outBatchStide = channel * area;

        for (int i = 0; i < batch; ++i) {
            MNNNC8HW8TONHWC((float*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
            MNNNCHWTONC8HW8_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
                                    channel);
        }
        return;
    }

    // internal use
    // copy between CPUBackend and Arm82Backend
    // Arm82Backend -> CPUBackend (for ops Arm82Backend does not support, fall back to CPUBackend)
    MNN_ASSERT(source == dest && source == MNN_DATA_FORMAT_NC4HW4);
    if (srcType == MNN_FORWARD_CPU_EXTENSION || dstType == MNN_FORWARD_CPU) {
        const int inbatchStride = ROUND_UP(channel, ARMV82_CHANNEL_UNIT) * area;
        const int outBatchStide = ob.dim[0].stride;
        for (int i = 0; i < batch; ++i) {
            MNNNC8HW8TONC4HW4((float*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area,
                              channel);
    MNN_ERROR("Invalide format %d - %d copy for intenal Arm82 Backend\n", source, dest);
}

void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto& ib = srcTensor->buffer();
    auto& ob = dstTensor->buffer();
    if (ib.type.code != halide_type_float) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    if (srcType == dstType) {
        if (srcType == MNN_FORWARD_CPU) {
            MNNCPUCopyBuffer(srcTensor, dstTensor);
        } else {
            _convertFp16Inside(ib, ob, source, dest);
        }
        return;
    }

    if (srcType == MNN_FORWARD_CPU || dstType == MNN_FORWARD_CPU_EXTENSION) {
        const int inbatchStride = ib.dim[0].stride;
        const int outBatchStide = ROUND_UP(channel, ARMV82_CHANNEL_UNIT) * area;
        for (int i = 0; i < batch; ++i) {
            MNNNC4HW4TONC8HW8((uint16_t*)ob.host + outBatchStide * i, (const float*)ib.host + inbatchStride * i, area,
                              channel);
    // Use a CPU copy to convert to the same format first
    std::shared_ptr<Tensor> tempTensor;
    if (srcType == MNN_FORWARD_CPU) {
        tempTensor.reset(Tensor::create<float>(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor)));
        MNNCPUCopyBuffer(srcTensor, tempTensor.get());
        srcTensor = tempTensor.get();
        source = dest;
    } else {
        tempTensor.reset(Tensor::create<float>(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) {
            auto tempT = (Tensor*)ptr;
            MNNCPUCopyBuffer(tempT, dstTensor);
            delete tempT;
        });
        dstTensor = tempTensor.get();
        dest = source;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4) {
        // NC4HW4 <-> NC8HW8
        int area = 1;
        int channel = srcTensor->length(1);
        for (int axis = 2; axis < ib.dimensions; ++axis) {
            area *= srcTensor->length(axis);
        }
        const int batch = srcTensor->length(0);
        if (srcType == MNN_FORWARD_CPU) {
            const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
            const int inbatchStride = UP_DIV(channel, 4) * area * 4;
            for (int i = 0; i < batch; ++i) {
                MNNNC4HW4TONC8HW8(dstTensor->host<uint16_t>() + outBatchStride * i, srcTensor->host<float>() + inbatchStride * i, area,
                                  channel);
            }
        } else {
            const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
            const int outBatchStide = UP_DIV(channel, 4) * area * 4;
            for (int i = 0; i < batch; ++i) {
                MNNNC8HW8TONC4HW4(dstTensor->host<float>() + outBatchStide * i, srcTensor->host<uint16_t>() + inbatchStride * i, area,
                                  channel);
            }
        }
        return;
    }
    //MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
    // The format is the same, just convert fp32 <-> fp16
    const int elemenSize = srcTensor->elementSize();
    // copy and quantize/dequantize data
    // cpu -> arm82 copy
    if (srcType == MNN_FORWARD_CPU) {
        const auto src = srcTensor->host<float>();
        auto dst = dstTensor->host<FLOAT16>();
        MNNQuantizeFP16(dst, src, elemenSize);
        return;
    }
    // arm82 -> cpu copy
    if (srcType == MNN_FORWARD_CPU_EXTENSION) {
        const auto src = srcTensor->host<int16_t>();
        auto dst = dstTensor->host<float>();
        MNNDequantizeFP16(dst, src, elemenSize);
        return;
    }
    MNN_ERROR("Invalide copy for intenal Arm82 Backend\n");
    return;
}
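The rewritten onCopyBuffer above handles a cross-backend copy by first unifying the layout through a temporary fp32 tensor: when copying toward the Arm82 side, the source is staged with MNNCPUCopyBuffer; when copying away from it, the temporary is created with a custom deleter that performs the final MNNCPUCopyBuffer into the real destination once the fp16 data has been widened into it. After that, only NC4HW4 <-> NC8HW8 repacking or an element-wise MNNQuantizeFP16 / MNNDequantizeFP16 remains. A toy sketch of the custom-deleter idiom, with made-up names (not MNN API):

    #include <cstddef>
    #include <cstring>
    #include <memory>
    #include <vector>

    // Sketch only: the staging buffer's deleter flushes whatever was written
    // into it to the real destination when the shared_ptr releases it.
    static void copyThroughStaging(const float* src, float* realDst, size_t n) {
        std::shared_ptr<std::vector<float>> staging(
            new std::vector<float>(n),
            [realDst, n](std::vector<float>* tmp) {
                std::memcpy(realDst, tmp->data(), n * sizeof(float)); // final copy on release
                delete tmp;
            });
        std::memcpy(staging->data(), src, n * sizeof(float)); // write into staging first
    } // staging goes out of scope here, so the deleter copies into realDst

The design keeps the layout-handling logic in the existing CPU copy path instead of duplicating every format combination inside the Arm82 backend.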
@@ -9,12 +9,8 @@
#include "backend/arm82/Arm82OptFunc.hpp"
#include "core/Macro.h"
#include "half.hpp"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size) {
#ifdef MNN_USE_NEON

    int sizeDiv4 = size / 4;
    int remain = size - sizeDiv4 * 4;
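Only the prologue of MNNQuantizeFP16 is visible in this hunk. Its NEON body presumably converts four floats per iteration and finishes with a scalar tail, along the lines of the sketch below (an assumption about the implementation, not an excerpt from the file):

    #ifdef MNN_USE_NEON
    #include <arm_neon.h>

    // Sketch only: plausible NEON body for an fp32 -> fp16 quantize routine.
    static void quantizeFP16Sketch(__fp16* dst, const float* src, int size) {
        int sizeDiv4 = size / 4;
        int remain   = size - sizeDiv4 * 4;
        for (int i = 0; i < sizeDiv4; ++i) {
            float32x4_t s = vld1q_f32(src);   // load 4 floats
            float16x4_t d = vcvt_f16_f32(s);  // narrow to 4 halfs
            vst1_f16(dst, d);                 // store 4 halfs
            src += 4;
            dst += 4;
        }
        for (int i = 0; i < remain; ++i) {    // scalar tail
            dst[i] = (__fp16)src[i];
        }
    }
    #endif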
@@ -27,12 +23,28 @@ void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size) {
            dst[i] = half_float::half(src[i]);
        }
    }
}

#else
    for (int i = 0; i < size; ++i) {
        dst[i] = half_float::half(src[i]);
void MNNDequantizeFP16(float* dst, const int16_t* srcint, int size) {
    auto src = (const FLOAT16*)srcint;
    int sizeDiv4 = size / 4;
    int remain = size - sizeDiv4 * 4;
    for (int i = 0; i < sizeDiv4; ++i) {
        auto S = vld1_f16(src);
        auto D = vcvt_f32_f16(S);
        vst1q_f32(dst, D);
        dst += 4;
        src += 4;
    }
    if (remain > 0) {
        FLOAT16 tempSrc[4];
        float tempDst[4];
        ::memcpy(tempSrc, src, remain * sizeof(int16_t));
        auto S = vld1_f16(tempSrc);
        auto D = vcvt_f32_f16(S);
        vst1q_f32(tempDst, D);
        ::memcpy(dst, tempDst, remain * sizeof(float));
    }
#endif
}

void MNNNC4HW4TONC8HW8(uint16_t* dst, const float* source, size_t plane, size_t channel) {
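The MNNDequantizeFP16 added above relies on NEON intrinsics (vld1_f16 / vcvt_f32_f16). A portable scalar routine with the same semantics, assuming half_float::half from the bundled half.hpp as the fp16 representation, might look like this sketch:

    #include <cstdint>
    #include <cstring>
    #include "half.hpp"

    // Sketch only: reinterpret each 16-bit pattern as an IEEE half and widen to float.
    static void dequantizeFP16Scalar(float* dst, const int16_t* srcint, int size) {
        for (int i = 0; i < size; ++i) {
            half_float::half h;
            std::memcpy(&h, srcint + i, sizeof(int16_t)); // copy the raw fp16 bits
            dst[i] = float(h);
        }
    }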
@@ -157,4 +169,4 @@ void MNNNC8HW8TONCHW_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plan
    MNNUnpackUNIT<uint16_t, uint16_t, 8>(dest, source, plane, channel);
}

#endif
#endif
@@ -23,6 +23,7 @@ void MNNGemmFP16C8_UNIT(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight,

void MNNShuffleChannelC8(FLOAT16* dst, const FLOAT16* src, size_t size, size_t halfFlag);
void MNNQuantizeFP16_UNIT4(FLOAT16* dst, const float* src, int size);
void MNNDequantizeFP16(float* dst, const int16_t* src, int size);

#ifdef __cplusplus
}
@@ -76,4 +77,4 @@ void MNNUnpackUNIT(TOUT* dst, const TIN* src, size_t area, size_t depth) {

#endif

#endif
#endif
@@ -502,4 +502,19 @@ void TensorUtils::adjustTensorForCompability(Tensor* newTensor) {
    }
}

Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) {
    auto format = TensorUtils::getDescribe(t)->dimensionFormat;
    switch (format) {
        case MNN_DATA_FORMAT_NCHW:
            return Tensor::CAFFE;
        case MNN_DATA_FORMAT_NC4HW4:
            return Tensor::CAFFE_C4;
        case MNN_DATA_FORMAT_NHWC:
            return Tensor::TENSORFLOW;
        default:
            break;
    }
    return Tensor::TENSORFLOW;
}

} // namespace MNN
@@ -138,6 +138,7 @@ public:
    static bool reshapeSlice(Tensor::InsideDescribe::Region& slice, int outside, int inside, int axis);
    static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg);
    static void adjustTensorForCompability(Tensor* t);
    static Tensor::DimensionType getDimType(const Tensor* t);
};
} // namespace MNN
@@ -446,24 +446,22 @@ public:
        if (nullptr == creator) {
            continue;
        }
        MNN::Backend::Info info;
        info.type = type;
        BackendConfig user;
        user.precision = MNN::BackendConfig::Precision_High;
        info.user = &user;
        std::shared_ptr<Runtime> runtime(creator->onCreate(info));

        MNN_PRINT("Test %d Backend\n", type);
        std::shared_ptr<Backend> bn(runtime->onCreate());
        // uint8
        // auto res = nhwc_2_nhwc_uint8(bn);
        auto res = NC4HW4_2_NC4HW4_float(bn);
        res = res && nhwc_2_NC4HW4_2_nhwc_float(bn);
        if (!res) {
            MNN_ERROR("Error for %d bn\n", i);
            return false;
        for (int p = 0; p < 3; ++p) {
            MNN::Backend::Info info;
            info.type = type;
            BackendConfig user;
            user.precision = (MNN::BackendConfig::PrecisionMode)p;
            info.user = &user;
            std::shared_ptr<Runtime> runtime(creator->onCreate(info));
            MNN_PRINT("Test %d Backend for %d \n", type, user.precision);
            std::shared_ptr<Backend> bn(runtime->onCreate());
            auto res = NC4HW4_2_NC4HW4_float(bn);
            res = res && nhwc_2_NC4HW4_2_nhwc_float(bn);
            if (!res) {
                MNN_ERROR("Error for %d bn\n", i);
                return false;
            }
        }
        // NC4HW4_2_NC4HW4_uint8(bn);
    }
    return true;
}