MNN/source/backend/arm82/Arm82OptFunc.cpp

//
//  Arm82OptFunc.cpp
// MNN
//
// Created by MNN on 2019/02/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#if defined(__ANDROID__) || defined(__aarch64__)
#include "Arm82OptFunc.hpp"
#include "Arm82Vec.hpp"
#include "core/Macro.h"
#include "half.hpp"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
extern "C" {
void MNNExpFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* params, size_t blockCount);
void MNNQuantizeFP16_UNIT4(int16_t* dst, const float* src, int size);
}
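
// Fast exp(-x) in FP16. Whole blocks of 16 values go through the vector kernel
// MNNExpFP16 declared above; the scalar tail below applies the same scheme:
// clamp to [-11, 11], split x = div * ln(2) + t, build 2^div directly from the
// FP16 bit layout (exponent bias 15, 10 mantissa bits), and approximate e^t
// with a 5th-order Taylor polynomial.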
void Arm82MNNExp(FLOAT16* dst, const FLOAT16* src, size_t dataSize) {
    int blockCount = dataSize / 16;
    if (blockCount > 0) {
        static FLOAT16 params[] = {(FLOAT16)log(2.0f), (FLOAT16)(1.0f / log(2.0f)), 1.0f, 1.0f,
                                   0.5f, 1.0f / 6.0f, 1.0f / 24.0f, 1.0f / 120.0f};
        MNNExpFP16(dst, src, params, blockCount);
    }
    FLOAT16 xLimit = 11, expStep = log(2.0f), expStep_r = 1.0f / expStep;
    for (int i = blockCount * 16; i < dataSize; ++i) {
        auto x = -src[i];
        x = ALIMAX(x, -xLimit);
        x = ALIMIN(x, xLimit);
        int div = x * expStep_r, expBasicRaw = (div + 15) << 10;
        FLOAT16 t = x - div * expStep, expBasic = *(FLOAT16*)(&expBasicRaw);
        FLOAT16 expRemain = ((((1.0f / 120 * t + 1.0f / 24) * t + 1.0f / 6) * t + 0.5f) * t + 1.0f) * t + 1.0f;
        dst[i] = (FLOAT16)(expBasic * expRemain);
    }
}
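
// Packing geometry for the FP16 matmul kernels: eP/lP/hP are the tile sizes
// along the e (output point), l (reduction) and h (output channel) dimensions;
// the h tile is 16 on ARM64 and 8 on 32-bit ARM.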
void Arm82MNNGetMatMulPackMode(int* eP, int* lP, int* hP) {
#ifdef __aarch64__
    *hP = 16;
#else
    *hP = 8;
#endif
    *eP = 12;
    *lP = 1;
}
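
// Converts `size` floats to FP16 (stored as int16_t). Groups of 4 are handled
// by the MNNQuantizeFP16_UNIT4 kernel; a partial tail is bounced through small
// stack buffers so the kernel never touches memory past the caller's arrays.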
void MNNQuantizeFP16(const float* src, int16_t* dst, size_t size) {
    int sizeDiv4 = size / 4;
    int remain = size - sizeDiv4 * 4;
    if (sizeDiv4 > 0) {
        MNNQuantizeFP16_UNIT4(dst, src, sizeDiv4);
        src += sizeDiv4 * 4;
        dst += sizeDiv4 * 4;
    }
    if (remain > 0) {
        float tempSrc[4];
        int16_t tempDst[4];
        ::memcpy(tempSrc, src, remain * sizeof(float));
        MNNQuantizeFP16_UNIT4(tempDst, tempSrc, 1);
        ::memcpy(dst, tempDst, remain * sizeof(int16_t));
    }
}
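
// Converts FP16 values (stored as int16_t) back to float, 4 lanes at a time
// via vcvt_f32_f16; the tail again goes through stack buffers.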
void MNNDequantizeFP16(const int16_t* srcint, float* dst, size_t size) {
    auto src = (const FLOAT16*)srcint;
    int sizeDiv4 = size / 4;
    int remain = size - sizeDiv4 * 4;
    for (int i = 0; i < sizeDiv4; ++i) {
        auto S = vld1_f16(src);
        auto D = vcvt_f32_f16(S);
        vst1q_f32(dst, D);
        dst += 4;
        src += 4;
    }
    if (remain > 0) {
        FLOAT16 tempSrc[4];
        float tempDst[4];
        ::memcpy(tempSrc, src, remain * sizeof(int16_t));
        auto S = vld1_f16(tempSrc);
        auto D = vcvt_f32_f16(S);
        vst1q_f32(tempDst, D);
        ::memcpy(dst, tempDst, remain * sizeof(float));
    }
}
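
// Pack/unpack between the planar channel layout and the 8-channel-interleaved
// NC8HW8 layout used by the FP16 backend.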
void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t plane, size_t channel) {
    MNNPackUNIT<FLOAT16, FLOAT16, 8>(dest, source, plane, channel);
}
void MNNUnPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t plane, size_t channel) {
    MNNUnpackUNIT<FLOAT16, FLOAT16, 8>(dest, source, plane, channel);
}
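
// Converts a float NC4HW4 tensor to an FP16 NC8HW8 tensor: each pair of C4
// blocks is merged into one C8 block. The destination is zero-filled first so
// a trailing half-empty C8 block keeps zero padding. Conversion uses NEON
// vcvt on ARM64 and half_float elsewhere.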
void MNNNC4HW4TONC8HW8(FLOAT16* dst, const float* source, size_t plane, size_t channel) {
    const int c4 = UP_DIV(channel, 4);
    const int c8 = UP_DIV(channel, 8);
    memset(dst, 0, plane * c8 * 8 * sizeof(FLOAT16));
#if defined(MNN_USE_NEON) && defined(__aarch64__)
    auto dest = (float16_t*)dst;
#else
    auto dest = dst;
#endif
    for (int c = 0; c < c4; ++c) {
        int ci = c / 2;
        int cj = c % 2;
        auto dstChannel = dest + ci * 8 * plane + cj * 4;
        auto srcChannel = source + c * plane * 4;
        for (int i = 0; i < plane; ++i) {
#if defined(MNN_USE_NEON) && defined(__aarch64__)
            float32x4_t a = vld1q_f32(srcChannel + i * 4);
            vst1_f16(dstChannel + i * 8, vcvt_f16_f32(a));
#else
            half_float::half dataHalf[4];
            for (int k = 0; k < 4; ++k) {
                dataHalf[k] = srcChannel[i * 4 + k];
            }
            memcpy(dstChannel + i * 8, dataHalf, sizeof(half_float::half) * 4);
#endif
        }
    }
}
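
// Inverse of MNNNC4HW4TONC8HW8: splits each FP16 C8 block back into two float
// C4 blocks, using NEON vcvt where available.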
void MNNNC8HW8TONC4HW4(float* dest, const FLOAT16* src, size_t plane, size_t channel) {
    const int c4 = UP_DIV(channel, 4);
#if defined(MNN_USE_NEON) && defined(__aarch64__)
    auto source = (const float16_t*)src;
#else
    auto source = src;
#endif
    for (int c = 0; c < c4; ++c) {
        int ci = c / 2;
        int cj = c % 2;
        auto srcChannel = source + ci * 8 * plane + cj * 4;
        auto dstChannel = dest + c * plane * 4;
        for (int i = 0; i < plane; ++i) {
#if defined(MNN_USE_NEON) && defined(__aarch64__)
            float16x4_t a = vld1_f16(srcChannel + i * 8);
            vst1q_f32(dstChannel + i * 4, vcvt_f32_f16(a));
#else
            half_float::half dataHalf[4];
            memcpy(dataHalf, srcChannel + i * 8, sizeof(half_float::half) * 4);
            for (int k = 0; k < 4; ++k) {
                dstChannel[i * 4 + k] = float(dataHalf[k]);
            }
#endif
        }
    }
}
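
// Converts an FP16 NC8HW8 tensor to a plain float NHWC tensor: full C8 blocks
// are converted 8 channels at a time, and the remaining channel % 8 values are
// converted scalar-wise through half_float.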
void MNNNC8HW8TONHWC(float* dest, const FLOAT16* src, size_t plane, size_t channel) {
    int c = (int)channel;
    int cDiv8 = c / 8;
    int cAlign = cDiv8 * 8;
#if defined(MNN_USE_NEON) && defined(__aarch64__)
    auto source = (const float16_t*)src;
#else
    auto source = src;
#endif
    for (int hi = 0; hi < plane; ++hi) {
        const auto srcHeight = source + hi * 8;
        float* dstHeight = dest + hi * c;
        for (int ci = 0; ci < cDiv8; ++ci) {
#if defined(MNN_USE_NEON) && defined(__aarch64__)
            // Convert both halves of the 8-lane FP16 vector so all 8 channels are written.
            float16x8_t a = vld1q_f16(srcHeight + 8 * ci * plane);
            vst1q_f32(dstHeight + 8 * ci + 0, vcvt_f32_f16(vget_low_f16(a)));
            vst1q_f32(dstHeight + 8 * ci + 4, vcvt_f32_f16(vget_high_f16(a)));
#else
            half_float::half dataHalf[8];
            memcpy(dataHalf, srcHeight + 8 * ci * plane, 8 * sizeof(FLOAT16));
            for (int i = 0; i < 8; ++i) {
                dstHeight[ci * 8 + i] = float(dataHalf[i]);
            }
#endif
        }
    }
    if (cAlign == c) {
        return;
    }
    int cRemain = c - cAlign;
    const auto srcAlign = reinterpret_cast<const half_float::half*>(source + plane * cAlign);
    auto dstAlign = dest + cAlign;
    for (int hi = 0; hi < plane; ++hi) {
        const auto srcHeight = srcAlign + hi * 8;
        float* dstHeight = dstAlign + hi * c;
        for (int ci = 0; ci < cRemain; ++ci) {
            dstHeight[ci] = float(srcHeight[ci]);
        }
    }
}
#endif