//
// VecHalf.hpp
// MNN
//
// Created by MNN on 2021/01/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef VecHalf_hpp
#define VecHalf_hpp
#include "core/Macro.h"
#include <stdint.h>
#include <array>
#include <algorithm> // for std::max and std::min
namespace MNN {
namespace Math {
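
// VecHalf<N> holds N 16-bit floating-point lanes (bf16 by default) widened to
// float32 for arithmetic: a bf16 value is the upper 16 bits of an IEEE-754
// float32, so load/save convert by shifting the 16-bit pattern left/right by 16 bits.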
template <int N>
struct VecHalf {
    using VecType = VecHalf<N>;
    std::array<float, N> value;
    VecType operator+(const VecType& lr) const {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = value[i] + lr.value[i];
        }
        return dst;
    }
    VecType operator-(const VecType& lr) const {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = value[i] - lr.value[i];
        }
        return dst;
    }
    VecType operator*(const VecType& lr) const {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = value[i] * lr.value[i];
        }
        return dst;
    }
    VecType operator*(float lr) const {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = value[i] * lr;
        }
        return dst;
    }
    VecType& operator=(const VecType& lr) {
        for (int i = 0; i < N; ++i) {
            value[i] = lr.value[i];
        }
        return *this;
    }
    VecType operator-() {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = -value[i];
        }
        return dst;
    }
    VecHalf() {
    }
    VecHalf(const float v) {
        for (int i = 0; i < N; ++i) {
            value[i] = v;
        }
    }
    VecHalf(std::array<float, N>&& v) {
        value = std::move(v);
    }
    VecHalf(const VecType& lr) {
        for (int i = 0; i < N; ++i) {
            value[i] = lr.value[i];
        }
    }
    float operator[](size_t i) {
        return value[i];
    }
    static VecType broadcast(int16_t val) {
        VecType v;
        auto tempV = (int32_t*)v.value.data();
        for (int i = 0; i < N; ++i) {
            tempV[i] = val << 16;
        }
        return v;
    }
    static VecType load(const int16_t* addr) {
        VecType v;
        auto tempV = (int32_t*)v.value.data();
        for (int i = 0; i < N; ++i) {
            tempV[i] = addr[i] << 16;
        }
        return v;
    }
    static void save(int16_t* addr, const VecType& v) {
        auto tempV = (int32_t*)v.value.data();
        for (int i = 0; i < N; ++i) {
            addr[i] = tempV[i] >> 16;
        }
    }
    static VecType max(const VecType& v1, const VecType& v2) {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = std::max(v1.value[i], v2.value[i]);
        }
        return dst;
    }
    static VecType min(const VecType& v1, const VecType& v2) {
        VecType dst;
        for (int i = 0; i < N; ++i) {
            dst.value[i] = std::min(v1.value[i], v2.value[i]);
        }
        return dst;
    }
    static inline void transpose4(VecType& vec0, VecType& vec1, VecType& vec2, VecType& vec3) {
        VecType source[4] = {vec0, vec1, vec2, vec3};
        for (int i = 0; i < N; ++i) {
            vec0.value[i] = source[i % 4].value[i >> 2];
            vec1.value[i] = source[i % 4].value[(i + N) >> 2];
            vec2.value[i] = source[i % 4].value[(i + 2 * N) >> 2];
            vec3.value[i] = source[i % 4].value[(i + 3 * N) >> 2];
        }
    }
    static inline void transpose12(int16_t* srcPtr, const size_t packCUnit) {
        MNN_ASSERT(false);
    }
};
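
// Usage sketch (illustrative only, not part of MNN's API): load bf16 lanes as
// float, do the arithmetic in float32, then store the result back as bf16.
// src0, src1 and dst are hypothetical caller-provided buffers of at least 4 elements.
//
//     auto a = VecHalf<4>::load(src0);
//     auto b = VecHalf<4>::load(src1);
//     VecHalf<4>::save(dst, a * b + VecHalf<4>(0.5f));
//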
#if defined(MNN_USE_SSE)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
template<>
struct VecHalf<4> {
    using VecType = VecHalf<4>;
    __m128 value;
    VecType operator+(const VecType& lr) const {
        VecType dst = { _mm_add_ps(value, lr.value) };
        return dst;
    }
    VecType operator-(const VecType& lr) const {
        VecType dst = { _mm_sub_ps(value, lr.value) };
        return dst;
    }
    VecType operator*(const VecType& lr) const {
        VecType dst = { _mm_mul_ps(value, lr.value) };
        return dst;
    }
    VecType operator*(float lr) const {
        VecType dst = { _mm_mul_ps(value, _mm_set1_ps(lr)) };
        return dst;
    }
    VecType& operator=(const VecType& lr) {
        value = lr.value;
        return *this;
    }
    VecType operator-() {
        VecType dst;
#if defined(_MSC_VER)
        // Unary minus on an SSE vector is a GCC extension; MSVC needs an explicit sign-bit flip.
        dst.value = _mm_xor_ps(value, _mm_set1_ps(-0.f));
#else
        dst.value = -value;
#endif
        return dst;
    }
    VecHalf() {
    }
    VecHalf(const float v) {
        value = _mm_set1_ps(v);
    }
    VecHalf(__m128& v) {
        value = v;
    }
    VecHalf(__m128&& v) {
        value = std::move(v);
    }
    VecHalf(const VecType& lr) {
        value = lr.value;
    }
    VecHalf(VecType&& lr) {
        value = std::move(lr.value);
    }
    float operator[](size_t i) {
#if defined(_MSC_VER)
        // x64 only mandates SSE and SSE2, which provide no intrinsic to extract an element by a runtime index.
        float temp[4];
        _mm_storeu_ps(temp, value);
        return temp[i];
#else
        return value[i];
#endif
    }
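    // When MNN_SSE_USE_FP16_INSTEAD is defined, the 16-bit values are treated as IEEE fp16
    // and converted with F16C (_mm_cvtph_ps / _mm_cvtps_ph); otherwise they are treated as
    // bf16 and widened by placing the 16-bit pattern in the upper half of each 32-bit lane.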
    static VecType broadcast(int16_t val) {
        auto temp = _mm_set1_epi16(val);
#ifndef MNN_SSE_USE_FP16_INSTEAD
        auto zero = _mm_xor_si128(temp, temp);
        auto res = _mm_castsi128_ps(_mm_unpacklo_epi16(zero, temp));
#else
        auto res = _mm_cvtph_ps(temp);
#endif
        VecType v = { std::move(res) };
        return v;
    }
    static VecType load(const int16_t* addr) {
        auto temp = _mm_loadl_epi64((__m128i*)addr);
#ifndef MNN_SSE_USE_FP16_INSTEAD
        auto zero = _mm_xor_si128(temp, temp);
        auto res = _mm_castsi128_ps(_mm_unpacklo_epi16(zero, temp));
#else
        auto res = _mm_cvtph_ps(temp);
#endif
        VecType v = { std::move(res) };
        return v;
    }
    static void save(int16_t* addr, const VecType& v) {
#ifndef MNN_SSE_USE_FP16_INSTEAD
        auto temp = _mm_castps_si128(v.value);
        temp = _mm_srai_epi32(temp, 16);
        temp = _mm_packs_epi32(temp, temp);
#else
        static __m128 gMinValue = _mm_set1_ps(-32768);
        static __m128 gMaxValue = _mm_set1_ps(32767);
        auto t = _mm_max_ps(v.value, gMinValue);
        t = _mm_min_ps(t, gMaxValue);
        auto temp = _mm_cvtps_ph(t, 0x8);
#endif
        _mm_storel_epi64((__m128i*)addr, temp);
    }
    static VecType max(const VecType& v1, const VecType& v2) {
        VecType dst = { _mm_max_ps(v1.value, v2.value) };
        return dst;
    }
    static VecType min(const VecType& v1, const VecType& v2) {
        VecType dst = { _mm_min_ps(v1.value, v2.value) };
        return dst;
    }
    static inline void transpose4(VecType& vec0, VecType& vec1, VecType& vec2, VecType& vec3) {
        __m128 tmp3, tmp2, tmp1, tmp0;
        tmp0 = _mm_unpacklo_ps(vec0.value, vec1.value);
        tmp2 = _mm_unpacklo_ps(vec2.value, vec3.value);
        tmp1 = _mm_unpackhi_ps(vec0.value, vec1.value);
        tmp3 = _mm_unpackhi_ps(vec2.value, vec3.value);
        vec0.value = _mm_movelh_ps(tmp0, tmp2);
        vec1.value = _mm_movehl_ps(tmp2, tmp0);
        vec2.value = _mm_movelh_ps(tmp1, tmp3);
        vec3.value = _mm_movehl_ps(tmp3, tmp1);
    }
    // transpose12 is never used on the x86 path.
    static inline void transpose12(int16_t* srcPtr, const size_t packCUnit) {
        MNN_ASSERT(false);
    }
};
#endif

#if defined(MNN_USE_NEON)
#include <arm_neon.h>
template<>
struct VecHalf<4> {
    using VecType = VecHalf<4>;
    float32x4_t value;
    VecType operator+(const VecType& lr) const {
        VecType dst = { vaddq_f32(value, lr.value) };
        return dst;
    }
    VecType operator-(const VecType& lr) const {
        VecType dst = { vsubq_f32(value, lr.value) };
        return dst;
    }
    VecType operator*(const VecType& lr) const {
        VecType dst = { vmulq_f32(value, lr.value) };
        return dst;
    }
    VecType operator*(const float lr) const {
        VecType dst = { vmulq_f32(value, vdupq_n_f32(lr)) };
        return dst;
    }
    VecType& operator=(const VecType& lr) {
        value = lr.value;
        return *this;
    }
    VecType operator-() {
        VecType dst = { vnegq_f32(value) };
        return dst;
    }
    VecHalf() {
    }
    VecHalf(const float v) {
        value = vdupq_n_f32(v);
    }
    VecHalf(float32x4_t& v) {
        value = v;
    }
    VecHalf(float32x4_t&& v) {
        value = std::move(v);
    }
    VecHalf(const VecType& lr) {
        value = lr.value;
    }
    VecHalf(VecType&& lr) {
        value = std::move(lr.value);
    }
    float operator[](const int i) {
        // vgetq_lane_f32(value, i) cannot be used here: the lane index must be a compile-time constant.
        return value[i];
    }
    static VecType broadcast(int16_t val) {
        VecType dst = { vreinterpretq_f32_s32(vshll_n_s16(vdup_n_s16(val), 16)) };
        return dst;
    }
    static VecType load(const int16_t* addr) {
        // Equivalent to:
        //   int16x4_t vec4s16 = vld1_s16(addr);                    // load bf16 data as 16-bit fixed-point values
        //   int32x4_t vec4s32 = vshll_n_s16(vec4s16, 16);          // shift left by 16 bits to widen to 32-bit
        //   float32x4_t vec4f32 = vreinterpretq_f32_s32(vec4s32);  // reinterpret the 32-bit pattern as float32
        //   VecType dst = { vec4f32 };                             // construct a VecType
        VecType dst = { vreinterpretq_f32_s32(vshll_n_s16(vld1_s16(addr), 16)) };
        return dst;
    }
    static void save(int16_t* addr, const VecType& v) {
        vst1_s16(addr, vshrn_n_s32(vreinterpretq_s32_f32(v.value), 16));
    }
    static VecType max(const VecType& v1, const VecType& v2) {
        VecType dst = { vmaxq_f32(v1.value, v2.value) };
        return dst;
    }
    static VecType min(const VecType& v1, const VecType& v2) {
        VecType dst = { vminq_f32(v1.value, v2.value) };
        return dst;
    }
    static inline void transpose4(VecType& vec0, VecType& vec1, VecType& vec2, VecType& vec3) {
#ifdef __aarch64__
        auto m0 = vtrn1q_s32(vec0.value, vec1.value);
        auto m1 = vtrn2q_s32(vec0.value, vec1.value);
        auto m2 = vtrn1q_s32(vec2.value, vec3.value);
        auto m3 = vtrn2q_s32(vec2.value, vec3.value);
        vec0.value = vtrn1q_s64(m0, m2);
        vec1.value = vtrn1q_s64(m1, m3);
        vec2.value = vtrn2q_s64(m0, m2);
        vec3.value = vtrn2q_s64(m1, m3);
#else
        auto m0m1 = vtrnq_s32(vec0.value, vec1.value);
        auto m2m3 = vtrnq_s32(vec2.value, vec3.value);
        vec0.value = m0m1.val[0];
        vec1.value = m0m1.val[1];
        vec2.value = m2m3.val[0];
        vec3.value = m2m3.val[1];
        vec0.value = vsetq_lane_s64(vgetq_lane_s64(m2m3.val[0], 0), vec0.value, 1);
        vec1.value = vsetq_lane_s64(vgetq_lane_s64(m2m3.val[1], 0), vec1.value, 1);
        vec2.value = vsetq_lane_s64(vgetq_lane_s64(m0m1.val[0], 1), vec2.value, 0);
        vec3.value = vsetq_lane_s64(vgetq_lane_s64(m0m1.val[1], 1), vec3.value, 0);
        /*
        The generated arm32 assembly is almost the same as:
            vtrn.32 d0, d2
            vtrn.32 d1, d3
            vtrn.32 d4, d6
            vtrn.32 d5, d7
            vswp d1, d4
            vswp d3, d6
        */
#endif
    }
    static inline void transpose4(int16x4_t& vec0, int16x4_t& vec1, int16x4_t& vec2, int16x4_t& vec3) {
        auto trans0 = vtrn_s16(vec0, vec1);
        auto m0 = trans0.val[0];
        auto m1 = trans0.val[1];
        auto trans1 = vtrn_s16(vec2, vec3);
        auto m2 = trans1.val[0];
        auto m3 = trans1.val[1];
        auto trans2 = vtrn_s32(m0, m2);
        vec0 = trans2.val[0];
        vec2 = trans2.val[1];
        auto trans3 = vtrn_s32(m1, m3);
        vec1 = trans3.val[0];
        vec3 = trans3.val[1];
    }
    static inline void transpose12(int16_t* srcPtr, const size_t packCUnit) {
        auto s0  = vld1_s16(srcPtr + 0 * packCUnit);
        auto s3  = vld1_s16(srcPtr + 1 * packCUnit);
        auto s6  = vld1_s16(srcPtr + 2 * packCUnit);
        auto s9  = vld1_s16(srcPtr + 3 * packCUnit);
        auto s1  = vld1_s16(srcPtr + 4 * packCUnit);
        auto s4  = vld1_s16(srcPtr + 5 * packCUnit);
        auto s7  = vld1_s16(srcPtr + 6 * packCUnit);
        auto s10 = vld1_s16(srcPtr + 7 * packCUnit);
        auto s2  = vld1_s16(srcPtr + 8 * packCUnit);
        auto s5  = vld1_s16(srcPtr + 9 * packCUnit);
        auto s8  = vld1_s16(srcPtr + 10 * packCUnit);
        auto s11 = vld1_s16(srcPtr + 11 * packCUnit);
        transpose4(s0, s3, s6, s9);
        transpose4(s1, s4, s7, s10);
        transpose4(s2, s5, s8, s11);
        vst1_s16(srcPtr + 0 * packCUnit, s0);
        vst1_s16(srcPtr + 1 * packCUnit, s1);
        vst1_s16(srcPtr + 2 * packCUnit, s2);
        vst1_s16(srcPtr + 3 * packCUnit, s3);
        vst1_s16(srcPtr + 4 * packCUnit, s4);
        vst1_s16(srcPtr + 5 * packCUnit, s5);
        vst1_s16(srcPtr + 6 * packCUnit, s6);
        vst1_s16(srcPtr + 7 * packCUnit, s7);
        vst1_s16(srcPtr + 8 * packCUnit, s8);
        vst1_s16(srcPtr + 9 * packCUnit, s9);
        vst1_s16(srcPtr + 10 * packCUnit, s10);
        vst1_s16(srcPtr + 11 * packCUnit, s11);
    }
};
#endif

} // namespace Math
} // namespace MNN

#endif // VecHalf_hpp