2020-11-05 16:41:56 +08:00
//
// Vec.hpp
// MNN
//
// Created by MNN on 2019/04/01.
// Copyright © 2018, Alibaba Group Holding Limited
//
# ifndef Vec_hpp
# define Vec_hpp
# include "core/Macro.h"
2024-11-18 14:37:45 +08:00
# include "core/SimdHeader.h"
2021-04-21 15:54:01 +08:00
# include <array>
2020-11-05 16:41:56 +08:00
# include <algorithm> // supply std::max and std::min
2023-06-16 09:42:45 +08:00
# include <math.h>
2020-11-05 16:41:56 +08:00
namespace MNN {
namespace Math {
template < typename T , int N >
struct Vec {
using VecType = Vec < T , N > ;
2021-04-21 15:54:01 +08:00
std : : array < T , N > value ;
2021-11-30 10:10:53 +08:00
VecType operator + ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = value [ i ] + lr . value [ i ] ;
}
return dst ;
}
2021-06-11 17:17:13 +08:00
VecType operator - ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = value [ i ] - lr . value [ i ] ;
}
return dst ;
}
2021-04-21 15:54:01 +08:00
Vec ( std : : array < T , N > & & v ) {
value = std : : move ( v ) ;
}
2023-10-18 10:31:02 +08:00
VecType operator = = ( const VecType & lr ) const {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
if ( value [ i ] = = lr . value [ i ] ) {
dst . value [ i ] = 1 ;
} else {
dst . value [ i ] = 0 ;
}
}
return dst ;
}
VecType operator < ( const VecType & lr ) const {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
if ( value [ i ] < lr . value [ i ] ) {
dst . value [ i ] = 1 ;
} else {
dst . value [ i ] = 0 ;
}
}
return dst ;
}
VecType operator < = ( const VecType & lr ) const {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
if ( value [ i ] < = lr . value [ i ] ) {
dst . value [ i ] = 1 ;
} else {
dst . value [ i ] = 0 ;
}
}
return dst ;
}
VecType operator > ( const VecType & lr ) const {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
if ( value [ i ] > lr . value [ i ] ) {
dst . value [ i ] = 1 ;
} else {
dst . value [ i ] = 0 ;
}
}
return dst ;
}
VecType operator > = ( const VecType & lr ) const {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
if ( value [ i ] > = lr . value [ i ] ) {
dst . value [ i ] = 1 ;
} else {
dst . value [ i ] = 0 ;
}
}
return dst ;
}
2021-06-11 17:17:13 +08:00
VecType operator * ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = value [ i ] * lr . value [ i ] ;
}
return dst ;
}
2021-06-11 17:17:13 +08:00
VecType operator * ( T lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = value [ i ] * lr ;
}
return dst ;
}
2022-01-04 10:50:40 +08:00
VecType operator + = ( const VecType & lr ) {
for ( int i = 0 ; i < N ; + + i ) {
value [ i ] = value [ i ] + lr . value [ i ] ;
}
return * this ;
}
VecType operator - = ( const VecType & lr ) {
for ( int i = 0 ; i < N ; + + i ) {
value [ i ] = value [ i ] - lr . value [ i ] ;
}
return * this ;
}
2020-11-05 16:41:56 +08:00
VecType & operator = ( const VecType & lr ) {
for ( int i = 0 ; i < N ; + + i ) {
value [ i ] = lr . value [ i ] ;
}
return * this ;
}
VecType operator - ( ) {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = - value [ i ] ;
}
return dst ;
}
Vec ( ) {
}
Vec ( const T v ) {
for ( int i = 0 ; i < N ; + + i ) {
value [ i ] = v ;
}
}
Vec ( const VecType & lr ) {
for ( int i = 0 ; i < N ; + + i ) {
value [ i ] = lr . value [ i ] ;
}
}
T operator [ ] ( size_t i ) {
return value [ i ] ;
}
2021-06-11 17:17:13 +08:00
template < typename U >
static VecType load ( const U * addr ) {
2020-11-05 16:41:56 +08:00
VecType v ;
for ( int i = 0 ; i < N ; + + i ) {
2021-06-11 17:17:13 +08:00
v . value [ i ] = static_cast < T > ( addr [ i ] ) ;
2020-11-05 16:41:56 +08:00
}
return v ;
}
2022-05-06 19:51:20 +08:00
template < typename U >
static VecType broadcast ( const U * addr ) {
VecType v ;
v . value [ 0 ] = static_cast < T > ( addr [ 0 ] ) ;
for ( int i = 1 ; i < N ; + + i ) {
v . value [ i ] = v . value [ 0 ] ;
}
return v ;
}
2023-06-16 09:42:45 +08:00
template < typename U >
static void save ( U * addr , const VecType & v ) {
2020-11-05 16:41:56 +08:00
for ( int i = 0 ; i < N ; + + i ) {
2023-06-16 09:42:45 +08:00
addr [ i ] = static_cast < U > ( v . value [ i ] ) ;
2020-11-05 16:41:56 +08:00
}
}
static VecType max ( const VecType & v1 , const VecType & v2 ) {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = std : : max ( v1 . value [ i ] , v2 . value [ i ] ) ;
}
return dst ;
}
static VecType min ( const VecType & v1 , const VecType & v2 ) {
VecType dst ;
for ( int i = 0 ; i < N ; + + i ) {
dst . value [ i ] = std : : min ( v1 . value [ i ] , v2 . value [ i ] ) ;
}
return dst ;
}
2022-01-04 10:50:40 +08:00
static VecType fma ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
return v1 + v2 * v3 ;
2021-06-11 17:17:13 +08:00
}
2022-01-04 10:50:40 +08:00
static VecType fms ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
return v1 - v2 * v3 ;
2021-06-11 17:17:13 +08:00
}
2021-09-18 15:52:30 +08:00
static inline void transpose4 ( VecType & vec0 , VecType & vec1 , VecType & vec2 , VecType & vec3 ) {
VecType source [ 4 ] = { vec0 , vec1 , vec2 , vec3 } ;
for ( int i = 0 ; i < N ; + + i ) {
vec0 . value [ i ] = source [ i % 4 ] . value [ i > > 2 ] ;
vec1 . value [ i ] = source [ i % 4 ] . value [ ( i + N ) > > 2 ] ;
vec2 . value [ i ] = source [ i % 4 ] . value [ ( i + 2 * N ) > > 2 ] ;
vec3 . value [ i ] = source [ i % 4 ] . value [ ( i + 3 * N ) > > 2 ] ;
}
}
2020-11-05 16:41:56 +08:00
} ;
# ifdef MNN_USE_NEON
2023-10-18 10:31:02 +08:00
template < >
struct Vec < int32_t , 4 > {
using VecType = Vec < int32_t , 4 > ;
int32x4_t value ;
Vec ( ) {
}
Vec ( const int32_t v ) {
value = vdupq_n_s32 ( v ) ;
}
Vec ( const float v ) {
value = vdupq_n_s32 ( ( int32_t ) v ) ;
}
Vec ( const int32x4_t v ) {
value = v ;
}
Vec ( const VecType & lr ) {
value = lr . value ;
}
Vec ( const VecType & & lr ) {
value = std : : move ( lr . value ) ;
}
float operator [ ] ( size_t i ) {
return value [ i ] ;
}
static VecType load ( const float * addr ) {
2024-04-19 11:58:21 +08:00
VecType v = { ( int32x4_t ) ( vld1q_f32 ( addr ) ) } ;
2023-10-18 10:31:02 +08:00
return v ;
}
static VecType broadcast ( const float * addr ) {
2024-04-19 11:58:21 +08:00
VecType dst = { ( int32x4_t ) ( vld1q_dup_f32 ( addr ) ) } ;
2023-10-18 10:31:02 +08:00
return dst ;
}
static VecType broadcast ( const int32_t * addr ) {
VecType dst = { vld1q_dup_s32 ( addr ) } ;
return dst ;
}
static VecType load ( const int32_t * addr ) {
VecType v = { vld1q_s32 ( addr ) } ;
return v ;
}
static void save ( float * addr , const VecType & v ) {
2024-04-19 11:58:21 +08:00
vst1q_f32 ( addr , ( float32x4_t ) ( v . value ) ) ;
2023-10-18 10:31:02 +08:00
}
static void save ( int32_t * addr , const VecType & v ) {
vst1q_s32 ( addr , v . value ) ;
}
static VecType max ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { vmaxq_s32 ( v1 . value , v2 . value ) } ;
return dst ;
}
static VecType min ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { vminq_s32 ( v1 . value , v2 . value ) } ;
return dst ;
}
static VecType fma ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
VecType dst = { vmlaq_s32 ( v1 . value , v2 . value , v3 . value ) } ;
return dst ;
}
static VecType fms ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
VecType dst = { vmlsq_s32 ( v1 . value , v2 . value , v3 . value ) } ;
return dst ;
}
static inline void transpose4 ( VecType & vec0 , VecType & vec1 , VecType & vec2 , VecType & vec3 ) {
# ifdef __aarch64__
auto m0 = vtrn1q_s32 ( vec0 . value , vec1 . value ) ;
auto m1 = vtrn2q_s32 ( vec0 . value , vec1 . value ) ;
auto m2 = vtrn1q_s32 ( vec2 . value , vec3 . value ) ;
auto m3 = vtrn2q_s32 ( vec2 . value , vec3 . value ) ;
2024-04-19 11:58:21 +08:00
vec0 . value = ( int32x4_t ) ( vtrn1q_s64 ( ( int64x2_t ) ( m0 ) , ( int64x2_t ) ( m2 ) ) ) ;
vec1 . value = ( int32x4_t ) ( vtrn1q_s64 ( ( int64x2_t ) ( m1 ) , ( int64x2_t ) ( m3 ) ) ) ;
vec2 . value = ( int32x4_t ) ( vtrn2q_s64 ( ( int64x2_t ) ( m0 ) , ( int64x2_t ) ( m2 ) ) ) ;
vec3 . value = ( int32x4_t ) ( vtrn2q_s64 ( ( int64x2_t ) ( m1 ) , ( int64x2_t ) ( m3 ) ) ) ;
2023-10-18 10:31:02 +08:00
# else
auto m0m1 = vtrnq_s32 ( vec0 . value , vec1 . value ) ;
auto m2m3 = vtrnq_s32 ( vec2 . value , vec3 . value ) ;
vec0 . value = m0m1 . val [ 0 ] ;
vec1 . value = m0m1 . val [ 1 ] ;
vec2 . value = m2m3 . val [ 0 ] ;
vec3 . value = m2m3 . val [ 1 ] ;
2024-05-11 19:17:02 +08:00
vec0 . value = ( int32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) m2m3 . val [ 0 ] , 0 ) , ( int64x2_t ) vec0 . value , 1 ) ) ;
vec1 . value = ( int32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) m2m3 . val [ 1 ] , 0 ) , ( int64x2_t ) vec1 . value , 1 ) ) ;
vec2 . value = ( int32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) m0m1 . val [ 0 ] , 1 ) , ( int64x2_t ) vec2 . value , 0 ) ) ;
vec3 . value = ( int32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) m0m1 . val [ 1 ] , 1 ) , ( int64x2_t ) vec3 . value , 0 ) ) ;
2023-10-18 10:31:02 +08:00
# endif
}
VecType operator + ( const VecType & lr ) const {
VecType dst = { vaddq_s32 ( value , lr . value ) } ;
return dst ;
}
VecType operator - ( const VecType & lr ) const {
VecType dst = { vsubq_s32 ( value , lr . value ) } ;
return dst ;
}
VecType operator + = ( const VecType & lr ) {
value = vaddq_s32 ( value , lr . value ) ;
return * this ;
}
VecType operator - = ( const VecType & lr ) {
value = vsubq_s32 ( value , lr . value ) ;
return * this ;
}
VecType operator * ( int32_t lr ) const {
VecType dst = { vmulq_n_s32 ( value , lr ) } ;
return dst ;
}
VecType operator * ( float lr ) const {
VecType dst = { vmulq_n_s32 ( value , ( int32_t ) lr ) } ;
return dst ;
}
VecType operator * ( const VecType & lr ) const {
VecType dst = { vmulq_s32 ( value , lr . value ) } ;
return dst ;
}
VecType & operator = ( const VecType & lr ) {
value = lr . value ;
return * this ;
}
VecType & operator = ( const VecType & & lr ) {
value = std : : move ( lr . value ) ;
return * this ;
}
VecType operator - ( ) {
VecType dst = { vnegq_s32 ( value ) } ;
return dst ;
}
VecType operator < ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcltq_s32 ( value , lr . value ) ;
VecType dst = { vbslq_s32 ( res , one , zero ) } ;
return dst ;
}
VecType operator > ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcgtq_s32 ( value , lr . value ) ;
VecType dst = { vbslq_s32 ( res , one , zero ) } ;
return dst ;
}
VecType operator < = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcleq_s32 ( value , lr . value ) ;
VecType dst = { vbslq_s32 ( res , one , zero ) } ;
return dst ;
}
VecType operator > = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcgeq_s32 ( value , lr . value ) ;
VecType dst = { vbslq_s32 ( res , one , zero ) } ;
return dst ;
}
VecType operator = = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vceqq_s32 ( value , lr . value ) ;
VecType dst = { vbslq_s32 ( res , one , zero ) } ;
return dst ;
}
} ;
2020-11-05 16:41:56 +08:00
template < >
struct Vec < float , 4 > {
using VecType = Vec < float , 4 > ;
2023-10-18 10:31:02 +08:00
using VecTypeInt32 = Vec < int32_t , 4 > ;
2020-11-05 16:41:56 +08:00
float32x4_t value ;
Vec ( ) {
}
Vec ( const float v ) {
value = vdupq_n_f32 ( v ) ;
}
Vec ( const float32x4_t v ) {
value = v ;
}
Vec ( const VecType & lr ) {
value = lr . value ;
}
Vec ( const VecType & & lr ) {
value = std : : move ( lr . value ) ;
}
float operator [ ] ( size_t i ) {
return value [ i ] ;
}
static VecType load ( const float * addr ) {
VecType v = { vld1q_f32 ( addr ) } ;
return v ;
}
2022-05-06 19:51:20 +08:00
static VecType broadcast ( const float * addr ) {
VecType dst = { vld1q_dup_f32 ( addr ) } ;
return dst ;
}
2021-06-11 17:17:13 +08:00
static VecType load ( const int32_t * addr ) {
VecType v = { vcvtq_f32_s32 ( vld1q_s32 ( addr ) ) } ;
return v ;
}
2020-11-05 16:41:56 +08:00
static void save ( float * addr , const VecType & v ) {
vst1q_f32 ( addr , v . value ) ;
}
2023-10-18 10:31:02 +08:00
static void save ( float * addr , const VecTypeInt32 & v ) {
2024-04-19 11:58:21 +08:00
vst1q_f32 ( addr , ( float32x4_t ) ( v . value ) ) ;
2023-10-18 10:31:02 +08:00
}
2023-10-19 10:12:55 +08:00
static void save ( int32_t * addr , const VecType & v ) {
2024-04-19 11:58:21 +08:00
vst1q_s32 ( addr , ( int32x4_t ) ( v . value ) ) ;
2023-10-19 10:12:55 +08:00
}
2020-11-05 16:41:56 +08:00
static VecType max ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { vmaxq_f32 ( v1 . value , v2 . value ) } ;
return dst ;
}
static VecType min ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { vminq_f32 ( v1 . value , v2 . value ) } ;
return dst ;
}
2022-01-04 10:50:40 +08:00
static VecType fma ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
VecType dst = { vmlaq_f32 ( v1 . value , v2 . value , v3 . value ) } ;
return dst ;
2021-06-11 17:17:13 +08:00
}
2022-01-04 10:50:40 +08:00
static VecType fms ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
VecType dst = { vmlsq_f32 ( v1 . value , v2 . value , v3 . value ) } ;
return dst ;
2021-06-11 17:17:13 +08:00
}
2021-09-18 15:52:30 +08:00
static inline void transpose4 ( VecType & vec0 , VecType & vec1 , VecType & vec2 , VecType & vec3 ) {
# ifdef __aarch64__
2024-04-19 11:58:21 +08:00
auto m0 = vtrn1q_s32 ( ( int32x4_t ) ( vec0 . value ) , ( int32x4_t ) ( vec1 . value ) ) ;
auto m1 = vtrn2q_s32 ( ( int32x4_t ) ( vec0 . value ) , ( int32x4_t ) ( vec1 . value ) ) ;
auto m2 = vtrn1q_s32 ( ( int32x4_t ) ( vec2 . value ) , ( int32x4_t ) ( vec3 . value ) ) ;
auto m3 = vtrn2q_s32 ( ( int32x4_t ) ( vec2 . value ) , ( int32x4_t ) ( vec3 . value ) ) ;
vec0 . value = ( float32x4_t ) ( vtrn1q_s64 ( ( int64x2_t ) ( m0 ) , ( int64x2_t ) ( m2 ) ) ) ;
vec1 . value = ( float32x4_t ) ( vtrn1q_s64 ( ( int64x2_t ) ( m1 ) , ( int64x2_t ) ( m3 ) ) ) ;
vec2 . value = ( float32x4_t ) ( vtrn2q_s64 ( ( int64x2_t ) ( m0 ) , ( int64x2_t ) ( m2 ) ) ) ;
vec3 . value = ( float32x4_t ) ( vtrn2q_s64 ( ( int64x2_t ) ( m1 ) , ( int64x2_t ) ( m3 ) ) ) ;
2021-09-18 15:52:30 +08:00
# else
2024-04-19 11:58:21 +08:00
auto m0m1 = vtrnq_s32 ( ( int32x4_t ) ( vec0 . value ) , ( int32x4_t ) ( vec1 . value ) ) ;
auto m2m3 = vtrnq_s32 ( ( int32x4_t ) ( vec2 . value ) , ( int32x4_t ) ( vec3 . value ) ) ;
vec0 . value = ( float32x4_t ) ( m0m1 . val [ 0 ] ) ;
vec1 . value = ( float32x4_t ) ( m0m1 . val [ 1 ] ) ;
vec2 . value = ( float32x4_t ) ( m2m3 . val [ 0 ] ) ;
vec3 . value = ( float32x4_t ) ( m2m3 . val [ 1 ] ) ;
vec0 . value = ( float32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) ( m2m3 . val [ 0 ] ) , 0 ) , ( int64x2_t ) ( vec0 . value ) , 1 ) ) ;
vec1 . value = ( float32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) ( m2m3 . val [ 1 ] ) , 0 ) , ( int64x2_t ) ( vec1 . value ) , 1 ) ) ;
vec2 . value = ( float32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) ( m0m1 . val [ 0 ] ) , 1 ) , ( int64x2_t ) ( vec2 . value ) , 0 ) ) ;
vec3 . value = ( float32x4_t ) ( vsetq_lane_s64 ( vgetq_lane_s64 ( ( int64x2_t ) ( m0m1 . val [ 1 ] ) , 1 ) , ( int64x2_t ) ( vec3 . value ) , 0 ) ) ;
2021-09-18 15:52:30 +08:00
/*
generated arm32 assembly code is almost the same as :
vtrn .32 d0 , d2
vtrn .32 d1 , d3
vtrn .32 d4 , d6
vtrn .32 d5 , d7
vswp d1 , d4
vswp d3 , d6
*/
# endif
}
2021-11-30 10:10:53 +08:00
VecType operator + ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { vaddq_f32 ( value , lr . value ) } ;
return dst ;
}
2021-11-30 10:10:53 +08:00
VecType operator - ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { vsubq_f32 ( value , lr . value ) } ;
return dst ;
}
2022-01-04 10:50:40 +08:00
VecType operator + = ( const VecType & lr ) {
value = vaddq_f32 ( value , lr . value ) ;
return * this ;
}
VecType operator - = ( const VecType & lr ) {
value = vsubq_f32 ( value , lr . value ) ;
return * this ;
}
2021-06-11 17:17:13 +08:00
VecType operator * ( float lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { vmulq_n_f32 ( value , lr ) } ;
return dst ;
}
2021-06-11 17:17:13 +08:00
VecType operator * ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { vmulq_f32 ( value , lr . value ) } ;
return dst ;
}
VecType & operator = ( const VecType & lr ) {
value = lr . value ;
return * this ;
}
VecType & operator = ( const VecType & & lr ) {
value = std : : move ( lr . value ) ;
return * this ;
}
VecType operator - ( ) {
VecType dst = { vnegq_f32 ( value ) } ;
return dst ;
}
2023-10-18 10:31:02 +08:00
VecType operator < ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcltq_f32 ( value , lr . value ) ;
2024-04-19 11:58:21 +08:00
VecType dst = { ( float32x4_t ) ( vbslq_s32 ( res , one , zero ) ) } ;
2023-10-18 10:31:02 +08:00
return dst ;
}
VecType operator > ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcgtq_f32 ( value , lr . value ) ;
2024-04-19 11:58:21 +08:00
VecType dst = { ( float32x4_t ) ( vbslq_s32 ( res , one , zero ) ) } ;
2023-10-18 10:31:02 +08:00
return dst ;
}
VecType operator < = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcleq_f32 ( value , lr . value ) ;
2024-04-19 11:58:21 +08:00
VecType dst = { ( float32x4_t ) ( vbslq_s32 ( res , one , zero ) ) } ;
2023-10-18 10:31:02 +08:00
return dst ;
}
VecType operator > = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vcgeq_f32 ( value , lr . value ) ;
2024-04-19 11:58:21 +08:00
VecType dst = { ( float32x4_t ) ( vbslq_s32 ( res , one , zero ) ) } ;
2023-10-18 10:31:02 +08:00
return dst ;
}
VecType operator = = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
int32x4_t one = vdupq_n_s32 ( 1 ) ;
int32x4_t zero = vdupq_n_s32 ( 0 ) ;
2023-10-18 10:31:02 +08:00
uint32x4_t res = vceqq_f32 ( value , lr . value ) ;
2024-04-19 11:58:21 +08:00
VecType dst = { ( float32x4_t ) ( vbslq_s32 ( res , one , zero ) ) } ;
2023-10-18 10:31:02 +08:00
return dst ;
}
2020-11-05 16:41:56 +08:00
} ;
# elif defined(MNN_USE_SSE)
2023-10-18 10:31:02 +08:00
template < >
struct Vec < int32_t , 4 > {
using VecType = Vec < int32_t , 4 > ;
using VecTypeArray = std : : array < VecType , 4 > ;
__m128i value ;
VecType operator + ( const VecType & lr ) const {
VecType dst = { _mm_add_epi32 ( value , lr . value ) } ;
return dst ;
}
VecType operator - ( const VecType & lr ) const {
VecType dst = { _mm_sub_epi32 ( value , lr . value ) } ;
return dst ;
}
VecType operator + = ( const VecType & lr ) {
value = _mm_add_epi32 ( value , lr . value ) ;
return * this ;
}
VecType operator - = ( const VecType & lr ) {
value = _mm_sub_epi32 ( value , lr . value ) ;
return * this ;
}
VecType operator * ( const VecType & lr ) const {
VecType dst = { _mm_cvtps_epi32 ( _mm_mul_ps ( _mm_cvtepi32_ps ( value ) , _mm_cvtepi32_ps ( lr . value ) ) ) } ;
return dst ;
}
VecType & operator = ( const VecType & lr ) {
value = lr . value ;
return * this ;
}
VecType operator = = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmpeq_epi32 ( value , lr . value ) ;
VecType dst = { _mm_and_si128 ( one , mask ) } ;
return dst ;
}
VecType operator < ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmplt_epi32 ( value , lr . value ) ;
VecType dst = { _mm_and_si128 ( one , mask ) } ;
return dst ;
}
VecType operator < = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmpgt_epi32 ( value , lr . value ) ;
VecType dst = { _mm_andnot_si128 ( mask , one ) } ;
return dst ;
}
VecType operator > ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmpgt_epi32 ( value , lr . value ) ;
VecType dst = { _mm_and_si128 ( one , mask ) } ;
return dst ;
}
VecType operator > = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmplt_epi32 ( value , lr . value ) ;
VecType dst = { _mm_andnot_si128 ( mask , one ) } ;
return dst ;
}
VecType operator - ( ) {
VecType dst ;
# if defined(_MSC_VER)
dst . value = _mm_cvtps_epi32 ( _mm_xor_ps ( _mm_cvtepi32_ps ( value ) , _mm_set1_ps ( - 0.f ) ) ) ; // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC.
# else
dst . value = - value ;
# endif
return dst ;
}
Vec ( ) {
}
Vec ( const float v ) {
int u = static_cast < int32_t > ( v ) ;
value = _mm_set_epi32 ( u , u , u , u ) ;
}
Vec ( const int32_t v ) {
value = _mm_set_epi32 ( v , v , v , v ) ;
}
Vec ( __m128i & & v ) {
value = v ;
}
Vec ( __m128 & & v ) {
value = _mm_castps_si128 ( v ) ;
}
Vec ( const VecType & lr ) {
value = lr . value ;
}
float operator [ ] ( size_t i ) {
# if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension.
int32_t temp [ 4 ] ;
_mm_storeu_si128 ( ( __m128i * ) temp , value ) ;
return temp [ i ] ;
# else
return value [ i ] ;
# endif
}
static VecType load ( const int32_t * addr ) {
VecType v = { _mm_loadu_si128 ( ( __m128i const * ) ( addr ) ) } ;
return v ;
}
static VecType broadcast ( const int32_t * addr ) {
int32_t arr [ 4 ] = { * addr , 0 , 0 , 0 } ;
VecType dst = { _mm_loadu_si128 ( ( __m128i const * ) ( arr ) ) } ;
return dst ;
}
static void save ( int32_t * addr , const VecType & v ) {
_mm_storeu_si128 ( ( __m128i * ) addr , v . value ) ;
}
static VecType max ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { _mm_cvtps_epi32 ( _mm_max_ps ( _mm_cvtepi32_ps ( v1 . value ) , _mm_cvtepi32_ps ( v2 . value ) ) ) } ;
return dst ;
}
static VecType min ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { _mm_cvtps_epi32 ( _mm_min_ps ( _mm_cvtepi32_ps ( v1 . value ) , _mm_cvtepi32_ps ( v2 . value ) ) ) } ;
return dst ;
}
static VecType fma ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
return v1 + v2 * v3 ; // TODO: use fma instruction
}
static VecType fms ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
return v1 - v2 * v3 ; // TODO: use fma instruction
}
static inline void transpose4 ( VecType & vec0 , VecType & vec1 , VecType & vec2 , VecType & vec3 ) {
__m128 tmp3 , tmp2 , tmp1 , tmp0 ;
tmp0 = _mm_unpacklo_ps ( _mm_castsi128_ps ( vec0 . value ) , _mm_castsi128_ps ( vec1 . value ) ) ;
tmp2 = _mm_unpacklo_ps ( _mm_castsi128_ps ( vec2 . value ) , _mm_castsi128_ps ( vec3 . value ) ) ;
tmp1 = _mm_unpackhi_ps ( _mm_castsi128_ps ( vec0 . value ) , _mm_castsi128_ps ( vec1 . value ) ) ;
tmp3 = _mm_unpackhi_ps ( _mm_castsi128_ps ( vec2 . value ) , _mm_castsi128_ps ( vec3 . value ) ) ;
vec0 . value = _mm_castps_si128 ( _mm_movelh_ps ( tmp0 , tmp2 ) ) ;
vec1 . value = _mm_castps_si128 ( _mm_movehl_ps ( tmp2 , tmp0 ) ) ;
vec2 . value = _mm_castps_si128 ( _mm_movelh_ps ( tmp1 , tmp3 ) ) ;
vec3 . value = _mm_castps_si128 ( _mm_movehl_ps ( tmp3 , tmp1 ) ) ;
}
} ;
2020-11-05 16:41:56 +08:00
template < >
struct Vec < float , 4 > {
using VecType = Vec < float , 4 > ;
2023-10-18 10:31:02 +08:00
using VecTypeInt32 = Vec < int32_t , 4 > ;
2023-06-16 09:42:45 +08:00
using VecTypeArray = std : : array < VecType , 4 > ;
2020-11-05 16:41:56 +08:00
__m128 value ;
2021-11-30 10:10:53 +08:00
VecType operator + ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { _mm_add_ps ( value , lr . value ) } ;
return dst ;
}
2021-11-30 10:10:53 +08:00
VecType operator - ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { _mm_sub_ps ( value , lr . value ) } ;
return dst ;
}
2022-01-04 10:50:40 +08:00
VecType operator + = ( const VecType & lr ) {
value = _mm_add_ps ( value , lr . value ) ;
return * this ;
}
VecType operator - = ( const VecType & lr ) {
value = _mm_sub_ps ( value , lr . value ) ;
return * this ;
}
2021-06-11 17:17:13 +08:00
VecType operator * ( const VecType & lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { _mm_mul_ps ( value , lr . value ) } ;
return dst ;
}
2021-06-11 17:17:13 +08:00
VecType operator * ( float lr ) const {
2020-11-05 16:41:56 +08:00
VecType dst = { _mm_mul_ps ( value , _mm_set1_ps ( lr ) ) } ;
return dst ;
}
VecType & operator = ( const VecType & lr ) {
value = lr . value ;
return * this ;
}
VecType operator - ( ) {
VecType dst ;
# if defined(_MSC_VER)
dst . value = _mm_xor_ps ( value , _mm_set1_ps ( - 0.f ) ) ; // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC.
# else
dst . value = - value ;
# endif
return dst ;
}
2023-10-18 10:31:02 +08:00
VecType operator = = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmpeq_epi32 ( _mm_castps_si128 ( value ) , _mm_castps_si128 ( lr . value ) ) ;
VecType dst = { _mm_castsi128_ps ( _mm_and_si128 ( one , mask ) ) } ;
return dst ;
}
VecType operator < ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128i mask = _mm_cmplt_epi32 ( _mm_castps_si128 ( value ) , _mm_castps_si128 ( lr . value ) ) ;
VecType dst = { _mm_castsi128_ps ( _mm_and_si128 ( one , mask ) ) } ;
return dst ;
}
VecType operator < = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128 mask = _mm_cmple_ps ( value , lr . value ) ;
VecType dst = { _mm_castsi128_ps ( _mm_and_si128 ( one , _mm_castps_si128 ( mask ) ) ) } ;
return dst ;
}
VecType operator > ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128 mask = _mm_cmpgt_ps ( value , lr . value ) ;
VecType dst = { _mm_castsi128_ps ( _mm_and_si128 ( one , _mm_castps_si128 ( mask ) ) ) } ;
return dst ;
}
VecType operator > = ( const VecType & lr ) const {
2023-12-04 11:12:20 +08:00
__m128i one = _mm_set1_epi32 ( 1 ) ;
2023-10-18 10:31:02 +08:00
__m128 mask = _mm_cmpge_ps ( value , lr . value ) ;
VecType dst = { _mm_castsi128_ps ( _mm_and_si128 ( one , _mm_castps_si128 ( mask ) ) ) } ;
return dst ;
}
2020-11-05 16:41:56 +08:00
Vec ( ) {
}
Vec ( const float v ) {
value = _mm_set1_ps ( v ) ;
}
Vec ( __m128 & & v ) {
value = v ;
}
Vec ( const VecType & lr ) {
value = lr . value ;
}
float operator [ ] ( size_t i ) {
# if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension.
float temp [ 4 ] ;
_mm_storeu_ps ( temp , value ) ;
return temp [ i ] ;
# else
return value [ i ] ;
# endif
}
static VecType load ( const float * addr ) {
VecType v = { _mm_loadu_ps ( addr ) } ;
return v ;
}
2022-05-06 19:51:20 +08:00
static VecType broadcast ( const float * addr ) {
VecType dst = { _mm_load_ss ( addr ) } ;
return dst ;
}
2020-11-05 16:41:56 +08:00
static void save ( float * addr , const VecType & v ) {
_mm_storeu_ps ( addr , v . value ) ;
}
2023-10-18 10:31:02 +08:00
static void save ( float * addr , const VecTypeInt32 & v ) {
_mm_storeu_ps ( addr , _mm_castsi128_ps ( v . value ) ) ;
}
2023-10-19 10:12:55 +08:00
static void save ( int32_t * addr , const VecType & v ) {
_mm_storeu_si128 ( ( __m128i * ) addr , _mm_castps_si128 ( v . value ) ) ;
}
2020-11-05 16:41:56 +08:00
static VecType max ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { _mm_max_ps ( v1 . value , v2 . value ) } ;
return dst ;
}
static VecType min ( const VecType & v1 , const VecType & v2 ) {
VecType dst = { _mm_min_ps ( v1 . value , v2 . value ) } ;
return dst ;
}
2022-01-04 10:50:40 +08:00
static VecType fma ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
return v1 + v2 * v3 ; // TODO: use fma instruction
2021-06-11 17:17:13 +08:00
}
2022-01-04 10:50:40 +08:00
static VecType fms ( const VecType & v1 , const VecType & v2 , const VecType & v3 ) {
return v1 - v2 * v3 ; // TODO: use fma instruction
2021-06-11 17:17:13 +08:00
}
2021-09-18 15:52:30 +08:00
static inline void transpose4 ( VecType & vec0 , VecType & vec1 , VecType & vec2 , VecType & vec3 ) {
__m128 tmp3 , tmp2 , tmp1 , tmp0 ;
tmp0 = _mm_unpacklo_ps ( ( vec0 . value ) , ( vec1 . value ) ) ;
tmp2 = _mm_unpacklo_ps ( ( vec2 . value ) , ( vec3 . value ) ) ;
tmp1 = _mm_unpackhi_ps ( ( vec0 . value ) , ( vec1 . value ) ) ;
tmp3 = _mm_unpackhi_ps ( ( vec2 . value ) , ( vec3 . value ) ) ;
vec0 . value = _mm_movelh_ps ( tmp0 , tmp2 ) ;
vec1 . value = _mm_movehl_ps ( tmp2 , tmp0 ) ;
vec2 . value = _mm_movelh_ps ( tmp1 , tmp3 ) ;
vec3 . value = _mm_movehl_ps ( tmp3 , tmp1 ) ;
}
2020-11-05 16:41:56 +08:00
} ;
2021-04-08 15:34:23 +08:00
2020-11-05 16:41:56 +08:00
# endif
} // namespace Math
} // namespace MNN
# endif /* Vec_hpp */