2021-04-08 15:34:23 +08:00
//
// WinogradOptFunctionHalf.cpp
// MNN
//
// Created by MNN on 2021/03/12.
// Copyright © 2018, Alibaba Group Holding Limited
//
# include "WinogradOptFunctionHalf.hpp"
# include <cstring>
# include <memory>
2021-09-18 15:52:30 +08:00
# include <map>
2021-04-08 15:34:23 +08:00
# include "core/Macro.h"
# include "VecHalf.hpp"
using BFVec4 = MNN : : Math : : VecHalf < 4 > ;
2021-09-18 15:52:30 +08:00
using VecType = BFVec4 ;
using ElementType = int16_t ;
2022-05-06 19:51:20 +08:00
/* CAUTION:
bf16 8 x8 winograd would lead to larger error for some kinds of models .
uncomment the following code only if you are sure accuracy is enough for your model .
*/
// #define USE_8x8_WINOGRAD_KERNEL
2021-09-18 15:52:30 +08:00
// to be optimized into VecType::transpose12
# define TRANSPOSE_12X4_SAVE() \
VecType s0 = VecType : : load ( srcPtr + 0 * packCUnit ) ; \
VecType s3 = VecType : : load ( srcPtr + 1 * packCUnit ) ; \
VecType s6 = VecType : : load ( srcPtr + 2 * packCUnit ) ; \
VecType s9 = VecType : : load ( srcPtr + 3 * packCUnit ) ; \
VecType s1 = VecType : : load ( srcPtr + 4 * packCUnit ) ; \
VecType s4 = VecType : : load ( srcPtr + 5 * packCUnit ) ; \
VecType s7 = VecType : : load ( srcPtr + 6 * packCUnit ) ; \
VecType s10 = VecType : : load ( srcPtr + 7 * packCUnit ) ; \
VecType s2 = VecType : : load ( srcPtr + 8 * packCUnit ) ; \
VecType s5 = VecType : : load ( srcPtr + 9 * packCUnit ) ; \
VecType s8 = VecType : : load ( srcPtr + 10 * packCUnit ) ; \
VecType s11 = VecType : : load ( srcPtr + 11 * packCUnit ) ; \
VecType : : transpose4 ( s0 , s3 , s6 , s9 ) ; \
VecType : : transpose4 ( s1 , s4 , s7 , s10 ) ; \
VecType : : transpose4 ( s2 , s5 , s8 , s11 ) ; \
VecType : : save ( srcPtr + 0 * packCUnit , s0 ) ; \
VecType : : save ( srcPtr + 1 * packCUnit , s1 ) ; \
VecType : : save ( srcPtr + 2 * packCUnit , s2 ) ; \
VecType : : save ( srcPtr + 3 * packCUnit , s3 ) ; \
VecType : : save ( srcPtr + 4 * packCUnit , s4 ) ; \
VecType : : save ( srcPtr + 5 * packCUnit , s5 ) ; \
VecType : : save ( srcPtr + 6 * packCUnit , s6 ) ; \
VecType : : save ( srcPtr + 7 * packCUnit , s7 ) ; \
VecType : : save ( srcPtr + 8 * packCUnit , s8 ) ; \
VecType : : save ( srcPtr + 9 * packCUnit , s9 ) ; \
VecType : : save ( srcPtr + 10 * packCUnit , s10 ) ; \
VecType : : save ( srcPtr + 11 * packCUnit , s11 ) ;
2021-04-08 15:34:23 +08:00
namespace MNN {
2021-09-18 15:52:30 +08:00
static void _sourceTransformUnit4x4Pack12 ( ElementType * srcBlock , ElementType * dstStart , size_t dstStep ) {
// register number: (srcUnit + 1) * EPack/packCUnit
constexpr int Nh = 4 ; // srcUnit
constexpr int ePack = 12 ;
constexpr size_t packCUnit = 4 ;
const size_t loadTransposeStride = packCUnit * ePack ;
ElementType * srcPtr = srcBlock ;
for ( int iNh = 0 ; iNh < Nh ; + + iNh )
{
// register number : ePack
// TRANSPOSE_12X4_SAVE();
VecType : : transpose12 ( srcPtr , packCUnit ) ;
srcPtr + = loadTransposeStride ;
}
srcPtr = srcBlock ;
ElementType * dstPtr = dstStart ;
for ( int i4c = 0 ; i4c < packCUnit ; + + i4c )
{
// source transform D * B. register number : srcUnit * (EPack/4 + 1)
VecType s00 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 0 * packCUnit ) ;
VecType s01 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 1 * packCUnit ) ;
VecType s02 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 2 * packCUnit ) ;
VecType s10 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 0 * packCUnit ) ;
VecType s11 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 1 * packCUnit ) ;
VecType s12 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 2 * packCUnit ) ;
VecType s20 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 0 * packCUnit ) ;
VecType s21 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 1 * packCUnit ) ;
VecType s22 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 2 * packCUnit ) ;
VecType s30 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 0 * packCUnit ) ;
VecType s31 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 1 * packCUnit ) ;
VecType s32 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 2 * packCUnit ) ;
// dstStep = ePack * pack * ic_4
auto ep0 = s00 - s20 ;
auto ep1 = s01 - s21 ;
auto ep2 = s02 - s22 ;
VecType : : save ( dstPtr + 0 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 0 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 0 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s10 + s20 ;
ep1 = s11 + s21 ;
ep2 = s12 + s22 ;
VecType : : save ( dstPtr + 1 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s20 - s10 ;
ep1 = s21 - s11 ;
ep2 = s22 - s12 ;
VecType : : save ( dstPtr + 2 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s30 - s10 ;
ep1 = s31 - s11 ;
ep2 = s32 - s12 ;
VecType : : save ( dstPtr + 3 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 3 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 3 * dstStep + 2 * packCUnit , ep2 ) ;
// VecType::save(dstPtr + 0 * dstStep + 0 * packCUnit, s00);
// VecType::save(dstPtr + 0 * dstStep + 1 * packCUnit, s01);
// VecType::save(dstPtr + 0 * dstStep + 2 * packCUnit, s02);
// VecType::save(dstPtr + 1 * dstStep + 0 * packCUnit, s10);
// VecType::save(dstPtr + 1 * dstStep + 1 * packCUnit, s11);
// VecType::save(dstPtr + 1 * dstStep + 2 * packCUnit, s12);
// VecType::save(dstPtr + 2 * dstStep + 0 * packCUnit, s20);
// VecType::save(dstPtr + 2 * dstStep + 1 * packCUnit, s21);
// VecType::save(dstPtr + 2 * dstStep + 2 * packCUnit, s22);
// VecType::save(dstPtr + 3 * dstStep + 0 * packCUnit, s30);
// VecType::save(dstPtr + 3 * dstStep + 1 * packCUnit, s31);
// VecType::save(dstPtr + 3 * dstStep + 2 * packCUnit, s32);
// MNN_PRINT("\nwinograd in BT*D*B, iNh:0-3, i4c:%d\n", i4c);
// formatMatrix(dstPtr + 0 * dstStep , {ePack});
// formatMatrix(dstPtr + 1 * dstStep , {ePack});
// formatMatrix(dstPtr + 2 * dstStep , {ePack});
// formatMatrix(dstPtr + 3 * dstStep , {ePack});
srcPtr + = ePack ;
dstPtr + = ePack ;
}
}
static void _sourceTransformUnit8x8Pack12 ( ElementType * srcBlock , ElementType * dstStart , size_t dstStep ) {
// source transform D * B. register number : (srcUnit + 1) * EPack/packCUnit = 27
// todo: impliment
constexpr int Nh = 8 ; // srcUnit
constexpr int ePack = 12 ;
constexpr size_t packCUnit = 4 ;
const size_t loadTransposeStride = packCUnit * ePack ;
ElementType * srcPtr = srcBlock ;
for ( int iNh = 0 ; iNh < Nh ; + + iNh )
{
// register number : ePack
VecType : : transpose12 ( srcPtr , packCUnit ) ;
srcPtr + = loadTransposeStride ;
}
srcPtr = srcBlock ;
ElementType * dstPtr = dstStart ;
for ( int i4c = 0 ; i4c < packCUnit ; + + i4c )
{
VecType s00 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 0 * packCUnit ) ;
VecType s01 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 1 * packCUnit ) ;
VecType s02 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 2 * packCUnit ) ;
VecType s10 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 0 * packCUnit ) ;
VecType s11 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 1 * packCUnit ) ;
VecType s12 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 2 * packCUnit ) ;
VecType s20 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 0 * packCUnit ) ;
VecType s21 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 1 * packCUnit ) ;
VecType s22 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 2 * packCUnit ) ;
VecType s30 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 0 * packCUnit ) ;
VecType s31 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 1 * packCUnit ) ;
VecType s32 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 2 * packCUnit ) ;
VecType s40 = VecType : : load ( srcPtr + 4 * loadTransposeStride + 0 * packCUnit ) ;
VecType s41 = VecType : : load ( srcPtr + 4 * loadTransposeStride + 1 * packCUnit ) ;
VecType s42 = VecType : : load ( srcPtr + 4 * loadTransposeStride + 2 * packCUnit ) ;
VecType s50 = VecType : : load ( srcPtr + 5 * loadTransposeStride + 0 * packCUnit ) ;
VecType s51 = VecType : : load ( srcPtr + 5 * loadTransposeStride + 1 * packCUnit ) ;
VecType s52 = VecType : : load ( srcPtr + 5 * loadTransposeStride + 2 * packCUnit ) ;
VecType s60 = VecType : : load ( srcPtr + 6 * loadTransposeStride + 0 * packCUnit ) ;
VecType s61 = VecType : : load ( srcPtr + 6 * loadTransposeStride + 1 * packCUnit ) ;
VecType s62 = VecType : : load ( srcPtr + 6 * loadTransposeStride + 2 * packCUnit ) ;
VecType s70 = VecType : : load ( srcPtr + 7 * loadTransposeStride + 0 * packCUnit ) ;
VecType s71 = VecType : : load ( srcPtr + 7 * loadTransposeStride + 1 * packCUnit ) ;
VecType s72 = VecType : : load ( srcPtr + 7 * loadTransposeStride + 2 * packCUnit ) ;
// to-try: reorder complicated commpute of 8x8
auto ep0 = s00 * 36.f - s20 * 49.f + s40 * 14.f - s60 ;
auto ep1 = s01 * 36.f - s21 * 49.f + s41 * 14.f - s61 ;
auto ep2 = s02 * 36.f - s22 * 49.f + s42 * 14.f - s62 ;
VecType : : save ( dstPtr + 0 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 0 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 0 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = ( s10 + s20 ) * 36.f - ( s30 + s40 ) * 13.f + ( s50 + s60 ) ;
ep1 = ( s11 + s21 ) * 36.f - ( s31 + s41 ) * 13.f + ( s51 + s61 ) ;
ep2 = ( s12 + s22 ) * 36.f - ( s32 + s42 ) * 13.f + ( s52 + s62 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = ( s20 - s10 ) * 36.f + ( s30 - s40 ) * 13.f + ( s60 - s50 ) ;
ep1 = ( s21 - s11 ) * 36.f + ( s31 - s41 ) * 13.f + ( s61 - s51 ) ;
ep2 = ( s22 - s12 ) * 36.f + ( s32 - s42 ) * 13.f + ( s62 - s52 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s10 * 18.f + s20 * 9.f - s30 * 20.f - s40 * 10.f + s50 * 2.f + s60 ;
ep1 = s11 * 18.f + s21 * 9.f - s31 * 20.f - s41 * 10.f + s51 * 2.f + s61 ;
ep2 = s12 * 18.f + s22 * 9.f - s32 * 20.f - s42 * 10.f + s52 * 2.f + s62 ;
VecType : : save ( dstPtr + 3 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 3 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 3 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s20 * 9.f - s10 * 18.f + s30 * 20.f - s40 * 10.f - s50 * 2.f + s60 ;
ep1 = s21 * 9.f - s11 * 18.f + s31 * 20.f - s41 * 10.f - s51 * 2.f + s61 ;
ep2 = s22 * 9.f - s12 * 18.f + s32 * 20.f - s42 * 10.f - s52 * 2.f + s62 ;
VecType : : save ( dstPtr + 4 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 4 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 4 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s10 * 12.f + s20 * 4.f - s30 * 15.f - s40 * 5.f + s50 * 3.f + s60 ;
ep1 = s11 * 12.f + s21 * 4.f - s31 * 15.f - s41 * 5.f + s51 * 3.f + s61 ;
ep2 = s12 * 12.f + s22 * 4.f - s32 * 15.f - s42 * 5.f + s52 * 3.f + s62 ;
VecType : : save ( dstPtr + 5 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 5 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 5 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s20 * 4.f - s10 * 12.f + s30 * 15.f - s40 * 5.f - s50 * 3.f + s60 ;
ep1 = s21 * 4.f - s11 * 12.f + s31 * 15.f - s41 * 5.f - s51 * 3.f + s61 ;
ep2 = s22 * 4.f - s12 * 12.f + s32 * 15.f - s42 * 5.f - s52 * 3.f + s62 ;
VecType : : save ( dstPtr + 6 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 6 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 6 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s30 * 49.f - s10 * 36.f - s50 * 14.f + s70 ;
ep1 = s31 * 49.f - s11 * 36.f - s51 * 14.f + s71 ;
ep2 = s32 * 49.f - s12 * 36.f - s52 * 14.f + s72 ;
VecType : : save ( dstPtr + 7 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 7 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 7 * dstStep + 2 * packCUnit , ep2 ) ;
srcPtr + = ePack ;
dstPtr + = ePack ;
}
}
static void _sourceTransformUnit6x6Pack12 ( ElementType * srcBlock , ElementType * dstStart , size_t dstStep ) {
// source transform D * B. register number : (srcUnit + 1) * EPack/packCUnit
constexpr int Nh = 6 ; // srcUnit
constexpr int ePack = 12 ;
constexpr size_t packCUnit = 4 ;
const size_t loadTransposeStride = packCUnit * ePack ;
ElementType * srcPtr = srcBlock ;
for ( int iNh = 0 ; iNh < Nh ; + + iNh )
{
// register number : ePack
VecType : : transpose12 ( srcPtr , packCUnit ) ;
srcPtr + = loadTransposeStride ;
}
srcPtr = srcBlock ;
ElementType * dstPtr = dstStart ;
for ( int i4c = 0 ; i4c < packCUnit ; + + i4c )
{
VecType s00 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 0 * packCUnit ) ;
VecType s01 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 1 * packCUnit ) ;
VecType s02 = VecType : : load ( srcPtr + 0 * loadTransposeStride + 2 * packCUnit ) ;
VecType s10 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 0 * packCUnit ) ;
VecType s11 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 1 * packCUnit ) ;
VecType s12 = VecType : : load ( srcPtr + 1 * loadTransposeStride + 2 * packCUnit ) ;
VecType s20 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 0 * packCUnit ) ;
VecType s21 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 1 * packCUnit ) ;
VecType s22 = VecType : : load ( srcPtr + 2 * loadTransposeStride + 2 * packCUnit ) ;
VecType s30 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 0 * packCUnit ) ;
VecType s31 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 1 * packCUnit ) ;
VecType s32 = VecType : : load ( srcPtr + 3 * loadTransposeStride + 2 * packCUnit ) ;
VecType s40 = VecType : : load ( srcPtr + 4 * loadTransposeStride + 0 * packCUnit ) ;
VecType s41 = VecType : : load ( srcPtr + 4 * loadTransposeStride + 1 * packCUnit ) ;
VecType s42 = VecType : : load ( srcPtr + 4 * loadTransposeStride + 2 * packCUnit ) ;
VecType s50 = VecType : : load ( srcPtr + 5 * loadTransposeStride + 0 * packCUnit ) ;
VecType s51 = VecType : : load ( srcPtr + 5 * loadTransposeStride + 1 * packCUnit ) ;
VecType s52 = VecType : : load ( srcPtr + 5 * loadTransposeStride + 2 * packCUnit ) ;
// to-try: reorder
auto ep0 = s00 * 4.f - s20 * 5.f + s40 ;
auto ep1 = s01 * 4.f - s21 * 5.f + s41 ;
auto ep2 = s02 * 4.f - s22 * 5.f + s42 ;
VecType : : save ( dstPtr + 0 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 0 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 0 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = ( s10 + s20 ) * ( - 4.f ) + s30 + s40 ;
ep1 = ( s11 + s21 ) * ( - 4.f ) + s31 + s41 ;
ep2 = ( s12 + s22 ) * ( - 4.f ) + s32 + s42 ;
VecType : : save ( dstPtr + 1 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 1 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = ( s10 - s20 ) * ( 4.f ) + s40 - s30 ;
ep1 = ( s11 - s21 ) * ( 4.f ) + s41 - s31 ;
ep2 = ( s12 - s22 ) * ( 4.f ) + s42 - s32 ;
VecType : : save ( dstPtr + 2 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 2 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s10 * ( - 2.f ) - s20 + s30 * 2.f + s40 ;
ep1 = s11 * ( - 2.f ) - s21 + s31 * 2.f + s41 ;
ep2 = s12 * ( - 2.f ) - s22 + s32 * 2.f + s42 ;
VecType : : save ( dstPtr + 3 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 3 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 3 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s10 * 2.f - s20 - s30 * 2.f + s40 ;
ep1 = s11 * 2.f - s21 - s31 * 2.f + s41 ;
ep2 = s12 * 2.f - s22 - s32 * 2.f + s42 ;
VecType : : save ( dstPtr + 4 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 4 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 4 * dstStep + 2 * packCUnit , ep2 ) ;
ep0 = s10 * 4.f - s30 * 5.f + s50 ;
ep1 = s11 * 4.f - s31 * 5.f + s51 ;
ep2 = s12 * 4.f - s32 * 5.f + s52 ;
VecType : : save ( dstPtr + 5 * dstStep + 0 * packCUnit , ep0 ) ;
VecType : : save ( dstPtr + 5 * dstStep + 1 * packCUnit , ep1 ) ;
VecType : : save ( dstPtr + 5 * dstStep + 2 * packCUnit , ep2 ) ;
srcPtr + = ePack ;
dstPtr + = ePack ;
}
}
2021-04-08 15:34:23 +08:00
2022-01-04 10:50:40 +08:00
static void _sourceUnrollTransformUnit4x4 ( const ElementType * srcBlock , ElementType * dstStart , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
constexpr size_t srcUnit = 4 ; // srcUnit
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
for ( int i = 0 ; i < srcUnit - 1 ; + + i ) { //Nw iteration
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 - s2 ;
auto m1 = s1 + s2 ;
auto m2 = s2 - s1 ;
auto m3 = s3 - s1 ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( srcUnit - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto m0 = s0 - s2 ;
auto m1 = s1 + s2 ;
auto m2 = s2 - s1 ;
auto m3 = s3 - s1 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
2021-04-08 15:34:23 +08:00
}
2021-09-18 15:52:30 +08:00
2022-01-04 10:50:40 +08:00
static void _sourceUnrollTransformUnit6x6 ( const ElementType * srcBlock , ElementType * dstStart , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
VecType two ( 2.f ) ;
VecType four ( 4.f ) ;
VecType five ( 5.f ) ;
constexpr size_t srcUnit = 6 ; // srcUnit
VecType buf0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType buf1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType buf2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType buf3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType buf4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType buf5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
// #pragma unroll(srcUnit)
for ( int i = 0 ; i < srcUnit - 1 ; + + i ) { //Nw iteration
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto mid0 = VecType : : fma ( buf4 , buf2 , VecType ( - 4 ) ) ;
auto mid1 = VecType : : fma ( buf3 , buf1 , VecType ( - 4 ) ) ;
auto mid2 = VecType : : fma ( buf2 , buf0 , VecType ( - 4 ) ) ;
auto mid3 = VecType : : fma ( buf5 , buf3 , VecType ( - 4 ) ) ;
auto mid4 = buf4 - buf2 ;
auto mid5 = ( buf3 - buf1 ) * VecType ( 2 ) ;
VecType m0 = mid0 - mid2 ;
VecType m1 = mid0 + mid1 ;
VecType m2 = mid0 - mid1 ;
VecType m3 = mid4 + mid5 ;
VecType m4 = mid4 - mid5 ;
VecType m5 = mid3 - mid1 ;
buf0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
buf1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
buf2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
buf3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
buf4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
buf5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( srcUnit - 1 ) * dstRowStep ) ;
auto mid0 = VecType : : fma ( buf4 , buf2 , VecType ( - 4 ) ) ;
auto mid1 = VecType : : fma ( buf3 , buf1 , VecType ( - 4 ) ) ;
auto mid2 = VecType : : fma ( buf2 , buf0 , VecType ( - 4 ) ) ;
auto mid3 = VecType : : fma ( buf5 , buf3 , VecType ( - 4 ) ) ;
auto mid4 = buf4 - buf2 ;
auto mid5 = ( buf3 - buf1 ) * VecType ( 2 ) ;
VecType m0 = mid0 - mid2 ;
VecType m1 = mid0 + mid1 ;
VecType m2 = mid0 - mid1 ;
VecType m3 = mid4 + mid5 ;
VecType m4 = mid4 - mid5 ;
VecType m5 = mid3 - mid1 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
}
static void _sourceUnrollTransformUnit8x8 ( const ElementType * srcBlock , ElementType * dstStart , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
constexpr size_t srcUnit = 8 ; // srcUnit
VecType buf0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType buf1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType buf2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType buf3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType buf4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType buf5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
VecType buf6 = VecType : : load ( srcBlock + 6 * srcStep ) ;
VecType buf7 = VecType : : load ( srcBlock + 7 * srcStep ) ;
// #pragma unroll(srcUnit - 1)
for ( int i = 0 ; i < srcUnit - 1 ; + + i ) { //Nw iteration
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
VecType mid0 , mid1 , mid2 ;
mid0 = VecType : : fma ( VecType : : fma ( buf6 , buf2 , VecType ( 36 ) ) , buf4 , VecType ( - 13 ) ) ;
mid1 = VecType : : fma ( VecType : : fma ( buf4 , buf0 , VecType ( 36 ) ) , buf2 , VecType ( - 13 ) ) ;
VecType m0 = mid1 - mid0 ;
mid2 = VecType : : fma ( VecType : : fma ( buf5 , buf1 , VecType ( 36 ) ) , buf3 , VecType ( - 13 ) ) ;
VecType m1 = mid0 + mid2 ;
VecType m2 = mid0 - mid2 ;
mid1 = VecType : : fma ( VecType : : fma ( buf7 , buf3 , VecType ( 36 ) ) , buf5 , VecType ( - 13 ) ) ;
VecType m7 = mid1 - mid2 ;
mid0 = VecType : : fma ( VecType : : fma ( buf6 , buf2 , VecType ( 9 ) ) , buf4 , VecType ( - 10 ) ) ;
mid1 = VecType : : fma ( buf5 , buf1 , VecType ( 18 ) ) + VecType : : fma ( buf5 , buf3 , VecType ( - 20 ) ) ;
mid2 = VecType : : fma ( buf5 * 3 , buf1 , VecType ( 12 ) ) ;
VecType m3 = mid0 + mid1 ;
VecType m4 = mid0 - mid1 ;
mid0 = VecType : : fma ( VecType : : fma ( buf6 , buf2 , VecType ( 4 ) ) , buf4 , VecType ( - 5 ) ) ;
mid1 = VecType : : fma ( mid2 , buf3 , VecType ( - 15 ) ) ;
VecType m5 = mid0 + mid1 ;
VecType m6 = mid0 - mid1 ;
buf0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
buf1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
buf2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
buf3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
buf4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
buf5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
buf6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType : : save ( dstFloatPtr + 6 * dstStep , m6 ) ;
buf7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
VecType : : save ( dstFloatPtr + 7 * dstStep , m7 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( srcUnit - 1 ) * dstRowStep ) ;
VecType mid0 , mid1 , mid2 ;
mid0 = VecType : : fma ( VecType : : fma ( buf6 , buf2 , VecType ( 36 ) ) , buf4 , VecType ( - 13 ) ) ;
mid1 = VecType : : fma ( VecType : : fma ( buf4 , buf0 , VecType ( 36 ) ) , buf2 , VecType ( - 13 ) ) ;
VecType m0 = mid1 - mid0 ;
mid2 = VecType : : fma ( VecType : : fma ( buf5 , buf1 , VecType ( 36 ) ) , buf3 , VecType ( - 13 ) ) ;
VecType m1 = mid0 + mid2 ;
VecType m2 = mid0 - mid2 ;
mid1 = VecType : : fma ( VecType : : fma ( buf7 , buf3 , VecType ( 36 ) ) , buf5 , VecType ( - 13 ) ) ;
VecType m7 = mid1 - mid2 ;
mid0 = VecType : : fma ( VecType : : fma ( buf6 , buf2 , VecType ( 9 ) ) , buf4 , VecType ( - 10 ) ) ;
mid1 = VecType : : fma ( buf5 , buf1 , VecType ( 18 ) ) + VecType : : fma ( buf5 , buf3 , VecType ( - 20 ) ) ;
mid2 = VecType : : fma ( buf5 * 3 , buf1 , VecType ( 12 ) ) ;
VecType m3 = mid0 + mid1 ;
VecType m4 = mid0 - mid1 ;
mid0 = VecType : : fma ( VecType : : fma ( buf6 , buf2 , VecType ( 4 ) ) , buf4 , VecType ( - 5 ) ) ;
mid1 = VecType : : fma ( mid2 , buf3 , VecType ( - 15 ) ) ;
VecType m5 = mid0 + mid1 ;
VecType m6 = mid0 - mid1 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
VecType : : save ( dstFloatPtr + 6 * dstStep , m6 ) ;
VecType : : save ( dstFloatPtr + 7 * dstStep , m7 ) ;
2021-04-08 15:34:23 +08:00
2022-01-04 10:50:40 +08:00
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit4x2 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 + s1 + s2 ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m1 = ( s1 - s2 ) + s3 ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto m0 = s0 + s1 + s2 ;
auto m1 = ( s1 - s2 ) + s3 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit4x3 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 + s1 + s2 ;
auto m1 = ( s1 - s2 ) ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m2 = ( s1 + s2 ) + s3 ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto m0 = s0 + s1 + s2 ;
auto m1 = ( s1 - s2 ) ;
auto m2 = ( s1 + s2 ) + s3 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit6x5 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 + s1 + s2 + s3 + s4 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f ;
auto m2 = ( s1 + s2 ) + ( s3 + s4 ) * 4.f ;
auto m3 = ( s1 - s2 ) + ( s3 - s4 ) * 8.f ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m4 = ( s1 + s2 ) + ( s3 + s4 ) * 16.f + s5 ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto m0 = s0 + s1 + s2 + s3 + s4 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f ;
auto m2 = ( s1 + s2 ) + ( s3 + s4 ) * 4.f ;
auto m3 = ( s1 - s2 ) + ( s3 - s4 ) * 8.f ;
auto m4 = ( s1 + s2 ) + ( s3 + s4 ) * 16.f + s5 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit6x4 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto v0 = s3 + s4 ;
auto v1 = s3 - s4 ;
auto v2 = s1 + s2 ;
auto v3 = s1 - s2 ;
auto m0 = s0 + v2 + v0 ;
auto m1 = v3 + v1 + v1 ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m2 = v2 + v0 * 4.f ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
auto m3 = v3 + v1 * 8.f + s5 ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto v0 = s3 + s4 ;
auto v1 = s3 - s4 ;
auto v2 = s1 + s2 ;
auto v3 = s1 - s2 ;
auto m0 = s0 + v2 + v0 ;
auto m1 = v3 + v1 + v1 ;
auto m2 = v2 + v0 * 4.f ;
auto m3 = v3 + v1 * 8.f + s5 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit6x3 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 + s1 + s2 + s3 + s4 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m2 = ( s1 + s2 ) + ( s3 + s4 ) * 4.f + s5 ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto m0 = s0 + s1 + s2 + s3 + s4 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f ;
auto m2 = ( s1 + s2 ) + ( s3 + s4 ) * 4.f + s5 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
2021-04-08 15:34:23 +08:00
2022-01-04 10:50:40 +08:00
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit6x2 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 + s1 + s2 + s3 + s4 ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f + s5 ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
2021-04-08 15:34:23 +08:00
auto m0 = s0 + s1 + s2 + s3 + s4 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f + s5 ;
2022-01-04 10:50:40 +08:00
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit8x2 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
for ( int i = 0 ; i < IterLoop ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + i * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
VecType s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType s6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType s7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
auto m0 = s0 + s1 + s2 + s3 + s4 + s5 + s6 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f + ( s5 - s6 ) * 3.f + s7 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
}
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit8x3 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
VecType s6 = VecType : : load ( srcBlock + 6 * srcStep ) ;
VecType s7 = VecType : : load ( srcBlock + 7 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
auto m0 = s0 + s1 + s2 + s3 + s4 + s5 + s6 ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m2 = ( s1 + s2 ) + ( s3 + s4 ) * 4.f ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
m2 + = ( s5 + s6 ) * 9.f + s7 ;
m1 + = ( s5 - s6 ) * 3.f ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
auto m0 = s0 + s1 + s2 + s3 + s4 + s5 + s6 ;
auto m1 = ( s1 - s2 ) + ( s3 - s4 ) * 2.f + ( s5 - s6 ) * 3.f ;
auto m2 = ( s1 + s2 ) + ( s3 + s4 ) * 4.f + ( s5 + s6 ) * 9.f + s7 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit8x4 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
VecType s6 = VecType : : load ( srcBlock + 6 * srcStep ) ;
VecType s7 = VecType : : load ( srcBlock + 7 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f + s7 ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
s7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f + s7 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit8x5 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
VecType s6 = VecType : : load ( srcBlock + 6 * srcStep ) ;
VecType s7 = VecType : : load ( srcBlock + 7 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
auto m4 = mid0 + mid2 * 16.f + mid4 * 81.f + s7 ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
s7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f ;
auto m4 = mid0 + mid2 * 16.f + mid4 * 81.f + s7 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit8x6 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
VecType s6 = VecType : : load ( srcBlock + 6 * srcStep ) ;
VecType s7 = VecType : : load ( srcBlock + 7 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m4 = mid0 + mid2 * 16.f + mid4 * 81.f ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
auto m5 = mid1 + mid3 * 32.f + mid5 * 243.f + s7 ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
s6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
s7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
auto m0 = s0 + mid0 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
m0 = m0 + mid2 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
m0 = m0 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f ;
auto m4 = mid0 + mid2 * 16.f + mid4 * 81.f ;
auto m5 = mid1 + mid3 * 32.f + mid5 * 243.f + s7 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
}
template < size_t IterLoop >
2022-05-06 19:51:20 +08:00
static void _destUnrollTransformUnit8x7 ( const ElementType * srcBlock , ElementType * dstStart , const float * bias , const float * postParameters , size_t srcRowStep , size_t dstRowStep , size_t srcStep , size_t dstStep ) {
2022-01-04 10:50:40 +08:00
VecType s0 = VecType : : load ( srcBlock + 0 * srcStep ) ;
VecType s1 = VecType : : load ( srcBlock + 1 * srcStep ) ;
VecType s2 = VecType : : load ( srcBlock + 2 * srcStep ) ;
VecType s3 = VecType : : load ( srcBlock + 3 * srcStep ) ;
VecType s4 = VecType : : load ( srcBlock + 4 * srcStep ) ;
VecType s5 = VecType : : load ( srcBlock + 5 * srcStep ) ;
VecType s6 = VecType : : load ( srcBlock + 6 * srcStep ) ;
VecType s7 = VecType : : load ( srcBlock + 7 * srcStep ) ;
for ( int i = 0 ; i < IterLoop - 1 ; + + i ) {
auto srcFloatPtr = ( const ElementType * ) ( srcBlock + ( i + 1 ) * srcRowStep ) ;
auto dstFloatPtr = ( ElementType * ) ( dstStart + i * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f ;
auto m4 = mid0 + mid2 * 16.f + mid4 * 81.f ;
auto m5 = mid1 + mid3 * 32.f + mid5 * 243.f ;
s0 = VecType : : load ( srcFloatPtr + 0 * srcStep ) ;
auto m6 = mid0 + mid2 * 64.f + mid4 * 729.f + s7 ;
s1 = VecType : : load ( srcFloatPtr + 1 * srcStep ) ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
s2 = VecType : : load ( srcFloatPtr + 2 * srcStep ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
s3 = VecType : : load ( srcFloatPtr + 3 * srcStep ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
s4 = VecType : : load ( srcFloatPtr + 4 * srcStep ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
s5 = VecType : : load ( srcFloatPtr + 5 * srcStep ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
s6 = VecType : : load ( srcFloatPtr + 6 * srcStep ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
s7 = VecType : : load ( srcFloatPtr + 7 * srcStep ) ;
VecType : : save ( dstFloatPtr + 6 * dstStep , m6 ) ;
}
auto dstFloatPtr = ( ElementType * ) ( dstStart + ( IterLoop - 1 ) * dstRowStep ) ;
VecType mid0 , mid1 , mid2 , mid3 , mid4 , mid5 ;
mid0 = s1 + s2 ;
mid1 = s1 - s2 ;
mid2 = s3 + s4 ;
mid3 = s3 - s4 ;
mid4 = s5 + s6 ;
mid5 = s5 - s6 ;
auto m0 = s0 + mid0 + mid2 + mid4 ;
auto m1 = mid1 + mid3 * 2.f + mid5 * 3.f ;
auto m2 = mid0 + mid2 * 4.f + mid4 * 9.f ;
auto m3 = mid1 + mid3 * 8.f + mid5 * 27.f ;
auto m4 = mid0 + mid2 * 16.f + mid4 * 81.f ;
auto m5 = mid1 + mid3 * 32.f + mid5 * 243.f ;
auto m6 = mid0 + mid2 * 64.f + mid4 * 729.f + s7 ;
VecType : : save ( dstFloatPtr + 0 * dstStep , m0 ) ;
VecType : : save ( dstFloatPtr + 1 * dstStep , m1 ) ;
VecType : : save ( dstFloatPtr + 2 * dstStep , m2 ) ;
VecType : : save ( dstFloatPtr + 3 * dstStep , m3 ) ;
VecType : : save ( dstFloatPtr + 4 * dstStep , m4 ) ;
VecType : : save ( dstFloatPtr + 5 * dstStep , m5 ) ;
VecType : : save ( dstFloatPtr + 6 * dstStep , m6 ) ;
}
2021-09-18 15:52:30 +08:00
WinogradFunctionHalf : : TransformPackFunc WinogradFunctionHalf : : chooseWinoSourceTransformPack ( int k , int w , int ePack , int lPack , int packCUnit ) {
if ( ePack = = 12 & & lPack = = 1 & & packCUnit = = 4 ) {
if ( k = = 4 & & w = = 4 ) {
return _sourceTransformUnit4x4Pack12 ;
}
if ( k = = 6 & & w = = 6 ) {
return _sourceTransformUnit6x6Pack12 ;
}
2022-05-06 19:51:20 +08:00
# ifdef USE_8x8_WINOGRAD_KERNEL
2021-09-18 15:52:30 +08:00
if ( k = = 8 & & w = = 8 ) {
return _sourceTransformUnit8x8Pack12 ;
}
2022-05-06 19:51:20 +08:00
# endif
2021-09-18 15:52:30 +08:00
// other packing size
}
2022-05-06 19:51:20 +08:00
2021-09-18 15:52:30 +08:00
// if (ePack == 3 && lPack == 8 && packCUnit == 4) no need to transformPack for x86 bf16 pack format of 3 x 8 x 4, will not be called in ConvolutionWinograd.cpp by allow_x86_bf16_winograd
return nullptr ;
}
2022-01-04 10:50:40 +08:00
WinogradFunctionHalf : : WinoUnrollTransFunc WinogradFunctionHalf : : chooseSourceUnrollTransform ( int k , int w ) {
2022-05-06 19:51:20 +08:00
# ifdef USE_8x8_WINOGRAD_KERNEL
2022-01-04 10:50:40 +08:00
if ( 8 = = k & & 8 = = w ) {
return _sourceUnrollTransformUnit8x8 ;
}
2022-05-06 19:51:20 +08:00
# endif
2021-04-08 15:34:23 +08:00
if ( 6 = = k & & 6 = = w ) {
2022-01-04 10:50:40 +08:00
return _sourceUnrollTransformUnit6x6 ;
2021-04-08 15:34:23 +08:00
}
if ( 4 = = k & & 4 = = w ) {
2022-01-04 10:50:40 +08:00
return _sourceUnrollTransformUnit4x4 ;
2021-04-08 15:34:23 +08:00
}
return nullptr ;
}
2022-01-04 10:50:40 +08:00
2022-05-06 19:51:20 +08:00
void WinogradFunctionHalf : : chooseWinoDestUnrollTransform ( WinogradFunctionHalf : : WinoUnrollDestTransFunc * destFunctions , size_t maxUnit , int k , int h ) {
2022-01-04 10:50:40 +08:00
2022-05-06 19:51:20 +08:00
static WinogradFunctionHalf : : WinoUnrollDestTransFunc gDestTransUnit4 [ ] [ 5 ] = {
2022-01-04 10:50:40 +08:00
{
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr
} , // 0
{
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr
} , // 1
{
nullptr ,
_destUnrollTransformUnit4x2 < 1 > ,
_destUnrollTransformUnit4x2 < 2 > ,
_destUnrollTransformUnit4x2 < 3 > ,
_destUnrollTransformUnit4x2 < 4 >
} ,
{
nullptr ,
_destUnrollTransformUnit4x3 < 1 > ,
_destUnrollTransformUnit4x3 < 2 > ,
_destUnrollTransformUnit4x3 < 3 > ,
_destUnrollTransformUnit4x3 < 4 >
}
} ;
2022-05-06 19:51:20 +08:00
static WinogradFunctionHalf : : WinoUnrollDestTransFunc gDestTransUnit6 [ ] [ 7 ] = {
2022-01-04 10:50:40 +08:00
{
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr
} , // 0
{
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr
} , // 1
{
nullptr ,
_destUnrollTransformUnit6x2 < 1 > ,
_destUnrollTransformUnit6x2 < 2 > ,
_destUnrollTransformUnit6x2 < 3 > ,
_destUnrollTransformUnit6x2 < 4 > ,
_destUnrollTransformUnit6x2 < 5 > ,
_destUnrollTransformUnit6x2 < 6 >
} ,
{
nullptr ,
_destUnrollTransformUnit6x3 < 1 > ,
_destUnrollTransformUnit6x3 < 2 > ,
_destUnrollTransformUnit6x3 < 3 > ,
_destUnrollTransformUnit6x3 < 4 > ,
_destUnrollTransformUnit6x3 < 5 > ,
_destUnrollTransformUnit6x3 < 6 >
} ,
{
nullptr ,
_destUnrollTransformUnit6x4 < 1 > ,
_destUnrollTransformUnit6x4 < 2 > ,
_destUnrollTransformUnit6x4 < 3 > ,
_destUnrollTransformUnit6x4 < 4 > ,
_destUnrollTransformUnit6x4 < 5 > ,
_destUnrollTransformUnit6x4 < 6 >
} ,
{
nullptr ,
_destUnrollTransformUnit6x5 < 1 > ,
_destUnrollTransformUnit6x5 < 2 > ,
_destUnrollTransformUnit6x5 < 3 > ,
_destUnrollTransformUnit6x5 < 4 > ,
_destUnrollTransformUnit6x5 < 5 > ,
_destUnrollTransformUnit6x5 < 6 >
}
} ;
2022-05-06 19:51:20 +08:00
static WinogradFunctionHalf : : WinoUnrollDestTransFunc gDestTransUnit8 [ ] [ 9 ] = {
2022-01-04 10:50:40 +08:00
{
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr
} , // 0
{
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr ,
nullptr
} , // 1
{
nullptr ,
_destUnrollTransformUnit8x2 < 1 > ,
_destUnrollTransformUnit8x2 < 2 > ,
_destUnrollTransformUnit8x2 < 3 > ,
_destUnrollTransformUnit8x2 < 4 > ,
_destUnrollTransformUnit8x2 < 5 > ,
_destUnrollTransformUnit8x2 < 6 > ,
_destUnrollTransformUnit8x2 < 7 > ,
_destUnrollTransformUnit8x2 < 8 >
} ,
{
nullptr ,
_destUnrollTransformUnit8x3 < 1 > ,
_destUnrollTransformUnit8x3 < 2 > ,
_destUnrollTransformUnit8x3 < 3 > ,
_destUnrollTransformUnit8x3 < 4 > ,
_destUnrollTransformUnit8x3 < 5 > ,
_destUnrollTransformUnit8x3 < 6 > ,
_destUnrollTransformUnit8x3 < 7 > ,
_destUnrollTransformUnit8x3 < 8 >
} ,
{
nullptr ,
_destUnrollTransformUnit8x4 < 1 > ,
_destUnrollTransformUnit8x4 < 2 > ,
_destUnrollTransformUnit8x4 < 3 > ,
_destUnrollTransformUnit8x4 < 4 > ,
_destUnrollTransformUnit8x4 < 5 > ,
_destUnrollTransformUnit8x4 < 6 > ,
_destUnrollTransformUnit8x4 < 7 > ,
_destUnrollTransformUnit8x4 < 8 >
} ,
{
nullptr ,
_destUnrollTransformUnit8x5 < 1 > ,
_destUnrollTransformUnit8x5 < 2 > ,
_destUnrollTransformUnit8x5 < 3 > ,
_destUnrollTransformUnit8x5 < 4 > ,
_destUnrollTransformUnit8x5 < 5 > ,
_destUnrollTransformUnit8x5 < 6 > ,
_destUnrollTransformUnit8x5 < 7 > ,
_destUnrollTransformUnit8x5 < 8 >
} ,
{
nullptr ,
_destUnrollTransformUnit8x6 < 1 > ,
_destUnrollTransformUnit8x6 < 2 > ,
_destUnrollTransformUnit8x6 < 3 > ,
_destUnrollTransformUnit8x6 < 4 > ,
_destUnrollTransformUnit8x6 < 5 > ,
_destUnrollTransformUnit8x6 < 6 > ,
_destUnrollTransformUnit8x6 < 7 > ,
_destUnrollTransformUnit8x6 < 8 >
} ,
{
nullptr ,
_destUnrollTransformUnit8x7 < 1 > ,
_destUnrollTransformUnit8x7 < 2 > ,
_destUnrollTransformUnit8x7 < 3 > ,
_destUnrollTransformUnit8x7 < 4 > ,
_destUnrollTransformUnit8x7 < 5 > ,
_destUnrollTransformUnit8x7 < 6 > ,
_destUnrollTransformUnit8x7 < 7 > ,
_destUnrollTransformUnit8x7 < 8 >
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
} ;
2022-05-06 19:51:20 +08:00
: : memset ( ( void * ) destFunctions , 0 , maxUnit * sizeof ( WinogradFunctionHalf : : WinoUnrollDestTransFunc ) ) ;
# ifdef USE_8x8_WINOGRAD_KERNEL
2022-01-04 10:50:40 +08:00
if ( 8 = = k & & h > 1 & & h < 8 ) {
2022-05-06 19:51:20 +08:00
memcpy ( ( void * ) destFunctions , gDestTransUnit8 [ h ] , ( 8 + 1 ) * sizeof ( WinogradFunctionHalf : : WinoUnrollDestTransFunc ) ) ;
2022-01-04 10:50:40 +08:00
return ;
2021-04-08 15:34:23 +08:00
}
2022-05-06 19:51:20 +08:00
# endif
2022-01-04 10:50:40 +08:00
if ( 6 = = k & & h > 1 & & h < 6 ) {
2022-05-06 19:51:20 +08:00
: : memcpy ( ( void * ) destFunctions , gDestTransUnit6 [ h ] , ( 6 + 1 ) * sizeof ( WinogradFunctionHalf : : WinoUnrollDestTransFunc ) ) ;
2022-01-04 10:50:40 +08:00
return ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
if ( 4 = = k & & h > 1 & & h < 4 ) {
2022-05-06 19:51:20 +08:00
memcpy ( ( void * ) destFunctions , gDestTransUnit4 [ h ] , ( 4 + 1 ) * sizeof ( WinogradFunctionHalf : : WinoUnrollDestTransFunc ) ) ;
2022-01-04 10:50:40 +08:00
return ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
return ;
2021-04-08 15:34:23 +08:00
}
2022-01-04 10:50:40 +08:00
2021-04-08 15:34:23 +08:00
} // namespace MNN
2021-09-18 15:52:30 +08:00
# undef TRANSPOSE_12X4_SAVE
2022-05-06 19:51:20 +08:00
# undef USE_8x8_WINOGRAD_KERNEL
2021-09-18 15:52:30 +08:00