mirror of https://github.com/alibaba/MNN.git

Compare commits: 10 commits, 8cb9bcc450 ... 65d7cd34c9

| Author | SHA1 | Date |
|---|---|---|
| | 65d7cd34c9 | |
| | daa62c77c1 | |
| | 6fbbfda5ec | |
| | f845f0e665 | |
| | 4c9f48b76b | |
| | 5e3c8a3c12 | |
| | 4f790e8bd4 | |
| | b5b5845787 | |
| | da8b7337c4 | |
| | 875814bfb9 | |
@@ -754,6 +754,10 @@
     CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
     CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; };
     CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
+    CE0AD4E42E1FB106002013A8 /* CountMinMaxValue_FP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE0AD4E32E1FB106002013A8 /* CountMinMaxValue_FP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
+    CE0AD4E82E1FB152002013A8 /* MoEModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE0AD4E62E1FB152002013A8 /* MoEModule.hpp */; };
+    CE0AD4E92E1FB152002013A8 /* ModuleInside.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE0AD4E52E1FB152002013A8 /* ModuleInside.hpp */; };
+    CE0AD4EA2E1FB152002013A8 /* MoEModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE0AD4E72E1FB152002013A8 /* MoEModule.cpp */; };
     CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
     CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
     CE31C7C12D783CBB00741F49 /* WorkerThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE31C7C02D783CBB00741F49 /* WorkerThread.cpp */; };
@@ -1587,6 +1591,10 @@
     CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
     CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = "<group>"; };
     CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; sourceTree = "<group>"; };
+    CE0AD4E32E1FB106002013A8 /* CountMinMaxValue_FP16.S */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; name = CountMinMaxValue_FP16.S; path = ../arm82/asm/arm64/CountMinMaxValue_FP16.S; sourceTree = "<group>"; };
+    CE0AD4E52E1FB152002013A8 /* ModuleInside.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = ModuleInside.hpp; sourceTree = "<group>"; };
+    CE0AD4E62E1FB152002013A8 /* MoEModule.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = MoEModule.hpp; sourceTree = "<group>"; };
+    CE0AD4E72E1FB152002013A8 /* MoEModule.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = MoEModule.cpp; sourceTree = "<group>"; };
     CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
     CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
     CE31C7BF2D783CBB00741F49 /* WorkerThread.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = WorkerThread.hpp; sourceTree = "<group>"; };
@@ -1851,7 +1859,6 @@
     488873A8215B639D0079B12E /* source */ = {
         isa = PBXGroup;
         children = (
-            CE482EF5288536DA007CD935 /* internal */,
            4DF87C482887D3560003E2D4 /* calib3d */,
            4D4CF4612760946500A36D9F /* imgproc */,
            4D9A931B26255BDA00F9B43C /* coreml */,
@@ -1919,6 +1926,7 @@
     48887410215B639D0079B12E /* cpu */ = {
         isa = PBXGroup;
         children = (
+            CE0AD4E32E1FB106002013A8 /* CountMinMaxValue_FP16.S */,
            CEA3C8892D6D71E1003EFAD2 /* CPUStft.hpp */,
            CEA3C88A2D6D71E1003EFAD2 /* CPUStft.cpp */,
            CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */,
@@ -2203,6 +2211,9 @@
     48C84B6F250F711600EE7666 /* module */ = {
         isa = PBXGroup;
         children = (
+            CE0AD4E52E1FB152002013A8 /* ModuleInside.hpp */,
+            CE0AD4E62E1FB152002013A8 /* MoEModule.hpp */,
+            CE0AD4E72E1FB152002013A8 /* MoEModule.cpp */,
            48C84B71250F711600EE7666 /* PipelineModule.cpp */,
            48C84B72250F711600EE7666 /* Module.cpp */,
            48C84B73250F711600EE7666 /* WhileModule.hpp */,
@@ -2881,7 +2892,6 @@
     CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */,
     4DE4E82C275E307B0016A916 /* cv in Headers */,
     1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */,
-    CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */,
     1F501F822397BA5B004E8721 /* Interpreter.hpp in Headers */,
     C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */,
     1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */,
@@ -2892,7 +2902,6 @@
     48C84B98250F71E900EE7666 /* CPUSoftmax.hpp in Headers */,
     4882C8B8241A22B800DAC168 /* OpCommonUtils.hpp in Headers */,
     48608B54250632EC00CB1D71 /* GeometryComputer.hpp in Headers */,
-    CECF8C7A299CAD9400D3875B /* sha1.h in Headers */,
     4894C6EC27016F7200D8BE79 /* CPUResizeCache.hpp in Headers */,
     92FF04A623AA0BFB00AC97F6 /* FileLoader.hpp in Headers */,
     48F34733273A7C8400C45394 /* ImageProcessFunction.hpp in Headers */,
@@ -2906,7 +2915,6 @@
     48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */,
     92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
     4D9A937826255BDA00F9B43C /* CoreMLBinary.hpp in Headers */,
-    CECF8C85299CAD9400D3875B /* log_util.h in Headers */,
     4D6D7FD52656896600F80814 /* DenseConvolutionTiledExecutor.hpp in Headers */,
     4D9A936626255BDA00F9B43C /* CoreMLExecutor.h in Headers */,
     92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */,
@@ -2915,7 +2923,6 @@
     1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */,
     19D0FE76285C66F200B74B1A /* MetalLayerNorm.hpp in Headers */,
     489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */,
-    CECF8C86299CAD9400D3875B /* sds.h in Headers */,
     1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */,
     92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */,
     4D9A935B26255BDA00F9B43C /* NeuralNetwork.pb-c.h in Headers */,
@@ -2937,10 +2944,8 @@
     481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */,
     4894C6EA27016F7200D8BE79 /* UnaryUtils.hpp in Headers */,
     EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */,
-    CECF8C81299CAD9400D3875B /* log_util_imp.h in Headers */,
     92FF037623AA0B5A00AC97F6 /* CPUBinary.hpp in Headers */,
     4D9A935826255BDA00F9B43C /* FeatureTypes.pb-c.h in Headers */,
-    CECF8C7C299CAD9400D3875B /* hmac-sha.h in Headers */,
     48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */,
     489D7A732550FDC800AD896A /* MetalBackend.hpp in Headers */,
     92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */,
@@ -2963,7 +2968,6 @@
     4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
     4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
     48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
-    CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
     4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
     92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
     4AF4FB26269ED235005BA97B /* SparseConvInt8TiledExecutor.hpp in Headers */,
@@ -3001,7 +3005,6 @@
     92FF03CA23AA0B5A00AC97F6 /* CPUConvolutionDepthwise.hpp in Headers */,
     92FF04A923AA0BFB00AC97F6 /* Schedule.hpp in Headers */,
     489D7A9F2550FDC900AD896A /* MetalConvolutionCommon.hpp in Headers */,
-    CECF8C80299CAD9400D3875B /* lz4.h in Headers */,
     92FF028623AA0B5A00AC97F6 /* CPUDeconvolution.hpp in Headers */,
     489D7A722550FDC800AD896A /* MetalReLU6.hpp in Headers */,
     92FF04B523AA0BFB00AC97F6 /* TensorUtils.hpp in Headers */,
@@ -3042,6 +3045,8 @@
     92FF026023AA0B5A00AC97F6 /* CPURNNSequenceGRU.hpp in Headers */,
     48747D4F245D9E13000B9709 /* CPURaster.hpp in Headers */,
     489D7A822550FDC900AD896A /* MetalPReLU.hpp in Headers */,
+    CE0AD4E82E1FB152002013A8 /* MoEModule.hpp in Headers */,
+    CE0AD4E92E1FB152002013A8 /* ModuleInside.hpp in Headers */,
     48C84B84250F711700EE7666 /* WhileModule.hpp in Headers */,
     92FF02A923AA0B5A00AC97F6 /* CPUCropAndResize.hpp in Headers */,
     4D6D7FD92656897200F80814 /* SparseConvolutionTiledExecutor.hpp in Headers */,
@@ -3053,24 +3058,20 @@
     92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */,
     92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */,
     92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */,
-    CECF8C88299CAD9400D3875B /* log_api.h in Headers */,
     4A224A0D27D0C2D9000A9260 /* ConvolutionPackWinograd.hpp in Headers */,
     4A224A0E27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.hpp in Headers */,
     4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */,
     48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */,
     F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */,
-    CECF8C5B299CACFD00D3875B /* LogHelper.hpp in Headers */,
     92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */,
     482BFBCD28351BA1009210E4 /* ShaderMap.hpp in Headers */,
     489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */,
-    CECF8C7F299CAD9400D3875B /* md5.h in Headers */,
     92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */,
     92FF028023AA0B5A00AC97F6 /* CPUFloatToInt8.hpp in Headers */,
     92FF028723AA0B5A00AC97F6 /* CPUFixedPoint.hpp in Headers */,
     C43C8227251894F400A0FF84 /* Vec.hpp in Headers */,
     4819FB1D24C138DF0050BD09 /* GeometryConvUtils.hpp in Headers */,
     489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */,
-    CECF8C83299CAD9400D3875B /* log_define.h in Headers */,
     C48CAE2628900C4A00271A6D /* ConvInt8Winograd.hpp in Headers */,
     48F34730273A7C7300C45394 /* CPUImageProcess.hpp in Headers */,
     489D7A702550FDC800AD896A /* MetalRaster.hpp in Headers */,
@@ -3391,7 +3392,6 @@
     48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */,
     6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */,
     48958781268EBA6F00EA01A7 /* CPUSegmentMean.cpp in Sources */,
-    CECF8C7B299CAD9400D3875B /* sha1.c in Sources */,
     4D9A937026255BDA00F9B43C /* CoreMLUnary.cpp in Sources */,
     92FF04A823AA0BFB00AC97F6 /* AutoTime.cpp in Sources */,
     92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */,
@@ -3418,6 +3418,7 @@
     48925F342744AC0700919B37 /* CPUROIAlign.cpp in Sources */,
     4896D36925FE2A3D00717702 /* Arm82Unary.cpp in Sources */,
     4DCF53902892B17100B5B393 /* ShapeHistogram.cpp in Sources */,
+    CE0AD4EA2E1FB152002013A8 /* MoEModule.cpp in Sources */,
     92FF043423AA0B7100AC97F6 /* ShapeStridedSlice.cpp in Sources */,
     4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */,
     4D4CF46B2760946500A36D9F /* draw.cpp in Sources */,
@@ -3446,7 +3447,6 @@
     92FF03CE23AA0B5A00AC97F6 /* CPUOPRegister.cpp in Sources */,
     92FF02B323AA0B5A00AC97F6 /* CPUInstanceNorm.cpp in Sources */,
     4819FB2C24C1396A0050BD09 /* GeometryPoolGrad.cpp in Sources */,
-    CECF8C7E299CAD9400D3875B /* log_builder.cpp in Sources */,
     92FF042223AA0B7100AC97F6 /* ShapeConcat.cpp in Sources */,
     4D6D7FD12656891400F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
     4D5662CC299B76ED0031C1A1 /* MNNMaxPoolInt8.S in Sources */,
@@ -3520,11 +3520,11 @@
     92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
     EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
     4AF4FB29269ED244005BA97B /* MNNPackedSparseQuantMatMulEpx1.S in Sources */,
+    CE0AD4E42E1FB106002013A8 /* CountMinMaxValue_FP16.S in Sources */,
     4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */,
     11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
     48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
     CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */,
-    CECF8C7D299CAD9400D3875B /* md5.c in Sources */,
     92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,
     92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */,
     CE072A182C91AEE700F190FD /* MNNGRAYToC4Fast.S in Sources */,
@@ -3588,10 +3588,8 @@
     92FF042E23AA0B7100AC97F6 /* ShapeProposal.cpp in Sources */,
     92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */,
     92FF045B23AA0B7100AC97F6 /* ShapeShape.cpp in Sources */,
-    CECF8C87299CAD9400D3875B /* sds.c in Sources */,
     9560EAD62BDE426A00C8D0B6 /* GeometryLayernorm.cpp in Sources */,
     4D6D7FD72656896D00F80814 /* SparseConvolutionTiledExecutor.cpp in Sources */,
-    CECF8C82299CAD9400D3875B /* log_api.cpp in Sources */,
     92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */,
     4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */,
     92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
@@ -3599,7 +3597,6 @@
     4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
     CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
     C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
-    CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */,
     48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
     92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
     CE072A162C91AEE700F190FD /* MNNBGRAToBGR.S in Sources */,
@@ -3627,7 +3624,6 @@
     CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */,
     EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
     92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
-    CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
     92FF045623AA0B7100AC97F6 /* ShapeReshape.cpp in Sources */,
     92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
     92FF044423AA0B7100AC97F6 /* ShapeLSTM.cpp in Sources */,
@@ -3663,7 +3659,6 @@
     92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */,
     92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
     CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */,
-    CECF8C78299CAD9400D3875B /* log_util_imp.cpp in Sources */,
     CE072A152C91AEE700F190FD /* MNNRGBAToGRAYFast.S in Sources */,
     92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
     952298B22B4D39050043978B /* MetalLoop.mm in Sources */,
@@ -3688,13 +3683,11 @@
     92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
     4D9A937926255BDA00F9B43C /* CoreMLRaster.cpp in Sources */,
     48417FF224D13BF50056D9A7 /* GeometrySelect.cpp in Sources */,
-    CECF8C84299CAD9400D3875B /* lz4.c in Sources */,
     489D7A7E2550FDC900AD896A /* MNNMetalContext.mm in Sources */,
     92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */,
     92FF036B23AA0B5A00AC97F6 /* CPUResize.cpp in Sources */,
     92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
     92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
-    CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */,
     92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
     CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */,
     92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */,
@@ -4168,6 +4161,7 @@
     LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
     OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
     PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcdeve;
+    "PRODUCT_BUNDLE_IDENTIFIER[sdk=iphoneos*]" = com.taobao.mnn.abcdes;
     PRODUCT_NAME = "$(TARGET_NAME)";
     TARGETED_DEVICE_FAMILY = "1,2";
 };
@@ -4196,6 +4190,7 @@
     LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
     OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
     PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcdeve;
+    "PRODUCT_BUNDLE_IDENTIFIER[sdk=iphoneos*]" = com.taobao.mnn.abcdes;
     PRODUCT_NAME = "$(TARGET_NAME)";
     TARGETED_DEVICE_FAMILY = "1,2";
 };
@@ -4309,4 +4304,3 @@
 };
 rootObject = 0F1465AE1FA18D1000F9860A /* Project object */;
 }
-
@@ -62,7 +62,9 @@ if (MNN_KLEIDIAI)
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/
-        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/)
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/)
 
     list(APPEND MNN_SOURCES_KLEIDIAI ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c)
     list(APPEND MNN_SOURCES_KLEIDIAI ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c)
@@ -93,9 +95,15 @@ if (MNN_KLEIDIAI)
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot.c
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.c
     )
 
-    set_source_files_properties(${MNN_SOURCES_KLEIDIAI} PROPERTIES COMPILE_OPTIONS -march=armv8.2-a+i8mm+dotprod+sve+sve2+fp16)
+    set_source_files_properties(${MNN_SOURCES_KLEIDIAI} PROPERTIES COMPILE_OPTIONS "-fno-tree-vectorize;-march=armv8.2-a+i8mm+dotprod+sve+sve2+fp16")
     set_source_files_properties(${KLEIDIAI_FILES_SME2} PROPERTIES COMPILE_OPTIONS "-fno-tree-vectorize;-march=armv8.2-a+sve+sve2")
 
 endif()
@@ -15,13 +15,11 @@ KleidiAI *KleidiAI::mKaiInstance = NULL;
 KleidiAI::StaticInfo KleidiAI::mStaticInfo;
 
 //Get instance.
-KleidiAI& KleidiAI::getInstance(const MNNCPUInfo& gCPUInfo, bool bFP16, bool bBF16) {
+KleidiAI& KleidiAI::getInstance(const MNNCPUInfo& gCPUInfo) {
     if(!mKaiInstance) {
         mKaiInstance = new KleidiAI;
         mKaiInitialized = true;
 
-        mStaticInfo.mFP16 = bFP16;
-        mStaticInfo.mBF16 = bBF16;
         mStaticInfo.mDot = gCPUInfo.dot;
         mStaticInfo.mI8mm = gCPUInfo.i8mm;
         mStaticInfo.mSme2 = gCPUInfo.sme2;
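The hunk above removes the precision flags from singleton construction. A minimal sketch of the call-site impact, assuming a hypothetical caller (`initKleidiAI` is illustrative, not part of this patch):

```cpp
// Hypothetical call site illustrating the new getInstance() contract.
void initKleidiAI(const MNNCPUInfo& cpuInfo) {
    // Before: KleidiAI::getInstance(cpuInfo, /*bFP16=*/true, /*bBF16=*/false);
    // After: only CPU features are captured at construction; the data type is
    // carried per accelerator type via the new *_F32 / *_F16 enumerators.
    KleidiAI& kai = KleidiAI::getInstance(cpuInfo);
    (void)kai; // later KleidiAI::getInstance() calls reuse this instance
}
```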
@@ -45,9 +43,11 @@ void KleidiAI::printInfo(AccelType type) {
 }
 
 static const char * const names[] = {
-    "QI4_ASYM_CHNLQT",
-    "QI4_ASYM_BLKQT",
-    "QI4_SYM_CHNLQT",
+    "QI4_ASYM_CHNLQT_F32",
+    "QI4_ASYM_CHNLQT_F16",
+    "QI4_ASYM_BLKQT_F32",
+    "QI4_ASYM_BLKQT_F16",
+    "QI4_SYM_CHNLQT_F32",
     "QI4_SYM_BLKQT",
     "QI8_ASYM_CHNLQT",
     "QI8_ASYM_BLKQT",
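Because `printInfo` indexes `names[]` with `(size_t)type`, this string table must stay in lockstep with the `AccelType` enumerators renamed in the header. A self-contained miniature of that invariant (simplified, assumed names; the patch itself adds no such guard):

```cpp
#include <cstddef>

// Miniature of the names-vs-enum invariant; not MNN code.
enum class AccelType : size_t { QI4_ASYM_CHNLQT_F32, QI4_ASYM_CHNLQT_F16, COUNT };
static const char* const kNames[] = {"QI4_ASYM_CHNLQT_F32", "QI4_ASYM_CHNLQT_F16"};
static_assert(sizeof(kNames) / sizeof(kNames[0]) == static_cast<size_t>(AccelType::COUNT),
              "string table out of sync with AccelType");
```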
@@ -60,18 +60,11 @@ void KleidiAI::printInfo(AccelType type) {
 
     KernelInfo *pInfo = &mStaticInfo.mKernelInfo[(size_t)type];
     if(pInfo->mKernelSupport) {
-        MNN_PRINT("\nKleidiAI is running! AccelType is %s. ", names[(size_t)type]);
+        MNN_PRINT("\nKleidiAI is running! AccelType is %s.\n", names[(size_t)type]);
     } else {
-        MNN_PRINT("\nKleidiAI cannot accelerate! AccelType is %s. ", names[(size_t)type]);
+        MNN_PRINT("\nKleidiAI cannot accelerate! AccelType is %s.\n", names[(size_t)type]);
     }
-
-    if(mStaticInfo.mFP16) {
-        MNN_PRINT("Data type is FP16.\n");
-    } else if(mStaticInfo.mBF16) {
-        MNN_PRINT("Data type is BF16.\n");
-    } else {
-        MNN_PRINT("Data type is FP32.\n");
-    }
 }
 
 //Init
@@ -82,52 +75,50 @@ void KleidiAI::initKernelInfo() {
         bool bSupport = false;
 
         switch(static_cast<AccelType>(type)) {
-            case AccelType::QI4_SYM_CHNLQT:
+            case AccelType::QI4_SYM_CHNLQT_F32:
             {
-                if(!mStaticInfo.mFP16 && !mStaticInfo.mBF16) { //Currently only support FP32.
-                    if(mStaticInfo.mSme2) {
-                        bSupport = true;
-                        pParam->mKaiMstepGemv = 1;
-                        pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
-                        pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
-                        pParam->mKaiMrGemv = 1;
-                        pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
-                        pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
-                        pParam->mKaiKr = 4;
-                        pParam->mKaiSr = 1;
-                    } else if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
-                        bSupport = true;
-                        pParam->mKaiMstepGemv = 1;
-                        pParam->mKaiMstepGemm = 8;
-                        pParam->mKaiNStep = 4;
-                        pParam->mKaiMrGemv = 1;
-                        pParam->mKaiMrGemm = 4;
-                        pParam->mKaiNr = 4;
-                        pParam->mKaiKr = 16;
-                        pParam->mKaiSr = 2;
-                    } else {
-                        bSupport = false;
-                    }
-                }
+                if(mStaticInfo.mSme2) {
+                    bSupport = true;
+                    pParam->mKaiMstepGemv = 1;
+                    pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
+                    pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
+                    pParam->mKaiMrGemv = 1;
+                    pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
+                    pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
+                    pParam->mKaiKr = 4;
+                    pParam->mKaiSr = 1;
+                } else if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
+                    bSupport = true;
+                    pParam->mKaiMstepGemv = 1;
+                    pParam->mKaiMstepGemm = 8;
+                    pParam->mKaiNStep = 4;
+                    pParam->mKaiMrGemv = 1;
+                    pParam->mKaiMrGemm = 4;
+                    pParam->mKaiNr = 4;
+                    pParam->mKaiKr = 16;
+                    pParam->mKaiSr = 2;
+                } else {
+                    bSupport = false;
+                }
                 break;
             }
-            case AccelType::QI4_ASYM_CHNLQT:
-            case AccelType::QI4_ASYM_BLKQT:
+            case AccelType::QI4_ASYM_CHNLQT_F32:
+            case AccelType::QI4_ASYM_CHNLQT_F16:
+            case AccelType::QI4_ASYM_BLKQT_F32:
+            case AccelType::QI4_ASYM_BLKQT_F16:
             {
-                if(!mStaticInfo.mBF16) { //Currently support FP32 and FP16.
-                    if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
-                        bSupport = true;
-                        pParam->mKaiMstepGemv = 1;
-                        pParam->mKaiMstepGemm = 8;
-                        pParam->mKaiNStep = 4;
-                        pParam->mKaiMrGemv = 1;
-                        pParam->mKaiMrGemm = 4;
-                        pParam->mKaiNr = 4;
-                        pParam->mKaiKr = 16;
-                        pParam->mKaiSr = 2;
-                    } else {
-                        bSupport = false;
-                    }
-                }
+                if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
+                    bSupport = true;
+                    pParam->mKaiMstepGemv = 1;
+                    pParam->mKaiMstepGemm = 8;
+                    pParam->mKaiNStep = 4;
+                    pParam->mKaiMrGemv = 1;
+                    pParam->mKaiMrGemm = 4;
+                    pParam->mKaiNr = 4;
+                    pParam->mKaiKr = 16;
+                    pParam->mKaiSr = 2;
+                } else {
+                    bSupport = false;
+                }
                 break;
             }
@@ -139,35 +130,31 @@ void KleidiAI::initKernelInfo() {
                 break;
             case AccelType::FP16:
             {
-                if (mStaticInfo.mFP16 && !mStaticInfo.mBF16) {
-                    if (mStaticInfo.mSme2) {
-                        bSupport = true;
-                        pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
-                        pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
-                        pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
-                        pParam->mKaiNr = kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
-                        pParam->mKaiKr = kai_get_kr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
-                        pParam->mKaiSr = kai_get_sr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
-                    } else {
-                        bSupport = false;
-                    }
-                }
+                if (mStaticInfo.mSme2) {
+                    bSupport = true;
+                    pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+                    pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+                    pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+                    pParam->mKaiNr = kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+                    pParam->mKaiKr = kai_get_kr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+                    pParam->mKaiSr = kai_get_sr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+                } else {
+                    bSupport = false;
+                }
                 break;
             }
             case AccelType::FP32:
             {
-                if (!mStaticInfo.mFP16 && !mStaticInfo.mBF16) {
-                    if (mStaticInfo.mSme2) {
-                        bSupport = true;
-                        pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
-                        pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
-                        pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
-                        pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
-                        pParam->mKaiKr = kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
-                        pParam->mKaiSr = kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
-                    } else {
-                        bSupport = false;
-                    }
-                }
+                if (mStaticInfo.mSme2) {
+                    bSupport = true;
+                    pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
+                    pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
+                    pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
+                    pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
+                    pParam->mKaiKr = kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
+                    pParam->mKaiSr = kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
+                } else {
+                    bSupport = false;
+                }
                 break;
             }
@@ -183,19 +170,21 @@ void KleidiAI::initKernelInfo() {
 }
 
 //Get Info
-KleidiAI::AccelType KleidiAI::getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize) {
+KleidiAI::AccelType KleidiAI::getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize, size_t bytes) {
     static std::map<KleidiAI::QIntInfo, KleidiAI::AccelType> infoMap = {
-        {KleidiAI::QIntInfo(4, true, 0), KleidiAI::AccelType::QI4_ASYM_CHNLQT},
-        {KleidiAI::QIntInfo(4, true, -1), KleidiAI::AccelType::QI4_ASYM_BLKQT},
-        {KleidiAI::QIntInfo(4, false, 0), KleidiAI::AccelType::QI4_SYM_CHNLQT},
-        {KleidiAI::QIntInfo(4, false, -1), KleidiAI::AccelType::QI4_SYM_BLKQT},
-        {KleidiAI::QIntInfo(8, true, 0), KleidiAI::AccelType::QI8_ASYM_CHNLQT},
-        {KleidiAI::QIntInfo(8, true, -1), KleidiAI::AccelType::QI8_ASYM_BLKQT},
-        {KleidiAI::QIntInfo(8, false, 0), KleidiAI::AccelType::QI8_SYM_CHNLQT},
-        {KleidiAI::QIntInfo(8, false, -1), KleidiAI::AccelType::QI8_SYM_BLKQT},
+        {KleidiAI::QIntInfo(4, true, 0, 4), KleidiAI::AccelType::QI4_ASYM_CHNLQT_F32},
+        {KleidiAI::QIntInfo(4, true, -1, 4), KleidiAI::AccelType::QI4_ASYM_BLKQT_F32},
+        {KleidiAI::QIntInfo(4, false, 0, 4), KleidiAI::AccelType::QI4_SYM_CHNLQT_F32},
+        {KleidiAI::QIntInfo(4, true, 0, 2), KleidiAI::AccelType::QI4_ASYM_CHNLQT_F16},
+        {KleidiAI::QIntInfo(4, true, -1, 2), KleidiAI::AccelType::QI4_ASYM_BLKQT_F16},
+        {KleidiAI::QIntInfo(4, false, -1, -1), KleidiAI::AccelType::QI4_SYM_BLKQT},
+        {KleidiAI::QIntInfo(8, true, 0, -1), KleidiAI::AccelType::QI8_ASYM_CHNLQT},
+        {KleidiAI::QIntInfo(8, true, -1, -1), KleidiAI::AccelType::QI8_ASYM_BLKQT},
+        {KleidiAI::QIntInfo(8, false, 0, -1), KleidiAI::AccelType::QI8_SYM_CHNLQT},
+        {KleidiAI::QIntInfo(8, false, -1, -1), KleidiAI::AccelType::QI8_SYM_BLKQT},
     };
 
-    QIntInfo info(bits, bAsymmetric, blockSize);
+    QIntInfo info(bits, bAsymmetric, blockSize, bytes);
     auto it = infoMap.find(info);
     if(it != infoMap.end()) {
         return it->second;
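The map key gains a fourth field, so one quantization shape can now resolve to distinct float32 and float16 kernel families. A hedged sketch of the lookup behavior (hypothetical call site; exact qualifiers depend on the class definition):

```cpp
// bytes = 4 selects the float32 kernel family, bytes = 2 the float16 one.
auto f32 = KleidiAI::getQIntAccelType(/*bits=*/4, /*bAsymmetric=*/true, /*blockSize=*/0, /*bytes=*/4);
// -> AccelType::QI4_ASYM_CHNLQT_F32
auto f16 = KleidiAI::getQIntAccelType(/*bits=*/4, /*bAsymmetric=*/true, /*blockSize=*/0, /*bytes=*/2);
// -> AccelType::QI4_ASYM_CHNLQT_F16
// Rows keyed with bytes == (size_t)-1 (the QI8 and QI4_SYM_BLKQT entries)
// only match when the caller passes that same sentinel value.
```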
@@ -223,18 +212,16 @@ size_t KleidiAI::getLhsQuantedPackedSize(AccelType type, size_t m, size_t k, siz
     MNN_ASSERT(type >= AccelType::QINT && type <= AccelType::QINT_END);
 
     switch(type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
             return kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(m, k, getMr(type, m), getKr(type), getSr(type));
-        case AccelType::QI4_ASYM_CHNLQT:
+        case AccelType::QI4_ASYM_CHNLQT_F32:
             bl = k;
-        case AccelType::QI4_ASYM_BLKQT:
-        {
-            if(mStaticInfo.mFP16) {
-                return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
-            } else {
-                return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
-            }
-        }
+        case AccelType::QI4_ASYM_BLKQT_F32:
+            return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
+        case AccelType::QI4_ASYM_CHNLQT_F16:
+            bl = k;
+        case AccelType::QI4_ASYM_BLKQT_F16:
+            return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
         default:
             MNN_ASSERT(0);
     }
@@ -250,18 +237,16 @@ size_t KleidiAI::getLhsQuantedPackedOffset(AccelType type, size_t m, size_t mIdx
 }
 
     switch(type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
             return kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(mIdx, k, getMr(type, m), getKr(type), getSr(type));
-        case AccelType::QI4_ASYM_CHNLQT:
+        case AccelType::QI4_ASYM_CHNLQT_F32:
             bl = k;
-        case AccelType::QI4_ASYM_BLKQT:
-        {
-            if(mStaticInfo.mFP16) {
-                return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f16_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
-            } else {
-                return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f32_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
-            }
-        }
+        case AccelType::QI4_ASYM_BLKQT_F32:
+            return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f32_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
+        case AccelType::QI4_ASYM_CHNLQT_F16:
+            bl = k;
+        case AccelType::QI4_ASYM_BLKQT_F16:
+            return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f16_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
         default:
             MNN_ASSERT(0);
     }
@@ -290,17 +275,18 @@ void KleidiAI::runLhsQuantPack(AccelType type, size_t m, size_t k, size_t bl, si
     MNN_ASSERT(type >= AccelType::QINT && type <= AccelType::QINT_END);
 
     switch(type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
             kai_run_lhs_quant_pack_qai8dxp_f32(m, k, mr, getKr(type), getSr(type), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
             break;
-        case AccelType::QI4_ASYM_CHNLQT:
+        case AccelType::QI4_ASYM_CHNLQT_F32:
             bl = k;
-        case AccelType::QI4_ASYM_BLKQT:
-            if(mStaticInfo.mFP16) {
-                kai_run_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const __fp16 *)lhs, k * sizeof(__fp16), lhsQuantedPacked);
-            } else {
-                kai_run_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
-            }
+        case AccelType::QI4_ASYM_BLKQT_F32:
+            kai_run_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
+            break;
+        case AccelType::QI4_ASYM_CHNLQT_F16:
+            bl = k;
+        case AccelType::QI4_ASYM_BLKQT_F16:
+            kai_run_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const __fp16 *)lhs, k * sizeof(__fp16), lhsQuantedPacked);
             break;
         default:
             MNN_ASSERT(0);
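The case layout above leans on deliberate fallthrough: each per-channel (`*_CHNLQT_*`) label rewrites `bl = k` — one quantization block spanning the whole row — and then falls into its block-wise (`*_BLKQT_*`) sibling so both share a single kernel invocation. A self-contained sketch of the idiom (assumed names, not MNN code):

```cpp
#include <cstddef>

enum class Quant { PerChannel, PerBlock };

// Per-channel quantization is treated as per-block with one block of length k.
size_t effectiveBlockLen(Quant q, size_t k, size_t bl) {
    switch (q) {
        case Quant::PerChannel:
            bl = k;          // whole row is a single block
            [[fallthrough]]; // reuse the per-block path below
        case Quant::PerBlock:
            return bl;
    }
    return 0; // unreachable with the enum above
}
```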
@@ -310,15 +296,17 @@
 //Rhs
 size_t KleidiAI::getRhsPackedSize(AccelType type, size_t n, size_t k, size_t bl) {
     switch(type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
             if(mStaticInfo.mSme2) {
                 return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon(n, k, getNr(type), getKr(type), getSr(type));
             } else {
                 return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(n, k, getNr(type), getKr(type), getSr(type));
             }
-        case AccelType::QI4_ASYM_CHNLQT:
+        case AccelType::QI4_ASYM_CHNLQT_F32:
+        case AccelType::QI4_ASYM_CHNLQT_F16:
             bl = k;
-        case AccelType::QI4_ASYM_BLKQT:
+        case AccelType::QI4_ASYM_BLKQT_F32:
+        case AccelType::QI4_ASYM_BLKQT_F16:
             return kai_get_rhs_packed_size_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(n, k, getNr(type), getKr(type), bl);
         case AccelType::FP16:
             return kai_get_rhs_packed_size_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme(n, k);
@@ -336,15 +324,17 @@ size_t KleidiAI::getRhsPackedOffset(AccelType type, size_t nIdx, size_t k, size_
 }
 
     switch(type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
             if(mStaticInfo.mSme2) {
                 return kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon(nIdx, k, getNr(type), getKr(type), getSr(type));
             } else {
                 return kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(nIdx, k, getNr(type), getKr(type), getSr(type));
             }
-        case AccelType::QI4_ASYM_CHNLQT:
+        case AccelType::QI4_ASYM_CHNLQT_F32:
+        case AccelType::QI4_ASYM_CHNLQT_F16:
             bl = k;
-        case AccelType::QI4_ASYM_BLKQT:
+        case AccelType::QI4_ASYM_BLKQT_F32:
+        case AccelType::QI4_ASYM_BLKQT_F16:
             return kai_get_rhs_packed_offset_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(nIdx, k, getNr(type), getKr(type), bl);
         default:
             MNN_ASSERT(0);
@@ -356,7 +346,7 @@ void KleidiAI::runRhsPack(AccelType type, size_t numGroups, size_t n, size_t k,
                           const void* rhs, const void* scale, const void* zeroPoint, const void* bias,
                           void* rhsPacked) {
     switch(type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
         {
             KleidiAIUtil::rhsPackParamCommon paramCommon;
             if(mStaticInfo.mSme2) {
@@ -370,9 +360,11 @@ void KleidiAI::runRhsPack(AccelType type, size_t numGroups, size_t n, size_t k,
             }
             break;
         }
-        case AccelType::QI4_ASYM_CHNLQT:
+        case AccelType::QI4_ASYM_CHNLQT_F32:
+        case AccelType::QI4_ASYM_CHNLQT_F16:
             bl = k;
-        case AccelType::QI4_ASYM_BLKQT:
+        case AccelType::QI4_ASYM_BLKQT_F32:
+        case AccelType::QI4_ASYM_BLKQT_F16:
             struct kai_rhs_pack_nxk_qai4c32p_params params;
             params.lhs_zero_point = 1;
             params.rhs_zero_point = 8;
@@ -401,7 +393,7 @@ void KleidiAI::runMatmul(AccelType type, size_t m, size_t n, size_t k, size_t bl
     KAI_UNUSED(bl);
 
     switch (type) {
-        case AccelType::QI4_SYM_CHNLQT:
+        case AccelType::QI4_SYM_CHNLQT_F32:
         {
             if(mStaticInfo.mSme2) {
                 if(m == 1) {
@ -427,29 +419,30 @@ void KleidiAI::runMatmul(AccelType type, size_t m, size_t n, size_t k, size_t bl
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case AccelType::QI4_ASYM_CHNLQT:
|
case AccelType::QI4_ASYM_CHNLQT_F32:
|
||||||
bl = k;
|
bl = k;
|
||||||
case AccelType::QI4_ASYM_BLKQT:
|
case AccelType::QI4_ASYM_BLKQT_F32:
|
||||||
if(mStaticInfo.mFP16) {
|
if(m == 1) {
|
||||||
if(m == 1) {
|
kai_run_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
|
||||||
kai_run_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
|
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
||||||
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
||||||
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
|
||||||
} else {
|
|
||||||
kai_run_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
|
|
||||||
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
|
||||||
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if(m == 1) {
|
kai_run_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
|
||||||
kai_run_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
|
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
||||||
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
||||||
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
}
|
||||||
} else {
|
break;
|
||||||
kai_run_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
|
case AccelType::QI4_ASYM_CHNLQT_F16:
|
||||||
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
bl = k;
|
||||||
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
case AccelType::QI4_ASYM_BLKQT_F16:
|
||||||
}
|
if(m == 1) {
|
||||||
|
kai_run_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
|
||||||
|
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
||||||
|
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
||||||
|
} else {
|
||||||
|
kai_run_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
|
||||||
|
(const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
|
||||||
|
dstStrideRow, dstStrideCol, scalarMin, scalarMax);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case AccelType::FP16:
|
case AccelType::FP16:
|
||||||
|
|
|
||||||
|
|
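A note on the dispatch above: the deliberate case fallthrough (`bl = k;` before the block-quantized case) turns channel-wise quantization into block-wise quantization with a single block spanning the whole reduction dimension, and the `m == 1` test picks a GEMV-style dotprod ukernel over the i8mm tile kernel for single-row inputs. Below is a minimal, compilable sketch of that selection shape; the stub kernels are illustrative assumptions standing in for the real KleidiAI ukernels, not MNN or KleidiAI API.

    #include <cstdio>
    #include <cstddef>

    // Stand-ins for the real ukernels; only the selection logic matters here.
    static void gemv_dotprod(size_t m, size_t n, size_t k, size_t bl) {
        std::printf("dotprod GEMV: m=%zu n=%zu k=%zu bl=%zu\n", m, n, k, bl);
    }
    static void gemm_i8mm(size_t m, size_t n, size_t k, size_t bl) {
        std::printf("i8mm GEMM:    m=%zu n=%zu k=%zu bl=%zu\n", m, n, k, bl);
    }

    static void runQi4(size_t m, size_t n, size_t k, size_t bl, bool perChannel) {
        if (perChannel) {
            bl = k; // channel-wise == one quantization block covering all of k
        }
        if (m == 1) {
            gemv_dotprod(m, n, k, bl); // single LHS row: cheaper dot-product kernel
        } else {
            gemm_i8mm(m, n, k, bl);    // batched rows: matrix-tile kernel
        }
    }

    int main() {
        runQi4(1, 256, 512, 0, true);   // -> dotprod with bl == 512
        runQi4(8, 256, 512, 32, false); // -> i8mm with bl == 32
        return 0;
    }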
@@ -32,9 +32,11 @@ namespace MNN {
         CHNLQT/BLKQT: channel wise/block wise;
     */
     QINT = 0,
-    QI4_ASYM_CHNLQT = QINT,
-    QI4_ASYM_BLKQT,
-    QI4_SYM_CHNLQT,
+    QI4_ASYM_CHNLQT_F32 = QINT,
+    QI4_ASYM_CHNLQT_F16,
+    QI4_ASYM_BLKQT_F32,
+    QI4_ASYM_BLKQT_F16,
+    QI4_SYM_CHNLQT_F32,
     QI4_SYM_BLKQT,
     QI8_ASYM_CHNLQT,
     QI8_ASYM_BLKQT,
@@ -72,9 +74,6 @@ namespace MNN {
     } KernelInfo;

     typedef struct StaticInfo {
-        bool mFP16 = false; //fp16 or fp32.
-        bool mBF16 = false; //bf16 or fp32.
-
         bool mDot = false;
         bool mI8mm = false;
         bool mSme2 = false;
@@ -87,11 +86,13 @@ namespace MNN {
         size_t mBits;
         bool mAsymmetric; //Asymmetric quantized model.
         size_t mBlockSize; //0: Per channel quant; others: Per block quant.
+        size_t mBytes; //4: float32; 2: float16.

-        QIntInfo(size_t bits = 4, bool asymmetric = false, size_t blockSize = 0) {
+        QIntInfo(size_t bits = 4, bool asymmetric = false, size_t blockSize = 0, size_t bytes = 0) {
             mBits = bits;
             mAsymmetric = asymmetric;
             mBlockSize = blockSize;
+            mBytes = bytes;
         }

         bool operator<(const QIntInfo& rhs) const {
@@ -103,6 +104,10 @@ namespace MNN {
                 return mAsymmetric < rhs.mAsymmetric;
             }

+            if(mBytes != rhs.mBytes) {
+                return mBytes < rhs.mBytes;
+            }
+
             bool lhsPerChannel = mBlockSize == 0 ? true : false;
             bool rhsPerChannel = rhs.mBlockSize == 0 ? true : false;
             return lhsPerChannel < rhsPerChannel;
@@ -115,7 +120,7 @@ namespace MNN {
     static bool mKaiInitialized;

     //Get instance.
-    static KleidiAI &getInstance(const MNNCPUInfo& gCPUInfo, bool bFP16, bool bBF16);
+    static KleidiAI &getInstance(const MNNCPUInfo& gCPUInfo);
     static KleidiAI &getInstance();
     static void initKernelInfo();
@@ -126,13 +131,12 @@ namespace MNN {
     //Check and set
     bool canAccelerate();
     bool canAccelerate(AccelType type);
+    bool canAccelerate(AccelType type, const Convolution2DCommon *common);
     bool isLoaded(AccelType type);
     void setLoaded(AccelType type) { mLoaded[(size_t)type] = true; }
-    bool isLinear() { return mLinear; }
-    void setLinear(bool bLinear) { mLinear = bLinear; }

     //Get info
-    static AccelType getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize);
+    static AccelType getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize, size_t bytes);
     size_t getMr(AccelType type, size_t m = 1);
     size_t getNr(AccelType type);
     size_t getKr(AccelType type);
@@ -142,9 +146,6 @@ namespace MNN {
     size_t getVecNumPerThread(size_t totalVec, size_t totalThread, size_t minStep);
     //Get Static info
     bool bSupportSme2() { return mStaticInfo.mSme2; }
-    bool isFP16() { return mStaticInfo.mFP16; }
-    bool isBF16() { return mStaticInfo.mBF16; }
-    bool isHalf() { return mStaticInfo.mFP16 || mStaticInfo.mBF16; }

     //Lhs
     size_t getLhsPackedSize(AccelType type, size_t m, size_t k);
@@ -198,6 +199,27 @@ namespace MNN {
         return mStaticInfo.mKernelInfo[(size_t)type].mKernelSupport;
     }

+    inline bool KleidiAI::canAccelerate(AccelType type, const Convolution2DCommon* common) {
+        if(type >= AccelType::ACC_TYPE_ERROR) {
+            return false;
+        }
+        if(common->group() != 1) {
+            return false;
+        }
+        if(type == AccelType::QI4_ASYM_CHNLQT_F32 || type == AccelType::QI4_ASYM_CHNLQT_F16 || type == AccelType::QI8_ASYM_CHNLQT) {
+            if(common->inputCount() % 32 != 0) {
+                return false;
+            }
+        }
+        if(common->kernelX() == 1 && common->kernelY() == 1
+           && common->padX() == 0 && common->padY() == 0
+           && common->strideX() == 1 && common->strideY() == 1
+           && common->dilateX() == 1 && common->dilateY() == 1) {
+            return mStaticInfo.mKernelInfo[(size_t)type].mKernelSupport;
+        }
+        return false;
+    }
+
     inline bool KleidiAI::isLoaded(AccelType type) {
         MNN_ASSERT(type < AccelType::ACC_TYPE_NUMBER);
         return mLoaded[(size_t)type];
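The QIntInfo change above only pays off because the struct serves as an ordered lookup key: adding mBytes to operator< keeps an F32 and an F16 configuration from comparing equivalent. A compilable sketch of that idea follows; the std::map and the reduced field set are illustrative assumptions, not the full MNN type.

    #include <cstdio>
    #include <cstddef>
    #include <map>

    struct QIntInfo {
        size_t mBits, mBlockSize, mBytes;
        bool mAsymmetric;
        // Strict weak ordering mirroring the comparison in the header above.
        bool operator<(const QIntInfo& rhs) const {
            if (mBits != rhs.mBits) return mBits < rhs.mBits;
            if (mAsymmetric != rhs.mAsymmetric) return mAsymmetric < rhs.mAsymmetric;
            if (mBytes != rhs.mBytes) return mBytes < rhs.mBytes;
            bool lhsPerChannel = mBlockSize == 0;
            bool rhsPerChannel = rhs.mBlockSize == 0;
            return lhsPerChannel < rhsPerChannel;
        }
    };

    int main() {
        std::map<QIntInfo, const char*> accel;
        accel[{4, 0, 4, true}] = "QI4_ASYM_CHNLQT_F32";
        // Distinct key only because mBytes participates in operator<:
        accel[{4, 0, 2, true}] = "QI4_ASYM_CHNLQT_F16";
        std::printf("%zu entries\n", accel.size()); // prints: 2 entries
        return 0;
    }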
@@ -41,97 +41,6 @@ inline static size_t kai_rhs_packed_stride(size_t k, size_t nr, size_t kr, size_
     return nr * (num_bytes_per_block * num_blocks_per_row + kai_num_bytes_bias);
 }

-void KleidiAIUtil::transferNCHWToNC4HW4(float* src, float* dst, size_t rowNum, size_t rowSize) {
-    size_t blockNum = rowSize / 4;
-    size_t blockSize = 4 * sizeof(float);
-
-    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
-        const float *rowSrc = src + blockIndex * 4;
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(dst, rowSrc, blockSize);
-            dst += 4;
-            rowSrc += rowSize;
-        }
-    }
-
-    size_t remain = rowSize - blockNum * 4;
-    if(remain){
-        const float *rowSrc = src + blockNum * 4;
-        size_t remainSize = remain * sizeof(float);
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(dst, rowSrc, remainSize);
-            dst += 4;
-            rowSrc += rowSize;
-        }
-    }
-}
-
-void KleidiAIUtil::transferNCHWToNC4HW4(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize) {
-    size_t blockNum = rowSize / 8;
-    size_t blockSize = 8 * sizeof(__fp16);
-
-    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
-        const __fp16 *rowSrc = src + blockIndex * 8;
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(dst, rowSrc, blockSize);
-            dst += 8;
-            rowSrc += rowSize;
-        }
-    }
-
-    size_t remain = rowSize - blockNum * 8;
-    if(remain){
-        const __fp16 *rowSrc = src + blockNum * 8;
-        size_t remainSize = remain * sizeof(__fp16);
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(dst, rowSrc, remainSize);
-            dst += 8;
-            rowSrc += rowSize;
-        }
-    }
-}
-
-void KleidiAIUtil::transferNC4HW4ToNCHW(float* src, float* dst, size_t rowNum, size_t rowSize) {
-    size_t blockNum = (rowSize+3) / 4;
-    size_t blockSize = 4 * sizeof(float);
-
-    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
-        const float *rowSrc = src + blockIndex * 4 * rowNum;
-        float *block_dst = dst + blockIndex * 4;
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(block_dst, rowSrc, blockSize);
-            block_dst += rowSize;
-            rowSrc += 4;
-        }
-    }
-}
-
-void KleidiAIUtil::transferNC4HW4ToNCHW(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize) {
-    size_t blockNum = (rowSize+7) / 8;
-    size_t blockSize = 8 * sizeof(__fp16);
-
-    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
-        const __fp16 *rowSrc = src + blockIndex * 8 * rowNum;
-        __fp16 *block_dst = dst + blockIndex * 8;
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(block_dst, rowSrc, blockSize);
-            block_dst += rowSize;
-            rowSrc += 8;
-        }
-    }
-
-    size_t remain = rowSize - blockNum * 8;
-    if(remain){
-        const __fp16 *rowSrc = src + blockNum * 8;
-        size_t remainSize = remain * sizeof(__fp16);
-        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
-            memcpy(dst, rowSrc, remainSize);
-            dst += 8;
-            rowSrc += rowSize;
-        }
-    }
-}
-
 // Rhs pack functions for matmul_clamp_f32_qai8dxp_qsi4cxp.
 void KleidiAIUtil::packQsi4cxps16s0Qs4cxs0s1(
     size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, const uint8_t* rhs, const float* bias,
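For context on the helpers deleted above: they shuttled data between MNN's packed NC4HW4 layout (channels grouped in packs of 4, each pack stored contiguously across the rows) and plain row-major, a job the new KleidiAI executors appear to cover by converting through NHWC with CPUTensorConverter instead. A small, hedged sketch of the offset arithmetic such a packed layout implies; this is an illustration of the layout, not the deleted MNN code:

    #include <cstdio>
    #include <cstddef>

    // Offset of element (row, channel) in an NC4HW4-style buffer with rowNum rows.
    static size_t nc4hw4Offset(size_t row, size_t c, size_t rowNum) {
        size_t block = c / 4; // which pack of 4 channels
        size_t lane  = c % 4; // position inside the pack
        return block * rowNum * 4 + row * 4 + lane;
    }

    int main() {
        // A 2-row, 8-channel tensor: channel 5 of row 1 sits in block 1, lane 1.
        std::printf("offset = %zu\n", nc4hw4Offset(1, 5, 2)); // 1*2*4 + 1*4 + 1 = 13
        return 0;
    }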
@@ -48,11 +48,6 @@ namespace MNN {
         uint8_t mRhsZeroPoint = 8;
     };

-    static void transferNCHWToNC4HW4(float* src, float* dst, size_t rowNum, size_t rowSize);
-    static void transferNCHWToNC4HW4(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize);
-    static void transferNC4HW4ToNCHW(float* src, float* dst, size_t rowNum, size_t rowSize);
-    static void transferNC4HW4ToNCHW(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize);
-
     /// Rhs pack functions for matmul_clamp_f32_qai8dxp_qsi4cxp.
     static void packQsi4cxps16s0Qs4cxs0s1(
         size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr,
@@ -408,111 +408,6 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O

     // dynamic quant
     bool directReadInt4weight = (kernelCount == 1 && ROUND_UP(oc, UNIT) == oc && ROUND_UP(ic, SRC_UNIT) == ic);
-#ifdef MNN_KLEIDIAI_ENABLED
-    if(quanCommon->canUseInt4) {
-        bool bFP16 = gcore->bytes == 2 ? true : false;
-        bool bAsym = quanCommon->asymmetric;
-        size_t blkSize = mResourceInt8->mBlockNum == 1 ? 0 : ic / mResourceInt8->mBlockNum;
-        KleidiAI::AccelType accelType = KleidiAI::getQIntAccelType(4, bAsym, blkSize);
-
-        if (!KleidiAI::mKaiInitialized) {
-            KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo(), bFP16, false);
-        }
-
-        KleidiAI& kai = KleidiAI::getInstance();
-        if(!kai.isLoaded(accelType)) {
-            kai.setLoaded(accelType);
-            kai.printInfo(accelType);
-        }
-
-        if(kai.canAccelerate(accelType)) {
-            AutoStorage<int8_t> reorderedQuantInfo;
-            reorderedQuantInfo.reset(2 * scaleSize * QUANT_INFO_BYTES + oc * QUANT_INFO_BYTES);
-            if (reorderedQuantInfo.get() == nullptr) {
-                MNN_ERROR("Memory not enough\n");
-                return;
-            }
-
-            //Prepare scale and zero data.
-            {
-                int outputCount = convOp->common()->outputCount();
-                int originOffset = -8;
-                auto quanInfoPtr = quanCommon->alpha.get();
-                auto scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
-                auto zeroPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(scalePtr) + scaleSize * QUANT_INFO_BYTES);
-                auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(zeroPtr) + scaleSize * QUANT_INFO_BYTES);
-                if (quanCommon->asymmetric) {
-                    for (int i = 0; i < blockNum; ++i) {
-                        auto dstScale = scalePtr + i * ocUp4;
-                        auto dstZero = zeroPtr + i * ocUp4;
-                        for (int j = 0; j < outputCount; ++j) {
-                            int scaleIndex = j * blockNum + i;
-                            dstScale[j] = quanInfoPtr[2 * scaleIndex + 1];
-                            dstZero[j] = quanInfoPtr[2 * scaleIndex] + (float)originOffset * dstScale[j];
-                        }
-                    }
-                } else {
-                    for (int i = 0; i < blockNum; ++i) {
-                        auto dstScale = scalePtr + i * ocUp4;
-                        auto dstZero = zeroPtr + i * ocUp4;
-                        for (int j = 0; j < outputCount; ++j) {
-                            int scaleIndex = j * blockNum + i;
-                            dstScale[j] = quanInfoPtr[scaleIndex];
-                            dstZero[j] = (float)originOffset * dstScale[j];
-                        }
-                    }
-                }
-                ::memcpy(biasPtr, convOp->bias()->data(), oc * QUANT_INFO_BYTES);
-            }
-
-            mAccelType = accelType;
-            int n = oc;
-            int k = ic;
-            int packedWeightSize = kai.getRhsPackedSize(mAccelType, n, k, blkSize);
-
-            //Alloc packed weight tensor.
-            mResourceInt8->mWeightInt8.reset(Tensor::createDevice<uint8_t>({packedWeightSize}));
-            bool success = backend->onAcquireBuffer(mResourceInt8->mWeightInt8.get(), Backend::STATIC);
-
-            if (!success) {
-                MNN_ERROR("Out of static memory!\n");
-                return;
-            }
-
-            size_t paraNum = scaleSize;
-            float *scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
-            float *zeroPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + paraNum;
-            float *biasPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + 2 * paraNum;
-            //Reload some parameters to fit ukernels' layout.
-            auto quanInfoPtr = quanCommon->alpha.get();
-            auto alphaSize = quanCommon->alpha.size();
-            if(bAsym) {
-                for(int i = 0; i < paraNum; i++) {
-                    if(i*2 >= alphaSize){
-                        zeroPtr[i] = 0;
-                        scalePtr[i] = 0;
-                    }
-                    else{
-                        zeroPtr[i] = quanInfoPtr[i * 2];
-                        scalePtr[i] = quanInfoPtr[i * 2 + 1];
-                    }
-                }
-            } else {
-                if(blkSize != 0) {
-                    memcpy(scalePtr, (uint8_t*)quanInfoPtr, paraNum * sizeof(float));
-                }
-            }
-
-            //Run rhs pack.
-            auto weightPackedData = mResourceInt8->mWeightInt8->host<uint8_t>();
-            kai.runRhsPack(mAccelType, 1, n, k, blkSize, 0/*unused*/,
-                           (uint8_t*)quanCommon->weight.get(),
-                           (const void*)scalePtr, (const void*)zeroPtr, (const void*)biasPtr,
-                           weightPackedData);
-            return;
-        }
-    }
-#endif
     auto target = mResourceInt8;
     // Save bias
     if (convOp->bias()) {
@@ -609,9 +504,6 @@ bool DenseConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution**
     if (!exe->valid()) {
         return false;
     }
-#ifdef MNN_KLEIDIAI_ENABLED
-    exe->mAccelType = this->mAccelType;
-#endif
     *dst = exe;
     return true;
 }
@@ -655,38 +547,6 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
     float weightBytes = mResourceInt8->mActBits == 4 ? 0.5 : 1;
     mBlockNum = mResourceInt8->mBlockNum;

-#ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI& kai = KleidiAI::getInstance();
-    if(mResourceInt8->mDynamicQuant && mResourceInt8->mActBits == 4 && kai.canAccelerate(mAccelType)) {
-        MNN_ASSERT(kai.isLoaded(mAccelType));
-        const size_t m = inputs[0]->batch(); //lhs vector number.
-        const size_t n = outputs[0]->channel(); //rhs vector number.
-        const size_t k = inputs[0]->channel(); //vector size.
-        const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;
-
-        int packedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
-        int elementSize = kai.isHalf() ? sizeof(__fp16) : sizeof(float);
-        if(m > 1 && !kai.isLinear()) {
-            int srcSize = m * k * elementSize;
-            int dstSize = m * n * elementSize;
-            int extraSize = srcSize > dstSize ? srcSize : dstSize;
-            packedSize += extraSize;
-        }
-
-        //Split mTempIm2ColBuffer as two parts for linear/tile transfer:
-        //Part0: Lhs_packed.
-        //Part1: Lhs/Dst before transfer.
-        mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({packedSize}));
-        bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
-        if (!success) {
-            MNN_ERROR("Out of dynamic memory!\n");
-            return OUT_OF_MEMORY;
-        }
-
-        backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
-        return NO_ERROR;
-    }
-#endif
     CPUConvolution::onResize(inputs, outputs);
     if (mResourceInt8->mDynamicQuant == false) {
         mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
@@ -943,99 +803,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
     auto gcore = static_cast<CPUBackend*>(backend())->functions();
     auto dynamicOption = static_cast<CPUBackend*>(backend())->getRuntime()->hint().dynamicQuantOption;

-#ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI& kai = KleidiAI::getInstance();
-    if(mResourceInt8->mDynamicQuant && mResourceInt8->mActBits == 4 && kai.canAccelerate(mAccelType)) {
-        MNN_ASSERT(kai.isLoaded(mAccelType));
-        const size_t m = input->batch(); //lhs vector number.
-        const size_t n = output->channel(); //rhs vector number.
-        const size_t k = input->channel(); //vector size.
-        const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;
-
-        bool bHalf = kai.isHalf();
-        size_t elementSize = bHalf ? sizeof(__fp16) : sizeof(float);
-        size_t lhsPackedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
-
-        auto lhs = input->host<uint8_t>();
-        auto lhsPacked = mTempIm2ColBuffer->host<int8_t>();
-        auto rhsPacked = mResourceInt8->mWeightInt8->host<uint8_t>();
-        auto dst = output->host<uint8_t>();
-
-        uint8_t *linearLhs, *linearDst;
-        if(m > 1 && !kai.isLinear()) {
-            linearLhs = (uint8_t *)lhsPacked + lhsPackedSize;
-            linearDst = linearLhs;
-        } else {
-            linearLhs = lhs;
-            linearDst = dst;
-        }
-
-        int threadNum = static_cast<CPUBackend*>(backend())->threadNumber();
-        int threadNeed, vecPerThread;
-
-        //Dynamic quant pack lhs.
-        if(m == 1) {
-            kai.runLhsQuantPack(mAccelType, 1, k, blkSize, 1, linearLhs, lhsPacked);
-        } else {
-            if(!kai.isLinear()) {
-                if(bHalf) {
-                    KleidiAIUtil::transferNC4HW4ToNCHW((__fp16 *)lhs, (__fp16 *)linearLhs, m, k);
-                } else {
-                    KleidiAIUtil::transferNC4HW4ToNCHW((float *)lhs, (float *)linearLhs, m, k);
-                }
-            }
-
-            vecPerThread = kai.getVecNumPerThread(m, threadNum, kai.getMr(mAccelType, m));
-            threadNeed = m % vecPerThread == 0 ? m / vecPerThread : (m / vecPerThread + 1);
-            size_t srcStride = vecPerThread * k * elementSize;
-
-            auto BatchDynamicQuant = [=, &kai](int tId) {
-                auto threadSrc = linearLhs + tId * srcStride;
-                auto threadDst = lhsPacked + kai.getLhsQuantedPackedOffset(mAccelType, m, tId * vecPerThread, k, blkSize);
-                int vecNum = (tId == threadNeed - 1) ? (m - vecPerThread * tId) : vecPerThread; //Last threadN may less than vecPerThread.
-                kai.runLhsQuantPack(mAccelType, vecNum, k, blkSize, kai.getMr(mAccelType, m), threadSrc, threadDst);
-            };
-
-            MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
-                BatchDynamicQuant((int)tId);
-            }
-            MNN_CONCURRENCY_END();
-        }
-
-        //Run matmul.
-        if(kai.bSupportSme2() && mAccelType == KleidiAI::AccelType::QI4_SYM_CHNLQT) {
-            //SME prefer running on single thread to obtain better performance/power consumption ratio.
-            threadNum = 1;
-        }
-
-        vecPerThread = kai.getVecNumPerThread(n, threadNum, kai.getNStep(mAccelType));
-        threadNeed = n % vecPerThread == 0 ? n / vecPerThread : (n / vecPerThread + 1);
-
-        auto ThreadFunction = [=, &kai](int tId) {
-            auto threadRhsPacked = rhsPacked + kai.getRhsPackedOffset(mAccelType, tId * vecPerThread, k, blkSize);
-            auto threadDst = linearDst + kai.getDstOffset(0, tId * vecPerThread, n, elementSize);
-            int vecNum = (tId == threadNeed - 1) ? (n - vecPerThread * tId) : vecPerThread; //Last threadN may less than vecPerThread.
-            float scalarMax = bHalf ? FLT16_MAX : FLT_MAX;
-            kai.runMatmul(mAccelType, m, vecNum, k, blkSize, lhsPacked, threadRhsPacked, threadDst, n * elementSize, elementSize, scalarMax, -scalarMax);
-        };
-
-        MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
-            ThreadFunction((int)tId);
-        }
-        MNN_CONCURRENCY_END();
-
-        if(m > 1 && !kai.isLinear()) {
-            if(bHalf) {
-                KleidiAIUtil::transferNCHWToNC4HW4((__fp16 *)linearDst, (__fp16 *)dst, m, n);
-            } else {
-                KleidiAIUtil::transferNCHWToNC4HW4((float *)linearDst, (float *)dst, m, n);
-            }
-        }
-
-        return NO_ERROR;
-    }
-#endif

     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
     auto blitProc = core->MNNPackC4Int8ForMatMul_A;
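One detail worth spelling out from the deleted quant-info reordering: int4 weights are stored as unsigned nibbles in [0, 15], so the originOffset of -8 has to be folded into the per-channel zero point (dstZero = zero + originOffset * scale). A worked numeric check of that identity; the concrete values are made up for illustration:

    #include <cstdio>

    int main() {
        float scale = 0.05f, storedZero = 0.2f;
        int originOffset = -8;
        // dstZero = storedZero + originOffset * scale, as in the deleted block.
        float dstZero = storedZero + (float)originOffset * scale;   // 0.2 - 0.4 = -0.2
        unsigned nibble = 11;                                       // raw unsigned 4-bit weight
        // Dequantize using the folded zero point...
        float w = (float)nibble * scale + dstZero;                  // 0.55 - 0.2 = 0.35
        // ...which must equal dequantizing the signed value (nibble - 8) directly.
        float check = (float)((int)nibble + originOffset) * scale + storedZero; // 0.15 + 0.2
        std::printf("w = %.2f, check = %.2f\n", w, check);          // both 0.35
        return 0;
    }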
@@ -84,9 +84,6 @@ private:
     bool mIm2ColBasedInt8;
     int mSizeInputBlockQuant;
     bool mToFuseInputbias2Bias;
-#ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
-#endif
 };

 } // namespace MNN
@@ -52,88 +52,10 @@ Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common
             return;
         }
         core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount);
-#ifdef MNN_KLEIDIAI_ENABLED
-        if (core->bytes == 2) {
-            if (!KleidiAI::mKaiInitialized) {
-                KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo(), true, false);
-            }
-            KleidiAI::AccelType accelType = KleidiAI::AccelType::FP16;
-            KleidiAI& kai = KleidiAI::getInstance();
-            if (!kai.isLoaded(accelType)) {
-                kai.setLoaded(accelType);
-                kai.printInfo(accelType);
-            }
-
-            if (kai.canAccelerate(accelType)) {
-                mAccelType = accelType;
-                AutoRelease<Tensor> tempBiasTensor(Tensor::createDevice<float>({outputCount}));
-                mValid = b->onAcquireBuffer(tempBiasTensor.get(), Backend::STATIC);
-                if (!mValid) {
-                    b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
-                    MNN_ERROR("Not Enough Memory\n");
-                    return;
-                }
-                core->MNNFp32ToLowp(bias, tempBiasTensor->host<int16_t>(), outputCount);
-
-                int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
-                //Alloc packed weight tensor.
-                mResource->mWeight.reset(Tensor::createDevice<float>({packedSize}));
-                bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
-                if (!success) {
-                    b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
-                    b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
-                    MNN_ERROR("Out of static memory!\n");
-                    return;
-                }
-
-                //Run rhs pack.
-                kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(__fp16),
-                               tempTensor->host<void>(), nullptr, nullptr, tempBiasTensor->host<void>(),
-                               mResource->mWeight->host<void>());
-                b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
-            } else {
-                core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
-            }
-        } else {
-            core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
-        }
-#else
         core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
-#endif
         b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
     } else {
-#ifdef MNN_KLEIDIAI_ENABLED
-        if (!KleidiAI::mKaiInitialized) {
-            KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo(), false, false);
-        }
-
-        KleidiAI::AccelType accelType = KleidiAI::AccelType::FP32;
-        KleidiAI& kai = KleidiAI::getInstance();
-        if(!kai.isLoaded(accelType)) {
-            kai.setLoaded(accelType);
-            kai.printInfo(accelType);
-        }
-
-        if (kai.canAccelerate(accelType)) {
-            mAccelType = accelType;
-            int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
-            //Alloc packed weight tensor.
-            mResource->mWeight.reset(Tensor::createDevice<float>({packedSize}));
-            bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
-            if (!success) {
-                MNN_ERROR("Out of static memory!\n");
-                return;
-            }
-
-            //Run rhs pack.
-            kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(float),
-                           originWeight, nullptr, nullptr, bias, mResource->mWeight->host<void>());
-        } else {
-            core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);
-        }
-#else
         core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);
-#endif
     }
 }
 Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) {
@@ -152,9 +74,6 @@ bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst)
         return true;
     }
     auto exe = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn);
-#ifdef MNN_KLEIDIAI_ENABLED
-    exe->mAccelType = this->mAccelType;
-#endif
     *dst = exe;
     return true;
 }
@@ -183,26 +102,6 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
     int maxDepth = 5;
     auto icAlign = UP_DIV(ic, lPack) * lPack;
     auto weightTensor = mResource->mWeight.get();
-
-#ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI& kai = KleidiAI::getInstance();
-    if (kai.canAccelerate(mAccelType)) {
-        if (batch != 1) {
-            int packedSize = kai.getLhsPackedSize(mAccelType, batch, ic);
-
-            mInputResource.reset(Tensor::createDevice<float>({packedSize}));
-            bool success = backend()->onAcquireBuffer(mInputResource.get(), Backend::DYNAMIC);
-            if (!success) {
-                MNN_ERROR("Out of dynamic memory!\n");
-                return OUT_OF_MEMORY;
-            }
-
-            backend()->onReleaseBuffer(mInputResource.get(), Backend::DYNAMIC);
-        }
-        return NO_ERROR;
-    }
-#endif
-
     mWeightBytes = bytes;
     if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
         std::vector<int> divides(numberThread+1);
@@ -298,24 +197,6 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs,
     auto weightPtr = mResource->mWeight->host<uint8_t>();
     auto biasPtr = mResource->mBias->host<uint8_t>();

-#ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI& kai = KleidiAI::getInstance();
-    if (kai.canAccelerate(mAccelType)) {
-        const size_t m = input->batch(); //lhs vector number.
-        const size_t n = output->channel(); //rhs vector number.
-        const size_t k = input->channel(); //vector size.
-        auto lhsPacked = inputPtr;
-        auto dst = output->host<uint8_t>();
-        size_t elementSize = kai.isFP16() ? sizeof(__fp16) : sizeof(float);
-        if(m != 1) {
-            lhsPacked = mInputResource->host<uint8_t>();
-            kai.runLhsPack(mAccelType, m, k, 0, inputPtr, k * elementSize, lhsPacked);
-        }
-        auto postPtr = getPostParameters();
-        kai.runMatmul(mAccelType, m, n, k, 0, lhsPacked, weightPtr, dst, n * elementSize, elementSize, postPtr[3], postPtr[2]);
-        return NO_ERROR;
-    }
-#endif
     MNN_CONCURRENCY_BEGIN(tId, size) {
         auto &unit = mUnits[tId];
         if (unit.mValid) {
@@ -26,9 +26,6 @@ public:
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
 private:
     std::shared_ptr<CPUConvolution::Resource> mResource;
-#ifdef MNN_KLEIDIAI_ENABLED
-    std::shared_ptr<Tensor> mInputResource;
-#endif

     struct Unit {
         bool mValid = true;
@@ -38,9 +35,6 @@ private:

     std::vector<Unit> mUnits;
     int mWeightBytes = 4;
-#ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
-#endif
 };
 #endif
 } // namespace MNN
@@ -8,6 +8,7 @@
 #include "backend/cpu/compute/ConvolutionFloatFactory.h"
 #include "backend/cpu/CPUConvolutionDepthwise.hpp"
+#include "backend/cpu/CPURuntime.hpp"
 #include "backend/cpu/compute/ConvOpt.h"
 #include "backend/cpu/compute/Convolution1x1Strassen.hpp"
 #include "backend/cpu/compute/ConvolutionGroup.hpp"
@@ -22,6 +23,11 @@
 #include "core/OpCommonUtils.hpp"
 #include "backend/cpu/OneDNNConvolution.hpp"
 #include "backend/cpu/compute/ConvInt8TiledExecutor.hpp"
+#ifdef MNN_KLEIDIAI_ENABLED
+#include "backend/cpu/compute/KleidiAIConvInt8.hpp"
+#include "backend/cpu/compute/KleidiAIConvolution.hpp"
+#include "backend/cpu/compute/KleidiAIDenseConvolution.hpp"
+#endif //MNN_KLEIDIAI_ENABLED

 namespace MNN {

@@ -48,6 +54,41 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
 #ifdef MNN_LOW_MEMORY
     if (lowMemory && nullptr != weightQuantInfo.get() && originWeightSize == 0) {
         if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
+#ifdef MNN_KLEIDIAI_ENABLED
+            do {
+                if (!weightQuantInfo->canUseInt4) {
+                    break;
+                }
+                auto convOp = op->main_as_Convolution2D();
+                auto core = static_cast<CPUBackend*>(backend)->functions();
+                int oc = convOp->common()->outputCount();
+                int ic = convOp->common()->inputCount();
+
+                int blockNum = 1;
+                int dequantCnt = weightQuantInfo->alphaSize;
+                if (weightQuantInfo->asymmetric) {
+                    dequantCnt /= 2;
+                }
+                blockNum = dequantCnt / oc;
+
+                bool bAsym = weightQuantInfo->asymmetric;
+                size_t blkSize = blockNum == 1 ? 0 : ic / blockNum;
+
+                KleidiAI::AccelType accelType = KleidiAI::getQIntAccelType(4, bAsym, blkSize, core->bytes);
+
+                KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
+                if(!kai.isLoaded(accelType)) {
+                    kai.setLoaded(accelType);
+                    kai.printInfo(accelType);
+                }
+
+                if(!kai.canAccelerate(accelType, convOp->common())){
+                    break;
+                }
+                return new KleidiAIConvInt8(backend, op, weightQuantInfo, true, kai, accelType, blockNum);
+            } while (0);
+#endif
+
             return new DenseConvInt8TiledExecutor(backend, op, weightQuantInfo, true);
         } else {
             return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
@@ -55,14 +96,37 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
     }
 #else
     if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
+#ifdef MNN_KLEIDIAI_ENABLED
+        if (MNNGetCPUInfo()->sme2 && !weightQuantInfo && cpuBackend->functions()->bytes == 4) {
+            return new KleidiAIDenseConvolution(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
+        }
+#else
         return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
+#endif
     }
 #endif

 #ifndef MNN_REDUCE_SIZE
     if (fastWay && cpuBackend->functions()->matmulBytes == 0) {
+#ifdef MNN_KLEIDIAI_ENABLED
+        auto bytes = cpuBackend->functions()->bytes;
+        auto accelType = (bytes==2) ? KleidiAI::AccelType::FP16 : KleidiAI::AccelType::FP32;
+        KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
+        if (kai.canAccelerate(accelType)){
+            return new KleidiAIConvolution(common, backend, originWeight, originWeightSize, bias, biasSize);
+        }
+#endif //MNN_KLEIDIAI_ENABLED

         return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize);
     }
 #endif

+#ifdef MNN_KLEIDIAI_ENABLED
+    if (MNNGetCPUInfo()->sme2 && !weightQuantInfo && cpuBackend->functions()->bytes == 4) {
+        return new KleidiAIDenseConvolution(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
+    }
+#endif
+
     if (cpuBackend->getRuntime()->hint().winogradMemoryUsed == 0 || (!ConvolutionWinogradBridge::canUseWinograd(common))) {
         return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
     }
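The factory additions above all funnel through the same gate: KleidiAI paths are only taken for group-1, pointwise convolutions (1x1 kernel, no padding, unit stride and dilation), matching the inline canAccelerate(type, common) shown earlier. A standalone sketch of that predicate; the Conv2D struct here is a simplified stand-in for Convolution2DCommon, not MNN's type:

    #include <cstdio>

    struct Conv2D {
        int group, kernelX, kernelY, padX, padY,
            strideX, strideY, dilateX, dilateY, inputCount;
    };

    // Only a pointwise (1x1) convolution can be lowered to a plain matmul,
    // which is why every other shape falls back to the tiled executors.
    static bool isPointwise(const Conv2D& c) {
        return c.group == 1
            && c.kernelX == 1 && c.kernelY == 1
            && c.padX == 0 && c.padY == 0
            && c.strideX == 1 && c.strideY == 1
            && c.dilateX == 1 && c.dilateY == 1;
    }

    int main() {
        Conv2D conv{1, 1, 1, 0, 0, 1, 1, 1, 1, 64};
        std::printf(isPointwise(conv) ? "KleidiAI path\n" : "fallback path\n");
        return 0;
    }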
@@ -0,0 +1,306 @@
+//
+//  SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+//  SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef MNN_KLEIDIAI_ENABLED
+#include "KleidiAIConvInt8.hpp"
+#include "core/Macro.h"
+#include "core/BufferAllocator.hpp"
+
+#include <math.h>
+#include "backend/cpu/CPUBackend.hpp"
+#include "core/Concurrency.h"
+#include "core/TensorUtils.hpp"
+#include "backend/cpu/CPUTensorConvert.hpp"
+
+#define QUANT_INFO_BYTES 4
+namespace MNN {
+
+KleidiAIConvInt8::KleidiAIConvInt8(Backend* backend, const Op* op, std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon, bool isDynamicQuant,
+                                   KleidiAI &kai, KleidiAI::AccelType accelType, int32_t blockNum)
+    : CPUConvolution(op->main_as_Convolution2D()->common(), backend), kai(kai), mAccelType(accelType), mBlockNum(blockNum) {
+    // convolution info
+    auto convOp = op->main_as_Convolution2D();
+    int oc = convOp->common()->outputCount();
+    int ic = convOp->common()->inputCount();
+
+    // backend info
+    auto core = static_cast<CPUBackend*>(backend)->functions();
+    int pack = core->pack;
+
+    // compute info
+    int ocUp4 = ROUND_UP(oc, pack);
+    int scaleSize = ocUp4 * mBlockNum;
+
+    // kleidia info
+    bool bFP16 = core->bytes == 2 ? true : false;
+    bool bAsym = quanCommon->asymmetric;
+    size_t blkSize = mBlockNum == 1 ? 0 : ic / mBlockNum;
+
+    AutoStorage<int8_t> reorderedQuantInfo;
+    reorderedQuantInfo.reset(2 * scaleSize * QUANT_INFO_BYTES + oc * QUANT_INFO_BYTES);
+    if (reorderedQuantInfo.get() == nullptr) {
+        MNN_ERROR("Memory not enough\n");
+        return;
+    }
+
+    //Prepare scale and zero data.
+    {
+        int outputCount = convOp->common()->outputCount();
+        int originOffset = -8;
+        auto quanInfoPtr = quanCommon->alpha.get();
+        auto scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
+        auto zeroPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(scalePtr) + scaleSize * QUANT_INFO_BYTES);
+        auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(zeroPtr) + scaleSize * QUANT_INFO_BYTES);
+        if (quanCommon->asymmetric) {
+            for (int i = 0; i < blockNum; ++i) {
+                auto dstScale = scalePtr + i * ocUp4;
+                auto dstZero = zeroPtr + i * ocUp4;
+                for (int j = 0; j < outputCount; ++j) {
+                    int scaleIndex = j * blockNum + i;
+                    dstScale[j] = quanInfoPtr[2 * scaleIndex + 1];
+                    dstZero[j] = quanInfoPtr[2 * scaleIndex] + (float)originOffset * dstScale[j];
+                }
+            }
+        } else {
+            for (int i = 0; i < blockNum; ++i) {
+                auto dstScale = scalePtr + i * ocUp4;
+                auto dstZero = zeroPtr + i * ocUp4;
+                for (int j = 0; j < outputCount; ++j) {
+                    int scaleIndex = j * blockNum + i;
+                    dstScale[j] = quanInfoPtr[scaleIndex];
+                    dstZero[j] = (float)originOffset * dstScale[j];
+                }
+            }
+        }
+        ::memcpy(biasPtr, convOp->bias()->data(), oc * QUANT_INFO_BYTES);
+    }
+
+    int n = oc;
+    int k = ic;
+    int packedWeightSize = kai.getRhsPackedSize(mAccelType, n, k, blkSize);
+
+    //Alloc packed weight tensor.
+    mWeightInt8.reset(Tensor::createDevice<uint8_t>({packedWeightSize}));
+    bool success = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC);
+
+    if (!success) {
+        MNN_ERROR("Out of static memory!\n");
+        return;
+    }
+
+    size_t paraNum = scaleSize;
+    float *scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
+    float *zeroPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + paraNum;
+    float *biasPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + 2 * paraNum;
+    //Reload some parameters to fit ukernels' layout.
+    auto quanInfoPtr = quanCommon->alpha.get();
+    auto alphaSize = quanCommon->alpha.size();
+    if(bAsym) {
+        for(int i = 0; i < paraNum; i++) {
+            if(i*2 >= alphaSize){
+                zeroPtr[i] = 0;
+                scalePtr[i] = 0;
+            }
+            else{
+                zeroPtr[i] = quanInfoPtr[i * 2];
+                scalePtr[i] = quanInfoPtr[i * 2 + 1];
+            }
+        }
+    } else {
+        if(blkSize != 0) {
+            memcpy(scalePtr, (uint8_t*)quanInfoPtr, paraNum * sizeof(float));
+        }
+    }
+
+    //Run rhs pack.
+    auto weightPackedData = mWeightInt8->host<uint8_t>();
+    kai.runRhsPack(mAccelType, 1, n, k, blkSize, 0/*unused*/,
+                   (uint8_t*)quanCommon->weight.get(),
+                   (const void*)scalePtr, (const void*)zeroPtr, (const void*)biasPtr,
+                   weightPackedData);
+    return;
+}
+
+KleidiAIConvInt8::KleidiAIConvInt8(Backend* backend, const Op* op, const KleidiAIConvInt8& exe)
+    : CPUConvolution(op->main_as_Convolution2D()->common(), backend), kai(exe.kai), mAccelType(exe.mAccelType),
+      mWeightInt8(exe.mWeightInt8), mBlockNum(exe.mBlockNum),
+      mTempIm2ColBuffer(exe.mTempIm2ColBuffer) {
+}
+
+KleidiAIConvInt8::~KleidiAIConvInt8() {
+    // Do nothing
+}
+
+bool KleidiAIConvInt8::onClone(Backend* bn, const Op* op, Execution** dst) {
+    if (nullptr == dst) {
+        return true;
+    }
+    auto exe = new KleidiAIConvInt8(bn, op, *this);
+    if (!exe->valid()) {
+        return false;
+    }
+    *dst = exe;
+    return true;
+}
+
+// need
+ErrorCode KleidiAIConvInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
+    // Initialize.
+    auto input = inputs[0];
+    auto output = outputs[0];
+    auto core = static_cast<CPUBackend*>(backend())->functions();
+    auto b = backend();
+
+    MNN_ASSERT(kai.isLoaded(mAccelType));
+    const size_t m = inputs[0]->batch() * inputs[0]->width() * inputs[0]->height(); //lhs vector number.
+    const size_t n = outputs[0]->channel(); //rhs vector number.
+    const size_t k = inputs[0]->channel(); //vector size.
+    const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;
+
+    auto inputOriginFmt = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
+    auto outputOriginFmt = TensorUtils::getDescribe(outputs[0])->dimensionFormat;
+    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
+    if(inputOriginFmt != MNN_DATA_FORMAT_NHWC){
+        mInputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{input->batch(), input->height(), input->width(), input->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
+        mValid = b->onAcquireBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
+        if (!mValid) {
+            MNN_ERROR("Out of dynamic memory!\n");
+            return OUT_OF_MEMORY;
+        }
+    }
+    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC){
+        mOutputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{output->batch(), output->height(), output->width(), output->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
+        mValid = b->onAcquireBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
+        if (!mValid) {
+            MNN_ERROR("Out of dynamic memory!\n");
+            return OUT_OF_MEMORY;
+        }
+    }
+
+    int packedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
+    int elementSize = core->bytes;
+
+    //Split mTempIm2ColBuffer as two parts for linear/tile transfer:
+    //Part0: Lhs_packed.
+    //Part1: Lhs/Dst before transfer.
+    mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({packedSize}));
+    bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
+    if (!success) {
+        MNN_ERROR("Out of dynamic memory!\n");
+        return OUT_OF_MEMORY;
+    }
+
+    backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
+
+    if(inputOriginFmt != MNN_DATA_FORMAT_NHWC){
+        b->onReleaseBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
+    }
+    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC){
+        b->onReleaseBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
+    }
+    return NO_ERROR;
+}
+
+ErrorCode KleidiAIConvInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
+    const auto input = inputs[0];
+    auto output = outputs[0];
+    auto core = static_cast<CPUBackend*>(backend())->functions();
+
+    // Initialize for convert
+    auto inputDes = TensorUtils::getDescribe(inputs[0]);
+    auto outputDes = TensorUtils::getDescribe(outputs[0]);
+    auto b = backend();
+    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
+
+    MNN_ASSERT(kai.isLoaded(mAccelType));
+    const size_t m = input->batch() * input->width() * input->height(); //lhs vector number.
+    const size_t n = output->channel(); //rhs vector number.
+    const size_t k = input->channel(); //vector size.
+    const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;
+
+    size_t elementSize = core->bytes;
+    size_t lhsPackedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
+
+    auto lhs = input->host<uint8_t>();
+    auto lhsPacked = mTempIm2ColBuffer->host<int8_t>();
+    auto rhsPacked = mWeightInt8->host<uint8_t>();
+
+    int threadNum = static_cast<CPUBackend*>(backend())->threadNumber();
+    int threadNeed, vecPerThread;
+
+    if(inputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
+        // Convert input to NHWC format.
+        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
+            CPUTensorConverter::convert(input, mInputConvertBuffer.get(), core, tId, threadNum);
+        };
+        MNN_CONCURRENCY_END();
+        lhs = mInputConvertBuffer->host<uint8_t>();
+    }
+
+    //Dynamic quant pack lhs.
+    if(m == 1) {
+        kai.runLhsQuantPack(mAccelType, 1, k, blkSize, 1, lhs, lhsPacked);
+    } else {
+        vecPerThread = kai.getVecNumPerThread(m, threadNum, kai.getMr(mAccelType, m));
+        threadNeed = m % vecPerThread == 0 ? m / vecPerThread : (m / vecPerThread + 1);
+        size_t srcStride = vecPerThread * k * elementSize;
+
+        auto BatchDynamicQuant = [=](int tId) {
+            auto threadSrc = lhs + tId * srcStride;
+            auto threadDst = lhsPacked + kai.getLhsQuantedPackedOffset(mAccelType, m, tId * vecPerThread, k, blkSize);
+            int vecNum = (tId == threadNeed - 1) ? (m - vecPerThread * tId) : vecPerThread; //Last threadN may less than vecPerThread.
+            kai.runLhsQuantPack(mAccelType, vecNum, k, blkSize, kai.getMr(mAccelType, m), threadSrc, threadDst);
+        };

+        MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
+            BatchDynamicQuant((int)tId);
+        }
+        MNN_CONCURRENCY_END();
+    }
+
+    //Run matmul.
+    auto dst = output->host<uint8_t>();
+    if(outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
+        //store matmul result to convert buffer.
+        dst = mOutputConvertBuffer->host<uint8_t>();
+    }
+
+    if(kai.bSupportSme2() && mAccelType == KleidiAI::AccelType::QI4_SYM_CHNLQT_F32) {
+        //SME prefer running on single thread to obtain better performance/power consumption ratio.
+        threadNum = 1;
+    }
+
+    vecPerThread = kai.getVecNumPerThread(n, threadNum, kai.getNStep(mAccelType));
+    threadNeed = n % vecPerThread == 0 ? n / vecPerThread : (n / vecPerThread + 1);
+    auto postPtr = getPostParameters();
+
+    auto ThreadFunction = [=](int tId) {
+        auto threadRhsPacked = rhsPacked + kai.getRhsPackedOffset(mAccelType, tId * vecPerThread, k, blkSize);
+        auto threadDst = dst + kai.getDstOffset(0, tId * vecPerThread, n, elementSize);
+        int vecNum = (tId == threadNeed - 1) ? (n - vecPerThread * tId) : vecPerThread; //Last threadN may less than vecPerThread.
+        kai.runMatmul(mAccelType, m, vecNum, k, blkSize, lhsPacked, threadRhsPacked, threadDst, n * elementSize, elementSize, postPtr[3], postPtr[2]);
+    };
+
+    MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
+        ThreadFunction((int)tId);
+    }
+    MNN_CONCURRENCY_END();
+
+    if(outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
+        // Convert output from NHWC format to original format.
+        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
+            CPUTensorConverter::convert(mOutputConvertBuffer.get(), output, core, tId, threadNum);
+        };
+        MNN_CONCURRENCY_END();
+    }
+
+    return NO_ERROR;
+}
+
+} // namespace MNN
+#endif //MNN_KLEIDIAI_ENABLED
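The onExecute body above splits both the LHS quant-pack (over m) and the matmul (over n) with the same arithmetic: compute a per-thread vector count aligned to a kernel step (mr or nStep), then hand the last thread the remainder. A self-contained sketch of that split, assuming getVecNumPerThread means ceil-divide then align up; that semantics is inferred from the call sites, not from the verified MNN implementation:

    #include <cstdio>
    #include <cstddef>

    static size_t vecNumPerThread(size_t totalVec, size_t totalThread, size_t minStep) {
        size_t per = (totalVec + totalThread - 1) / totalThread; // ceil division
        return ((per + minStep - 1) / minStep) * minStep;        // align up to minStep
    }

    int main() {
        size_t m = 10, threads = 4, mr = 4;
        size_t vecPerThread = vecNumPerThread(m, threads, mr);      // 4
        size_t threadNeed = (m + vecPerThread - 1) / vecPerThread;  // 3
        for (size_t tId = 0; tId < threadNeed; ++tId) {
            // Last thread may get fewer vectors than vecPerThread: 4, 4, 2.
            size_t vecNum = (tId == threadNeed - 1) ? (m - vecPerThread * tId)
                                                    : vecPerThread;
            std::printf("thread %zu handles %zu rows\n", tId, vecNum);
        }
        return 0;
    }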
@@ -0,0 +1,35 @@
+//
+//  SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+//  SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef KleidiAIConvInt8_hpp
+#define KleidiAIConvInt8_hpp
+#ifdef MNN_KLEIDIAI_ENABLED
+#include "backend/cpu/CPUConvolution.hpp"
+#include "Int8FunctionsOpt.h"
+#include "CommonOptFunction.h"
+
+namespace MNN {
+class KleidiAIConvInt8 : public CPUConvolution {
+public:
+    KleidiAIConvInt8(Backend* backend, const Op* op, std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon, bool isDynamicQuant, KleidiAI &kai, KleidiAI::AccelType accelType, int32_t blockNum);
+    virtual ~KleidiAIConvInt8();
+    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
+    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
+    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
+private:
+    KleidiAIConvInt8(Backend* backend, const Op* op, const KleidiAIConvInt8& exe);
+    std::shared_ptr<Tensor> mWeightInt8;
+    std::shared_ptr<Tensor> mTempIm2ColBuffer;
+    std::shared_ptr<Tensor> mInputConvertBuffer;
+    std::shared_ptr<Tensor> mOutputConvertBuffer;
+    KleidiAI &kai;
+    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
+    int32_t mBlockNum = 1;
+};
+
+} // namespace MNN
+#endif // MNN_KLEIDIAI_ENABLED
+#endif /* KleidiAIConvInt8_hpp */
@@ -0,0 +1,232 @@
//
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

#ifdef MNN_KLEIDIAI_ENABLED
#include "KleidiAIConvolution.hpp"
#include <string.h>
#include "core/BufferAllocator.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"

namespace MNN {
#ifndef MNN_REDUCE_SIZE

KleidiAIConvolution::KleidiAIConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                                         size_t originWeightSize, const float *bias, size_t biasSize)
    : CPUConvolution(common, b) {
    auto outputCount = (int)biasSize;
    auto core = static_cast<CPUBackend*>(b)->functions();
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    auto mSrcCount = (int)originWeightSize / outputCount;
    if (!mResource->copyBiasAlign(bias, (int)biasSize)) {
        MNN_ERROR("Not Enough Memory\n");
        mValid = false;
        return;
    }
    if (b->getRuntime()->hint().useCachedMmap > 1) {
        return;
    }
    KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());

    if (core->bytes == 2) {
        AutoRelease<Tensor> tempTensor(Tensor::createDevice<float>({outputCount * mSrcCount}));
        mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC);
        if (!mValid) {
            MNN_ERROR("Not Enough Memory\n");
            return;
        }
        core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount);

        KleidiAI::AccelType accelType = KleidiAI::AccelType::FP16;
        if (!kai.isLoaded(accelType)) {
            kai.setLoaded(accelType);
            kai.printInfo(accelType);
        }

        mAccelType = accelType;
        AutoRelease<Tensor> tempBiasTensor(Tensor::createDevice<float>({outputCount}));
        mValid = b->onAcquireBuffer(tempBiasTensor.get(), Backend::STATIC);
        if (!mValid) {
            b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
            MNN_ERROR("Not Enough Memory\n");
            return;
        }
        core->MNNFp32ToLowp(bias, tempBiasTensor->host<int16_t>(), outputCount);

        int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
        // Alloc packed weight tensor.
        mResource->mWeight.reset(Tensor::createDevice<int8_t>({packedSize}));
        bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
        if (!success) {
            b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
            b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
            MNN_ERROR("Out of static memory!\n");
            return;
        }

        // Run rhs pack.
        kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(__fp16),
                       tempTensor->host<void>(), nullptr, nullptr, tempBiasTensor->host<void>(),
                       mResource->mWeight->host<void>());
        b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
        b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
    } else {
        KleidiAI::AccelType accelType = KleidiAI::AccelType::FP32;
        if (!kai.isLoaded(accelType)) {
            kai.setLoaded(accelType);
            kai.printInfo(accelType);
        }
        mAccelType = accelType;
        int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
        // Alloc packed weight tensor.
        mResource->mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{packedSize}));
        mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
        if (!mValid) {
            MNN_ERROR("Out of static memory!\n");
            return;
        }

        // Run rhs pack.
        kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(float),
                       originWeight, nullptr, nullptr, bias, mResource->mWeight->host<void>());
    }
}

KleidiAIConvolution::KleidiAIConvolution(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) {
    mResource = resource;
}

KleidiAIConvolution::~KleidiAIConvolution() {
    // Do nothing
}

bool KleidiAIConvolution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    auto exe = new KleidiAIConvolution(mResource, op->main_as_Convolution2D()->common(), bn);
    exe->mAccelType = this->mAccelType;
    *dst = exe;
    return true;
}

ErrorCode KleidiAIConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    auto input = inputs[0];
    auto output = outputs[0];
    auto inputDes = TensorUtils::getDescribe(inputs[0]);
    auto outputDes = TensorUtils::getDescribe(outputs[0]);
    auto ic = input->channel();
    auto oc = output->channel();
    auto batch = input->batch();
    auto b = backend();

    KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
    auto inputOriginFmt = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
    auto outputOriginFmt = TensorUtils::getDescribe(outputs[0])->dimensionFormat;
    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
    if (inputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        mInputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{input->batch(), input->height(), input->width(), input->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
        mValid = b->onAcquireBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
        if (!mValid) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }
    }
    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        mOutputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{output->batch(), output->height(), output->width(), output->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
        mValid = b->onAcquireBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
        if (!mValid) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }
    }

    auto m = batch * input->width() * input->height();
    if (m != 1) {
        int packedSize = kai.getLhsPackedSize(mAccelType, m, ic);

        mInputResource.reset(Tensor::createDevice<float>({packedSize}));
        bool success = backend()->onAcquireBuffer(mInputResource.get(), Backend::DYNAMIC);
        if (!success) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }

        b->onReleaseBuffer(mInputResource.get(), Backend::DYNAMIC);
    }

    if (inputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        b->onReleaseBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
    }
    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        b->onReleaseBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
    }
    return NO_ERROR;
}
ErrorCode KleidiAIConvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();
    auto inputPtr = input->host<uint8_t>();
    auto weightPtr = mResource->mWeight->host<uint8_t>();
    int threadNum = static_cast<CPUBackend*>(backend())->threadNumber();

    KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
    const size_t m = input->batch() * input->width() * input->height(); // lhs vector number.
    const size_t n = output->channel(); // rhs vector number.
    const size_t k = input->channel(); // vector size.
    auto dst = output->host<uint8_t>();
    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
    size_t elementSize = core->bytes;
    auto b = backend();

    auto inputDes = TensorUtils::getDescribe(inputs[0]);
    if (inputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
            CPUTensorConverter::convert(input, mInputConvertBuffer.get(), core, tId, threadNum);
        };
        MNN_CONCURRENCY_END();
        inputPtr = mInputConvertBuffer->host<uint8_t>();
    }
    auto lhsPacked = inputPtr;
    if (m != 1) {
        lhsPacked = mInputResource->host<uint8_t>();
        kai.runLhsPack(mAccelType, m, k, 0, inputPtr, k * elementSize, lhsPacked);
    }

    auto outputDes = TensorUtils::getDescribe(outputs[0]);
    auto postPtr = getPostParameters();
    auto outputPtr = output->host<uint8_t>();
    if (outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        outputPtr = mOutputConvertBuffer->host<uint8_t>();
    }

    kai.runMatmul(mAccelType, m, n, k, 0, lhsPacked, weightPtr, outputPtr, n * elementSize, elementSize, postPtr[3], postPtr[2]);

    if (outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
            CPUTensorConverter::convert(mOutputConvertBuffer.get(), output, core, tId, threadNum);
        };
        MNN_CONCURRENCY_END();
    }

    return NO_ERROR;
}

#endif // MNN_REDUCE_SIZE
} // namespace MNN
#endif //MNN_KLEIDIAI_ENABLED
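As the m/n/k comments in onExecute above spell out, this path treats a 1x1 convolution over NHWC data as one matrix multiply: every output pixel contributes a row, every output channel a column. A small sketch with assumed shapes (all values invented for illustration):

    #include <cstdio>

    // Hypothetical shapes for a 1x1 convolution expressed as a single GEMM,
    // mirroring the m/n/k mapping in onExecute above.
    int main() {
        const int batch = 2, height = 14, width = 14, ic = 64, oc = 128;
        const long m = (long)batch * height * width; // one LHS row per output pixel
        const long n = oc;                           // one RHS column per output channel
        const long k = ic;                           // dot-product length
        std::printf("GEMM: (%ld x %ld) * (%ld x %ld) -> (%ld x %ld)\n", m, k, k, n, m, n);
        return 0;
    }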
@@ -0,0 +1,37 @@
//
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

#ifndef KleidiAIConvolution_hpp
#define KleidiAIConvolution_hpp
#ifdef MNN_KLEIDIAI_ENABLED
#include <functional>
#include "backend/cpu/CPUConvolution.hpp"
namespace MNN {
#ifndef MNN_REDUCE_SIZE

class KleidiAIConvolution : public CPUConvolution {
public:
    KleidiAIConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize);
    KleidiAIConvolution(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b);
    virtual ~KleidiAIConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
    std::shared_ptr<Tensor> mInputResource;
    std::shared_ptr<Tensor> mInputConvertBuffer;
    std::shared_ptr<Tensor> mOutputConvertBuffer;
    std::shared_ptr<CPUConvolution::Resource> mResource;
    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
};
#endif // MNN_REDUCE_SIZE

} // namespace MNN
#endif // MNN_KLEIDIAI_ENABLED
#endif /* KleidiAIConvolution_hpp */
@@ -0,0 +1,320 @@
#if MNN_KLEIDIAI_ENABLED
#include "KleidiAIDenseConvolution.hpp"

#include <numeric>

#include "CommonOptFunction.h"
#include "MNN/ErrorCode.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"

namespace MNN {
template <typename T>
static void initWeight(const T* weight, const T* bias, T* cache, T* output, const std::vector<int>& shape,
                       const int bytes) {
    ::memset(cache, 0, sizeof(T) * std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
    ConvertOIHWToHWIO(cache, weight, shape);
    auto outputCount = shape[0];
    auto srcCount = shape[1];
    auto kh = shape[2];
    auto kw = shape[3];
    if (bytes == 4) {
        kai_run_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(outputCount, kh * kw, srcCount, outputCount * sizeof(T),
                                                            cache, bias, output);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
}

KleidiAIDenseConvolution::KleidiAIDenseConvolution(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize,
                                                   std::shared_ptr<ConvolutionCommon::Int8Common> int8Info)
    : ConvolutionTiledExecutor(b, bias, biasSize) {
    auto outputCount = (int)biasSize;
    auto core = static_cast<CPUBackend*>(b)->functions();
    int bytes = core->bytes;
    auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
    if (core->matmulBytes != 0) {
        bytes = core->matmulBytes;
    }

    int kai_rhs_packed_size = 0;
    if (core->bytes == 4) {
        kai_rhs_packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
            outputCount, common->kernelY() * common->kernelX(), srcCount);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({kai_rhs_packed_size}));
    mResource->mBias.reset(Tensor::createDevice<uint8_t>({outputCount * core->bytes}));

    mValid = mValid && backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }

    std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>(
        {outputCount, srcCount * common->kernelX() * common->kernelY(), (int)sizeof(float)})); // cache must be float
    mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }

    std::vector<int> oihwShape = {outputCount, srcCount, common->kernelY(), common->kernelX()};
    if (core->bytes == 4) {
        MNN::initWeight(originWeight, bias, cache->host<float>(), mResource->mWeight->host<float>(), oihwShape,
                        core->bytes);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }

    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
    mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b, mResource.get()));
}

KleidiAIDenseConvolution::KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res,
                                                   const Convolution2DCommon* common, Backend* b)
    : ConvolutionTiledExecutor(res, b) {
    mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b, mResource.get()));
}

KleidiAIDenseConvolution::~KleidiAIDenseConvolution() {
    // Do nothing
}

bool KleidiAIDenseConvolution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    auto dense = new KleidiAIDenseConvolution(mResource, op->main_as_Convolution2D()->common(), bn);
    dense->mProxy->mConvPerfconfig = mProxy->mConvPerfconfig;
    *dst = dense;
    return true;
}

ErrorCode KleidiAIDenseConvolution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto code = mProxy->onExecute(mInputs, outputs);
    return code;
}
ErrorCode KleidiAIDenseConvolution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
    auto code = mProxy->onResize(mInputs, outputs);
    if (NO_ERROR != code) {
        return code;
    }
    return NO_ERROR;
}

ErrorCode KleidiAIDenseConvolutionMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    auto function = static_cast<CPUBackend*>(backend())->functions();
    if (nullptr != mTempBias) {
        ::memset(mTempBias->host<float>(), 0, mTempBias->elementSize() * function->bytes);
        if (inputs.size() > 2) {
            ::memcpy(mTempBias->host<float>(), inputs[2]->host<float>(), inputs[2]->elementSize() * function->bytes);
        }
    }
    auto cache = mTempWeightCache->host<float>();
    auto source = inputs[1]->host<float>();
    if (function->bytes == 4) {
        initWeight(source, mInputs[2]->host<float>(), cache, mTempWeight->host<float>(), inputs[1]->shape(),
                   function->bytes);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    return mProxy->onExecute(mInputs, outputs);
}
ErrorCode KleidiAIDenseConvolutionMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                       const std::vector<Tensor*>& outputs) {
    int depth = inputs[1]->channel();
    int outputCount = outputs[0]->channel();
    auto function = static_cast<CPUBackend*>(backend())->functions();
    if (function->bytes == 4) {
        int kai_rhs_packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
            outputCount, inputs[1]->stride(1), depth);
        mTempWeight.reset(Tensor::createDevice<uint8_t>({kai_rhs_packed_size}));
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    mTempWeightCache.reset(Tensor::createDevice<float>(
        {inputs[1]->height(), inputs[1]->width(), inputs[1]->channel(), inputs[1]->batch()}));
    auto res = backend()->onAcquireBuffer(mTempWeight.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    mTempBias.reset();
    if (!res) {
        return OUT_OF_MEMORY;
    }
    if (inputs.size() > 2 && inputs[2]->elementSize() % function->pack == 0) {
        mInputs = {inputs[0], mTempWeight.get(), inputs[2]};
    } else {
        mTempBias.reset(Tensor::createDevice<float>({UP_DIV(outputCount, function->pack) * function->pack}));
        backend()->onAcquireBuffer(mTempBias.get(), Backend::DYNAMIC);
        mInputs = {inputs[0], mTempWeight.get(), mTempBias.get()};
    }
    backend()->onReleaseBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    auto errorCode = mProxy->onResize(mInputs, outputs);
    backend()->onReleaseBuffer(mTempWeight.get(), Backend::DYNAMIC);
    if (nullptr != mTempBias) {
        backend()->onReleaseBuffer(mTempBias.get(), Backend::DYNAMIC);
    }
    return errorCode;
}

ErrorCode KleidiAIDenseConvolutionImpl::onResize(const std::vector<Tensor*>& inputs,
                                                 const std::vector<Tensor*>& outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto input = inputs[0];
    auto weight = inputs[1];
    Tensor* bias = nullptr;
    if (inputs.size() > 2) {
        bias = inputs[2];
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    int matmulBytes = bytes;
    if (core->matmulBytes != 0) {
        matmulBytes = core->matmulBytes;
    }
    auto ic = input->channel();
    auto output = outputs[0];
    auto batch = output->batch();

    auto outputChannel = output->channel();
    auto kernelSize = mCommon->kernelX() * mCommon->kernelY();

    mTempBufferTranspose.buffer().type = halide_type_of<uint8_t>();
    mTempBufferTranspose.buffer().dimensions = 1;
    int outputNhwSize = batch * output->height() * output->width();
    if (core->bytes == 4) {
        mTempBufferTranspose.buffer().dim[0].extent =
            kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(outputNhwSize, kernelSize, ic);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    TensorUtils::setLinearLayout(&mTempBufferTranspose);

    bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    TensorUtils::getDescribe(&mOutputNHWC)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mOutputNHWC.buffer().dimensions = 4;
    mOutputNHWC.buffer().dim[0].extent = output->batch();
    mOutputNHWC.buffer().dim[1].extent = output->height();
    mOutputNHWC.buffer().dim[2].extent = output->width();
    mOutputNHWC.buffer().dim[3].extent = output->channel();
    mOutputNHWC.buffer().type = output->getType();
    success = backend()->onAcquireBuffer(&mOutputNHWC, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    TensorUtils::getDescribe(&mInputNHWC)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mInputNHWC.buffer().dimensions = 4;
    mInputNHWC.buffer().dim[0].extent = input->batch();
    mInputNHWC.buffer().dim[1].extent = input->height();
    mInputNHWC.buffer().dim[2].extent = input->width();
    mInputNHWC.buffer().dim[3].extent = input->channel();
    mInputNHWC.buffer().type = input->getType();
    success = backend()->onAcquireBuffer(&mInputNHWC, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    TensorUtils::getDescribe(&mPadBuffer)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mPadBuffer.buffer().dimensions = 1;
    mPadBuffer.buffer().dim[0].extent = input->channel();
    mPadBuffer.buffer().type = input->getType();
    TensorUtils::setLinearLayout(&mPadBuffer);
    success = backend()->onAcquireBuffer(&mPadBuffer, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    backend()->onReleaseBuffer(&mOutputNHWC, Backend::DYNAMIC);
    backend()->onReleaseBuffer(&mInputNHWC, Backend::DYNAMIC);
    backend()->onReleaseBuffer(&mPadBuffer, Backend::DYNAMIC);

    backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);

    auto postParameters = getPostParameters();
    mFunction.first = ((CPUBackend*)backend())->threadNumber();

    auto padFull = ConvolutionCommon::convolutionPadFull(input, output, mCommon);
    ConvParams params{
        .inputChannel = ic,
        .outputChannel = outputChannel,
        .kernelHeight = mCommon->kernelY(),
        .kernelWidth = mCommon->kernelX(),
        .strideHeight = mCommon->strideY(),
        .strideWidth = mCommon->strideX(),
        .padTop = std::get<1>(padFull),
        .padBottom = std::get<3>(padFull),
        .padLeft = std::get<0>(padFull),
        .padRight = std::get<2>(padFull),
        .dilatedHeight = mCommon->dilateY(),
        .dilatedWidth = mCommon->dilateX(),
    };

    mFunction.second = [=](int tid) {
        // Convert NC4HW4 to NHWC
        auto inputShape = input->shape(); // TODO check for NC4HW4, should be the NCHW
        CPUTensorConverter::convert(input, &mInputNHWC, core);
        // Lhs packing
        if (bytes == 4) {
            int blockSize = kai_get_m_step_lhs_imatmul_pack_x32p2vlx1_x32p_sme();
            ::memset(mPadBuffer.host<float>(), 0, params.inputChannel * sizeof(float));
            auto table = IndirectionTable<float>(mInputNHWC.shape(), params, mInputNHWC.host<float>(),
                                                 mPadBuffer.host<float>(), blockSize);
            kai_run_lhs_imatmul_pack_x32p2vlx1_x32p_sme(outputNhwSize, kernelSize, ic, table.data.data(), 0,
                                                        mPadBuffer.host<uint8_t>(),
                                                        mTempBufferTranspose.host<uint8_t>());
        } else {
            MNN_ERROR("Not fp32, should not be called here\n");
            abort();
        }

        // Run Matmul
        if (bytes == 4) {
            kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa(
                outputNhwSize, outputChannel, kernelSize, ic, mTempBufferTranspose.host<uint8_t>(),
                weight->host<uint8_t>(), mOutputNHWC.host<uint8_t>(), outputChannel * sizeof(float), postParameters[2],
                postParameters[3]);
        } else {
            MNN_ERROR("Not fp32, should not be called here\n");
            abort();
        }

        // Convert NHWC to NC4HW4
        CPUTensorConverter::convert(&mOutputNHWC, output, core);
    };
    return NO_ERROR;
}

ErrorCode KleidiAIDenseConvolutionImpl::onExecute(const std::vector<Tensor*>& inputs,
                                                  const std::vector<Tensor*>& outputs) {
    mFunction.second(0);
    return NO_ERROR;
}
} // namespace MNN
#endif
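Note that the lambda above never materializes an explicit im2col matrix; the indirection table hands row pointers straight to the SME LHS-packing kernel. The spatial geometry it relies on is the standard dilated-convolution output formula, the same one ConvParams::getOutputSize in the header below implements. A minimal stand-alone check with invented values:

    #include <cstdio>

    // Standard convolution output-size arithmetic, matching
    // ConvParams::getOutputSize below (all values are made up).
    int main() {
        const int input = 24, padTop = 1, padBottom = 1, kernel = 3, dilation = 1, stride = 1;
        const int dilatedKernel = kernel + (kernel - 1) * (dilation - 1);
        const int output = (input + padTop + padBottom - dilatedKernel) / stride + 1;
        std::printf("dilatedKernel = %d, output = %d\n", dilatedKernel, output); // prints 3, 24
        return 0;
    }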
@@ -0,0 +1,245 @@
#if MNN_KLEIDIAI_ENABLED

#ifndef KleidiAIDenseConvolution_hpp
#define KleidiAIDenseConvolution_hpp

#include <cassert>

#include "ConvolutionTiledExecutor.hpp"
#include "backend/cpu/CPUConvolution.hpp"

namespace MNN {
struct ConvParams {
    int inputChannel;
    int outputChannel;
    int kernelHeight;
    int kernelWidth;
    int strideHeight;
    int strideWidth;
    int padTop;
    int padBottom;
    int padLeft;
    int padRight;
    int dilatedHeight;
    int dilatedWidth;

    struct Size2D {
        int height;
        int width;
    };

    Size2D getOutputSize(int inputHeight, int inputWidth) const {
        auto kernelSizeWithDilated = [](int kernel, int dilated) { return kernel + (kernel - 1) * (dilated - 1); };
        auto outputSize = [](int input, int pad1, int pad2, int kernel, int stride) {
            int t = (input + pad1 + pad2 - kernel);
            return t / stride + 1;
        };

        int dilatedKernelHeight = kernelSizeWithDilated(kernelHeight, dilatedHeight);
        int dilatedKernelWidth = kernelSizeWithDilated(kernelWidth, dilatedWidth);

        int outputHeight = outputSize(inputHeight, padTop, padBottom, dilatedKernelHeight, strideHeight);
        int outputWidth = outputSize(inputWidth, padLeft, padRight, dilatedKernelWidth, strideWidth);

        return {outputHeight, outputWidth};
    }
};

template <typename T>
struct IndirectionTable {
    std::vector<const void*> data;
    int height;
    int width;
    int blockSize;

    /// Creates an indirection table for LHS packing.
    ///
    /// When implementing convolution via matrix multiplication, we need to
    /// transform the input and weight tensors into matrices. This transformation
    /// for the input is typically referred to as `im2col`. The resulting matrix has
    /// dimensions:
    /// - Rows: batch * output_height * output_width
    /// - Columns: input_channels * kernel_height * kernel_width
    ///
    /// The indirection table stores the starting addresses of all these chunks in
    /// the input tensor. For cases where padding is applied, it stores pointers
    /// directly to the padded buffer. Note that the length of the padding buffer
    /// must match the number of input channels.
    ///
    /// Furthermore, LHS packing also requires a transpose over every `M_STEP`
    /// rows to optimize the data layout for computation.
    ///
    /// @param[in] shape The NHWC input shape
    /// @param[in] params The parameters of convolution
    /// @param[in] input The raw pointer for the input tensor
    /// @param[in] padValues The raw pointer for the pad tensor
    /// @param[in] blockSize The block size for the transpose
    ///
    /// @return The indirection table ready for lhs packing.
    IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input, const T* padValues,
                     const int blockSize);

    ~IndirectionTable() = default;

    /// Computes the offset after blocking with blockSize.
    ///
    /// @param[in] row The row index
    /// @param[in] col The col index
    /// @param[in] width The table column count
    /// @param[in] block The block size
    ///
    /// @return The offset in the blocked table
    int getReorderedOffset(int row, int col, int width, int block) {
        int c = row % block;
        int r = row / block * width + col;
        return r * block + c;
    }
};

template <typename T>
IndirectionTable<T>::IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input,
                                      const T* padValues, const int blockSize) {
    int batchSize = shape[0];
    int inputChannel = shape[3];
    int inputHeight = shape[1];
    int inputWidth = shape[2];

    int elementCount = batchSize * inputChannel * inputHeight * inputWidth;
    auto outputSize = params.getOutputSize(inputHeight, inputWidth);
    int outputHeight = outputSize.height;
    int outputWidth = outputSize.width;

    int rowCount = batchSize * outputHeight * outputWidth;
    int colCount = params.kernelHeight * params.kernelWidth;

    this->data.resize((rowCount + blockSize - 1) / blockSize * blockSize * colCount);
    this->height = rowCount;
    this->width = colCount;
    this->blockSize = blockSize;

    for (int i = 0; i < this->data.size(); i++) {
        this->data[i] = nullptr;
    }

    for (int b = 0; b < batchSize; b++) {
        for (int h = 0; h < outputSize.height; h++) {
            for (int w = 0; w < outputSize.width; w++) {
                int inputRow = h * params.strideHeight - params.padTop;
                int inputCol = w * params.strideWidth - params.padLeft;

                for (int kh = 0; kh < params.kernelHeight; kh++) {
                    // Every row of the im2col result matrix consists of
                    // kernel_height * kernel_width chunks, and the indirection
                    // table holds one pointer per chunk. `tableRow` and
                    // `tableCol` are the row and column of the table before the
                    // block transpose.
                    int tableRow = b * outputHeight * outputWidth + h * outputWidth + w;
                    int tableCol = kh * params.kernelWidth;

                    int inputRowPrime = inputRow + kh * params.dilatedHeight;
                    int inputOffsetStart = b * inputHeight * inputWidth + inputRowPrime * inputWidth;
                    if (inputRowPrime >= 0 && inputRowPrime < inputHeight) {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            int inputColPrime = inputCol + kw * params.dilatedWidth;
                            if (inputColPrime >= 0 && inputColPrime < inputWidth) {
                                int inputOffset = (inputOffsetStart + inputColPrime) * inputChannel;
                                assert(inputOffset < elementCount);
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = input + inputOffset;
                            } else {
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = padValues;
                            }
                        }
                    } else {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            assert(tableOffset < this->data.size());
                            this->data[tableOffset] = padValues;
                        }
                    }
                }
            }
        }
    }
}

template <typename DstT, typename SrcT>
static void ConvertOIHWToHWIO(DstT* dst, const SrcT* src, const std::vector<int>& shape) {
    assert(shape.size() == 4);
    int height = shape[2];
    int width = shape[3];
    int outputChannel = shape[0];
    int inputChannel = shape[1];

    int spatialSize = height * width;
    for (int oc = 0; oc < outputChannel; oc++) {
        for (int ic = 0; ic < inputChannel; ic++) {
            for (int s = 0; s < spatialSize; s++) {
                int inputOffset = oc * inputChannel * spatialSize + ic * spatialSize + s;
                int outputOffset = s * inputChannel * outputChannel + ic * outputChannel + oc;

                // TODO Check the force conversion.
                dst[outputOffset] = (DstT)(src[inputOffset]);
            }
        }
    }
}

class KleidiAIDenseConvolutionImpl : public ConvolutionTiledImpl {
public:
    KleidiAIDenseConvolutionImpl(const Convolution2DCommon *common, Backend *b,
                                 CPUConvolution::Resource *resource = nullptr)
        : ConvolutionTiledImpl(common, b) {
        mResource = resource;
    }
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ~KleidiAIDenseConvolutionImpl() = default;
    virtual void getPackParameter(int *eP, int *lP, int *hP, const CoreFunctions *core) override {}

private:
    Tensor mOutputNHWC;
    Tensor mInputNHWC;
    Tensor mPadBuffer;
};

class KleidiAIDenseConvolution : public ConvolutionTiledExecutor {
public:
    KleidiAIDenseConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                             size_t originWeightSize, const float *bias, size_t biasSize,
                             std::shared_ptr<ConvolutionCommon::Int8Common>);

    KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common,
                             Backend *b);
    virtual ~KleidiAIDenseConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
    void initWeight(float *dest, const float *source, float *cache, int depth, int outputCount, int kernelSize,
                    const CoreFunctions *function);

protected:
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
};

class KleidiAIDenseConvolutionMultiInput : public Execution {
public:
    KleidiAIDenseConvolutionMultiInput(const Convolution2DCommon *common, Backend *b) : Execution(b) {
        mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b));
    }
    virtual ~KleidiAIDenseConvolutionMultiInput() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<Tensor> mTempWeight;
    std::shared_ptr<Tensor> mTempWeightCache;
    std::shared_ptr<Tensor> mTempBias;
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
    std::vector<Tensor *> mInputs;
};
} // namespace MNN

#endif /* KleidiAIDenseConvolution_hpp */
#endif
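The block transpose behind getReorderedOffset can be hard to picture: rows are processed in groups of `block`, and within a group entries are laid out column by column. Below is a stand-alone mirror of the same arithmetic with a few spot checks (table width 4, block size 2; all values invented):

    #include <cassert>

    // Mirror of IndirectionTable::getReorderedOffset, for a quick sanity check.
    static int reorderedOffset(int row, int col, int width, int block) {
        int c = row % block;
        int r = row / block * width + col;
        return r * block + c;
    }

    int main() {
        assert(reorderedOffset(0, 0, 4, 2) == 0);
        assert(reorderedOffset(1, 0, 4, 2) == 1);  // second row of the block interleaves
        assert(reorderedOffset(0, 1, 4, 2) == 2);
        assert(reorderedOffset(3, 1, 4, 2) == 11); // ((3/2)*4 + 1)*2 + 3%2
        return 0;
    }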
@@ -21,12 +21,6 @@
 #ifdef MNN_KLEIDIAI_ENABLED
 #include "../backend/cpu/arm/mnn_kleidiai.h"
-/**
- * Set Convolution's input/output tensor format:
- * 1: format will be NCHW, skip pack/unpack functions.
- * 0: format will be NC4HW4, need pack/unpack functions to fit kleidiAI ukernel.
- **/
-#define KAI_CONV_NCHW_IN_OUT 1
 #endif
 
 namespace MNN {
@@ -268,19 +268,6 @@ std::shared_ptr<Tensor> GeometryConvUtils::im2Col(Tensor* im2Col, Tensor* input,
     return tempTensor;
 }
 bool GeometryConvUtils::computeSingle(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, GeometryComputer::Context& context, CommandBuffer& res) {
-#if KAI_CONV_NCHW_IN_OUT
-    KleidiAI& kai = KleidiAI::getInstance();
-    auto common = op->main_as_Convolution2D()->common();
-    if (kai.canAccelerate() && common->kernelX() == 1 && common->kernelY() == 1) {
-        kai.setLinear(true);
-        std::shared_ptr<Command> cmd(new Command);
-        cmd->op = op;
-        cmd->inputs = std::move(inputs);
-        cmd->outputs = std::move(outputs);
-        res.command.emplace_back(std::move(cmd));
-        return true;
-    }
-#endif
     auto newOutputs = outputs;
     auto newInputs = inputs;
     auto originOutput = outputs[0];
@@ -23,13 +23,7 @@ public:
             sourceFmt = MNN_DATA_FORMAT_NCHW;
         }
         auto destFmt = info->dest();
-#if KAI_CONV_NCHW_IN_OUT
-        KleidiAI& kai = KleidiAI::getInstance();
-        if (kai.canAccelerate()) {
-            kai.setLinear(true);
-            destFmt = MNN_DATA_FORMAT_NCHW;
-        }
-#endif
         TensorUtils::getDescribe(outputs[0])->dimensionFormat = destFmt;
         if (destFmt == MNN_DATA_FORMAT_NC4HW4) {
             destFmt = MNN_DATA_FORMAT_NCHW;
@@ -1266,7 +1266,9 @@ public:
         return true;
     }
 };
+#ifndef MNN_KLEIDIAI_ENABLED
 MNNTestSuiteRegister(WinogradMemoryTest, "expr/WinogradMemoryTest");
+#endif
 
 class SequenceMemoryTest : public MNNTestCase {
@@ -0,0 +1,161 @@
#ifdef MNN_KLEIDIAI_ENABLED

#include <functional>
#include <numeric>
#include <random>

#include "MNNTestSuite.h"
#include "backend/cpu/compute/KleidiAIDenseConvolution.hpp"

using namespace MNN;

namespace utils {
enum class FillType { RANDOM, ZERO };

class RandomEngine {
public:
    static std::mt19937& get() {
        static std::random_device device;
        static std::mt19937 gen(device());
        return gen;
    }
};

template <typename T>
struct RandomGenerator;

template <>
struct RandomGenerator<float> {
    static float generate() {
        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
        return dist(RandomEngine::get());
    }
};

template <>
struct RandomGenerator<int> {
    static int generate() {
        std::uniform_int_distribution<int> dist(0, 100);
        return dist(RandomEngine::get());
    }
};
} // namespace utils

class LhsPackingTest : public MNNTestCase {
public:
    virtual bool run(int precision) {
        return testIndirectionTable1() && testIndirectionTable2() && testWeightConversion();
    }

private:
    bool testIndirectionTable(const ConvParams& params, int batchSize, int inputHeight, int inputWidth) {
        auto outputSize = params.getOutputSize(inputHeight, inputWidth);
        int outputHeight = outputSize.height;
        int outputWidth = outputSize.width;
        std::vector<int> inputShape = {batchSize, inputHeight, inputWidth, params.inputChannel};

        std::vector<float> input(std::accumulate(inputShape.begin(), inputShape.end(), 1, std::multiplies<int>()));
        std::vector<float> padValues(params.inputChannel);

        int blockSize = 32;
        auto table = IndirectionTable<float>(inputShape, params, input.data(), padValues.data(), blockSize);

        bool succ = true;

        // Check the first row
        for (int col = 0; col < blockSize; col++) {
            int oh = col / outputWidth;
            int ow = col % outputWidth;
            int ih = oh * params.strideHeight - params.padTop;
            int iw = ow * params.strideWidth - params.padLeft;

            if (ih < 0 || ih >= inputHeight) {
                succ &= (table.data[col] == padValues.data());
            } else if (iw < 0 || iw >= inputWidth) {
                succ &= (table.data[col] == padValues.data());
            } else {
                int offset = (ih * inputWidth + iw) * params.inputChannel;
                succ &= (table.data[col] == input.data() + offset);
            }
        }
        return succ;
    }

    bool testIndirectionTable1() {
        ConvParams params{
            .inputChannel = 3,
            .outputChannel = 5,
            .kernelHeight = 3,
            .kernelWidth = 2,
            .strideHeight = 2,
            .strideWidth = 1,
            .padTop = 1,
            .padBottom = 3,
            .padLeft = 2,
            .padRight = 1,
            .dilatedHeight = 1,
            .dilatedWidth = 2,
        };

        int batchSize = 4;
        int inputHeight = 7;
        int inputWidth = 5;

        return testIndirectionTable(params, batchSize, inputHeight, inputWidth);
    }

    bool testIndirectionTable2() {
        ConvParams params{
            .inputChannel = 256,
            .outputChannel = 256,
            .kernelHeight = 3,
            .kernelWidth = 3,
            .strideHeight = 1,
            .strideWidth = 1,
            .padTop = 1,
            .padBottom = 1,
            .padLeft = 1,
            .padRight = 1,
            .dilatedHeight = 1,
            .dilatedWidth = 1,
        };

        int batchSize = 1;
        int inputHeight = 24;
        int inputWidth = 24;

        return testIndirectionTable(params, batchSize, inputHeight, inputWidth);
    }

    bool testWeightConversion() {
        std::vector<int> shape = {4, 5, 6, 7};
        int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
        std::vector<float> weightSrc(size);
        std::vector<float> weightDst(size);

        for (int i = 0; i < size; i++) {
            weightSrc[i] = i;
        }

        ConvertOIHWToHWIO(weightDst.data(), weightSrc.data(), shape);

        bool succ = true;

        for (int oc = 0; oc < 4; oc++) {
            for (int ic = 0; ic < 5; ic++) {
                for (int h = 0; h < 6; h++) {
                    for (int w = 0; w < 7; w++) {
                        int oo = (h * 7 + w) * 5 * 4 + ic * 4 + oc;
                        int io = oc * 5 * 6 * 7 + ic * 6 * 7 + h * 7 + w;
                        succ &= (weightSrc[io] == weightDst[oo]);
                    }
                }
            }
        }

        return succ;
    }
};

MNNTestSuiteRegister(LhsPackingTest, "imatmul/lhs");
#endif
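testWeightConversion pins down the OIHW-to-HWIO index mapping, and the same arithmetic can be checked in isolation. A tiny sketch using the test's own 4x5x6x7 shape; the concrete element and the expected offsets below are worked out by hand, not taken from the test:

    #include <cassert>

    // OIHW -> HWIO index mapping used by ConvertOIHWToHWIO, for shape
    // [O=4, I=5, H=6, W=7]; element (oc=2, ic=3, h=4, w=5) moves from io to oo.
    int main() {
        const int O = 4, I = 5, H = 6, W = 7;
        const int oc = 2, ic = 3, h = 4, w = 5;
        const int io = oc * I * H * W + ic * H * W + h * W + w; // source index, OIHW
        const int oo = (h * W + w) * I * O + ic * O + oc;       // destination index, HWIO
        assert(io == 579); // 420 + 126 + 28 + 5
        assert(oo == 674); // 33 * 20 + 12 + 2
        return 0;
    }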
@@ -195,6 +195,9 @@ static int eval(Llm* llm, std::string prompt_file, int max_token_number) {
         prompts = {prompt};
 #else
     while (std::getline(prompt_fs, prompt)) {
+        if (prompt.empty()) {
+            continue;
+        }
         if (prompt.back() == '\r') {
             prompt.pop_back();
         }