mirror of https://github.com/alibaba/MNN.git
Compare commits
21 Commits
b6c3bacb5e...f1c9d55a8d

| Author | SHA1 | Date |
|---|---|---|
| | f1c9d55a8d | |
| | daa62c77c1 | |
| | 6fbbfda5ec | |
| | f845f0e665 | |
| | 4c9f48b76b | |
| | 5e3c8a3c12 | |
| | 4f790e8bd4 | |
| | b5b5845787 | |
| | da8b7337c4 | |
| | 875814bfb9 | |
| | 91d982616a | |
| | 03dddf264f | |
| | e1b5afef37 | |
| | 2d860125e5 | |
| | eb4e8ae92f | |
| | 3d66ca904e | |
| | 664ee20e2b | |
| | 5f0d59958e | |
| | c9b89abf26 | |
| | 16f3281756 | |
| | 69ac2f8f04 | |
@@ -268,6 +268,22 @@ public:
        NetModule* module(new NetModule(submodule, newInfo, nullptr, 0, 0.0f));
#ifdef MNN_INTERNAL_ENABLED
        module->mLogInfo = mLogInfo;
#endif
        return this->cloneBaseTo(ctx, module);
    }
    virtual Module* clone(CloneContext* ctx, const ScheduleConfig* config) const override {
        auto mModule = mChildren[0];
        auto origin = mInfo->runTimeManager->getInside();
        std::shared_ptr<Executor::RuntimeManager> newRt(Executor::RuntimeManager::createRuntimeManager(*config));
        const_cast<RuntimeAttr*>(newRt->getInside())->mContent->mExternalFile = origin->mContent->mExternalFile;
        std::shared_ptr<Module::Info> newInfo(new Module::Info);
        *newInfo = *mInfo;
        ctx->pRuntimeManager = newRt;
        newInfo->runTimeManager = newRt;
        std::shared_ptr<Module> submodule(mModule->clone(ctx));
        NetModule* module(new NetModule(submodule, newInfo, nullptr, 0, 0.0f));
#ifdef MNN_INTERNAL_ENABLED
        module->mLogInfo = mLogInfo;
#endif
        return this->cloneBaseTo(ctx, module);
    }

@@ -515,6 +531,11 @@ Module* Module::clone(const Module* module, const bool shareParams) {
    return module->clone(&context);
}

Module* Module::clone(const Module* module, const ScheduleConfig* config, const bool shareParams) {
    CloneContext context(shareParams);
    return module->clone(&context, config);
}

Module* Module::cloneBaseTo(CloneContext* ctx, Module* module) const {
    for (const Express::VARP& var : mParameters) {
        module->mParameters.push_back(ctx->getOrClone(var));
@@ -78,6 +78,7 @@ public:
    static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});

    static Module* clone(const Module* module, const bool shareParams = false);
    static Module* clone(const Module* module, const ScheduleConfig* config, const bool shareParams = false);

    struct Info {
        // Input info load from model

@@ -104,6 +105,9 @@ public:
    virtual Module* clone(CloneContext* ctx) const {
        return nullptr;
    }
    virtual Module* clone(CloneContext* ctx, const ScheduleConfig* config) const {
        return clone(ctx);
    }
    void registerModel(const std::vector<std::shared_ptr<Module>>& children);

    static void destroy(Module* m);
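The hunks above add a config-aware clone path: the copy gets a RuntimeManager built from the supplied ScheduleConfig instead of inheriting the original one. A minimal usage sketch of the new overload (the include path and config values are assumptions for illustration; error handling omitted):

```cpp
// Sketch only: assumes the MNN Express API declared in the hunks above.
#include <MNN/expr/Module.hpp>

using namespace MNN;
using namespace MNN::Express;

Module* cloneWithNewRuntime(const Module* original) {
    ScheduleConfig config;          // hypothetical target configuration
    config.type = MNN_FORWARD_CPU;  // re-schedule the copy on the CPU backend
    config.numThread = 4;
    // Builds a fresh RuntimeManager from `config` and deep-clones the graph;
    // parameters are copied, since shareParams defaults to false.
    return Module::clone(original, &config);
}
```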
@@ -754,6 +754,10 @@
CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; };
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
CE0AD4E42E1FB106002013A8 /* CountMinMaxValue_FP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE0AD4E32E1FB106002013A8 /* CountMinMaxValue_FP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
CE0AD4E82E1FB152002013A8 /* MoEModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE0AD4E62E1FB152002013A8 /* MoEModule.hpp */; };
CE0AD4E92E1FB152002013A8 /* ModuleInside.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE0AD4E52E1FB152002013A8 /* ModuleInside.hpp */; };
CE0AD4EA2E1FB152002013A8 /* MoEModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE0AD4E72E1FB152002013A8 /* MoEModule.cpp */; };
CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
CE31C7C12D783CBB00741F49 /* WorkerThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE31C7C02D783CBB00741F49 /* WorkerThread.cpp */; };

@@ -1587,6 +1591,10 @@
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = "<group>"; };
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; sourceTree = "<group>"; };
CE0AD4E32E1FB106002013A8 /* CountMinMaxValue_FP16.S */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; name = CountMinMaxValue_FP16.S; path = ../arm82/asm/arm64/CountMinMaxValue_FP16.S; sourceTree = "<group>"; };
CE0AD4E52E1FB152002013A8 /* ModuleInside.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = ModuleInside.hpp; sourceTree = "<group>"; };
CE0AD4E62E1FB152002013A8 /* MoEModule.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = MoEModule.hpp; sourceTree = "<group>"; };
CE0AD4E72E1FB152002013A8 /* MoEModule.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = MoEModule.cpp; sourceTree = "<group>"; };
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
CE31C7BF2D783CBB00741F49 /* WorkerThread.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = WorkerThread.hpp; sourceTree = "<group>"; };

@@ -1851,7 +1859,6 @@
488873A8215B639D0079B12E /* source */ = {
isa = PBXGroup;
children = (
CE482EF5288536DA007CD935 /* internal */,
4DF87C482887D3560003E2D4 /* calib3d */,
4D4CF4612760946500A36D9F /* imgproc */,
4D9A931B26255BDA00F9B43C /* coreml */,

@@ -1919,6 +1926,7 @@
48887410215B639D0079B12E /* cpu */ = {
isa = PBXGroup;
children = (
CE0AD4E32E1FB106002013A8 /* CountMinMaxValue_FP16.S */,
CEA3C8892D6D71E1003EFAD2 /* CPUStft.hpp */,
CEA3C88A2D6D71E1003EFAD2 /* CPUStft.cpp */,
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */,

@@ -2203,6 +2211,9 @@
48C84B6F250F711600EE7666 /* module */ = {
isa = PBXGroup;
children = (
CE0AD4E52E1FB152002013A8 /* ModuleInside.hpp */,
CE0AD4E62E1FB152002013A8 /* MoEModule.hpp */,
CE0AD4E72E1FB152002013A8 /* MoEModule.cpp */,
48C84B71250F711600EE7666 /* PipelineModule.cpp */,
48C84B72250F711600EE7666 /* Module.cpp */,
48C84B73250F711600EE7666 /* WhileModule.hpp */,

@@ -2881,7 +2892,6 @@
CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */,
4DE4E82C275E307B0016A916 /* cv in Headers */,
1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */,
CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */,
1F501F822397BA5B004E8721 /* Interpreter.hpp in Headers */,
C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */,
1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */,

@@ -2892,7 +2902,6 @@
48C84B98250F71E900EE7666 /* CPUSoftmax.hpp in Headers */,
4882C8B8241A22B800DAC168 /* OpCommonUtils.hpp in Headers */,
48608B54250632EC00CB1D71 /* GeometryComputer.hpp in Headers */,
CECF8C7A299CAD9400D3875B /* sha1.h in Headers */,
4894C6EC27016F7200D8BE79 /* CPUResizeCache.hpp in Headers */,
92FF04A623AA0BFB00AC97F6 /* FileLoader.hpp in Headers */,
48F34733273A7C8400C45394 /* ImageProcessFunction.hpp in Headers */,

@@ -2906,7 +2915,6 @@
48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */,
92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
4D9A937826255BDA00F9B43C /* CoreMLBinary.hpp in Headers */,
CECF8C85299CAD9400D3875B /* log_util.h in Headers */,
4D6D7FD52656896600F80814 /* DenseConvolutionTiledExecutor.hpp in Headers */,
4D9A936626255BDA00F9B43C /* CoreMLExecutor.h in Headers */,
92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */,

@@ -2915,7 +2923,6 @@
1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */,
19D0FE76285C66F200B74B1A /* MetalLayerNorm.hpp in Headers */,
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */,
CECF8C86299CAD9400D3875B /* sds.h in Headers */,
1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */,
92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */,
4D9A935B26255BDA00F9B43C /* NeuralNetwork.pb-c.h in Headers */,

@@ -2937,10 +2944,8 @@
481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */,
4894C6EA27016F7200D8BE79 /* UnaryUtils.hpp in Headers */,
EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */,
CECF8C81299CAD9400D3875B /* log_util_imp.h in Headers */,
92FF037623AA0B5A00AC97F6 /* CPUBinary.hpp in Headers */,
4D9A935826255BDA00F9B43C /* FeatureTypes.pb-c.h in Headers */,
CECF8C7C299CAD9400D3875B /* hmac-sha.h in Headers */,
48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */,
489D7A732550FDC800AD896A /* MetalBackend.hpp in Headers */,
92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */,

@@ -2963,7 +2968,6 @@
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
4AF4FB26269ED235005BA97B /* SparseConvInt8TiledExecutor.hpp in Headers */,

@@ -3001,7 +3005,6 @@
92FF03CA23AA0B5A00AC97F6 /* CPUConvolutionDepthwise.hpp in Headers */,
92FF04A923AA0BFB00AC97F6 /* Schedule.hpp in Headers */,
489D7A9F2550FDC900AD896A /* MetalConvolutionCommon.hpp in Headers */,
CECF8C80299CAD9400D3875B /* lz4.h in Headers */,
92FF028623AA0B5A00AC97F6 /* CPUDeconvolution.hpp in Headers */,
489D7A722550FDC800AD896A /* MetalReLU6.hpp in Headers */,
92FF04B523AA0BFB00AC97F6 /* TensorUtils.hpp in Headers */,

@@ -3042,6 +3045,8 @@
92FF026023AA0B5A00AC97F6 /* CPURNNSequenceGRU.hpp in Headers */,
48747D4F245D9E13000B9709 /* CPURaster.hpp in Headers */,
489D7A822550FDC900AD896A /* MetalPReLU.hpp in Headers */,
CE0AD4E82E1FB152002013A8 /* MoEModule.hpp in Headers */,
CE0AD4E92E1FB152002013A8 /* ModuleInside.hpp in Headers */,
48C84B84250F711700EE7666 /* WhileModule.hpp in Headers */,
92FF02A923AA0B5A00AC97F6 /* CPUCropAndResize.hpp in Headers */,
4D6D7FD92656897200F80814 /* SparseConvolutionTiledExecutor.hpp in Headers */,

@@ -3053,24 +3058,20 @@
92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */,
92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */,
92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */,
CECF8C88299CAD9400D3875B /* log_api.h in Headers */,
4A224A0D27D0C2D9000A9260 /* ConvolutionPackWinograd.hpp in Headers */,
4A224A0E27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.hpp in Headers */,
4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */,
48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */,
F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */,
CECF8C5B299CACFD00D3875B /* LogHelper.hpp in Headers */,
92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */,
482BFBCD28351BA1009210E4 /* ShaderMap.hpp in Headers */,
489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */,
CECF8C7F299CAD9400D3875B /* md5.h in Headers */,
92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */,
92FF028023AA0B5A00AC97F6 /* CPUFloatToInt8.hpp in Headers */,
92FF028723AA0B5A00AC97F6 /* CPUFixedPoint.hpp in Headers */,
C43C8227251894F400A0FF84 /* Vec.hpp in Headers */,
4819FB1D24C138DF0050BD09 /* GeometryConvUtils.hpp in Headers */,
489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */,
CECF8C83299CAD9400D3875B /* log_define.h in Headers */,
C48CAE2628900C4A00271A6D /* ConvInt8Winograd.hpp in Headers */,
48F34730273A7C7300C45394 /* CPUImageProcess.hpp in Headers */,
489D7A702550FDC800AD896A /* MetalRaster.hpp in Headers */,

@@ -3391,7 +3392,6 @@
48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */,
6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */,
48958781268EBA6F00EA01A7 /* CPUSegmentMean.cpp in Sources */,
CECF8C7B299CAD9400D3875B /* sha1.c in Sources */,
4D9A937026255BDA00F9B43C /* CoreMLUnary.cpp in Sources */,
92FF04A823AA0BFB00AC97F6 /* AutoTime.cpp in Sources */,
92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */,

@@ -3418,6 +3418,7 @@
48925F342744AC0700919B37 /* CPUROIAlign.cpp in Sources */,
4896D36925FE2A3D00717702 /* Arm82Unary.cpp in Sources */,
4DCF53902892B17100B5B393 /* ShapeHistogram.cpp in Sources */,
CE0AD4EA2E1FB152002013A8 /* MoEModule.cpp in Sources */,
92FF043423AA0B7100AC97F6 /* ShapeStridedSlice.cpp in Sources */,
4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */,
4D4CF46B2760946500A36D9F /* draw.cpp in Sources */,

@@ -3446,7 +3447,6 @@
92FF03CE23AA0B5A00AC97F6 /* CPUOPRegister.cpp in Sources */,
92FF02B323AA0B5A00AC97F6 /* CPUInstanceNorm.cpp in Sources */,
4819FB2C24C1396A0050BD09 /* GeometryPoolGrad.cpp in Sources */,
CECF8C7E299CAD9400D3875B /* log_builder.cpp in Sources */,
92FF042223AA0B7100AC97F6 /* ShapeConcat.cpp in Sources */,
4D6D7FD12656891400F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
4D5662CC299B76ED0031C1A1 /* MNNMaxPoolInt8.S in Sources */,

@@ -3520,11 +3520,11 @@
92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
4AF4FB29269ED244005BA97B /* MNNPackedSparseQuantMatMulEpx1.S in Sources */,
CE0AD4E42E1FB106002013A8 /* CountMinMaxValue_FP16.S in Sources */,
4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */,
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */,
CECF8C7D299CAD9400D3875B /* md5.c in Sources */,
92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,
92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */,
CE072A182C91AEE700F190FD /* MNNGRAYToC4Fast.S in Sources */,

@@ -3588,10 +3588,8 @@
92FF042E23AA0B7100AC97F6 /* ShapeProposal.cpp in Sources */,
92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */,
92FF045B23AA0B7100AC97F6 /* ShapeShape.cpp in Sources */,
CECF8C87299CAD9400D3875B /* sds.c in Sources */,
9560EAD62BDE426A00C8D0B6 /* GeometryLayernorm.cpp in Sources */,
4D6D7FD72656896D00F80814 /* SparseConvolutionTiledExecutor.cpp in Sources */,
CECF8C82299CAD9400D3875B /* log_api.cpp in Sources */,
92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */,
4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */,
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,

@@ -3599,7 +3597,6 @@
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */,
48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
CE072A162C91AEE700F190FD /* MNNBGRAToBGR.S in Sources */,

@@ -3627,7 +3624,6 @@
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */,
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
92FF045623AA0B7100AC97F6 /* ShapeReshape.cpp in Sources */,
92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
92FF044423AA0B7100AC97F6 /* ShapeLSTM.cpp in Sources */,

@@ -3663,7 +3659,6 @@
92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */,
92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */,
CECF8C78299CAD9400D3875B /* log_util_imp.cpp in Sources */,
CE072A152C91AEE700F190FD /* MNNRGBAToGRAYFast.S in Sources */,
92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
952298B22B4D39050043978B /* MetalLoop.mm in Sources */,

@@ -3688,13 +3683,11 @@
92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
4D9A937926255BDA00F9B43C /* CoreMLRaster.cpp in Sources */,
48417FF224D13BF50056D9A7 /* GeometrySelect.cpp in Sources */,
CECF8C84299CAD9400D3875B /* lz4.c in Sources */,
489D7A7E2550FDC900AD896A /* MNNMetalContext.mm in Sources */,
92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */,
92FF036B23AA0B5A00AC97F6 /* CPUResize.cpp in Sources */,
92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */,
92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */,
92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */,

@@ -4168,6 +4161,7 @@
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcdeve;
"PRODUCT_BUNDLE_IDENTIFIER[sdk=iphoneos*]" = com.taobao.mnn.abcdes;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};

@@ -4196,6 +4190,7 @@
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcdeve;
"PRODUCT_BUNDLE_IDENTIFIER[sdk=iphoneos*]" = com.taobao.mnn.abcdes;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};

@@ -4309,4 +4304,3 @@
};
rootObject = 0F1465AE1FA18D1000F9860A /* Project object */;
}
@@ -203,6 +203,7 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
    }
    int tileCount = UP_DIV(mNumHead, mThreadNum);
    int group_size = mNumHead / mKvNumHead;
    mKVCacheManager->setThreadNum(mThreadNum);
    // reduce the value of 'query' to avoid fp16 overflow
    float mScale = 1.0 / sqrt(mHeadDim);
    float q_scale = 1.0;
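For context on the q_scale line: the canonical attention scale is 1/sqrt(headDim), and applying it on the query side before the Q*K^T product keeps intermediate fp16 values from overflowing. A hedged sketch of that idea (illustrative names only, not MNN's internals):

```cpp
// Illustrative: fold the 1/sqrt(d) attention scale into Q up front, so the
// fp16 Q*K^T accumulation never sees the unscaled (potentially large) logits.
#include <cmath>

void prescaleQuery(float* query, int count, int headDim) {
    const float scale = 1.0f / std::sqrt(static_cast<float>(headDim));
    for (int i = 0; i < count; ++i) {
        query[i] *= scale;
    }
}
```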
@@ -50,6 +50,14 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
    CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
    return NO_ERROR;
}

int getMajorCPUNumber(const std::vector<CPUGroup>& groups) {
    int sum = 0;
    for (const auto& g : groups) {
        if (g.cpuType != CPUGroup::Efficient) { sum += g.ids.size(); }
    }
    return sum;
}
void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
    if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0 && avgDiv < mComputeI)) {
        // Avg divide
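A worked example of the helper above, under a hypothetical big.LITTLE layout (all core counts invented): with one Prime core, three Performance cores, and four Efficient cores, it returns 4, so a default thread count derived from it skips the efficiency cores.

```cpp
// Minimal, self-contained sketch of the counting logic in getMajorCPUNumber,
// using a stand-in Group type rather than MNN's CPUGroup.
#include <cstdio>
#include <vector>

struct Group {
    enum Type { Prime, Performance, Efficient };
    Type type;
    std::vector<int> ids;
};

int majorCores(const std::vector<Group>& groups) {
    int sum = 0;
    for (const auto& g : groups) {
        if (g.type != Group::Efficient) {
            sum += (int)g.ids.size(); // count every non-efficiency core
        }
    }
    return sum;
}

int main() {
    std::vector<Group> groups = {
        {Group::Efficient,   {0, 1, 2, 3}},
        {Group::Performance, {4, 5, 6}},
        {Group::Prime,       {7}},
    };
    printf("%d\n", majorCores(groups)); // prints 4
    return 0;
}
```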
@@ -136,13 +144,14 @@ void CPURuntime::_bindCPUCore() const {
}

void CPURuntime::_resetThreadPool() {
    if (mThreadNumber <= 0) { mThreadNumber = getMajorCPUNumber(MNNGetCPUInfo()->groups); }
    mThreadNumber = std::max(1, mThreadNumber);
    mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
#ifdef MNN_USE_THREAD_POOL
    ThreadPool::releaseWorkIndex(mTaskIndex);
    auto cpuInfo = MNNGetCPUInfo();
    int systemThreadNumber = (int)cpuInfo->cpuNumber;
    if (mThreadNumber > 1) {
        int systemThreadNumber = (int)cpuInfo->cpuNumber;
        if (systemThreadNumber == 0) {
            systemThreadNumber = mThreadNumber;
        }

@@ -389,25 +398,18 @@ BufferAllocator* CPURuntime::createDynamicBufferAlloctor(int index) const {
    }
    return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get()));
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags, int initThreadNumber) : Backend(type) {
#ifdef LOG_VERBOSE
    MNN_PRINT("cpu backend create\n");
#endif
    mMemory = memory;
    mRuntime = const_cast<CPURuntime*>(runtime);
    mThreadNumber = mRuntime->mThreadNumber;
    // Compute Group Rate
    do {
void CPUBackend::computeGroupRate() {
    {
        if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
            break;
            return;
        }
        auto rate = mRuntime->hint().cpuDecreaseRate;
        if (rate >= 100 || rate <= 0) {
            break;
            return;
        }
        auto cpuInfo = MNNGetCPUInfo();
        if (cpuInfo->groups.size() < 2) {
            break;
            return;
        }
        if (cpuInfo->i8mm) {
            mComputeI = 28.f;

@@ -435,7 +437,18 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
        for (auto& g : mGroupWithComputeRate) {
            g.first = g.first / totalComputeRate;
        }
    } while (false);
    }
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags, int initThreadNumber) : Backend(type) {
#ifdef LOG_VERBOSE
    MNN_PRINT("cpu backend create\n");
#endif
    mMemory = memory;
    mRuntime = const_cast<CPURuntime*>(runtime);
    mThreadNumber = mRuntime->mThreadNumber;
    // Compute Group Rate
    computeGroupRate();
    // initialize Allocator
    auto dynamicAlloc = mRuntime->mSharedDmaInfo;
    if (nullptr == dynamicAlloc.get()) {
        mDmaInfo.reset(new CPURuntime::DynamicAllocator);
@@ -181,6 +181,7 @@ public:
    void enqueueTask(std::function<int()>&& task);

protected:
    void computeGroupRate();
    MemObj* allocBuffer(size_t size, Tensor* dest, StorageType storageType);
    CoreFunctions* mCoreFunctions;
    CoreInt8Functions* mInt8CoreFunctions;
@@ -38,6 +38,9 @@

#include <algorithm>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>

#include "core/Macro.h"
#ifdef __ANDROID__

@@ -117,7 +120,7 @@ int MNNSetSchedAffinity(const int* cpuIDs, int size) {

// cpuinfo
// Reference from: https://github.com/pytorch/cpuinfo
#if defined(ENABLE_ARMV82) && defined(__arm__)
#if defined(__ANDROID__) && (defined(__arm__) || defined(__aarch64__))

/* As per include/sys/system_properties.h in Android NDK */
#define CPUINFO_HARDWARE_VALUE_MAX 64
@@ -1360,6 +1363,36 @@ const MNNCPUInfo* MNNGetCPUInfo() {
    return gCPUInfo;
}

#ifdef __linux__
// Function to trim leading and trailing spaces from a string
static std::string trim(const std::string& str) {
    size_t first = str.find_first_not_of(" \t");
    if (first == std::string::npos)
        return ""; // Return empty string if all characters are spaces
    size_t last = str.find_last_not_of(" \t");
    return str.substr(first, (last - first + 1));
}
static std::vector<std::string> _fillCpuPart() {
    std::vector<std::string> cpu_parts;
    std::ifstream file("/proc/cpuinfo");
    std::string line;
    if (!file.is_open()) { return cpu_parts; } // return an empty list if the file does not exist
    while (std::getline(file, line)) {
        std::istringstream iss(line);
        std::string key, value;
        if (std::getline(iss, key, ':') && std::getline(iss, value)) {
            key = trim(key);     // Trim leading and trailing spaces from key
            value = trim(value); // Trim leading and trailing spaces from value
            if (key == "CPU part") {
                cpu_parts.push_back(value);
            }
        }
    }
    file.close();
    return cpu_parts;
}
#endif

static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
    cpuinfo_isa->dot = false;
    cpuinfo_isa->fp16arith = false;
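For reference, `_fillCpuPart` keys off lines like these in `/proc/cpuinfo` (values illustrative); the part code identifies the core microarchitecture, so cores with equal codes can safely be treated as the same kind:

```
processor       : 0
CPU implementer : 0x41
CPU part        : 0xd05
...
processor       : 7
CPU implementer : 0x41
CPU part        : 0xd4d
```

On such a device the function would return one part string per listed core, which is what the prime-core matching below compares against.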
@@ -1371,6 +1404,7 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
#ifdef __linux__
    do {
        DIR* root;
        // deal with the CPU policy info and frequency info (maxFreq, minFreq).
        std::string dir = "/sys/devices/system/cpu/cpufreq";
        if ((root = opendir(dir.c_str())) == NULL) {
            break;

@@ -1415,23 +1449,52 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
            }
        }
        closedir(root);
        if (cpuinfo_isa->groups.size() == 0) {
            break;
        }
        std::sort(cpuinfo_isa->groups.begin(), cpuinfo_isa->groups.end(), [](const CPUGroup& left, const CPUGroup& right) {
            return left.maxFreq < right.maxFreq;
        });
        // Merge group if needed
        if (cpuinfo_isa->groups.size() >= 2 && cpuinfo_isa->groups[0].maxFreq == cpuinfo_isa->groups[1].maxFreq) {
            auto backupGroups = std::move(cpuinfo_isa->groups);
            CPUGroup&& current = std::move(backupGroups[0]);
            for (int v = 1; v < backupGroups.size(); ++v) {
                if (backupGroups[v].maxFreq != current.maxFreq) {
                    cpuinfo_isa->groups.emplace_back(current);
                    current = std::move(backupGroups[v]);
                } else {
                    current.ids.insert(current.ids.end(), backupGroups[v].ids.begin(), backupGroups[v].ids.end());
                }
        // do not merge group
        // deal with cpu capacity info
        do {
            dir = "/sys/devices/system/cpu/";
            if (opendir(dir.c_str()) == NULL) {
                break;
            }
            cpuinfo_isa->groups.emplace_back(current);
            for (auto& group : cpuinfo_isa->groups) {
                std::string cpu_name = "cpu" + std::to_string(group.ids[0]);
                MNN::AutoStorage<uint8_t> buffer;
                if (false == _readAll(dir + cpu_name + "/cpu_capacity", buffer)) {
                    continue;
                }
                group.capacity = _readNumber((const char*)buffer.get(), buffer.size())[0];
            }
        } while(false);
        // get CPU part from /proc/cpuinfo
        std::vector<std::string> cpu_parts = _fillCpuPart();
        // classify cpuType
        // 1. get prime maxFreq, minFreq, capacity, /proc/cpuinfo type code
        // 2. All the cores with 1) the same type code; or 2) >=80% freq and capacity, are classified as prime.
        // 3. All the cores with 1) >=60% freq and >=40% capacity; or 2) not the lowest freq, are classified as performance.
        // 4. The rest are classified as efficient.
        const auto& prime_info = cpuinfo_isa->groups.back();
        auto lowest_maxFreq = cpuinfo_isa->groups.front().maxFreq;
        auto lowest_minFreq = cpuinfo_isa->groups.front().minFreq;
        for (auto& group : cpuinfo_isa->groups) {
            if (cpu_parts.empty()) {
                if (((float)group.maxFreq >= 0.8 * (float)prime_info.maxFreq) && ((float)group.capacity >= 0.8 * (float)prime_info.capacity))
                { group.cpuType = CPUGroup::Prime; continue; }
            } else {
                if (cpu_parts[prime_info.ids.front()] == cpu_parts[group.ids.front()])
                { group.cpuType = CPUGroup::Prime; continue; }
            }
            if ((((float)group.maxFreq >= 0.6 * (float)prime_info.maxFreq) && ((float)group.capacity >= 0.4 * (float)prime_info.capacity))
                || (((float)group.minFreq > (float)lowest_minFreq) && ((float)group.maxFreq > (float)lowest_maxFreq)))
            { group.cpuType = CPUGroup::Performance; continue; }
            group.cpuType = CPUGroup::Efficient;
        }
        // count total cpu number and display info
        cpuinfo_isa->cpuNumber = 0;
        for (auto& group : cpuinfo_isa->groups) {
            cpuinfo_isa->cpuNumber += group.ids.size();

@@ -1440,6 +1503,13 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
                message += " " + std::to_string(group.ids[v]) + " ";
            }
            message += "], " + std::to_string(group.minFreq) + " - " + std::to_string(group.maxFreq);
            if (group.capacity != 0) { message += ", capacity: " + std::to_string(group.capacity); }
            message += ", cpu type: ";
            switch (group.cpuType) {
                case CPUGroup::Prime: message += "Prime"; break;
                case CPUGroup::Performance: message += "Performance"; break;
                case CPUGroup::Efficient: message += "Efficient"; break;
            }
            MNN_PRINT("%s\n", message.c_str());
        }
    } while (false);
@@ -12,8 +12,15 @@
#include <vector>
#include "core/Macro.h"
struct CPUGroup {
    uint32_t minFreq;
    uint32_t maxFreq;
    enum CPUCapacityType {
        Prime = 0,
        Performance,
        Efficient
    };
    uint32_t minFreq = 0;
    uint32_t maxFreq = 0;
    uint32_t capacity = 0;
    CPUCapacityType cpuType = Prime;
    std::vector<int> ids;
};
struct MNNCPUInfo {
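A worked example of the classification rules from `_fillInfo` above, for a hypothetical SoC with three cpufreq groups and no usable `/proc/cpuinfo` part codes (all numbers invented):

| Group | maxFreq (kHz) | capacity | Result | Why |
|---|---|---|---|---|
| cpu0-3 | 1800000 | 160 | Efficient | only 56% of the prime maxFreq, and it is the lowest-frequency group |
| cpu4-6 | 2600000 | 420 | Performance | 81% of prime maxFreq and 41% of prime capacity, but short of the 80%/80% prime test |
| cpu7 | 3200000 | 1024 | Prime | highest-frequency group; trivially passes the 80%/80% test against itself |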
@@ -326,10 +326,6 @@ void KVCacheManager::onResize(int kv_num_head, int head_dim) {
    auto core = static_cast<CPUBackend *>(mBackend)->functions();
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
    mBytes = core->bytes;
    mThreadNum = static_cast<CPUBackend *>(mBackend)->threadNumber();
    if (mThreadNum > mKvNumHead) {
        mThreadNum = mKvNumHead;
    }
    if (mConfig.mUseInt8Kernel) {
        static_cast<CPUBackend *>(mBackend)->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8);
    }
@@ -94,6 +94,12 @@ public:
    const Tensor * keySum() {
        return mKeySum.get();
    }
    void setThreadNum(int numThread) {
        mThreadNum = numThread;
        if (mThreadNum > mKvNumHead) {
            mThreadNum = mKvNumHead;
        }
    }
    bool inDisk() {
        return mKVCacheInDisk;
    }
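The clamp in `setThreadNum` reflects that KV-cache work is partitioned per KV head: with, say, 8 backend threads but only 4 KV heads, the extra threads would have no heads to process, so the manager caps itself at 4. Moving this setter into the header lets `CPUAttention::onExecute` (above) refresh the thread count per invocation instead of fixing it once at resize time.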
@@ -62,7 +62,9 @@ if (MNN_KLEIDIAI)
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/)
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/)

    list(APPEND MNN_SOURCES_KLEIDIAI ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c)
    list(APPEND MNN_SOURCES_KLEIDIAI ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c)

@@ -93,9 +95,15 @@ if (MNN_KLEIDIAI)
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.c
        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.c
    )

    set_source_files_properties(${MNN_SOURCES_KLEIDIAI} PROPERTIES COMPILE_OPTIONS -march=armv8.2-a+i8mm+dotprod+sve+sve2+fp16)
    set_source_files_properties(${MNN_SOURCES_KLEIDIAI} PROPERTIES COMPILE_OPTIONS "-fno-tree-vectorize;-march=armv8.2-a+i8mm+dotprod+sve+sve2+fp16")
    set_source_files_properties(${KLEIDIAI_FILES_SME2} PROPERTIES COMPILE_OPTIONS "-fno-tree-vectorize;-march=armv8.2-a+sve+sve2")

endif()
@@ -15,13 +15,11 @@ KleidiAI *KleidiAI::mKaiInstance = NULL;
KleidiAI::StaticInfo KleidiAI::mStaticInfo;

//Get instance.
KleidiAI& KleidiAI::getInstance(const MNNCPUInfo& gCPUInfo, bool bFP16, bool bBF16) {
KleidiAI& KleidiAI::getInstance(const MNNCPUInfo& gCPUInfo) {
    if(!mKaiInstance) {
        mKaiInstance = new KleidiAI;
        mKaiInitialized = true;

        mStaticInfo.mFP16 = bFP16;
        mStaticInfo.mBF16 = bBF16;
        mStaticInfo.mDot = gCPUInfo.dot;
        mStaticInfo.mI8mm = gCPUInfo.i8mm;
        mStaticInfo.mSme2 = gCPUInfo.sme2;

@@ -45,9 +43,11 @@ void KleidiAI::printInfo(AccelType type) {
}

static const char * const names[] = {
    "QI4_ASYM_CHNLQT",
    "QI4_ASYM_BLKQT",
    "QI4_SYM_CHNLQT",
    "QI4_ASYM_CHNLQT_F32",
    "QI4_ASYM_CHNLQT_F16",
    "QI4_ASYM_BLKQT_F32",
    "QI4_ASYM_BLKQT_F16",
    "QI4_SYM_CHNLQT_F32",
    "QI4_SYM_BLKQT",
    "QI8_ASYM_CHNLQT",
    "QI8_ASYM_BLKQT",

@@ -60,18 +60,11 @@ void KleidiAI::printInfo(AccelType type) {

    KernelInfo *pInfo = &mStaticInfo.mKernelInfo[(size_t)type];
    if(pInfo->mKernelSupport) {
        MNN_PRINT("\nKleidiAI is running! AccelType is %s. ", names[(size_t)type]);
        MNN_PRINT("\nKleidiAI is running! AccelType is %s.\n", names[(size_t)type]);
    } else {
        MNN_PRINT("\nKleidiAI cannot accelerate! AccelType is %s. ", names[(size_t)type]);
        MNN_PRINT("\nKleidiAI cannot accelerate! AccelType is %s.\n", names[(size_t)type]);
    }

    if(mStaticInfo.mFP16) {
        MNN_PRINT("Data type is FP16.\n");
    } else if(mStaticInfo.mBF16) {
        MNN_PRINT("Data type is BF16.\n");
    } else {
        MNN_PRINT("Data type is FP32.\n");
    }
}

//Init
@@ -82,52 +75,50 @@ void KleidiAI::initKernelInfo() {
        bool bSupport = false;

        switch(static_cast<AccelType>(type)) {
        case AccelType::QI4_SYM_CHNLQT:
        case AccelType::QI4_SYM_CHNLQT_F32:
        {
            if(!mStaticInfo.mFP16 && !mStaticInfo.mBF16) { //Currently only support FP32.
                if(mStaticInfo.mSme2) {
                    bSupport = true;
                    pParam->mKaiMstepGemv = 1;
                    pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                    pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                    pParam->mKaiMrGemv = 1;
                    pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                    pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                    pParam->mKaiKr = 4;
                    pParam->mKaiSr = 1;
                } else if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
                    bSupport = true;
                    pParam->mKaiMstepGemv = 1;
                    pParam->mKaiMstepGemm = 8;
                    pParam->mKaiNStep = 4;
                    pParam->mKaiMrGemv = 1;
                    pParam->mKaiMrGemm = 4;
                    pParam->mKaiNr = 4;
                    pParam->mKaiKr = 16;
                    pParam->mKaiSr = 2;
                } else {
                    bSupport = false;
                }
            if(mStaticInfo.mSme2) {
                bSupport = true;
                pParam->mKaiMstepGemv = 1;
                pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                pParam->mKaiMrGemv = 1;
                pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa();
                pParam->mKaiKr = 4;
                pParam->mKaiSr = 1;
            } else if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
                bSupport = true;
                pParam->mKaiMstepGemv = 1;
                pParam->mKaiMstepGemm = 8;
                pParam->mKaiNStep = 4;
                pParam->mKaiMrGemv = 1;
                pParam->mKaiMrGemm = 4;
                pParam->mKaiNr = 4;
                pParam->mKaiKr = 16;
                pParam->mKaiSr = 2;
            } else {
                bSupport = false;
            }
            break;
        }
        case AccelType::QI4_ASYM_CHNLQT:
        case AccelType::QI4_ASYM_BLKQT:
        case AccelType::QI4_ASYM_CHNLQT_F32:
        case AccelType::QI4_ASYM_CHNLQT_F16:
        case AccelType::QI4_ASYM_BLKQT_F32:
        case AccelType::QI4_ASYM_BLKQT_F16:
        {
            if(!mStaticInfo.mBF16) { //Currently support FP32 and FP16.
                if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
                    bSupport = true;
                    pParam->mKaiMstepGemv = 1;
                    pParam->mKaiMstepGemm = 8;
                    pParam->mKaiNStep = 4;
                    pParam->mKaiMrGemv = 1;
                    pParam->mKaiMrGemm = 4;
                    pParam->mKaiNr = 4;
                    pParam->mKaiKr = 16;
                    pParam->mKaiSr = 2;
                } else {
                    bSupport = false;
                }
            if(mStaticInfo.mDot && mStaticInfo.mI8mm) {
                bSupport = true;
                pParam->mKaiMstepGemv = 1;
                pParam->mKaiMstepGemm = 8;
                pParam->mKaiNStep = 4;
                pParam->mKaiMrGemv = 1;
                pParam->mKaiMrGemm = 4;
                pParam->mKaiNr = 4;
                pParam->mKaiKr = 16;
                pParam->mKaiSr = 2;
            } else {
                bSupport = false;
            }
            break;
        }

@@ -139,35 +130,31 @@ void KleidiAI::initKernelInfo() {
            break;
        case AccelType::FP16:
        {
            if (mStaticInfo.mFP16 && !mStaticInfo.mBF16) {
                if (mStaticInfo.mSme2) {
                    bSupport = true;
                    pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                    pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                    pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                    pParam->mKaiNr = kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                    pParam->mKaiKr = kai_get_kr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                    pParam->mKaiSr = kai_get_sr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                } else {
                    bSupport = false;
                }
            if (mStaticInfo.mSme2) {
                bSupport = true;
                pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                pParam->mKaiNr = kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                pParam->mKaiKr = kai_get_kr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
                pParam->mKaiSr = kai_get_sr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
            } else {
                bSupport = false;
            }
            break;
        }
        case AccelType::FP32:
        {
            if (!mStaticInfo.mFP16 && !mStaticInfo.mBF16) {
                if (mStaticInfo.mSme2) {
                    bSupport = true;
                    pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                    pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                    pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                    pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                    pParam->mKaiKr = kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                    pParam->mKaiSr = kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                } else {
                    bSupport = false;
                }
            if (mStaticInfo.mSme2) {
                bSupport = true;
                pParam->mKaiMstepGemm = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                pParam->mKaiMrGemm = kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                pParam->mKaiNStep = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                pParam->mKaiNr = kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                pParam->mKaiKr = kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
                pParam->mKaiSr = kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa();
            } else {
                bSupport = false;
            }
            break;
        }
@@ -183,19 +170,21 @@ void KleidiAI::initKernelInfo() {
}

//Get Info
KleidiAI::AccelType KleidiAI::getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize) {
KleidiAI::AccelType KleidiAI::getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize, size_t bytes) {
    static std::map<KleidiAI::QIntInfo, KleidiAI::AccelType> infoMap = {
        {KleidiAI::QIntInfo(4, true, 0), KleidiAI::AccelType::QI4_ASYM_CHNLQT},
        {KleidiAI::QIntInfo(4, true, -1), KleidiAI::AccelType::QI4_ASYM_BLKQT},
        {KleidiAI::QIntInfo(4, false, 0), KleidiAI::AccelType::QI4_SYM_CHNLQT},
        {KleidiAI::QIntInfo(4, false, -1), KleidiAI::AccelType::QI4_SYM_BLKQT},
        {KleidiAI::QIntInfo(8, true, 0), KleidiAI::AccelType::QI8_ASYM_CHNLQT},
        {KleidiAI::QIntInfo(8, true, -1), KleidiAI::AccelType::QI8_ASYM_BLKQT},
        {KleidiAI::QIntInfo(8, false, 0), KleidiAI::AccelType::QI8_SYM_CHNLQT},
        {KleidiAI::QIntInfo(8, false, -1), KleidiAI::AccelType::QI8_SYM_BLKQT},
        {KleidiAI::QIntInfo(4, true, 0, 4), KleidiAI::AccelType::QI4_ASYM_CHNLQT_F32},
        {KleidiAI::QIntInfo(4, true, -1, 4), KleidiAI::AccelType::QI4_ASYM_BLKQT_F32},
        {KleidiAI::QIntInfo(4, false, 0, 4), KleidiAI::AccelType::QI4_SYM_CHNLQT_F32},
        {KleidiAI::QIntInfo(4, true, 0, 2), KleidiAI::AccelType::QI4_ASYM_CHNLQT_F16},
        {KleidiAI::QIntInfo(4, true, -1, 2), KleidiAI::AccelType::QI4_ASYM_BLKQT_F16},
        {KleidiAI::QIntInfo(4, false, -1, -1), KleidiAI::AccelType::QI4_SYM_BLKQT},
        {KleidiAI::QIntInfo(8, true, 0, -1), KleidiAI::AccelType::QI8_ASYM_CHNLQT},
        {KleidiAI::QIntInfo(8, true, -1, -1), KleidiAI::AccelType::QI8_ASYM_BLKQT},
        {KleidiAI::QIntInfo(8, false, 0, -1), KleidiAI::AccelType::QI8_SYM_CHNLQT},
        {KleidiAI::QIntInfo(8, false, -1, -1), KleidiAI::AccelType::QI8_SYM_BLKQT},
    };

    QIntInfo info(bits, bAsymmetric, blockSize);
    QIntInfo info(bits, bAsymmetric, blockSize, bytes);
    auto it = infoMap.find(info);
    if(it != infoMap.end()) {
        return it->second;
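A quick sketch of how the new `bytes` argument selects between the FP32 and FP16 kernel variants in the map above (enum values as declared in KleidiAI.h further down):

```cpp
// Same 4-bit asymmetric, per-channel quantization; different activation
// byte width -> different accelerator type.
auto f32Type = KleidiAI::getQIntAccelType(4, /*bAsymmetric=*/true, /*blockSize=*/0, /*bytes=*/4);
// f32Type == KleidiAI::AccelType::QI4_ASYM_CHNLQT_F32
auto f16Type = KleidiAI::getQIntAccelType(4, /*bAsymmetric=*/true, /*blockSize=*/0, /*bytes=*/2);
// f16Type == KleidiAI::AccelType::QI4_ASYM_CHNLQT_F16
```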
@@ -223,18 +212,16 @@ size_t KleidiAI::getLhsQuantedPackedSize(AccelType type, size_t m, size_t k, siz
    MNN_ASSERT(type >= AccelType::QINT && type <= AccelType::QINT_END);

    switch(type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
        return kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(m, k, getMr(type, m), getKr(type), getSr(type));
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
    {
        if(mStaticInfo.mFP16) {
            return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
        } else {
            return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
        }
    }
    case AccelType::QI4_ASYM_BLKQT_F32:
        return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT_F16:
        return kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, getMr(type, m), getKr(type), getSr(type));
    default:
        MNN_ASSERT(0);
    }

@@ -250,18 +237,16 @@ size_t KleidiAI::getLhsQuantedPackedOffset(AccelType type, size_t m, size_t mIdx
    }

    switch(type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
        return kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(mIdx, k, getMr(type, m), getKr(type), getSr(type));
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
    {
        if(mStaticInfo.mFP16) {
            return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f16_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
        } else {
            return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f32_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
        }
    }
    case AccelType::QI4_ASYM_BLKQT_F32:
        return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f32_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT_F16:
        return kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pscalef32_f16_neon(mIdx, k, bl, getMr(type, m), getKr(type), getSr(type));
    default:
        MNN_ASSERT(0);
    }

@@ -290,17 +275,18 @@ void KleidiAI::runLhsQuantPack(AccelType type, size_t m, size_t k, size_t bl, si
    MNN_ASSERT(type >= AccelType::QINT && type <= AccelType::QINT_END);

    switch(type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
        kai_run_lhs_quant_pack_qai8dxp_f32(m, k, mr, getKr(type), getSr(type), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
        break;
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
        if(mStaticInfo.mFP16) {
            kai_run_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const __fp16 *)lhs, k * sizeof(__fp16), lhsQuantedPacked);
        } else {
            kai_run_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
        }
    case AccelType::QI4_ASYM_BLKQT_F32:
        kai_run_lhs_quant_pack_qsi8d32pscalef32_f32_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const float *)lhs, k * sizeof(float), lhsQuantedPacked);
        break;
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT_F16:
        kai_run_lhs_quant_pack_qsi8d32pscalef32_f16_neon(m, k, bl, mr, getKr(type), getSr(type), 0, (const __fp16 *)lhs, k * sizeof(__fp16), lhsQuantedPacked);
        break;
    default:
        MNN_ASSERT(0);

@@ -310,15 +296,17 @@ void KleidiAI::runLhsQuantPack(AccelType type, size_t m, size_t k, size_t bl, si
//Rhs
size_t KleidiAI::getRhsPackedSize(AccelType type, size_t n, size_t k, size_t bl) {
    switch(type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
        if(mStaticInfo.mSme2) {
            return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon(n, k, getNr(type), getKr(type), getSr(type));
        } else {
            return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(n, k, getNr(type), getKr(type), getSr(type));
        }
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
    case AccelType::QI4_ASYM_BLKQT_F32:
    case AccelType::QI4_ASYM_BLKQT_F16:
        return kai_get_rhs_packed_size_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(n, k, getNr(type), getKr(type), bl);
    case AccelType::FP16:
        return kai_get_rhs_packed_size_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme(n, k);

@@ -336,15 +324,17 @@ size_t KleidiAI::getRhsPackedOffset(AccelType type, size_t nIdx, size_t k, size_
    }

    switch(type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
        if(mStaticInfo.mSme2) {
            return kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon(nIdx, k, getNr(type), getKr(type), getSr(type));
        } else {
            return kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(nIdx, k, getNr(type), getKr(type), getSr(type));
        }
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
    case AccelType::QI4_ASYM_BLKQT_F32:
    case AccelType::QI4_ASYM_BLKQT_F16:
        return kai_get_rhs_packed_offset_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(nIdx, k, getNr(type), getKr(type), bl);
    default:
        MNN_ASSERT(0);

@@ -356,7 +346,7 @@ void KleidiAI::runRhsPack(AccelType type, size_t numGroups, size_t n, size_t k,
                          const void* rhs, const void* scale, const void* zeroPoint, const void* bias,
                          void* rhsPacked) {
    switch(type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
    {
        KleidiAIUtil::rhsPackParamCommon paramCommon;
        if(mStaticInfo.mSme2) {

@@ -370,9 +360,11 @@ void KleidiAI::runRhsPack(AccelType type, size_t numGroups, size_t n, size_t k,
        }
        break;
    }
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
    case AccelType::QI4_ASYM_BLKQT_F32:
    case AccelType::QI4_ASYM_BLKQT_F16:
        struct kai_rhs_pack_nxk_qai4c32p_params params;
        params.lhs_zero_point = 1;
        params.rhs_zero_point = 8;

@@ -401,7 +393,7 @@ void KleidiAI::runMatmul(AccelType type, size_t m, size_t n, size_t k, size_t bl
    KAI_UNUSED(bl);

    switch (type) {
    case AccelType::QI4_SYM_CHNLQT:
    case AccelType::QI4_SYM_CHNLQT_F32:
    {
        if(mStaticInfo.mSme2) {
            if(m == 1) {

@@ -427,29 +419,30 @@ void KleidiAI::runMatmul(AccelType type, size_t m, size_t n, size_t k, size_t bl

        break;
    }
    case AccelType::QI4_ASYM_CHNLQT:
    case AccelType::QI4_ASYM_CHNLQT_F32:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT:
        if(mStaticInfo.mFP16) {
            if(m == 1) {
                kai_run_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
                    (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                    dstStrideRow, dstStrideCol, scalarMin, scalarMax);
            } else {
                kai_run_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
                    (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                    dstStrideRow, dstStrideCol, scalarMin, scalarMax);
            }
    case AccelType::QI4_ASYM_BLKQT_F32:
        if(m == 1) {
            kai_run_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
                (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                dstStrideRow, dstStrideCol, scalarMin, scalarMax);
        } else {
            if(m == 1) {
                kai_run_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
                    (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                    dstStrideRow, dstStrideCol, scalarMin, scalarMax);
            } else {
                kai_run_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
                    (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                    dstStrideRow, dstStrideCol, scalarMin, scalarMax);
            }
            kai_run_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
                (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                dstStrideRow, dstStrideCol, scalarMin, scalarMax);
        }
        break;
    case AccelType::QI4_ASYM_CHNLQT_F16:
        bl = k;
    case AccelType::QI4_ASYM_BLKQT_F16:
        if(m == 1) {
            kai_run_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod(m, n, k, bl,
                (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                dstStrideRow, dstStrideCol, scalarMin, scalarMax);
        } else {
            kai_run_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm(m, n, k, bl,
                (const void *)lhsPacked, (const void *)rhsPacked, (float *)dst,
                dstStrideRow, dstStrideCol, scalarMin, scalarMax);
        }
        break;
    case AccelType::FP16:
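Throughout runMatmul the dispatch pattern is the same: a single-row multiply (m == 1) takes a GEMV-shaped dotprod micro-kernel, anything larger takes the i8mm GEMM tile. A condensed, illustrative skeleton of that pattern (the kernel names below are placeholders, not KleidiAI symbols):

```cpp
#include <cstddef>

// Placeholders standing in for the latency-optimized GEMV kernel and the
// throughput-optimized GEMM kernel selected in the code above.
void run_gemv_dotprod(size_t m, size_t n, size_t k, const void* lhs, const void* rhs, float* dst);
void run_gemm_i8mm(size_t m, size_t n, size_t k, const void* lhs, const void* rhs, float* dst);

void runQuantMatmul(size_t m, size_t n, size_t k, const void* lhs, const void* rhs, float* dst) {
    if (m == 1) {
        run_gemv_dotprod(m, n, k, lhs, rhs, dst); // single row: GEMV shape
    } else {
        run_gemm_i8mm(m, n, k, lhs, rhs, dst);    // multiple rows: tiled GEMM
    }
}
```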
@@ -32,9 +32,11 @@ namespace MNN {
          CHNLQT/BLKQT: channel wise/block wise;
        */
        QINT = 0,
        QI4_ASYM_CHNLQT = QINT,
        QI4_ASYM_BLKQT,
        QI4_SYM_CHNLQT,
        QI4_ASYM_CHNLQT_F32 = QINT,
        QI4_ASYM_CHNLQT_F16,
        QI4_ASYM_BLKQT_F32,
        QI4_ASYM_BLKQT_F16,
        QI4_SYM_CHNLQT_F32,
        QI4_SYM_BLKQT,
        QI8_ASYM_CHNLQT,
        QI8_ASYM_BLKQT,
@@ -72,9 +74,6 @@ namespace MNN {
        } KernelInfo;

        typedef struct StaticInfo {
            bool mFP16 = false; //fp16 or fp32.
            bool mBF16 = false; //bf16 or fp32.

            bool mDot = false;
            bool mI8mm = false;
            bool mSme2 = false;
@@ -87,11 +86,13 @@ namespace MNN {
            size_t mBits;
            bool mAsymmetric;  //Asymmetric quantized model.
            size_t mBlockSize; //0: Per channel quant; others: Per block quant.
            size_t mBytes;     //4: float32; 2: float16.

            QIntInfo(size_t bits = 4, bool asymmetric = false, size_t blockSize = 0) {
            QIntInfo(size_t bits = 4, bool asymmetric = false, size_t blockSize = 0, size_t bytes = 0) {
                mBits = bits;
                mAsymmetric = asymmetric;
                mBlockSize = blockSize;
                mBytes = bytes;
            }

            bool operator<(const QIntInfo& rhs) const {
@@ -103,6 +104,10 @@ namespace MNN {
                    return mAsymmetric < rhs.mAsymmetric;
                }

                if(mBytes != rhs.mBytes) {
                    return mBytes < rhs.mBytes;
                }

                bool lhsPerChannel = mBlockSize == 0 ? true : false;
                bool rhsPerChannel = rhs.mBlockSize == 0 ? true : false;
                return lhsPerChannel < rhsPerChannel;
@@ -115,7 +120,7 @@ namespace MNN {
        static bool mKaiInitialized;

        //Get instance.
        static KleidiAI &getInstance(const MNNCPUInfo& gCPUInfo, bool bFP16, bool bBF16);
        static KleidiAI &getInstance(const MNNCPUInfo& gCPUInfo);
        static KleidiAI &getInstance();
        static void initKernelInfo();

@@ -126,13 +131,12 @@ namespace MNN {
        //Check and set
        bool canAccelerate();
        bool canAccelerate(AccelType type);
        bool canAccelerate(AccelType type, const Convolution2DCommon *common);
        bool isLoaded(AccelType type);
        void setLoaded(AccelType type) { mLoaded[(size_t)type] = true; }
        bool isLinear() { return mLinear; }
        void setLinear(bool bLinear) { mLinear = bLinear; }

        //Get info
        static AccelType getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize);
        static AccelType getQIntAccelType(size_t bits, bool bAsymmetric, size_t blockSize, size_t bytes);
        size_t getMr(AccelType type, size_t m = 1);
        size_t getNr(AccelType type);
        size_t getKr(AccelType type);
@@ -142,9 +146,6 @@ namespace MNN {
        size_t getVecNumPerThread(size_t totalVec, size_t totalThread, size_t minStep);
        //Get Static info
        bool bSupportSme2() { return mStaticInfo.mSme2; }
        bool isFP16() { return mStaticInfo.mFP16; }
        bool isBF16() { return mStaticInfo.mBF16; }
        bool isHalf() { return mStaticInfo.mFP16 || mStaticInfo.mBF16; }

        //Lhs
        size_t getLhsPackedSize(AccelType type, size_t m, size_t k);
@@ -198,6 +199,27 @@ namespace MNN {
        return mStaticInfo.mKernelInfo[(size_t)type].mKernelSupport;
    }

    inline bool KleidiAI::canAccelerate(AccelType type, const Convolution2DCommon* common) {
        if(type >= AccelType::ACC_TYPE_ERROR) {
            return false;
        }
        if(common->group() != 1) {
            return false;
        }
        if(type == AccelType::QI4_ASYM_CHNLQT_F32 || type == AccelType::QI4_ASYM_CHNLQT_F16 || type == AccelType::QI8_ASYM_CHNLQT) {
            if(common->inputCount() % 32 != 0) {
                return false;
            }
        }
        if(common->kernelX() == 1 && common->kernelY() == 1
           && common->padX() == 0 && common->padY() == 0
           && common->strideX() == 1 && common->strideY() == 1
           && common->dilateX() == 1 && common->dilateY() == 1) {
            return mStaticInfo.mKernelInfo[(size_t)type].mKernelSupport;
        }
        return false;
    }

    inline bool KleidiAI::isLoaded(AccelType type) {
        MNN_ASSERT(type < AccelType::ACC_TYPE_NUMBER);
        return mLoaded[(size_t)type];
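The strict weak ordering above lets QIntInfo serve as a lookup key, with the new mBytes field (4 for float32 activations, 2 for float16) distinguishing otherwise identical quantization configurations. A minimal sketch under that assumption, using a trimmed-down copy of the struct rather than the real header:

#include <cstdio>
#include <map>

// Trimmed-down copy of QIntInfo for illustration; the real struct lives in
// KleidiAI's header. operator< compares bits, asymmetry, bytes, then
// per-channel vs per-block, which is enough of an ordering for std::map.
struct QIntInfo {
    size_t mBits;
    bool   mAsymmetric;
    size_t mBlockSize; // 0: per-channel; otherwise per-block
    size_t mBytes;     // 4: float32; 2: float16
    bool operator<(const QIntInfo& rhs) const {
        if (mBits != rhs.mBits) return mBits < rhs.mBits;
        if (mAsymmetric != rhs.mAsymmetric) return mAsymmetric < rhs.mAsymmetric;
        if (mBytes != rhs.mBytes) return mBytes < rhs.mBytes;
        bool lhsPerChannel = (mBlockSize == 0);
        bool rhsPerChannel = (rhs.mBlockSize == 0);
        return lhsPerChannel < rhsPerChannel;
    }
};

int main() {
    std::map<QIntInfo, const char*> accel;
    accel[{4, true, 0,  4}] = "QI4_ASYM_CHNLQT_F32";
    accel[{4, true, 0,  2}] = "QI4_ASYM_CHNLQT_F16";
    accel[{4, true, 32, 4}] = "QI4_ASYM_BLKQT_F32";
    // Same bits/asymmetry/block layout, different activation width:
    printf("%s\n", accel[{4, true, 0, 2}]); // QI4_ASYM_CHNLQT_F16
}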
@@ -41,97 +41,6 @@ inline static size_t kai_rhs_packed_stride(size_t k, size_t nr, size_t kr, size_
    return nr * (num_bytes_per_block * num_blocks_per_row + kai_num_bytes_bias);
}

void KleidiAIUtil::transferNCHWToNC4HW4(float* src, float* dst, size_t rowNum, size_t rowSize) {
    size_t blockNum = rowSize / 4;
    size_t blockSize = 4 * sizeof(float);

    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
        const float *rowSrc = src + blockIndex * 4;
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(dst, rowSrc, blockSize);
            dst += 4;
            rowSrc += rowSize;
        }
    }

    size_t remain = rowSize - blockNum * 4;
    if(remain) {
        const float *rowSrc = src + blockNum * 4;
        size_t remainSize = remain * sizeof(float);
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(dst, rowSrc, remainSize);
            dst += 4;
            rowSrc += rowSize;
        }
    }
}

void KleidiAIUtil::transferNCHWToNC4HW4(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize) {
    size_t blockNum = rowSize / 8;
    size_t blockSize = 8 * sizeof(__fp16);

    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
        const __fp16 *rowSrc = src + blockIndex * 8;
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(dst, rowSrc, blockSize);
            dst += 8;
            rowSrc += rowSize;
        }
    }

    size_t remain = rowSize - blockNum * 8;
    if(remain) {
        const __fp16 *rowSrc = src + blockNum * 8;
        size_t remainSize = remain * sizeof(__fp16);
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(dst, rowSrc, remainSize);
            dst += 8;
            rowSrc += rowSize;
        }
    }
}

void KleidiAIUtil::transferNC4HW4ToNCHW(float* src, float* dst, size_t rowNum, size_t rowSize) {
    size_t blockNum = (rowSize + 3) / 4;
    size_t blockSize = 4 * sizeof(float);

    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
        const float *rowSrc = src + blockIndex * 4 * rowNum;
        float *block_dst = dst + blockIndex * 4;
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(block_dst, rowSrc, blockSize);
            block_dst += rowSize;
            rowSrc += 4;
        }
    }
}

void KleidiAIUtil::transferNC4HW4ToNCHW(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize) {
    size_t blockNum = (rowSize + 7) / 8;
    size_t blockSize = 8 * sizeof(__fp16);

    for(size_t blockIndex = 0; blockIndex < blockNum; blockIndex++) {
        const __fp16 *rowSrc = src + blockIndex * 8 * rowNum;
        __fp16 *block_dst = dst + blockIndex * 8;
        for(size_t rowIndex = 0; rowIndex < rowNum; rowIndex++) {
            memcpy(block_dst, rowSrc, blockSize);
            block_dst += rowSize;
            rowSrc += 8;
        }
    }
    // blockNum is rounded up, so the loop above already covers the whole row.
    // The leftover-tail path used by the NCHW->NC4HW4 variants does not apply
    // here (rowSize - blockNum * 8 would underflow as a size_t), so it is
    // omitted, matching the float overload.
}

// Rhs pack functions for matmul_clamp_f32_qai8dxp_qsi4cxp.
void KleidiAIUtil::packQsi4cxps16s0Qs4cxs0s1(
    size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, const uint8_t* rhs, const float* bias,
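As a sanity check of the transfer logic above, here is a standalone copy of the float NCHW -> NC4HW4 routine run on a tiny 2-row, 6-element input; the remainder path handles the last two columns. This is a self-contained sketch, not the MNN build:

#include <cstdio>
#include <cstring>

// Standalone copy of the float NCHW -> NC4HW4 transfer: rows are split into
// 4-wide blocks, and blocks are interleaved across rows.
static void transferNCHWToNC4HW4(const float* src, float* dst, size_t rowNum, size_t rowSize) {
    size_t blockNum = rowSize / 4;
    for (size_t b = 0; b < blockNum; b++) {
        const float* rowSrc = src + b * 4;
        for (size_t r = 0; r < rowNum; r++) {
            memcpy(dst, rowSrc, 4 * sizeof(float));
            dst += 4;
            rowSrc += rowSize;
        }
    }
    size_t remain = rowSize - blockNum * 4;
    if (remain) {
        const float* rowSrc = src + blockNum * 4;
        for (size_t r = 0; r < rowNum; r++) {
            memcpy(dst, rowSrc, remain * sizeof(float));
            dst += 4; // destination stays 4-aligned; tail slots keep old data
            rowSrc += rowSize;
        }
    }
}

int main() {
    // 2 rows x 6 columns: one full 4-block plus a 2-wide remainder.
    float src[12] = {0, 1, 2, 3, 4, 5,
                     10, 11, 12, 13, 14, 15};
    float dst[16] = {0};
    transferNCHWToNC4HW4(src, dst, 2, 6);
    for (int i = 0; i < 16; i++) printf("%g ", dst[i]);
    printf("\n"); // 0 1 2 3 10 11 12 13 4 5 0 0 14 15 0 0
}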
@@ -48,11 +48,6 @@ namespace MNN {
        uint8_t mRhsZeroPoint = 8;
    };

    static void transferNCHWToNC4HW4(float* src, float* dst, size_t rowNum, size_t rowSize);
    static void transferNCHWToNC4HW4(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize);
    static void transferNC4HW4ToNCHW(float* src, float* dst, size_t rowNum, size_t rowSize);
    static void transferNC4HW4ToNCHW(__fp16* src, __fp16* dst, size_t rowNum, size_t rowSize);

    /// Rhs pack functions for matmul_clamp_f32_qai8dxp_qsi4cxp.
    static void packQsi4cxps16s0Qs4cxs0s1(
        size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr,
@@ -408,111 +408,6 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O

    // dynamic quant
    bool directReadInt4weight = (kernelCount == 1 && ROUND_UP(oc, UNIT) == oc && ROUND_UP(ic, SRC_UNIT) == ic);
#ifdef MNN_KLEIDIAI_ENABLED
    if(quanCommon->canUseInt4) {
        bool bFP16 = gcore->bytes == 2 ? true : false;
        bool bAsym = quanCommon->asymmetric;
        size_t blkSize = mResourceInt8->mBlockNum == 1 ? 0 : ic / mResourceInt8->mBlockNum;
        KleidiAI::AccelType accelType = KleidiAI::getQIntAccelType(4, bAsym, blkSize);

        if (!KleidiAI::mKaiInitialized) {
            KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo(), bFP16, false);
        }

        KleidiAI& kai = KleidiAI::getInstance();
        if(!kai.isLoaded(accelType)) {
            kai.setLoaded(accelType);
            kai.printInfo(accelType);
        }

        if(kai.canAccelerate(accelType)) {
            AutoStorage<int8_t> reorderedQuantInfo;
            reorderedQuantInfo.reset(2 * scaleSize * QUANT_INFO_BYTES + oc * QUANT_INFO_BYTES);
            if (reorderedQuantInfo.get() == nullptr) {
                MNN_ERROR("Memory not enough\n");
                return;
            }

            //Prepare scale and zero data.
            {
                int outputCount = convOp->common()->outputCount();
                int originOffset = -8;
                auto quanInfoPtr = quanCommon->alpha.get();
                auto scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
                auto zeroPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(scalePtr) + scaleSize * QUANT_INFO_BYTES);
                auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(zeroPtr) + scaleSize * QUANT_INFO_BYTES);
                if (quanCommon->asymmetric) {
                    for (int i = 0; i < blockNum; ++i) {
                        auto dstScale = scalePtr + i * ocUp4;
                        auto dstZero = zeroPtr + i * ocUp4;
                        for (int j = 0; j < outputCount; ++j) {
                            int scaleIndex = j * blockNum + i;
                            dstScale[j] = quanInfoPtr[2 * scaleIndex + 1];
                            dstZero[j] = quanInfoPtr[2 * scaleIndex] + (float)originOffset * dstScale[j];
                        }
                    }
                } else {
                    for (int i = 0; i < blockNum; ++i) {
                        auto dstScale = scalePtr + i * ocUp4;
                        auto dstZero = zeroPtr + i * ocUp4;
                        for (int j = 0; j < outputCount; ++j) {
                            int scaleIndex = j * blockNum + i;
                            dstScale[j] = quanInfoPtr[scaleIndex];
                            dstZero[j] = (float)originOffset * dstScale[j];
                        }
                    }
                }
                ::memcpy(biasPtr, convOp->bias()->data(), oc * QUANT_INFO_BYTES);
            }

            mAccelType = accelType;
            int n = oc;
            int k = ic;
            int packedWeightSize = kai.getRhsPackedSize(mAccelType, n, k, blkSize);

            //Alloc packed weight tensor.
            mResourceInt8->mWeightInt8.reset(Tensor::createDevice<uint8_t>({packedWeightSize}));
            bool success = backend->onAcquireBuffer(mResourceInt8->mWeightInt8.get(), Backend::STATIC);

            if (!success) {
                MNN_ERROR("Out of static memory!\n");
                return;
            }

            size_t paraNum = scaleSize;
            float *scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
            float *zeroPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + paraNum;
            float *biasPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + 2 * paraNum;
            //Reload some parameters to fit the ukernels' layout.
            auto quanInfoPtr = quanCommon->alpha.get();
            auto alphaSize = quanCommon->alpha.size();
            if(bAsym) {
                for(int i = 0; i < paraNum; i++) {
                    if(i * 2 >= alphaSize) {
                        zeroPtr[i] = 0;
                        scalePtr[i] = 0;
                    } else {
                        zeroPtr[i] = quanInfoPtr[i * 2];
                        scalePtr[i] = quanInfoPtr[i * 2 + 1];
                    }
                }
            } else {
                if(blkSize != 0) {
                    memcpy(scalePtr, (uint8_t*)quanInfoPtr, paraNum * sizeof(float));
                }
            }

            //Run rhs pack.
            auto weightPackedData = mResourceInt8->mWeightInt8->host<uint8_t>();
            kai.runRhsPack(mAccelType, 1, n, k, blkSize, 0/*unused*/,
                           (uint8_t*)quanCommon->weight.get(),
                           (const void*)scalePtr, (const void*)zeroPtr, (const void*)biasPtr,
                           weightPackedData);
            return;
        }
    }
#endif
    auto target = mResourceInt8;
    // Save bias
    if (convOp->bias()) {
@@ -609,9 +504,6 @@ bool DenseConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution**
    if (!exe->valid()) {
        return false;
    }
#ifdef MNN_KLEIDIAI_ENABLED
    exe->mAccelType = this->mAccelType;
#endif
    *dst = exe;
    return true;
}
@@ -655,38 +547,6 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
    float weightBytes = mResourceInt8->mActBits == 4 ? 0.5 : 1;
    mBlockNum = mResourceInt8->mBlockNum;

#ifdef MNN_KLEIDIAI_ENABLED
    KleidiAI& kai = KleidiAI::getInstance();
    if(mResourceInt8->mDynamicQuant && mResourceInt8->mActBits == 4 && kai.canAccelerate(mAccelType)) {
        MNN_ASSERT(kai.isLoaded(mAccelType));
        const size_t m = inputs[0]->batch();    //lhs vector number.
        const size_t n = outputs[0]->channel(); //rhs vector number.
        const size_t k = inputs[0]->channel();  //vector size.
        const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;

        int packedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
        int elementSize = kai.isHalf() ? sizeof(__fp16) : sizeof(float);
        if(m > 1 && !kai.isLinear()) {
            int srcSize = m * k * elementSize;
            int dstSize = m * n * elementSize;
            int extraSize = srcSize > dstSize ? srcSize : dstSize;
            packedSize += extraSize;
        }

        //Split mTempIm2ColBuffer into two parts for linear/tile transfer:
        //Part0: Lhs_packed.
        //Part1: Lhs/Dst before transfer.
        mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({packedSize}));
        bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
        if (!success) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }

        backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
        return NO_ERROR;
    }
#endif
    CPUConvolution::onResize(inputs, outputs);
    if (mResourceInt8->mDynamicQuant == false) {
        mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
@@ -943,99 +803,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
    auto gcore = static_cast<CPUBackend*>(backend())->functions();
    auto dynamicOption = static_cast<CPUBackend*>(backend())->getRuntime()->hint().dynamicQuantOption;

#ifdef MNN_KLEIDIAI_ENABLED
    KleidiAI& kai = KleidiAI::getInstance();
    if(mResourceInt8->mDynamicQuant && mResourceInt8->mActBits == 4 && kai.canAccelerate(mAccelType)) {
        MNN_ASSERT(kai.isLoaded(mAccelType));
        const size_t m = input->batch();    //lhs vector number.
        const size_t n = output->channel(); //rhs vector number.
        const size_t k = input->channel();  //vector size.
        const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;

        bool bHalf = kai.isHalf();
        size_t elementSize = bHalf ? sizeof(__fp16) : sizeof(float);
        size_t lhsPackedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);

        auto lhs = input->host<uint8_t>();
        auto lhsPacked = mTempIm2ColBuffer->host<int8_t>();
        auto rhsPacked = mResourceInt8->mWeightInt8->host<uint8_t>();
        auto dst = output->host<uint8_t>();

        uint8_t *linearLhs, *linearDst;
        if(m > 1 && !kai.isLinear()) {
            linearLhs = (uint8_t *)lhsPacked + lhsPackedSize;
            linearDst = linearLhs;
        } else {
            linearLhs = lhs;
            linearDst = dst;
        }

        int threadNum = static_cast<CPUBackend*>(backend())->threadNumber();
        int threadNeed, vecPerThread;

        //Dynamic quant pack lhs.
        if(m == 1) {
            kai.runLhsQuantPack(mAccelType, 1, k, blkSize, 1, linearLhs, lhsPacked);
        } else {
            if(!kai.isLinear()) {
                if(bHalf) {
                    KleidiAIUtil::transferNC4HW4ToNCHW((__fp16 *)lhs, (__fp16 *)linearLhs, m, k);
                } else {
                    KleidiAIUtil::transferNC4HW4ToNCHW((float *)lhs, (float *)linearLhs, m, k);
                }
            }

            vecPerThread = kai.getVecNumPerThread(m, threadNum, kai.getMr(mAccelType, m));
            threadNeed = m % vecPerThread == 0 ? m / vecPerThread : (m / vecPerThread + 1);
            size_t srcStride = vecPerThread * k * elementSize;

            auto BatchDynamicQuant = [=, &kai](int tId) {
                auto threadSrc = linearLhs + tId * srcStride;
                auto threadDst = lhsPacked + kai.getLhsQuantedPackedOffset(mAccelType, m, tId * vecPerThread, k, blkSize);
                int vecNum = (tId == threadNeed - 1) ? (m - vecPerThread * tId) : vecPerThread; //The last thread may get fewer than vecPerThread vectors.
                kai.runLhsQuantPack(mAccelType, vecNum, k, blkSize, kai.getMr(mAccelType, m), threadSrc, threadDst);
            };

            MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
                BatchDynamicQuant((int)tId);
            }
            MNN_CONCURRENCY_END();
        }

        //Run matmul.
        if(kai.bSupportSme2() && mAccelType == KleidiAI::AccelType::QI4_SYM_CHNLQT) {
            //SME prefers a single thread for a better performance/power ratio.
            threadNum = 1;
        }

        vecPerThread = kai.getVecNumPerThread(n, threadNum, kai.getNStep(mAccelType));
        threadNeed = n % vecPerThread == 0 ? n / vecPerThread : (n / vecPerThread + 1);

        auto ThreadFunction = [=, &kai](int tId) {
            auto threadRhsPacked = rhsPacked + kai.getRhsPackedOffset(mAccelType, tId * vecPerThread, k, blkSize);
            auto threadDst = linearDst + kai.getDstOffset(0, tId * vecPerThread, n, elementSize);
            int vecNum = (tId == threadNeed - 1) ? (n - vecPerThread * tId) : vecPerThread; //The last thread may get fewer than vecPerThread vectors.
            float scalarMax = bHalf ? FLT16_MAX : FLT_MAX;
            kai.runMatmul(mAccelType, m, vecNum, k, blkSize, lhsPacked, threadRhsPacked, threadDst, n * elementSize, elementSize, scalarMax, -scalarMax);
        };

        MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
            ThreadFunction((int)tId);
        }
        MNN_CONCURRENCY_END();

        if(m > 1 && !kai.isLinear()) {
            if(bHalf) {
                KleidiAIUtil::transferNCHWToNC4HW4((__fp16 *)linearDst, (__fp16 *)dst, m, n);
            } else {
                KleidiAIUtil::transferNCHWToNC4HW4((float *)linearDst, (float *)dst, m, n);
            }
        }

        return NO_ERROR;
    }
#endif

    int UNIT, SRC_UNIT, DST_XUNIT;
    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
    auto blitProc = core->MNNPackC4Int8ForMatMul_A;
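The same work-splitting arithmetic appears in both the lhs quant-pack and the matmul loops above: divide the vectors by getVecNumPerThread, round the thread count up, and let the last thread take the remainder. A self-contained sketch of that arithmetic; getVecNumPerThread is simplified here to ceiling division clamped to minStep, which is an assumption rather than the real heuristic:

#include <cstdio>

// Simplified stand-in for kai.getVecNumPerThread(): split totalVec across
// totalThread, but never hand out fewer than minStep vectors at a time.
static int vecNumPerThread(int totalVec, int totalThread, int minStep) {
    int per = (totalVec + totalThread - 1) / totalThread; // ceiling division
    return per < minStep ? minStep : per;
}

int main() {
    int n = 70, threadNum = 4, nStep = 8;
    int vecPerThread = vecNumPerThread(n, threadNum, nStep); // ceil(70/4) = 18
    int threadNeed = n % vecPerThread == 0 ? n / vecPerThread
                                           : (n / vecPerThread + 1); // 4
    for (int tId = 0; tId < threadNeed; tId++) {
        // The last thread may get fewer than vecPerThread vectors.
        int vecNum = (tId == threadNeed - 1) ? (n - vecPerThread * tId) : vecPerThread;
        printf("thread %d handles %d vectors\n", tId, vecNum);
    }
    // Threads 0..2 handle 18 vectors each, thread 3 handles 16: 18*3 + 16 = 70.
}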
@@ -84,9 +84,6 @@ private:
    bool mIm2ColBasedInt8;
    int mSizeInputBlockQuant;
    bool mToFuseInputbias2Bias;
#ifdef MNN_KLEIDIAI_ENABLED
    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
#endif
};

} // namespace MNN
@@ -52,88 +52,10 @@ Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common
            return;
        }
        core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount);
#ifdef MNN_KLEIDIAI_ENABLED
        if (core->bytes == 2) {
            if (!KleidiAI::mKaiInitialized) {
                KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo(), true, false);
            }
            KleidiAI::AccelType accelType = KleidiAI::AccelType::FP16;
            KleidiAI& kai = KleidiAI::getInstance();
            if (!kai.isLoaded(accelType)) {
                kai.setLoaded(accelType);
                kai.printInfo(accelType);
            }

            if (kai.canAccelerate(accelType)) {
                mAccelType = accelType;
                AutoRelease<Tensor> tempBiasTensor(Tensor::createDevice<float>({outputCount}));
                mValid = b->onAcquireBuffer(tempBiasTensor.get(), Backend::STATIC);
                if (!mValid) {
                    b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
                    MNN_ERROR("Not Enough Memory\n");
                    return;
                }
                core->MNNFp32ToLowp(bias, tempBiasTensor->host<int16_t>(), outputCount);

                int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
                //Alloc packed weight tensor.
                mResource->mWeight.reset(Tensor::createDevice<float>({packedSize}));
                bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
                if (!success) {
                    b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
                    b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
                    MNN_ERROR("Out of static memory!\n");
                    return;
                }

                //Run rhs pack.
                kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(__fp16),
                               tempTensor->host<void>(), nullptr, nullptr, tempBiasTensor->host<void>(),
                               mResource->mWeight->host<void>());
                b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
            } else {
                core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
            }
        } else {
            core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
        }
#else
        core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true);
#endif
        b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
    } else {
#ifdef MNN_KLEIDIAI_ENABLED
        if (!KleidiAI::mKaiInitialized) {
            KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo(), false, false);
        }

        KleidiAI::AccelType accelType = KleidiAI::AccelType::FP32;
        KleidiAI& kai = KleidiAI::getInstance();
        if(!kai.isLoaded(accelType)) {
            kai.setLoaded(accelType);
            kai.printInfo(accelType);
        }

        if (kai.canAccelerate(accelType)) {
            mAccelType = accelType;
            int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
            //Alloc packed weight tensor.
            mResource->mWeight.reset(Tensor::createDevice<float>({packedSize}));
            bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
            if (!success) {
                MNN_ERROR("Out of static memory!\n");
                return;
            }

            //Run rhs pack.
            kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(float),
                           originWeight, nullptr, nullptr, bias, mResource->mWeight->host<void>());
        } else {
            core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);
        }
#else
        core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true);
#endif
    }
}
Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) {
@@ -152,9 +74,6 @@ bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst)
        return true;
    }
    auto exe = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn);
#ifdef MNN_KLEIDIAI_ENABLED
    exe->mAccelType = this->mAccelType;
#endif
    *dst = exe;
    return true;
}
@@ -183,26 +102,6 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
    int maxDepth = 5;
    auto icAlign = UP_DIV(ic, lPack) * lPack;
    auto weightTensor = mResource->mWeight.get();

#ifdef MNN_KLEIDIAI_ENABLED
    KleidiAI& kai = KleidiAI::getInstance();
    if (kai.canAccelerate(mAccelType)) {
        if (batch != 1) {
            int packedSize = kai.getLhsPackedSize(mAccelType, batch, ic);

            mInputResource.reset(Tensor::createDevice<float>({packedSize}));
            bool success = backend()->onAcquireBuffer(mInputResource.get(), Backend::DYNAMIC);
            if (!success) {
                MNN_ERROR("Out of dynamic memory!\n");
                return OUT_OF_MEMORY;
            }

            backend()->onReleaseBuffer(mInputResource.get(), Backend::DYNAMIC);
        }
        return NO_ERROR;
    }
#endif

    mWeightBytes = bytes;
    if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
        std::vector<int> divides(numberThread+1);
@@ -298,24 +197,6 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs,
    auto weightPtr = mResource->mWeight->host<uint8_t>();
    auto biasPtr = mResource->mBias->host<uint8_t>();

#ifdef MNN_KLEIDIAI_ENABLED
    KleidiAI& kai = KleidiAI::getInstance();
    if (kai.canAccelerate(mAccelType)) {
        const size_t m = input->batch();    //lhs vector number.
        const size_t n = output->channel(); //rhs vector number.
        const size_t k = input->channel();  //vector size.
        auto lhsPacked = inputPtr;
        auto dst = output->host<uint8_t>();
        size_t elementSize = kai.isFP16() ? sizeof(__fp16) : sizeof(float);
        if(m != 1) {
            lhsPacked = mInputResource->host<uint8_t>();
            kai.runLhsPack(mAccelType, m, k, 0, inputPtr, k * elementSize, lhsPacked);
        }
        auto postPtr = getPostParameters();
        kai.runMatmul(mAccelType, m, n, k, 0, lhsPacked, weightPtr, dst, n * elementSize, elementSize, postPtr[3], postPtr[2]);
        return NO_ERROR;
    }
#endif
    MNN_CONCURRENCY_BEGIN(tId, size) {
        auto &unit = mUnits[tId];
        if (unit.mValid) {
@@ -26,9 +26,6 @@ public:
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
    std::shared_ptr<CPUConvolution::Resource> mResource;
#ifdef MNN_KLEIDIAI_ENABLED
    std::shared_ptr<Tensor> mInputResource;
#endif

    struct Unit {
        bool mValid = true;
@@ -38,9 +35,6 @@ private:

    std::vector<Unit> mUnits;
    int mWeightBytes = 4;
#ifdef MNN_KLEIDIAI_ENABLED
    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
#endif
};
#endif
} // namespace MNN
@@ -8,6 +8,7 @@

#include "backend/cpu/compute/ConvolutionFloatFactory.h"
#include "backend/cpu/CPUConvolutionDepthwise.hpp"
#include "backend/cpu/CPURuntime.hpp"
#include "backend/cpu/compute/ConvOpt.h"
#include "backend/cpu/compute/Convolution1x1Strassen.hpp"
#include "backend/cpu/compute/ConvolutionGroup.hpp"
@@ -22,6 +23,11 @@
#include "core/OpCommonUtils.hpp"
#include "backend/cpu/OneDNNConvolution.hpp"
#include "backend/cpu/compute/ConvInt8TiledExecutor.hpp"
#ifdef MNN_KLEIDIAI_ENABLED
#include "backend/cpu/compute/KleidiAIConvInt8.hpp"
#include "backend/cpu/compute/KleidiAIConvolution.hpp"
#include "backend/cpu/compute/KleidiAIDenseConvolution.hpp"
#endif //MNN_KLEIDIAI_ENABLED

namespace MNN {

@@ -48,6 +54,41 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
#ifdef MNN_LOW_MEMORY
    if (lowMemory && nullptr != weightQuantInfo.get() && originWeightSize == 0) {
        if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
#ifdef MNN_KLEIDIAI_ENABLED
            do {
                if (!weightQuantInfo->canUseInt4) {
                    break;
                }
                auto convOp = op->main_as_Convolution2D();
                auto core = static_cast<CPUBackend*>(backend)->functions();
                int oc = convOp->common()->outputCount();
                int ic = convOp->common()->inputCount();

                int blockNum = 1;
                int dequantCnt = weightQuantInfo->alphaSize;
                if (weightQuantInfo->asymmetric) {
                    dequantCnt /= 2;
                }
                blockNum = dequantCnt / oc;

                bool bAsym = weightQuantInfo->asymmetric;
                size_t blkSize = blockNum == 1 ? 0 : ic / blockNum;

                KleidiAI::AccelType accelType = KleidiAI::getQIntAccelType(4, bAsym, blkSize, core->bytes);

                KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
                if(!kai.isLoaded(accelType)) {
                    kai.setLoaded(accelType);
                    kai.printInfo(accelType);
                }

                if(!kai.canAccelerate(accelType, convOp->common())) {
                    break;
                }
                return new KleidiAIConvInt8(backend, op, weightQuantInfo, true, kai, accelType, blockNum);
            } while (0);
#endif

            return new DenseConvInt8TiledExecutor(backend, op, weightQuantInfo, true);
        } else {
            return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
@@ -55,14 +96,37 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
    }
#else
    if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
#ifdef MNN_KLEIDIAI_ENABLED
        if (MNNGetCPUInfo()->sme2 && !weightQuantInfo && cpuBackend->functions()->bytes == 4) {
            return new KleidiAIDenseConvolution(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
        }
#else
        return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
#endif
    }
#endif

#ifndef MNN_REDUCE_SIZE
    if (fastWay && cpuBackend->functions()->matmulBytes == 0) {
#ifdef MNN_KLEIDIAI_ENABLED
        auto bytes = cpuBackend->functions()->bytes;
        auto accelType = (bytes == 2) ? KleidiAI::AccelType::FP16 : KleidiAI::AccelType::FP32;
        KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
        if (kai.canAccelerate(accelType)) {
            return new KleidiAIConvolution(common, backend, originWeight, originWeightSize, bias, biasSize);
        }
#endif //MNN_KLEIDIAI_ENABLED

        return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize);
    }
#endif

#ifdef MNN_KLEIDIAI_ENABLED
    if (MNNGetCPUInfo()->sme2 && !weightQuantInfo && cpuBackend->functions()->bytes == 4) {
        return new KleidiAIDenseConvolution(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
    }
#endif

    if (cpuBackend->getRuntime()->hint().winogradMemoryUsed == 0 || (!ConvolutionWinogradBridge::canUseWinograd(common))) {
        return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
    }
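The factory derives the block count purely from the dequant metadata: alphaSize holds one scale per (output channel x block), doubled when a zero point accompanies each scale. A small worked example of that derivation as a standalone sketch, with the concrete values invented for illustration:

#include <cstdio>

int main() {
    // Invented example: oc output channels, ic input channels, an asymmetric
    // int4 model whose alpha buffer stores (zero, scale) pairs.
    int oc = 64, ic = 256;
    int alphaSize = 1024;       // 64 channels * 8 blocks * 2 entries per pair
    bool asymmetric = true;

    int dequantCnt = alphaSize;
    if (asymmetric) {
        dequantCnt /= 2;        // one (zero, scale) pair -> one quant group
    }
    int blockNum = dequantCnt / oc;                     // 512 / 64 = 8
    size_t blkSize = blockNum == 1 ? 0 : ic / blockNum; // 256 / 8 = 32
    printf("blockNum=%d blkSize=%zu\n", blockNum, blkSize); // blockNum=8 blkSize=32
}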
@@ -0,0 +1,306 @@
//
//  SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
//  SPDX-License-Identifier: Apache-2.0
//

#ifdef MNN_KLEIDIAI_ENABLED
#include "KleidiAIConvInt8.hpp"
#include "core/Macro.h"
#include "core/BufferAllocator.hpp"

#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"

#define QUANT_INFO_BYTES 4
namespace MNN {

KleidiAIConvInt8::KleidiAIConvInt8(Backend* backend, const Op* op, std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon, bool isDynamicQuant,
                                   KleidiAI &kai, KleidiAI::AccelType accelType, int32_t blockNum)
    : CPUConvolution(op->main_as_Convolution2D()->common(), backend), kai(kai), mAccelType(accelType), mBlockNum(blockNum) {
    // convolution info
    auto convOp = op->main_as_Convolution2D();
    int oc = convOp->common()->outputCount();
    int ic = convOp->common()->inputCount();

    // backend info
    auto core = static_cast<CPUBackend*>(backend)->functions();
    int pack = core->pack;

    // compute info
    int ocUp4 = ROUND_UP(oc, pack);
    int scaleSize = ocUp4 * mBlockNum;

    // kleidiai info
    bool bFP16 = core->bytes == 2 ? true : false;
    bool bAsym = quanCommon->asymmetric;
    size_t blkSize = mBlockNum == 1 ? 0 : ic / mBlockNum;

    AutoStorage<int8_t> reorderedQuantInfo;
    reorderedQuantInfo.reset(2 * scaleSize * QUANT_INFO_BYTES + oc * QUANT_INFO_BYTES);
    if (reorderedQuantInfo.get() == nullptr) {
        MNN_ERROR("Memory not enough\n");
        return;
    }

    //Prepare scale and zero data.
    {
        int outputCount = convOp->common()->outputCount();
        int originOffset = -8;
        auto quanInfoPtr = quanCommon->alpha.get();
        auto scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
        auto zeroPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(scalePtr) + scaleSize * QUANT_INFO_BYTES);
        auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(zeroPtr) + scaleSize * QUANT_INFO_BYTES);
        if (quanCommon->asymmetric) {
            for (int i = 0; i < blockNum; ++i) {
                auto dstScale = scalePtr + i * ocUp4;
                auto dstZero = zeroPtr + i * ocUp4;
                for (int j = 0; j < outputCount; ++j) {
                    int scaleIndex = j * blockNum + i;
                    dstScale[j] = quanInfoPtr[2 * scaleIndex + 1];
                    dstZero[j] = quanInfoPtr[2 * scaleIndex] + (float)originOffset * dstScale[j];
                }
            }
        } else {
            for (int i = 0; i < blockNum; ++i) {
                auto dstScale = scalePtr + i * ocUp4;
                auto dstZero = zeroPtr + i * ocUp4;
                for (int j = 0; j < outputCount; ++j) {
                    int scaleIndex = j * blockNum + i;
                    dstScale[j] = quanInfoPtr[scaleIndex];
                    dstZero[j] = (float)originOffset * dstScale[j];
                }
            }
        }
        ::memcpy(biasPtr, convOp->bias()->data(), oc * QUANT_INFO_BYTES);
    }

    int n = oc;
    int k = ic;
    int packedWeightSize = kai.getRhsPackedSize(mAccelType, n, k, blkSize);

    //Alloc packed weight tensor.
    mWeightInt8.reset(Tensor::createDevice<uint8_t>({packedWeightSize}));
    bool success = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC);

    if (!success) {
        MNN_ERROR("Out of static memory!\n");
        return;
    }

    size_t paraNum = scaleSize;
    float *scalePtr = reinterpret_cast<float*>(reorderedQuantInfo.get());
    float *zeroPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + paraNum;
    float *biasPtr = reinterpret_cast<float*>(reorderedQuantInfo.get()) + 2 * paraNum;
    //Reload some parameters to fit the ukernels' layout.
    auto quanInfoPtr = quanCommon->alpha.get();
    auto alphaSize = quanCommon->alpha.size();
    if(bAsym) {
        for(int i = 0; i < paraNum; i++) {
            if(i * 2 >= alphaSize) {
                zeroPtr[i] = 0;
                scalePtr[i] = 0;
            } else {
                zeroPtr[i] = quanInfoPtr[i * 2];
                scalePtr[i] = quanInfoPtr[i * 2 + 1];
            }
        }
    } else {
        if(blkSize != 0) {
            memcpy(scalePtr, (uint8_t*)quanInfoPtr, paraNum * sizeof(float));
        }
    }

    //Run rhs pack.
    auto weightPackedData = mWeightInt8->host<uint8_t>();
    kai.runRhsPack(mAccelType, 1, n, k, blkSize, 0/*unused*/,
                   (uint8_t*)quanCommon->weight.get(),
                   (const void*)scalePtr, (const void*)zeroPtr, (const void*)biasPtr,
                   weightPackedData);
    return;
}

KleidiAIConvInt8::KleidiAIConvInt8(Backend* backend, const Op* op, const KleidiAIConvInt8& exe)
    : CPUConvolution(op->main_as_Convolution2D()->common(), backend), kai(exe.kai), mAccelType(exe.mAccelType),
      mWeightInt8(exe.mWeightInt8), mBlockNum(exe.mBlockNum),
      mTempIm2ColBuffer(exe.mTempIm2ColBuffer) {
}

KleidiAIConvInt8::~KleidiAIConvInt8() {
    // Do nothing
}

bool KleidiAIConvInt8::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    auto exe = new KleidiAIConvInt8(bn, op, *this);
    if (!exe->valid()) {
        return false;
    }
    *dst = exe;
    return true;
}

ErrorCode KleidiAIConvInt8::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    // Initialize.
    auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();
    auto b = backend();

    MNN_ASSERT(kai.isLoaded(mAccelType));
    const size_t m = inputs[0]->batch() * inputs[0]->width() * inputs[0]->height(); //lhs vector number.
    const size_t n = outputs[0]->channel(); //rhs vector number.
    const size_t k = inputs[0]->channel();  //vector size.
    const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;

    auto inputOriginFmt = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
    auto outputOriginFmt = TensorUtils::getDescribe(outputs[0])->dimensionFormat;
    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
    if(inputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        mInputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{input->batch(), input->height(), input->width(), input->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
        mValid = b->onAcquireBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
        if (!mValid) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }
    }
    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        mOutputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{output->batch(), output->height(), output->width(), output->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
        mValid = b->onAcquireBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
        if (!mValid) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }
    }

    int packedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
    int elementSize = core->bytes;

    //Split mTempIm2ColBuffer into two parts for linear/tile transfer:
    //Part0: Lhs_packed.
    //Part1: Lhs/Dst before transfer.
    mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({packedSize}));
    bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
    if (!success) {
        MNN_ERROR("Out of dynamic memory!\n");
        return OUT_OF_MEMORY;
    }

    backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);

    if(inputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        b->onReleaseBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
    }
    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        b->onReleaseBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
    }
    return NO_ERROR;
}

ErrorCode KleidiAIConvInt8::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    const auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();

    // Initialize for convert
    auto inputDes = TensorUtils::getDescribe(inputs[0]);
    auto outputDes = TensorUtils::getDescribe(outputs[0]);
    auto b = backend();
    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();

    MNN_ASSERT(kai.isLoaded(mAccelType));
    const size_t m = input->batch() * input->width() * input->height(); //lhs vector number.
    const size_t n = output->channel(); //rhs vector number.
    const size_t k = input->channel();  //vector size.
    const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;

    size_t elementSize = core->bytes;
    size_t lhsPackedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);

    auto lhs = input->host<uint8_t>();
    auto lhsPacked = mTempIm2ColBuffer->host<int8_t>();
    auto rhsPacked = mWeightInt8->host<uint8_t>();

    int threadNum = static_cast<CPUBackend*>(backend())->threadNumber();
    int threadNeed, vecPerThread;

    if(inputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        // Convert input to NHWC format.
        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
            CPUTensorConverter::convert(input, mInputConvertBuffer.get(), core, tId, threadNum);
        };
        MNN_CONCURRENCY_END();
        lhs = mInputConvertBuffer->host<uint8_t>();
    }

    //Dynamic quant pack lhs.
    if(m == 1) {
        kai.runLhsQuantPack(mAccelType, 1, k, blkSize, 1, lhs, lhsPacked);
    } else {
        vecPerThread = kai.getVecNumPerThread(m, threadNum, kai.getMr(mAccelType, m));
        threadNeed = m % vecPerThread == 0 ? m / vecPerThread : (m / vecPerThread + 1);
        size_t srcStride = vecPerThread * k * elementSize;

        auto BatchDynamicQuant = [=](int tId) {
            auto threadSrc = lhs + tId * srcStride;
            auto threadDst = lhsPacked + kai.getLhsQuantedPackedOffset(mAccelType, m, tId * vecPerThread, k, blkSize);
            int vecNum = (tId == threadNeed - 1) ? (m - vecPerThread * tId) : vecPerThread; //The last thread may get fewer than vecPerThread vectors.
            kai.runLhsQuantPack(mAccelType, vecNum, k, blkSize, kai.getMr(mAccelType, m), threadSrc, threadDst);
        };

        MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
            BatchDynamicQuant((int)tId);
        }
        MNN_CONCURRENCY_END();
    }

    //Run matmul.
    auto dst = output->host<uint8_t>();
    if(outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        //Store the matmul result in the convert buffer.
        dst = mOutputConvertBuffer->host<uint8_t>();
    }

    if(kai.bSupportSme2() && mAccelType == KleidiAI::AccelType::QI4_SYM_CHNLQT_F32) {
        //SME prefers a single thread for a better performance/power ratio.
        threadNum = 1;
    }

    vecPerThread = kai.getVecNumPerThread(n, threadNum, kai.getNStep(mAccelType));
    threadNeed = n % vecPerThread == 0 ? n / vecPerThread : (n / vecPerThread + 1);
    auto postPtr = getPostParameters();

    auto ThreadFunction = [=](int tId) {
        auto threadRhsPacked = rhsPacked + kai.getRhsPackedOffset(mAccelType, tId * vecPerThread, k, blkSize);
        auto threadDst = dst + kai.getDstOffset(0, tId * vecPerThread, n, elementSize);
        int vecNum = (tId == threadNeed - 1) ? (n - vecPerThread * tId) : vecPerThread; //The last thread may get fewer than vecPerThread vectors.
        kai.runMatmul(mAccelType, m, vecNum, k, blkSize, lhsPacked, threadRhsPacked, threadDst, n * elementSize, elementSize, postPtr[3], postPtr[2]);
    };

    MNN_CONCURRENCY_BEGIN(tId, threadNeed) {
        ThreadFunction((int)tId);
    }
    MNN_CONCURRENCY_END();

    if(outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        // Convert output from NHWC back to the original format.
        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
            CPUTensorConverter::convert(mOutputConvertBuffer.get(), output, core, tId, threadNum);
        };
        MNN_CONCURRENCY_END();
    }

    return NO_ERROR;
}

} // namespace MNN
#endif //MNN_KLEIDIAI_ENABLED
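The scale/zero preparation above bakes the int4 storage offset into the per-block zero point: weights are stored as unsigned nibbles 0..15, so originOffset = -8 recenters them to -8..7 by folding -8 * scale into the zero once at pack time. A worked example of that fold, as a standalone sketch with invented numbers:

#include <cstdio>

int main() {
    // Invented per-block quant parameters for one output channel.
    float scale = 0.05f;     // dequant scale
    float zero  = 0.10f;     // dequant zero point for the recentered value
    int   originOffset = -8; // unsigned nibble u in 0..15 encodes u - 8

    // Folding the -8 offset into the zero point once at pack time...
    float foldedZero = zero + (float)originOffset * scale; // 0.10 - 0.40 = -0.30

    // ...lets the kernel dequantize straight from the stored nibble:
    int u = 11;                               // stored unsigned nibble
    float viaFold   = u * scale + foldedZero; // 0.55 - 0.30 = 0.25
    float reference = (u - 8) * scale + zero; // 0.15 + 0.10 = 0.25
    printf("%f %f\n", viaFold, reference);    // identical by construction
}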
@@ -0,0 +1,35 @@
//
//  SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
//  SPDX-License-Identifier: Apache-2.0
//

#ifndef KleidiAIConvInt8_hpp
#define KleidiAIConvInt8_hpp
#ifdef MNN_KLEIDIAI_ENABLED
#include "backend/cpu/CPUConvolution.hpp"
#include "Int8FunctionsOpt.h"
#include "CommonOptFunction.h"

namespace MNN {
class KleidiAIConvInt8 : public CPUConvolution {
public:
    KleidiAIConvInt8(Backend* backend, const Op* op, std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon, bool isDynamicQuant, KleidiAI &kai, KleidiAI::AccelType accelType, int32_t blockNum);
    virtual ~KleidiAIConvInt8();
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
    KleidiAIConvInt8(Backend* backend, const Op* op, const KleidiAIConvInt8& exe);
    std::shared_ptr<Tensor> mWeightInt8;
    std::shared_ptr<Tensor> mTempIm2ColBuffer;
    std::shared_ptr<Tensor> mInputConvertBuffer;
    std::shared_ptr<Tensor> mOutputConvertBuffer;
    KleidiAI &kai;
    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;
    int32_t mBlockNum = 1;
};

} // namespace MNN
#endif // MNN_KLEIDIAI_ENABLED
#endif /* KleidiAIConvInt8_hpp */
@@ -0,0 +1,232 @@
//
//  SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
//  SPDX-License-Identifier: Apache-2.0
//

#ifdef MNN_KLEIDIAI_ENABLED
#include "KleidiAIConvolution.hpp"
#include <string.h>
#include "core/BufferAllocator.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"

namespace MNN {
#ifndef MNN_REDUCE_SIZE

KleidiAIConvolution::KleidiAIConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                                         size_t originWeightSize, const float *bias, size_t biasSize)
    : CPUConvolution(common, b) {

    auto outputCount = (int)biasSize;
    auto core = static_cast<CPUBackend*>(b)->functions();
    mResource.reset(new CPUConvolution::Resource);
    mResource->backend = b;
    auto mSrcCount = (int)originWeightSize / outputCount;
    if (!mResource->copyBiasAlign(bias, (int)biasSize)) {
        MNN_ERROR("Not Enough Memory\n");
        mValid = false;
        return;
    }
    if (b->getRuntime()->hint().useCachedMmap > 1) {
        return;
    }
    KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());

    if (core->bytes == 2) {
        AutoRelease<Tensor> tempTensor(Tensor::createDevice<float>({outputCount * mSrcCount}));
        mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC);
        if (!mValid) {
            MNN_ERROR("Not Enough Memory\n");
            return;
        }
        core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount);

        KleidiAI::AccelType accelType = KleidiAI::AccelType::FP16;
        if (!kai.isLoaded(accelType)) {
            kai.setLoaded(accelType);
            kai.printInfo(accelType);
        }

        mAccelType = accelType;
        AutoRelease<Tensor> tempBiasTensor(Tensor::createDevice<float>({outputCount}));
        mValid = b->onAcquireBuffer(tempBiasTensor.get(), Backend::STATIC);
        if (!mValid) {
            b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
            MNN_ERROR("Not Enough Memory\n");
            return;
        }
        core->MNNFp32ToLowp(bias, tempBiasTensor->host<int16_t>(), outputCount);

        int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
        //Alloc packed weight tensor.
        mResource->mWeight.reset(Tensor::createDevice<int8_t>({packedSize}));
        bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
        if (!success) {
            b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
            b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
            MNN_ERROR("Out of static memory!\n");
            return;
        }

        //Run rhs pack.
        kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(__fp16),
                       tempTensor->host<void>(), nullptr, nullptr, tempBiasTensor->host<void>(),
                       mResource->mWeight->host<void>());
        b->onReleaseBuffer(tempBiasTensor.get(), Backend::STATIC);
        b->onReleaseBuffer(tempTensor.get(), Backend::STATIC);
    } else {
        KleidiAI::AccelType accelType = KleidiAI::AccelType::FP32;
        if(!kai.isLoaded(accelType)) {
            kai.setLoaded(accelType);
            kai.printInfo(accelType);
        }
        mAccelType = accelType;
        int packedSize = kai.getRhsPackedSize(mAccelType, outputCount, mSrcCount, 0);
        //Alloc packed weight tensor.
        mResource->mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{packedSize}));
        mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
        if (!mValid) {
            MNN_ERROR("Out of static memory!\n");
            return;
        }

        //Run rhs pack.
        kai.runRhsPack(mAccelType, 1, outputCount, mSrcCount, 0, mSrcCount * sizeof(float),
                       originWeight, nullptr, nullptr, bias, mResource->mWeight->host<void>());
    }

}

KleidiAIConvolution::KleidiAIConvolution(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) {
    mResource = resource;
}

KleidiAIConvolution::~KleidiAIConvolution() {
    // Do nothing
}

bool KleidiAIConvolution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    auto exe = new KleidiAIConvolution(mResource, op->main_as_Convolution2D()->common(), bn);
    exe->mAccelType = this->mAccelType;
    *dst = exe;
    return true;
}

ErrorCode KleidiAIConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    auto input = inputs[0];
    auto output = outputs[0];
    auto inputDes = TensorUtils::getDescribe(inputs[0]);
    auto outputDes = TensorUtils::getDescribe(outputs[0]);
    auto ic = input->channel();
    auto oc = output->channel();
    auto batch = input->batch();
    auto b = backend();

    KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
    auto inputOriginFmt = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
    auto outputOriginFmt = TensorUtils::getDescribe(outputs[0])->dimensionFormat;
    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
    if(inputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        mInputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{input->batch(), input->height(), input->width(), input->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
        mValid = b->onAcquireBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
        if (!mValid) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }
    }
    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        mOutputConvertBuffer.reset(Tensor::createDevice(std::vector<int>{output->batch(), output->height(), output->width(), output->channel()}, dataType, Tensor::DimensionType::TENSORFLOW));
        mValid = b->onAcquireBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
        if (!mValid) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }
    }

    auto m = batch * input->width() * input->height();
    if (m != 1) {
        int packedSize = kai.getLhsPackedSize(mAccelType, m, ic);

        mInputResource.reset(Tensor::createDevice<float>({packedSize}));
        bool success = backend()->onAcquireBuffer(mInputResource.get(), Backend::DYNAMIC);
        if (!success) {
            MNN_ERROR("Out of dynamic memory!\n");
            return OUT_OF_MEMORY;
        }

        b->onReleaseBuffer(mInputResource.get(), Backend::DYNAMIC);
    }

    if(inputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        b->onReleaseBuffer(mInputConvertBuffer.get(), Backend::DYNAMIC);
    }
    if (outputOriginFmt != MNN_DATA_FORMAT_NHWC) {
        b->onReleaseBuffer(mOutputConvertBuffer.get(), Backend::DYNAMIC);
    }
    return NO_ERROR;
}
ErrorCode KleidiAIConvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input = inputs[0];
    auto output = outputs[0];
    auto core = static_cast<CPUBackend*>(backend())->functions();
    auto inputPtr = input->host<uint8_t>();
    auto weightPtr = mResource->mWeight->host<uint8_t>();
    int threadNum = static_cast<CPUBackend*>(backend())->threadNumber();

    KleidiAI& kai = KleidiAI::getInstance(*MNNGetCPUInfo());
    const size_t m = input->batch() * input->width() * input->height(); //lhs vector number.
    const size_t n = output->channel(); //rhs vector number.
    const size_t k = input->channel();  //vector size.
    auto dst = output->host<uint8_t>();
    halide_type_t dataType = core->bytes == 2 ? halide_type_of<int16_t>() : halide_type_of<float>();
    size_t elementSize = core->bytes;
    auto b = backend();

    auto inputDes = TensorUtils::getDescribe(inputs[0]);
    if(inputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
            CPUTensorConverter::convert(input, mInputConvertBuffer.get(), core, tId, threadNum);
        };
        MNN_CONCURRENCY_END();
        inputPtr = mInputConvertBuffer->host<uint8_t>();
    }
    auto lhsPacked = inputPtr;
    if(m != 1) {
        lhsPacked = mInputResource->host<uint8_t>();
        kai.runLhsPack(mAccelType, m, k, 0, inputPtr, k * elementSize, lhsPacked);
    }

    auto outputDes = TensorUtils::getDescribe(outputs[0]);
    auto postPtr = getPostParameters();
    auto outputPtr = output->host<uint8_t>();
    if(outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        outputPtr = mOutputConvertBuffer->host<uint8_t>();
    }

    kai.runMatmul(mAccelType, m, n, k, 0, lhsPacked, weightPtr, outputPtr, n * elementSize, elementSize, postPtr[3], postPtr[2]);

    if(outputDes->dimensionFormat != MNN_DATA_FORMAT_NHWC) {
        MNN_CONCURRENCY_BEGIN(tId, threadNum) {
            CPUTensorConverter::convert(mOutputConvertBuffer.get(), output, core, tId, threadNum);
        };
        MNN_CONCURRENCY_END();
    }

    return NO_ERROR;
}

#endif
} // namespace MNN
#endif //MNN_KLEIDIAI_ENABLED
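KleidiAIConvolution flattens batch, height, and width into a single m dimension: for a 1x1 convolution with unit stride and no padding, every output pixel is an independent dot product, so the whole op is one [m, k] x [k, n] matmul over NHWC data. A naive reference of that equivalence, as a standalone sketch rather than the ukernel:

#include <cstdio>
#include <vector>

// Naive reference for a 1x1 convolution over NHWC data: with kernel 1x1,
// stride 1, and no padding, output[p][n] = sum_k input[p][k] * weight[k][n],
// where p runs over batch * height * width flattened pixels.
int main() {
    const int batch = 2, height = 2, width = 2, ic = 3, oc = 4;
    const int m = batch * height * width; // flattened lhs vector count
    std::vector<float> input(m * ic, 1.0f);   // NHWC, all ones
    std::vector<float> weight(ic * oc, 0.5f); // [k][n] layout, all 0.5
    std::vector<float> output(m * oc, 0.0f);

    for (int p = 0; p < m; p++) {            // one output pixel per row
        for (int n = 0; n < oc; n++) {
            float acc = 0.0f;
            for (int k = 0; k < ic; k++) {
                acc += input[p * ic + k] * weight[k * oc + n];
            }
            output[p * oc + n] = acc;        // 3 * 1.0 * 0.5 = 1.5
        }
    }
    printf("output[0][0] = %g\n", output[0]); // 1.5
}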
@@ -0,0 +1,37 @@
//
//  SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
//  SPDX-License-Identifier: Apache-2.0
//

#ifndef KleidiAIConvolution_hpp
#define KleidiAIConvolution_hpp
#ifdef MNN_KLEIDIAI_ENABLED
#include <functional>
#include "backend/cpu/CPUConvolution.hpp"
namespace MNN {
#ifndef MNN_REDUCE_SIZE

class KleidiAIConvolution : public CPUConvolution {
public:
    KleidiAIConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize);
    KleidiAIConvolution(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b);
    virtual ~KleidiAIConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
    std::shared_ptr<Tensor> mInputResource;
    std::shared_ptr<Tensor> mInputConvertBuffer;
    std::shared_ptr<Tensor> mOutputConvertBuffer;
    std::shared_ptr<CPUConvolution::Resource> mResource;
    KleidiAI::AccelType mAccelType = KleidiAI::AccelType::ACC_TYPE_NUMBER;

};
#endif // MNN_REDUCE_SIZE

} // namespace MNN
#endif
#endif /* KleidiAIConvolution_hpp */
@@ -0,0 +1,320 @@
#if MNN_KLEIDIAI_ENABLED
#include "KleidiAIDenseConvolution.hpp"

#include <numeric>

#include "CommonOptFunction.h"
#include "MNN/ErrorCode.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"

namespace MNN {
template <typename T>
static void initWeight(const T* weight, const T* bias, T* cache, T* output, const std::vector<int>& shape,
                       const int bytes) {
    ::memset(cache, 0, sizeof(T) * std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
    ConvertOIHWToHWIO(cache, weight, shape);
    auto outputCount = shape[0];
    auto srcCount    = shape[1];
    auto kh          = shape[2];
    auto kw          = shape[3];
    if (bytes == 4) {
        kai_run_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(outputCount, kh * kw, srcCount, outputCount * sizeof(T),
                                                            cache, bias, output);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
}

KleidiAIDenseConvolution::KleidiAIDenseConvolution(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize,
                                                   std::shared_ptr<ConvolutionCommon::Int8Common> int8Info)
    : ConvolutionTiledExecutor(b, bias, biasSize) {
    auto outputCount = (int)biasSize;
    auto core        = static_cast<CPUBackend*>(b)->functions();
    int bytes        = core->bytes;
    auto srcCount    = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
    if (core->matmulBytes != 0) {
        bytes = core->matmulBytes;
    }

    int kai_rhs_packed_size = 0;
    if (core->bytes == 4) {
        kai_rhs_packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
            outputCount, common->kernelY() * common->kernelX(), srcCount);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({kai_rhs_packed_size}));
    mResource->mBias.reset(Tensor::createDevice<uint8_t>({outputCount * core->bytes}));

    mValid = mValid && backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }

    std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>(
        {outputCount, srcCount * common->kernelX() * common->kernelY(), (int)sizeof(float)})); // cache must be float
    mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }

    std::vector<int> oihwShape = {outputCount, srcCount, common->kernelY(), common->kernelX()};
    if (core->bytes == 4) {
        MNN::initWeight(originWeight, bias, cache->host<float>(), mResource->mWeight->host<float>(), oihwShape,
                        core->bytes);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }

    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
    mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b, mResource.get()));
}

KleidiAIDenseConvolution::KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res,
                                                   const Convolution2DCommon* common, Backend* b)
    : ConvolutionTiledExecutor(res, b) {
    mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b, mResource.get()));
}

KleidiAIDenseConvolution::~KleidiAIDenseConvolution() {
    // Do nothing
}

bool KleidiAIDenseConvolution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    auto dense = new KleidiAIDenseConvolution(mResource, op->main_as_Convolution2D()->common(), bn);
    dense->mProxy->mConvPerfconfig = mProxy->mConvPerfconfig;
    *dst = dense;
    return true;
}

ErrorCode KleidiAIDenseConvolution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto code = mProxy->onExecute(mInputs, outputs);
    return code;
}
ErrorCode KleidiAIDenseConvolution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
    auto code = mProxy->onResize(mInputs, outputs);
    if (NO_ERROR != code) {
        return code;
    }
    return NO_ERROR;
}

ErrorCode KleidiAIDenseConvolutionMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    auto function = static_cast<CPUBackend*>(backend())->functions();
    if (nullptr != mTempBias) {
        ::memset(mTempBias->host<float>(), 0, mTempBias->elementSize() * function->bytes);
        if (inputs.size() > 2) {
            ::memcpy(mTempBias->host<float>(), inputs[2]->host<float>(), inputs[2]->elementSize() * function->bytes);
        }
    }
    auto cache  = mTempWeightCache->host<float>();
    auto source = inputs[1]->host<float>();
    if (function->bytes == 4) {
        initWeight(source, mInputs[2]->host<float>(), cache, mTempWeight->host<float>(), inputs[1]->shape(),
                   function->bytes);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    return mProxy->onExecute(mInputs, outputs);
}
ErrorCode KleidiAIDenseConvolutionMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                       const std::vector<Tensor*>& outputs) {
    int depth       = inputs[1]->channel();
    int outputCount = outputs[0]->channel();
    auto function   = static_cast<CPUBackend*>(backend())->functions();
    if (function->bytes == 4) {
        int kai_rhs_packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
            outputCount, inputs[1]->stride(1), depth);
        mTempWeight.reset(Tensor::createDevice<uint8_t>({kai_rhs_packed_size}));
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    mTempWeightCache.reset(Tensor::createDevice<float>(
        {inputs[1]->height(), inputs[1]->width(), inputs[1]->channel(), inputs[1]->batch()}));
    auto res = backend()->onAcquireBuffer(mTempWeight.get(), Backend::DYNAMIC);
    res      = res && backend()->onAcquireBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    mTempBias.reset();
    if (!res) {
        return OUT_OF_MEMORY;
    }
    if (inputs.size() > 2 && inputs[2]->elementSize() % function->pack == 0) {
        mInputs = {inputs[0], mTempWeight.get(), inputs[2]};
    } else {
        mTempBias.reset(Tensor::createDevice<float>({UP_DIV(outputCount, function->pack) * function->pack}));
        backend()->onAcquireBuffer(mTempBias.get(), Backend::DYNAMIC);
        mInputs = {inputs[0], mTempWeight.get(), mTempBias.get()};
    }
    backend()->onReleaseBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    auto errorCode = mProxy->onResize(mInputs, outputs);
    backend()->onReleaseBuffer(mTempWeight.get(), Backend::DYNAMIC);
    if (nullptr != mTempBias) {
        backend()->onReleaseBuffer(mTempBias.get(), Backend::DYNAMIC);
    }
    return errorCode;
}

ErrorCode KleidiAIDenseConvolutionImpl::onResize(const std::vector<Tensor*>& inputs,
                                                 const std::vector<Tensor*>& outputs) {
    CPUConvolution::onResize(inputs, outputs);
    auto input  = inputs[0];
    auto weight = inputs[1];
    Tensor* bias = nullptr;
    if (inputs.size() > 2) {
        bias = inputs[2];
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    int bytes = core->bytes;
    int matmulBytes = bytes;
    if (core->matmulBytes != 0) {
        matmulBytes = core->matmulBytes;
    }
    auto ic     = input->channel();
    auto output = outputs[0];
    auto batch  = output->batch();

    auto outputChannel = output->channel();
    auto kernelSize    = mCommon->kernelX() * mCommon->kernelY();

    mTempBufferTranspose.buffer().type       = halide_type_of<uint8_t>();
    mTempBufferTranspose.buffer().dimensions = 1;
    int outputNhwSize = batch * output->height() * output->width();
    if (core->bytes == 4) {
        mTempBufferTranspose.buffer().dim[0].extent =
            kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(outputNhwSize, kernelSize, ic);
    } else {
        MNN_ERROR("Not fp32, should not be called here\n");
        abort();
    }
    TensorUtils::setLinearLayout(&mTempBufferTranspose);

    bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    TensorUtils::getDescribe(&mOutputNHWC)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mOutputNHWC.buffer().dimensions    = 4;
    mOutputNHWC.buffer().dim[0].extent = output->batch();
    mOutputNHWC.buffer().dim[1].extent = output->height();
    mOutputNHWC.buffer().dim[2].extent = output->width();
    mOutputNHWC.buffer().dim[3].extent = output->channel();
    mOutputNHWC.buffer().type          = output->getType();
    success = backend()->onAcquireBuffer(&mOutputNHWC, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    TensorUtils::getDescribe(&mInputNHWC)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mInputNHWC.buffer().dimensions    = 4;
    mInputNHWC.buffer().dim[0].extent = input->batch();
    mInputNHWC.buffer().dim[1].extent = input->height();
    mInputNHWC.buffer().dim[2].extent = input->width();
    mInputNHWC.buffer().dim[3].extent = input->channel();
    mInputNHWC.buffer().type          = input->getType();
    success = backend()->onAcquireBuffer(&mInputNHWC, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    TensorUtils::getDescribe(&mPadBuffer)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
    mPadBuffer.buffer().dimensions    = 1;
    mPadBuffer.buffer().dim[0].extent = input->channel();
    mPadBuffer.buffer().type          = input->getType();
    TensorUtils::setLinearLayout(&mPadBuffer);
    success = backend()->onAcquireBuffer(&mPadBuffer, Backend::DYNAMIC);
    if (!success) {
        return OUT_OF_MEMORY;
    }

    backend()->onReleaseBuffer(&mOutputNHWC, Backend::DYNAMIC);
    backend()->onReleaseBuffer(&mInputNHWC, Backend::DYNAMIC);
    backend()->onReleaseBuffer(&mPadBuffer, Backend::DYNAMIC);

    backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);

    auto postParameters = getPostParameters();
    mFunction.first     = ((CPUBackend*)backend())->threadNumber();

    auto padFull = ConvolutionCommon::convolutionPadFull(input, output, mCommon);
    ConvParams params{
        .inputChannel  = ic,
        .outputChannel = outputChannel,
        .kernelHeight  = mCommon->kernelY(),
        .kernelWidth   = mCommon->kernelX(),
        .strideHeight  = mCommon->strideY(),
        .strideWidth   = mCommon->strideX(),
        .padTop        = std::get<1>(padFull),
        .padBottom     = std::get<3>(padFull),
        .padLeft       = std::get<0>(padFull),
        .padRight      = std::get<2>(padFull),
        .dilatedHeight = mCommon->dilateY(),
        .dilatedWidth  = mCommon->dilateX(),
    };

    mFunction.second = [=](int tid) {
        // Convert NC4HW4 to NHWC
        auto inputShape = input->shape(); // TODO check for NC4HW4, should be the NCHW
        CPUTensorConverter::convert(input, &mInputNHWC, core);
        // Lhs packing
        if (bytes == 4) {
            int blockSize = kai_get_m_step_lhs_imatmul_pack_x32p2vlx1_x32p_sme();
            ::memset(mPadBuffer.host<float>(), 0, params.inputChannel * sizeof(float));
            auto table = IndirectionTable<float>(mInputNHWC.shape(), params, mInputNHWC.host<float>(),
                                                 mPadBuffer.host<float>(), blockSize);
            kai_run_lhs_imatmul_pack_x32p2vlx1_x32p_sme(outputNhwSize, kernelSize, ic, table.data.data(), 0,
                                                        mPadBuffer.host<uint8_t>(),
                                                        mTempBufferTranspose.host<uint8_t>());
        } else {
            MNN_ERROR("Not fp32, should not be called here\n");
            abort();
        }

        // Run Matmul
        if (bytes == 4) {
            kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa(
                outputNhwSize, outputChannel, kernelSize, ic, mTempBufferTranspose.host<uint8_t>(),
                weight->host<uint8_t>(), mOutputNHWC.host<uint8_t>(), outputChannel * sizeof(float), postParameters[2],
                postParameters[3]);
        } else {
            MNN_ERROR("Not fp32, should not be called here\n");
            abort();
        }

        // Convert NHWC to NC4HW4
        CPUTensorConverter::convert(&mOutputNHWC, output, core);
    };
    return NO_ERROR;
}

ErrorCode KleidiAIDenseConvolutionImpl::onExecute(const std::vector<Tensor*>& inputs,
                                                  const std::vector<Tensor*>& outputs) {
    mFunction.second(0);
    return NO_ERROR;
}
} // namespace MNN
#endif

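The resize path above boils the convolution down to one indirect matmul: LHS packing walks the indirection table instead of materializing im2col, and the SME2 MOPA kernel consumes the packed buffers directly. The standalone sketch below (illustrative values only, not MNN code; all names are local to the snippet) works out the matrix dimensions for one example convolution; the output-size arithmetic mirrors ConvParams::getOutputSize in the header that follows.

    #include <cstdio>

    int main() {
        // Hypothetical convolution, chosen only for illustration.
        int batch = 1, inputChannel = 64, inputHeight = 56, inputWidth = 56;
        int outputChannel = 128, kernelH = 3, kernelW = 3, stride = 1, pad = 1;

        // Same formula as ConvParams::getOutputSize (dilation = 1 here).
        int outputHeight = (inputHeight + 2 * pad - kernelH) / stride + 1; // 56
        int outputWidth  = (inputWidth  + 2 * pad - kernelW) / stride + 1; // 56

        // Dimensions handed to the lhs-pack / imatmul calls above:
        long M = (long)batch * outputHeight * outputWidth; // outputNhwSize = 3136 rows
        long N = outputChannel;                            // 128 output columns
        long K = (long)inputChannel * kernelH * kernelW;   // ic * kernelSize = 576 accumulation depth

        std::printf("imatmul dims: M=%ld N=%ld K=%ld\n", M, N, K);
        return 0;
    }
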
@@ -0,0 +1,245 @@
#if MNN_KLEIDIAI_ENABLED

#ifndef KleidiAIDenseConvolution_hpp
#define KleidiAIDenseConvolution_hpp

#include "ConvolutionTiledExecutor.hpp"
#include "backend/cpu/CPUConvolution.hpp"

namespace MNN {
struct ConvParams {
    int inputChannel;
    int outputChannel;
    int kernelHeight;
    int kernelWidth;
    int strideHeight;
    int strideWidth;
    int padTop;
    int padBottom;
    int padLeft;
    int padRight;
    int dilatedHeight;
    int dilatedWidth;

    struct Size2D {
        int height;
        int width;
    };

    Size2D getOutputSize(int inputHeight, int inputWidth) const {
        auto kernelSizeWithDilated = [](int kernel, int dilated) { return kernel + (kernel - 1) * (dilated - 1); };
        auto outputSize = [](int input, int pad1, int pad2, int kernel, int stride) {
            int t = (input + pad1 + pad2 - kernel);
            return t / stride + 1;
        };

        int dilatedKernelHeight = kernelSizeWithDilated(kernelHeight, dilatedHeight);
        int dilatedKernelWidth  = kernelSizeWithDilated(kernelWidth, dilatedWidth);

        int outputHeight = outputSize(inputHeight, padTop, padBottom, dilatedKernelHeight, strideHeight);
        int outputWidth  = outputSize(inputWidth, padLeft, padRight, dilatedKernelWidth, strideWidth);

        return {outputHeight, outputWidth};
    }
};

template <typename T>
struct IndirectionTable {
    std::vector<const void*> data;
    int height;
    int width;
    int blockSize;

    /// Creates an indirection table for LHS packing.
    ///
    /// When implementing convolution via matrix multiplication, we need to
    /// transform the input and weight tensors into matrices. This transformation
    /// for the input is typically referred to as `im2col`. The resulting matrix has
    /// dimensions:
    /// - Rows: batch * output_height * output_width
    /// - Columns: input_channels * kernel_height * kernel_width
    ///
    /// The indirection table stores the starting addresses of all these chunks in
    /// the input tensor. For cases where padding is applied, it stores pointers
    /// directly to the padded buffer. Note that the length of the padding buffer
    /// must match the number of input channels.
    ///
    /// Furthermore, LHS packing also requires a transpose over every `M_STEP`
    /// rows to optimize the data layout for computation.
    ///
    /// @param[in] shape The NHWC input shape
    /// @param[in] params The parameters of convolution
    /// @param[in] input The raw pointer for the input tensor
    /// @param[in] padValues The raw pointer for the pad tensor
    /// @param[in] blockSize The block size for the transpose
    ///
    /// @return The indirection table ready for lhs packing.
    IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input, const T* padValues,
                     const int blockSize);

    ~IndirectionTable() = default;

    /// Computes the offset after blocking by blockSize.
    ///
    /// @param[in] row The row index
    /// @param[in] col The col index
    /// @param[in] width The table column count
    /// @param[in] block The block size
    ///
    /// @return The offset in the blocked table
    int getReorderedOffset(int row, int col, int width, int block) {
        int c = row % block;
        int r = row / block * width + col;
        return r * block + c;
    }
};

template <typename T>
IndirectionTable<T>::IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input,
                                      const T* padValues, const int blockSize) {
    int batchSize    = shape[0];
    int inputChannel = shape[3];
    int inputHeight  = shape[1];
    int inputWidth   = shape[2];

    int elementCount = batchSize * inputChannel * inputHeight * inputWidth;
    auto outputSize  = params.getOutputSize(inputHeight, inputWidth);
    int outputHeight = outputSize.height;
    int outputWidth  = outputSize.width;

    int rowCount = batchSize * outputHeight * outputWidth;
    int colCount = params.kernelHeight * params.kernelWidth;

    this->data.resize((rowCount + blockSize - 1) / blockSize * blockSize * colCount);
    this->height    = rowCount;
    this->width     = colCount;
    this->blockSize = blockSize;

    for (int i = 0; i < this->data.size(); i++) {
        this->data[i] = nullptr;
    }

    for (int b = 0; b < batchSize; b++) {
        for (int h = 0; h < outputSize.height; h++) {
            for (int w = 0; w < outputSize.width; w++) {
                int inputRow = h * params.strideHeight - params.padTop;
                int inputCol = w * params.strideWidth - params.padLeft;

                for (int kh = 0; kh < params.kernelHeight; kh++) {
                    // Every row of the im2col result matrix consists of
                    // `kernel_height * kernel_width` chunks, and the indirection
                    // table stores a pointer to each chunk. `tableRow` and
                    // `tableCol` are the row and column of the table before the
                    // blocked transpose.
                    int tableRow = b * outputHeight * outputWidth + h * outputWidth + w;
                    int tableCol = kh * params.kernelWidth;

                    int inputRowPrime = inputRow + kh * params.dilatedHeight;
                    int inputOffsetStart = b * inputHeight * inputWidth + inputRowPrime * inputWidth;
                    if (inputRowPrime >= 0 && inputRowPrime < inputHeight) {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            int inputColPrime = inputCol + kw * params.dilatedWidth;
                            if (inputColPrime >= 0 && inputColPrime < inputWidth) {
                                int inputOffset = (inputOffsetStart + inputColPrime) * inputChannel;
                                assert(inputOffset < elementCount);
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = input + inputOffset;
                            } else {
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = padValues;
                            }
                        }
                    } else {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            assert(tableOffset < this->data.size());
                            this->data[tableOffset] = padValues;
                        }
                    }
                }
            }
        }
    }
}

template <typename DstT, typename SrcT>
static void ConvertOIHWToHWIO(DstT* dst, const SrcT* src, const std::vector<int>& shape) {
    assert(shape.size() == 4);
    int height        = shape[2];
    int width         = shape[3];
    int outputChannel = shape[0];
    int inputChannel  = shape[1];

    int spatialSize = height * width;
    for (int oc = 0; oc < outputChannel; oc++) {
        for (int ic = 0; ic < inputChannel; ic++) {
            for (int s = 0; s < spatialSize; s++) {
                int inputOffset  = oc * inputChannel * spatialSize + ic * spatialSize + s;
                int outputOffset = s * inputChannel * outputChannel + ic * outputChannel + oc;

                // TODO Check the force conversion.
                dst[outputOffset] = (DstT)(src[inputOffset]);
            }
        }
    }
}

class KleidiAIDenseConvolutionImpl : public ConvolutionTiledImpl {
public:
    KleidiAIDenseConvolutionImpl(const Convolution2DCommon *common, Backend *b,
                                 CPUConvolution::Resource *resource = nullptr)
        : ConvolutionTiledImpl(common, b) {
        mResource = resource;
    }
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ~KleidiAIDenseConvolutionImpl() = default;
    virtual void getPackParameter(int *eP, int *lP, int *hP, const CoreFunctions *core) override {}

private:
    Tensor mOutputNHWC;
    Tensor mInputNHWC;
    Tensor mPadBuffer;
};

class KleidiAIDenseConvolution : public ConvolutionTiledExecutor {
public:
    KleidiAIDenseConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                             size_t originWeightSize, const float *bias, size_t biasSize,
                             std::shared_ptr<ConvolutionCommon::Int8Common>);

    KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common,
                             Backend *b);
    virtual ~KleidiAIDenseConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
    void initWeight(float *dest, const float *source, float *cache, int depth, int outputCount, int kernelSize,
                    const CoreFunctions *function);

protected:
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
};

class KleidiAIDenseConvolutionMultiInput : public Execution {
public:
    KleidiAIDenseConvolutionMultiInput(const Convolution2DCommon *common, Backend *b) : Execution(b) {
        mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b));
    }
    virtual ~KleidiAIDenseConvolutionMultiInput() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<Tensor> mTempWeight;
    std::shared_ptr<Tensor> mTempWeightCache;
    std::shared_ptr<Tensor> mTempBias;
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
    std::vector<Tensor *> mInputs;
};
} // namespace MNN

#endif /* KleidiAIDenseConvolution_hpp */
#endif

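getReorderedOffset is the one non-obvious piece of the table layout: rows are grouped into blocks of `blockSize`, and within each block the entries are laid out column-major so that the LHS pack reads `blockSize` rows of one kernel position contiguously. A standalone check of that mapping on a tiny table (the function body is copied verbatim from the header above; the driver values are illustrative):

    #include <cstdio>

    // Copied from IndirectionTable::getReorderedOffset above.
    int getReorderedOffset(int row, int col, int width, int block) {
        int c = row % block;
        int r = row / block * width + col;
        return r * block + c;
    }

    int main() {
        // 4 table rows, 3 kernel positions per row, transposed in blocks of 2.
        // Rows 0/1 land in offsets 0..5, rows 2/3 in offsets 6..11,
        // column-major within each block.
        int width = 3, block = 2;
        for (int row = 0; row < 4; ++row) {
            for (int col = 0; col < width; ++col) {
                std::printf("(%d,%d)->%d ", row, col, getReorderedOffset(row, col, width, block));
            }
            std::printf("\n");
        }
        return 0;
    }
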
@@ -21,12 +21,6 @@
 
-#ifdef MNN_KLEIDIAI_ENABLED
-#include "../backend/cpu/arm/mnn_kleidiai.h"
-/**
- * Set Convolution's input/output tensor format:
- * 1: format will be NCHW, skip pack/unpack functions.
- * 0: format will be NC4HW4, need pack/unpack functions to fit kleidiAI ukernel.
- **/
-#define KAI_CONV_NCHW_IN_OUT 1
-#endif
-
 namespace MNN {

@@ -268,19 +268,6 @@ std::shared_ptr<Tensor> GeometryConvUtils::im2Col(Tensor* im2Col, Tensor* input,
     return tempTensor;
 }
 bool GeometryConvUtils::computeSingle(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, GeometryComputer::Context& context, CommandBuffer& res) {
-#if KAI_CONV_NCHW_IN_OUT
-    KleidiAI& kai = KleidiAI::getInstance();
-    auto common = op->main_as_Convolution2D()->common();
-    if(kai.canAccelerate() && common->kernelX() == 1 && common->kernelY() == 1) {
-        kai.setLinear(true);
-        std::shared_ptr<Command> cmd(new Command);
-        cmd->op = op;
-        cmd->inputs = std::move(inputs);
-        cmd->outputs = std::move(outputs);
-        res.command.emplace_back(std::move(cmd));
-        return true;
-    }
-#endif
     auto newOutputs = outputs;
     auto newInputs = inputs;
     auto originOutput = outputs[0];

@@ -23,13 +23,7 @@ public:
         sourceFmt = MNN_DATA_FORMAT_NCHW;
     }
     auto destFmt = info->dest();
-#if KAI_CONV_NCHW_IN_OUT
-    KleidiAI& kai = KleidiAI::getInstance();
-    if(kai.canAccelerate()) {
-        kai.setLinear(true);
-        destFmt = MNN_DATA_FORMAT_NCHW;
-    }
-#endif
 
     TensorUtils::getDescribe(outputs[0])->dimensionFormat = destFmt;
     if (destFmt == MNN_DATA_FORMAT_NC4HW4) {
         destFmt = MNN_DATA_FORMAT_NCHW;

@@ -1266,7 +1266,9 @@ public:
         return true;
     }
 };
+#ifndef MNN_KLEIDIAI_ENABLED
 MNNTestSuiteRegister(WinogradMemoryTest, "expr/WinogradMemoryTest");
+#endif
 
 
 class SequenceMemoryTest : public MNNTestCase {

@@ -0,0 +1,161 @@
#ifdef MNN_KLEIDIAI_ENABLED

#include <functional>
#include <numeric>
#include <random>

#include "MNNTestSuite.h"
#include "backend/cpu/compute/KleidiAIDenseConvolution.hpp"

using namespace MNN;

namespace utils {
enum class FillType { RANDOM, ZERO };

class RandomEngine {
public:
    static std::mt19937& get() {
        static std::random_device device;
        static std::mt19937 gen(device());
        return gen;
    }
};

template <typename T>
struct RandomGenerator;

template <>
struct RandomGenerator<float> {
    static float generate() {
        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
        return dist(RandomEngine::get());
    }
};

template <>
struct RandomGenerator<int> {
    static int generate() {
        std::uniform_int_distribution<int> dist(0, 100);
        return dist(RandomEngine::get());
    }
};
} // namespace utils

class LhsPackingTest : public MNNTestCase {
public:
    virtual bool run(int precision) {
        return testIndirectionTable1() && testIndirectionTable2() && testWeightConversion();
    }

private:
    bool testIndirectionTable(const ConvParams& params, int batchSize, int inputHeight, int inputWidth) {
        auto outputSize  = params.getOutputSize(inputHeight, inputWidth);
        int outputHeight = outputSize.height;
        int outputWidth  = outputSize.width;
        std::vector<int> inputShape = {batchSize, inputHeight, inputWidth, params.inputChannel};

        std::vector<float> input(std::accumulate(inputShape.begin(), inputShape.end(), 1, std::multiplies<int>()));
        std::vector<float> padValues(params.inputChannel);

        int blockSize = 32;
        auto table = IndirectionTable<float>(inputShape, params, input.data(), padValues.data(), blockSize);

        bool succ = true;

        // Check the first row
        for (int col = 0; col < blockSize; col++) {
            int oh = col / outputWidth;
            int ow = col % outputWidth;
            int ih = oh * params.strideHeight - params.padTop;
            int iw = ow * params.strideWidth - params.padLeft;

            if (ih < 0 || ih >= inputHeight) {
                succ &= (table.data[col] == padValues.data());
            } else if (iw < 0 || iw >= inputWidth) {
                succ &= (table.data[col] == padValues.data());
            } else {
                int offset = (ih * inputWidth + iw) * params.inputChannel;
                succ &= (table.data[col] == input.data() + offset);
            }
        }
        return succ;
    }

    bool testIndirectionTable1() {
        ConvParams params{
            .inputChannel  = 3,
            .outputChannel = 5,
            .kernelHeight  = 3,
            .kernelWidth   = 2,
            .strideHeight  = 2,
            .strideWidth   = 1,
            .padTop        = 1,
            .padBottom     = 3,
            .padLeft       = 2,
            .padRight      = 1,
            .dilatedHeight = 1,
            .dilatedWidth  = 2,
        };

        int batchSize   = 4;
        int inputHeight = 7;
        int inputWidth  = 5;

        return testIndirectionTable(params, batchSize, inputHeight, inputWidth);
    }

    bool testIndirectionTable2() {
        ConvParams params{
            .inputChannel  = 256,
            .outputChannel = 256,
            .kernelHeight  = 3,
            .kernelWidth   = 3,
            .strideHeight  = 1,
            .strideWidth   = 1,
            .padTop        = 1,
            .padBottom     = 1,
            .padLeft       = 1,
            .padRight      = 1,
            .dilatedHeight = 1,
            .dilatedWidth  = 1,
        };

        int batchSize   = 1;
        int inputHeight = 24;
        int inputWidth  = 24;

        return testIndirectionTable(params, batchSize, inputHeight, inputWidth);
    }

    bool testWeightConversion() {
        std::vector<int> shape = {4, 5, 6, 7};
        int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
        std::vector<float> weightSrc(size);
        std::vector<float> weightDst(size);

        for (int i = 0; i < size; i++) {
            weightSrc[i] = i;
        }

        ConvertOIHWToHWIO(weightDst.data(), weightSrc.data(), shape);

        bool succ = true;

        for (int oc = 0; oc < 4; oc++) {
            for (int ic = 0; ic < 5; ic++) {
                for (int h = 0; h < 6; h++) {
                    for (int w = 0; w < 7; w++) {
                        int oo = (h * 7 + w) * 5 * 4 + ic * 4 + oc;
                        int io = oc * 5 * 6 * 7 + ic * 6 * 7 + h * 7 + w;
                        succ &= (weightSrc[io] == weightDst[oo]);
                    }
                }
            }
        }

        return succ;
    }
};

MNNTestSuiteRegister(LhsPackingTest, "imatmul/lhs");
#endif

@@ -21,6 +21,7 @@ using namespace MNN::Transformer;
 static void tuning_prepare(Llm* llm) {
     MNN_PRINT("Prepare for tuning opt Begin\n");
     llm->tuning(OP_ENCODER_NUMBER, {1, 5, 10, 20, 30, 50, 100});
+    llm->tuning(PREFILL_BIGLITTLE_CORE, {});
     MNN_PRINT("Prepare for tuning opt End\n");
 }
 

@@ -195,6 +196,9 @@ static int eval(Llm* llm, std::string prompt_file, int max_token_number) {
     prompts = {prompt};
 #else
     while (std::getline(prompt_fs, prompt)) {
+        if (prompt.empty()) {
+            continue;
+        }
         if (prompt.back() == '\r') {
             prompt.pop_back();
         }

@@ -39,6 +39,7 @@ using ChatMessages = std::vector<ChatMessage>;
 enum TuneType {
     // op encoder number for commit
     OP_ENCODER_NUMBER = 0,
+    PREFILL_BIGLITTLE_CORE,
 };
 enum class MatchStrictLevel : int;
 enum class NgramSelectRule : int;

@@ -126,6 +127,7 @@ protected:
     std::shared_ptr<Express::Executor::RuntimeManager> mRuntimeManager, mProcessorRuntimeManager;
     std::vector<std::shared_ptr<Express::Module>> mModules, mPrefillModules, mDecodeModules, mCurrentModules;
     const Express::Module* mBaseModule = nullptr;
+    ScheduleConfig mPrefillConfig, mDecodeConfig;
     Express::VARP inputsEmbeds, attentionMask, positionIds;
     std::vector<Express::VARP> mAttentionMaskVarVec, mPositionIdsVarVec;
     Express::VARP logitsAllIdx, logitsLastIdx;

@@ -95,17 +95,20 @@ bool Llm::set_config(const std::string& content) {
 }
 
 void Llm::initRuntime() {
-    ScheduleConfig config;
     BackendConfig cpuBackendConfig;
-    config.type      = backend_type_convert(mConfig->backend_type());
-    config.numThread = mConfig->thread_num();
-    if(config.type == 3){
+    // setup mPrefillConfig
+    mPrefillConfig.type      = backend_type_convert(mConfig->backend_type());
+    mPrefillConfig.numThread = (mConfig->prefill_thread_num() < 0) \
+                               ? mConfig->thread_num() : mConfig->prefill_thread_num();
+    if(mPrefillConfig.type == 3){
         // opencl need set numThread = 64(buffer mode)
-        config.numThread |= 64;
+        mPrefillConfig.numThread |= 64;
     }
-    if (mConfig->power() == "high") {
+    std::string powerConfig = (mConfig->prefill_power().empty()) \
+                              ? mConfig->power() : mConfig->prefill_power();
+    if (powerConfig == "high") {
         cpuBackendConfig.power = BackendConfig::Power_High;
-    } else if (mConfig->power() == "low") {
+    } else if (powerConfig == "low") {
         cpuBackendConfig.power = BackendConfig::Power_Low;
     }
     if (mConfig->memory() == "high") {

@@ -118,9 +121,26 @@ void Llm::initRuntime() {
     } else if (mConfig->precision() == "low") {
         cpuBackendConfig.precision = BackendConfig::Precision_Low;
     }
-    config.backendConfig = &cpuBackendConfig;
+    ExecutorScope::Current()->setGlobalExecutorConfig(mPrefillConfig.type, cpuBackendConfig, mPrefillConfig.numThread);
+    mPrefillConfig.backendConfig = new BackendConfig(cpuBackendConfig);
+    // set up mDecodeConfig
+    mDecodeConfig = mPrefillConfig;
+    mDecodeConfig.backendConfig = new BackendConfig(cpuBackendConfig);
+    mDecodeConfig.numThread = (mConfig->decode_thread_num() < 0) \
+                              ? mConfig->thread_num() : mConfig->decode_thread_num();
+    if(mDecodeConfig.type == 3){
+        // opencl need set numThread = 64(buffer mode)
+        mDecodeConfig.numThread |= 64;
+    }
+    powerConfig = (mConfig->decode_power().empty()) \
+                  ? mConfig->power() : mConfig->decode_power();
+    if (powerConfig == "high") {
+        mDecodeConfig.backendConfig->power = BackendConfig::Power_High;
+    } else if (powerConfig == "low") {
+        mDecodeConfig.backendConfig->power = BackendConfig::Power_Low;
+    }
 
-    mRuntimeManager.reset(Executor::RuntimeManager::createRuntimeManager(config));
+    mRuntimeManager.reset(Executor::RuntimeManager::createRuntimeManager(mPrefillConfig));
     // Use 4 thread to load llm
     mRuntimeManager->setHint(MNN::Interpreter::INIT_THREAD_NUMBER, 4);

@@ -154,7 +174,7 @@
         mRuntimeManager->setMode(MNN::Interpreter::Session_Debug);
         _initDebug();
 #endif
-    if (config.type != 0) { // not cpu
+    if (mPrefillConfig.type != 0) { // not cpu
         std::string cacheFilePath = tmpPath.length() != 0 ? tmpPath : ".";
         mRuntimeManager->setCache(cacheFilePath + "/mnn_cachefile.bin");
     }

@@ -246,6 +266,7 @@ void Llm::load() {
     mModules[0].reset(Module::load(inputNames, outputNames, model_path.c_str(), mRuntimeManager, &module_config));
 
     // set speculative decoding params
+    ExecutorScope::Current()->setGlobalExecutorConfig(mDecodeConfig.type, *(mDecodeConfig.backendConfig), mDecodeConfig.numThread);
     setSpeculativeConfig();
     int decode_type_num = 1;
     if(mLookAhead) {

@@ -255,7 +276,7 @@ void Llm::load() {
     mDecodeModules.resize(decode_type_num);
 
     for (int v = 0; v < mDecodeModules.size(); ++v) {
-        mDecodeModules[v].reset(Module::clone(mModules[0].get()));
+        mDecodeModules[v].reset(Module::clone(mModules[0].get(), &mDecodeConfig));
     }
     mPrefillModules = mModules;
 

@@ -294,15 +315,55 @@ Llm* Llm::create_lora(const std::string& lora_path) {
 }
 
 void Llm::tuning(TuneType type, std::vector<int> candidates) {
-    if (type != OP_ENCODER_NUMBER) {
-        MNN_ERROR("tuning type not supported\n");
-        return;
+    if (type == PREFILL_BIGLITTLE_CORE) {
+        // only CPU power high is tuned
+        if (mPrefillConfig.type != MNN_FORWARD_CPU) {
+            return;
+        }
+        if (mPrefillConfig.backendConfig->power != BackendConfig::Power_High) {
+            return;
+        }
+        if (candidates.empty()){
+            candidates = {40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95};
+        }
+        auto itp_type = Interpreter::CPU_LITTLECORE_DECREASE_RATE;
+        int length = 64;
+        int64_t min_time = INT64_MAX;
+        int prefer_candidate = 0;
+        for (auto& candidate : candidates) {
+            mRuntimeManager->setHint(itp_type, candidate);
+            // load prefill module again to take effect! the following 2 lines can't be deleted!!
+            for (int v = 0; v < mPrefillModules.size(); ++v) {
+                mPrefillModules[v].reset(Module::clone(mPrefillModules[v].get()));
+            }
+            switchMode(Prefill);
+            Timer _t;
+            std::vector<int> input_ids(length, 0);
+            auto logits = forward(input_ids);
+            auto token = sample(logits);
+            auto time = _t.durationInUs();
+            MNN_PRINT("CPU_LITTLECORE_DECREASE_RATE:%d, prefill time: %lld us\n", candidate, time);
+            if (time < min_time) {
+                prefer_candidate = candidate;
+                min_time = time;
+            }
+            setKVCacheInfo(0, getCurrentHistory());
+            reset();
+        }
+        mRuntimeManager->setHint(itp_type, prefer_candidate);
+        // load prefill module again to take effect! the following 2 lines can't be deleted!!
+        for (int v = 0; v < mPrefillModules.size(); ++v) {
+            mPrefillModules[v].reset(Module::clone(mPrefillModules[v].get()));
+        }
+        switchMode(Prefill);
     }
+    if (type == OP_ENCODER_NUMBER) {
         // FIXME: Currently OpenCL Don't support KVMeta
         if (mConfig->backend_type() == "opencl") {
             return;
         }
-        mCurrentModules = mDecodeModules;
+        auto itp_type = MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT;
+        switchMode(Llm::Decode);
         int decode_seq = 1;
         // Set to decode mode
         mContext->gen_seq_len = 1;

@@ -315,7 +376,7 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
         int64_t min_time = INT64_MAX;
         int prefer_candidate = 10;
         for (auto& candidate : candidates) {
-            mRuntimeManager->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, candidate);
+            mRuntimeManager->setHint(itp_type, candidate);
             Timer _t;
             std::vector<int> input_ids(decode_seq, 0);
             auto logits = forward(input_ids);

@@ -333,18 +394,21 @@ void Llm::tuning(TuneType type, std::vector<int> candidates) {
             // MNN_PRINT("op encode number:%d, decode time: %lld us\n", candidate, time);
         }
     }
-    mRuntimeManager->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, prefer_candidate);
+        mRuntimeManager->setHint(itp_type, prefer_candidate);
         // clear dirty tuning kv history
         setKVCacheInfo(0, getCurrentHistory());
         reset();
+    }
 }
 
 void Llm::switchMode(Llm::Stage stage) {
     switch (stage) {
         case Prefill:
+            ExecutorScope::Current()->setGlobalExecutorConfig(mPrefillConfig.type, *(mPrefillConfig.backendConfig), mPrefillConfig.numThread);
             mCurrentModules = mPrefillModules;
             break;
         case Decode:
+            ExecutorScope::Current()->setGlobalExecutorConfig(mDecodeConfig.type, *(mDecodeConfig.backendConfig), mDecodeConfig.numThread);
             mCurrentModules = mDecodeModules;
             break;
         default:

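Both tuning branches above follow the same benchmark-and-commit pattern: apply a candidate hint, re-clone the module so the hint actually takes effect, time one fixed workload, and keep the fastest candidate applied. A generic restatement of that pattern (a sketch, not MNN API; every name here is illustrative):

    #include <cstdint>
    #include <functional>
    #include <vector>

    int pickBestCandidate(const std::vector<int>& candidates,
                          const std::function<void(int)>& applyHint,        // e.g. setHint + re-clone module
                          const std::function<int64_t()>& timeWorkloadUs) { // e.g. one 64-token prefill
        int best = candidates.empty() ? 0 : candidates.front();
        int64_t minTime = INT64_MAX;
        for (int candidate : candidates) {
            applyHint(candidate);
            int64_t t = timeWorkloadUs();
            if (t < minTime) {
                minTime = t;
                best = candidate;
            }
        }
        applyHint(best); // leave the winning hint applied, as tuning() does
        return best;
    }

    int main() {
        int applied = 0;
        auto apply = [&](int c) { applied = c; };
        auto timer = [&]() -> int64_t { return (applied - 60) * (applied - 60); }; // fake cost, minimal at 60
        return pickBestCandidate({40, 50, 60, 70}, apply, timer) == 60 ? 0 : 1;
    }
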
@@ -498,7 +562,7 @@ void Llm::generate_init(std::ostream* os, const char* end_with) {
         mMeta->remove = mMeta->previous;
     }
     mContext->output_tokens.clear();
-    mCurrentModules = mPrefillModules;
+    switchMode(Llm::Prefill);
 }
 
 size_t Llm::getCurrentHistory() const {

@@ -584,7 +648,7 @@ std::vector<int> Llm::generate(MNN::Express::VARP input_embeds, int max_tokens)
     }
     mContext->prompt_len = static_cast<int>(input_embeds->getInfo()->dim[0]);
     Timer _t;
-    mCurrentModules = mPrefillModules;
+    switchMode(Llm::Prefill);
     auto logits = forward(input_embeds);
     if (nullptr == logits.get()) {
         return {};

@@ -598,7 +662,7 @@
     mContext->history_tokens.push_back(mContext->current_token);
     mContext->output_tokens.push_back(mContext->current_token);
     logits = nullptr;
-    mCurrentModules = mDecodeModules;
+    switchMode(Llm::Decode);
     generate(max_tokens - 1);
 
     return mContext->output_tokens;

@@ -673,6 +737,8 @@ Llm::~Llm() {
     mModules.clear();
     mRuntimeManager.reset();
     mProcessorRuntimeManager.reset();
+    if (mPrefillConfig.backendConfig != nullptr) delete mPrefillConfig.backendConfig;
+    if (mDecodeConfig.backendConfig != nullptr) delete mDecodeConfig.backendConfig;
 }
 
 bool Llm::reuse_kv() { return mConfig->reuse_kv(); }

@@ -341,6 +341,15 @@ public:
         return config_.value("thread_num", 4);
     }
 
+    int prefill_thread_num(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("prefill_thread_num", -1);
+        return config_.value("prefill_thread_num", -1);
+    }
+    int decode_thread_num(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("decode_thread_num", -1);
+        return config_.value("decode_thread_num", -1);
+    }
+
     std::string precision(bool mllm = false) const {
         if (mllm) return mllm_config_.value("precision", "low");
         return config_.value("precision", "low");

@@ -349,6 +358,14 @@
         if (mllm) return mllm_config_.value("power", "normal");
         return config_.value("power", "normal");
     }
+    std::string prefill_power(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("prefill_power", "");
+        return config_.value("prefill_power", "");
+    }
+    std::string decode_power(bool mllm = false) const {
+        if (mllm) return mllm_config_.value("decode_power", "");
+        return config_.value("decode_power", "");
+    }
 
     std::string memory(bool mllm = false) const {
         if (mllm) return mllm_config_.value("memory", "low");

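With these getters, prefill and decode can be tuned independently: prefill_thread_num / decode_thread_num fall back to thread_num when negative, and prefill_power / decode_power fall back to power when empty. A hypothetical config fragment (keys are the ones shown above; the values are illustrative only) that pins prefill to high power while decoding on fewer threads:

    {
        "backend_type": "cpu",
        "thread_num": 4,
        "prefill_thread_num": 8,
        "prefill_power": "high",
        "decode_thread_num": 4,
        "decode_power": "normal"
    }
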
@@ -235,6 +235,17 @@ void Sampler::SamplerConfig::configMixed(std::shared_ptr<LlmConfig> llmConfig) {
         this->configSampler(samplerName, llmConfig);
         // std::cout << samplerName << " " << std::flush;
     }
+    for (int i=1; i<mixedSamplers.size(); ++i) {
+        // "penalty" can only locate at the first position
+        if (mixedSamplers[i]=="penalty") {
+            mixedSamplers.erase(mixedSamplers.begin()+i);
+            i--;
+            if (mixedSamplers[0]!="penalty") {
+                mixedSamplers.insert(mixedSamplers.begin(), "penalty");
+                i++;
+            }
+        }
+    }
     // std::cout << std::endl;
     // set select type
     // the final sampler select the token

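The loop above normalizes the mixed-sampler list so that "penalty" sits only at index 0, dropping any duplicates. A standalone check of the same loop (the loop body is copied verbatim from the hunk; the input list is an illustrative example):

    #include <cassert>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> mixedSamplers = {"topK", "penalty", "topP", "penalty"};
        for (int i = 1; i < (int)mixedSamplers.size(); ++i) {
            if (mixedSamplers[i] == "penalty") {
                mixedSamplers.erase(mixedSamplers.begin() + i);
                i--;
                if (mixedSamplers[0] != "penalty") {
                    mixedSamplers.insert(mixedSamplers.begin(), "penalty");
                    i++;
                }
            }
        }
        // "penalty" is hoisted to the front and the duplicate is removed.
        assert((mixedSamplers == std::vector<std::string>{"penalty", "topK", "topP"}));
        return 0;
    }
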
@@ -449,33 +449,13 @@ void Tiktoken::encode(const std::string& str, std::vector<int>& ids) {
     if (str.empty()) {
         return;
     }
-    size_t i = 0;
-    while (i < str.size()) {
-        bool found_pair = false;
-        // Attempt to match the longest possible symbol
-        size_t longest_match_len = 0;
-        std::string longest_match;
-
-        // Check substrings of decreasing length
-        for (size_t len = str.size() - i; len > 0; --len) {
-            std::string token = str.substr(i, len);
-            auto it = encoder_.find(token);
-            if (it != encoder_.end()) {
-                if (len > longest_match_len) {
-                    longest_match_len = len;
-                    longest_match = it->first;
-                }
-            }
-        }
-
-        if (!longest_match.empty()) {
-            ids.push_back(encoder_.at(longest_match));
-            i += longest_match_len;
-        } else {
-            // If no matching symbol is found, this typically means an error in the encoding
-            // or the input text contains characters that the encoder doesn't know how to handle
-            std::cerr << "Error: No encoding found for the sequence starting at position " << i << " , symbol: " << str[i-2] << std::endl;
-            return;
+    auto it = str.begin();
+    while(it!=str.end()) {
+        auto last_it = it;
+        int token_id = encoder_.find(it, str.end());
+        if (token_id>=0) { ids.push_back(token_id); }
+        else {
+            MNN_ERROR("Error: No encoding found for the sequence %s\n", std::string(last_it, it).c_str());
+        }
     }
 }

@@ -487,6 +467,28 @@ std::string Tiktoken::decode(int id) {
     return decoder_[id];
 }
 
+bool BertTokenizer::load_vocab(std::ifstream& tok_file) {
+    std::string line;
+    std::getline(tok_file, line);
+    int vocab_len = std::stoi(line);
+    // load vocab
+    decoder_.resize(vocab_len);
+    for (int i = 0; i < vocab_len; i++) {
+        std::getline(tok_file, line);
+        auto token = base64_decode(line);
+        encoder_.insert({token, i});
+        decoder_[i] = token;
+    }
+    return true;
+}
+
+std::string BertTokenizer::decode(int id) {
+    if (id >= decoder_.size()) {
+        return "";
+    }
+    return decoder_[id];
+}
+
 std::vector<int> BertTokenizer::word_piece(const std::string& token) {
     auto it = encoder_.find(token);
     if (it != encoder_.end()) {

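load_vocab above fixes the BertTokenizer vocab file format: the first line is the vocabulary size, followed by one base64-encoded token per line whose id is its zero-based line index. A hand-made example file under that reading (three entries; the arrow annotations are explanatory and not part of the file):

    3
    aGVsbG8=    <- base64("hello"), id 0
    d29ybGQ=    <- base64("world"), id 1
    IQ==        <- base64("!"),     id 2
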
@@ -63,6 +63,68 @@ namespace MNN {
 namespace Transformer {
 // std::string_view impl in c++11 start
 
+
+class Trie {
+public:
+    struct TrieNode
+    {
+        std::unordered_map<char, int> children;
+        int id = -1;
+    };
+private:
+    std::vector<TrieNode> list;
+    int size = 1;
+    int getFree() {
+        if (size<list.size()) { return size++; }
+        else {
+            list.resize(list.size()*2);
+            return size++;
+        }
+    }
+    void insert(int nid, int token_id, std::string::const_iterator it, std::string::const_iterator end) {
+        auto& node = list[nid];
+        if (it==end) {
+            if (node.id==-1) { node.id=token_id; }
+            return;
+        }
+        auto cid = node.children.find(*it);
+        if (cid==node.children.end()) {
+            int new_id = getFree();
+            list[nid].children.insert({*it, new_id}); // access the node again even after reallocation!!!
+            insert(new_id, token_id, it+1, end);
+        } else{
+            insert(cid->second, token_id, it+1, end);
+        }
+    }
+    int find(int nid, int current_matched, std::string::const_iterator current_it, std::string::const_iterator& it, const std::string::const_iterator& end) {
+        const auto& node = list[nid];
+        if (node.id!=-1) {
+            current_matched = node.id;
+            current_it = it;
+        }
+        auto cid = node.children.find(*it);
+        if (cid != node.children.end()) {
+            return find(cid->second, current_matched, current_it, ++it, end);
+        } else {
+            if (node.id!=-1) { return node.id; }
+            else { it = current_it; return current_matched;}
+        }
+    }
+public:
+    Trie(int initial_size=10000) {
+        list.resize(initial_size); // init the allocate size
+        size = 1; // root
+    }
+    void insert(std::pair<const std::string&, int> entry) {
+        insert(0, entry.second, entry.first.begin(), entry.first.end());
+    }
+    int find(std::string::const_iterator& it, const std::string::const_iterator& end) {
+        if (it==end) { return -1; }
+        return find(0, -1, it+1, it, end);
+    }
+};
+
+
 class Tokenizer {
 public:
     static constexpr int MAGIC_NUMBER = 430;

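The Trie above is what backs the new Tiktoken::encode: find() greedily walks from the current iterator, remembers the last node that completed a token, and rewinds the iterator to just past that match (the it+1 seed makes a failed lookup still advance one character, so encode never stalls). A compact, self-contained restatement of that lookup with a usage check (a sketch under those assumptions, not the MNN class; all names here are local to the snippet):

    #include <cassert>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct MiniTrie {
        struct Node { std::unordered_map<char, int> children; int id = -1; };
        std::vector<Node> nodes{1}; // node 0 is the root

        void insert(const std::string& s, int id) {
            int cur = 0;
            for (char ch : s) {
                auto it = nodes[cur].children.find(ch);
                if (it == nodes[cur].children.end()) {
                    nodes.push_back(Node{});
                    int next = (int)nodes.size() - 1;
                    nodes[cur].children.emplace(ch, next); // re-index after possible reallocation
                    cur = next;
                } else {
                    cur = it->second;
                }
            }
            nodes[cur].id = id;
        }

        // Token id of the longest prefix of [it, end), advancing `it` past it;
        // returns -1 and skips one character when nothing matches.
        int find(std::string::const_iterator& it, std::string::const_iterator end) {
            if (it == end) return -1;
            int cur = 0, matched = -1;
            auto matchedEnd = it + 1;
            while (it != end) {
                auto child = nodes[cur].children.find(*it);
                if (child == nodes[cur].children.end()) break;
                cur = child->second;
                ++it;
                if (nodes[cur].id != -1) { matched = nodes[cur].id; matchedEnd = it; }
            }
            it = matchedEnd;
            return matched;
        }
    };

    int main() {
        MiniTrie trie;
        trie.insert("a", 0);
        trie.insert("ab", 1);
        trie.insert("abc", 2);
        std::string text = "ababc";
        auto it = text.begin();
        assert(trie.find(it, text.end()) == 1); // "ab" is the longest first match
        assert(trie.find(it, text.end()) == 2); // then "abc"
        assert(it == text.end());
        return 0;
    }
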
@@ -149,15 +211,19 @@ public:
 protected:
     virtual bool load_vocab(std::ifstream& file) override;
     virtual void encode(const std::string& str, std::vector<int>& ids) override;
-    std::unordered_map<std::string, int> encoder_;
+    Trie encoder_;
     std::vector<std::string> decoder_;
 };
 
-class BertTokenizer : public Tiktoken {
+class BertTokenizer : public Tokenizer {
 public:
     BertTokenizer() = default;
+    virtual std::string decode(int id) override;
 protected:
+    virtual bool load_vocab(std::ifstream& file) override;
+    virtual void encode(const std::string& str, std::vector<int>& ids) override;
+    std::unordered_map<std::string, int> encoder_;
+    std::vector<std::string> decoder_;
 private:
     std::vector<int> word_piece(const std::string& token);
 };

@@ -583,7 +583,10 @@ class LlmExporter(torch.nn.Module):
             "llm_model": f"{self.dst_name}.mnn",
             "llm_weight": f"{self.dst_name}.mnn.weight",
             "backend_type": "cpu",
-            "thread_num": 4,
+            "prefill_thread_num": 0,
+            "prefill_power": "high",
+            "decode_thread_num": 4,
+            "decode_power": "normal",
             "precision": "low",
             "memory": "low",
             # "system_prompt": "You are a helpful assistant.",
