//
//  CPUMatMul.cpp
//  MNN
//
//  Created by MNN on 2018/08/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//
 | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  | #include <limits>
 | 
					
						
							| 
									
										
										
										
											2020-02-26 09:57:17 +08:00
										 |  |  | #include "CPUMatMul.hpp"
 | 
					
						
							|  |  |  | #include "CPUBackend.hpp"
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "math/Matrix.hpp"
 | 
					
						
							| 
									
										
										
										
											2020-05-15 14:49:10 +08:00
										 |  |  | #include "compute/CommonOptFunction.h"
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "core/Macro.h"
 | 
					
						
							| 
									
										
										
										
											2020-02-26 09:57:17 +08:00
										 |  |  | #include "core/Concurrency.h"
 | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  | #include "core/BufferAllocator.hpp"
 | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  | #include "core/TensorUtils.hpp"
 | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  | #include "core/OpCommonUtils.hpp"
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | #include "math/Vec.hpp"
 | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-05 15:30:28 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | using Vec4 = MNN::Math::Vec<float, 4>; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | namespace MNN { | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
// Constructor: record the transpose flags for A/B/C and whether multi-threaded
// execution is permitted. No buffers are allocated here; all scheduling and
// allocation happen in onResize().
CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool transposeC, bool multiThread)
    : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mTransposeC(transposeC), mSupportMultiThread(multiThread) {
        // Do nothing
}
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  | void CPUMatMul::_scheduleForVecE(int e, int l, int h) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1; | 
					
						
							|  |  |  |     MNN_ASSERT(e == 1); | 
					
						
							| 
									
										
										
										
											2021-01-06 16:29:37 +08:00
										 |  |  |     MatMulParam param; | 
					
						
							|  |  |  |     param.e = 1; | 
					
						
							|  |  |  |     param.l = l; | 
					
						
							|  |  |  |     param.h = h; | 
					
						
							|  |  |  |     param.BTranspose = mTransposeB; | 
					
						
							|  |  |  |     param.numberThread = numberThread; | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |     auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForE_1; | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     mPreFunctions.emplace_back(std::make_pair([param, func]( | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |                                                                              int tId, const float* A, const float* B, const float* biasPtr, float* C) { | 
					
						
							|  |  |  |         func(A, B, C, biasPtr, ¶m, tId); | 
					
						
							| 
									
										
										
										
											2021-01-06 16:29:37 +08:00
										 |  |  |     }, numberThread)); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  | void CPUMatMul::_scheduleForVec(int e, int l, int h) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1; | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |     MatMulParam param; | 
					
						
							|  |  |  |     param.e = e; | 
					
						
							|  |  |  |     param.l = l; | 
					
						
							|  |  |  |     param.h = 1; | 
					
						
							|  |  |  |     param.ATranspose = mTransposeA; | 
					
						
							|  |  |  |     param.numberThread = numberThread; | 
					
						
							|  |  |  |     auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForH_1; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     // TODD: Support e = 1
 | 
					
						
							|  |  |  |     MNN_ASSERT(h == 1); | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     mPreFunctions.emplace_back(std::make_pair([param, func]( | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |         int tId, const float* A, const float* B, const float* biasPtr, float* C) { | 
					
						
							|  |  |  |         func(A, B, C, biasPtr, ¶m, tId); | 
					
						
							|  |  |  |     }, numberThread)); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-15 14:49:10 +08:00
										 |  |  | ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) { | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |     const Tensor* A = inputs[0]; | 
					
						
							|  |  |  |     const Tensor* B = inputs[1]; | 
					
						
							| 
									
										
										
										
											2020-05-15 14:49:10 +08:00
										 |  |  |     Tensor* C       = outputs[0]; | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |     auto core = static_cast<CPUBackend*>(backend())->functions(); | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |     mPreFunctions.clear(); | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  |     int e, l, h; | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     bool valid = OpCommonUtils::computeMatMulSize(mTransposeA, mTransposeB, A, B, e, l, h); | 
					
						
							|  |  |  |     if (!valid) { | 
					
						
							|  |  |  |         return COMPUTE_SIZE_ERROR; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     mE = 0; | 
					
						
							|  |  |  |     mL = 0; | 
					
						
							|  |  |  |     mH = 0; | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |     // If encoded but resized as h=1/e=1, the computer should clear firstly
 | 
					
						
							|  |  |  |     if (h == 1) { | 
					
						
							|  |  |  |         _scheduleForVec(e, l, h); | 
					
						
							|  |  |  |         return NO_ERROR; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (e == 1) { | 
					
						
							|  |  |  |         const float* biasPtr = nullptr; | 
					
						
							|  |  |  |         _scheduleForVecE(e, l, h); | 
					
						
							|  |  |  |         return NO_ERROR; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |     int eP, lP, hP; | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |     core->MNNGetMatMulPackMode(&eP, &lP, &hP); | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1; | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  |     auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator(); | 
					
						
							| 
									
										
										
										
											2024-07-22 19:51:53 +08:00
										 |  |  |     auto ATPtrAlloc = bufferAlloc->alloc(eP * UP_DIV(l, lP) * lP * core->bytes * numberThread); | 
					
						
							|  |  |  |     int matmulBytes = core->bytes; | 
					
						
							|  |  |  |     if (core->matmulBytes != 0) { | 
					
						
							|  |  |  |         matmulBytes = core->matmulBytes; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * matmulBytes); | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * eP * core->pack * core->bytes * numberThread); | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |     if (ATPtrAlloc.invalid() || BTPtrAlloc.invalid() || CTPtrAlloc.invalid()) { | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |         return OUT_OF_MEMORY; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     mPreFunctions.emplace_back(std::make_pair([BTPtrAlloc, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias, float* C) { | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |         core->MNNPackForMatMul_B((float*)BTPtrAlloc.ptr(), BPtr, h, l, mTransposeB); | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |     } , 1)); | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  |     bool useBias = false; | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |     MemChunk bdestAlloc; | 
					
						
							|  |  |  |     bool bdestNeedFree = false; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     if (inputs.size() > 2) { | 
					
						
							|  |  |  |         auto bias = inputs[2]; | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  |         useBias = true; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         auto biasLength = bias->elementSize(); | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |         if (biasLength % core->pack != 0) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             mUseBiasDirectly = false; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             // Padding to align of 4
 | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  |             bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes); | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |             bdestNeedFree = true; | 
					
						
							|  |  |  |             if (bdestAlloc.invalid()) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |                 return OUT_OF_MEMORY; | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             mTempBias = bdestAlloc; | 
					
						
							| 
									
										
										
										
											2020-12-15 14:12:35 +08:00
										 |  |  |             mPreFunctions.emplace_back(std::make_pair( | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |                 [biasLength, bdestAlloc, core](int tId, const float* APtr, const float* BPtr, const float* borigin, float* C) { | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |                 ::memset(bdestAlloc.ptr(), 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack); | 
					
						
							|  |  |  |                 ::memcpy(bdestAlloc.ptr(), borigin, biasLength * core->bytes); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             }, 1)); | 
					
						
							|  |  |  |         } else { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             mUseBiasDirectly = true; | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  |             if (TensorUtils::getDescribeOrigin(bias)->mem.get()) { | 
					
						
							|  |  |  |                 bdestAlloc = TensorUtils::getDescribeOrigin(bias)->mem->chunk(); | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |         mPostParameters = { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             1.0f, | 
					
						
							|  |  |  |             1.0f, | 
					
						
							|  |  |  |             -std::numeric_limits<float>().max(), | 
					
						
							|  |  |  |             std::numeric_limits<float>().max(), | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2023-09-04 10:42:11 +08:00
										 |  |  |     if (bdestNeedFree) { | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  |         bufferAlloc->free(bdestAlloc); | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  |     bufferAlloc->free(ATPtrAlloc); | 
					
						
							|  |  |  |     bufferAlloc->free(BTPtrAlloc); | 
					
						
							|  |  |  |     bufferAlloc->free(CTPtrAlloc); | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     mTempA = ATPtrAlloc; | 
					
						
							|  |  |  |     mTempB = BTPtrAlloc; | 
					
						
							|  |  |  |     mTempC = CTPtrAlloc; | 
					
						
							|  |  |  |     mE = e; | 
					
						
							|  |  |  |     mL = l; | 
					
						
							|  |  |  |     mH = h; | 
					
						
							| 
									
										
										
										
											2020-05-15 14:49:10 +08:00
										 |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) { | 
					
						
							| 
									
										
										
										
											2020-12-15 14:12:35 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     auto APtr = inputs[0]->host<float>(); | 
					
						
							|  |  |  |     auto BPtr = inputs[1]->host<float>(); | 
					
						
							|  |  |  |     auto CPtr = outputs[0]->host<float>(); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |     const float* biasPtr = nullptr; | 
					
						
							|  |  |  |     if (inputs.size() > 2) { | 
					
						
							|  |  |  |         biasPtr = inputs[2]->host<float>(); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     execute(APtr, BPtr, CPtr, biasPtr); | 
					
						
							|  |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void CPUMatMul::execute(const float* APtr, const float* BPtr, float* CPtr, const float* biasPtr) { | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |     for (auto& f : mPreFunctions) { | 
					
						
							|  |  |  |         MNN_CONCURRENCY_BEGIN(tId, f.second) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             f.first(tId, APtr, BPtr, biasPtr, CPtr); | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |         MNN_CONCURRENCY_END(); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     if (mE > 0) { | 
					
						
							|  |  |  |         auto core = static_cast<CPUBackend*>(backend())->functions(); | 
					
						
							|  |  |  |         int eP, lP, hP; | 
					
						
							|  |  |  |         core->MNNGetMatMulPackMode(&eP, &lP, &hP); | 
					
						
							|  |  |  |         const float* postPtr = mPostParameters.data(); | 
					
						
							|  |  |  |         if (!mUseBiasDirectly) { | 
					
						
							|  |  |  |             biasPtr = (const float*)mTempBias.ptr(); | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |         if (nullptr == biasPtr) { | 
					
						
							|  |  |  |             postPtr = nullptr; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-07-22 19:51:53 +08:00
										 |  |  |         auto lAlign = UP_DIV(mL, lP) * lP; | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |         int tileCount = UP_DIV(mE, eP); | 
					
						
							|  |  |  |         int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1; | 
					
						
							|  |  |  |         MNN_CONCURRENCY_BEGIN(tId, numberThread) { | 
					
						
							| 
									
										
										
										
											2024-07-22 19:51:53 +08:00
										 |  |  |             auto TA = mTempA.ptr() + tId * eP * lAlign * core->bytes; | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             auto TB = mTempB.ptr(); | 
					
						
							|  |  |  |             auto hC4 = UP_DIV(mH, core->pack); | 
					
						
							|  |  |  |             auto TC = mTempC.ptr() + tId * eP * hC4 * core->pack * core->bytes; | 
					
						
							|  |  |  |             size_t parameters[6]; | 
					
						
							|  |  |  |             parameters[0] = eP * core->bytes; | 
					
						
							|  |  |  |             parameters[1] = mL; | 
					
						
							|  |  |  |             parameters[2] = mH; | 
					
						
							|  |  |  |             parameters[3] = eP * core->pack * core->bytes; | 
					
						
							|  |  |  |             parameters[4] = 0; | 
					
						
							|  |  |  |             parameters[5] = 0; | 
					
						
							|  |  |  |             for (int tx=tId; tx<tileCount; tx+=numberThread) { | 
					
						
							|  |  |  |                 int xStart = tx * eP; | 
					
						
							|  |  |  |                 int xEnd = ALIMIN(xStart + eP, mE); | 
					
						
							|  |  |  |                 int xC = xEnd - xStart; | 
					
						
							|  |  |  |                 if (mTransposeA) { | 
					
						
							| 
									
										
										
										
											2024-07-22 19:51:53 +08:00
										 |  |  |                     // l, e -> l/lp, xC|eP, lp
 | 
					
						
							|  |  |  |                     if (lP > 1) { | 
					
						
							|  |  |  |                         // TODO: Speed up it
 | 
					
						
							|  |  |  |                         if (mL % lP != 0) { | 
					
						
							|  |  |  |                             ::memset(TA, 0, eP * lAlign * core->bytes); | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                         if (core->bytes == 4) { | 
					
						
							|  |  |  |                             auto D = (int32_t*)TA; | 
					
						
							|  |  |  |                             auto S = (int32_t*)APtr; | 
					
						
							|  |  |  |                             for (int y=0; y<mL; ++y) { | 
					
						
							|  |  |  |                                 int yc = y / lP; | 
					
						
							|  |  |  |                                 int yr = y % lP; | 
					
						
							|  |  |  |                                 for (int xx=0; xx<xC; ++xx) { | 
					
						
							|  |  |  |                                     D[yc * lP * eP + xx * lP + yr] = S[y * mE + xStart + xx]; | 
					
						
							|  |  |  |                                 } | 
					
						
							|  |  |  |                             } | 
					
						
							|  |  |  |                         } else { | 
					
						
							|  |  |  |                             MNN_ASSERT(core->bytes == 2); | 
					
						
							|  |  |  |                             auto D = (int16_t*)TA; | 
					
						
							|  |  |  |                             auto S = (int16_t*)APtr; | 
					
						
							|  |  |  |                             for (int y=0; y<mL; ++y) { | 
					
						
							|  |  |  |                                 int yc = y / lP; | 
					
						
							|  |  |  |                                 int yr = y % lP; | 
					
						
							|  |  |  |                                 for (int xx=0; xx<xC; ++xx) { | 
					
						
							|  |  |  |                                     D[yc * lP * eP + xx * lP + yr] = S[y * mE + xStart + xx]; | 
					
						
							|  |  |  |                                 } | 
					
						
							|  |  |  |                             } | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                     } else { | 
					
						
							|  |  |  |                         for (int y=0; y<mL; ++y) { | 
					
						
							|  |  |  |                             ::memcpy(TA + y*eP*core->bytes, (uint8_t*)APtr + (y * mE + xStart) * core->bytes, core->bytes * xC); | 
					
						
							|  |  |  |                         } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |                     } | 
					
						
							|  |  |  |                 } else { | 
					
						
							| 
									
										
										
										
											2024-07-22 19:51:53 +08:00
										 |  |  |                     if (lP > 1) { | 
					
						
							|  |  |  |                         // e, l -> l/lp, 1, xC|eP, lp
 | 
					
						
							|  |  |  |                         int lC = mL / lP; | 
					
						
							|  |  |  |                         int lR = mL % lP; | 
					
						
							|  |  |  |                         for (int yy=0; yy<lC; ++yy) { | 
					
						
							|  |  |  |                             for (int x=0; x<xC; ++x) { | 
					
						
							|  |  |  |                                 ::memcpy(TA + (yy * eP * lP + x * lP) * core->bytes, (uint8_t*)APtr + ((x+xStart)*mL+yy*lP)*core->bytes, lP * core->bytes); | 
					
						
							|  |  |  |                             } | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                         if (lR > 0) { | 
					
						
							|  |  |  |                             int yy = lC; | 
					
						
							|  |  |  |                             for (int x=0; x<xC; ++x) { | 
					
						
							|  |  |  |                                 ::memset(TA + (yy * eP * lP + x * lP) * core->bytes, 0, lP * core->bytes); | 
					
						
							|  |  |  |                                 ::memcpy(TA + (yy * eP * lP + x * lP) * core->bytes, (uint8_t*)APtr + ((x+xStart)*mL+yy*lP)*core->bytes, xC * core->bytes); | 
					
						
							|  |  |  |                             } | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                     } else { | 
					
						
							|  |  |  |                         // e, l -> l, eP
 | 
					
						
							|  |  |  |                         int dims[] = { | 
					
						
							|  |  |  |                             xC, | 
					
						
							|  |  |  |                             mL, | 
					
						
							|  |  |  |                             mL, | 
					
						
							|  |  |  |                             eP | 
					
						
							|  |  |  |                         }; | 
					
						
							|  |  |  |                         if (core->bytes == 2) { | 
					
						
							|  |  |  |                             auto S = (const int16_t*)APtr + xStart * mL; | 
					
						
							|  |  |  |                             auto D = (int16_t*)TA; | 
					
						
							|  |  |  |                             MNNTranspose16Bit(D, S, dims); | 
					
						
							|  |  |  |                         } else if (core->bytes == 4) { | 
					
						
							|  |  |  |                             auto S = (const int32_t*)APtr + xStart * mL; | 
					
						
							|  |  |  |                             auto D = (int32_t*)TA; | 
					
						
							|  |  |  |                             MNNTranspose32Bit(D, S, dims); | 
					
						
							|  |  |  |                         } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2024-07-22 19:51:53 +08:00
										 |  |  |                 if (core->matmulBytes != 0) { | 
					
						
							|  |  |  |                     core->MNNFp32ToLowp((const float*)TA, (int16_t*)TA, eP * lAlign); | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |                 if (xC == eP) { | 
					
						
							|  |  |  |                     core->MNNPackedMatMul((float*)TC, (float*)TA, (float*)TB, parameters, postPtr, biasPtr, nullptr, nullptr); | 
					
						
							|  |  |  |                 } else { | 
					
						
							|  |  |  |                     core->MNNPackedMatMulRemain((float*)TC, (float*)TA, (float*)TB, xC, parameters, postPtr, biasPtr, nullptr, nullptr); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 int area[] = { | 
					
						
							|  |  |  |                     eP, | 
					
						
							|  |  |  |                     mE | 
					
						
							|  |  |  |                 }; | 
					
						
							|  |  |  |                 if (mTransposeC) { | 
					
						
							|  |  |  |                     // hC4, e, 4 -> e, h
 | 
					
						
							|  |  |  |                     auto dst = (uint8_t*)CPtr + xStart * mH * core->bytes; | 
					
						
							|  |  |  |                     core->MNNUnpackCUnitTranspose((float*)dst, (const float*)TC, xC, mH, area); | 
					
						
							|  |  |  |                 } else { | 
					
						
							|  |  |  |                     // hC4, e, 4 -> h, e
 | 
					
						
							|  |  |  |                     auto dst = (uint8_t*)CPtr + xStart * core->bytes; | 
					
						
							|  |  |  |                     core->MNNUnpackCUnit((float*)dst, (const float*)TC, xC, mH, area); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         }; | 
					
						
							| 
									
										
										
										
											2020-07-04 01:21:30 +08:00
										 |  |  |         MNN_CONCURRENCY_END(); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class CPUMatMulCreator : public CPUBackend::Creator { | 
					
						
							|  |  |  | public: | 
					
						
							|  |  |  |     virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, | 
					
						
							|  |  |  |                                 const MNN::Op* op, Backend* backend) const override { | 
					
						
							|  |  |  |         auto param = op->main_as_MatMul(); | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |         return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true, true); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } // namespace MNN
 |