#ifndef KleidiAIDenseConvolution_hpp
#define KleidiAIDenseConvolution_hpp

#include <cassert>
#include <memory>
#include <vector>

#include "ConvolutionTiledExecutor.hpp"
#include "backend/cpu/CPUConvolution.hpp"

namespace MNN {
struct ConvParams {
    int inputChannel;
    int outputChannel;
    int kernelHeight;
    int kernelWidth;
    int strideHeight;
    int strideWidth;
    int padTop;
    int padBottom;
    int padLeft;
    int padRight;
    int dilatedHeight;
    int dilatedWidth;

    struct Size2D {
        int height;
        int width;
    };

    Size2D getOutputSize(int inputHeight, int inputWidth) const {
        auto kernelSizeWithDilated = [](int kernel, int dilated) { return kernel + (kernel - 1) * (dilated - 1); };
        auto outputSize            = [](int input, int pad1, int pad2, int kernel, int stride) {
            int t = (input + pad1 + pad2 - kernel);
            return t / stride + 1;
        };

        int dilatedKernelHeight = kernelSizeWithDilated(kernelHeight, dilatedHeight);
        int dilatedKernelWidth  = kernelSizeWithDilated(kernelWidth, dilatedWidth);

        int outputHeight = outputSize(inputHeight, padTop, padBottom, dilatedKernelHeight, strideHeight);
        int outputWidth  = outputSize(inputWidth, padLeft, padRight, dilatedKernelWidth, strideWidth);

        return {outputHeight, outputWidth};
    }
};

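// Worked example for ConvParams::getOutputSize (illustrative only, not
// referenced elsewhere in this header): a 3x3 kernel, stride 2, padding 1 on
// every side, and dilation 1 over a 224x224 input. The dilated kernel size is
// 3 + (3 - 1) * (1 - 1) = 3, so each output dimension is
// (224 + 1 + 1 - 3) / 2 + 1 = 112, i.e. getOutputSize(224, 224) == {112, 112}.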
					
						
template <typename T>
struct IndirectionTable {
    std::vector<const void*> data;
    int height;
    int width;
    int blockSize;

    /// Creates an indirection table for LHS packing.
    ///
    /// When implementing convolution via matrix multiplication, we need to
    /// transform the input and weight tensors into matrices. This transformation
    /// of the input is typically referred to as `im2col`. The resulting matrix has
    /// dimensions:
    /// - Rows: batch * output_height * output_width
    /// - Columns: input_channels * kernel_height * kernel_width
    ///
    /// The indirection table stores the starting address of each such chunk in
    /// the input tensor. Where padding is applied, it instead stores a pointer
    /// directly into the padding buffer; note that the length of the padding
    /// buffer must match the number of input channels. Furthermore, LHS packing
    /// requires a transpose over every `M_STEP` rows to optimize the data
    /// layout for computation. The constructed table is ready for LHS packing.
    ///
    /// @param[in] shape The NHWC input shape
    /// @param[in] params The parameters of the convolution
    /// @param[in] input The raw pointer to the input tensor
    /// @param[in] padValues The raw pointer to the padding buffer
    /// @param[in] blockSize The block size for the transpose
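    ///
    /// Illustrative sizing example: for a batch-1 8x8x16 NHWC input, a 3x3
    /// kernel with stride 1 and padding 1 on every side, and blockSize 4, the
    /// output is 8x8, so the table holds 8 * 8 = 64 rows (already a multiple
    /// of 4) by 3 * 3 = 9 columns, each entry pointing at a contiguous
    /// 16-element channel chunk.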
					
						
    IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input, const T* padValues,
                     const int blockSize);

    ~IndirectionTable() = default;

    /// Computes the offset into the table after blocking by `block`.
    ///
    /// @param[in] row The row index
    /// @param[in] col The column index
    /// @param[in] width The table column count
    /// @param[in] block The block size
    ///
    /// @return The offset in the blocked table
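    ///
    /// Worked example (illustrative): with block = 4 and width = 9, the entry
    /// for row = 5, col = 2 has c = 5 % 4 = 1 and r = 5 / 4 * 9 + 2 = 11, so
    /// the blocked offset is 11 * 4 + 1 = 45.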
					
						
    int getReorderedOffset(int row, int col, int width, int block) const {
        int c = row % block;
        int r = row / block * width + col;
        return r * block + c;
    }
};
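
// A minimal usage sketch for IndirectionTable (illustrative; `params` is a
// filled-in ConvParams, `input` points at NHWC float data, and the block size
// would come from the KleidiAI micro-kernel's M_STEP, assumed to be 4 here):
//
//     std::vector<int> nhwcShape = {1, 56, 56, 64};   // batch, height, width, channel
//     std::vector<float> padRow(64, 0.0f);            // one value per input channel
//     IndirectionTable<float> table(nhwcShape, params, input, padRow.data(), 4);
//     // table.data now holds one pointer per (output pixel, kernel position),
//     // reordered in groups of 4 rows, ready for LHS packing.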
					
						
template <typename T>
IndirectionTable<T>::IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input,
                                      const T* padValues, const int blockSize) {
    int batchSize    = shape[0];
    int inputChannel = shape[3];
    int inputHeight  = shape[1];
    int inputWidth   = shape[2];

    int elementCount = batchSize * inputChannel * inputHeight * inputWidth;
    auto outputSize  = params.getOutputSize(inputHeight, inputWidth);
    int outputHeight = outputSize.height;
    int outputWidth  = outputSize.width;

    int rowCount = batchSize * outputHeight * outputWidth;
    int colCount = params.kernelHeight * params.kernelWidth;

    this->data.resize((rowCount + blockSize - 1) / blockSize * blockSize * colCount);
    this->height    = rowCount;
    this->width     = colCount;
    this->blockSize = blockSize;

    for (size_t i = 0; i < this->data.size(); i++) {
        this->data[i] = nullptr;
    }

    for (int b = 0; b < batchSize; b++) {
        for (int h = 0; h < outputSize.height; h++) {
            for (int w = 0; w < outputSize.width; w++) {
                int inputRow = h * params.strideHeight - params.padTop;
                int inputCol = w * params.strideWidth - params.padLeft;

                for (int kh = 0; kh < params.kernelHeight; kh++) {
                    // Every row of the im2col result matrix consists of
                    // `kernelHeight * kernelWidth` chunks, and the indirection
                    // table holds one pointer per chunk. `tableRow` and
                    // `tableCol` index the table before the transpose.
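                    // For example, with a 3x3 kernel, output pixel
                    // (b=0, h=0, w=0) occupies table row 0; its kh = 1 entries
                    // fill columns 3, 4, and 5 (before the transpose).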
					
						
                    int tableRow = b * outputHeight * outputWidth + h * outputWidth + w;
                    int tableCol = kh * params.kernelWidth;

                    int inputRowPrime    = inputRow + kh * params.dilatedHeight;
                    int inputOffsetStart = b * inputHeight * inputWidth + inputRowPrime * inputWidth;
                    if (inputRowPrime >= 0 && inputRowPrime < inputHeight) {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset   = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            int inputColPrime = inputCol + kw * params.dilatedWidth;
                            if (inputColPrime >= 0 && inputColPrime < inputWidth) {
                                int inputOffset = (inputOffsetStart + inputColPrime) * inputChannel;
                                assert(inputOffset < elementCount);
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = input + inputOffset;
                            } else {
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = padValues;
                            }
                        }
                    } else {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            assert(tableOffset < this->data.size());
                            this->data[tableOffset] = padValues;
                        }
                    }
                }
            }
        }
    }
}
					
						
template <typename DstT, typename SrcT>
static void ConvertOIHWToHWIO(DstT* dst, const SrcT* src, const std::vector<int>& shape) {
    assert(shape.size() == 4);
    int height        = shape[2];
    int width         = shape[3];
    int outputChannel = shape[0];
    int inputChannel  = shape[1];

    int spatialSize = height * width;
    for (int oc = 0; oc < outputChannel; oc++) {
        for (int ic = 0; ic < inputChannel; ic++) {
            for (int s = 0; s < spatialSize; s++) {
                int inputOffset  = oc * inputChannel * spatialSize + ic * spatialSize + s;
                int outputOffset = s * inputChannel * outputChannel + ic * outputChannel + oc;

                // TODO: Check that this forced conversion is safe for all
                // supported type pairs.
                dst[outputOffset] = (DstT)(src[inputOffset]);
            }
        }
    }
}
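
// Example (illustrative): for shape = {2, 3, 1, 1}, i.e. OIHW weights with two
// output and three input channels, the element at oc = 0, ic = 2 moves from
// OIHW offset 0 * 3 + 2 = 2 to HWIO offset 2 * 2 + 0 = 4; the output channel
// becomes the innermost (fastest-varying) dimension.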
					
						
class KleidiAIDenseConvolutionImpl : public ConvolutionTiledImpl {
public:
    KleidiAIDenseConvolutionImpl(const Convolution2DCommon *common, Backend *b,
                                 CPUConvolution::Resource *resource = nullptr)
        : ConvolutionTiledImpl(common, b) {
        mResource = resource;
    }
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ~KleidiAIDenseConvolutionImpl() = default;
    virtual void getPackParameter(int *eP, int *lP, int *hP, const CoreFunctions *core) override {}

private:
    Tensor mOutputNHWC;
    Tensor mInputNHWC;
    Tensor mPadBuffer;
};

class KleidiAIDenseConvolution : public ConvolutionTiledExecutor {
public:
    KleidiAIDenseConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                             size_t originWeightSize, const float *bias, size_t biasSize,
                             std::shared_ptr<ConvolutionCommon::Int8Common>);

    KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common,
                             Backend *b);
    virtual ~KleidiAIDenseConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
    void initWeight(float *dest, const float *source, float *cache, int depth, int outputCount, int kernelSize,
                    const CoreFunctions *function);

protected:
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
};

class KleidiAIDenseConvolutionMultiInput : public Execution {
public:
    KleidiAIDenseConvolutionMultiInput(const Convolution2DCommon *common, Backend *b) : Execution(b) {
        mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b));
    }
    virtual ~KleidiAIDenseConvolutionMultiInput() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<Tensor> mTempWeight;
    std::shared_ptr<Tensor> mTempWeightCache;
    std::shared_ptr<Tensor> mTempBias;
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
    std::vector<Tensor *> mInputs;
};
} // namespace MNN

#endif /* KleidiAIDenseConvolution_hpp */