mirror of https://github.com/alibaba/MNN.git
				
				
				
			
		
			
				
	
	
		
			243 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			243 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
| #ifndef KleidiAIDenseConvolution_hpp
 | |
| #define KleidiAIDenseConvolution_hpp
 | |
| 
 | |
#include <algorithm>
#include <cassert>
#include <memory>
#include <vector>

#include "ConvolutionTiledExecutor.hpp"
#include "backend/cpu/CPUConvolution.hpp"
 | |
| 
 | |
| namespace MNN {
 | |
// Plain-aggregate description of a 2D convolution's geometry.
// Field order is part of the aggregate-initialization contract; do not reorder.
struct ConvParams {
    int inputChannel;
    int outputChannel;
    int kernelHeight;
    int kernelWidth;
    int strideHeight;
    int strideWidth;
    int padTop;
    int padBottom;
    int padLeft;
    int padRight;
    int dilatedHeight;
    int dilatedWidth;

    struct Size2D {
        int height;
        int width;
    };

    /// Computes the spatial output size produced by this convolution.
    ///
    /// @param[in] inputHeight The input tensor height
    /// @param[in] inputWidth The input tensor width
    ///
    /// @return The {height, width} of the convolution output
    Size2D getOutputSize(int inputHeight, int inputWidth) const {
        // Effective kernel extent once dilation gaps are inserted between taps:
        // kernel + (kernel - 1) * (dilation - 1).
        const int effectiveKernelH = kernelHeight + (kernelHeight - 1) * (dilatedHeight - 1);
        const int effectiveKernelW = kernelWidth + (kernelWidth - 1) * (dilatedWidth - 1);

        // Standard convolution arithmetic: floor((in + pads - kernel) / stride) + 1.
        const int outH = (inputHeight + padTop + padBottom - effectiveKernelH) / strideHeight + 1;
        const int outW = (inputWidth + padLeft + padRight - effectiveKernelW) / strideWidth + 1;

        return {outH, outW};
    }
};
 | |
| 
 | |
| template <typename T>
 | |
| struct IndirectionTable {
 | |
|     std::vector<const void*> data;
 | |
|     int height;
 | |
|     int width;
 | |
|     int blockSize;
 | |
| 
 | |
|     /// Creates an indirection table for LHS packing.
 | |
|     ///
 | |
|     /// When implementing convolution via matrix multiplication, we need to
 | |
|     /// transform the input and weight tensors into matrices. This transformation
 | |
|     /// for the input is typically referred to as `im2col`. The resulting matrix has
 | |
|     /// dimensions:
 | |
|     /// - Rows: batch * output_height * output_width
 | |
|     /// - Columns: input_channels * kernel_height * kernel_width
 | |
|     ///
 | |
|     /// The indirection table stores the starting addresses of all these chunks in
 | |
|     /// the input tensor. For cases where padding is applied, it stores pointers
 | |
|     /// directly to the padded buffer. Note that the length of the padding buffer
 | |
|     /// must match the number of input channels.
 | |
|     ///
 | |
|     /// The indirection table stores the starting addresses of all these chunks in
 | |
|     /// the input tensor. Furthermore, LHS packing also requires a transpose over
 | |
|     /// every `M_STEP` rows to optimize data layout for computation.
 | |
|     ///
 | |
|     /// @param[in] shape The NHWC input shape
 | |
|     /// @param[in] params The parameters of convolution
 | |
|     /// @param[in] input The raw pointer for the input tensor
 | |
|     /// @param[in] padValues The raw pointer for the pad tensor
 | |
|     /// @param[in] blockSize The block size for the transpose
 | |
|     ///
 | |
|     /// @return The indirection table ready for lhs packing.
 | |
|     IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input, const T* padValues,
 | |
|                      const int blockSize);
 | |
| 
 | |
|     ~IndirectionTable() = default;
 | |
| 
 | |
|     /// To compute the offset after blocking of blockSize.
 | |
|     ///
 | |
|     /// @param[in] row The row index
 | |
|     /// @param[in] col The col index
 | |
|     /// @param[in] width The table column count
 | |
|     /// @param[in] block The block size
 | |
|     ///
 | |
|     /// @return The offset in blocking table
 | |
|     int getReorderedOffset(int row, int col, int width, int block) {
 | |
|         int c = row % block;
 | |
|         int r = row / block * width + col;
 | |
|         return r * block + c;
 | |
|     }
 | |
| };
 | |
| 
 | |
| template <typename T>
 | |
| IndirectionTable<T>::IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input,
 | |
|                                       const T* padValues, const int blockSize) {
 | |
|     int batchSize    = shape[0];
 | |
|     int inputChannel = shape[3];
 | |
|     int inputHeight  = shape[1];
 | |
|     int inputWidth   = shape[2];
 | |
| 
 | |
|     int elementCount = batchSize * inputChannel * inputHeight * inputWidth;
 | |
|     auto outputSize  = params.getOutputSize(inputHeight, inputWidth);
 | |
|     int outputHeight = outputSize.height;
 | |
|     int outputWidth  = outputSize.width;
 | |
| 
 | |
|     int rowCount = batchSize * outputHeight * outputWidth;
 | |
|     int colCount = params.kernelHeight * params.kernelWidth;
 | |
| 
 | |
|     this->data.resize((rowCount + blockSize - 1) / blockSize * blockSize * colCount);
 | |
|     this->height    = rowCount;
 | |
|     this->width     = colCount;
 | |
|     this->blockSize = blockSize;
 | |
| 
 | |
|     for (int i = 0; i < this->data.size(); i++) {
 | |
|         this->data[i] = nullptr;
 | |
|     }
 | |
| 
 | |
|     for (int b = 0; b < batchSize; b++) {
 | |
|         for (int h = 0; h < outputSize.height; h++) {
 | |
|             for (int w = 0; w < outputSize.width; w++) {
 | |
|                 int inputRow = h * params.strideHeight - params.padTop;
 | |
|                 int inputCol = w * params.strideWidth - params.padLeft;
 | |
| 
 | |
|                 for (int kh = 0; kh < params.kernelHeight; kh++) {
 | |
|                     // Every row of im2col resulting matrix $kernel height * kernel width$
 | |
|                     // chunks. So indirection table has relevant values, which point to the
 | |
|                     // relevant chunk. The `tableRow` and `tableCol` is the row and column
 | |
|                     // of the table not transposed.
 | |
|                     int tableRow = b * outputHeight * outputWidth + h * outputWidth + w;
 | |
|                     int tableCol = kh * params.kernelWidth;
 | |
| 
 | |
|                     int inputRowPrime    = inputRow + kh * params.dilatedHeight;
 | |
|                     int inputOffsetStart = b * inputHeight * inputWidth + inputRowPrime * inputWidth;
 | |
|                     if (inputRowPrime >= 0 && inputRowPrime < inputHeight) {
 | |
|                         for (int kw = 0; kw < params.kernelWidth; kw++) {
 | |
|                             int tableOffset   = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
 | |
|                             int inputColPrime = inputCol + kw * params.dilatedWidth;
 | |
|                             if (inputColPrime >= 0 && inputColPrime < inputWidth) {
 | |
|                                 int inputOffset = (inputOffsetStart + inputColPrime) * inputChannel;
 | |
|                                 assert(inputOffset < elementCount);
 | |
|                                 assert(tableOffset < this->data.size());
 | |
|                                 this->data[tableOffset] = input + inputOffset;
 | |
|                             } else {
 | |
|                                 assert(tableOffset < this->data.size());
 | |
|                                 this->data[tableOffset] = padValues;
 | |
|                             }
 | |
|                         }
 | |
|                     } else {
 | |
|                         for (int kw = 0; kw < params.kernelWidth; kw++) {
 | |
|                             int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
 | |
|                             assert(tableOffset < this->data.size());
 | |
|                             this->data[tableOffset] = padValues;
 | |
|                         }
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
/// Reorders a 4D weight tensor from OIHW layout to HWIO layout.
///
/// @tparam DstT Destination element type (may differ from the source type)
/// @tparam SrcT Source element type
/// @param[out] dst Destination buffer; must hold O * I * H * W elements
/// @param[in] src Source buffer in OIHW layout
/// @param[in] shape The OIHW shape {outputChannel, inputChannel, height, width}
template <typename DstT, typename SrcT>
static void ConvertOIHWToHWIO(DstT* dst, const SrcT* src, const std::vector<int>& shape) {
    assert(shape.size() == 4);
    int height        = shape[2];
    int width         = shape[3];
    int outputChannel = shape[0];
    int inputChannel  = shape[1];

    int spatialSize = height * width;
    for (int oc = 0; oc < outputChannel; oc++) {
        for (int ic = 0; ic < inputChannel; ic++) {
            for (int s = 0; s < spatialSize; s++) {
                int inputOffset  = oc * inputChannel * spatialSize + ic * spatialSize + s;
                int outputOffset = s * inputChannel * outputChannel + ic * outputChannel + oc;

                // Explicit named cast: the element type may narrow (e.g.
                // float -> half) and that conversion is intentional here.
                dst[outputOffset] = static_cast<DstT>(src[inputOffset]);
            }
        }
    }
}
 | |
| 
 | |
// Tiled-convolution implementation that overrides the resize/execute hooks of
// ConvolutionTiledImpl. getPackParameter is a deliberate no-op here.
class KleidiAIDenseConvolutionImpl : public ConvolutionTiledImpl {
public:
    // `resource` is optional and stored into the inherited mResource slot —
    // presumably shared pre-packed weights/bias; confirm against the .cpp.
    KleidiAIDenseConvolutionImpl(const Convolution2DCommon *common, Backend *b,
                                 CPUConvolution::Resource *resource = nullptr)
        : ConvolutionTiledImpl(common, b) {
        mResource = resource;
    }
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ~KleidiAIDenseConvolutionImpl() = default;
    virtual void getPackParameter(int *eP, int *lP, int *hP, const CoreFunctions *core) override {}

private:
    // Scratch tensors; names suggest NHWC-layout staging of input/output and a
    // padding-values buffer (see IndirectionTable) — semantics live in the .cpp.
    Tensor mOutputNHWC;
    Tensor mInputNHWC;
    Tensor mPadBuffer;
};
 | |
| 
 | |
// Convolution executor that delegates resize/execute to a
// KleidiAIDenseConvolutionImpl proxy (mProxy).
class KleidiAIDenseConvolution : public ConvolutionTiledExecutor {
public:
    // Constructs from raw weight/bias data. The Int8Common argument presumably
    // carries quantization metadata for quantized weights — TODO confirm.
    KleidiAIDenseConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                             size_t originWeightSize, const float *bias, size_t biasSize,
                             std::shared_ptr<ConvolutionCommon::Int8Common>);

    // Constructs from an already-prepared shared resource (e.g. when cloned).
    KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common,
                             Backend *b);
    virtual ~KleidiAIDenseConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
    // Repacks `source` weights into `dest` using `cache` as scratch.
    // NOTE(review): the exact packed layout is defined in the .cpp — confirm there.
    void initWeight(float *dest, const float *source, float *cache, int depth, int outputCount, int kernelSize,
                    const CoreFunctions *function);

protected:
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;  // performs the actual tiled work
};
 | |
| 
 | |
// Execution variant whose weights/bias are runtime inputs rather than
// constants — presumably taken from the `inputs` vector; confirm in the .cpp.
class KleidiAIDenseConvolutionMultiInput : public Execution {
public:
    KleidiAIDenseConvolutionMultiInput(const Convolution2DCommon *common, Backend *b) : Execution(b) {
        mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b));
    }
    virtual ~KleidiAIDenseConvolutionMultiInput() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    // Staging buffers; names suggest temporary storage for the repacked
    // runtime-provided weights and bias — semantics live in the .cpp.
    std::shared_ptr<Tensor> mTempWeight;
    std::shared_ptr<Tensor> mTempWeightCache;
    std::shared_ptr<Tensor> mTempBias;
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;  // delegate doing the tiled work
    std::vector<Tensor *> mInputs;
};
 | |
| } // namespace MNN
 | |
| 
 | |
| #endif /* KleidiAIDenseConvolution_hpp */
 |