MNN/source/backend/cpu/compute/KleidiAIDenseConvolution.hpp

#if MNN_KLEIDIAI_ENABLED

#ifndef KleidiAIDenseConvolution_hpp
#define KleidiAIDenseConvolution_hpp

#include "ConvolutionTiledExecutor.hpp"
#include "backend/cpu/CPUConvolution.hpp"

namespace MNN {
struct ConvParams {
    int inputChannel;
    int outputChannel;
    int kernelHeight;
    int kernelWidth;
    int strideHeight;
    int strideWidth;
    int padTop;
    int padBottom;
    int padLeft;
    int padRight;
    int dilatedHeight;
    int dilatedWidth;

    struct Size2D {
        int height;
        int width;
    };

    Size2D getOutputSize(int inputHeight, int inputWidth) const {
        auto kernelSizeWithDilated = [](int kernel, int dilated) { return kernel + (kernel - 1) * (dilated - 1); };
        auto outputSize            = [](int input, int pad1, int pad2, int kernel, int stride) {
            int t = (input + pad1 + pad2 - kernel);
            return t / stride + 1;
        };

        int dilatedKernelHeight = kernelSizeWithDilated(kernelHeight, dilatedHeight);
        int dilatedKernelWidth  = kernelSizeWithDilated(kernelWidth, dilatedWidth);

        int outputHeight = outputSize(inputHeight, padTop, padBottom, dilatedKernelHeight, strideHeight);
        int outputWidth  = outputSize(inputHeight, padLeft, padRight, dilatedKernelWidth, strideWidth);

        return {outputHeight, outputWidth};
    }
};

template <typename T>
struct IndirectionTable {
    std::vector<const void*> data;
    int height;
    int width;
    int blockSize;

    /// Creates an indirection table for LHS packing.
    ///
    /// When implementing convolution via matrix multiplication, we need to
    /// transform the input and weight tensors into matrices. This transformation
    /// for the input is typically referred to as `im2col`. The resulting matrix has
    /// dimensions:
    /// - Rows: batch * output_height * output_width
    /// - Columns: input_channels * kernel_height * kernel_width
    ///
    /// The indirection table stores the starting addresses of all these chunks in
    /// the input tensor. For cases where padding is applied, it stores pointers
    /// directly to the padded buffer. Note that the length of the padding buffer
    /// must match the number of input channels.
    ///
    /// The indirection table stores the starting addresses of all these chunks in
    /// the input tensor. Furthermore, LHS packing also requires a transpose over
    /// every `M_STEP` rows to optimize data layout for computation.
    ///
    /// @param[in] shape The NHWC input shape
    /// @param[in] params The parameters of convolution
    /// @param[in] input The raw pointer for the input tensor
    /// @param[in] padValues The raw pointer for the pad tensor
    /// @param[in] blockSize The block size for the transpose
    ///
    /// @return The indirection table ready for lhs packing.
    IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input, const T* padValues,
                     const int blockSize);

    ~IndirectionTable() = default;

    /// To compute the offset after blocking of blockSize.
    ///
    /// @param[in] row The row index
    /// @param[in] col The col index
    /// @param[in] width The table column count
    /// @param[in] block The block size
    ///
    /// @return The offset in blocking table
    int getReorderedOffset(int row, int col, int width, int block) {
        int c = row % block;
        int r = row / block * width + col;
        return r * block + c;
    }
};

template <typename T>
IndirectionTable<T>::IndirectionTable(const std::vector<int>& shape, const ConvParams& params, const T* input,
                                      const T* padValues, const int blockSize) {
    int batchSize    = shape[0];
    int inputChannel = shape[3];
    int inputHeight  = shape[1];
    int inputWidth   = shape[2];

    int elementCount = batchSize * inputChannel * inputHeight * inputWidth;
    auto outputSize  = params.getOutputSize(inputHeight, inputWidth);
    int outputHeight = outputSize.height;
    int outputWidth  = outputSize.width;

    int rowCount = batchSize * outputHeight * outputWidth;
    int colCount = params.kernelHeight * params.kernelWidth;

    this->data.resize((rowCount + blockSize - 1) / blockSize * blockSize * colCount);
    this->height    = rowCount;
    this->width     = colCount;
    this->blockSize = blockSize;

    for (int i = 0; i < this->data.size(); i++) {
        this->data[i] = nullptr;
    }

    for (int b = 0; b < batchSize; b++) {
        for (int h = 0; h < outputSize.height; h++) {
            for (int w = 0; w < outputSize.width; w++) {
                int inputRow = h * params.strideHeight - params.padTop;
                int inputCol = w * params.strideWidth - params.padLeft;

                for (int kh = 0; kh < params.kernelHeight; kh++) {
                    // Every row of im2col resulting matrix $kernel height * kernel width$
                    // chunks. So indirection table has relevant values, which point to the
                    // relevant chunk. The `tableRow` and `tableCol` is the row and column
                    // of the table not transposed.
                    int tableRow = b * outputHeight * outputWidth + h * outputWidth + w;
                    int tableCol = kh * params.kernelWidth;

                    int inputRowPrime    = inputRow + kh * params.dilatedHeight;
                    int inputOffsetStart = b * inputHeight * inputWidth + inputRowPrime * inputWidth;
                    if (inputRowPrime >= 0 && inputRowPrime < inputHeight) {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset   = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            int inputColPrime = inputCol + kw * params.dilatedWidth;
                            if (inputColPrime >= 0 && inputColPrime < inputWidth) {
                                int inputOffset = (inputOffsetStart + inputColPrime) * inputChannel;
                                assert(inputOffset < elementCount);
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = input + inputOffset;
                            } else {
                                assert(tableOffset < this->data.size());
                                this->data[tableOffset] = padValues;
                            }
                        }
                    } else {
                        for (int kw = 0; kw < params.kernelWidth; kw++) {
                            int tableOffset = getReorderedOffset(tableRow, tableCol + kw, colCount, blockSize);
                            assert(tableOffset < this->data.size());
                            this->data[tableOffset] = padValues;
                        }
                    }
                }
            }
        }
    }
}

template <typename DstT, typename SrcT>
static void ConvertOIHWToHWIO(DstT* dst, const SrcT* src, const std::vector<int>& shape) {
    assert(shape.size() == 4);
    int height        = shape[2];
    int width         = shape[3];
    int outputChannel = shape[0];
    int inputChannel  = shape[1];

    int spatialSize = height * width;
    for (int oc = 0; oc < outputChannel; oc++) {
        for (int ic = 0; ic < inputChannel; ic++) {
            for (int s = 0; s < spatialSize; s++) {
                int inputOffset  = oc * inputChannel * spatialSize + ic * spatialSize + s;
                int outputOffset = s * inputChannel * outputChannel + ic * outputChannel + oc;

                // TODO Check the force conversion.
                dst[outputOffset] = (DstT)(src[inputOffset]);
            }
        }
    }
}

class KleidiAIDenseConvolutionImpl : public ConvolutionTiledImpl {
public:
    KleidiAIDenseConvolutionImpl(const Convolution2DCommon *common, Backend *b,
                                 CPUConvolution::Resource *resource = nullptr)
        : ConvolutionTiledImpl(common, b) {
        mResource = resource;
    }
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ~KleidiAIDenseConvolutionImpl() = default;
    virtual void getPackParameter(int *eP, int *lP, int *hP, const CoreFunctions *core) override {}

private:
    Tensor mOutputNHWC;
    Tensor mInputNHWC;
    Tensor mPadBuffer;
};

class KleidiAIDenseConvolution : public ConvolutionTiledExecutor {
public:
    KleidiAIDenseConvolution(const Convolution2DCommon *common, Backend *b, const float *originWeight,
                             size_t originWeightSize, const float *bias, size_t biasSize,
                             std::shared_ptr<ConvolutionCommon::Int8Common>);

    KleidiAIDenseConvolution(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common,
                             Backend *b);
    virtual ~KleidiAIDenseConvolution();

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
    void initWeight(float *dest, const float *source, float *cache, int depth, int outputCount, int kernelSize,
                    const CoreFunctions *function);

protected:
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
};

class KleidiAIDenseConvolutionMultiInput : public Execution {
public:
    KleidiAIDenseConvolutionMultiInput(const Convolution2DCommon *common, Backend *b) : Execution(b) {
        mProxy.reset(new KleidiAIDenseConvolutionImpl(common, b));
    }
    virtual ~KleidiAIDenseConvolutionMultiInput() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<Tensor> mTempWeight;
    std::shared_ptr<Tensor> mTempWeightCache;
    std::shared_ptr<Tensor> mTempBias;
    std::shared_ptr<KleidiAIDenseConvolutionImpl> mProxy;
    std::vector<Tensor *> mInputs;
};
} // namespace MNN

#endif /* KleidiAIDenseConvolution_hpp */
#endif