MNN/source/backend/cuda/execution/int8/ConvInt8CutlassExecution.hpp

//
//  ConvInt8CutlassExecution.hpp
//  MNN
//
//  Created by MNN on 2023/01/04.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef ENABLE_CUDA_QUANT

#ifndef ConvInt8CutlassExecution_hpp
#define ConvInt8CutlassExecution_hpp

#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "CutlassGemmInt8Param.hpp"
#include "../MNNCUDADefine.hpp"
#include "../MNNCUDAFunction.cuh"

namespace MNN {
namespace CUDA {

// Three-way size selector for the int8 GEMM. ConvInt8CutlassExecution keeps
// one GEMM operator member per level (…ClampLittle / …ClampNormal /
// …ClampLarge) and stores the chosen level in mGemmShapeSizeLevel.
// How a level is picked is not visible in this header.
enum GemmSizeLevel {
    GEMM_SIZE_NORMAL = 0, // default level (see mGemmShapeSizeLevel)
    GEMM_SIZE_LITTLE = 1,
    GEMM_SIZE_LARGE  = 2
};

/**
 * Execution subclass for int8 convolution on the CUDA backend.
 *
 * NOTE(review): judging by the member and method names, the convolution is
 * lowered (optionally through im2col) to a CUTLASS int8 tensor-core GEMM with
 * a clamp epilogue; the implementation is not part of this header — confirm
 * against the accompanying source file.
 *
 * Every raw pointer and scalar member carries a default member initializer so
 * that no member holds an indeterminate value if the constructor, or a call
 * order such as onExecute before onResize, does not assign it.
 */
class ConvInt8CutlassExecution : public Execution {
public:
    /**
     * Weight / bias / scale storage and quantization parameters.
     * Passed to the execution as a std::shared_ptr, so one Resource can be
     * referenced by more than one execution instance.
     */
    struct Resource {
        Resource(Backend* bn, const MNN::Op* op);
        ~Resource();

        // Raw pointers for weight, bias and scale data.
        // NOTE(review): presumably these alias the storage of the tensors
        // declared right below — confirm in the implementation.
        void* mWeightInt8Ptr = nullptr;
        void* mBiasInt32Ptr  = nullptr;
        void* mScaleFloatPtr = nullptr;
        std::shared_ptr<Tensor> mWeightInt8Tensor;
        std::shared_ptr<Tensor> mBiasInt32Tensor;
        std::shared_ptr<Tensor> mScaleFloatTensor;

        // Additional bias / scale arrays.
        // NOTE(review): ownership and memory space (host vs. device) are not
        // visible from this header — check Resource() / ~Resource().
        int32_t* mBiasInt32Vec = nullptr;
        float* mScaleFloatVec  = nullptr;
        Backend* mBackend = nullptr;

        // relu or relu6
        int mActivationType = 0;
        int mActBits = 0;

        // Quantization parameters. The zero defaults are placeholders only;
        // the real values are expected to be assigned by the constructor
        // and/or updateInputOutputScale().
        int32_t mInputZeroPoint  = 0;
        int32_t mOutputZeroPoint = 0;
        int8_t mClampMin = 0;
        int8_t mClampMax = 0;
        float mInputScale  = 0.0f;
        float mOutputScale = 0.0f;
        int mOutputChannelPack = 0;
        std::vector<int> mInt8WeightKernelSum;
        bool mUseConvQuan = true;

        // Takes the quantization info of input and output by value.
        // NOTE(review): the layout of the two vectors (scale, zero point,
        // clamp range, ...) is defined by the implementation — confirm there.
        void updateInputOutputScale(std::vector<float> inputQuantInfo, std::vector<float> outputQuantInfo);
    };

    ConvInt8CutlassExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
    virtual ~ConvInt8CutlassExecution();

    // Execution interface overrides.
    virtual ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;

    // GEMM entry points.
    // NOTE(review): the "16832" suffix presumably refers to the 16x8x32
    // tensor-core instruction shape used together with the Sm80 operator
    // member below — confirm in the implementation.
    ErrorCode callCutlassGemmInt8TensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
    ErrorCode callCutlassGemmInt8TensorCore16832(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
private:
    std::shared_ptr<Resource> mResource;

    const Op* mOp = nullptr;
    CutlassGemmInfo mGemmInfo;

    ConvolutionCommon::Im2ColParameter mIm2ColParamter;
    MemChunk mGpuIm2ColParam;

    // Starts as nullptr instead of an indeterminate value.
    void* mIm2ColBuffer = nullptr;

    bool mIsConv1x1S1D1P0 = false;
    bool mNeedIm2Col = true;
    MemChunk mGpuKernelParam;
    bool mIsBlock = false;
    int mBlockNum = 1;

    // One GEMM operator per GemmSizeLevel.
    GemmInt8Tensor_Clamp_AlignTensor_Little mGemmInt8ClampLittle;
    GemmInt8Tensor_Clamp_AlignTensor_Normal mGemmInt8ClampNormal;
    GemmInt8Tensor_Clamp_AlignTensor_Large  mGemmInt8ClampLarge;

    GemmInt8Tensor_Clamp_AlignTensor_Normal_Sm80 mGemmInt8ClampNormalSm80;

    GemmSizeLevel mGemmShapeSizeLevel = GEMM_SIZE_NORMAL;
    // NOTE(review): 75 presumably encodes compute capability 7.5 — confirm.
    int mGpuComputeCap = 75;
    int mActivationType = 0;
    std::shared_ptr<Tensor> workspaceTensor;
    // Starts as nullptr instead of an indeterminate value.
    void* mWorkspace = nullptr;
};

} // namespace CUDA
} // namespace MNN

#endif /* ConvInt8CutlassExecution_hpp */
#endif /* ENABLE_CUDA_QUANT */
[MNN:Sync] Sync Internal Gitlab 2023-02-28 10:41:24 +08:00			`//`
			`// ConvInt8CutlassExecution.hpp`
			`// MNN`
			`//`
			`// Created by MNN on 2023/01/04.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`
[MNN:Sync] Sync Internal Gitlab 2.4.3 2023-04-18 18:54:46 +08:00			`#ifdef ENABLE_CUDA_QUANT`

[MNN:Sync] Sync Internal Gitlab 2023-02-28 10:41:24 +08:00			`#ifndef ConvInt8CutlassExecution_hpp`
			`#define ConvInt8CutlassExecution_hpp`

			`#include "backend/cuda/core/CUDABackend.hpp"`
			`#include "core/Execution.hpp"`
			`#include "CutlassGemmInt8Param.hpp"`
			`#include "../MNNCUDADefine.hpp"`
			`#include "../MNNCUDAFunction.cuh"`

			`namespace MNN {`
			`namespace CUDA {`

			`typedef enum {`
			`GEMM_SIZE_NORMAL = 0,`
			`GEMM_SIZE_LITTLE = 1,`
			`GEMM_SIZE_LARGE = 2`
			`} GemmSizeLevel;`

			`class ConvInt8CutlassExecution : public Execution {`
			`public:`
			`struct Resource {`
			`Resource(Backend* bn, const MNN::Op* op);`
			`~ Resource();`
			`void* mWeightInt8Ptr;`
			`void* mBiasInt32Ptr;`
			`void* mScaleFloatPtr;`
			`std::shared_ptr<Tensor> mWeightInt8Tensor;`
			`std::shared_ptr<Tensor> mBiasInt32Tensor;`
			`std::shared_ptr<Tensor> mScaleFloatTensor;`

			`int32_t* mBiasInt32Vec;`
			`float* mScaleFloatVec;`
			`Backend* mBackend = nullptr;`

			`// relu or relu6`
			`int mActivationType;`
			`int mActBits;`

			`int32_t mInputZeroPoint;`
			`int32_t mOutputZeroPoint;`
			`int8_t mClampMin;`
			`int8_t mClampMax;`
			`float mInputScale;`
			`float mOutputScale;`
			`int mOutputChannelPack;`
			`std::vector<int> mInt8WeightKernelSum;`
[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00			`bool mUseConvQuan = true;`
[MNN:Sync] Sync Internal Gitlab 2023-02-28 10:41:24 +08:00			`void updateInputOutputScale(std::vector<float> inputQuantInfo, std::vector<float> outputQuantInfo);`
			`};`
			`ConvInt8CutlassExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);`
			`virtual ~ConvInt8CutlassExecution();`
			`virtual ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;`
			`virtual ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;`
			`virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;`

			`ErrorCode callCutlassGemmInt8TensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);`
			`ErrorCode callCutlassGemmInt8TensorCore16832(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);`
			`private:`
			`std::shared_ptr<Resource> mResource;`

			`const Op* mOp = nullptr;`
			`CutlassGemmInfo mGemmInfo;`

			`ConvolutionCommon::Im2ColParameter mIm2ColParamter;`
[MNN:Sync] Sync Internal 2.7.0 2023-09-04 10:42:11 +08:00			`MemChunk mGpuIm2ColParam;`
[MNN:Sync] Sync Internal Gitlab 2023-02-28 10:41:24 +08:00
			`void* mIm2ColBuffer;`

			`bool mIsConv1x1S1D1P0 = false;`
			`bool mNeedIm2Col = true;`
[MNN:Sync] Sync Internal 2.7.0 2023-09-04 10:42:11 +08:00			`MemChunk mGpuKernelParam;`
[MNN:Sync] Sync Internal Gitlab 2023-02-28 10:41:24 +08:00			`bool mIsBlock = false;`
			`int mBlockNum = 1;`

			`GemmInt8Tensor_Clamp_AlignTensor_Little mGemmInt8ClampLittle;`
			`GemmInt8Tensor_Clamp_AlignTensor_Normal mGemmInt8ClampNormal;`
			`GemmInt8Tensor_Clamp_AlignTensor_Large mGemmInt8ClampLarge;`

			`GemmInt8Tensor_Clamp_AlignTensor_Normal_Sm80 mGemmInt8ClampNormalSm80;`

			`GemmSizeLevel mGemmShapeSizeLevel = GEMM_SIZE_NORMAL;`
			`int mGpuComputeCap = 75;`
			`int mActivationType = 0;`
			`std::shared_ptr<Tensor> workspaceTensor;`
			`void* mWorkspace;`
			`};`

			`} // namespace CUDA`
			`} // namespace MNN`

[MNN:Sync] Sync Internal Gitlab 2.4.3 2023-04-18 18:54:46 +08:00			`#endif /* ConvInt8CutlassExecution */`
			`#endif`