//
// ConvInt8CutlassExecution.hpp
// MNN
//
// Created by MNN on 2023/01/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef ENABLE_CUDA_QUANT
#ifndef ConvInt8CutlassExecution_hpp
#define ConvInt8CutlassExecution_hpp
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "CutlassGemmInt8Param.hpp"
#include "../MNNCUDADefine.hpp"
#include "../MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
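// Coarse classification of the GEMM problem size, used to choose among the
// pre-built Cutlass kernel configurations (the mGemmInt8Clamp* members below).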
typedef enum {
    GEMM_SIZE_NORMAL = 0,
    GEMM_SIZE_LITTLE = 1,
    GEMM_SIZE_LARGE  = 2
} GemmSizeLevel;
class ConvInt8CutlassExecution : public Execution {
public:
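    // Load-time constants (quantized weights, int32 bias, per-channel float
    // scales) prepared once and shared between cloned executions (see onClone).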
    struct Resource {
        Resource(Backend* bn, const MNN::Op* op);
        ~Resource();
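        // Raw device pointers into the GPU tensors held just below.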
        void* mWeightInt8Ptr;
        void* mBiasInt32Ptr;
        void* mScaleFloatPtr;
        std::shared_ptr<Tensor> mWeightInt8Tensor;
        std::shared_ptr<Tensor> mBiasInt32Tensor;
        std::shared_ptr<Tensor> mScaleFloatTensor;
        int32_t* mBiasInt32Vec;
        float* mScaleFloatVec;
        Backend* mBackend = nullptr;
        // relu or relu6
        int mActivationType;
        int mActBits;
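        // Asymmetric quantization parameters: zero points, the post-requant
        // clamp range, and the input/output scales.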
        int32_t mInputZeroPoint;
        int32_t mOutputZeroPoint;
        int8_t mClampMin;
        int8_t mClampMax;
        float mInputScale;
        float mOutputScale;
        int mOutputChannelPack;
        std::vector<int> mInt8WeightKernelSum;
        bool mUseConvQuan = true;
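        // Refreshes the scale/zero-point/clamp members above from runtime
        // quant info (assumed layout: {scale, zero point, min, max}).
        // In the conventional int8 requantization scheme these values feed
        //   out = clamp(round(acc * inputScale * weightScale / outputScale)
        //               + outputZeroPoint, clampMin, clampMax)
        // (a sketch of the standard formula, not necessarily the exact MNN
        // kernel code).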
        void updateInputOutputScale(std::vector<float> inputQuantInfo, std::vector<float> outputQuantInfo);
    };
    ConvInt8CutlassExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
    virtual ~ConvInt8CutlassExecution();
    virtual ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
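    // Cutlass GEMM launchers. The 16832 variant targets the m16n8k32 int8
    // tensor-core MMA shape introduced with SM80; the plain TensorCore variant
    // serves SM75-class GPUs (m8n8k16). Selection presumably follows
    // mGpuComputeCap below.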
    ErrorCode callCutlassGemmInt8TensorCore(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
    ErrorCode callCutlassGemmInt8TensorCore16832(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
private:
    std::shared_ptr<Resource> mResource;
    const Op* mOp = nullptr;
    CutlassGemmInfo mGemmInfo;
    ConvolutionCommon::Im2ColParameter mIm2ColParamter;
    MemChunk mGpuIm2ColParam;
    void* mIm2ColBuffer;
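    // A 1x1 convolution with stride 1, dilation 1 and no padding reduces to a
    // plain GEMM, so the im2col step can be skipped entirely.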
    bool mIsConv1x1S1D1P0 = false;
    bool mNeedIm2Col = true;
    MemChunk mGpuKernelParam;
    bool mIsBlock = false;
    int mBlockNum = 1;
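    // Pre-instantiated Cutlass GEMM operators; one is picked per problem size
    // (and an SM80-specific variant when the hardware supports it).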
    GemmInt8Tensor_Clamp_AlignTensor_Little mGemmInt8ClampLittle;
    GemmInt8Tensor_Clamp_AlignTensor_Normal mGemmInt8ClampNormal;
    GemmInt8Tensor_Clamp_AlignTensor_Large mGemmInt8ClampLarge;
    GemmInt8Tensor_Clamp_AlignTensor_Normal_Sm80 mGemmInt8ClampNormalSm80;
    GemmSizeLevel mGemmShapeSizeLevel = GEMM_SIZE_NORMAL;
    int mGpuComputeCap = 75;
    int mActivationType = 0;
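    // Device scratch space handed to the selected Cutlass kernel.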
    std::shared_ptr<Tensor> workspaceTensor;
    void* mWorkspace;
};
} // namespace CUDA
} // namespace MNN
#endif /* ConvInt8CutlassExecution_hpp */
#endif /* ENABLE_CUDA_QUANT */