//
// Backend.hpp
// MNN
//
// Created by MNN on 2018/07/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef Backend_hpp
#define Backend_hpp
#include <MNN/MNNForwardType.h>
#include <MNN/ErrorCode.hpp>
#include <map>
#include "Command.hpp"
#include "NonCopyable.hpp"
#include "BufferAllocator.hpp"
#include <future>
#include <atomic>
namespace MNN {
struct Op;
class Execution;
class Runtime;
class Backend;
struct RuntimeHint {
// 0: Defer, 1: Eager
int memoryAllocatorType = 0;
int winogradMemoryUsed = 3;
// 0-100, 50 means a little core has 50% of the capacity of a large core
int cpuDecreaseRate = 50;
int dynamicQuantOption = 0;
// qkvQuantOption % 8:
// 0: Do not quantize
// 1: Only quantize key, use int8 asymmetric quantization
// 2: Only quantize value, use fp8 quantization
// 3: quantize both key and value
// 4: quantize query, key and value, and use gemm int8 kernel to compute K*V
// qkvQuantOption / 8:
// 1: use flash attention
int qkvQuantOption = 8;
// the kvcache size limit of each layer
// if the size of kvcache in memory exceeds the limit
// it will be moved to disk to save memory
// -1 for no limit
int kvcacheSizeLimit = -1;
// path of the kvcache directory
std::string kvcacheDirPath = "/tmp";
std::string midMemoryPath;
std::string weightMemoryPath;
int mmapFileSize = 1024; // MB
int useCachedMmap = 0;
// path of the NPU model directory
std::string npuModelDirPath;
// number of op encoders per commit
int encorderNumForCommit = 10;
int initThreadNumber = 0;
// whether to use Arm SME2 cores when threads > 1
bool useArmSme2Cores = true;
bool enableKleidiAI = false;
// Use CPU Ids
std::vector<int> cpuIds;
};
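// A minimal usage sketch of RuntimeHint (values are illustrative; the hint only takes
// effect once it is passed to a Runtime via setRuntimeHint, declared below):
//
//   RuntimeHint hint;
//   hint.memoryAllocatorType = 0;      // 0: Defer, 1: Eager
//   hint.qkvQuantOption      = 8 + 3;  // % 8 == 3: quantize key and value; / 8 == 1: flash attention
//   hint.kvcacheSizeLimit    = -1;     // no per-layer kvcache limit
//   hint.cpuIds              = {4, 5, 6, 7};
//   // runtime->setRuntimeHint(hint);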
/** abstract backend */
class Backend : public NonCopyable {
public:
/** info used to create backend */
struct Info {
/** forward type. */
MNNForwardType type = MNN_FORWARD_CPU;
/** numThread for CPU: number of threads. gpuMode for GPU only: tuning/memory mode setting. */
union {
int numThread = 4;
int gpuMode;
};
/** user data. */
BackendConfig* user = NULL;
enum Mode {
// The Op will be run in execution->onExecute
DIRECT = 0,
// The Op will be recorded; it runs in onExecuteBegin and is waited on in onExecuteEnd
INDIRECT = 1
};
Mode mode = DIRECT;
};
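// A minimal sketch of filling Info for a CPU backend (the BackendConfig instance, if any,
// is owned by the caller):
//
//   Backend::Info info;
//   info.type      = MNN_FORWARD_CPU;
//   info.numThread = 4;                      // GPU backends set info.gpuMode instead
//   info.mode      = Backend::Info::DIRECT;  // or INDIRECT to record ops and replay them
//   // info.user   = &backendConfig;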
/** backend buffer storage type */
enum StorageType {
/**
use NOT reusable memory.
- allocates memory when `onAcquireBuffer` is called.
- releases memory when `onReleaseBuffer` is called or when the backend is deleted.
- do NOTHING when `onClearBuffer` is called.
*/
STATIC,
/**
use reusable memory.
- allocates or reuses memory when `onAcquireBuffer` is called. prefers reusing.
- collects memory for reuse when `onReleaseBuffer` is called.
- releases memory when `onClearBuffer` is called or when the backend is deleted.
*/
DYNAMIC,
/**
use NOT reusable memory.
- allocates memory when `onAcquireBuffer` is called.
- do NOTHING when `onReleaseBuffer` is called.
- releases memory when `onClearBuffer` is called or when the backend is deleted.
*/
DYNAMIC_SEPERATE,
DYNAMIC_IN_EXECUTION
};
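// A minimal sketch of the dynamic-buffer lifecycle described above (`backend` is a Backend*
// created elsewhere and `t` is a Tensor* whose shape has already been set):
//
//   backend->onAcquireBuffer(t, Backend::DYNAMIC);  // allocate, preferring a reusable chunk
//   // ... resize / execute the ops that read or write t ...
//   backend->onReleaseBuffer(t, Backend::DYNAMIC);  // return the chunk to the reuse pool
//   backend->onClearBuffer();                       // free all DYNAMIC / DYNAMIC_SEPERATE memory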
public:
/**
* @brief initializer.
* @param type forward type.
*/
Backend(MNNForwardType type) : mType(type) {
// nothing to do
}
/**
* @brief deinitializer.
*/
virtual ~Backend() = default;
public:
/**
* @brief create execution for op with input and output tensors.
* @param inputs input tensors.
* @param outputs output tensors.
* @param op given op.
* @return created execution if op is supported, nullptr otherwise.
*/
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) = 0;
/**
* @brief callback before resize ops.
*/
virtual void onResizeBegin() {
// nothing to do
}
/**
* @brief callback after resize ops.
*/
virtual ErrorCode onResizeEnd() = 0;
/**
* @brief callback before executing ops.
*/
virtual void onExecuteBegin() const = 0;
/**
* @brief callback after executing ops.
*/
virtual void onExecuteEnd() const = 0;
virtual const Runtime* getRuntime() {
return nullptr;
}
/**
* @brief allocate buffer of tensor for given storage type.
* @param tensor buffer provider.
* @param storageType buffer storage type.
* @return success or not.
*/
MNN_PUBLIC bool onAcquireBuffer(const Tensor* tensor, StorageType storageType);
/**
* @brief release buffer of tensor for given storage type.
* @param tensor buffer provider.
* @param storageType buffer storage type.
* @return success or not.
*/
MNN_PUBLIC bool onReleaseBuffer(const Tensor* tensor, StorageType storageType);
class MemObj : public RefCount {
public:
MemObj() {}
virtual ~MemObj() {}
virtual MemChunk chunk() { return MemChunk(); }
};
/**
* @brief allocate buffer of tensor for given storage type.
* @param tensor buffer provider.
* @param storageType buffer storage type.
* @return MemObj used for later release; nullptr on failure.
*/
virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) = 0;
virtual bool onSelectDynamicAllocator(int index, int maxIndex) {
return false;
}
/**
* @brief get backend-specific buffer info from the tensor directly.
* @param tensor buffer provider.
* @param dstInfo destination the info is written to.
* @return supported or not
*/
virtual bool onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
return false;
}
/**
* @brief clear all dynamic buffers.
* @return success or not.
*/
virtual bool onClearBuffer() = 0;
/**
* @brief copy buffer from tensor to tensor.
* @param srcTensor source buffer provider.
* @param dstTensor dest buffer provider.
*/
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const = 0;
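// A minimal sketch of moving data between host and device with onCopyBuffer (hostTensor and
// deviceTensor are assumed to be prepared by the caller, the latter allocated on this backend):
//
//   backend->onCopyBuffer(hostTensor, deviceTensor);  // upload input
//   // ... run executions ...
//   backend->onCopyBuffer(deviceTensor, hostTensor);  // download result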
public:
/**
* @brief get forward type.
* @return forward type.
*/
inline MNNForwardType type() const {
return mType;
}
public:
/**
* @brief map a GPU tensor to a host pointer / unmap it
*/
virtual void* onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) {
return nullptr;
}
virtual bool onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) {
return false;
}
virtual int onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
return 0;
}
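// A minimal sketch of the map/unmap protocol (MAP_TENSOR_READ and the CAFFE dimension type
// are assumed from Tensor.hpp; backends without mapping support return nullptr, in which
// case onCopyBuffer should be used instead):
//
//   void* ptr = backend->onMapTensor(Tensor::MAP_TENSOR_READ, Tensor::CAFFE, deviceTensor);
//   if (ptr != nullptr) {
//       // read the tensor content through ptr on the host
//       backend->onUnmapTensor(Tensor::MAP_TENSOR_READ, Tensor::CAFFE, deviceTensor, ptr);
//   }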
public:
void* getMetaPtr() {
return mMetaPtr;
}
void setMetaPtr(void* ptr) {
mMetaPtr = ptr;
}
private:
const MNNForwardType mType;
void* mMetaPtr;
};
/** Each backend belongs to a runtime */
class Runtime : public NonCopyable {
public:
/**
Origin Op -> (Compiler) -> New Op -> Backend
By default Compiler_Geometry is used: Origin Op -> Compiler_Geometry -> Little Op
For several backends we can't use Geometry to decompose the origin op, so they set Compiler_Origin
*/
enum CompilerType {
Compiler_Geometry = 0,
Compiler_Origin = 1,
Compiler_Loop = 2,
};
enum AllocatorType {
Allocator_Defer = 0,
Allocator_Eager = 1,
};
void setRuntimeHint(const RuntimeHint& hint) {
mHint = hint;
}
const RuntimeHint& hint() const {
return mHint;
}
virtual CompilerType onGetCompilerType() const {
return Compiler_Loop;
}
virtual ~Runtime() = default;
/**
@brief create backend
@return created backend
*/
virtual Backend* onCreate(const BackendConfig* config = nullptr, Backend* origin = nullptr) const = 0;
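// A minimal sketch of the runtime -> backend flow (`runtime` is assumed to come from a
// registered RuntimeCreator and `config` is a BackendConfig owned by the caller):
//
//   Backend* bn = runtime->onCreate(&config);
//   // ... create executions on bn, resize and execute ...
//   runtime->onGabageCollect(100);  // release as much cached memory as possible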
/**
@brief reset runtime
*/
virtual void onReset(int numberThread, const BackendConfig* config, bool full) {
// Do nothing
}
/**
@brief clear unused resources
@param level clear level: 0 - 100; bigger means clear more, smaller means cache more
*/
virtual void onGabageCollect(int level) = 0;
/**
@brief Measure the memory it uses, in MB
*/
virtual float onGetMemoryInMB() {
return 0.0f;
}
// For NPU backends that don't support loading from a buffer, use onSetCachePath
virtual bool onSetCachePath(const char* path, int mode) {
return false;
}
// If buffer is not nullptr, try to copy the cache; otherwise delete the cache
virtual bool onSetCache(const void* buffer, size_t size) {
// by default the cache is considered valid, to avoid it being reset
return true;
}
virtual std::pair<const void*, size_t> onGetCache() {
return std::make_pair(nullptr, 0);
}
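// A minimal sketch of persisting a tuning cache across sessions (file I/O is left to the
// caller; the buffer handed to onSetCache is copied by the runtime if it accepts it):
//
//   auto cache = runtime->onGetCache();   // {pointer, size}; may be {nullptr, 0}
//   // ... write cache.first / cache.second to disk ...
//   // later, after reading the bytes back into `data` / `size`:
//   runtime->onSetCache(data, size);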
virtual int onGetRuntimeStatus(RuntimeStatus statusEnum) const {
return 0;
}
// If the info the user set can't be matched by the runtime, return false and set the real info
virtual bool onCheckInfo(Backend::Info& info) const {
return true;
}
struct OpInfo {
bool initCostLong;
float exeutionCost; // In ms
float initCost; // In ms
};
/**
* @brief measure the cost for op with input and output tensors.
* @param inputs input tensors.
* @param outputs output tensors.
* @param op given op.
* @param dstInfo the info to write.
* @return whether the op is supported.
*/
virtual bool onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, OpInfo& dstInfo) const {
return true;
}
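// A minimal sketch of querying the estimated cost of an op before scheduling it
// (inputs, outputs and op are assumed to be prepared by the caller):
//
//   Runtime::OpInfo opInfo;
//   if (runtime->onMeasure(inputs, outputs, op, opInfo)) {
//       // opInfo.exeutionCost and opInfo.initCost are in ms; initCostLong flags slow setup
//   }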
// FIXME: Temporarily used to mark the cache as valid; will be deleted in the future
virtual void onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
// Do nothing
}
// FIXME: Temporarily used; will be refactored in the future
std::atomic_bool mCancelled = ATOMIC_VAR_INIT(false);
MNN_PUBLIC bool hasAsyncWork() const;
void setAsyncWork(std::future<int>&& future);
MNN_PUBLIC void waitAsyncWork();
virtual void onConcurrencyBegin() const {
// Do nothing
}
virtual void onConcurrencyEnd() const {
// Do nothing
}
mutable int pCurrentStatus = 0; // NO_ERROR
// TODO: Move to Backend
void* pMeta = nullptr;
private:
std::future<int> mFuture;
RuntimeHint mHint;
};
/** abstract Runtime register */
class RuntimeCreator {
public:
/**
@brief deinitializer.
*/
virtual ~RuntimeCreator() = default;
virtual Runtime* onCreate(const Backend::Info& info) const = 0;
/**
@brief Adjust info to a supported configuration.
@param info info to validate.
@return success or not
*/
virtual bool onValid(Backend::Info& info) const {
info.mode = Backend::Info::DIRECT;
return true;
}
virtual bool onGetDeviceInfo(const std::string& deviceKey, std::string& deviceValue) const {
return false;
}
protected:
/**
@brief initializer.
*/
RuntimeCreator() = default;
};
/**
* @brief get registered backend creator for given forward type.
* @param type given forward type.
* @return backend creator pointer if registered, nullptr otherwise.
*/
MNN_PUBLIC const RuntimeCreator* MNNGetExtraRuntimeCreator(MNNForwardType type);
/**
* @brief register backend creator for given forward type.
* @param type given forward type.
* @param creator registering backend creator.
* @return true if backend creator for given forward type was not registered before, false otherwise.
*/
MNN_PUBLIC bool MNNInsertExtraRuntimeCreator(MNNForwardType type, const RuntimeCreator* creator,
bool needCheck = false);
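// A minimal sketch of registering and looking up a custom runtime creator (MyRuntimeCreator
// and the MNN_FORWARD_USER_0 slot are illustrative; the creator must outlive all lookups):
//
//   static MyRuntimeCreator gCreator;  // subclass of RuntimeCreator
//   MNNInsertExtraRuntimeCreator(MNN_FORWARD_USER_0, &gCreator);
//   const RuntimeCreator* creator = MNNGetExtraRuntimeCreator(MNN_FORWARD_USER_0);
//   if (creator != nullptr) {
//       Backend::Info info;
//       info.type = MNN_FORWARD_USER_0;
//       Runtime* runtime = creator->onCreate(info);
//   }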
MNN_PUBLIC bool MNNCPUCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor);
} // namespace MNN
#endif /* Backend_hpp */