MNN/source/backend/metal/MetalBackend.mm

//
// MetalBackend.mm
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#import "backend/metal/MetalBackend.hpp"
#define MNN_METAL
#import <MNN/MNNSharedContext.h>
#define METAL_CONST_BUFFER_LIMIT 128
#define METAL_SEPERATE_MAX_COUNT 2
#if MNN_METAL_ENABLED
#include <mutex>
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "core/TensorUtils.hpp"
#include "MetalCache_generated.h"
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
if (nullptr == content || nullptr == tensor) {
return 0;
}
auto t = (MNN::Tensor*)tensor;
auto des = MNN::TensorUtils::getDescribe(t);
content->buffer = ((MNN::MetalRuntimeAllocator::MetalBufferAlloc*)t->deviceId())->getBuffer();
content->texture = nil;
content->offset = des->extra.offset;
return 0;
}
namespace MNN {
static void _MetalApplyTensor(uint8_t* host, size_t offset, Tensor* t) {
// host points to the backing MetalBufferAlloc; the byte offset is stored on the tensor describe
t->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(t);
des->extra.offset = offset;
}
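// Dynamic-memory strategy: with Runtime::Allocator_Defer and a second resize, requests are
// only recorded and materialized later in one pass (DeferBufferAllocator); otherwise an
// EagerBufferAllocator is layered over the static pool (or over the mmap-backed cache when
// hint().weightMemoryPath is in use). A hedged sketch of how execution code consumes the
// pool, with the 1 KiB size purely illustrative:
//
//   BufferAllocator* pool = backend->getBufferPool();
//   MemChunk chunk = pool->alloc(1024, false);                  // scratch memory
//   // ... bind it with MetalBackend::setMem(chunk, encoder, index) ...
//   pool->free(chunk);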
BufferAllocator* MetalRuntime::createDynamicAllocator(int index, bool secondResize) const {
if (hint().memoryAllocatorType == Runtime::Allocator_Defer && secondResize) {
return new DeferBufferAllocator(buffer(index), 1024, _MetalApplyTensor);
}
if (mStaticCache.get() != nullptr) {
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticCache.get()), 1024);
}
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStatic.get()), 1024);
}
struct TunedInfo {
std::vector<std::unique_ptr<MetalCache::OpInfoT>> mInfos;
};
void registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
extern void registerMetalRenderOps();
#endif
static inline std::map<OpType, MetalBackend::Creator *> *getCreatorMap() {
static std::once_flag of;
static std::map<OpType, MetalBackend::Creator *> *ret = nullptr;
std::call_once(of, [&]() { ret = new std::map<OpType, MetalBackend::Creator *>; });
return ret;
}
void MetalBackend::addCreator(OpType t, Creator *c) {
auto map = getCreatorMap();
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type has be added\n", t);
}
map->insert(std::make_pair(t, c));
}
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime, bool usefp16AsFp32, BackendConfig::MemoryMode mode) : Backend(MNN_FORWARD_METAL),
mEmptyMem(nil)
{
mRuntime = runtime;
auto ctx = (__bridge MNNMetalContext *)runtime->context();
mBufferPool.reset(runtime->createDynamicAllocator(0, false));
mCurrentAllocator = mBufferPool.get();
mStaticBufferPool = staticMem;
mUseFloatAsFp16 = usefp16AsFp32;
mMemoryMode = mode;
mIsIphone = ctx.isIphone;
if (runtime->getCommandQueue() == nil) {
// one command queue can only create a limited number of command buffers, so let each backend own its own command queue
_commandQueue = [[ctx device] newCommandQueue];
mSupportDeferEncode = true;
} else {
// otherwise forbid defer encode optimize
_commandQueue = runtime->getCommandQueue();
mSupportDeferEncode = false;
}
_commandBuffer = nil;
_commandBuffer_net = nil;
_waiting = nil;
}
MetalBackend::~MetalBackend() {
flushEncoder();
}
id<MTLComputeCommandEncoder> MetalBackend::encoder_net() const {
id<MTLComputeCommandEncoder> result = [getCommandBufferForNet() computeCommandEncoder];
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
result.label = nil;
#endif
return result;
}
void *MetalBackend::context() const {
return mRuntime->context();
}
class MetalMemRelease : public Backend::MemObj {
public:
MetalMemRelease(MemChunk buffer, BufferAllocator* allocator) {
mBuffer = buffer;
mAllocator = allocator;
}
virtual ~ MetalMemRelease() {
mAllocator->free(mBuffer);
}
MemChunk chunk() override {
return mBuffer;
}
private:
MemChunk mBuffer;
BufferAllocator* mAllocator;
};
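// getTensorSizeInBytes pads NC4HW4 tensors so the channel count is a multiple of 4 and adds
// slack for kernels that read 4 elements along width or 16 along batch*plane. A worked
// example with illustrative numbers: for an fp32 tensor of shape [1, 3, 5, 5] in NC4HW4 with
// fp16 storage enabled, alignC = 4, bhw = 25, bhwR = 32 - 25 = 7, hR = 8 - 5 = 3,
// wR = 8 - 5 = 3, extraPadding = max(7, 3 * 5 + 3) = 18, so size = 1*4*5*5 + 18*4 = 172
// elements, stored as half = 344 bytes, rounded up to the 16-byte alignment = 352.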
size_t MetalBackend::getTensorSizeInBytes(const Tensor* tensor) const {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
size_t size;
if (MNN_DATA_FORMAT_NC4HW4 == format && tensor->dimensions() >= 2) {
int width = 1;
int height = 1;
int batch = tensor->length(0);
int channel = tensor->length(1);
if (tensor->dimensions() >= 3) {
height = tensor->length(2);
}
for (int i=3; i<tensor->dimensions(); ++i) {
width *= tensor->length(i);
}
int alignC = ROUND_UP(channel, 4);
int hR = ROUND_UP(height, 4) - height;
// width is processed 4 elements at a time, so reads may run past the end by up to 3 elements
int wR = ROUND_UP(width + 3, 4) - width;
int bhw = batch * width * height;
int bhwR = UP_DIV(bhw, 16) * 16 - bhw;
int extraPadding = ALIMAX(bhwR, (hR * width + wR));
size = batch * alignC * width * height;
size = size + extraPadding * 4;
} else {
size = 1;
for (int i=0; i<tensor->dimensions(); ++i) {
size *= tensor->length(i);
}
size = ROUND_UP(size, 4);
}
if (0 == size) {
return 0;
}
// when fp16 mode is enabled, 32-bit float tensors are stored as 2-byte half values
if (halide_type_float == tensor->buffer().type.code && tensor->buffer().type.bits == 32 && mUseFloatAsFp16) {
size *= 2;
} else {
size *= tensor->getType().bytes();
}
size_t align = 4 * sizeof(int);
size = ROUND_UP(size, align);
return size;
}
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
auto tensor = const_cast<Tensor *>(_tensor);
size_t size = getTensorSizeInBytes(_tensor);
if (0 == size) {
return nullptr;
}
// reuse if possible
MemChunk buffer;
BufferAllocator* allocator = nullptr;
switch (storageType) {
case Backend::STATIC: {
buffer = mStaticBufferPool->alloc(size, false);
allocator = mStaticBufferPool.get();
} break;
case Backend::DYNAMIC: {
buffer = mCurrentAllocator->alloc(size, false);
allocator = mCurrentAllocator;
} break;
case Backend::DYNAMIC_SEPERATE: {
buffer = mCurrentAllocator->alloc(size, true);
allocator = mCurrentAllocator;
} break;
default:{
break;
}
}
if (storageType == Backend::STATIC) {
if(nullptr == buffer.first) {
MNN_ERROR("onAcquireBuffer error!\n");
return nullptr;
}
} else {
buffer.attach(tensor);
}
if (nullptr == buffer.first) {
_MetalApplyTensor((uint8_t*)(&mEmptyMem), 0, (Tensor*)_tensor);
} else {
_MetalApplyTensor((uint8_t*)buffer.first, buffer.second, (Tensor*)_tensor);
}
return new MetalMemRelease(buffer, allocator);
}
bool MetalBackend::onClearBuffer() {
mCurrentAllocator->release(true);
if (nullptr != mRuntime->mStaticCache.get()) {
mStaticBufferPool = mRuntime->mStaticCache;
}
return true;
}
Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const Op *op) {
auto map = getCreatorMap();
auto iter = map->find(op->type());
if (iter == map->end()) {
mSupportDeferEncode = false;
if (nullptr != op->name()) {
MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str());
} else {
MNN_PRINT("Don't support type [%s]\n", EnumNameOpType(op->type()));
}
return NULL;
}
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
auto exe = iter->second->onCreate(inputs, op, this, outputs);
if (NULL == exe) {
mSupportDeferEncode = false;
MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : "");
return NULL;
}
return exe;
}
void MetalBackend::flushEncoder() const {
if (nil != mComputeEncoder) {
[mComputeEncoder endEncoding];
mComputeEncoder = nil;
}
}
void MetalBackend::_resetDynamicMemory() const {
mRuntime->pCurrentStatus = mCurrentAllocator->apply();
if (NO_ERROR != mRuntime->pCurrentStatus) {
return;
}
if (nullptr != mBufferPoolShapeImmutable.get()) {
mRuntime->pCurrentStatus = mBufferPoolShapeImmutable->apply();
}
}
void MetalBackend::onExecuteBegin() const {
_resetDynamicMemory();
mEncoderCount = 0;
}
void MetalBackend::onExecuteEnd() const {
flushEncoder();
commit_net();
}
BufferAllocator* MetalBackend::getBufferPool() const {
return mCurrentAllocator;
}
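// onSelectDynamicAllocator switches between two dynamic pools: index 0 is the regular
// resize-time pool, while index 1 (created lazily once maxIndex == 2) appears to be reserved
// for allocations whose shapes stay immutable across resizes, so they can be planned
// separately from the regular pool.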
bool MetalBackend::onSelectDynamicAllocator(int index, int maxIndex) {
if (maxIndex > 2) {
return false;
}
if (maxIndex == 2 && mBufferPoolShapeImmutable.get() == nullptr) {
mBufferPoolShapeImmutable.reset(mRuntime->createDynamicAllocator(1, true));
mBufferPool.reset(mRuntime->createDynamicAllocator(0, true));
}
if (1 == index) {
mCurrentAllocator = mBufferPoolShapeImmutable.get();
} else {
mCurrentAllocator = mBufferPool.get();
}
return true;
}
bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
if (nullptr == dstInfo) {
return true;
}
auto dst = (MNNMetalTensorContent*)dstInfo;
dst->type.code = halide_type_float;
if (mUseFloatAsFp16) {
dst->type.bits = 16;
} else {
dst->type.bits = 32;
}
MNNMetalGetTensorContent(dst, (void*)tensor);
return true;
}
bool MetalBackend::isCmdBufferCommit() {
auto ctx = (__bridge MNNMetalContext *)context();
//TODO: set magic number
const int magicNum = mRuntime->hint().encorderNumForCommit;
mEncoderCount++;
if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
return true;
}
return false;
}
id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
// reuse
if (nullptr != mHostBuffer && mHostBuffer.length >= size) {
return mHostBuffer;
}
// create a larger one
auto context = (__bridge MNNMetalContext *)this->context();
mHostBuffer = [context newDeviceBuffer:size access:CPUReadWrite];
return mHostBuffer;
}
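// getConstBuffer hands out small uniform buffers (at least METAL_CONST_BUFFER_LIMIT bytes)
// and recycles the ones handed back through returnConstBuffer via mHoldBuffers. A hedged
// usage sketch from inside an execution, where `backend` is the owning MetalBackend and the
// constant value is a placeholder:
//
//   id<MTLBuffer> param = backend->getConstBuffer(4 * sizeof(int));
//   ((int*)param.contents)[0] = someValue;           // fill the constants (placeholder)
//   [encoder setBuffer:param offset:0 atIndex:2];
//   backend->returnConstBuffer(param);               // give it back for reuse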
id<MTLBuffer> MetalBackend::getConstBuffer(size_t size) const {
if (size < METAL_CONST_BUFFER_LIMIT) {
if (!mHoldBuffers.empty()) {
auto res = mHoldBuffers.front();
mHoldBuffers.pop();
return res;
}
size = METAL_CONST_BUFFER_LIMIT;
}
auto context = (__bridge MNNMetalContext *)this->context();
auto buffer = [context newDeviceBuffer:size access:CPUReadWrite];
return buffer;
}
void MetalBackend::returnConstBuffer(id<MTLBuffer> buffer) const {
mHoldBuffers.push(buffer);
}
static inline void _getNCPlane(const Tensor* tensor, int& s, int& c, int& b) {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
s = 1, c = 1, b = 1;
b = tensor->length(0);
if (format == MNN_DATA_FORMAT_NHWC) {
c = tensor->length(tensor->dimensions()-1);
for (int i=1; i<tensor->dimensions()-1; ++i) {
s *= tensor->length(i);
}
} else {
c = tensor->length(1);
for (int i=2; i<tensor->dimensions(); ++i) {
s *= tensor->length(i);
}
}
}
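// getTensorShape fills the 8-int constant buffer consumed by the conversion kernels below:
// ints [0..3] hold {batch, channel, plane, 1} and ints [4..7] hold the batch/channel/plane
// strides for the given layout; the returned MTLSize is the total thread count, one thread
// per (batch, plane, channel/4). For a [1, 3, 5, 5] NCHW tensor this yields
// size = {1, 3, 25, 1}, stride = {75, 25, 1, 1} and 25 threads.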
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
int s, b, c;
_getNCPlane(tensor, s, c, b);
int z = UP_DIV(c, 4);
// shape
((int *)shape.contents)[0] = b;
((int *)shape.contents)[1] = c;
((int *)shape.contents)[2] = s;
((int *)shape.contents)[3] = 1;
// stride
if (format == MNN_DATA_FORMAT_NHWC) {
((int *)shape.contents)[4] = s * c;
((int *)shape.contents)[5] = 1;
((int *)shape.contents)[6] = c;
((int *)shape.contents)[7] = 1;
} else {
((int *)shape.contents)[4] = s * c;
((int *)shape.contents)[5] = s;
((int *)shape.contents)[6] = 1;
((int *)shape.contents)[7] = 1;
}
// threads
MTLSize threads = {(NSUInteger)s * b * z, 1, 1};
return threads;
}
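// The three Metal sources below (gTranspose, gNC4HW4Convert, gCopy) are compiled on demand
// with IType/OType injected as preprocessor macros, so a single string serves every
// input/output element-type pair (half4, float4, char, ...). Compiled pipelines are memoized
// in the runtime via findPipeline/insertPipeline, keyed by {kernel name, IType, OType}.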
static const char* gTranspose = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
uint4 size; // n, c, plane, 1
uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
int channel = uConstant.size.y;
if (gid < channel * uConstant.size.x * uConstant.size.z) {
int tmp = gid % (channel * uConstant.size.x);
int x = gid / (channel * uConstant.size.x);
int b = tmp / channel;
int c = tmp % channel;
int outPos = b * uConstant.size.y * uConstant.size.z + c * uConstant.size.z + x;
int inPos = b * uConstant.size.y * uConstant.size.z + c + x * uConstant.size.y;
out[outPos] = (OType)(in[inPos]);
}
})metal";
static const char* gNC4HW4Convert = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
uint4 size; // n, c, plane, 1
uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
int channelC4 = (uConstant.size.y + 3) / 4;
if (gid < channelC4 * uConstant.size.x * uConstant.size.z)
{
int3 pos;
pos.z = gid % (channelC4 * uConstant.size.x);
pos.y = gid / (channelC4 * uConstant.size.x);
pos.x = 0;
int batchIndex = pos.z / channelC4;
int zDiv4 = pos.z % channelC4;
int lastZ = uConstant.size.y / 4;
int cIndex = uConstant.size.y % 4;
int z = zDiv4*4;
int basicOffset = 0
+ batchIndex*uConstant.stride.x
+ z * uConstant.stride.y
+ pos.y * uConstant.stride.z
;
#ifdef MNN_OUTPUT_C4
OType color = OType(0);
if(zDiv4 == lastZ)
{
if(cIndex == 1)
{
color.r = in[basicOffset+0];
color.g = 0.0;
color.b = 0.0;
color.a = 0.0;
}
else if(cIndex == 2)
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = 0.0;
color.a = 0.0;
}
else
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = in[basicOffset+2*uConstant.stride.y];
color.a = 0.0;
}
}
else
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = in[basicOffset+2*uConstant.stride.y];
color.a = in[basicOffset+3*uConstant.stride.y];
}
out[0
+ pos.y
+ uConstant.size.x * uConstant.size.z*zDiv4
+ batchIndex*uConstant.size.z
] = color;
#else
IType color = in[0
+ pos.y
+ uConstant.size.x * uConstant.size.z*zDiv4
+ batchIndex*uConstant.size.z
];
if(zDiv4 == lastZ)
{
if(cIndex == 1)
{
out[basicOffset+0*uConstant.stride.y] = color.r;
}
else if(cIndex == 2)
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
}
else
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
out[basicOffset+2*uConstant.stride.y] = color.b;
}
}
else
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
out[basicOffset+2*uConstant.stride.y] = color.b;
out[basicOffset+3*uConstant.stride.y] = color.a;
}
#endif
}
}
)metal";
static const char* gCopy = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buffer(1)]], constant uint4& limit [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
if (gid < limit.x) {
out[int(gid)] = (OType)in[int(gid)];
}
})metal";
void MetalBackend::onResizeBegin() {
// Abort last inference task if needed
flushEncoder();
_commandBuffer_net = nil;
_commandBuffer = nil;
wait();
mCurrentAllocator->reset();
}
ErrorCode MetalBackend::onResizeEnd() {
auto ctx = (__bridge MNNMetalContext *)context();
return mCurrentAllocator->compute();
}
static std::string _getType(const halide_type_t& type, MNN_DATA_FORMAT format, bool useFp16AsFp32) {
std::string res;
if (type.code == halide_type_float) {
if (useFp16AsFp32) {
res = "half";
} else {
res = "float";
}
} else {
switch (type.bytes()) {
case 1:
res = "char";
break;
case 2:
res = "short";
break;
case 4:
res = "int";
break;
default:
MNN_ASSERT(false);
break;
}
}
if (format == MNN_DATA_FORMAT_NC4HW4) {
return res + "4";
}
return res;
}
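// _makeCopyInfo selects the conversion kernel for a tensor copy (as used by onCopyBuffer
// further below):
//   - same layout, or rank <= 1   -> gCopy, one thread per 4 elements;
//   - NC4HW4 on either side       -> gNC4HW4Convert, with MNN_OUTPUT_C4 defined when the
//                                    destination is NC4HW4;
//   - NCHW <-> NHWC               -> gTranspose.
// castType steers the fp16/fp32 choice: 1 keeps the source fp32 (host input), 2 keeps the
// destination fp32 (host output), 0 converts both sides according to mUseFloatAsFp16.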
MetalBackend::CopyPipeline MetalBackend::_makeCopyInfo(const Tensor *src, const Tensor *dst, id<MTLBuffer> shape, int castType) const {
auto ctx = (__bridge MNNMetalContext *)context();
MetalBackend::CopyPipeline res;
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
if (shape == nil) {
shape = getConstBuffer(8 * sizeof(int));
}
res.shape = shape;
if (sfmt == dfmt || src->dimensions() <= 1) {
auto srcType = _getType(src->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 1);
auto dstType = _getType(dst->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 2);
auto size = dst->elementSize();
size = UP_DIV(size, 4);
std::vector<std::string> keys = {
"copyC4",
srcType,
dstType
};
((uint32_t*)[shape contents])[0] = size;
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gCopy, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
res.localSize = MTLSizeMake(256, 1, 1);
res.pipeline = pipeline;
return res;
}
auto srcType = _getType(src->getType(), sfmt, mUseFloatAsFp16 && castType != 1);
auto dstType = _getType(dst->getType(), dfmt, mUseFloatAsFp16 && castType != 2);
if (sfmt == MNN_DATA_FORMAT_NC4HW4 || dfmt == MNN_DATA_FORMAT_NC4HW4) {
auto normalTensor = dst;
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
normalTensor = src;
}
// convert C4 / NCHW
std::vector<std::string> keys = {
"c4convert",
srcType,
dstType
};
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
keys.emplace_back("outputc4");
}
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
[dic setValue:@"1" forKey:@"MNN_OUTPUT_C4"];
}
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gNC4HW4Convert, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.pipeline = pipeline;
auto size = getTensorShape(shape, normalTensor);
auto gl = [ctx computeBestGroupAndLocal:pipeline threads:size];
res.groupSize = gl.first;
res.localSize = gl.second;
return res;
}
// NCHW <-> NHWC
std::vector<std::string> keys = {
"transpose",
srcType,
dstType
};
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gTranspose, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.pipeline = pipeline;
int n, c, plane;
_getNCPlane(dst, plane, c, n);
auto shapePtr = (uint32_t*)shape.contents;
shapePtr[0] = n;
shapePtr[3] = 1;
if (MNN_DATA_FORMAT_NHWC == dfmt) {
shapePtr[1] = plane;
shapePtr[2] = c;
} else {
shapePtr[1] = c;
shapePtr[2] = plane;
}
auto size = plane * n * c;
res.localSize = MTLSizeMake(256, 1, 1);
res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
return res;
}
static void _execute(id<MTLComputeCommandEncoder> encoder, const MetalBackend::CopyPipeline& info, std::pair<id<MTLBuffer>, int> src, std::pair<id<MTLBuffer>, int> dst) {
[encoder setComputePipelineState:info.pipeline];
[encoder setBuffer:src.first offset:src.second atIndex:0];
[encoder setBuffer:dst.first offset:dst.second atIndex:1];
[encoder setBuffer:info.shape offset:0 atIndex:2];
[encoder dispatchThreadgroups:info.groupSize threadsPerThreadgroup:info.localSize];
}
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape, int castType) const {
auto ctx = (__bridge MNNMetalContext *)context();
auto info = _makeCopyInfo(src, dst, shape, castType);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, MetalBackend::getBuffer(src), MetalBackend::getBuffer(dst));
if (standalone) {
[encoder endEncoding];
MNN_PRINT_ENCODER(ctx, encoder);
}
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
commit_net();
_resetDynamicMemory();
onCopyBuffer(src, dst, nil, nil);
}
id<MTLComputeCommandEncoder> MetalBackend::encoder_for_net() const {
if (nil == mComputeEncoder) {
mComputeEncoder = encoder_net(); // TODO: decide which command buffer to use
}
return mComputeEncoder;
}
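// onCopyBuffer covers three cases: device->device encodes a conversion kernel; device->host
// optionally converts into the staging host buffer, commits, waits, then memcpys out;
// host->device memcpys into the staging buffer and encodes the conversion, or copies straight
// into the MTLBuffer when no conversion is needed. A hedged sketch of reading a result back
// to the CPU through the public MNN tensor API (here `output` is assumed to be a Tensor*
// resident on this backend):
//
//   MNN::Tensor hostTensor(output, MNN::Tensor::CAFFE);   // NCHW host-side copy
//   output->copyToHostTensor(&hostTensor);                // lands in the device->host path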
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);
if (!src->buffer().host && !dst->buffer().host) {
onCopyDeviceToDevice(src, dst, encoder, shape);
return;
}
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
bool formatDiff = sfmt != dfmt && src->dimensions() > 1;
auto floats = src->getType().code == halide_type_float;
bool dataTypeDiff = floats && mUseFloatAsFp16;
bool needConvert = formatDiff || dataTypeDiff;
if (!src->buffer().host && dst->buffer().host) {
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)src->deviceId())->getBuffer();
auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(src)->extra.offset;
if (needConvert) {
auto tDst = const_cast<Tensor*>(dst);
auto tmpBuffer = getHostBuffer(dst->usize());
auto info = _makeCopyInfo(src, dst, shape, 2);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, MetalBackend::getBuffer(src), std::make_pair(tmpBuffer, 0));
if (standalone) {
[encoder endEncoding];
}
commit();
devicePtr = (uint8_t*)tmpBuffer.contents;
}
wait();
::memcpy(dst->host<void>(), devicePtr, dst->usize());
return;
}
if (src->buffer().host && !dst->buffer().host) {
// For a user-provided command queue, the user must ensure the previous frame's GPU work has finished
bool needWait = !mRuntime->userSync();
if (needWait) {
wait();
}
auto srcSize = src->usize();
if (needConvert) {
auto tmpBuffer = getHostBuffer(srcSize);
::memcpy(tmpBuffer.contents, src->host<void>(), srcSize);
auto info = _makeCopyInfo(src, dst, shape, 1);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, std::make_pair(tmpBuffer, 0), MetalBackend::getBuffer(dst));
if (standalone) {
[encoder endEncoding];
}
commit();
} else {
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)dst->deviceId())->getBuffer();
auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(dst)->extra.offset;
::memcpy(devicePtr, src->host<void>(), srcSize);
}
return;
}
MNN_ASSERT(false); // should not be handled here
}
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
commit_net();
if (toCpu) {
wait();
}
return 0;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForBufferCopy() const {
if (nil == _commandBuffer) {
_commandBuffer = [_commandQueue commandBuffer];
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer_net = _commandBuffer;
}
}
return _commandBuffer;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForNet() const {
if (nil == _commandBuffer_net) {
_commandBuffer_net = [_commandQueue commandBuffer];
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer = _commandBuffer_net;
}
}
return _commandBuffer_net;
}
void MetalBackend::setTensor(const MNN::Tensor* tensor, id<MTLComputeCommandEncoder> encoder, int index) {
[encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer() offset:TensorUtils::getDescribe(tensor)->extra.offset atIndex:index];
}
void MetalBackend::setMem(const MemChunk& chunk, id<MTLComputeCommandEncoder> encoder, int index) {
[encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer() offset:chunk.second atIndex:index];
}
uint8_t* MetalBackend::getMemPtr(const MemChunk& chunk) {
return (uint8_t*)((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer().contents + chunk.second;
}
std::pair<id<MTLBuffer>, int> MetalBackend::getBuffer(const MNN::Tensor* tensor) {
return std::make_pair(((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer(), TensorUtils::getDescribe(tensor)->extra.offset);
}
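// Command-buffer lifecycle: _commandBuffer carries host<->device copies and _commandBuffer_net
// carries the network's compute work; when defer-encode is disabled both names refer to the
// same buffer. commit()/commit_net() submit the corresponding buffer and remember it in
// _waiting so a later wait() can block until the GPU finishes. The internal sequence used by
// onSync(), for example, is roughly:
//
//   commit_net();   // submit the encoded compute work
//   wait();         // block until the GPU signals completion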
void MetalBackend::commit() const {
if (nil != _commandBuffer && _commandBuffer.status < MTLCommandBufferStatusCommitted) {
[_commandBuffer commit];
_waiting = _commandBuffer;
_commandBuffer = nil;
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer_net = nil;
}
}
}
void MetalBackend::commit_net() const {
if (nil != _commandBuffer_net && _commandBuffer_net.status < MTLCommandBufferStatusCommitted) {
[_commandBuffer_net commit];
_waiting = _commandBuffer_net;
_commandBuffer_net = nil;
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer = nil;
}
}
}
void MetalBackend::wait() const {
if (nil != _waiting) {
auto buffer = _waiting;
if (buffer.status >= MTLCommandBufferStatusCompleted) {
_waiting = nil;
return;
}
#if MNN_METAL_BENCHMARK
NSTimeInterval begin = [NSDate timeIntervalSinceReferenceDate];
[buffer waitUntilCompleted];
NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
if (@available(iOS 10.3, *)) {
printf("[METAL] commit costs: %.3fms\t(kernel: %.3fms, GPU: %.3fms)\n", (end - begin) * 1000.f,
(buffer.kernelEndTime - buffer.kernelStartTime) * 1000.f,
(buffer.GPUEndTime - buffer.GPUStartTime) * 1000.f);
} else {
printf("[METAL] commit costs: %.3fms\n", (end - begin) * 1000.f);
}
#else
[buffer waitUntilCompleted];
#endif
#if MNN_METAL_DEBUG
if (buffer.error) {
printf("[METAL] %s\n", buffer.error.localizedDescription.UTF8String);
}
#endif
}
_waiting = nil;
}
id<MTLComputePipelineState> MetalBackend::makeComputePipelineWithSourceOption(const char* csource, const char* cname, MTLCompileOptions *options) const{
auto ctx = (__bridge MNNMetalContext *)context();
auto source = [[NSString alloc] initWithUTF8String:csource];
auto name = [[NSString alloc] initWithUTF8String:cname];
auto pipeline = [ctx pipelineWithSourceOption:source name:name options:options];
if (nil == pipeline) {
mRuntime->pCurrentStatus = NOT_SUPPORT;
}
return pipeline;
}
void MetalRuntime::setCommandQueue(id<MTLCommandQueue> queue, bool userSync) {
mQueue = queue;
mUserSync = userSync;
}
id<MTLComputePipelineState> MetalRuntime::findPipeline(const std::vector<std::string>& keys) const {
auto iter = mCachePipeine.find(keys);
if (iter == mCachePipeine.end()) {
return nil;
}
return iter->second;
}
void MetalRuntime::insertPipeline(const std::vector<std::string>& keys, id<MTLComputePipelineState> pipeline) const {
if (nil != pipeline) {
mCachePipeine.insert(std::make_pair(keys, pipeline));
}
}
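// setGpuMode decodes the bit flags passed through Backend::Info::gpuMode. The memory flags
// (MNN_GPU_MEMORY_BUFFER / MNN_GPU_MEMORY_IMAGE) have no effect on Metal; exactly one tuning
// flag is expected. A hedged sketch of how a caller could request fast tuning, assuming the
// usual ScheduleConfig plumbing forwards `mode` here as mode_num:
//
//   MNN::ScheduleConfig config;
//   config.type = MNN_FORWARD_METAL;
//   config.mode = MNN_GPU_TUNING_FAST;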
void MetalRuntime::setGpuMode(const int mode_num) {
int totalSet = 0;
bool isSet = (mode_num & MNN_GPU_MEMORY_BUFFER);
if(isSet) {
totalSet++;
}
isSet = (mode_num & MNN_GPU_MEMORY_IMAGE);
if(isSet) {
totalSet++;
}
if(totalSet > 0) {
MNN_PRINT("warning: set BUFFER and IMAGE mode is not useful for metal, it doesn't matter, cl_mode:%x\n", mode_num);
}
totalSet = 0;
isSet = (mode_num & MNN_GPU_TUNING_NONE);
if(isSet) {
mTuneLevel = Never;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_FAST);
if(isSet) {
mTuneLevel = Fast;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_NORMAL);
if(isSet) {
mTuneLevel = Normal;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_HEAVY);
if(isSet) {
mTuneLevel = Heavy;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_WIDE);
if(isSet) {
mTuneLevel = Wide;
totalSet++;
}
if(totalSet != 1) {
MNN_PRINT("set multi tuning mode is not permitted, please check cl_mode:%x\n", mode_num);
}
}
struct MetalContext {
std::mutex pLock;
MNNMetalContext* pContext;
id<MTLDevice> pDevice;
};
static MetalContext* gContext = nullptr;
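// MetalRuntime::create accepts an optional MNNMetalSharedContext so the caller can supply its
// own MTLDevice / MTLCommandQueue (for example to interoperate with an existing renderer).
// A hedged sketch of passing one in, where `myDevice` and `myQueue` are assumed to be the
// caller's existing Metal objects:
//
//   MNNMetalSharedContext shared;
//   shared.device = myDevice;
//   shared.queue  = myQueue;
//   MNN::BackendConfig config;
//   config.sharedContext = &shared;
//   // config then reaches this function through Backend::Info::user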
MetalRuntime* MetalRuntime::create(const Backend::Info& info) {
std::unique_lock<std::mutex> _l(gContext->pLock);
MNNMetalSharedContext sharedContext;
sharedContext.device = nil;
sharedContext.queue = nil;
if (info.user != nullptr) {
if (info.user->sharedContext != nullptr) {
sharedContext.device = ((MNNMetalSharedContext*)info.user->sharedContext)->device;
sharedContext.queue = ((MNNMetalSharedContext*)info.user->sharedContext)->queue;
}
}
if (nil == sharedContext.device) {
sharedContext.device = MTLCreateSystemDefaultDevice();
}
if (nil == gContext->pContext || gContext->pDevice != sharedContext.device) {
gContext->pContext = [[MNNMetalContext alloc] init];
gContext->pDevice = sharedContext.device;
BOOL res = [gContext->pContext initWithSharedContext:&sharedContext dev:sharedContext.device];
if (!res) {
gContext->pContext = nil;
return nullptr;
}
}
auto mContext = (__bridge_retained void *)(gContext->pContext);
auto rt = new MetalRuntime(mContext);
rt->setGpuMode(info.gpuMode);
if (nil != sharedContext.queue) {
rt->setCommandQueue(sharedContext.queue, true);
}
bool supportDefer = info.numThread & MNN_GPU_RECORD_BATCH;
if ((!supportDefer) && nil == sharedContext.queue) {
id<MTLCommandQueue> queue = [sharedContext.device newCommandQueue];
rt->setCommandQueue(queue, false);
}
if (nullptr != info.user) {
rt->mDefaultConfig = *info.user;
}
return rt;
}
MetalRuntime::MetalRuntime(void* context) {
mContext = context;
auto ctx = (__bridge MNNMetalContext *)mContext;
std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
mSimdGroupReduce = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
mSimdGroupReduce |= [[ctx device] supportsFamily:MTLGPUFamilyMetal3];
mSimdGroupMatrix = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
mStatic.reset(new EagerBufferAllocator(allocator));
mDynamic.resize(METAL_SEPERATE_MAX_COUNT);
for (auto& buf : mDynamic) {
buf.root = allocator;
}
mTunedInfo = new TunedInfo;
}
MetalRuntime::~ MetalRuntime() {
if(mContext) {
CFRelease(mContext);
}
delete mTunedInfo;
}
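// The tuning cache is a MetalCache flatbuffer: setCache() verifies the buffer and loads the
// recorded {kernel key, thread size} -> {group num, group size, cost} entries into
// mTunedThreadGroup, while makeCache() serializes them back together with the per-op shape
// records collected by onMaskOpReady. onSetCache / onGetCache below are the Runtime-facing
// wrappers around these two.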
bool MetalRuntime::setCache(std::pair<const void*, size_t> cache) {// load cache
auto buffer = cache.first;
auto size = cache.second;
if (nullptr == buffer) {
mCacheOutside = nullptr;
mCacheOutsideSize = 0;
mBuffer.clear();
return false; // actually got nothing
}
mCacheOutsideSize = size;
mCacheOutside = buffer;
auto cacheBuffer = GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)cache.first, cache.second);
if (false == VerifyCacheBuffer(verify)) {
return false;
}
if (nullptr == cacheBuffer->tunings()) {
return false;
}
// Load Auto Tuning Info
if (nullptr != cacheBuffer->tunings()) {
auto tuningInfo = cacheBuffer->tunings();
for (int i=0; i<tuningInfo->size(); ++i) {
auto tun = tuningInfo->GetAs<Autotuning>(i);
if (nullptr == tun->threadSize() || nullptr == tun->groupSize() || nullptr == tun->key()) {
MNN_ERROR("Error tunning info\n");
continue;
}
std::vector<uint32_t> glo(tun->threadSize()->size());
for (int v=0; v<glo.size(); ++v) {
glo[v] = tun->threadSize()->data()[v];
}
std::vector<uint32_t> grop(tun->groupNum()->size());
for (int v=0; v<grop.size(); ++v) {
grop[v] = tun->groupNum()->data()[v];
}
std::vector<uint32_t> loc(tun->groupSize()->size());
for (int v=0; v<loc.size(); ++v) {
loc[v] = tun->groupSize()->data()[v];
}
uint32_t cost = tun->timeCost();
mTunedThreadGroup.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_tuple(grop, loc, cost)));
mTunedThreadGroupVec[tun->key()->str()].emplace_back(std::make_pair(glo, std::make_tuple(grop, loc, cost)));
}
}
return true;
}
std::pair<const void*, size_t> MetalRuntime::makeCache(TunedInfo* info) {//make Cache
std::unique_ptr<CacheT> cache(new CacheT);
// Get All Autotuning cache
for (auto& iter : mTunedThreadGroup) {
std::unique_ptr<AutotuningT> tuning(new AutotuningT);
tuning->key = iter.first.first;
tuning->threadSize = iter.first.second;
tuning->groupNum = std::get<0>(iter.second);
tuning->groupSize = std::get<1>(iter.second);
tuning->timeCost = std::get<2>(iter.second);
cache->tunings.emplace_back(std::move(tuning));
}
cache->tuned = std::move(info->mInfos);
flatbuffers::FlatBufferBuilder builder;
auto lastOffset = Cache::Pack(builder, cache.get());
builder.Finish(lastOffset);
mBuffer.resize(builder.GetSize());
::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
return std::make_pair(mBuffer.data(), mBuffer.size());
}
float MetalRuntime::onGetMemoryInMB() {
auto staticMemoryInMB = mStatic->totalSize() / 1024.0f / 1024.0f;
float dynamicMemoryInMB = 0.0f;
for (auto& buf : mDynamic) {
dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f;
}
return staticMemoryInMB + dynamicMemoryInMB;
}
void MetalRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
if (nullptr != op->name()) {
auto dstInfo = mTunedInfo;
std::unique_ptr<MetalCache::OpInfoT> opInfo(new MetalCache::OpInfoT);
opInfo->type = op->type();
opInfo->name = op->name()->str();
opInfo->inputs.resize(inputs.size());
for (int v=0; v<opInfo->inputs.size(); ++v) {
opInfo->inputs[v].reset(new MetalCache::TensorInfoT);
opInfo->inputs[v]->shape.resize(inputs[v]->dimensions());
for (int u=0; u<opInfo->inputs[v]->shape.size(); ++u) {
opInfo->inputs[v]->shape[u] = inputs[v]->length(u);
}
}
opInfo->outputs.resize(outputs.size());
for (int v=0; v<opInfo->outputs.size(); ++v) {
opInfo->outputs[v].reset(new MetalCache::TensorInfoT);
opInfo->outputs[v]->shape.resize(outputs[v]->dimensions());
for (int u=0; u<opInfo->outputs[v]->shape.size(); ++u) {
opInfo->outputs[v]->shape[u] = outputs[v]->length(u);
}
}
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
static bool _checkTensorInfo(const MetalCache::TensorInfoT* dst, const Tensor* src) {
if (dst->shape.size() != src->dimensions()) {
return false;
}
for (int j=0; j<dst->shape.size(); ++j) {
if (dst->shape[j] != src->length(j)) {
return false;
}
}
return true;
}
bool MetalRuntime::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Runtime::OpInfo& dstInfo) const {
dstInfo.initCostLong = true;
if (nullptr == op->name()) {
dstInfo.initCostLong = false;
return true;
}
for(auto& info : mTunedInfo->mInfos) {
if (info->type != op->type()) {
continue;
}
if (info->name != op->name()->str()) {
continue;
}
if (info->inputs.size() != inputs.size() || info->outputs.size() != outputs.size()) {
continue;
}
bool match = true;
for (int i=0; i<inputs.size(); ++i) {
auto& dst = info->inputs[i];
auto src = inputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (!match) {
continue;
}
for (int i=0; i<outputs.size(); ++i) {
auto& dst = info->outputs[i];
auto src = outputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (match) {
// All Info is match
dstInfo.initCostLong = false;
break;
}
}
return true;
}
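// MetalWrapAllocator adapts a host-memory allocator (here the mmap allocator used for
// hint().weightMemoryPath) into MetalBufferAlloc chunks: each host allocation is wrapped
// with newBufferWithBytesNoCopy, so mmapped weights become shared MTLBuffers without an
// extra copy.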
class MetalWrapAllocator : public BufferAllocator::Allocator {
private:
std::shared_ptr<BufferAllocator::Allocator> mOrigin;
id<MTLDevice> mDevice;
public:
MetalWrapAllocator(std::shared_ptr<BufferAllocator::Allocator> origin, id<MTLDevice> device) : mOrigin(origin), mDevice(device) {}
virtual ~ MetalWrapAllocator() {
// Do nothing
}
virtual MemChunk onAlloc(size_t size, size_t align) override {
auto mem = mOrigin->onAlloc(size, align);
MNN_ASSERT(mem.second == 0);
id<MTLBuffer> buffer = [mDevice newBufferWithBytesNoCopy:mem.first length:size options:MTLResourceStorageModeShared deallocator:nil];
auto wrap = new MetalRuntimeAllocator::MetalBufferAlloc(buffer);
return MemChunk((void *)wrap, 0);
}
virtual void onRelease(MemChunk chunk) override {
auto mem = (MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first;
mOrigin->onRelease(MemChunk(mem->getBuffer().contents));
delete mem;
}
};
Backend* MetalRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
if (hint().weightMemoryPath.size() > 0 && mStaticCache.get() == nullptr) {
auto ctx = (__bridge MNNMetalContext *)mContext;
auto mmap = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "", "metal.weight");
std::shared_ptr<BufferAllocator::Allocator> mmapMem(new MetalWrapAllocator(mmap, [ctx device]));
mStaticCache = mStatic;
mStatic.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024));
}
BackendConfig::PrecisionMode precision = mDefaultConfig.precision;
BackendConfig::MemoryMode memory = mDefaultConfig.memory;
if (nullptr != config) {
precision = config->precision;
memory = config->memory;
}
bool useFp16AsFp32 = precision != BackendConfig::Precision_High;
return new MetalBackend(mStatic, this, useFp16AsFp32, memory);
}
void MetalRuntime::onGabageCollect(int level) {
mStatic->release(false);
if (level >= 100) {
for (auto& buf : mDynamic) {
buf.release();
}
}
}
std::pair<const void*, size_t> MetalRuntime::onGetCache() {//make Cache
return makeCache(mTunedInfo);
}
bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
if (nullptr == buffer) {
return false;
}
auto cacheBuffer = MetalCache::GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)buffer, size);
if (false == VerifyCacheBuffer(verify)) {
return false;
}
if(nullptr != cacheBuffer->tuned()) {
for (int i=0; i<cacheBuffer->tuned()->size(); ++i) {
auto srcInfo = cacheBuffer->tuned()->GetAs<MetalCache::OpInfo>(i);
std::unique_ptr<MetalCache::OpInfoT> dst(srcInfo->UnPack());
mTunedInfo->mInfos.emplace_back(std::move(dst));
}
}
return setCache(std::make_pair(buffer, size));
}
MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
return MemChunk((void *)mMetalBufferAlloc, 0);
}
void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
delete (MetalBufferAlloc *)ptr.first;
}
class MetalRuntimeCreator : public RuntimeCreator {
public:
MetalRuntimeCreator() {
// Do nothing
}
virtual ~ MetalRuntimeCreator() {
// Do nothing
}
virtual Runtime *onCreate(const Backend::Info &info) const {
auto rt = MetalRuntime::create(info);
return rt;
}
private:
id<MTLDevice> mDevice;
};
void registerMetalRuntimeCreator() {
// according to
// https://developer.apple.com/library/archive/documentation/DeviceInformation/Reference/iOSDeviceCompatibility/HardwareGPUInformation/HardwareGPUInformation.html
// not all devices running iOS 8+ support Metal.
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
if (nil != device) {
gContext = new MetalContext;
gContext->pContext = nil;
gContext->pDevice = nil;
registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
registerMetalRenderOps();
#endif
MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator, false);
} else {
MNN_ERROR("Init Metal Error\n");
}
}
} // namespace MNN
#else
namespace MNN {
void registerMetalRuntimeCreator() {
}
} // namespace MNN
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
return -1;
}
#endif