MNN/source/backend/metal/MetalBackend.mm

//
// MetalBackend.mm
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#import "backend/metal/MetalBackend.hpp"
#define MNN_METAL
#import <MNN/MNNSharedContext.h>
#define METAL_CONST_BUFFER_LIMIT 128
#define METAL_SEPERATE_MAX_COUNT 2
#if MNN_METAL_ENABLED
#include <mutex>
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "core/TensorUtils.hpp"
#include "MetalCache_generated.h"
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
if (nullptr == content || nullptr == tensor) {
return 0;
}
auto t = (MNN::Tensor*)tensor;
auto des = MNN::TensorUtils::getDescribe(t);
content->buffer = ((MNN::MetalRuntimeAllocator::MetalBufferAlloc*)t->deviceId())->getBuffer();
content->texture = nil;
content->offset = des->extra.offset;
return 0;
}
namespace MNN {
static void _MetalApplyTensor(uint8_t* host, size_t offset, Tensor* t) {
// host points to the backing MetalBufferAlloc; the byte offset is stored on the tensor describe
t->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(t);
des->extra.offset = offset;
}
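// Dynamic-memory strategy: with Runtime::Allocator_Defer and a second resize, requests are
// only recorded and materialized later in one pass (DeferBufferAllocator); otherwise an
// EagerBufferAllocator is layered over the static pool (or over the mmap-backed cache when
// hint().weightMemoryPath is in use). A hedged sketch of how execution code consumes the
// pool, with the 1 KiB size purely illustrative:
//
//   BufferAllocator* pool = backend->getBufferPool();
//   MemChunk chunk = pool->alloc(1024, false);                  // scratch memory
//   // ... bind it with MetalBackend::setMem(chunk, encoder, index) ...
//   pool->free(chunk);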
BufferAllocator* MetalRuntime::createDynamicAllocator(int index, bool secondResize) const {
if (hint().memoryAllocatorType == Runtime::Allocator_Defer && secondResize) {
return new DeferBufferAllocator(buffer(index), 1024, _MetalApplyTensor);
}
if (mStaticCache.get() != nullptr) {
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticCache.get()), 1024);
}
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStatic.get()), 1024);
}
struct TunedInfo {
std::vector<std::unique_ptr<MetalCache::OpInfoT>> mInfos;
};
void registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
extern void registerMetalRenderOps();
#endif
static inline std::map<OpType, MetalBackend::Creator *> *getCreatorMap() {
static std::once_flag of;
static std::map<OpType, MetalBackend::Creator *> *ret = nullptr;
std::call_once(of, [&]() { ret = new std::map<OpType, MetalBackend::Creator *>; });
return ret;
}
void MetalBackend::addCreator(OpType t, Creator *c) {
auto map = getCreatorMap();
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type has be added\n", t);
}
map->insert(std::make_pair(t, c));
}
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime, bool usefp16AsFp32, BackendConfig::MemoryMode mode) : Backend(MNN_FORWARD_METAL),
mEmptyMem(nil)
{
mRuntime = runtime;
auto ctx = (__bridge MNNMetalContext *)runtime->context();
mBufferPool.reset(runtime->createDynamicAllocator(0, false));
mCurrentAllocator = mBufferPool.get();
mStaticBufferPool = staticMem;
mUseFloatAsFp16 = usefp16AsFp32;
mMemoryMode = mode;
mIsIphone = ctx.isIphone;
if (runtime->getCommandQueue() == nil) {
// one command queue can only create a limited number of command buffers, so let each backend own its own command queue
_commandQueue = [[ctx device] newCommandQueue];
mSupportDeferEncode = true;
} else {
// otherwise forbid defer encode optimize
_commandQueue = runtime->getCommandQueue();
mSupportDeferEncode = false;
}
_commandBuffer = nil;
_commandBuffer_net = nil;
_waiting = nil;
}
MetalBackend::~MetalBackend() {
flushEncoder();
}
id<MTLComputeCommandEncoder> MetalBackend::encoder_net() const {
id<MTLComputeCommandEncoder> result = [getCommandBufferForNet() computeCommandEncoder];
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
result.label = nil;
#endif
return result;
}
void *MetalBackend::context() const {
return mRuntime->context();
}
class MetalMemRelease : public Backend::MemObj {
public:
MetalMemRelease(MemChunk buffer, BufferAllocator* allocator) {
mBuffer = buffer;
mAllocator = allocator;
}
virtual ~ MetalMemRelease() {
mAllocator->free(mBuffer);
}
MemChunk chunk() override {
return mBuffer;
}
private:
MemChunk mBuffer;
BufferAllocator* mAllocator;
};
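// getTensorSizeInBytes pads NC4HW4 tensors so the channel count is a multiple of 4 and adds
// slack for kernels that read 4 elements along width or 16 along batch*plane. A worked
// example with illustrative numbers: for an fp32 tensor of shape [1, 3, 5, 5] in NC4HW4 with
// fp16 storage enabled, alignC = 4, bhw = 25, bhwR = 32 - 25 = 7, hR = 8 - 5 = 3,
// wR = 8 - 5 = 3, extraPadding = max(7, 3 * 5 + 3) = 18, so size = 1*4*5*5 + 18*4 = 172
// elements, stored as half = 344 bytes, rounded up to the 16-byte alignment = 352.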
size_t MetalBackend::getTensorSizeInBytes(const Tensor* tensor) const {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
size_t size;
if (MNN_DATA_FORMAT_NC4HW4 == format && tensor->dimensions() >= 2) {
int width = 1;
int height = 1;
int batch = tensor->length(0);
int channel = tensor->length(1);
if (tensor->dimensions() >= 3) {
height = tensor->length(2);
}
for (int i=3; i<tensor->dimensions(); ++i) {
width *= tensor->length(i);
}
int alignC = ROUND_UP(channel, 4);
int hR = ROUND_UP(height, 4) - height;
// width is processed 4 elements at a time, so reads may run past the end by up to 3 elements
int wR = ROUND_UP(width + 3, 4) - width;
int bhw = batch * width * height;
int bhwR = UP_DIV(bhw, 16) * 16 - bhw;
int extraPadding = ALIMAX(bhwR, (hR * width + wR));
size = batch * alignC * width * height;
size = size + extraPadding * 4;
} else {
size = 1;
for (int i=0; i<tensor->dimensions(); ++i) {
size *= tensor->length(i);
}
size = ROUND_UP(size, 4);
}
if (0 == size) {
return 0;
}
// when fp16 mode is enabled, 32-bit float tensors are stored as 2-byte half values
if (halide_type_float == tensor->buffer().type.code && tensor->buffer().type.bits == 32 && mUseFloatAsFp16) {
size *= 2;
} else {
size *= tensor->getType().bytes();
}
size_t align = 4 * sizeof(int);
size = ROUND_UP(size, align);
return size;
}
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
auto tensor = const_cast<Tensor *>(_tensor);
size_t size = getTensorSizeInBytes(_tensor);
if (0 == size) {
return nullptr;
}
// reuse if possible
MemChunk buffer;
BufferAllocator* allocator = nullptr;
switch (storageType) {
case Backend::STATIC: {
buffer = mStaticBufferPool->alloc(size, false);
allocator = mStaticBufferPool.get();
} break;
case Backend::DYNAMIC: {
buffer = mCurrentAllocator->alloc(size, false);
allocator = mCurrentAllocator;
} break;
case Backend::DYNAMIC_SEPERATE: {
buffer = mCurrentAllocator->alloc(size, true);
allocator = mCurrentAllocator;
} break;
default:{
break;
}
}
if (storageType == Backend::STATIC) {
if(nullptr == buffer.first) {
MNN_ERROR("onAcquireBuffer error!\n");
return nullptr;
}
} else {
buffer.attach(tensor);
}
if (nullptr == buffer.first) {
_MetalApplyTensor((uint8_t*)(&mEmptyMem), 0, (Tensor*)_tensor);
} else {
_MetalApplyTensor((uint8_t*)buffer.first, buffer.second, (Tensor*)_tensor);
}
return new MetalMemRelease(buffer, allocator);
}
bool MetalBackend::onClearBuffer() {
mCurrentAllocator->release(true);
if (nullptr != mRuntime->mStaticCache.get()) {
mStaticBufferPool = mRuntime->mStaticCache;
}
return true;
}
Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const Op *op) {
auto map = getCreatorMap();
auto iter = map->find(op->type());
if (iter == map->end()) {
mSupportDeferEncode = false;
if (nullptr != op->name()) {
MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str());
} else {
MNN_PRINT("Don't support type [%s]\n", EnumNameOpType(op->type()));
}
return NULL;
}
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
auto exe = iter->second->onCreate(inputs, op, this, outputs);
if (NULL == exe) {
mSupportDeferEncode = false;
MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : "");
return NULL;
}
return exe;
}
void MetalBackend::flushEncoder() const {
if (nil != mComputeEncoder) {
[mComputeEncoder endEncoding];
mComputeEncoder = nil;
}
}
void MetalBackend::_resetDynamicMemory() const {
mRuntime->pCurrentStatus = mCurrentAllocator->apply();
if (NO_ERROR != mRuntime->pCurrentStatus) {
return;
}
if (nullptr != mBufferPoolShapeImmutable.get()) {
mRuntime->pCurrentStatus = mBufferPoolShapeImmutable->apply();
}
}
void MetalBackend::onExecuteBegin() const {
_resetDynamicMemory();
mEncoderCount = 0;
}
void MetalBackend::onExecuteEnd() const {
flushEncoder();
commit_net();
}
BufferAllocator* MetalBackend::getBufferPool() const {
return mCurrentAllocator;
}
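// onSelectDynamicAllocator switches between two dynamic pools: index 0 is the regular
// resize-time pool, while index 1 (created lazily once maxIndex == 2) appears to be reserved
// for allocations whose shapes stay immutable across resizes, so they can be planned
// separately from the regular pool.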
bool MetalBackend::onSelectDynamicAllocator(int index, int maxIndex) {
if (maxIndex > 2) {
return false;
}
if (maxIndex == 2 && mBufferPoolShapeImmutable.get() == nullptr) {
mBufferPoolShapeImmutable.reset(mRuntime->createDynamicAllocator(1, true));
mBufferPool.reset(mRuntime->createDynamicAllocator(0, true));
}
if (1 == index) {
mCurrentAllocator = mBufferPoolShapeImmutable.get();
} else {
mCurrentAllocator = mBufferPool.get();
}
return true;
}
bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
if (nullptr == dstInfo) {
return true;
}
auto dst = (MNNMetalTensorContent*)dstInfo;
dst->type.code = halide_type_float;
if (mUseFloatAsFp16) {
dst->type.bits = 16;
} else {
dst->type.bits = 32;
}
MNNMetalGetTensorContent(dst, (void*)tensor);
return true;
}
bool MetalBackend::isCmdBufferCommit() {
auto ctx = (__bridge MNNMetalContext *)context();
//TODO: set magic number
const int magicNum = mRuntime->hint().encorderNumForCommit;
mEncoderCount++;
if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
return true;
}
return false;
}
id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
// reuse
if (nullptr != mHostBuffer && mHostBuffer.length >= size) {
return mHostBuffer;
}
// create a larger one
auto context = (__bridge MNNMetalContext *)this->context();
mHostBuffer = [context newDeviceBuffer:size access:CPUReadWrite];
return mHostBuffer;
}
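// getConstBuffer hands out small uniform buffers (at least METAL_CONST_BUFFER_LIMIT bytes)
// and recycles the ones handed back through returnConstBuffer via mHoldBuffers. A hedged
// usage sketch from inside an execution, where `backend` is the owning MetalBackend and the
// constant value is a placeholder:
//
//   id<MTLBuffer> param = backend->getConstBuffer(4 * sizeof(int));
//   ((int*)param.contents)[0] = someValue;           // fill the constants (placeholder)
//   [encoder setBuffer:param offset:0 atIndex:2];
//   backend->returnConstBuffer(param);               // give it back for reuse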
id<MTLBuffer> MetalBackend::getConstBuffer(size_t size) const {
if (size < METAL_CONST_BUFFER_LIMIT) {
if (!mHoldBuffers.empty()) {
auto res = mHoldBuffers.front();
mHoldBuffers.pop();
return res;
}
size = METAL_CONST_BUFFER_LIMIT;
}
auto context = (__bridge MNNMetalContext *)this->context();
auto buffer = [context newDeviceBuffer:size access:CPUReadWrite];
return buffer;
}
void MetalBackend::returnConstBuffer(id<MTLBuffer> buffer) const {
mHoldBuffers.push(buffer);
}
static inline void _getNCPlane(const Tensor* tensor, int& s, int& c, int& b) {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
s = 1, c = 1, b = 1;
b = tensor->length(0);
if (format == MNN_DATA_FORMAT_NHWC) {
c = tensor->length(tensor->dimensions()-1);
for (int i=1; i<tensor->dimensions()-1; ++i) {
s *= tensor->length(i);
}
} else {
c = tensor->length(1);
for (int i=2; i<tensor->dimensions(); ++i) {
s *= tensor->length(i);
}
}
}
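// getTensorShape fills the 8-int constant buffer consumed by the conversion kernels below:
// ints [0..3] hold {batch, channel, plane, 1} and ints [4..7] hold the batch/channel/plane
// strides for the given layout; the returned MTLSize is the total thread count, one thread
// per (batch, plane, channel/4). For a [1, 3, 5, 5] NCHW tensor this yields
// size = {1, 3, 25, 1}, stride = {75, 25, 1, 1} and 25 threads.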
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
int s, b, c;
_getNCPlane(tensor, s, c, b);
int z = UP_DIV(c, 4);
// shape
((int *)shape.contents)[0] = b;
((int *)shape.contents)[1] = c;
((int *)shape.contents)[2] = s;
((int *)shape.contents)[3] = 1;
// stride
if (format == MNN_DATA_FORMAT_NHWC) {
((int *)shape.contents)[4] = s * c;
((int *)shape.contents)[5] = 1;
((int *)shape.contents)[6] = c;
((int *)shape.contents)[7] = 1;
} else {
((int *)shape.contents)[4] = s * c;
((int *)shape.contents)[5] = s;
((int *)shape.contents)[6] = 1;
((int *)shape.contents)[7] = 1;
}
// threads
MTLSize threads = {(NSUInteger)s * b * z, 1, 1};
return threads;
}
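// The three Metal sources below (gTranspose, gNC4HW4Convert, gCopy) are compiled on demand
// with IType/OType injected as preprocessor macros, so a single string serves every
// input/output element-type pair (half4, float4, char, ...). Compiled pipelines are memoized
// in the runtime via findPipeline/insertPipeline, keyed by {kernel name, IType, OType}.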
static const char* gTranspose = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
uint4 size; // n, c, plane, 1
uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
int channel = uConstant.size.y;
if (gid < channel * uConstant.size.x * uConstant.size.z) {
int tmp = gid % (channel * uConstant.size.x);
int x = gid / (channel * uConstant.size.x);
int b = tmp / channel;
int c = tmp % channel;
int outPos = b * uConstant.size.y * uConstant.size.z + c * uConstant.size.z + x;
int inPos = b * uConstant.size.y * uConstant.size.z + c + x * uConstant.size.y;
out[outPos] = (OType)(in[inPos]);
}
})metal";
static const char* gNC4HW4Convert = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
uint4 size; // n, c, plane, 1
uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
int channelC4 = (uConstant.size.y + 3) / 4;
if (gid < channelC4 * uConstant.size.x * uConstant.size.z)
{
int3 pos;
pos.z = gid % (channelC4 * uConstant.size.x);
pos.y = gid / (channelC4 * uConstant.size.x);
pos.x = 0;
int batchIndex = pos.z / channelC4;
int zDiv4 = pos.z % channelC4;
int lastZ = uConstant.size.y / 4;
int cIndex = uConstant.size.y % 4;
int z = zDiv4*4;
int basicOffset = 0
+ batchIndex*uConstant.stride.x
+ z * uConstant.stride.y
+ pos.y * uConstant.stride.z
;
#ifdef MNN_OUTPUT_C4
OType color = OType(0);
if(zDiv4 == lastZ)
{
if(cIndex == 1)
{
color.r = in[basicOffset+0];
color.g = 0.0;
color.b = 0.0;
color.a = 0.0;
}
else if(cIndex == 2)
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = 0.0;
color.a = 0.0;
}
else
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = in[basicOffset+2*uConstant.stride.y];
color.a = 0.0;
}
}
else
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = in[basicOffset+2*uConstant.stride.y];
color.a = in[basicOffset+3*uConstant.stride.y];
}
out[0
+ pos.y
+ uConstant.size.x * uConstant.size.z*zDiv4
+ batchIndex*uConstant.size.z
] = color;
#else
IType color = in[0
+ pos.y
+ uConstant.size.x * uConstant.size.z*zDiv4
+ batchIndex*uConstant.size.z
];
if(zDiv4 == lastZ)
{
if(cIndex == 1)
{
out[basicOffset+0*uConstant.stride.y] = color.r;
}
else if(cIndex == 2)
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
}
else
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
out[basicOffset+2*uConstant.stride.y] = color.b;
}
}
else
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
out[basicOffset+2*uConstant.stride.y] = color.b;
out[basicOffset+3*uConstant.stride.y] = color.a;
}
#endif
}
}
)metal";
static const char* gCopy = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buffer(1)]], constant uint4& limit [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
if (gid < limit.x) {
out[int(gid)] = (OType)in[int(gid)];
}
})metal";
void MetalBackend::onResizeBegin() {
// Abort last inference task if needed
flushEncoder();
_commandBuffer_net = nil;
_commandBuffer = nil;
wait();
mCurrentAllocator->reset();
}
ErrorCode MetalBackend::onResizeEnd() {
auto ctx = (__bridge MNNMetalContext *)context();
return mCurrentAllocator->compute();
}
static std::string _getType(const halide_type_t& type, MNN_DATA_FORMAT format, bool useFp16AsFp32) {
std::string res;
if (type.code == halide_type_float) {
if (useFp16AsFp32) {
res = "half";
} else {
res = "float";
}
} else {
switch (type.bytes()) {
case 1:
res = "char";
break;
case 2:
res = "short";
break;
case 4:
res = "int";
break;
default:
MNN_ASSERT(false);
break;
}
}
if (format == MNN_DATA_FORMAT_NC4HW4) {
return res + "4";
}
return res;
}
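// _makeCopyInfo selects the conversion kernel for a tensor copy (as used by onCopyBuffer
// further below):
//   - same layout, or rank <= 1   -> gCopy, one thread per 4 elements;
//   - NC4HW4 on either side       -> gNC4HW4Convert, with MNN_OUTPUT_C4 defined when the
//                                    destination is NC4HW4;
//   - NCHW <-> NHWC               -> gTranspose.
// castType steers the fp16/fp32 choice: 1 keeps the source fp32 (host input), 2 keeps the
// destination fp32 (host output), 0 converts both sides according to mUseFloatAsFp16.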
MetalBackend::CopyPipeline MetalBackend::_makeCopyInfo(const Tensor *src, const Tensor *dst, id<MTLBuffer> shape, int castType) const {
auto ctx = (__bridge MNNMetalContext *)context();
MetalBackend::CopyPipeline res;
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
if (shape == nil) {
shape = getConstBuffer(8 * sizeof(int));
}
res.shape = shape;
if (sfmt == dfmt || src->dimensions() <= 1) {
auto srcType = _getType(src->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 1);
auto dstType = _getType(dst->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 2);
auto size = dst->elementSize();
size = UP_DIV(size, 4);
std::vector<std::string> keys = {
"copyC4",
srcType,
dstType
};
((uint32_t*)[shape contents])[0] = size;
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gCopy, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
res.localSize = MTLSizeMake(256, 1, 1);
res.pipeline = pipeline;
return res;
}
auto srcType = _getType(src->getType(), sfmt, mUseFloatAsFp16 && castType != 1);
auto dstType = _getType(dst->getType(), dfmt, mUseFloatAsFp16 && castType != 2);
if (sfmt == MNN_DATA_FORMAT_NC4HW4 || dfmt == MNN_DATA_FORMAT_NC4HW4) {
auto normalTensor = dst;
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
normalTensor = src;
}
// convert C4 / NCHW
std::vector<std::string> keys = {
"c4convert",
srcType,
dstType
};
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
keys.emplace_back("outputc4");
}
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
[dic setValue:@"1" forKey:@"MNN_OUTPUT_C4"];
}
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gNC4HW4Convert, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.pipeline = pipeline;
auto size = getTensorShape(shape, normalTensor);
auto gl = [ctx computeBestGroupAndLocal:pipeline threads:size];
res.groupSize = gl.first;
res.localSize = gl.second;
return res;
}
// NCHW <-> NHWC
std::vector<std::string> keys = {
"transpose",
srcType,
dstType
};
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gTranspose, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.pipeline = pipeline;
int n, c, plane;
_getNCPlane(dst, plane, c, n);
auto shapePtr = (uint32_t*)shape.contents;
shapePtr[0] = n;
shapePtr[3] = 1;
if (MNN_DATA_FORMAT_NHWC == dfmt) {
shapePtr[1] = plane;
shapePtr[2] = c;
} else {
shapePtr[1] = c;
shapePtr[2] = plane;
}
auto size = plane * n * c;
res.localSize = MTLSizeMake(256, 1, 1);
res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
return res;
}
static void _execute(id<MTLComputeCommandEncoder> encoder, const MetalBackend::CopyPipeline& info, std::pair<id<MTLBuffer>, int> src, std::pair<id<MTLBuffer>, int> dst) {
[encoder setComputePipelineState:info.pipeline];
[encoder setBuffer:src.first offset:src.second atIndex:0];
[encoder setBuffer:dst.first offset:dst.second atIndex:1];
[encoder setBuffer:info.shape offset:0 atIndex:2];
[encoder dispatchThreadgroups:info.groupSize threadsPerThreadgroup:info.localSize];
}
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape, int castType) const {
auto ctx = (__bridge MNNMetalContext *)context();
auto info = _makeCopyInfo(src, dst, shape, castType);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, MetalBackend::getBuffer(src), MetalBackend::getBuffer(dst));
if (standalone) {
[encoder endEncoding];
MNN_PRINT_ENCODER(ctx, encoder);
}
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
commit_net();
_resetDynamicMemory();
onCopyBuffer(src, dst, nil, nil);
}
id<MTLComputeCommandEncoder> MetalBackend::encoder_for_net() const {
if (nil == mComputeEncoder) {
mComputeEncoder = encoder_net(); // TODO: decide which command buffer to use
}
return mComputeEncoder;
}
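// onCopyBuffer covers three cases: device->device encodes a conversion kernel; device->host
// optionally converts into the staging host buffer, commits, waits, then memcpys out;
// host->device memcpys into the staging buffer and encodes the conversion, or copies straight
// into the MTLBuffer when no conversion is needed. A hedged sketch of reading a result back
// to the CPU through the public MNN tensor API (here `output` is assumed to be a Tensor*
// resident on this backend):
//
//   MNN::Tensor hostTensor(output, MNN::Tensor::CAFFE);   // NCHW host-side copy
//   output->copyToHostTensor(&hostTensor);                // lands in the device->host path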
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);
if (!src->buffer().host && !dst->buffer().host) {
onCopyDeviceToDevice(src, dst, encoder, shape);
return;
}
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
bool formatDiff = sfmt != dfmt && src->dimensions() > 1;
auto floats = src->getType().code == halide_type_float;
bool dataTypeDiff = floats && mUseFloatAsFp16;
bool needConvert = formatDiff || dataTypeDiff;
if (!src->buffer().host && dst->buffer().host) {
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)src->deviceId())->getBuffer();
auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(src)->extra.offset;
if (needConvert) {
auto tDst = const_cast<Tensor*>(dst);
auto tmpBuffer = getHostBuffer(dst->usize());
auto info = _makeCopyInfo(src, dst, shape, 2);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, MetalBackend::getBuffer(src), std::make_pair(tmpBuffer, 0));
if (standalone) {
[encoder endEncoding];
}
commit();
devicePtr = (uint8_t*)tmpBuffer.contents;
}
wait();
::memcpy(dst->host<void>(), devicePtr, dst->usize());
return;
}
if (src->buffer().host && !dst->buffer().host) {
// For a user-provided command queue, the user must ensure the previous frame's GPU work has finished
bool needWait = !mRuntime->userSync();
if (needWait) {
wait();
}
auto srcSize = src->usize();
if (needConvert) {
auto tmpBuffer = getHostBuffer(srcSize);
::memcpy(tmpBuffer.contents, src->host<void>(), srcSize);
auto info = _makeCopyInfo(src, dst, shape, 1);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, std::make_pair(tmpBuffer, 0), MetalBackend::getBuffer(dst));
if (standalone) {
[encoder endEncoding];
}
commit();
} else {
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)dst->deviceId())->getBuffer();
auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(dst)->extra.offset;
::memcpy(devicePtr, src->host<void>(), srcSize);
}
return;
}
MNN_ASSERT(false); // should not be handled here
}
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
commit_net();
if (toCpu) {
wait();
}
return 0;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForBufferCopy() const {
if (nil == _commandBuffer) {
_commandBuffer = [_commandQueue commandBuffer];
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer_net = _commandBuffer;
}
}
return _commandBuffer;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForNet() const {
if (nil == _commandBuffer_net) {
_commandBuffer_net = [_commandQueue commandBuffer];
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer = _commandBuffer_net;
}
}
return _commandBuffer_net;
}
void MetalBackend::setTensor(const MNN::Tensor* tensor, id<MTLComputeCommandEncoder> encoder, int index) {
[encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer() offset:TensorUtils::getDescribe(tensor)->extra.offset atIndex:index];
}
void MetalBackend::setMem(const MemChunk& chunk, id<MTLComputeCommandEncoder> encoder, int index) {
[encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer() offset:chunk.second atIndex:index];
}
uint8_t* MetalBackend::getMemPtr(const MemChunk& chunk) {
return (uint8_t*)((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer().contents + chunk.second;
}
std::pair<id<MTLBuffer>, int> MetalBackend::getBuffer(const MNN::Tensor* tensor) {
return std::make_pair(((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer(), TensorUtils::getDescribe(tensor)->extra.offset);
}
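// Command-buffer lifecycle: _commandBuffer carries host<->device copies and _commandBuffer_net
// carries the network's compute work; when defer-encode is disabled both names refer to the
// same buffer. commit()/commit_net() submit the corresponding buffer and remember it in
// _waiting so a later wait() can block until the GPU finishes. The internal sequence used by
// onSync(), for example, is roughly:
//
//   commit_net();   // submit the encoded compute work
//   wait();         // block until the GPU signals completion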
void MetalBackend::commit() const {
if (nil != _commandBuffer && _commandBuffer.status < MTLCommandBufferStatusCommitted) {
[_commandBuffer commit];
_waiting = _commandBuffer;
_commandBuffer = nil;
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer_net = nil;
}
}
}
void MetalBackend::commit_net() const {
if (nil != _commandBuffer_net && _commandBuffer_net.status < MTLCommandBufferStatusCommitted) {
[_commandBuffer_net commit];
_waiting = _commandBuffer_net;
_commandBuffer_net = nil;
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer = nil;
}
}
}
void MetalBackend::wait() const {
if (nil != _waiting) {
auto buffer = _waiting;
if (buffer.status >= MTLCommandBufferStatusCompleted) {
_waiting = nil;
return;
}
#if MNN_METAL_BENCHMARK
NSTimeInterval begin = [NSDate timeIntervalSinceReferenceDate];
[buffer waitUntilCompleted];
NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
if (@available(iOS 10.3, *)) {
printf("[METAL] commit costs: %.3fms\t(kernel: %.3fms, GPU: %.3fms)\n", (end - begin) * 1000.f,
(buffer.kernelEndTime - buffer.kernelStartTime) * 1000.f,
(buffer.GPUEndTime - buffer.GPUStartTime) * 1000.f);
} else {
printf("[METAL] commit costs: %.3fms\n", (end - begin) * 1000.f);
}
#else
[buffer waitUntilCompleted];
#endif
#if MNN_METAL_DEBUG
if (buffer.error) {
printf("[METAL] %s\n", buffer.error.localizedDescription.UTF8String);
}
#endif
}
_waiting = nil;
}
id<MTLComputePipelineState> MetalBackend::makeComputePipelineWithSourceOption(const char* csource, const char* cname, MTLCompileOptions *options) const{
auto ctx = (__bridge MNNMetalContext *)context();
auto source = [[NSString alloc] initWithUTF8String:csource];
auto name = [[NSString alloc] initWithUTF8String:cname];
auto pipeline = [ctx pipelineWithSourceOption:source name:name options:options];
if (nil == pipeline) {
mRuntime->pCurrentStatus = NOT_SUPPORT;
}
return pipeline;
}
void MetalRuntime::setCommandQueue(id<MTLCommandQueue> queue, bool userSync) {
mQueue = queue;
mUserSync = userSync;
}
id<MTLComputePipelineState> MetalRuntime::findPipeline(const std::vector<std::string>& keys) const {
auto iter = mCachePipeine.find(keys);
if (iter == mCachePipeine.end()) {
return nil;
}
return iter->second;
}
void MetalRuntime::insertPipeline(const std::vector<std::string>& keys, id<MTLComputePipelineState> pipeline) const {
if (nil != pipeline) {
mCachePipeine.insert(std::make_pair(keys, pipeline));
}
}
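// setGpuMode decodes the bit flags passed through Backend::Info::gpuMode. The memory flags
// (MNN_GPU_MEMORY_BUFFER / MNN_GPU_MEMORY_IMAGE) have no effect on Metal; exactly one tuning
// flag is expected. A hedged sketch of how a caller could request fast tuning, assuming the
// usual ScheduleConfig plumbing forwards `mode` here as mode_num:
//
//   MNN::ScheduleConfig config;
//   config.type = MNN_FORWARD_METAL;
//   config.mode = MNN_GPU_TUNING_FAST;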
void MetalRuntime::setGpuMode(const int mode_num) {
int totalSet = 0;
bool isSet = (mode_num & MNN_GPU_MEMORY_BUFFER);
if(isSet) {
totalSet++;
}
isSet = (mode_num & MNN_GPU_MEMORY_IMAGE);
if(isSet) {
totalSet++;
}
if(totalSet > 0) {
MNN_PRINT("warning: set BUFFER and IMAGE mode is not useful for metal, it doesn't matter, cl_mode:%x\n", mode_num);
}
totalSet = 0;
isSet = (mode_num & MNN_GPU_TUNING_NONE);
if(isSet) {
mTuneLevel = Never;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_FAST);
if(isSet) {
mTuneLevel = Fast;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_NORMAL);
if(isSet) {
mTuneLevel = Normal;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_HEAVY);
if(isSet) {
mTuneLevel = Heavy;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_WIDE);
if(isSet) {
mTuneLevel = Wide;
totalSet++;
}
if(totalSet != 1) {
MNN_PRINT("set multi tuning mode is not permitted, please check cl_mode:%x\n", mode_num);
}
}
struct MetalContext {
std::mutex pLock;
MNNMetalContext* pContext;
id<MTLDevice> pDevice;
};
static MetalContext* gContext = nullptr;
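// MetalRuntime::create accepts an optional MNNMetalSharedContext so the caller can supply its
// own MTLDevice / MTLCommandQueue (for example to interoperate with an existing renderer).
// A hedged sketch of passing one in, where `myDevice` and `myQueue` are assumed to be the
// caller's existing Metal objects:
//
//   MNNMetalSharedContext shared;
//   shared.device = myDevice;
//   shared.queue  = myQueue;
//   MNN::BackendConfig config;
//   config.sharedContext = &shared;
//   // config then reaches this function through Backend::Info::user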
MetalRuntime* MetalRuntime::create(const Backend::Info& info) {
std::unique_lock<std::mutex> _l(gContext->pLock);
MNNMetalSharedContext sharedContext;
sharedContext.device = nil;
sharedContext.queue = nil;
if (info.user != nullptr) {
if (info.user->sharedContext != nullptr) {
sharedContext.device = ((MNNMetalSharedContext*)info.user->sharedContext)->device;
sharedContext.queue = ((MNNMetalSharedContext*)info.user->sharedContext)->queue;
}
}
if (nil == sharedContext.device) {
sharedContext.device = MTLCreateSystemDefaultDevice();
}
if (nil == gContext->pContext || gContext->pDevice != sharedContext.device) {
gContext->pContext = [[MNNMetalContext alloc] init];
gContext->pDevice = sharedContext.device;
BOOL res = [gContext->pContext initWithSharedContext:&sharedContext dev:sharedContext.device];
if (!res) {
gContext->pContext = nil;
return nullptr;
}
}
auto mContext = (__bridge_retained void *)(gContext->pContext);
auto rt = new MetalRuntime(mContext);
rt->setGpuMode(info.gpuMode);
if (nil != sharedContext.queue) {
rt->setCommandQueue(sharedContext.queue, true);
}
bool supportDefer = info.numThread & MNN_GPU_RECORD_BATCH;
if ((!supportDefer) && nil == sharedContext.queue) {
id<MTLCommandQueue> queue = [sharedContext.device newCommandQueue];
rt->setCommandQueue(queue, false);
}
if (nullptr != info.user) {
rt->mDefaultConfig = *info.user;
}
return rt;
}
MetalRuntime::MetalRuntime(void* context) {
mContext = context;
auto ctx = (__bridge MNNMetalContext *)mContext;
std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
mSimdGroupReduce = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
mSimdGroupReduce |= [[ctx device] supportsFamily:MTLGPUFamilyMetal3];
mSimdGroupMatrix = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
mStatic.reset(new EagerBufferAllocator(allocator));
mDynamic.resize(METAL_SEPERATE_MAX_COUNT);
for (auto& buf : mDynamic) {
buf.root = allocator;
}
mTunedInfo = new TunedInfo;
}
MetalRuntime::~ MetalRuntime() {
if(mContext) {
CFRelease(mContext);
}
delete mTunedInfo;
}
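// The tuning cache is a MetalCache flatbuffer: setCache() verifies the buffer and loads the
// recorded {kernel key, thread size} -> {group num, group size, cost} entries into
// mTunedThreadGroup, while makeCache() serializes them back together with the per-op shape
// records collected by onMaskOpReady. onSetCache / onGetCache below are the Runtime-facing
// wrappers around these two.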
bool MetalRuntime::setCache(std::pair<const void*, size_t> cache) {// load cache
auto buffer = cache.first;
auto size = cache.second;
if (nullptr == buffer) {
mCacheOutside = nullptr;
mCacheOutsideSize = 0;
mBuffer.clear();
return false; // actually got nothing
}
mCacheOutsideSize = size;
mCacheOutside = buffer;
auto cacheBuffer = GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)cache.first, cache.second);
if (false == VerifyCacheBuffer(verify)) {
return false;
}
if (nullptr == cacheBuffer->tunings()) {
return false;
}
// Load Auto Tuning Info
if (nullptr != cacheBuffer->tunings()) {
auto tuningInfo = cacheBuffer->tunings();
for (int i=0; i<tuningInfo->size(); ++i) {
auto tun = tuningInfo->GetAs<Autotuning>(i);
if (nullptr == tun->threadSize() || nullptr == tun->groupSize() || nullptr == tun->key()) {
MNN_ERROR("Error tunning info\n");
continue;
}
std::vector<uint32_t> glo(tun->threadSize()->size());
for (int v=0; v<glo.size(); ++v) {
glo[v] = tun->threadSize()->data()[v];
}
std::vector<uint32_t> grop(tun->groupNum()->size());
for (int v=0; v<grop.size(); ++v) {
grop[v] = tun->groupNum()->data()[v];
}
std::vector<uint32_t> loc(tun->groupSize()->size());
for (int v=0; v<loc.size(); ++v) {
loc[v] = tun->groupSize()->data()[v];
}
uint32_t cost = tun->timeCost();
mTunedThreadGroup.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_tuple(grop, loc, cost)));
mTunedThreadGroupVec[tun->key()->str()].emplace_back(std::make_pair(glo, std::make_tuple(grop, loc, cost)));
}
}
return true;
}
std::pair<const void*, size_t> MetalRuntime::makeCache(TunedInfo* info) {//make Cache
std::unique_ptr<CacheT> cache(new CacheT);
// Get All Autotuning cache
for (auto& iter : mTunedThreadGroup) {
std::unique_ptr<AutotuningT> tuning(new AutotuningT);
tuning->key = iter.first.first;
tuning->threadSize = iter.first.second;
tuning->groupNum = std::get<0>(iter.second);
tuning->groupSize = std::get<1>(iter.second);
tuning->timeCost = std::get<2>(iter.second);
cache->tunings.emplace_back(std::move(tuning));
}
cache->tuned = std::move(info->mInfos);
flatbuffers::FlatBufferBuilder builder;
auto lastOffset = Cache::Pack(builder, cache.get());
builder.Finish(lastOffset);
mBuffer.resize(builder.GetSize());
::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
return std::make_pair(mBuffer.data(), mBuffer.size());
}
float MetalRuntime::onGetMemoryInMB() {
auto staticMemoryInMB = mStatic->totalSize() / 1024.0f / 1024.0f;
float dynamicMemoryInMB = 0.0f;
for (auto& buf : mDynamic) {
dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f;
}
return staticMemoryInMB + dynamicMemoryInMB;
}
void MetalRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
if (nullptr != op->name()) {
auto dstInfo = mTunedInfo;
std::unique_ptr<MetalCache::OpInfoT> opInfo(new MetalCache::OpInfoT);
opInfo->type = op->type();
opInfo->name = op->name()->str();
opInfo->inputs.resize(inputs.size());
for (int v=0; v<opInfo->inputs.size(); ++v) {
opInfo->inputs[v].reset(new MetalCache::TensorInfoT);
opInfo->inputs[v]->shape.resize(inputs[v]->dimensions());
for (int u=0; u<opInfo->inputs[v]->shape.size(); ++u) {
opInfo->inputs[v]->shape[u] = inputs[v]->length(u);
}
}
opInfo->outputs.resize(outputs.size());
for (int v=0; v<opInfo->outputs.size(); ++v) {
opInfo->outputs[v].reset(new MetalCache::TensorInfoT);
opInfo->outputs[v]->shape.resize(outputs[v]->dimensions());
for (int u=0; u<opInfo->outputs[v]->shape.size(); ++u) {
opInfo->outputs[v]->shape[u] = outputs[v]->length(u);
}
}
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
static bool _checkTensorInfo(const MetalCache::TensorInfoT* dst, const Tensor* src) {
if (dst->shape.size() != src->dimensions()) {
return false;
}
for (int j=0; j<dst->shape.size(); ++j) {
if (dst->shape[j] != src->length(j)) {
return false;
}
}
return true;
}
bool MetalRuntime::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Runtime::OpInfo& dstInfo) const {
dstInfo.initCostLong = true;
if (nullptr == op->name()) {
dstInfo.initCostLong = false;
return true;
}
for(auto& info : mTunedInfo->mInfos) {
if (info->type != op->type()) {
continue;
}
if (info->name != op->name()->str()) {
continue;
}
if (info->inputs.size() != inputs.size() || info->outputs.size() != outputs.size()) {
continue;
}
bool match = true;
for (int i=0; i<inputs.size(); ++i) {
auto& dst = info->inputs[i];
auto src = inputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (!match) {
continue;
}
for (int i=0; i<outputs.size(); ++i) {
auto& dst = info->outputs[i];
auto src = outputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (match) {
// All Info is match
dstInfo.initCostLong = false;
break;
}
}
return true;
}
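// MetalWrapAllocator adapts a host-memory allocator (here the mmap allocator used for
// hint().weightMemoryPath) into MetalBufferAlloc chunks: each host allocation is wrapped
// with newBufferWithBytesNoCopy, so mmapped weights become shared MTLBuffers without an
// extra copy.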
class MetalWrapAllocator : public BufferAllocator::Allocator {
private:
std::shared_ptr<BufferAllocator::Allocator> mOrigin;
id<MTLDevice> mDevice;
public:
MetalWrapAllocator(std::shared_ptr<BufferAllocator::Allocator> origin, id<MTLDevice> device) : mOrigin(origin), mDevice(device) {}
virtual ~ MetalWrapAllocator() {
// Do nothing
}
virtual MemChunk onAlloc(size_t size, size_t align) override {
auto mem = mOrigin->onAlloc(size, align);
MNN_ASSERT(mem.second == 0);
id<MTLBuffer> buffer = [mDevice newBufferWithBytesNoCopy:mem.first length:size options:MTLResourceStorageModeShared deallocator:nil];
auto wrap = new MetalRuntimeAllocator::MetalBufferAlloc(buffer);
return MemChunk((void *)wrap, 0);
}
virtual void onRelease(MemChunk chunk) override {
auto mem = (MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first;
mOrigin->onRelease(MemChunk(mem->getBuffer().contents));
delete mem;
}
};
Backend* MetalRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
if (hint().weightMemoryPath.size() > 0 && mStaticCache.get() == nullptr) {
auto ctx = (__bridge MNNMetalContext *)mContext;
auto mmap = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "", "metal.weight");
std::shared_ptr<BufferAllocator::Allocator> mmapMem(new MetalWrapAllocator(mmap, [ctx device]));
mStaticCache = mStatic;
mStatic.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024));
}
BackendConfig::PrecisionMode precision = mDefaultConfig.precision;
BackendConfig::MemoryMode memory = mDefaultConfig.memory;
if (nullptr != config) {
precision = config->precision;
memory = config->memory;
}
bool useFp16AsFp32 = precision != BackendConfig::Precision_High;
return new MetalBackend(mStatic, this, useFp16AsFp32, memory);
}
void MetalRuntime::onGabageCollect(int level) {
mStatic->release(false);
if (level >= 100) {
for (auto& buf : mDynamic) {
buf.release();
}
}
}
std::pair<const void*, size_t> MetalRuntime::onGetCache() {//make Cache
return makeCache(mTunedInfo);
}
bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
if (nullptr == buffer) {
return false;
}
auto cacheBuffer = MetalCache::GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)buffer, size);
if (false == VerifyCacheBuffer(verify)) {
return false;
}
if(nullptr != cacheBuffer->tuned()) {
for (int i=0; i<cacheBuffer->tuned()->size(); ++i) {
auto srcInfo = cacheBuffer->tuned()->GetAs<MetalCache::OpInfo>(i);
std::unique_ptr<MetalCache::OpInfoT> dst(srcInfo->UnPack());
mTunedInfo->mInfos.emplace_back(std::move(dst));
}
}
return setCache(std::make_pair(buffer, size));
}
MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
return MemChunk((void *)mMetalBufferAlloc, 0);
}
void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
delete (MetalBufferAlloc *)ptr.first;
}
class MetalRuntimeCreator : public RuntimeCreator {
public:
MetalRuntimeCreator() {
// Do nothing
}
virtual ~ MetalRuntimeCreator() {
// Do nothing
}
virtual Runtime *onCreate(const Backend::Info &info) const {
auto rt = MetalRuntime::create(info);
return rt;
}
private:
id<MTLDevice> mDevice;
};
void registerMetalRuntimeCreator() {
// according to
// https://developer.apple.com/library/archive/documentation/DeviceInformation/Reference/iOSDeviceCompatibility/HardwareGPUInformation/HardwareGPUInformation.html
// not all devices running iOS 8+ support Metal.
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
if (nil != device) {
gContext = new MetalContext;
gContext->pContext = nil;
gContext->pDevice = nil;
registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
registerMetalRenderOps();
#endif
MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator, false);
} else {
MNN_ERROR("Init Metal Error\n");
}
}
} // namespace MNN
#else
namespace MNN {
void registerMetalRuntimeCreator() {
}
} // namespace MNN
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
return -1;
}
#endif