mirror of https://github.com/alibaba/MNN.git
//
//  MetalBackend.mm
//  MNN
//
//  Created by MNN on 2019/01/30.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#import "backend/metal/MetalBackend.hpp"
#define MNN_METAL
#import <MNN/MNNSharedContext.h>
#define METAL_CONST_BUFFER_LIMIT 128
#define METAL_SEPERATE_MAX_COUNT 2
#if MNN_METAL_ENABLED
#include <mutex>
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "core/TensorUtils.hpp"
#include "MetalCache_generated.h"
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
    if (nullptr == content || nullptr == tensor) {
        return 0;
    }
    auto t = (MNN::Tensor*)tensor;
    auto des = MNN::TensorUtils::getDescribe(t);
    content->buffer = ((MNN::MetalRuntimeAllocator::MetalBufferAlloc*)t->deviceId())->getBuffer();
    content->texture = nil;
    content->offset = des->extra.offset;
    return 0;
}
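// Illustrative usage (editor's sketch, not part of the original file): given an
// MNN::Tensor that lives on the Metal backend, the C entry point above fills an
// MNNMetalTensorContent (declared in <MNN/MNNSharedContext.h>) with the backing
// id<MTLBuffer> and its byte offset, e.g.
//
//     MNNMetalTensorContent content;
//     MNNMetalGetTensorContent(&content, (void*)tensor); // tensor: MNN::Tensor* on Metal
//     id<MTLBuffer> buf = content.buffer;                // data starts at content.offset
//
// The same struct is also filled through MetalBackend::onGetTensorInfo below.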
namespace MNN {

static void _MetalApplyTensor(uint8_t* host, size_t offset, Tensor* t) {
    // ptr of MetalBufferAlloc
    t->buffer().device = (uint64_t)host;
    auto des = TensorUtils::getDescribe(t);
    des->extra.offset = offset;
}
BufferAllocator* MetalRuntime::createDynamicAllocator(int index, bool secondResize) const {
    if (hint().memoryAllocatorType == Runtime::Allocator_Defer && secondResize) {
        return new DeferBufferAllocator(buffer(index), 1024, _MetalApplyTensor);
    }
    if (mStaticCache.get() != nullptr) {
        return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticCache.get()), 1024);
    }
    return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStatic.get()), 1024);
}

struct TunedInfo {
    std::vector<std::unique_ptr<MetalCache::OpInfoT>> mInfos;
};

void registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
extern void registerMetalRenderOps();
#endif

static inline std::map<OpType, MetalBackend::Creator *> *getCreatorMap() {
    static std::once_flag of;
    static std::map<OpType, MetalBackend::Creator *> *ret = nullptr;
    std::call_once(of, [&]() { ret = new std::map<OpType, MetalBackend::Creator *>; });
    return ret;
}

void MetalBackend::addCreator(OpType t, Creator *c) {
    auto map = getCreatorMap();
    if (map->find(t) != map->end()) {
        MNN_PRINT("Error: %d type has been added\n", t);
    }
    map->insert(std::make_pair(t, c));
}

MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime, bool usefp16AsFp32, BackendConfig::MemoryMode mode) : Backend(MNN_FORWARD_METAL),
    mEmptyMem(nil)
{
    mRuntime = runtime;
    auto ctx = (__bridge MNNMetalContext *)runtime->context();
    mBufferPool.reset(runtime->createDynamicAllocator(0, false));
    mCurrentAllocator = mBufferPool.get();
    mStaticBufferPool = staticMem;
    mUseFloatAsFp16 = usefp16AsFp32;
    mMemoryMode = mode;
    mIsIphone = ctx.isIphone;
    if (runtime->getCommandQueue() == nil) {
        // one command queue can only create a limited number of command buffers, so let each backend own its own command queue
        _commandQueue = [[ctx device] newCommandQueue];
        mSupportDeferEncode = true;
    } else {
        // otherwise disable the deferred-encode optimization
        _commandQueue = runtime->getCommandQueue();
        mSupportDeferEncode = false;
    }
    _commandBuffer = nil;
    _commandBuffer_net = nil;
    _waiting = nil;
}
MetalBackend::~MetalBackend() {
    flushEncoder();
}

id<MTLComputeCommandEncoder> MetalBackend::encoder_net() const {
    id<MTLComputeCommandEncoder> result = [getCommandBufferForNet() computeCommandEncoder];
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
    result.label = nil;
#endif
    return result;
}

void *MetalBackend::context() const {
    return mRuntime->context();
}

class MetalMemRelease : public Backend::MemObj {
public:
    MetalMemRelease(MemChunk buffer, BufferAllocator* allocator) {
        mBuffer = buffer;
        mAllocator = allocator;
    }
    virtual ~ MetalMemRelease() {
        mAllocator->free(mBuffer);
    }
    MemChunk chunk() override {
        return mBuffer;
    }
private:
    MemChunk mBuffer;
    BufferAllocator* mAllocator;
};
size_t MetalBackend::getTensorSizeInBytes(const Tensor* tensor) const {
    auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
    size_t size;
    if (MNN_DATA_FORMAT_NC4HW4 == format && tensor->dimensions() >= 2) {
        int width = 1;
        int height = 1;
        int batch = tensor->length(0);
        int channel = tensor->length(1);
        if (tensor->dimensions() >= 3) {
            height = tensor->length(2);
        }
        for (int i=3; i<tensor->dimensions(); ++i) {
            width *= tensor->length(i);
        }
        int alignC = ROUND_UP(channel, 4);
        int hR = ROUND_UP(height, 4) - height;
        // width is processed 4 elements at a time, so the padding may exceed by up to 3 elements
        int wR = ROUND_UP(width + 3, 4) - width;
        int bhw = batch * width * height;
        int bhwR = UP_DIV(bhw, 16) * 16 - bhw;
        int extraPadding = ALIMAX(bhwR, (hR * width + wR));
        size = batch * alignC * width * height;
        size = size + extraPadding * 4;
    } else {
        size = 1;
        for (int i=0; i<tensor->dimensions(); ++i) {
            size *= tensor->length(i);
        }
        size = ROUND_UP(size, 4);
    }
    if (0 == size) {
        return 0;
    }
    // use half (metal float16) for fp32 tensors when fp16 mode is enabled
    if (halide_type_float == tensor->buffer().type.code && tensor->buffer().type.bits == 32 && mUseFloatAsFp16) {
        size *= 2;
    } else {
        size *= tensor->getType().bytes();
    }
    size_t align = 4 * sizeof(int);
    size = ROUND_UP(size, align);
    return size;
}
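// Worked example for getTensorSizeInBytes (editor's note, assuming an NC4HW4 float
// tensor of shape [1, 3, 224, 224] with mUseFloatAsFp16 == true):
//   batch = 1, channel = 3, height = 224, width = 224
//   alignC = ROUND_UP(3, 4) = 4
//   hR = ROUND_UP(224, 4) - 224 = 0, wR = ROUND_UP(224 + 3, 4) - 224 = 4
//   bhw = 1 * 224 * 224 = 50176, bhwR = UP_DIV(50176, 16) * 16 - 50176 = 0
//   extraPadding = max(0, 0 * 224 + 4) = 4
//   size = 1 * 4 * 224 * 224 + 4 * 4 = 200720 elements
//   bytes = 200720 * 2 (half) = 401440, already a multiple of 16, so 401440 bytes.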
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
    auto tensor = const_cast<Tensor *>(_tensor);
    size_t size = getTensorSizeInBytes(_tensor);
    if (0 == size) {
        return nullptr;
    }
    // reuse if possible
    MemChunk buffer;
    BufferAllocator* allocator = nullptr;
    switch (storageType) {
        case Backend::STATIC: {
            buffer = mStaticBufferPool->alloc(size, false);
            allocator = mStaticBufferPool.get();
        } break;
        case Backend::DYNAMIC: {
            buffer = mCurrentAllocator->alloc(size, false);
            allocator = mCurrentAllocator;
        } break;
        case Backend::DYNAMIC_SEPERATE: {
            buffer = mCurrentAllocator->alloc(size, true);
            allocator = mCurrentAllocator;
        } break;
        default:{
            break;
        }
    }
    if (storageType == Backend::STATIC) {
        if(nullptr == buffer.first) {
            MNN_ERROR("onAcquireBuffer error!\n");
            return nullptr;
        }
    } else {
        buffer.attach(tensor);
    }
    if (nullptr == buffer.first) {
        _MetalApplyTensor((uint8_t*)(&mEmptyMem), 0, (Tensor*)_tensor);
    } else {
        _MetalApplyTensor((uint8_t*)buffer.first, buffer.second, (Tensor*)_tensor);
    }
    return new MetalMemRelease(buffer, allocator);
}

bool MetalBackend::onClearBuffer() {
    mCurrentAllocator->release(true);
    if (nullptr != mRuntime->mStaticCache.get()) {
        mStaticBufferPool = mRuntime->mStaticCache;
    }
    return true;
}

Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                  const Op *op) {
    auto map = getCreatorMap();

    auto iter = map->find(op->type());
    if (iter == map->end()) {
        mSupportDeferEncode = false;
        if (nullptr != op->name()) {
            MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str());
        } else {
            MNN_PRINT("Don't support type [%s]\n", EnumNameOpType(op->type()));
        }
        return NULL;
    }
    //MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));

    auto exe = iter->second->onCreate(inputs, op, this, outputs);
    if (NULL == exe) {
        mSupportDeferEncode = false;
        MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : "");
        return NULL;
    }
    return exe;
}
void MetalBackend::flushEncoder() const {
    if (nil != mComputeEncoder) {
        [mComputeEncoder endEncoding];
        mComputeEncoder = nil;
    }
}
void MetalBackend::_resetDynamicMemory() const {
    mRuntime->pCurrentStatus = mCurrentAllocator->apply();
    if (NO_ERROR != mRuntime->pCurrentStatus) {
        return;
    }
    if (nullptr != mBufferPoolShapeImmutable.get()) {
        mRuntime->pCurrentStatus = mBufferPoolShapeImmutable->apply();
    }
}

void MetalBackend::onExecuteBegin() const {
    _resetDynamicMemory();
    mEncoderCount = 0;
}
void MetalBackend::onExecuteEnd() const {
    flushEncoder();
    commit_net();
}

BufferAllocator* MetalBackend::getBufferPool() const {
    return mCurrentAllocator;
}

bool MetalBackend::onSelectDynamicAllocator(int index, int maxIndex) {
    if (maxIndex > 2) {
        return false;
    }
    if (maxIndex == 2 && mBufferPoolShapeImmutable.get() == nullptr) {
        mBufferPoolShapeImmutable.reset(mRuntime->createDynamicAllocator(1, true));
        mBufferPool.reset(mRuntime->createDynamicAllocator(0, true));
    }
    if (1 == index) {
        mCurrentAllocator = mBufferPoolShapeImmutable.get();
    } else {
        mCurrentAllocator = mBufferPool.get();
    }
    return true;
}

bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
    if (nullptr == dstInfo) {
        return true;
    }
    auto dst = (MNNMetalTensorContent*)dstInfo;
    dst->type.code = halide_type_float;
    if (mUseFloatAsFp16) {
        dst->type.bits = 16;
    } else {
        dst->type.bits = 32;
    }
    MNNMetalGetTensorContent(dst, (void*)tensor);
    return true;
}

bool MetalBackend::isCmdBufferCommit() {
    auto ctx = (__bridge MNNMetalContext *)context();

    //TODO: set magic number
    const int magicNum = mRuntime->hint().encorderNumForCommit;
    mEncoderCount++;
    if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
        return true;
    }
    return false;
}
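// Editor's note: with hint().encorderNumForCommit == N, isCmdBufferCommit() returns
// true on every Nth call (mEncoderCount == N, 2N, ...), which lets callers commit the
// in-flight net command buffer periodically instead of encoding the whole graph first.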
id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
    size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
    // reuse
    if (nullptr != mHostBuffer && mHostBuffer.length >= size) {
        return mHostBuffer;
    }

    // create larger
    auto context = (__bridge MNNMetalContext *)this->context();
    mHostBuffer = [context newDeviceBuffer:size access:CPUReadWrite];
    return mHostBuffer;
}

id<MTLBuffer> MetalBackend::getConstBuffer(size_t size) const {
    if (size < METAL_CONST_BUFFER_LIMIT) {
        if (!mHoldBuffers.empty()) {
            auto res = mHoldBuffers.front();
            mHoldBuffers.pop();
            return res;
        }
        size = METAL_CONST_BUFFER_LIMIT;
    }
    auto context = (__bridge MNNMetalContext *)this->context();
    auto buffer = [context newDeviceBuffer:size access:CPUReadWrite];
    return buffer;
}
void MetalBackend::returnConstBuffer(id<MTLBuffer> buffer) const {
    mHoldBuffers.push(buffer);
}
static inline void _getNCPlane(const Tensor* tensor, int& s, int& c, int& b) {
    auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
    s = 1, c = 1, b = 1;
    b = tensor->length(0);
    if (format == MNN_DATA_FORMAT_NHWC) {
        c = tensor->length(tensor->dimensions()-1);
        for (int i=1; i<tensor->dimensions()-1; ++i) {
            s *= tensor->length(i);
        }
    } else {
        c = tensor->length(1);
        for (int i=2; i<tensor->dimensions(); ++i) {
            s *= tensor->length(i);
        }
    }
}
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
    auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
    int s, b, c;
    _getNCPlane(tensor, s, c, b);
    int z = UP_DIV(c, 4);

    // shape
    ((int *)shape.contents)[0] = b;
    ((int *)shape.contents)[1] = c;
    ((int *)shape.contents)[2] = s;
    ((int *)shape.contents)[3] = 1;

    // stride
    if (format == MNN_DATA_FORMAT_NHWC) {
        ((int *)shape.contents)[4] = s * c;
        ((int *)shape.contents)[5] = 1;
        ((int *)shape.contents)[6] = c;
        ((int *)shape.contents)[7] = 1;
    } else {
        ((int *)shape.contents)[4] = s * c;
        ((int *)shape.contents)[5] = s;
        ((int *)shape.contents)[6] = 1;
        ((int *)shape.contents)[7] = 1;
    }
    // threads
    MTLSize threads = {(NSUInteger)s * b * z, 1, 1};
    return threads;
}
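// Editor's example for getTensorShape: the 8-int constant buffer is packed as
// { batch, channel, plane, 1, stride_n, stride_c, stride_plane, 1 }. For an NCHW
// tensor of shape [2, 3, 5, 7]: b = 2, c = 3, s = 5 * 7 = 35, so
//     shape.contents -> { 2, 3, 35, 1, 105, 35, 1, 1 }
// and the returned thread count is s * b * UP_DIV(c, 4) = 35 * 2 * 1 = 70.
// For NHWC with the same sizes the strides become { 105, 1, 3, 1 }.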
static const char* gTranspose = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
    uint4 size; // n, c, plane, 1
    uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
    int channel = uConstant.size.y;
    if (gid < channel * uConstant.size.x * uConstant.size.z) {
        int tmp = gid % (channel * uConstant.size.x);
        int x = gid / (channel * uConstant.size.x);
        int b = tmp / channel;
        int c = tmp % channel;
        int outPos = b * uConstant.size.y * uConstant.size.z + c * uConstant.size.z + x;
        int inPos = b * uConstant.size.y * uConstant.size.z + c + x * uConstant.size.y;
        out[outPos] = (OType)(in[inPos]);
    }
})metal";

static const char* gNC4HW4Convert = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
    uint4 size; // n, c, plane, 1
    uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
    int channelC4 = (uConstant.size.y + 3) / 4;
    if (gid < channelC4 * uConstant.size.x * uConstant.size.z)
    {
        int3 pos;
        pos.z = gid % (channelC4 * uConstant.size.x);
        pos.y = gid / (channelC4 * uConstant.size.x);
        pos.x = 0;
        int batchIndex = pos.z / channelC4;
        int zDiv4 = pos.z % channelC4;

        int lastZ = uConstant.size.y / 4;
        int cIndex = uConstant.size.y % 4;

        int z = zDiv4*4;
        int basicOffset = 0
            + batchIndex*uConstant.stride.x
            + z * uConstant.stride.y
            + pos.y * uConstant.stride.z
            ;
#ifdef MNN_OUTPUT_C4
        OType color = OType(0);
        if(zDiv4 == lastZ)
        {
            if(cIndex == 1)
            {
                color.r = in[basicOffset+0];
                color.g = 0.0;
                color.b = 0.0;
                color.a = 0.0;
            }
            else if(cIndex == 2)
            {
                color.r = in[basicOffset+0];
                color.g = in[basicOffset+1*uConstant.stride.y];
                color.b = 0.0;
                color.a = 0.0;
            }
            else
            {
                color.r = in[basicOffset+0];
                color.g = in[basicOffset+1*uConstant.stride.y];
                color.b = in[basicOffset+2*uConstant.stride.y];
                color.a = 0.0;
            }
        }
        else
        {
            color.r = in[basicOffset+0];
            color.g = in[basicOffset+1*uConstant.stride.y];
            color.b = in[basicOffset+2*uConstant.stride.y];
            color.a = in[basicOffset+3*uConstant.stride.y];
        }

        out[0
            + pos.y
            + uConstant.size.x * uConstant.size.z*zDiv4
            + batchIndex*uConstant.size.z
        ] = color;
#else
        IType color = in[0
            + pos.y
            + uConstant.size.x * uConstant.size.z*zDiv4
            + batchIndex*uConstant.size.z
        ];
        if(zDiv4 == lastZ)
        {
            if(cIndex == 1)
            {
                out[basicOffset+0*uConstant.stride.y] = color.r;
            }
            else if(cIndex == 2)
            {
                out[basicOffset+0*uConstant.stride.y] = color.r;
                out[basicOffset+1*uConstant.stride.y] = color.g;
            }
            else
            {
                out[basicOffset+0*uConstant.stride.y] = color.r;
                out[basicOffset+1*uConstant.stride.y] = color.g;
                out[basicOffset+2*uConstant.stride.y] = color.b;
            }
        }
        else
        {
            out[basicOffset+0*uConstant.stride.y] = color.r;
            out[basicOffset+1*uConstant.stride.y] = color.g;
            out[basicOffset+2*uConstant.stride.y] = color.b;
            out[basicOffset+3*uConstant.stride.y] = color.a;
        }
#endif
    }
}
)metal";

static const char* gCopy = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buffer(1)]], constant uint4& limit [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
    if (gid < limit.x) {
        out[int(gid)] = (OType)in[int(gid)];
    }
})metal";

void MetalBackend::onResizeBegin() {
    // Abort last inference task if needed
    flushEncoder();
    _commandBuffer_net = nil;
    _commandBuffer = nil;
    wait();
    mCurrentAllocator->reset();
}

ErrorCode MetalBackend::onResizeEnd() {
    auto ctx = (__bridge MNNMetalContext *)context();
    return mCurrentAllocator->compute();
}

static std::string _getType(const halide_type_t& type, MNN_DATA_FORMAT format, bool useFp16AsFp32) {
    std::string res;
    if (type.code == halide_type_float) {
        if (useFp16AsFp32) {
            res = "half";
        } else {
            res = "float";
        }
    } else {
        switch (type.bytes()) {
            case 1:
                res = "char";
                break;
            case 2:
                res = "short";
                break;
            case 4:
                res = "int";
                break;
            default:
                MNN_ASSERT(false);
                break;
        }
    }
    if (format == MNN_DATA_FORMAT_NC4HW4) {
        return res + "4";
    }
    return res;
}
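// Editor's note on _getType: it picks the Metal scalar type used to specialize the
// copy/convert kernels above (via the IType/OType preprocessor macros), e.g.
//     float32, useFp16AsFp32 = true  -> "half"
//     float32, useFp16AsFp32 = false -> "float"
//     int32 -> "int", int16 -> "short", int8 -> "char"
//     any of the above with MNN_DATA_FORMAT_NC4HW4 -> vectorized form ("half4", "float4", ...)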
MetalBackend::CopyPipeline MetalBackend::_makeCopyInfo(const Tensor *src, const Tensor *dst, id<MTLBuffer> shape, int castType) const {
    auto ctx = (__bridge MNNMetalContext *)context();
    MetalBackend::CopyPipeline res;
    auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
    auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
    if (shape == nil) {
        shape = getConstBuffer(8 * sizeof(int));
    }
    res.shape = shape;
    if (sfmt == dfmt || src->dimensions() <= 1) {
        auto srcType = _getType(src->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 1);
        auto dstType = _getType(dst->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 2);
        auto size = dst->elementSize();
        size = UP_DIV(size, 4);
        std::vector<std::string> keys = {
            "copyC4",
            srcType,
            dstType
        };
        ((uint32_t*)[shape contents])[0] = size;
        id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
        if (nil == pipeline) {
            MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
            auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
            [dic setValue:@(keys[1].c_str()) forKey:@"IType"];
            [dic setValue:@(keys[2].c_str()) forKey:@"OType"];
            option.preprocessorMacros = dic;
            pipeline = makeComputePipelineWithSourceOption(gCopy, "main0", option);
            mRuntime->insertPipeline(keys, pipeline);
        }
        res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
        res.localSize = MTLSizeMake(256, 1, 1);
        res.pipeline = pipeline;
        return res;
    }
    auto srcType = _getType(src->getType(), sfmt, mUseFloatAsFp16 && castType != 1);
    auto dstType = _getType(dst->getType(), dfmt, mUseFloatAsFp16 && castType != 2);
    if (sfmt == MNN_DATA_FORMAT_NC4HW4 || dfmt == MNN_DATA_FORMAT_NC4HW4) {
        auto normalTensor = dst;
        if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
            normalTensor = src;
        }
        // convert C4 / NCHW
        std::vector<std::string> keys = {
            "c4convert",
            srcType,
            dstType
        };
        if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
            keys.emplace_back("outputc4");
        }
        id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
        if (nil == pipeline) {
            MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
            auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
            [dic setValue:@(keys[1].c_str()) forKey:@"IType"];
            [dic setValue:@(keys[2].c_str()) forKey:@"OType"];
            if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
                [dic setValue:@"1" forKey:@"MNN_OUTPUT_C4"];
            }
            option.preprocessorMacros = dic;
            pipeline = makeComputePipelineWithSourceOption(gNC4HW4Convert, "main0", option);
            mRuntime->insertPipeline(keys, pipeline);
        }
        res.pipeline = pipeline;
        auto size = getTensorShape(shape, normalTensor);
        auto gl = [ctx computeBestGroupAndLocal:pipeline threads:size];
        res.groupSize = gl.first;
        res.localSize = gl.second;
        return res;
    }
    // NCHW <-> NHWC
    std::vector<std::string> keys = {
        "transpose",
        srcType,
        dstType
    };
    id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
    if (nil == pipeline) {
        MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
        auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
        [dic setValue:@(keys[1].c_str()) forKey:@"IType"];
        [dic setValue:@(keys[2].c_str()) forKey:@"OType"];
        option.preprocessorMacros = dic;
        pipeline = makeComputePipelineWithSourceOption(gTranspose, "main0", option);
        mRuntime->insertPipeline(keys, pipeline);
    }
    res.pipeline = pipeline;
    int n, c, plane;
    _getNCPlane(dst, plane, c, n);
    auto shapePtr = (uint32_t*)shape.contents;
    shapePtr[0] = n;
    shapePtr[3] = 1;
    if (MNN_DATA_FORMAT_NHWC == dfmt) {
        shapePtr[1] = plane;
        shapePtr[2] = c;
    } else {
        shapePtr[1] = c;
        shapePtr[2] = plane;
    }
    auto size = plane * n * c;
    res.localSize = MTLSizeMake(256, 1, 1);
    res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
    return res;
}

static void _execute(id<MTLComputeCommandEncoder> encoder, const MetalBackend::CopyPipeline& info, std::pair<id<MTLBuffer>, int> src, std::pair<id<MTLBuffer>, int> dst) {
    [encoder setComputePipelineState:info.pipeline];
    [encoder setBuffer:src.first offset:src.second atIndex:0];
    [encoder setBuffer:dst.first offset:dst.second atIndex:1];
    [encoder setBuffer:info.shape offset:0 atIndex:2];
    [encoder dispatchThreadgroups:info.groupSize threadsPerThreadgroup:info.localSize];
}
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
                                        id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape, int castType) const {
    auto ctx = (__bridge MNNMetalContext *)context();
    auto info = _makeCopyInfo(src, dst, shape, castType);
    auto standalone = encoder == nil;
    encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
    _execute(encoder, info, MetalBackend::getBuffer(src), MetalBackend::getBuffer(dst));
    if (standalone) {
        [encoder endEncoding];
        MNN_PRINT_ENCODER(ctx, encoder);
    }
}

void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
    flushEncoder();
    auto ctx = (__bridge MNNMetalContext *)context();
    commit_net();

    _resetDynamicMemory();
    onCopyBuffer(src, dst, nil, nil);
}

id<MTLComputeCommandEncoder> MetalBackend::encoder_for_net() const {
    if (nil == mComputeEncoder) {
        mComputeEncoder = encoder_net(); // TODO: decide which command buffer to use
    }
    return mComputeEncoder;
}

void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
    MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);

    if (!src->buffer().host && !dst->buffer().host) {
        onCopyDeviceToDevice(src, dst, encoder, shape);
        return;
    }
    auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
    auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
    bool formatDiff = sfmt != dfmt && src->dimensions() > 1;
    auto floats = src->getType().code == halide_type_float;
    bool dataTypeDiff = floats && mUseFloatAsFp16;
    bool needConvert = formatDiff || dataTypeDiff;

    if (!src->buffer().host && dst->buffer().host) {
        auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)src->deviceId())->getBuffer();
        auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(src)->extra.offset;
        if (needConvert) {
            auto tDst = const_cast<Tensor*>(dst);
            auto tmpBuffer = getHostBuffer(dst->usize());
            auto info = _makeCopyInfo(src, dst, shape, 2);
            auto standalone = encoder == nil;
            encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
            _execute(encoder, info, MetalBackend::getBuffer(src), std::make_pair(tmpBuffer, 0));
            if (standalone) {
                [encoder endEncoding];
            }
            commit();
            devicePtr = (uint8_t*)tmpBuffer.contents;
        }
        wait();
        ::memcpy(dst->host<void>(), devicePtr, dst->usize());
        return;
    }
    if (src->buffer().host && !dst->buffer().host) {
        // For a user-provided command queue, the user must make sure the last frame's GPU work has finished
        bool needWait = !mRuntime->userSync();
        if (needWait) {
            wait();
        }
        auto srcSize = src->usize();
        if (needConvert) {
            auto tmpBuffer = getHostBuffer(srcSize);
            ::memcpy(tmpBuffer.contents, src->host<void>(), srcSize);
            auto info = _makeCopyInfo(src, dst, shape, 1);
            auto standalone = encoder == nil;
            encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
            _execute(encoder, info, std::make_pair(tmpBuffer, 0), MetalBackend::getBuffer(dst));
            if (standalone) {
                [encoder endEncoding];
            }
            commit();
        } else {
            auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)dst->deviceId())->getBuffer();
            auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(dst)->extra.offset;
            ::memcpy(devicePtr, src->host<void>(), srcSize);
        }
        return;
    }
    MNN_ASSERT(false); // should not be handled here
}
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
    flushEncoder();
    auto ctx = (__bridge MNNMetalContext *)context();
    commit_net();

    if (toCpu) {
        wait();
    }
    return 0;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForBufferCopy() const {
    if (nil == _commandBuffer) {
        _commandBuffer = [_commandQueue commandBuffer];
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer_net = _commandBuffer;
        }
    }
    return _commandBuffer;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForNet() const {
    if (nil == _commandBuffer_net) {
        _commandBuffer_net = [_commandQueue commandBuffer];
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer = _commandBuffer_net;
        }
    }
    return _commandBuffer_net;
}

void MetalBackend::setTensor(const MNN::Tensor* tensor, id<MTLComputeCommandEncoder> encoder, int index) {
    [encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer() offset:TensorUtils::getDescribe(tensor)->extra.offset atIndex:index];
}
void MetalBackend::setMem(const MemChunk& chunk, id<MTLComputeCommandEncoder> encoder, int index) {
    [encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer() offset:chunk.second atIndex:index];
}
uint8_t* MetalBackend::getMemPtr(const MemChunk& chunk) {
    return (uint8_t*)((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer().contents + chunk.second;
}

std::pair<id<MTLBuffer>, int> MetalBackend::getBuffer(const MNN::Tensor* tensor) {
    return std::make_pair(((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer(), TensorUtils::getDescribe(tensor)->extra.offset);
}


void MetalBackend::commit() const {
    if (nil != _commandBuffer && _commandBuffer.status < MTLCommandBufferStatusCommitted) {
        [_commandBuffer commit];
        _waiting = _commandBuffer;
        _commandBuffer = nil;
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer_net = nil;
        }
    }
}

void MetalBackend::commit_net() const {
    if (nil != _commandBuffer_net && _commandBuffer_net.status < MTLCommandBufferStatusCommitted) {
        [_commandBuffer_net commit];
        _waiting = _commandBuffer_net;
        _commandBuffer_net = nil;
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer = nil;
        }
    }
}

void MetalBackend::wait() const {
    if (nil != _waiting) {
        auto buffer = _waiting;
        if (buffer.status >= MTLCommandBufferStatusCompleted) {
            _waiting = nil;
            return;
        }

#if MNN_METAL_BENCHMARK
        NSTimeInterval begin = [NSDate timeIntervalSinceReferenceDate];
        [buffer waitUntilCompleted];
        NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
        if (@available(iOS 10.3, *)) {
            printf("[METAL] commit costs: %.3fms\t(kernel: %.3fms, GPU: %.3fms)\n", (end - begin) * 1000.f,
                   (buffer.kernelEndTime - buffer.kernelStartTime) * 1000.f,
                   (buffer.GPUEndTime - buffer.GPUStartTime) * 1000.f);
        } else {
            printf("[METAL] commit costs: %.3fms\n", (end - begin) * 1000.f);
        }
#else
        [buffer waitUntilCompleted];
#endif

#if MNN_METAL_DEBUG
        if (buffer.error) {
            printf("[METAL] %s\n", buffer.error.localizedDescription.UTF8String);
        }
#endif
    }
    _waiting = nil;
}
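// Editor's summary of the command-buffer lifecycle above: work for the network is
// encoded against getCommandBufferForNet() and submitted by commit_net(); buffer
// copies use getCommandBufferForBufferCopy() and commit(). Either commit path stores
// the submitted buffer in _waiting, and wait() blocks on it before any CPU-side read
// of an MTLBuffer's contents (as onCopyBuffer does for device-to-host transfers).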
id<MTLComputePipelineState> MetalBackend::makeComputePipelineWithSourceOption(const char* csource, const char* cname, MTLCompileOptions *options) const{
    auto ctx = (__bridge MNNMetalContext *)context();
    auto source = [[NSString alloc] initWithUTF8String:csource];
    auto name = [[NSString alloc] initWithUTF8String:cname];
    auto pipeline = [ctx pipelineWithSourceOption:source name:name options:options];
    if (nil == pipeline) {
        mRuntime->pCurrentStatus = NOT_SUPPORT;
    }
    return pipeline;
}
void MetalRuntime::setCommandQueue(id<MTLCommandQueue> queue, bool userSync) {
    mQueue = queue;
    mUserSync = userSync;
}
id<MTLComputePipelineState> MetalRuntime::findPipeline(const std::vector<std::string>& keys) const {
    auto iter = mCachePipeine.find(keys);
    if (iter == mCachePipeine.end()) {
        return nil;
    }
    return iter->second;
}
void MetalRuntime::insertPipeline(const std::vector<std::string>& keys, id<MTLComputePipelineState> pipeline) const {
    if (nil != pipeline) {
        mCachePipeine.insert(std::make_pair(keys, pipeline));
    }
}

void MetalRuntime::setGpuMode(const int mode_num) {
    int totalSet = 0;
    bool isSet = (mode_num & MNN_GPU_MEMORY_BUFFER);
    if(isSet) {
        totalSet++;
    }
    isSet = (mode_num & MNN_GPU_MEMORY_IMAGE);
    if(isSet) {
        totalSet++;
    }
    if(totalSet > 0) {
        MNN_PRINT("warning: setting BUFFER or IMAGE memory mode has no effect for metal, cl_mode:%x!\n", mode_num);
    }

    totalSet = 0;
    isSet = (mode_num & MNN_GPU_TUNING_NONE);
    if(isSet) {
        mTuneLevel = Never;
        totalSet++;
    }

    isSet = (mode_num & MNN_GPU_TUNING_FAST);
    if(isSet) {
        mTuneLevel = Fast;
        totalSet++;
    }

    isSet = (mode_num & MNN_GPU_TUNING_NORMAL);
    if(isSet) {
        mTuneLevel = Normal;
        totalSet++;
    }

    isSet = (mode_num & MNN_GPU_TUNING_HEAVY);
    if(isSet) {
        mTuneLevel = Heavy;
        totalSet++;
    }

    isSet = (mode_num & MNN_GPU_TUNING_WIDE);
    if(isSet) {
        mTuneLevel = Wide;
        totalSet++;
    }

    if(totalSet != 1) {
        MNN_PRINT("exactly one tuning mode should be set, please check cl_mode:%x!\n", mode_num);
    }
}
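// Editor's example (not in the original source): mode_num is a bitmask of the
// MNN_GPU_* flags. The memory flags are ignored on Metal (only a warning is printed)
// and exactly one tuning flag is expected, e.g.
//     runtime->setGpuMode(MNN_GPU_TUNING_FAST);                          // mTuneLevel = Fast
//     runtime->setGpuMode(MNN_GPU_TUNING_WIDE | MNN_GPU_MEMORY_BUFFER);  // Wide + memory warning
// In practice MetalRuntime::create() below calls setGpuMode(info.gpuMode).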
struct MetalContext {
    std::mutex pLock;
    MNNMetalContext* pContext;
    id<MTLDevice> pDevice;
};
static MetalContext* gContext = nullptr;
MetalRuntime* MetalRuntime::create(const Backend::Info& info) {
    std::unique_lock<std::mutex> _l(gContext->pLock);
    MNNMetalSharedContext sharedContext;
    sharedContext.device = nil;
    sharedContext.queue = nil;
    if (info.user != nullptr) {
        if (info.user->sharedContext != nullptr) {
            sharedContext.device = ((MNNMetalSharedContext*)info.user->sharedContext)->device;
            sharedContext.queue = ((MNNMetalSharedContext*)info.user->sharedContext)->queue;
        }
    }
    if (nil == sharedContext.device) {
        sharedContext.device = MTLCreateSystemDefaultDevice();
    }
    if (nil == gContext->pContext || gContext->pDevice != sharedContext.device) {
        gContext->pContext = [[MNNMetalContext alloc] init];
        gContext->pDevice = sharedContext.device;
        BOOL res = [gContext->pContext initWithSharedContext:&sharedContext dev:sharedContext.device];
        if (!res) {
            gContext->pContext = nil;
            return nullptr;
        }
    }
    auto mContext = (__bridge_retained void *)(gContext->pContext);
    auto rt = new MetalRuntime(mContext);
    rt->setGpuMode(info.gpuMode);
    if (nil != sharedContext.queue) {
        rt->setCommandQueue(sharedContext.queue, true);
    }
    bool supportDefer = info.numThread & MNN_GPU_RECORD_BATCH;
    if ((!supportDefer) && nil == sharedContext.queue) {
        id<MTLCommandQueue> queue = [sharedContext.device newCommandQueue];
        rt->setCommandQueue(queue, false);
    }
    if (nullptr != info.user) {
        rt->mDefaultConfig = *info.user;
    }
    return rt;
}

MetalRuntime::MetalRuntime(void* context) {
    mContext = context;
    auto ctx = (__bridge MNNMetalContext *)mContext;
    std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
    mSimdGroupReduce = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
    mSimdGroupReduce |= [[ctx device] supportsFamily:MTLGPUFamilyMetal3];
    mSimdGroupMatrix = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
    mStatic.reset(new EagerBufferAllocator(allocator));
    mDynamic.resize(METAL_SEPERATE_MAX_COUNT);
    for (auto& buf : mDynamic) {
        buf.root = allocator;
    }
    mTunedInfo = new TunedInfo;
}

MetalRuntime::~ MetalRuntime() {
    if(mContext) {
        CFRelease(mContext);
    }
    delete mTunedInfo;
}

bool MetalRuntime::setCache(std::pair<const void*, size_t> cache) { // Get Cache
    auto buffer = cache.first;
    auto size = cache.second;
    if (nullptr == buffer) {
        mCacheOutside = nullptr;
        mCacheOutsideSize = 0;
        mBuffer.clear();
        return false; // actually got nothing
    }
    mCacheOutsideSize = size;
    mCacheOutside = buffer;
    auto cacheBuffer = GetCache(buffer);
    flatbuffers::Verifier verify((const uint8_t*)cache.first, cache.second);
    if (false == VerifyCacheBuffer(verify)) {
        return false;
    }
    if (nullptr == cacheBuffer->tunings()) {
        return false;
    }

    // Load Auto Tuning Info
    if (nullptr != cacheBuffer->tunings()) {
        auto tuningInfo = cacheBuffer->tunings();
        for (int i=0; i<tuningInfo->size(); ++i) {
            auto tun = tuningInfo->GetAs<Autotuning>(i);
            if (nullptr == tun->threadSize() || nullptr == tun->groupSize() || nullptr == tun->key()) {
                MNN_ERROR("Error tuning info\n");
                continue;
            }
            std::vector<uint32_t> glo(tun->threadSize()->size());
            for (int v=0; v<glo.size(); ++v) {
                glo[v] = tun->threadSize()->data()[v];
            }
            std::vector<uint32_t> grop(tun->groupNum()->size());
            for (int v=0; v<grop.size(); ++v) {
                grop[v] = tun->groupNum()->data()[v];
            }
            std::vector<uint32_t> loc(tun->groupSize()->size());
            for (int v=0; v<loc.size(); ++v) {
                loc[v] = tun->groupSize()->data()[v];
            }
            uint32_t cost = tun->timeCost();
            mTunedThreadGroup.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_tuple(grop, loc, cost)));
            mTunedThreadGroupVec[tun->key()->str()].emplace_back(std::make_pair(glo, std::make_tuple(grop, loc, cost)));
        }
    }
    return true;
}

std::pair<const void*, size_t> MetalRuntime::makeCache(TunedInfo* info) { // make Cache
    std::unique_ptr<CacheT> cache(new CacheT);
    // Get All Autotuning cache
    for (auto& iter : mTunedThreadGroup) {
        std::unique_ptr<AutotuningT> tuning(new AutotuningT);
        tuning->key = iter.first.first;
        tuning->threadSize = iter.first.second;

        tuning->groupNum = std::get<0>(iter.second);
        tuning->groupSize = std::get<1>(iter.second);
        tuning->timeCost = std::get<2>(iter.second);

        cache->tunings.emplace_back(std::move(tuning));
    }
    cache->tuned = std::move(info->mInfos);

    flatbuffers::FlatBufferBuilder builder;
    auto lastOffset = Cache::Pack(builder, cache.get());
    builder.Finish(lastOffset);
    mBuffer.resize(builder.GetSize());
    ::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
    return std::make_pair(mBuffer.data(), mBuffer.size());
}

float MetalRuntime::onGetMemoryInMB() {
    auto staticMemoryInMB = mStatic->totalSize() / 1024.0f / 1024.0f;
    float dynamicMemoryInMB = 0.0f;
    for (auto& buf : mDynamic) {
        dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f;
    }
    return staticMemoryInMB + dynamicMemoryInMB;
}

void MetalRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op) {
    if (nullptr != op->name()) {
        auto dstInfo = mTunedInfo;
        std::unique_ptr<MetalCache::OpInfoT> opInfo(new MetalCache::OpInfoT);
        opInfo->type = op->type();
        opInfo->name = op->name()->str();
        opInfo->inputs.resize(inputs.size());
        for (int v=0; v<opInfo->inputs.size(); ++v) {
            opInfo->inputs[v].reset(new MetalCache::TensorInfoT);
            opInfo->inputs[v]->shape.resize(inputs[v]->dimensions());
            for (int u=0; u<opInfo->inputs[v]->shape.size(); ++u) {
                opInfo->inputs[v]->shape[u] = inputs[v]->length(u);
            }
        }
        opInfo->outputs.resize(outputs.size());
        for (int v=0; v<opInfo->outputs.size(); ++v) {
            opInfo->outputs[v].reset(new MetalCache::TensorInfoT);
            opInfo->outputs[v]->shape.resize(outputs[v]->dimensions());
            for (int u=0; u<opInfo->outputs[v]->shape.size(); ++u) {
                opInfo->outputs[v]->shape[u] = outputs[v]->length(u);
            }
        }
        dstInfo->mInfos.emplace_back(std::move(opInfo));
    }
}
static bool _checkTensorInfo(const MetalCache::TensorInfoT* dst, const Tensor* src) {
    if (dst->shape.size() != src->dimensions()) {
        return false;
    }
    for (int j=0; j<dst->shape.size(); ++j) {
        if (dst->shape[j] != src->length(j)) {
            return false;
        }
    }
    return true;
}
bool MetalRuntime::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                             const MNN::Op* op, Runtime::OpInfo& dstInfo) const {
    dstInfo.initCostLong = true;
    if (nullptr == op->name()) {
        dstInfo.initCostLong = false;
        return true;
    }
    for(auto& info : mTunedInfo->mInfos) {
        if (info->type != op->type()) {
            continue;
        }
        if (info->name != op->name()->str()) {
            continue;
        }
        if (info->inputs.size() != inputs.size() || info->outputs.size() != outputs.size()) {
            continue;
        }
        bool match = true;
        for (int i=0; i<inputs.size(); ++i) {
            auto& dst = info->inputs[i];
            auto src = inputs[i];
            if (!_checkTensorInfo(dst.get(), src)) {
                match = false;
                break;
            }
        }
        if (!match) {
            continue;
        }
        for (int i=0; i<outputs.size(); ++i) {
            auto& dst = info->outputs[i];
            auto src = outputs[i];
            if (!_checkTensorInfo(dst.get(), src)) {
                match = false;
                break;
            }
        }
        if (match) {
            // All info matches
            dstInfo.initCostLong = false;
            break;
        }
    }
    return true;
}

class MetalWrapAllocator : public BufferAllocator::Allocator {
private:
    std::shared_ptr<BufferAllocator::Allocator> mOrigin;
    id<MTLDevice> mDevice;
public:
    MetalWrapAllocator(std::shared_ptr<BufferAllocator::Allocator> origin, id<MTLDevice> device) : mOrigin(origin), mDevice(device) {}
    virtual ~ MetalWrapAllocator() {
        // Do nothing
    }
    virtual MemChunk onAlloc(size_t size, size_t align) override {
        auto mem = mOrigin->onAlloc(size, align);
        MNN_ASSERT(mem.second == 0);
        id<MTLBuffer> buffer = [mDevice newBufferWithBytesNoCopy:mem.first length:size options:MTLResourceStorageModeShared deallocator:nil];
        auto wrap = new MetalRuntimeAllocator::MetalBufferAlloc(buffer);
        return MemChunk((void *)wrap, 0);
    }
    virtual void onRelease(MemChunk chunk) override {
        auto mem = (MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first;
        mOrigin->onRelease(MemChunk(mem->getBuffer().contents));
        delete mem;
    }
};
Backend* MetalRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
    if (hint().weightMemoryPath.size() > 0 && mStaticCache.get() == nullptr) {
        auto ctx = (__bridge MNNMetalContext *)mContext;
        auto mmap = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "", "metal.weight");
        std::shared_ptr<BufferAllocator::Allocator> mmapMem(new MetalWrapAllocator(mmap, [ctx device]));
        mStaticCache = mStatic;
        mStatic.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024));
    }
    BackendConfig::PrecisionMode precision = mDefaultConfig.precision;
    BackendConfig::MemoryMode memory = mDefaultConfig.memory;
    if (nullptr != config) {
        precision = config->precision;
        memory = config->memory;
    }
    bool useFp16AsFp32 = precision != BackendConfig::Precision_High;
    return new MetalBackend(mStatic, this, useFp16AsFp32, memory);
}

void MetalRuntime::onGabageCollect(int level) {
    mStatic->release(false);
    if (level >= 100) {
        for (auto& buf : mDynamic) {
            buf.release();
        }
    }
}

std::pair<const void*, size_t> MetalRuntime::onGetCache() { // make Cache
    return makeCache(mTunedInfo);
}

bool MetalRuntime::onSetCache(const void* buffer, size_t size) { // set Cache
    if (nullptr == buffer) {
        return false;
    }
    auto cacheBuffer = MetalCache::GetCache(buffer);
    flatbuffers::Verifier verify((const uint8_t*)buffer, size);
    if (false == VerifyCacheBuffer(verify)) {
        return false;
    }
    if(nullptr != cacheBuffer->tuned()) {
        for (int i=0; i<cacheBuffer->tuned()->size(); ++i) {
            auto srcInfo = cacheBuffer->tuned()->GetAs<MetalCache::OpInfo>(i);
            std::unique_ptr<MetalCache::OpInfoT> dst(srcInfo->UnPack());
            mTunedInfo->mInfos.emplace_back(std::move(dst));
        }
    }
    return setCache(std::make_pair(buffer, size));
}

MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
    auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
    auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
    return MemChunk((void *)mMetalBufferAlloc, 0);
}
void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
    delete (MetalBufferAlloc *)ptr.first;
}

class MetalRuntimeCreator : public RuntimeCreator {
public:
    MetalRuntimeCreator() {
        // Do nothing
    }
    virtual ~ MetalRuntimeCreator() {
        // Do nothing
    }
    virtual Runtime *onCreate(const Backend::Info &info) const {
        auto rt = MetalRuntime::create(info);
        return rt;
    }
private:
    id<MTLDevice> mDevice;
};

void registerMetalRuntimeCreator() {
    // According to
    // https://developer.apple.com/library/archive/documentation/DeviceInformation/Reference/iOSDeviceCompatibility/HardwareGPUInformation/HardwareGPUInformation.html
    // not all devices with iOS 8+ support Metal.
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    if (nil != device) {
        gContext = new MetalContext;
        gContext->pContext = nil;
        gContext->pDevice = nil;
        registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
        registerMetalRenderOps();
#endif
        MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator, false);
    } else {
        MNN_ERROR("Init Metal Error\n");
    }
}
} // namespace MNN
#else
namespace MNN {
void registerMetalRuntimeCreator() {
}
};
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
    return -1;
}

#endif