mirror of https://github.com/alibaba/MNN.git
258 lines
11 KiB
Plaintext
Executable File
258 lines
11 KiB
Plaintext
Executable File
//
|
|
// MetalConvolution.mm
|
|
// MNN
|
|
//
|
|
// Created by MNN on 2019/01/30.
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
//
|
|
|
|
#import "backend/metal/MetalConvolution.hpp"
|
|
#import "core/Macro.h"
|
|
#import "backend/metal/MetalBackend.hpp"
|
|
#import "backend/metal/MetalConvolution1x1.hpp"
|
|
#import "backend/metal/MetalConvolutionGEMM.hpp"
|
|
#import "backend/metal/MetalConvolutionWinograd.hpp"
|
|
|
|
#if MNN_METAL_ENABLED
|
|
namespace MNN {
|
|
|
|
/// Constructs the generic Metal convolution execution: forwards `backend` and `op` to
/// MetalConvolutionCommon and then calls loadWeight with the op's Convolution2D parameters.
MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op) {
    auto conv2D = op->main_as_Convolution2D();
    loadWeight(conv2D);
}
|
|
|
|
// Threadgroup-memory budget (in bytes) for one GEMM step. Deliberately kept well below the
// device's maximum threadgroup memory so that a single step does not take too long.
// Typed constant instead of a macro so it obeys scoping and type checking.
static const int kMaxGemmStepMemory = 8 * 1024;
|
|
|
|
/**
 * Heuristic that decides whether the threadgroup-memory ("local") convolution path is preferred.
 *
 * @param input  Input tensor; only its channel count is read.
 * @param output Output tensor; its width, height and channel count are read.
 * @return false when the output plane (width * height) exceeds 256 elements, or when the
 *         per-step slice count cannot be computed (zero-sized slice, a slice larger than the
 *         available threadgroup memory, or no input channels). Otherwise returns whether
 *         oc_4 * oc_4 * 4 / steps / steps >= width * height.
 */
bool MetalConvolution::isThreadgroupLocalPreferred(const Tensor *input, const Tensor *output) {
    const int plane = output->width() * output->height();
    if (plane > 256) {
        return false;
    }

    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();
    int ic_4     = UP_DIV(input->channel(), 4);
    int oc_4     = UP_DIV(output->channel(), 4);

    int unit        = sizeof(metal_float);
    int sliceMemory = 4 * mKernelY * mKernelX * 4 * unit;
    if (sliceMemory <= 0) {
        // Degenerate kernel size: nothing to stage, and the division below would be undefined.
        return false;
    }

    // Slices that fit in the small step budget use it; larger ones fall back to the device limit.
    int maxMemory = sliceMemory > kMaxGemmStepMemory ? (int)context.maxThreadgroupMemoryLength : kMaxGemmStepMemory;
    int maxStepSlices = maxMemory / sliceMemory;
    if (maxStepSlices <= 0) {
        // A single slice does not fit in threadgroup memory; the local path cannot be used.
        return false;
    }

    int steps = UP_DIV(ic_4, maxStepSlices);
    if (steps <= 0) {
        // No input channels: avoid dividing by zero in the ratio below.
        return false;
    }

    static const int kGemmUnroll = 4;
    return oc_4 * oc_4 * kGemmUnroll / steps / steps >= plane;
}
|
|
|
|
/**
 * Prepares everything that depends on tensor shapes: fills the constant buffer consumed by the
 * convolution shaders, selects the compute pipeline and determines the dispatch sizes.
 *
 * Kernel selection:
 *  - "convk3s1d1p1_w2z4" for 3x3 / stride 1 / pad 1 / dilation 1 convolutions that pass the
 *    `isMuchComputer` size check;
 *  - otherwise, when the runtime's tune level is `Never`, "conv_z4" if the size check passes,
 *    else "conv";
 *  - otherwise both "conv" and "conv_z4" are measured via getGridAndThreadgroup and the one
 *    with the lowest reported cost is kept.
 *
 * @param inputs  inputs[0] is the convolution input tensor.
 * @param outputs outputs[0] is the convolution output tensor.
 * @return NO_ERROR.
 */
ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MetalConvolutionCommon::onResize(inputs, outputs);

    // prepare
    auto backend     = static_cast<MetalBackend *>(this->backend());
    auto context     = (__bridge MNNMetalContext *)backend->context();
    MetalRuntime *rt = (MetalRuntime *)backend->runtime();
    auto input       = inputs[0];
    auto output      = outputs[0];
    auto iw          = input->width();
    auto ih          = input->height();
    auto ic_4        = UP_DIV(input->channel(), 4);
    auto ow          = output->width();
    auto oh          = output->height();
    auto oc_4        = UP_DIV(output->channel(), 4);
    auto ob          = output->batch();

    auto pads      = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
    auto padX      = pads.first;
    auto padY      = pads.second;
    int stepSlices = ic_4;

    // The threadgroup-local path is not used temporarily. To re-enable it, assign
    // isThreadgroupLocalPreferred(input, output) here instead of false.
    mLocalPreferred = false;

    // Resolve stepSlices BEFORE the constants are written, so that the value the shader reads
    // matches the threadgroup memory size whenever the local path is active.
    if (mLocalPreferred) {
        int unit          = sizeof(metal_float);
        int sliceMemory   = 4 * mKernelY * mKernelX * 4 * unit;
        int maxMemory     = sliceMemory > kMaxGemmStepMemory ? (int)context.maxThreadgroupMemoryLength : kMaxGemmStepMemory;
        int maxStepSlices = sliceMemory > 0 ? maxMemory / sliceMemory : 0;
        if (maxStepSlices > 0 && ic_4 > 0) {
            int steps          = UP_DIV(ic_4, maxStepSlices);
            stepSlices         = UP_DIV(ic_4, steps);
            mThreadgroupMemory = stepSlices * sliceMemory;
        } else {
            // A slice does not fit (or there is nothing to process): fall back to the global path.
            mLocalPreferred = false;
        }
    }

    // create const buffer
    int constants[] = {iw,
                       ih,
                       iw * ih,
                       ic_4,
                       ow,
                       oh,
                       ow * oh,
                       oc_4,
                       ob,
                       oc_4 * ob,
                       stepSlices,
                       mKernelX,
                       mKernelY,
                       mKernelX * mKernelY,
                       mStrideX,
                       mStrideY,
                       padX,
                       padY,
                       mDilateX,
                       mDilateY,
                       mActivationType};
    mConstBuffer = backend->getConstBuffer(sizeof(constants));
    ::memcpy(mConstBuffer.contents, constants, sizeof(constants));

    // Buffers handed to getGridAndThreadgroup; identical for every candidate kernel.
    NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                    (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
                    mConstBuffer, mWeight, mBias, nil];

    bool isMuchComputer = (ow * oh >= 32 ? oc_4 >= 4 : oc_4 >= 128);
    bool is3x3s1Conv    = (mKernelX == 3 && mKernelY == 3 && mStrideX == 1 && mStrideY == 1 &&
                        padX == 1 && padY == 1 && mDilateX == 1 && mDilateY == 1);

    if (isMuchComputer && is3x3s1Conv) {
        // Specialised 3x3 kernel: grid is ceil(ow / 2) x oh x (ceil(oc_4 / 4) * ob).
        mPipeline = [context pipelineWithName:@"convk3s1d1p1_w2z4"];

        NSUInteger gid_x = UP_DIV(ow, 2);
        NSUInteger gid_y = oh;
        NSUInteger gid_z = UP_DIV(oc_4, 4) * ob;

        std::string name = "convk3s1d1p1_w2z4";
        auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
        mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
    } else if (rt->getTuneLevel() == Never) {
        // No tuning: choose between the two generic kernels with the static size check only.
        int packC            = 1;
        NSString *kernelName = @"conv";
        if (isMuchComputer) {
            packC      = 4;
            kernelName = @"conv_z4";
        }
        NSUInteger gid_x = ow;
        NSUInteger gid_y = oh;
        NSUInteger gid_z = UP_DIV(oc_4, packC) * ob;

        mPipeline = [context pipelineWithName:kernelName];

        std::string name = [kernelName UTF8String];
        auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
        mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
    } else {
        // Tuning: measure each candidate and keep the one with the lowest reported cost.
        const int total_kernel             = 2;
        NSString *shaderName[total_kernel] = {@"conv", @"conv_z4"};
        int itemW[total_kernel]            = {1, 1};
        int itemH[total_kernel]            = {1, 1};
        int itemC[total_kernel]            = {1, 4};

        int bestIndex       = 0;
        NSUInteger bestCost = NSUIntegerMax;

        for (int knl_idx = 0; knl_idx < total_kernel; knl_idx++) {
            id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx]];
            NSUInteger gid_x = UP_DIV(ow, itemW[knl_idx]);
            NSUInteger gid_y = UP_DIV(oh, itemH[knl_idx]);
            NSUInteger gid_z = UP_DIV(oc_4, itemC[knl_idx]) * ob;

            std::string name = [shaderName[knl_idx] UTF8String];
            auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];

            NSUInteger cost = std::get<2>(ret);
            // Always accept the first candidate so mThreads is never left unset, even if every
            // reported cost is at or above the initial sentinel.
            if (knl_idx == 0 || cost < bestCost) {
                bestCost  = cost;
                bestIndex = knl_idx;
                mThreads  = std::make_pair(std::get<0>(ret), std::get<1>(ret));
            }
        }

        mPipeline = [context pipelineWithName:shaderName[bestIndex]];
    }
    return NO_ERROR;
}
|
|
|
|
/**
 * Encodes the convolution dispatch for the given input/output tensors.
 *
 * If the backend reports that a command encoder is already set, nothing is encoded. Otherwise
 * the encoding closure is executed once immediately and is also registered with the backend via
 * addOpEncoder.
 *
 * Buffer bindings: 0 = input, 1 = output, 2 = constants, 3 = weights, 4 = bias.
 *
 * @param input  Convolution input tensor (device buffer and offset are read).
 * @param output Convolution output tensor (device buffer, offset and channel count are read).
 * @return NO_ERROR.
 */
ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();

    if (backend->isCommandEncoderSet()) {
        return NO_ERROR;
    }

    auto func = [=]() {
        auto encoder = backend->encoder();

        [encoder setComputePipelineState:mPipeline];
        [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer()
                    offset:TensorUtils::getDescribe(input)->extra.offset
                   atIndex:0];
        [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer()
                    offset:TensorUtils::getDescribe(output)->extra.offset
                   atIndex:1];
        [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
        [encoder setBuffer:mWeight offset:0 atIndex:3];
        [encoder setBuffer:mBias offset:0 atIndex:4];

        if (mLocalPreferred) {
            // Threadgroup-memory path (currently never selected: onResize forces
            // mLocalPreferred to false).
            auto oc_4      = UP_DIV(output->channel(), 4);
            auto bandwidth = (MetalBandwidth){mPipeline.threadExecutionWidth, mPipeline.maxTotalThreadsPerThreadgroup, NO};

            [encoder setThreadgroupMemoryLength:mThreadgroupMemory atIndex:0];
            // NOTE(review): mThreads.first is the grid size produced in onResize; confirm it is
            // the thread count this dispatch helper expects before re-enabling the local path.
            [context dispatchEncoder:encoder
                             threads:mThreads.first
                     threadsPerGroup:{ 1, 1, (NSUInteger)oc_4 }
                           bandwidth:bandwidth];
        } else {
            [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
        }

        // need to commit
        if (backend->isCmdBufferCommit()) {
            backend->flushEncoder();
            [context commit_net];
        }
    };

    func();
    backend->addOpEncoder(func);

    return NO_ERROR;
}
|
|
|
|
/// Creator registered for OpType_Convolution. Picks the most specific Metal convolution
/// implementation whose isValid check accepts the op, falling back to MetalConvolution.
class MetalConvolutionCreator : public MetalBackend::Creator {
public:
    /// Returns nullptr when the op carries quantization parameters with scaleInt, or when more
    /// than one input tensor is supplied; otherwise returns a newly allocated execution.
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {
        auto conv = op->main_as_Convolution2D();
        auto quan = conv->quanParameter();
        if (quan != nullptr && quan->has_scaleInt()) {
            return nullptr;
        }
        if (inputs.size() > 1) {
            return nullptr;
        }
        if (op->type() != OpType_Convolution) {
            return new MetalConvolution(backend, op);
        }

        // Candidates are tried in order: Winograd, GEMM, 1x1, then the generic kernel.
        auto input = inputs[0];
        if (MetalConvolutionWinograd::isValid(conv, input, outputs[0])) {
            return new MetalConvolutionWinograd(backend, input, op);
        }
        if (MetalConvolutionGEMM::isValid(conv, input)) {
            return new MetalConvolutionGEMM(backend, input, op);
        }
        if (MetalConvolution1x1::isValid(conv, input)) {
            return new MetalConvolution1x1(backend, op);
        }
        return new MetalConvolution(backend, op);
    }
};
REGISTER_METAL_OP_CREATOR(MetalConvolutionCreator, OpType_Convolution);
|
|
} // namespace MNN
|
|
#endif /* MNN_METAL_ENABLED */
|