mirror of https://github.com/alibaba/MNN.git
312 lines
15 KiB
C++
312 lines
15 KiB
C++
|
|
//
|
||
|
|
// VulkanConvolutionImpl.cpp
|
||
|
|
// MNN
|
||
|
|
//
|
||
|
|
// Created by MNN on 2019/01/31.
|
||
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||
|
|
//
|
||
|
|
|
||
|
|
#include "VulkanConvolutionImpl.hpp"
|
||
|
|
#include "Macro.h"
|
||
|
|
#include "VulkanConvolution.hpp"
|
||
|
|
#include "VulkanConvolutionWinograd.hpp"
|
||
|
|
#include "VulkanMatrixMultier.hpp"
|
||
|
|
namespace MNN {
|
||
|
|
// Divisors used by VulkanConvolutionIm2Col::onEncode to turn the output width ([0]) and height ([1])
// into dispatch group counts for the im2col and col2im passes. The third entry is not read in this file.
// NOTE(review): presumably these mirror the local_size declared in the im2col/col2im shaders - confirm.
static int gPretreatLocalSize[3] = {16, 16, 1};
|
||
|
|
std::shared_ptr<VulkanBuffer> VulkanConvolutionImpl::createBufferForSlideWindow(const VulkanBackend* extra,
|
||
|
|
const Convolution2DCommon* convOption,
|
||
|
|
const float* weightPtr, int ci,
|
||
|
|
int co) {
|
||
|
|
int kw = convOption->kernelX();
|
||
|
|
int kh = convOption->kernelY();
|
||
|
|
const int alignedWeightSize = ALIGN_UP4(ci) * kh * kw * ALIGN_UP4(co);
|
||
|
|
auto ciC4 = UP_DIV(ci, 4);
|
||
|
|
auto coC4 = UP_DIV(co, 4);
|
||
|
|
auto reorderWeight =
|
||
|
|
std::make_shared<VulkanBuffer>(extra->getMemoryPool(), false, alignedWeightSize * sizeof(float));
|
||
|
|
auto destWeight = (float*)reorderWeight->map();
|
||
|
|
::memset(destWeight, 0, alignedWeightSize * sizeof(float));
|
||
|
|
int kC = kw * kh;
|
||
|
|
for (int oz = 0; oz < co; ++oz) {
|
||
|
|
auto srcOz = weightPtr + oz * ci * kC;
|
||
|
|
auto destOz = destWeight + (oz / 4) * ciC4 * 16 + (oz % 4);
|
||
|
|
for (int sz = 0; sz < ci; ++sz) {
|
||
|
|
auto destSz = destOz + (sz / 4) * 16 + (sz % 4) * 4;
|
||
|
|
auto srcSz = srcOz + sz * kC;
|
||
|
|
for (int k = 0; k < kC; ++k) {
|
||
|
|
destSz[k * 16 * ciC4 * coC4] = srcSz[k];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
reorderWeight->unmap();
|
||
|
|
return reorderWeight;
|
||
|
|
}
|
||
|
|
|
||
|
|
class VulkanConvolutionSlideWindow : public VulkanBasicExecution {
|
||
|
|
public:
|
||
|
|
VulkanConvolutionSlideWindow(VulkanBackend* backend, const Convolution2DCommon* convOption, const float* weightPtr,
|
||
|
|
const float* biasPtr, int ci, int co)
|
||
|
|
: VulkanBasicExecution(backend) {
|
||
|
|
auto extra = static_cast<VulkanBackend*>(backend);
|
||
|
|
mCommon = convOption;
|
||
|
|
mSampler = backend->getCommonSampler();
|
||
|
|
int kw = convOption->kernelX();
|
||
|
|
int kh = convOption->kernelY();
|
||
|
|
mBias = std::make_shared<VulkanImage>(backend->getMemoryPool(), false, UP_DIV(co, 4), 1);
|
||
|
|
{
|
||
|
|
auto tempBias =
|
||
|
|
std::make_shared<VulkanBuffer>(backend->getMemoryPool(), false, sizeof(float) * ALIGN_UP4(co));
|
||
|
|
auto bias = tempBias->map();
|
||
|
|
::memset(bias, 0, sizeof(float) * ALIGN_UP4(co));
|
||
|
|
::memcpy(bias, biasPtr, sizeof(float) * co);
|
||
|
|
tempBias->unmap();
|
||
|
|
backend->copyBufferToImage(tempBias.get(), mBias.get());
|
||
|
|
}
|
||
|
|
|
||
|
|
mConvCons = std::make_shared<VulkanBuffer>(extra->getMemoryPool(), false,
|
||
|
|
sizeof(VulkanConvolutionCommon::ConvolutionParameter), nullptr,
|
||
|
|
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
|
||
|
|
|
||
|
|
{
|
||
|
|
auto reorderWeight =
|
||
|
|
VulkanConvolutionImpl::createBufferForSlideWindow(extra, convOption, weightPtr, ci, co);
|
||
|
|
mKernel = std::make_shared<VulkanImage>(extra->getMemoryPool(), false,
|
||
|
|
std::vector<int>{ALIGN_UP4(ci), UP_DIV(co, 4), kh * kw});
|
||
|
|
extra->copyBufferToImage(reorderWeight.get(), mKernel.get());
|
||
|
|
}
|
||
|
|
// Create Pipeline
|
||
|
|
std::vector<VkDescriptorType> convTypes{
|
||
|
|
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
|
||
|
|
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
|
||
|
|
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
|
||
|
|
auto common = mCommon;
|
||
|
|
if (common->relu()) {
|
||
|
|
mConvPipeline =
|
||
|
|
extra->getPipeline("glsl_convolution_RELU_comp",
|
||
|
|
/* glsl_convolution_RELU_comp, glsl_convolution_RELU_comp_len,*/ convTypes);
|
||
|
|
} else if (common->relu6()) {
|
||
|
|
mConvPipeline =
|
||
|
|
extra->getPipeline("glsl_convolution_RELU6_comp",
|
||
|
|
/* glsl_convolution_RELU6_comp, glsl_convolution_RELU6_comp_len,*/ convTypes);
|
||
|
|
} else {
|
||
|
|
mConvPipeline = extra->getPipeline("glsl_convolution_comp",
|
||
|
|
/* glsl_convolution_comp, glsl_convolution_comp_len,*/ convTypes);
|
||
|
|
}
|
||
|
|
mLocalX = 2;
|
||
|
|
mLocalY = 2;
|
||
|
|
mLocalZ = 16;
|
||
|
|
}
|
||
|
|
~VulkanConvolutionSlideWindow() {
|
||
|
|
}
|
||
|
|
virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||
|
|
const VulkanCommandPool::Buffer* cmdBuffer) override {
|
||
|
|
auto input = inputs[0];
|
||
|
|
auto output = outputs[0];
|
||
|
|
/*Set Const Parameters*/
|
||
|
|
int ocDiv4 = UP_DIV(output->channel(), 4);
|
||
|
|
int ow = output->width();
|
||
|
|
int oh = output->height();
|
||
|
|
auto convCons = reinterpret_cast<VulkanConvolutionCommon::ConvolutionParameter*>(mConvCons->map());
|
||
|
|
VulkanConvolutionCommon::writeParameter(convCons, mCommon, input, output);
|
||
|
|
mConvCons->unmap();
|
||
|
|
|
||
|
|
/*Write Command Buffer*/
|
||
|
|
if (true) {
|
||
|
|
mConvSet.reset(mConvPipeline->createSet());
|
||
|
|
mConvSet->writeImage((VkImageView)output->deviceId(), mSampler->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
|
||
|
|
mConvSet->writeImage((VkImageView)input->deviceId(), mSampler->get(),
|
||
|
|
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1);
|
||
|
|
mConvSet->writeImage(mKernel->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 2);
|
||
|
|
mConvSet->writeImage(mBias->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 3);
|
||
|
|
mConvSet->writeBuffer(mConvCons->buffer(), 4, mConvCons->size());
|
||
|
|
mConvPipeline->bind(cmdBuffer->get(), mConvSet->get());
|
||
|
|
vkCmdDispatch(cmdBuffer->get(), UP_DIV(ow, mLocalX), UP_DIV(oh, mLocalY),
|
||
|
|
UP_DIV(ocDiv4 * input->batch(), mLocalZ));
|
||
|
|
}
|
||
|
|
return NO_ERROR;
|
||
|
|
}
|
||
|
|
|
||
|
|
private:
|
||
|
|
std::shared_ptr<VulkanImage> mBias;
|
||
|
|
const Convolution2DCommon* mCommon;
|
||
|
|
std::shared_ptr<VulkanBuffer> mConvCons;
|
||
|
|
std::shared_ptr<VulkanImage> mKernel;
|
||
|
|
const VulkanPipeline* mConvPipeline;
|
||
|
|
|
||
|
|
std::shared_ptr<VulkanPipeline::DescriptorSet> mConvSet;
|
||
|
|
const VulkanSampler* mSampler;
|
||
|
|
|
||
|
|
int mLocalX = 0;
|
||
|
|
int mLocalY = 0;
|
||
|
|
int mLocalZ = 0;
|
||
|
|
};
|
||
|
|
|
||
|
|
class VulkanConvolutionIm2Col : public VulkanBasicExecution {
|
||
|
|
public:
|
||
|
|
VulkanConvolutionIm2Col(VulkanBackend* backend, const Convolution2DCommon* convOption, const float* weightPtr,
|
||
|
|
const float* biasPtr, int ci, int co, int kh, int kw);
|
||
|
|
~VulkanConvolutionIm2Col();
|
||
|
|
virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||
|
|
const VulkanCommandPool::Buffer* cmdBuffer) override;
|
||
|
|
|
||
|
|
public:
|
||
|
|
private:
|
||
|
|
std::shared_ptr<VulkanMatrixMultier> mMultiler;
|
||
|
|
|
||
|
|
const VulkanPipeline* mIm2Col;
|
||
|
|
std::shared_ptr<VulkanPipeline::DescriptorSet> mIm2ColSet;
|
||
|
|
|
||
|
|
const VulkanPipeline* mCol2Im;
|
||
|
|
std::shared_ptr<VulkanPipeline::DescriptorSet> mCol2ImSet;
|
||
|
|
const VulkanSampler* mSampler;
|
||
|
|
|
||
|
|
std::shared_ptr<VulkanImage> mBias;
|
||
|
|
const Convolution2DCommon* mConvCommonOption;
|
||
|
|
std::shared_ptr<VulkanBuffer> mConvParam;
|
||
|
|
};
|
||
|
|
|
||
|
|
/**
 * Prepares everything that does not depend on tensor shapes: reordered weights inside the matrix
 * multiplier, the im2col / col2im pipelines with their descriptor sets, the bias image and the
 * uniform parameter buffer.
 *
 * @param backend    backend providing pipelines, sampler, memory pool and copy helper.
 * @param convOption convolution description; the pointer is retained for onEncode.
 * @param weightPtr  source weights, forwarded to MNNReorderWeight.
 * @param biasPtr    co bias values.
 * @param ci         number of input channels.
 * @param co         number of output channels.
 * @param kh         kernel height.
 * @param kw         kernel width.
 */
VulkanConvolutionIm2Col::VulkanConvolutionIm2Col(VulkanBackend* backend, const Convolution2DCommon* convOption,
                                                 const float* weightPtr, const float* biasPtr, int ci, int co, int kh,
                                                 int kw)
    : VulkanBasicExecution(backend), mConvCommonOption(convOption) {
    // Host-side scratch for the reordered weights. std::make_unique needs C++14 and std::shared_ptr has
    // no array support here, so a unique_ptr<float[]> is used; the trailing () zero-initializes it.
    const int paddedWeightCount = ALIGN_UP4(ci) * kh * kw * ALIGN_UP4(co);
    std::unique_ptr<float[]> packedWeights(new float[paddedWeightCount]());
    VulkanConvolutionImpl::MNNReorderWeight<float>(packedWeights.get(), weightPtr, ci, co, kh, kw);
    mMultiler = std::make_shared<VulkanMatrixMultier>(backend, packedWeights.get(), ALIGN_UP4(ci) * kh * kw, co);

    // im2col stage: a dedicated pipeline is used for 1x1 kernels without padding.
    std::vector<VkDescriptorType> im2ColBindings{
        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
    const bool unpaddedPointwise = (kw == 1 && kh == 1 && convOption->padX() == 0 && convOption->padY() == 0);
    mIm2Col = backend->getPipeline(unpaddedPointwise ? "glsl_im2col1x1_comp" : "glsl_im2col_comp", im2ColBindings);
    mIm2ColSet.reset(mIm2Col->createSet());

    // col2im stage: the pipeline name embeds the post-treatment macro derived from convOption.
    std::vector<VkDescriptorType> col2ImBindings{
        VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
        VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
    auto postTreat = VulkanConvolutionCommon::getPostTreatMacro(convOption);
    mCol2Im        = backend->getPipeline("glsl_col2Im_" + postTreat + "comp", col2ImBindings);
    mCol2ImSet.reset(mCol2Im->createSet());

    mSampler = backend->getCommonSampler();

    // Bias: staged through a zero-padded buffer of ALIGN_UP4(co) floats, then copied into the image.
    mBias = std::make_shared<VulkanImage>(backend->getMemoryPool(), false, UP_DIV(co, 4), 1);
    const auto paddedBiasBytes = sizeof(float) * ALIGN_UP4(co);
    auto staging = std::make_shared<VulkanBuffer>(backend->getMemoryPool(), false, paddedBiasBytes);
    auto mapped  = staging->map();
    ::memset(mapped, 0, paddedBiasBytes);
    ::memcpy(mapped, biasPtr, sizeof(float) * co);
    staging->unmap();
    backend->copyBufferToImage(staging.get(), mBias.get());

    // Uniform buffer rewritten by onEncode.
    mConvParam = std::make_shared<VulkanBuffer>(backend->getMemoryPool(), false,
                                                sizeof(VulkanConvolutionCommon::ConvolutionParameter), nullptr,
                                                VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
}
|
||
|
|
|
||
|
|
// Nothing to release by hand: every owned resource is held through a std::shared_ptr member.
VulkanConvolutionIm2Col::~VulkanConvolutionIm2Col() = default;
|
||
|
|
|
||
|
|
template <typename T>
|
||
|
|
void VulkanConvolutionImpl::MNNReorderWeight(float* reorderedWeight, const T* srcWeight, int ci, int co, int kh, int kw,
|
||
|
|
int unit) {
|
||
|
|
const int alignedWeightSize = ALIGN_UP4(ci) * kh * kw * ALIGN_UP4(co);
|
||
|
|
const int unit2 = unit * unit;
|
||
|
|
int cur = 0;
|
||
|
|
int batch_4 = UP_DIV(co, unit);
|
||
|
|
for (int b = 0; b < co; ++b) {
|
||
|
|
int b_4 = b / unit;
|
||
|
|
T* dst_b = reorderedWeight + b_4 * (alignedWeightSize / batch_4);
|
||
|
|
int mx = b % unit;
|
||
|
|
for (int d = 0; d < ci; ++d) {
|
||
|
|
int my = d % unit;
|
||
|
|
int d_4 = d / unit;
|
||
|
|
T* dst_d = dst_b + d_4 * kw * kh * unit2;
|
||
|
|
for (int y = 0; y < kh; ++y) {
|
||
|
|
T* dst_y = dst_d + y * kw * unit2;
|
||
|
|
for (int x = 0; x < kw; ++x) {
|
||
|
|
T* dst_x = dst_y + x * unit2;
|
||
|
|
dst_x[unit * my + mx] = srcWeight[cur++];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Records the im2col -> matrix multiply -> col2im sequence for the given tensors.
 *
 * @param inputs    inputs[0] is the source tensor.
 * @param outputs   outputs[0] is the destination tensor.
 * @param cmdBuffer command buffer to record into.
 * @return NO_ERROR.
 */
ErrorCode VulkanConvolutionIm2Col::onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                            const VulkanCommandPool::Buffer* cmdBuffer) {
    auto input  = inputs[0];
    auto output = outputs[0];

    const int inputQuads  = UP_DIV(input->channel(), 4);
    const int outputQuads = UP_DIV(output->channel(), 4);

    // Refresh the uniform parameters shared by both pipelines.
    auto params = reinterpret_cast<VulkanConvolutionCommon::ConvolutionParameter*>(mConvParam->map());
    VulkanConvolutionCommon::writeParameter(params, mConvCommonOption, input, output);
    mConvParam->unmap();

    mMultiler->prepare(output->width() * output->height() * output->batch());

    // Stage 1: im2col writes the multiplier's source image from the input tensor.
    {
        auto columns = mMultiler->source();
        mIm2ColSet->writeImage(columns->view(), mSampler->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
        mIm2ColSet->writeImage(reinterpret_cast<VkImageView>(input->deviceId()), mSampler->get(),
                               VK_IMAGE_LAYOUT_GENERAL, 1);
        mIm2ColSet->writeBuffer(mConvParam->buffer(), 2, mConvParam->size());
        mIm2Col->bind(cmdBuffer->get(), mIm2ColSet->get());
        vkCmdDispatch(cmdBuffer->get(), UP_DIV(output->width(), gPretreatLocalSize[0]),
                      UP_DIV(output->height(), gPretreatLocalSize[1]), inputQuads * input->batch());
    }

    // Stage 2: matrix multiplication.
    mMultiler->compute(cmdBuffer);

    // Stage 3: col2im reads the multiplier's result and the bias, and writes the output tensor.
    {
        auto product = mMultiler->dest();
        mCol2ImSet->writeImage(product->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 0);
        mCol2ImSet->writeImage(reinterpret_cast<VkImageView>(output->deviceId()), mSampler->get(),
                               VK_IMAGE_LAYOUT_GENERAL, 1);
        mCol2ImSet->writeImage(mBias->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 2);
        mCol2ImSet->writeBuffer(mConvParam->buffer(), 3, mConvParam->size());
        mCol2Im->bind(cmdBuffer->get(), mCol2ImSet->get());
        // Move the product image from GENERAL to SHADER_READ_ONLY_OPTIMAL before it is sampled.
        cmdBuffer->barrierImage(product->get(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
        vkCmdDispatch(cmdBuffer->get(), UP_DIV(output->width(), gPretreatLocalSize[0]),
                      UP_DIV(output->height(), gPretreatLocalSize[1]), outputQuads * output->batch());
    }

    return NO_ERROR;
}
|
||
|
|
|
||
|
|
/**
 * Chooses and constructs the convolution execution.
 *
 * Decision order:
 *   1. ALIGN_UP4(ci) * kernelX * kernelY exceeds maxImageDimension1D  -> slide window.
 *   2. Winograd reports support and the output is at least 4x4        -> Winograd.
 *   3. UP_DIV(output width * height, 4) exceeds maxImageDimension1D   -> slide window.
 *   4. otherwise                                                      -> im2col.
 *
 * @param backend    Vulkan backend; its device limits drive the choice.
 * @param convOption convolution description.
 * @param input      input tensor (not consulted by the active selection logic).
 * @param output     output tensor.
 * @param weightPtr  source weights.
 * @param biasPtr    bias values.
 * @param ci         number of input channels.
 * @param co         number of output channels.
 * @return the constructed execution.
 */
std::shared_ptr<Execution> VulkanConvolutionImpl::create(VulkanBackend* backend, const Convolution2DCommon* convOption,
                                                         const Tensor* input, const Tensor* output,
                                                         const float* weightPtr, const float* biasPtr, int ci, int co) {
    const auto imageLimit = backend->proty().limits.maxImageDimension1D;
    auto makeSlideWindow  = [&]() {
        return std::make_shared<VulkanConvolutionSlideWindow>(backend, convOption, weightPtr, biasPtr, ci, co);
    };

    const bool kernelExceedsLimit = ALIGN_UP4(ci) * convOption->kernelX() * convOption->kernelY() > imageLimit;
    if (kernelExceedsLimit) {
        return makeSlideWindow();
    }

    const bool useWinograd =
        VulkanConvolutionWinograd::support(convOption) && output->width() >= 4 && output->height() >= 4;
    if (useWinograd) {
        return std::make_shared<VulkanConvolutionWinograd>(backend, convOption, weightPtr, biasPtr, ci, co);
    }

    const bool outputExceedsLimit = UP_DIV(output->width() * output->height(), 4) > imageLimit;
    if (outputExceedsLimit) {
        return makeSlideWindow();
    }

    // Disabled heuristic, kept for reference:
    //    if (backend->gpuType() == VulkanBackend::MALI
    //        && (input->width() < gPretreatLocalSize[0] || input->height() < gPretreatLocalSize[1])
    //        //For mobilenet, use im2col
    //        && (input->channel() < 256 || output->channel() < 256)
    //        ) {
    //        return std::shared_ptr<Execution>(
    //                                          new VulkanConvolutionSlideWindow(backend, convOption, weightPtr,
    //                                                                           biasPtr, ci, co));
    //    }

    return std::make_shared<VulkanConvolutionIm2Col>(backend, convOption, weightPtr, biasPtr, ci, co,
                                                     convOption->kernelY(), convOption->kernelX());
}
|
||
|
|
|
||
|
|
} // namespace MNN
|