mirror of https://github.com/alibaba/MNN.git
Merge pull request #3610 from alibaba/feautre/bugfix
Vulkan:Bugfix: Fix bug for radixsort copy sortnumber error
commit 4f39ef5f3f
@@ -105,6 +105,7 @@ ErrorCode VulkanBackend::onResizeEnd() {
     if (!mDirect) {
         mCmdBuffer->end();
     }
+    mHostBuffer.reset();
     return NO_ERROR;
 }
 class VulkanMemRelease : public Backend::MemObj {

@@ -288,27 +289,36 @@ static Tensor::DimensionType _convert(MNN_DATA_FORMAT format) {
     }
     return Tensor::CAFFE;
 }
-void VulkanBackend::copyToGPUBuffer(const void* src, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) const {
-    _requireHostBuffer(size);
-    ::memcpy(mHostBuffer->map(), src, size);
-    mHostBuffer->unmap();
+std::shared_ptr<VulkanBuffer> VulkanBackend::createHostBuffer(size_t size) const {
+    std::shared_ptr<VulkanBuffer> res;
+    res.reset(new VulkanBuffer(*mRuntime->mMemoryPool, false, size, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+    return res;
+}
+
+void VulkanBackend::copyGPUToGPUBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size, VkDeviceSize srcOffset, VkDeviceSize dstOffset) const {
     auto cmdbuffer = mCmdBufferForCopy;
     cmdbuffer->begin(0);
     VkBufferCopy bufferCopy;
     bufferCopy.size = size;
-    bufferCopy.dstOffset = offset;
-    bufferCopy.srcOffset = 0;
-    vkCmdCopyBuffer(cmdbuffer->get(), mHostBuffer->buffer(), buffer,
+    bufferCopy.dstOffset = dstOffset;
+    bufferCopy.srcOffset = srcOffset;
+    vkCmdCopyBuffer(cmdbuffer->get(), srcBuffer, dstBuffer,
                     1, &bufferCopy);
     cmdbuffer->end();
     pushCommand(cmdbuffer->get());
     _finish();
     mHostBuffer.reset();
 }
+
+void VulkanBackend::copyToGPUBuffer(const void* src, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) const {
+    _requireHostBuffer(size);
+    ::memcpy(mHostBuffer->map(), src, size);
+    mHostBuffer->unmap();
+    copyGPUToGPUBuffer(mHostBuffer->buffer(), buffer, size, 0, offset);
+}
 void VulkanBackend::_requireHostBuffer(size_t size) const {
     _finish();
     if (nullptr == mHostBuffer || mHostBuffer->size() < size) {
-        mHostBuffer.reset(new VulkanBuffer(*mRuntime->mMemoryPool, false, size, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+        mHostBuffer = createHostBuffer(size);
    }
 }

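Taken together, this hunk turns the one-shot upload into the classic staging pattern: createHostBuffer() allocates the HOST_VISIBLE scratch buffer, copyGPUToGPUBuffer() records and flushes a buffer-to-buffer transfer with both offsets as parameters, and copyToGPUBuffer() simply composes the two. A minimal sketch of that flow, assuming only the MNN methods shown in this diff (the helper name uploadViaStaging and the include path are illustrative):

// Sketch: host -> staging -> device, as composed by the refactored methods.
#include <cstring>
#include "VulkanBackend.hpp" // assumed path within the MNN Vulkan backend

static void uploadViaStaging(MNN::VulkanBackend* bn, const void* src,
                             VkBuffer dst, VkDeviceSize size, VkDeviceSize dstOffset) {
    auto staging = bn->createHostBuffer(size);  // HOST_VISIBLE scratch buffer
    ::memcpy(staging->map(), src, size);        // fill on the CPU
    staging->unmap();
    // Record a transfer command and flush it; the same entry point now also
    // serves pure device-to-device copies, since src/dst offsets are explicit.
    bn->copyGPUToGPUBuffer(staging->buffer(), dst, size, /*srcOffset*/ 0, dstOffset);
}
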
@@ -92,7 +92,9 @@ public:
     VULKAN_TENSOR getBuffer(const Tensor* tensor) const;
     std::shared_ptr<VulkanBuffer> allocUniform(const void* src = nullptr, int size = 0);
     void recycleUniform(std::shared_ptr<VulkanBuffer> buffer);
+    void copyGPUToGPUBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size, VkDeviceSize srcOffset, VkDeviceSize dstOffset) const;
     void copyToGPUBuffer(const void* src, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) const;
+    std::shared_ptr<VulkanBuffer> createHostBuffer(size_t size) const;

     const VulkanDevice& device() const;
 #ifdef ENABLE_VULKAN_TIME_PROFILE

@@ -93,14 +93,15 @@ public:
             return;
         }
         std::shared_ptr<Tensor> sourceWeight(Tensor::createDevice<float>({ci * co * kernelSize}));
-        res = vkBn->onAcquireBuffer(sourceWeight.get(), Backend::STATIC);
-        if (!res) {
+        auto sourceBuffer = vkBn->createHostBuffer(ci * co * kernelSize * sizeof(float));
+        if (nullptr == sourceBuffer.get()) {
             return;
         }
-        {
-            auto vkTensor = extra->getBuffer(sourceWeight.get());
-            extra->copyToGPUBuffer(weightPtr, std::get<0>(vkTensor), sourceWeight->size(), std::get<2>(vkTensor));
-        }
+        ::memcpy(sourceBuffer->map(), weightPtr, ci * co * kernelSize * sizeof(float));
+        sourceBuffer->unmap();
+        sourceWeight->buffer().device = (uint64_t)(sourceBuffer.get());
+        TensorUtils::getDescribe(sourceWeight.get())->extra.offset = 0;
+
         std::shared_ptr<VulkanCommandPool::Buffer> prearrangeCmd( vkBn->getPool().allocBuffer());
         for (auto& reg : des->regions) {
             reg.origin = sourceWeight.get();

@@ -70,6 +70,8 @@ public:
             }
         }
         types.resize(maxIndex+1);
+        std::vector<std::tuple<int, void*, size_t>> constStoragePtrs;
+        std::vector<std::tuple<int, void*, size_t>> constUniformPtrs;
         for (int i=0; i<extra->attr()->size(); ++i) {
             auto attr = extra->attr()->GetAs<Attribute>(i);
             if (attr->key()->str() == "input") {

@@ -89,13 +91,6 @@ public:
                 continue;
             }
             if (attr->key()->str() == "const") {
-                auto usageBit = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-                if (attr->b()) {
-                    types[attr->i()] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
-                } else {
-                    usageBit = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-                    types[attr->i()] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-                }
                 auto b = attr->tensor();
                 void* result = nullptr;
                 size_t bufferSize = 0;

@@ -112,14 +107,59 @@ public:
                     MNN_ASSERT(false);
                     break;
                 }
-                std::shared_ptr<VulkanBuffer> vkBuffer(new VulkanBuffer(vkBn->getMemoryPool(), false, bufferSize, nullptr, usageBit, VK_SHARING_MODE_EXCLUSIVE, 0));
-                vkBn->copyToGPUBuffer(result, vkBuffer->buffer(), bufferSize, 0);
-                mConstIndides.emplace_back(std::make_pair(attr->i(), vkBuffer));
+                if (attr->b()) {
+                    types[attr->i()] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+                    constUniformPtrs.emplace_back(std::make_tuple(attr->i(), result, bufferSize));
+                } else {
+                    types[attr->i()] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+                    constStoragePtrs.emplace_back(std::make_tuple(attr->i(), result, bufferSize));
+                }
                 continue;
             }
         }
+        auto alignSize = vkBn->device().proty().limits.minMemoryMapAlignment;
+        size_t offset = 0;
+        std::shared_ptr<VulkanCommandPool::Buffer> cmdbuffer( vkBn->getPool().allocBuffer());
+        cmdbuffer->begin(0);
+        auto merge = [&](const std::vector<std::tuple<int, void*, size_t>>& constPtrs, VkDescriptorType type) {
+            if (constPtrs.empty()) {
+                return std::make_tuple(std::vector<std::tuple<int, size_t, size_t>>{}, std::shared_ptr<VulkanBuffer>(nullptr), std::shared_ptr<VulkanBuffer>(nullptr));
+            }
+            std::vector<std::tuple<int, size_t, size_t>> mConstOffset;
+            for (auto& constAttr : constPtrs) {
+                auto size = UP_DIV(std::get<2>(constAttr), alignSize) * alignSize;
+                mConstOffset.emplace_back(std::make_tuple(std::get<0>(constAttr), size, offset));
+                offset += size;
+            }
+            std::shared_ptr<VulkanBuffer> hostBuffer(new VulkanBuffer(vkBn->getMemoryPool(), false, offset, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+            auto ptr = (uint8_t*)hostBuffer->map();
+            for (int i=0; i<constPtrs.size(); ++i) {
+                ::memcpy(ptr + std::get<2>(mConstOffset[i]), std::get<1>(constPtrs[i]), std::get<2>(constPtrs[i]));
+            }
+            hostBuffer->unmap();
+            std::shared_ptr<VulkanBuffer> vkBuffer(new VulkanBuffer(vkBn->getMemoryPool(), false, offset, nullptr, type, VK_SHARING_MODE_EXCLUSIVE, 0));
+            VkBufferCopy bufferCopy;
+            bufferCopy.size = offset;
+            bufferCopy.dstOffset = 0;
+            bufferCopy.srcOffset = 0;
+            vkCmdCopyBuffer(cmdbuffer->get(), hostBuffer->buffer(), vkBuffer->buffer(),
+                            1, &bufferCopy);
+            return std::make_tuple(mConstOffset, vkBuffer, hostBuffer);
+        };
+        mConstStorageOffset.clear();
+        mConstUniformOffset.clear();
+        auto uniforms = merge(constUniformPtrs, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
+        mConstUniformOffset = std::get<0>(uniforms);
+        mConstUniformBuffer = std::get<1>(uniforms);
+        auto storages = merge(constStoragePtrs, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+        mConstStorageOffset = std::get<0>(storages);
+        mConstStorageBuffer = std::get<1>(storages);
+        cmdbuffer->end();
+        auto fence = vkBn->getPool().submit(cmdbuffer->get());
+
         mPipeline = factory->createComputePipeline(data, dataSize, types, std::vector<uint32_t>{});
         mDescriptorSet = mPipeline->createSet();
+        fence->wait();
     }
     virtual ~VulkanFuse() {
         // Remove set firstly before destroy pipeline

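The merge lambda above packs all small constant buffers of one descriptor type into a single VkBuffer, rounding each slice up to minMemoryMapAlignment so every constant starts on a mappable boundary, then uploads the whole pack with one vkCmdCopyBuffer. The offset arithmetic in isolation, as a sketch: alignUp and packOffsets are illustrative names, and alignUp mirrors MNN's UP_DIV round-up-divide macro.

#include <cstddef>
#include <vector>

static size_t alignUp(size_t size, size_t alignment) {
    return (size + alignment - 1) / alignment * alignment; // UP_DIV(size, a) * a
}

// Given the raw const-buffer sizes, compute each one's offset inside the
// merged buffer and the total allocation size.
static std::vector<size_t> packOffsets(const std::vector<size_t>& sizes,
                                       size_t alignSize, size_t& total) {
    std::vector<size_t> offsets;
    total = 0;
    for (auto s : sizes) {
        offsets.push_back(total);
        total += alignUp(s, alignSize);
    }
    return offsets;
}

Note that the work is also submitted asynchronously: the fence returned by submit() is only waited on after the compute pipeline has been created, so the constant upload overlaps pipeline compilation.
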
@@ -134,8 +174,11 @@ public:
         for (int i=0; i<outputs.size(); ++i) {
             mDescriptorSet->writeBuffer(vkBn->getBuffer(outputs[i]), mOutputBinding[i]);
         }
-        for (auto& iter : mConstIndides) {
-            mDescriptorSet->writeBuffer(iter.second->buffer(), iter.first, iter.second->size());
+        for (auto& iter : mConstStorageOffset) {
+            mDescriptorSet->writeBuffer(mConstStorageBuffer->buffer(), std::get<0>(iter), std::get<1>(iter), std::get<2>(iter));
+        }
+        for (auto& iter : mConstUniformOffset) {
+            mDescriptorSet->writeBuffer(mConstUniformBuffer->buffer(), std::get<0>(iter), std::get<1>(iter), std::get<2>(iter));
         }
         if (mNeedAutoTuning) {
             auto localSize = vkBn->autoTunePipeline(mPipeline.get(), mDescriptorSet, mGlobalSize);

@@ -153,7 +196,11 @@ private:
     std::vector<int> mGlobalSize;
     std::vector<int> mInputBinding;
     std::vector<int> mOutputBinding;
-    std::vector<std::pair<int, std::shared_ptr<VulkanBuffer>>> mConstIndides;
+    std::shared_ptr<VulkanBuffer> mConstStorageBuffer;
+    std::shared_ptr<VulkanBuffer> mConstUniformBuffer;
+    // Index, offset, size
+    std::vector<std::tuple<int, size_t, size_t>> mConstStorageOffset;
+    std::vector<std::tuple<int, size_t, size_t>> mConstUniformOffset;
     SharedPtr<VulkanPipeline> mPipeline;
     SharedPtr<VulkanLayout::DescriptorSet> mDescriptorSet;
     bool mNeedAutoTuning = false;

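With the constants merged, the descriptor writes above bind sub-ranges of the two shared buffers instead of one VkBuffer per constant; the offset vectors carry the triples recorded by merge. At the raw Vulkan level, each such writeBuffer call boils down to a VkDescriptorBufferInfo with a non-zero offset. A sketch assuming nothing beyond the core API (writeSubRange is an illustrative helper, not MNN code; MNN wraps this inside VulkanLayout::DescriptorSet::writeBuffer):

#include <vulkan/vulkan.h>

// Bind one constant's slice of the merged buffer to a descriptor binding.
static void writeSubRange(VkDevice device, VkDescriptorSet set, uint32_t binding,
                          VkDescriptorType type, VkBuffer merged,
                          VkDeviceSize offset, VkDeviceSize range) {
    VkDescriptorBufferInfo info{merged, offset, range};
    VkWriteDescriptorSet write{};
    write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
    write.dstSet = set;
    write.dstBinding = binding;
    write.descriptorCount = 1;
    write.descriptorType = type; // VK_DESCRIPTOR_TYPE_{STORAGE,UNIFORM}_BUFFER
    write.pBufferInfo = &info;
    vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
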
@@ -82,7 +82,7 @@ void VulkanRaster::onEncodeFast(const Tensor* input, const Tensor* output, const
     for (int i=0; i< des->regions.size(); ++i) {
         auto& slice = des->regions[i];
         Tensor::InsideDescribe::Region newRegion;
-        OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
+        OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4, true);
         // TODO: Find better way
         newRegion.dst.offset /= 4;
         newRegion.src.offset /= 4;

@@ -92,6 +92,8 @@ void VulkanRaster::onEncodeFast(const Tensor* input, const Tensor* output, const
         auto group = UP_DIV(total, 256);
         std::shared_ptr<VulkanLayout::DescriptorSet> describe(blitPipeline->createSet());
         std::shared_ptr<VulkanBuffer> uniform = vkBn->allocUniform();
         ::memcpy(uniform->map(), &info, sizeof(SamplerInfo));
         uniform->unmap();
+        auto srcTensor = vkBn->getTensorBuffer(slice.origin);
+        auto srcTensorSize = vkBn->getTensorSize(slice.origin);
         describe->writeBuffer(dstTensor.first->buffer(), 0, dstTensorSize, dstTensor.second);

@@ -127,7 +129,7 @@ ErrorCode VulkanRaster::onEncode(const std::vector<Tensor *> &____inputs, const
                 fast = false;
                 break;
             }
-            if (!OpCommonUtils::canBlitFast(slice, output)) {
+            if (!OpCommonUtils::canBlitFast(slice, output, 4, true)) {
                 fast = false;
                 break;
             }

@@ -148,7 +148,7 @@ ErrorCode VulkanRasterSort::onEncode(const std::vector<Tensor *> &inputs, const
     region2.dstOffset = std::get<2>(output);
     region2.srcOffset = pointOffsetSum.second + (pointOffsetBytes / sizeof(uint32_t) - 1) * sizeof(uint32_t);

     vkCmdCopyBuffer(cmdBuffer->get(), ((VulkanBuffer*)pointOffsetSum.first)->buffer(), std::get<0>(output), 1, &region);
-    vkCmdCopyBuffer(cmdBuffer->get(), ((VulkanBuffer*)pointOffsetSum.first)->buffer(), std::get<0>(output), 1, &region);
+    vkCmdCopyBuffer(cmdBuffer->get(), ((VulkanBuffer*)pointOffsetSum.first)->buffer(), std::get<0>(output), 1, &region2);

     cmdBuffer->barrierSource(sortNumber->buffer(), 0, sizeof(uint32_t));
 }

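This one-line change appears to be the fix named in the commit title: the second vkCmdCopyBuffer was passed &region instead of &region2, so the trailing uint32_t of the prefix-sum buffer, which holds the total sort number, was never copied to its output slot. A self-contained sketch of the corrected sequence; the offsets here are illustrative, the real ones come from pointOffsetSum and output above:

#include <vulkan/vulkan.h>

// Copy the offset table plus its trailing count as two independent regions.
static void copyOffsetsAndCount(VkCommandBuffer cmd, VkBuffer src, VkBuffer dst,
                                VkDeviceSize tableBytes, VkDeviceSize dstBase) {
    VkBufferCopy region{};            // the offset table itself
    region.srcOffset = 0;
    region.dstOffset = dstBase;
    region.size = tableBytes;

    VkBufferCopy region2{};           // last element = total sort number
    region2.srcOffset = tableBytes - sizeof(uint32_t);
    region2.dstOffset = dstBase + tableBytes; // illustrative placement
    region2.size = sizeof(uint32_t);

    vkCmdCopyBuffer(cmd, src, dst, 1, &region);
    vkCmdCopyBuffer(cmd, src, dst, 1, &region2); // the fix: was &region
}
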
@@ -24,8 +24,7 @@ VulkanCommandPool::~VulkanCommandPool() {
     mDevice.destroyCommandPool(mPool);
     // FUNC_PRINT(1);
 }
-
-void VulkanCommandPool::submitAndWait(VkCommandBuffer buffer) const {
+std::shared_ptr<VulkanFence> VulkanCommandPool::submit(VkCommandBuffer buffer) const {
     auto b = buffer;
     auto fence = std::make_shared<VulkanFence>(mDevice);
     VkSubmitInfo submit_info = {/* .sType = */ VK_STRUCTURE_TYPE_SUBMIT_INFO,

@@ -40,6 +39,11 @@ void VulkanCommandPool::submitAndWait(VkCommandBuffer buffer) const {
     auto fenceReal = fence->get();
     auto queue = mDevice.acquireDefaultDevQueue();
     CALL_VK(vkQueueSubmit(queue, 1, &submit_info, fenceReal));
+    return fence;
 }

+void VulkanCommandPool::submitAndWait(VkCommandBuffer buffer) const {
+    auto fence = submit(buffer);
+    fence->wait();
+}

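Splitting submitAndWait() into submit() plus a deferred wait hands the VulkanFence back to the caller, and the VulkanFuse constructor above uses exactly this to overlap the constant upload with pipeline creation. A usage sketch, assuming the interfaces declared in this diff (the include path follows the component paths shown below):

#include "backend/vulkan/component/VulkanCommandPool.hpp" // assumed path

void exampleAsyncSubmit(const MNN::VulkanCommandPool& pool, VkCommandBuffer cmd) {
    auto fence = pool.submit(cmd);  // queue the work without blocking
    // ... CPU-side setup (e.g. building pipelines) can proceed here ...
    fence->wait();                  // synchronize only when the result is needed
}
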
@@ -11,6 +11,7 @@

 #include "core/NonCopyable.hpp"
 #include "backend/vulkan/component/VulkanDevice.hpp"
+#include "backend/vulkan/component/VulkanFence.hpp"
 #include "backend/vulkan/vulkan/vulkan_wrapper.h"
 namespace MNN {
 class VulkanImage;

@@ -49,6 +50,7 @@ public:
     }

     void submitAndWait(VkCommandBuffer buffer) const;
+    std::shared_ptr<VulkanFence> submit(VkCommandBuffer buffer) const;

 private:
     const VulkanDevice& mDevice;

@@ -11,18 +11,23 @@
 //#define MNN_VULKAN_PRINT_EXT
 namespace MNN {
 static uint32_t _getLocalMemorySize(const VkPhysicalDeviceMemoryProperties& memProty) {
+#ifdef __APPLE__
+    // For mac vulkan driver can not get correct local size
+    return 16384;
+#else
     int32_t localMemorySize = 0;
-    for (int i=0; i<VK_MAX_MEMORY_TYPES; ++i) {
+    for (int i=0; i<memProty.memoryHeapCount; ++i) {
         auto& heap = memProty.memoryHeaps[i];
         if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
             auto size = (int32_t)heap.size;
             if (size > 0) {
                 localMemorySize = size;
             }
             break;
         }
     }
     return localMemorySize;
+#endif
 }
 VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
     : mOwner(true),

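The loop-bound fix matters because memoryHeaps is a fixed array sized by memoryHeapCount (at most VK_MAX_MEMORY_HEAPS = 16 entries), while the old loop ran to VK_MAX_MEMORY_TYPES (32): the wrong constant, and one that can index past the valid heap entries. The corrected query, reduced to a standalone sketch (deviceLocalHeapSize is an illustrative name):

#include <vulkan/vulkan.h>

// Return the size of the first DEVICE_LOCAL heap, or 0 if none reports one.
static int32_t deviceLocalHeapSize(const VkPhysicalDeviceMemoryProperties& props) {
    int32_t localMemorySize = 0;
    for (uint32_t i = 0; i < props.memoryHeapCount; ++i) {
        const auto& heap = props.memoryHeaps[i];
        if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
            auto size = (int32_t)heap.size; // truncation mirrors the MNN code
            if (size > 0) {
                localMemorySize = size;
            }
            break;
        }
    }
    return localMemorySize;
}
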
@@ -118,6 +123,7 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
     }
     vkGetPhysicalDeviceProperties(mPhysicalDevice, &mDeviceProty);
     vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mMemoryProty);
+    mLocalMemorySize = _getLocalMemorySize(mMemoryProty);
     getDeviceQueue(mQueueFamilyIndex, 0, mQueue);

     // query subgroupSize

@@ -132,7 +138,6 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
         vkGetPhysicalDeviceProperties2(mPhysicalDevice, &deviceProperties2);
         mSubgroupSize = subgroupProperties.subgroupSize;
     }
-    mLocalMemorySize = _getLocalMemorySize(mMemoryProty);
 #ifdef MNN_VULKAN_PRINT_EXT
     uint32_t pPropertyCount;
     vkEnumerateInstanceExtensionProperties(nullptr, &pPropertyCount, nullptr);

@@ -146,6 +151,7 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
     FUNC_PRINT(mDeviceProty.limits.maxComputeWorkGroupCount[0]);
     FUNC_PRINT(mDeviceProty.limits.maxComputeWorkGroupInvocations);
     FUNC_PRINT(mDeviceProty.limits.maxComputeSharedMemorySize);
+    FUNC_PRINT(mLocalMemorySize);
 #endif
 }

@@ -350,7 +350,11 @@ void Tensor::print() const {

     // convert to host if needed
     auto printee = this;
-    bool device = this->buffer().host == NULL && this->buffer().device != 0;
+    auto bnType = MNN_FORWARD_CPU;
+    if (nullptr != mDescribe->getBackend()) {
+        bnType = mDescribe->getBackend()->type();
+    }
+    bool device = bnType != MNN_FORWARD_CPU;
     if (device) {
         printee = this->createHostTensorFromDevice(this, true);
     }

@@ -105,6 +105,7 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output, bool forward) {
         reg.dst.stride[1] = multipler;
         reg.dst.stride[2] = 1;
+        reg.origin = input;
         return;
     }
     int32_t inputShape[MNN_MAX_TENSOR_DIM];
     int32_t outputShape[MNN_MAX_TENSOR_DIM];