Merge pull request #3610 from alibaba/feautre/bugfix
android / android_build (push) Has been cancelled Details
ios / ios_build (push) Has been cancelled Details
linux / linux_buil_test (push) Has been cancelled Details
macos / macos_buil_test (push) Has been cancelled Details
windows / windows_build_test (push) Has been cancelled Details
stale / stale (push) Has been cancelled Details

Vulkan: Bugfix: fix incorrect copy of the sort number in radix sort
This commit is contained in:
jxt1234 2025-06-09 10:20:00 +08:00 committed by GitHub
commit 4f39ef5f3f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 116 additions and 37 deletions

View File

@ -105,6 +105,7 @@ ErrorCode VulkanBackend::onResizeEnd() {
if (!mDirect) {
mCmdBuffer->end();
}
mHostBuffer.reset();
return NO_ERROR;
}
class VulkanMemRelease : public Backend::MemObj {
@ -288,27 +289,36 @@ static Tensor::DimensionType _convert(MNN_DATA_FORMAT format) {
}
return Tensor::CAFFE;
}
void VulkanBackend::copyToGPUBuffer(const void* src, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) const {
_requireHostBuffer(size);
::memcpy(mHostBuffer->map(), src, size);
mHostBuffer->unmap();
std::shared_ptr<VulkanBuffer> VulkanBackend::createHostBuffer(size_t size) const {
std::shared_ptr<VulkanBuffer> res;
res.reset(new VulkanBuffer(*mRuntime->mMemoryPool, false, size, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
return res;
}
void VulkanBackend::copyGPUToGPUBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size, VkDeviceSize srcOffset, VkDeviceSize dstOffset) const {
auto cmdbuffer = mCmdBufferForCopy;
cmdbuffer->begin(0);
VkBufferCopy bufferCopy;
bufferCopy.size = size;
bufferCopy.dstOffset = offset;
bufferCopy.srcOffset = 0;
vkCmdCopyBuffer(cmdbuffer->get(), mHostBuffer->buffer(), buffer,
bufferCopy.dstOffset = dstOffset;
bufferCopy.srcOffset = srcOffset;
vkCmdCopyBuffer(cmdbuffer->get(), srcBuffer, dstBuffer,
1, &bufferCopy);
cmdbuffer->end();
pushCommand(cmdbuffer->get());
_finish();
mHostBuffer.reset();
}
void VulkanBackend::copyToGPUBuffer(const void* src, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) const {
_requireHostBuffer(size);
::memcpy(mHostBuffer->map(), src, size);
mHostBuffer->unmap();
copyGPUToGPUBuffer(mHostBuffer->buffer(), buffer, size, 0, offset);
}
void VulkanBackend::_requireHostBuffer(size_t size) const {
_finish();
if (nullptr == mHostBuffer || mHostBuffer->size() < size) {
mHostBuffer.reset(new VulkanBuffer(*mRuntime->mMemoryPool, false, size, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
mHostBuffer = createHostBuffer(size);
}
}

View File

@ -92,7 +92,9 @@ public:
VULKAN_TENSOR getBuffer(const Tensor* tensor) const;
std::shared_ptr<VulkanBuffer> allocUniform(const void* src = nullptr, int size = 0);
void recycleUniform(std::shared_ptr<VulkanBuffer> buffer);
void copyGPUToGPUBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size, VkDeviceSize srcOffset, VkDeviceSize dstOffset) const;
void copyToGPUBuffer(const void* src, VkBuffer buffer, VkDeviceSize size, VkDeviceSize offset) const;
std::shared_ptr<VulkanBuffer> createHostBuffer(size_t size) const;
const VulkanDevice& device() const;
#ifdef ENABLE_VULKAN_TIME_PROFILE

View File

@ -93,14 +93,15 @@ public:
return;
}
std::shared_ptr<Tensor> sourceWeight(Tensor::createDevice<float>({ci * co * kernelSize}));
res = vkBn->onAcquireBuffer(sourceWeight.get(), Backend::STATIC);
if (!res) {
auto sourceBuffer = vkBn->createHostBuffer(ci * co * kernelSize * sizeof(float));
if (nullptr == sourceBuffer.get()) {
return;
}
{
auto vkTensor = extra->getBuffer(sourceWeight.get());
extra->copyToGPUBuffer(weightPtr, std::get<0>(vkTensor), sourceWeight->size(), std::get<2>(vkTensor));
}
::memcpy(sourceBuffer->map(), weightPtr, ci * co * kernelSize * sizeof(float));
sourceBuffer->unmap();
sourceWeight->buffer().device = (uint64_t)(sourceBuffer.get());
TensorUtils::getDescribe(sourceWeight.get())->extra.offset = 0;
std::shared_ptr<VulkanCommandPool::Buffer> prearrangeCmd( vkBn->getPool().allocBuffer());
for (auto& reg : des->regions) {
reg.origin = sourceWeight.get();

View File

@ -70,6 +70,8 @@ public:
}
}
types.resize(maxIndex+1);
std::vector<std::tuple<int, void*, size_t>> constStoragePtrs;
std::vector<std::tuple<int, void*, size_t>> constUniformPtrs;
for (int i=0; i<extra->attr()->size(); ++i) {
auto attr = extra->attr()->GetAs<Attribute>(i);
if (attr->key()->str() == "input") {
@ -89,13 +91,6 @@ public:
continue;
}
if (attr->key()->str() == "const") {
auto usageBit = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
if (attr->b()) {
types[attr->i()] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
} else {
usageBit = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
types[attr->i()] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
}
auto b = attr->tensor();
void* result = nullptr;
size_t bufferSize = 0;
@ -112,14 +107,59 @@ public:
MNN_ASSERT(false);
break;
}
std::shared_ptr<VulkanBuffer> vkBuffer(new VulkanBuffer(vkBn->getMemoryPool(), false, bufferSize, nullptr, usageBit, VK_SHARING_MODE_EXCLUSIVE, 0));
vkBn->copyToGPUBuffer(result, vkBuffer->buffer(), bufferSize, 0);
mConstIndides.emplace_back(std::make_pair(attr->i(), vkBuffer));
if (attr->b()) {
types[attr->i()] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
constUniformPtrs.emplace_back(std::make_tuple(attr->i(), result, bufferSize));
} else {
types[attr->i()] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
constStoragePtrs.emplace_back(std::make_tuple(attr->i(), result, bufferSize));
}
continue;
}
}
auto alignSize = vkBn->device().proty().limits.minMemoryMapAlignment;
size_t offset = 0;
std::shared_ptr<VulkanCommandPool::Buffer> cmdbuffer( vkBn->getPool().allocBuffer());
cmdbuffer->begin(0);
auto merge = [&](const std::vector<std::tuple<int, void*, size_t>>& constPtrs, VkDescriptorType type) {
if (constPtrs.empty()) {
return std::make_tuple(std::vector<std::tuple<int, size_t, size_t>>{}, std::shared_ptr<VulkanBuffer>(nullptr), std::shared_ptr<VulkanBuffer>(nullptr));
}
std::vector<std::tuple<int, size_t, size_t>> mConstOffset;
for (auto& constAttr : constPtrs) {
auto size = UP_DIV(std::get<2>(constAttr), alignSize) * alignSize;
mConstOffset.emplace_back(std::make_tuple(std::get<0>(constAttr), size, offset));
offset += size;
}
std::shared_ptr<VulkanBuffer> hostBuffer(new VulkanBuffer(vkBn->getMemoryPool(), false, offset, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
auto ptr = (uint8_t*)hostBuffer->map();
for (int i=0; i<constPtrs.size(); ++i) {
::memcpy(ptr + std::get<2>(mConstOffset[i]), std::get<1>(constPtrs[i]), std::get<2>(constPtrs[i]));
}
hostBuffer->unmap();
std::shared_ptr<VulkanBuffer> vkBuffer(new VulkanBuffer(vkBn->getMemoryPool(), false, offset, nullptr, type, VK_SHARING_MODE_EXCLUSIVE, 0));
VkBufferCopy bufferCopy;
bufferCopy.size = offset;
bufferCopy.dstOffset = 0;
bufferCopy.srcOffset = 0;
vkCmdCopyBuffer(cmdbuffer->get(), hostBuffer->buffer(), vkBuffer->buffer(),
1, &bufferCopy);
return std::make_tuple(mConstOffset, vkBuffer, hostBuffer);
};
mConstStorageOffset.clear();
mConstUniformOffset.clear();
auto uniforms = merge(constUniformPtrs, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
mConstUniformOffset = std::get<0>(uniforms);
mConstUniformBuffer = std::get<1>(uniforms);
auto storages = merge(constStoragePtrs, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
mConstStorageOffset = std::get<0>(storages);
mConstStorageBuffer = std::get<1>(storages);
cmdbuffer->end();
auto fence = vkBn->getPool().submit(cmdbuffer->get());
mPipeline = factory->createComputePipeline(data, dataSize, types, std::vector<uint32_t>{});
mDescriptorSet = mPipeline->createSet();
fence->wait();
}
virtual ~VulkanFuse() {
// Remove set firstly before destroy pipeline
@ -134,8 +174,11 @@ public:
for (int i=0; i<outputs.size(); ++i) {
mDescriptorSet->writeBuffer(vkBn->getBuffer(outputs[i]), mOutputBinding[i]);
}
for (auto& iter : mConstIndides) {
mDescriptorSet->writeBuffer(iter.second->buffer(), iter.first, iter.second->size());
for (auto& iter : mConstStorageOffset) {
mDescriptorSet->writeBuffer(mConstStorageBuffer->buffer(), std::get<0>(iter), std::get<1>(iter), std::get<2>(iter));
}
for (auto& iter : mConstUniformOffset) {
mDescriptorSet->writeBuffer(mConstUniformBuffer->buffer(), std::get<0>(iter), std::get<1>(iter), std::get<2>(iter));
}
if (mNeedAutoTuning) {
auto localSize = vkBn->autoTunePipeline(mPipeline.get(), mDescriptorSet, mGlobalSize);
@ -153,7 +196,11 @@ private:
std::vector<int> mGlobalSize;
std::vector<int> mInputBinding;
std::vector<int> mOutputBinding;
std::vector<std::pair<int, std::shared_ptr<VulkanBuffer>>> mConstIndides;
std::shared_ptr<VulkanBuffer> mConstStorageBuffer;
std::shared_ptr<VulkanBuffer> mConstUniformBuffer;
// Index, offset, size
std::vector<std::tuple<int, size_t, size_t>> mConstStorageOffset;
std::vector<std::tuple<int, size_t, size_t>> mConstUniformOffset;
SharedPtr<VulkanPipeline> mPipeline;
SharedPtr<VulkanLayout::DescriptorSet> mDescriptorSet;
bool mNeedAutoTuning = false;

View File

@ -82,7 +82,7 @@ void VulkanRaster::onEncodeFast(const Tensor* input, const Tensor* output, const
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4, true);
// TODO: Find better way
newRegion.dst.offset /= 4;
newRegion.src.offset /= 4;
@ -92,6 +92,8 @@ void VulkanRaster::onEncodeFast(const Tensor* input, const Tensor* output, const
auto group = UP_DIV(total, 256);
std::shared_ptr<VulkanLayout::DescriptorSet> describe(blitPipeline->createSet());
std::shared_ptr<VulkanBuffer> uniform = vkBn->allocUniform();
::memcpy(uniform->map(), &info, sizeof(SamplerInfo));
uniform->unmap();
auto srcTensor = vkBn->getTensorBuffer(slice.origin);
auto srcTensorSize = vkBn->getTensorSize(slice.origin);
describe->writeBuffer(dstTensor.first->buffer(), 0, dstTensorSize, dstTensor.second);
@ -127,7 +129,7 @@ ErrorCode VulkanRaster::onEncode(const std::vector<Tensor *> &____inputs, const
fast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output)) {
if (!OpCommonUtils::canBlitFast(slice, output, 4, true)) {
fast = false;
break;
}

View File

@ -148,7 +148,7 @@ ErrorCode VulkanRasterSort::onEncode(const std::vector<Tensor *> &inputs, const
region2.dstOffset = std::get<2>(output);
region2.srcOffset = pointOffsetSum.second + (pointOffsetBytes / sizeof(uint32_t) - 1) * sizeof(uint32_t);
vkCmdCopyBuffer(cmdBuffer->get(), ((VulkanBuffer*)pointOffsetSum.first)->buffer(), std::get<0>(output), 1, &region);
vkCmdCopyBuffer(cmdBuffer->get(), ((VulkanBuffer*)pointOffsetSum.first)->buffer(), std::get<0>(output), 1, &region2);
cmdBuffer->barrierSource(sortNumber->buffer(), 0, sizeof(uint32_t));
}

View File

@ -24,8 +24,7 @@ VulkanCommandPool::~VulkanCommandPool() {
mDevice.destroyCommandPool(mPool);
// FUNC_PRINT(1);
}
void VulkanCommandPool::submitAndWait(VkCommandBuffer buffer) const {
std::shared_ptr<VulkanFence> VulkanCommandPool::submit(VkCommandBuffer buffer) const {
auto b = buffer;
auto fence = std::make_shared<VulkanFence>(mDevice);
VkSubmitInfo submit_info = {/* .sType = */ VK_STRUCTURE_TYPE_SUBMIT_INFO,
@ -40,6 +39,11 @@ void VulkanCommandPool::submitAndWait(VkCommandBuffer buffer) const {
auto fenceReal = fence->get();
auto queue = mDevice.acquireDefaultDevQueue();
CALL_VK(vkQueueSubmit(queue, 1, &submit_info, fenceReal));
return fence;
}
void VulkanCommandPool::submitAndWait(VkCommandBuffer buffer) const {
auto fence = submit(buffer);
fence->wait();
}

View File

@ -11,6 +11,7 @@
#include "core/NonCopyable.hpp"
#include "backend/vulkan/component/VulkanDevice.hpp"
#include "backend/vulkan/component/VulkanFence.hpp"
#include "backend/vulkan/vulkan/vulkan_wrapper.h"
namespace MNN {
class VulkanImage;
@ -49,6 +50,7 @@ public:
}
void submitAndWait(VkCommandBuffer buffer) const;
std::shared_ptr<VulkanFence> submit(VkCommandBuffer buffer) const;
private:
const VulkanDevice& mDevice;

View File

@ -11,18 +11,23 @@
//#define MNN_VULKAN_PRINT_EXT
namespace MNN {
static uint32_t _getLocalMemorySize(const VkPhysicalDeviceMemoryProperties& memProty) {
#ifdef __APPLE__
// For mac vulkan driver can not get correct local size
return 16384;
#else
int32_t localMemorySize = 0;
for (int i=0; i<VK_MAX_MEMORY_TYPES; ++i) {
for (int i=0; i<memProty.memoryHeapCount; ++i) {
auto& heap = memProty.memoryHeaps[i];
if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
auto size = (int32_t)heap.size;
if (size > 0) {
localMemorySize = size;
break;
}
break;
}
}
return localMemorySize;
#endif
}
VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
: mOwner(true),
@ -118,6 +123,7 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
}
vkGetPhysicalDeviceProperties(mPhysicalDevice, &mDeviceProty);
vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mMemoryProty);
mLocalMemorySize = _getLocalMemorySize(mMemoryProty);
getDeviceQueue(mQueueFamilyIndex, 0, mQueue);
// query subgroupSize
@ -132,7 +138,6 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
vkGetPhysicalDeviceProperties2(mPhysicalDevice, &deviceProperties2);
mSubgroupSize = subgroupProperties.subgroupSize;
}
mLocalMemorySize = _getLocalMemorySize(mMemoryProty);
#ifdef MNN_VULKAN_PRINT_EXT
uint32_t pPropertyCount;
vkEnumerateInstanceExtensionProperties(nullptr, &pPropertyCount, nullptr);
@ -146,6 +151,7 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
FUNC_PRINT(mDeviceProty.limits.maxComputeWorkGroupCount[0]);
FUNC_PRINT(mDeviceProty.limits.maxComputeWorkGroupInvocations);
FUNC_PRINT(mDeviceProty.limits.maxComputeSharedMemorySize);
FUNC_PRINT(mLocalMemorySize);
#endif
}

View File

@ -350,7 +350,11 @@ void Tensor::print() const {
// convert to host if needed
auto printee = this;
bool device = this->buffer().host == NULL && this->buffer().device != 0;
auto bnType = MNN_FORWARD_CPU;
if (nullptr != mDescribe->getBackend()) {
bnType = mDescribe->getBackend()->type();
}
bool device = bnType != MNN_FORWARD_CPU;
if (device) {
printee = this->createHostTensorFromDevice(this, true);
}

View File

@ -105,6 +105,7 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output, bool forward) {
reg.dst.stride[1] = multipler;
reg.dst.stride[2] = 1;
reg.origin = input;
return;
}
int32_t inputShape[MNN_MAX_TENSOR_DIM];
int32_t outputShape[MNN_MAX_TENSOR_DIM];