MNN/source/backend/opencl/core/OpenCLBackend.cpp

1493 lines
58 KiB
C++
Raw Normal View History

2019-04-17 10:49:11 +08:00
//
// OpenCLBackend.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
2019-12-27 22:16:57 +08:00
#include "backend/opencl/core/OpenCLBackend.hpp"
2019-04-17 10:49:11 +08:00
#include "MNN_generated.h"
2023-09-04 10:42:11 +08:00
#include "core/BufferAllocator.hpp"
2019-12-27 22:16:57 +08:00
#include "core/TensorUtils.hpp"
2020-11-05 16:41:56 +08:00
#include "shape/SizeComputer.hpp"
2019-04-17 10:49:11 +08:00
#include <map>
#include <mutex>
#include <thread>
2019-12-27 22:16:57 +08:00
#include "core/Macro.h"
2022-01-04 10:50:40 +08:00
#include "runtime/OpenCLTuneInfo.hpp"
2024-03-13 14:55:54 +08:00
#ifdef __ANDROID__
#include <GLES2/gl2.h>
#endif
2023-12-27 17:26:44 +08:00
//#define OPENCL_FALLBACK_LOG
2019-04-17 10:49:11 +08:00
namespace MNN {
namespace OpenCL {
2023-12-27 17:26:44 +08:00
#ifndef MNN_OPENCL_SEP_BUILD
void registerOpenCLOps();
#endif
2019-04-17 10:49:11 +08:00
2024-03-13 14:55:54 +08:00
CLRuntime::CLRuntime(const Backend::Info& info, int platformSize, int platformId, int deviceId, void *contextPtr, void* glshared){
2020-11-05 16:41:56 +08:00
mInfo = info;
2020-11-05 16:41:56 +08:00
BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal;
BackendConfig::PowerMode power = BackendConfig::Power_Normal;
2023-12-27 17:26:44 +08:00
BackendConfig::MemoryMode memory = BackendConfig::Memory_Normal;
2020-11-05 16:41:56 +08:00
if (nullptr != mInfo.user) {
precision = mInfo.user->precision;
power = mInfo.user->power;
2023-12-27 17:26:44 +08:00
memory = mInfo.user->memory;
2020-11-05 16:41:56 +08:00
}
2019-04-17 10:49:11 +08:00
// Shader precision
2025-02-12 11:14:19 +08:00
mOpenCLRuntime.reset(new OpenCLRuntime(precision, mInfo.gpuMode, platformSize, platformId, deviceId, contextPtr, glshared, hint()));
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
//Whether runtimeError
mCLRuntimeError = mOpenCLRuntime->isCreateError();
mPrecision = precision;
2023-12-27 17:26:44 +08:00
mMemory = memory;
2022-01-04 10:50:40 +08:00
mTunedInfo = new TuneInfo;
2022-05-06 19:51:20 +08:00
mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
2020-11-05 16:41:56 +08:00
}
CLRuntime::~CLRuntime() {
2022-05-06 19:51:20 +08:00
mImagePool = nullptr;
mBufferPool = nullptr;
2020-11-05 16:41:56 +08:00
mOpenCLRuntime = nullptr;
2022-01-04 10:50:40 +08:00
delete mTunedInfo;
}
static bool _checkTensorInfo(const CLCache::TensorInfoT* dst, const Tensor* src) {
if (dst->shape.size() != src->dimensions()) {
return false;
}
for (int j=0; j<dst->shape.size(); ++j) {
if (dst->shape[j] != src->length(j)) {
return false;
}
}
return true;
2020-11-05 16:41:56 +08:00
}
2022-01-04 10:50:40 +08:00
bool CLRuntime::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Runtime::OpInfo& dstInfo) const {
dstInfo.initCostLong = true;
if (nullptr == op->name()) {
dstInfo.initCostLong = false;
return true;
}
for(auto& info : mTunedInfo->mInfos) {
if (info->type != op->type()) {
continue;
}
if (info->name != op->name()->str()) {
continue;
}
if (info->inputs.size() != inputs.size() || info->outputs.size() != outputs.size()) {
continue;
}
bool match = true;
for (int i=0; i<inputs.size(); ++i) {
auto& dst = info->inputs[i];
auto src = inputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (!match) {
continue;
}
for (int i=0; i<outputs.size(); ++i) {
auto& dst = info->outputs[i];
auto src = outputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (match) {
// All Info is match
dstInfo.initCostLong = false;
break;
}
}
return true;
}
int CLRuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const {
switch (statusEnum) {
case STATUS_SUPPORT_FP16: {
return mOpenCLRuntime->isDeviceSupportedFP16();
break;
}
case STATUS_SUPPORT_DOT_PRODUCT: {
return 0;
break;
}
2022-12-30 15:18:58 +08:00
case STATUS_SUPPORT_POWER_LOW: {
return mOpenCLRuntime->isDeviceSupportedLowPower();
break;
}
default: {
MNN_ERROR("unsupported interface");
break;
}
}
return 0;
}
2022-01-04 10:50:40 +08:00
void CLRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
if (nullptr != op->name()) {
auto dstInfo = mTunedInfo;
std::unique_ptr<CLCache::OpInfoT> opInfo(new CLCache::OpInfoT);;
opInfo->type = op->type();
opInfo->name = op->name()->str();
opInfo->inputs.resize(inputs.size());
for (int v=0; v<opInfo->inputs.size(); ++v) {
opInfo->inputs[v].reset(new CLCache::TensorInfoT);
opInfo->inputs[v]->shape.resize(inputs[v]->dimensions());
for (int u=0; u<opInfo->inputs[v]->shape.size(); ++u) {
opInfo->inputs[v]->shape[u] = inputs[v]->length(u);
}
}
opInfo->outputs.resize(outputs.size());
for (int v=0; v<opInfo->outputs.size(); ++v) {
opInfo->outputs[v].reset(new CLCache::TensorInfoT);
opInfo->outputs[v]->shape.resize(outputs[v]->dimensions());
for (int u=0; u<opInfo->outputs[v]->shape.size(); ++u) {
opInfo->outputs[v]->shape[u] = outputs[v]->length(u);
}
}
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
void CLRuntime::onReset(int numberThread, const BackendConfig* config, bool full) {
mOpenCLRuntime->setGpuMode(numberThread);
}
2020-11-05 16:41:56 +08:00
bool CLRuntime::onSetCache(const void* buffer, size_t size) {
2022-01-04 10:50:40 +08:00
if (nullptr == buffer) {
return false;
}
auto cacheBuffer = CLCache::GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)buffer, size);
if (false == CLCache::VerifyCacheBuffer(verify)) {
return false;
}
if(nullptr != cacheBuffer->tuned()) {
for (int i=0; i<cacheBuffer->tuned()->size(); ++i) {
auto srcInfo = cacheBuffer->tuned()->GetAs<CLCache::OpInfo>(i);
std::unique_ptr<CLCache::OpInfoT> dst(srcInfo->UnPack());
mTunedInfo->mInfos.emplace_back(std::move(dst));
}
}
2023-04-27 15:11:05 +08:00
bool res = mOpenCLRuntime->setCache(std::make_pair(buffer, size));
return res;
2020-11-05 16:41:56 +08:00
}
std::pair<const void*, size_t> CLRuntime::onGetCache() {
2022-01-04 10:50:40 +08:00
return mOpenCLRuntime->makeCache(mTunedInfo);
2020-11-05 16:41:56 +08:00
}
2024-09-12 12:57:57 +08:00
Backend* CLRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
auto precision = mPrecision;
auto memory = mMemory;
if (nullptr != config) {
precision = config->precision;
memory = config->memory;
}
return new OpenCLBackend(precision, memory, mImagePool, mBufferPool, this);
2020-11-05 16:41:56 +08:00
}
void CLRuntime::onGabageCollect(int level) {
2023-12-04 11:12:20 +08:00
mImagePool->releaseFreeList();
mBufferPool->releaseFreeList();
2020-11-05 16:41:56 +08:00
}
2023-12-27 17:26:44 +08:00
float CLRuntime::onGetMemoryInMB() {
auto staticMemoryInMB = mBufferPool->totalSize() / 1024.0f / 1024.0f;
return staticMemoryInMB;
}
bool CLRuntime::isCLRuntimeError() {
return mCLRuntimeError;
}
std::map<std::pair<OpType, GpuMemObject>, OpenCLBackend::Creator*>* gCreator() {
2020-11-05 16:41:56 +08:00
static std::once_flag once;
static std::map<std::pair<OpType, GpuMemObject>, OpenCLBackend::Creator*>* creators = nullptr;
std::call_once(once, [&]() { creators = new std::map<std::pair<OpType, GpuMemObject>, OpenCLBackend::Creator*>; });
2020-11-05 16:41:56 +08:00
return creators;
};
OpenCLBackend::OpenCLBackend(BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, std::shared_ptr<ImagePool>imgPool, std::shared_ptr<BufferPool> bufPool, const CLRuntime *runtime)
2020-11-05 16:41:56 +08:00
: Backend(MNN_FORWARD_OPENCL) {
2020-11-05 16:41:56 +08:00
mCLRuntime = runtime;
mOpenCLRuntime = mCLRuntime->mOpenCLRuntime;
mPrecision = precision;
mMemory = memory;
mOpenCLRuntime->setPrecision(precision);
2022-05-06 19:51:20 +08:00
mStaticImagePool = imgPool;
mStaticBufferPool = bufPool;
if(mOpenCLRuntime.get()){
if(mOpenCLRuntime->isCreateError() == true) {
mIsCreateError = true;
}
2024-04-19 11:58:21 +08:00
mImagePoolFirst.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPoolFirst.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
2024-12-31 15:34:08 +08:00
mExecutionBufferPool.reset(new BufferExecutionPool(mOpenCLRuntime->context(), mOpenCLRuntime->commandQueue(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
2024-04-19 11:58:21 +08:00
mImagePool = mImagePoolFirst.get();
mBufferPool = mBufferPoolFirst.get();
2019-04-17 10:49:11 +08:00
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
mMapMem = std::make_pair(0, nullptr);
2023-12-27 17:26:44 +08:00
mUseRecordQueue = mOpenCLRuntime->isSupportRecordQueue();
mDevideOpRecord = mOpenCLRuntime->isDevideOpRecord();
mUseRecordableQueueSize = mOpenCLRuntime->getUseRecordableQueueSize();
2019-04-17 10:49:11 +08:00
}
OpenCLBackend::~OpenCLBackend() {
#ifdef LOG_VERBOSE
MNN_PRINT("enter OpenCLBackend::~OpenCLBackend \n");
#endif
2023-12-27 17:26:44 +08:00
releaseRecord();
mRecordings.clear();
mImagePool = nullptr;
mBufferPool = nullptr;
2024-12-31 15:34:08 +08:00
mExecutionBufferPool->clear();
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
if(mMapMem.second != nullptr) {
#ifdef MNN_OPENCL_SVM_ENABLE
if(mUseSvm)
{
clSVMFree(mOpenCLRuntime->context().get(), mMapMem.second);
}
else
#endif
{
free(mMapMem.second);
mMapMem.second = nullptr;
}
}
2019-04-17 10:49:11 +08:00
}
OpenCLRuntime* OpenCLBackend::getOpenCLRuntime() {
return mOpenCLRuntime.get();
}
2024-12-31 15:34:08 +08:00
class CLReleaseExecutionBuffer : public Backend::MemObj {
public:
CLReleaseExecutionBuffer(std::shared_ptr<OpenCLBufferNode> node, BufferExecutionPool* bufferPool) {
mNode = node;
mBufferPool = bufferPool;
}
virtual ~ CLReleaseExecutionBuffer() {
mBufferPool->recycle(mNode);
}
private:
std::shared_ptr<OpenCLBufferNode> mNode;
BufferExecutionPool* mBufferPool;
};
class CLMemReleaseBuffer : public Backend::MemObj {
public:
CLMemReleaseBuffer(cl::Buffer* bId, BufferPool* bufferPool) {
mBuffer = bId;
mBufferPool = bufferPool;
}
virtual ~ CLMemReleaseBuffer() {
mBufferPool->recycle(mBuffer);
}
private:
cl::Buffer* mBuffer;
BufferPool* mBufferPool;
};
class CLMemReleaseImage : public Backend::MemObj {
public:
CLMemReleaseImage(cl::Image* bId, ImagePool* bufferPool) {
mBuffer = bId;
mBufferPool = bufferPool;
}
virtual ~ CLMemReleaseImage() {
mBufferPool->recycle(mBuffer);
}
private:
cl::Image* mBuffer;
ImagePool* mBufferPool;
};
2024-05-11 19:17:02 +08:00
float OpenCLBackend::getBytes(const Tensor* tensor) {
float bytes = (float)tensor->getType().bytes();
if (getOpenCLRuntime()->isSupportedFP16()) {// Fp16
if (halide_type_float == tensor->getType().code) {
bytes = 2.0;
}
}
auto quant = TensorUtils::getDescribe(tensor)->quantAttr.get();
if (nullptr != quant && TensorUtils::getDescribe(tensor)->type == DataType_DT_INT8) {
bytes = 1.0;
}
if(tensor->getType().bits == 4) {
bytes = 0.5;
}
return bytes;
}
Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
#ifdef LOG_VERBOSE
2019-04-17 10:49:11 +08:00
MNN_PRINT("Start OpenCLBackend::onAcquireBuffer !\n");
#endif
2019-04-17 10:49:11 +08:00
auto tensorShape = OpenCL::tensorShapeFormat(nativeTensor);
int N = tensorShape.at(0);
int H = tensorShape.at(1);
int W = tensorShape.at(2);
int C = tensorShape.at(3);
#ifdef LOG_VERBOSE
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
MNN_PRINT("OpenCLBackend::onAcquireBuffer: NHWC:[%d, %d, %d, %d]\n", N, H, W, C);
#endif
#ifndef MNN_OPENCL_BUFFER_CLOSED
2023-07-31 14:24:48 +08:00
if(mOpenCLRuntime->getGpuMemType() == BUFFER) {
size_t size;
2024-05-11 19:17:02 +08:00
float typeSize = getBytes(nativeTensor);
2024-12-19 16:20:00 +08:00
if (MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(nativeTensor)->dimensionFormat && nativeTensor->dimensions() >= 2) {
auto alignC = ROUND_UP(C, 4);
2023-07-31 14:24:48 +08:00
// increment of height and width
auto hR = ROUND_UP(H + 3, 4) - H;
auto wR = ROUND_UP(W + 3, 4) - W;
size = N * alignC * W * H;
size = size + hR * W * 4 + wR * 4;
} else {
2024-07-22 19:51:53 +08:00
size = N * H * W * C;
2023-07-31 14:24:48 +08:00
size = ROUND_UP(size, 4);
}
if (mOpenCLRuntime->isSupportedIntelSubgroup()) {
int cPack = TensorUtils::getTensorChannelPack(nativeTensor);
auto pads = TensorUtils::getDescribe(nativeTensor)->mPads;
size_t imageWidth = (size_t) ROUND_UP(UP_DIV(C, cPack), 2) * ROUND_UP(pads.left + W + pads.right, 4);//C-round to 8,W-round to 4, for memory alloc
size_t imageHeight = (size_t)N * H;
size = imageWidth*imageHeight*cPack;
}
2024-05-11 19:17:02 +08:00
// Align when int4 memory
size = ROUND_UP(size, 2);
if (storageType == DYNAMIC_SEPERATE) {
2024-04-19 11:58:21 +08:00
auto buffer = mBufferPool->alloc(size*typeSize, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
2024-04-19 11:58:21 +08:00
return new CLMemReleaseBuffer(buffer, mBufferPool);
}
if (storageType == DYNAMIC) {
2024-04-19 11:58:21 +08:00
auto buffer = mBufferPool->alloc(size*typeSize);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
2024-04-19 11:58:21 +08:00
return new CLMemReleaseBuffer(buffer, mBufferPool);
}
2024-12-31 15:34:08 +08:00
if (storageType == DYNAMIC_IN_EXECUTION){
auto node = mExecutionBufferPool->alloc(size*typeSize);
((Tensor*)nativeTensor)->buffer().device = reinterpret_cast<uint64_t>(node.get());
return new CLReleaseExecutionBuffer(node, mExecutionBufferPool.get());
}
MNN_ASSERT(storageType == STATIC);
2024-05-11 19:17:02 +08:00
auto buffer = mStaticBufferPool->alloc(size*typeSize);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer; // fix
return new CLMemReleaseBuffer(buffer, mStaticBufferPool.get());
2019-04-17 10:49:11 +08:00
}
else
#endif /* MNN_OPENCL_BUFFER_CLOSED */
{
size_t imageWidth = (size_t) (UP_DIV(C, 4) * W);//image mode only C pack to 4
size_t imageHeight = (size_t)N * H;
cl_channel_type dataType = CL_HALF_FLOAT;
2024-04-19 11:58:21 +08:00
if(nativeTensor->getType().code == halide_type_int) {
if(nativeTensor->getType().bits == 8){
dataType = CL_SIGNED_INT8;
} else if(nativeTensor->getType().bits == 32){
dataType = CL_SIGNED_INT32;
}
} else if(nativeTensor->getType().code == halide_type_uint){
if(nativeTensor->getType().bits == 8){
dataType = CL_UNSIGNED_INT8;
} else if(nativeTensor->getType().bits == 32){
dataType = CL_UNSIGNED_INT32;
}
} else {
//when user want high precision, use float datatype
if (mPrecision == BackendConfig::Precision_High) {
dataType = CL_FLOAT;
}
}
if (storageType == DYNAMIC_SEPERATE) {
auto image = mImagePool->alloc(imageWidth, imageHeight, dataType, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
2024-04-19 11:58:21 +08:00
return new CLMemReleaseImage(image, mImagePool);
}
if (storageType == DYNAMIC) {
auto image = mImagePool->alloc(imageWidth, imageHeight, dataType);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
2024-04-19 11:58:21 +08:00
return new CLMemReleaseImage(image, mImagePool);
}
MNN_ASSERT(storageType == STATIC);
auto image = mStaticImagePool->alloc(imageWidth, imageHeight, dataType);
2019-04-17 10:49:11 +08:00
((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
return new CLMemReleaseImage(image, mStaticImagePool.get());
2019-04-17 10:49:11 +08:00
}
}
2024-04-19 11:58:21 +08:00
bool OpenCLBackend::onSelectDynamicAllocator(int index, int maxIndex) {
2024-09-12 12:57:57 +08:00
if (mUseRecordQueue && false == mDevideOpRecord){
return false;
}
2024-04-19 11:58:21 +08:00
if (maxIndex > 2) {
return false;
}
if (maxIndex > 1 && mImagePoolSecond.get() == nullptr) {
mImagePoolSecond.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPoolSecond.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
}
if (index == 0) {
mImagePool = mImagePoolFirst.get();
mBufferPool = mBufferPoolFirst.get();
} else if (index == 1) {
mImagePool = mImagePoolSecond.get();
mBufferPool = mBufferPoolSecond.get();
}
return true;
}
2019-04-17 10:49:11 +08:00
bool OpenCLBackend::onClearBuffer() {
mImagePool->clear();
mBufferPool->clear();
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
if(mMapMem.second != nullptr) {
#ifdef MNN_OPENCL_SVM_ENABLE
if(mUseSvm)
{
clSVMFree(mOpenCLRuntime->context().get(), mMapMem.second);
}
else
#endif
{
free(mMapMem.second);
mMapMem.second = nullptr;
}
}
2019-04-17 10:49:11 +08:00
return true;
}
2019-04-17 10:49:11 +08:00
Execution* OpenCLBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
#ifdef LOG_VERBOSE
MNN_PRINT("Start OpenCLBackend::onCreate \n");
#endif
auto creators = gCreator();
auto iter = creators->find(std::make_pair(op->type(), mOpenCLRuntime->getGpuMemType()));
2023-10-18 10:31:02 +08:00
if (0 != inputs.size() && (getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1)) {
2023-12-27 17:26:44 +08:00
#ifdef OPENCL_FALLBACK_LOG
2023-10-18 10:31:02 +08:00
MNN_PRINT("Don't support type %s for int8 input\n", EnumNameOpType(op->type()));
#endif
2024-04-19 11:58:21 +08:00
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
2023-10-18 10:31:02 +08:00
return NULL;
}
if (iter == creators->end()) {
2023-12-27 17:26:44 +08:00
mDevideOpRecord = true;
#ifdef OPENCL_FALLBACK_LOG
if (nullptr != op->name()) {
MNN_PRINT("Don't support type %s memObject:%d, %s\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType(), op->name()->c_str());
} else {
MNN_PRINT("Don't support type %s memObject:%d\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType());
}
2021-04-08 15:34:23 +08:00
#endif
2024-04-19 11:58:21 +08:00
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
2019-04-17 10:49:11 +08:00
return NULL;
}
if(mOpenCLRuntime->getGpuMemType() == IMAGE) {
auto maxImageSize = mOpenCLRuntime->getMaxImage2DSize();
bool valid = true;
for (auto t : inputs) {
auto tensorShape = OpenCL::tensorShapeFormat(t);
int imageHeight = tensorShape[0] * tensorShape[1];
int imageWidth = tensorShape[2] * UP_DIV(tensorShape[3], 4);
if (imageHeight > maxImageSize.at(0) || imageWidth > maxImageSize.at(1)) {
2020-11-05 16:41:56 +08:00
valid = false;
break;
}
}
for (auto t : outputs) {
auto tensorShape = OpenCL::tensorShapeFormat(t);
int imageHeight = tensorShape[0] * tensorShape[1];
int imageWidth = tensorShape[2] * UP_DIV(tensorShape[3], 4);
if (imageHeight > maxImageSize.at(0) || imageWidth > maxImageSize.at(1)) {
valid = false;
break;
}
2020-11-05 16:41:56 +08:00
}
2019-04-17 10:49:11 +08:00
if (!valid) {
2023-12-27 17:26:44 +08:00
mDevideOpRecord = true;
#ifdef OPENCL_FALLBACK_LOG
for (auto t : inputs) {
auto tensorShape = OpenCL::tensorShapeFormat(t);
MNN_PRINT("input n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]);
}
for (auto t : outputs) {
auto tensorShape = OpenCL::tensorShapeFormat(t);
MNN_PRINT("output n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]);
}
MNN_PRINT("beyond cl_image creat size! fallback to cpu backend\n");
2021-04-08 15:34:23 +08:00
#endif
2024-04-19 11:58:21 +08:00
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
return NULL;
}
}
2019-04-17 10:49:11 +08:00
auto exe = iter->second->onCreate(inputs, outputs, op, this);
if (NULL == exe) {
2023-12-27 17:26:44 +08:00
mDevideOpRecord = true;
#ifdef OPENCL_FALLBACK_LOG
if (nullptr != op->name()) {
MNN_PRINT("The Creator Don't support type %s, memObject:%d, %s\n", MNN::EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType(), op->name()->c_str());
} else {
MNN_PRINT("The Creator Don't support type %s, memObject:%d,\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType());
}
2021-04-08 15:34:23 +08:00
#endif
2024-04-19 11:58:21 +08:00
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
2019-04-17 10:49:11 +08:00
return NULL;
}
#ifdef LOG_VERBOSE
MNN_PRINT("End OpenCLBackend::onCreate \n");
#endif
return exe;
}
void OpenCLBackend::onResizeBegin() {
2022-12-30 15:18:58 +08:00
#ifndef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->setCommandQueueProfileEnable();
2022-12-30 15:18:58 +08:00
#endif
2025-02-12 11:14:19 +08:00
// update mUseRecordableQueueSize if hint has changed
mUseRecordableQueueSize = mCLRuntime->hint().encorderNumForCommit <= mUseRecordableQueueSize ? mCLRuntime->hint().encorderNumForCommit : mUseRecordableQueueSize;
2023-12-27 17:26:44 +08:00
releaseRecord();
}
2023-09-20 20:16:25 +08:00
ErrorCode OpenCLBackend::onResizeEnd() {
2020-11-05 16:41:56 +08:00
#ifndef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->setCommandQueueProfileDisable();
2020-11-05 16:41:56 +08:00
#endif
2023-12-27 17:26:44 +08:00
if(!mRecordings.empty()){
2024-05-11 19:17:02 +08:00
endRecord(mRecordings.back().record, true);
2023-12-27 17:26:44 +08:00
}
2023-09-20 20:16:25 +08:00
return NO_ERROR;
}
2019-04-17 10:49:11 +08:00
void OpenCLBackend::onExecuteBegin() const {
mOpenCLRuntime->mQueueCount = 0;
2023-12-27 17:26:44 +08:00
clearRecord();
mOpenCLRuntime->clearEvent();
2019-04-17 10:49:11 +08:00
}
void OpenCLBackend::onExecuteEnd() const {
mOpenCLRuntime->mQueueCount = 0;
2023-12-27 17:26:44 +08:00
clearRecord();
enqeueRecord();
mOpenCLRuntime->printEventTime();
2019-04-17 10:49:11 +08:00
}
bool OpenCLBackend::isCreateError() const {
return mIsCreateError;
}
2024-12-19 16:20:00 +08:00
bool OpenCLBackend::_allocHostBuffer(int length, const Tensor* srcTensor) const {
2024-03-13 14:55:54 +08:00
auto memType = srcTensor->buffer().flags;
2024-12-19 16:20:00 +08:00
if (nullptr != mHostBuffer.second && length <= mHostBuffer.first && memType != MNN_MEMORY_AHARDWAREBUFFER) {
return true;
2024-03-13 14:55:54 +08:00
}
2024-12-19 16:20:00 +08:00
cl_int error;
2024-03-13 14:55:54 +08:00
#ifdef __ANDROID__
2024-12-19 16:20:00 +08:00
if(MNN_MEMORY_AHARDWAREBUFFER == memType){
if (mOpenCLRuntime->isSupportAHD()){
CLSharedMemReleaseBuffer *sharedMem = (CLSharedMemReleaseBuffer*)TensorUtils::getSharedMem(srcTensor);
if(sharedMem == nullptr || (sharedMem != nullptr && srcTensor->buffer().device != sharedMem->getSharedId())){
if(mOpenCLRuntime->getGpuType() == MALI){
const cl_import_properties_arm properties[] = {CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM, 0};
Backend::MemObj* SharedTmp = new CLSharedMemReleaseBuffer(srcTensor->buffer().device, new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)CL_MEM_READ_WRITE, properties, (void*)srcTensor->buffer().device, CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM, &error));
TensorUtils::setSharedMem(srcTensor, SharedTmp);
}else if(mOpenCLRuntime->getGpuType() == ADRENO){
cl_mem_ahardwarebuffer_host_ptr myAHBmem = {0};
myAHBmem.ext_host_ptr.allocation_type = CL_MEM_ANDROID_AHARDWAREBUFFER_HOST_PTR_QCOM;
myAHBmem.ext_host_ptr.host_cache_policy = CL_MEM_HOST_WRITEBACK_QCOM;
myAHBmem.ahb_ptr = (AHardwareBuffer*)srcTensor->buffer().device;
Backend::MemObj* SharedTmp = new CLSharedMemReleaseBuffer(srcTensor->buffer().device, new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)(CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM), 0, &myAHBmem, &error));
TensorUtils::setSharedMem(srcTensor, SharedTmp);
} else{
MNN_ERROR("This device not support AHardWareBuffer\n");
return false;
}
if (error != CL_SUCCESS) {
MNN_ERROR("Alloc mAHardWareBuffer error, code:%d \n", error);
return false;
}
}
} else{
MNN_ERROR("This device not support AHardWareBuffer\n");
return false;
}
} else
2024-03-13 14:55:54 +08:00
#endif
2024-12-19 16:20:00 +08:00
{
2024-03-13 14:55:54 +08:00
MNN_ASSERT(length > 0);
mHostBuffer.first = length;
2024-12-19 16:20:00 +08:00
mHostBuffer.second.reset(new cl::Buffer(mOpenCLRuntime->context(), (cl_mem_flags)(CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (size_t)length, NULL, &error));
if (nullptr == mHostBuffer.second.get() || error != CL_SUCCESS) {
MNN_ERROR("Alloc mHostBuffer %d error, code:%d \n", length, error);
return false;
2024-04-19 11:58:21 +08:00
}
2024-03-13 14:55:54 +08:00
}
2024-12-19 16:20:00 +08:00
return true;
2019-04-17 10:49:11 +08:00
}
2019-12-27 22:16:57 +08:00
void OpenCLBackend::copyFromDeviceInt8(const Tensor* srcTensor, const Tensor* dstTensor) const{
2020-11-05 16:41:56 +08:00
std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(dstTensor);
auto needSize = dstTensor->size();
auto hostPtr = dstTensor->host<int8_t>();
auto DeviceBuffer = (cl::Buffer*)srcTensor->deviceId();
cl_int error = CL_SUCCESS;
#ifndef MNN_OCL_QUANT_DUMP
error = mOpenCLRuntime->commandQueue().enqueueReadBuffer(*DeviceBuffer, CL_TRUE, 0, needSize, hostPtr);
MNN_ASSERT(error == 0);
#else//for dump test
int8_t* tmpPtr = (int8_t *)malloc(needSize);
error = mOpenCLRuntime->commandQueue().enqueueReadBuffer(*DeviceBuffer, CL_TRUE, 0, needSize, tmpPtr);
MNN_ASSERT(error == 0);
int C_4 = (bufferShape[3]+3)/4;
for(int n=0; n<bufferShape[0]; n++) {
for(int c=0; c<bufferShape[3]; c++) {
for(int h=0; h<bufferShape[1]; h++) {
for(int w=0; w<bufferShape[2]; w++) {
hostPtr[n*bufferShape[3]*bufferShape[1]*bufferShape[2] + c*bufferShape[1]*bufferShape[2] + h*bufferShape[2] + w] =
tmpPtr[n*C_4*bufferShape[1]*bufferShape[2]*4 + (c/4)*bufferShape[1]*bufferShape[2]*4 + h*bufferShape[2]*4 + w*4 + c%4];
}
}
}
}
if(tmpPtr != nullptr) {
free(tmpPtr);
tmpPtr = nullptr;
}
#endif
2020-11-05 16:41:56 +08:00
#ifdef ENABLE_OPENCL_TIME_PROFILER
MNN_PRINT("total kernel time:%d us\n", (int)mOpenCLRuntime->mKernelTime);
#endif
2019-12-27 22:16:57 +08:00
}
2019-04-17 10:49:11 +08:00
2019-12-27 22:16:57 +08:00
void OpenCLBackend::copyToDeviceInt8(const Tensor* srcTensor, const Tensor* dstTensor) const{
auto needSize = srcTensor->size();
auto hostPtr = srcTensor->host<int8_t>();
cl_int error = CL_SUCCESS;
auto DeviceBuffer = (cl::Buffer*)dstTensor->deviceId();
2019-12-27 22:16:57 +08:00
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*DeviceBuffer, CL_TRUE, 0, needSize, hostPtr);
}
2022-07-22 09:59:30 +08:00
int OpenCLBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
if (toCpu) {
mOpenCLRuntime->commandQueue().finish();
}
return 0;
}
2024-03-13 14:55:54 +08:00
void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTensor, MNN_DATA_FORMAT data_format, bool svmFlag, int memtype) const {
2024-12-19 16:20:00 +08:00
#ifdef __ANDROID__
if(MNN_MEMORY_AHARDWAREBUFFER == memtype){
convertBetweenAHDandCLmem(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), memtype, false, true);
return;
}
#endif
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#ifndef MNN_OPENCL_BUFFER_CLOSED
if(mOpenCLRuntime->getGpuMemType() == BUFFER)
{
2023-07-31 14:24:48 +08:00
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
int cPack = TensorUtils::getTensorChannelPack(srcTensor);
2024-11-18 14:37:45 +08:00
if (cPack == 16 && mOpenCLRuntime->isSupportedIntelSubgroup()) {
2023-07-31 14:24:48 +08:00
switch (data_format) {
case MNN_DATA_FORMAT_NHWC:
OpenCL::convertNC4HW4OrNC16HW16BufferToNCHWOrNHWCBuffer(srcTensor, const_cast<Tensor*>(dstTensor),
2024-04-19 11:58:21 +08:00
"nc16hw16_buffer_to_nhwc_buffer", mOpenCLRuntime.get(), true, false, svmFlag);
2023-07-31 14:24:48 +08:00
break;
case MNN_DATA_FORMAT_NCHW:
OpenCL::convertNC4HW4OrNC16HW16BufferToNCHWOrNHWCBuffer(srcTensor, const_cast<Tensor*>(dstTensor),
2024-04-19 11:58:21 +08:00
"nc16hw16_buffer_to_nchw_buffer", mOpenCLRuntime.get(), true, false, svmFlag);
2023-07-31 14:24:48 +08:00
break;
case MNN_DATA_FORMAT_NC4HW4:
OpenCL::convertNC4HW4BufferBetweenNC16HW16Buffer(srcTensor, const_cast<Tensor*>(dstTensor),
2024-04-19 11:58:21 +08:00
"nc16hw16_buffer_to_nc4hw4_buffer", mOpenCLRuntime.get(), OutTrans, false, svmFlag, false, true);
2023-07-31 14:24:48 +08:00
break;
default:
MNN_PRINT("output data format not support for subgroup!\n");
break;
}
} else
#endif
2024-09-12 12:57:57 +08:00
OpenCL::convertBufferToBuffer(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, true, true, svmFlag);
2019-04-17 10:49:11 +08:00
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
else
#endif /* MNN_OPENCL_BUFFER_CLOSED */
{
switch (data_format) {
case MNN_DATA_FORMAT_NHWC:
2024-04-19 11:58:21 +08:00
OpenCL::convertImageToNHWCBuffer(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
break;
case MNN_DATA_FORMAT_NCHW:
2024-04-19 11:58:21 +08:00
OpenCL::convertImageToNCHWBuffer(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
break;
case MNN_DATA_FORMAT_NC4HW4:
OpenCL::convertImageToNC4HW4Buffer(srcTensor, const_cast<Tensor*>(dstTensor),
2024-04-19 11:58:21 +08:00
mOpenCLRuntime.get(), false, svmFlag);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
break;
default:
break;
}
}
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
auto needSize = dstTensor->size();
2024-09-12 12:57:57 +08:00
auto shape = tensorShapeFormat(srcTensor);
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto memType = dstTensor->buffer().flags;
bool directCopy = BUFFER == mOpenCLRuntime->getGpuMemType()
&& (srcDimensionFormat == dstDimensionFormat || srcTensor->dimensions() <= 1)
&& MNN::MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != srcDimensionFormat
&& (getDataType(srcTensor) == getDataType(dstTensor))
2024-12-19 16:20:00 +08:00
&& memType != MNN_MEMORY_AHARDWAREBUFFER;
2024-09-12 12:57:57 +08:00
if (mOpenCLRuntime->isSupportedFP16()) { // Fp16
if (dstTensor->getType().code == halide_type_float) {
directCopy = false;
}
}
if(mOpenCLRuntime->isSupportedIntelSubgroup()){
int cPack = TensorUtils::getTensorChannelPack(srcTensor);
if (cPack == 16){
directCopy = false;
}
}
2024-04-19 11:58:21 +08:00
void* hostPtr = dstTensor->host<float>();
2024-09-12 12:57:57 +08:00
if(directCopy){
mOpenCLRuntime->commandQueue().enqueueReadBuffer(openCLBuffer(srcTensor), CL_TRUE, 0, needSize, hostPtr);
return;
}
2024-03-13 14:55:54 +08:00
_allocHostBuffer(needSize, dstTensor);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
MNN::Tensor interTensor(dstTensor, dstTensor->getDimensionType(), false);
interTensor.buffer().device = (uint64_t)mHostBuffer.second.get();
2024-09-12 12:57:57 +08:00
TensorUtils::getDescribe(&interTensor)->dimensionFormat = dstDimensionFormat;
2023-06-16 09:42:45 +08:00
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
//Convert format
2024-09-12 12:57:57 +08:00
mCLRuntime->convertFromDevice(srcTensor, (const Tensor*)&interTensor, dstDimensionFormat, false);
mOpenCLRuntime->printEventTime();
2024-04-19 11:58:21 +08:00
cl_int res;
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#ifdef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->commandQueue().finish();
{
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
AUTOTIME;
2024-04-19 11:58:21 +08:00
res = mOpenCLRuntime->commandQueue().enqueueReadBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
2020-11-05 16:41:56 +08:00
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#else
2024-04-19 11:58:21 +08:00
res = mOpenCLRuntime->commandQueue().enqueueReadBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#endif
2019-12-27 22:16:57 +08:00
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
2024-03-13 14:55:54 +08:00
void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor, MNN_DATA_FORMAT data_format, bool svmFlag, int memtype) const {
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
// Format: Host -> OpenCL
2024-12-19 16:20:00 +08:00
#ifdef __ANDROID__
if(MNN_MEMORY_AHARDWAREBUFFER == memtype){
convertBetweenAHDandCLmem(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), memtype, true, false);
return;
}
#endif
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#ifndef MNN_OPENCL_BUFFER_CLOSED
if(mOpenCLRuntime->getGpuMemType() == BUFFER)
{
2023-07-31 14:24:48 +08:00
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
int cPack = TensorUtils::getTensorChannelPack(dstTensor);
2024-11-18 14:37:45 +08:00
if (cPack == 16 && mOpenCLRuntime->isSupportedIntelSubgroup()) {
2023-07-31 14:24:48 +08:00
if (MNN_DATA_FORMAT_NHWC == data_format) {
2024-04-19 11:58:21 +08:00
OpenCL::converNCHWOrNHWCBufferToNC4HW4OrNC16HW16Buffer(srcTensor, const_cast<Tensor*>(dstTensor), "nhwc_buffer_to_nc16hw16_buffer", mOpenCLRuntime.get(), true, false, svmFlag);
2023-07-31 14:24:48 +08:00
} else if (MNN_DATA_FORMAT_NCHW == data_format) {
2024-04-19 11:58:21 +08:00
OpenCL::converNCHWOrNHWCBufferToNC4HW4OrNC16HW16Buffer(srcTensor, const_cast<Tensor*>(dstTensor), "nchw_buffer_to_nc16hw16_buffer", mOpenCLRuntime.get(), true, false, svmFlag);
2023-07-31 14:24:48 +08:00
} else if (MNN_DATA_FORMAT_NC4HW4 == data_format) {
2024-04-19 11:58:21 +08:00
OpenCL::convertNC4HW4BufferBetweenNC16HW16Buffer(srcTensor, const_cast<Tensor*>(dstTensor), "nc4hw4_buffer_to_nc16hw16_buffer", mOpenCLRuntime.get(), InpTrans, false, svmFlag, true, false);
2023-07-31 14:24:48 +08:00
} else {
MNN_PRINT("input data format not support or subgroup\n");
MNN_ASSERT(false);
}
}else
#endif
2024-09-12 12:57:57 +08:00
OpenCL::convertBufferToBuffer(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), true, false, false, svmFlag);
2019-12-27 22:16:57 +08:00
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
else
#endif /* MNN_OPENCL_BUFFER_CLOSED */
{
if (MNN_DATA_FORMAT_NHWC == data_format) {
2024-04-19 11:58:21 +08:00
OpenCL::convertNHWCBufferToImage(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
} else if (MNN_DATA_FORMAT_NCHW == data_format) {
2024-04-19 11:58:21 +08:00
OpenCL::convertNCHWBufferToImage(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
} else if (MNN_DATA_FORMAT_NC4HW4 == data_format) {
OpenCL::convertNC4HW4BufferToImage(srcTensor, const_cast<Tensor*>(dstTensor),
mOpenCLRuntime.get(), false, svmFlag);
} else {
MNN_PRINT("data format not support\n");
MNN_ASSERT(false);
}
}
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
2019-12-27 22:16:57 +08:00
auto needSize = srcTensor->size();
2024-04-19 11:58:21 +08:00
auto shape = tensorShapeFormat(srcTensor);
2024-09-12 12:57:57 +08:00
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto memType = srcTensor->buffer().flags;
void* hostPtr = srcTensor->host<float>();
2024-04-19 11:58:21 +08:00
// 1*1*1*1 don't need convert
2024-11-18 14:37:45 +08:00
if(BUFFER == mOpenCLRuntime->getGpuMemType() && srcTensor->getType().code == halide_type_float && mOpenCLRuntime->isSupportedFP16() && 1 == shape[0] * shape[1] * shape[2] * shape[3]){
2024-09-12 12:57:57 +08:00
needSize /= 2;
void *tmpPtr = malloc(needSize);
((half_float::half*)tmpPtr)[0] = (half_float::half)(((float*)hostPtr)[0]);
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(openCLBuffer(dstTensor), CL_TRUE, 0, needSize, tmpPtr);
free(tmpPtr);
return;
}
bool directCopy = BUFFER == mOpenCLRuntime->getGpuMemType()
&& (srcDimensionFormat == dstDimensionFormat || srcTensor->dimensions() <= 1)
&& MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != srcDimensionFormat
&& (getDataType(srcTensor) == getDataType(dstTensor))
2024-12-19 16:20:00 +08:00
&& memType != MNN_MEMORY_AHARDWAREBUFFER;
2024-09-12 12:57:57 +08:00
if (mOpenCLRuntime->isSupportedFP16()) { // Fp16
if (dstTensor->getType().code == halide_type_float) {
directCopy = false;
}
}
if(mOpenCLRuntime->isSupportedIntelSubgroup()){
int cPack = TensorUtils::getTensorChannelPack(dstTensor);
if (cPack == 16){
directCopy = false;
2020-11-05 16:41:56 +08:00
}
2024-09-12 12:57:57 +08:00
}
if(directCopy){
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(openCLBuffer(dstTensor), CL_TRUE, 0, needSize, hostPtr);
2024-04-19 11:58:21 +08:00
return;
2020-11-05 16:41:56 +08:00
}
2024-03-13 14:55:54 +08:00
_allocHostBuffer(needSize, srcTensor);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
MNN::Tensor interTensor(srcTensor, srcTensor->getDimensionType(), false);
interTensor.buffer().device = (uint64_t)mHostBuffer.second.get();
2024-09-12 12:57:57 +08:00
TensorUtils::getDescribe(&interTensor)->dimensionFormat = srcDimensionFormat;
2020-11-05 16:41:56 +08:00
#ifdef ENABLE_OPENCL_TIME_PROFILER
mOpenCLRuntime->commandQueue().finish();
{
AUTOTIME;
2024-04-19 11:58:21 +08:00
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
2020-11-05 16:41:56 +08:00
}
#else
2024-08-24 15:46:21 +08:00
auto res = mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
if(res != CL_SUCCESS) {
2024-12-19 16:20:00 +08:00
MNN_ERROR("OpenCL enqueue write error:%d\n", res);
return;
2024-08-24 15:46:21 +08:00
}
2020-11-05 16:41:56 +08:00
#endif
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
//Covert format
2024-09-12 12:57:57 +08:00
mCLRuntime->convertToDevice((const Tensor*)&interTensor, dstTensor, srcDimensionFormat, false);
2019-12-27 22:16:57 +08:00
}
2024-03-13 14:55:54 +08:00
void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
int srcMemtype = srcTensor->buffer().flags;
int dstMemtype = dstTensor->buffer().flags;
if(MNN_FORWARD_CPU == srcMemtype && MNN_FORWARD_CPU == dstMemtype){
mCLRuntime->copyBetweenDevice(srcTensor, dstTensor);
} else {
2024-12-19 16:20:00 +08:00
const Tensor* hostTensor = MNN_FORWARD_CPU != srcMemtype ? srcTensor : dstTensor;
const Tensor* deviceTensor = MNN_FORWARD_CPU == srcMemtype ? srcTensor : dstTensor;
MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(deviceTensor)->dimensionFormat;
bool alloc_error = _allocHostBuffer(0, hostTensor);
if(false == alloc_error){
MNN_ERROR("Alloc _allocHostBuffer error\n");
2024-08-24 15:46:21 +08:00
return;
}
2024-12-19 16:20:00 +08:00
2024-03-13 14:55:54 +08:00
//Covert format
if(MNN_FORWARD_CPU != srcMemtype){
2024-12-19 16:20:00 +08:00
mCLRuntime->convertToDevice(hostTensor, deviceTensor, data_format, false, srcMemtype);
2024-03-13 14:55:54 +08:00
}else{
2024-12-19 16:20:00 +08:00
mCLRuntime->convertFromDevice(deviceTensor, hostTensor, data_format, false, dstMemtype);
2024-03-13 14:55:54 +08:00
}
}
}
2022-12-30 15:18:58 +08:00
void CLRuntime::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{
2021-04-28 18:02:10 +08:00
#ifndef MNN_OPENCL_BUFFER_CLOSED
if(mOpenCLRuntime->getGpuMemType() == BUFFER)
{
2024-09-12 12:57:57 +08:00
OpenCL::convertBufferToBuffer(const_cast<Tensor*>(srcTensor), const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), true, true);
2021-04-28 18:02:10 +08:00
}
else
#endif /* MNN_OPENCL_BUFFER_CLOSED */
{
std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(srcTensor);
mOpenCLRuntime.get()->commandQueue().enqueueCopyImage(
openCLImage(srcTensor), openCLImage(dstTensor),
{0, 0, 0}, {0, 0, 0},
{(size_t)bufferShape[2]* UP_DIV(bufferShape[3], 4), (size_t)bufferShape[0]*bufferShape[1], 1});
}
return;
}
2019-12-27 22:16:57 +08:00
void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
#ifdef LOG_VERBOSE
MNN_PRINT("Start onCopyBuffer !\n");
#endif
2023-12-27 17:26:44 +08:00
clearRecord();
2024-03-13 14:55:54 +08:00
if (srcTensor->host<float>() != nullptr) {
2023-09-04 10:42:11 +08:00
copyToDevice(srcTensor, dstTensor);
2024-03-13 14:55:54 +08:00
}else if(dstTensor->host<void>() != nullptr){
2023-09-04 10:42:11 +08:00
copyFromDevice(srcTensor, dstTensor);
2019-12-27 22:16:57 +08:00
}else{
2024-03-13 14:55:54 +08:00
copyBetweenDevice(srcTensor, dstTensor);
}
2019-04-17 10:49:11 +08:00
#ifdef LOG_VERBOSE
MNN_PRINT("end onCopyBuffer !\n");
#endif
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
void* OpenCLBackend::allocMapTensorMemory(int length, bool svmFlag, cl_device_svm_capabilities svm_cap_) {
if(length <= mMapMem.first) {
return mMapMem.second;
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#ifdef MNN_OPENCL_SVM_ENABLE
if(svmFlag)
{
if(mMapMem.first != 0) {
//Release small SVM Memory
clSVMFree(mOpenCLRuntime->context().get(), mMapMem.second);
}
//Alloc proper SVM Memory
cl_svm_mem_flags flags = CL_MEM_READ_WRITE;
flags |= (svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0;
flags |= ((svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) && (svm_cap_ & CL_DEVICE_SVM_ATOMICS)) ? CL_MEM_SVM_ATOMICS : 0;
mMapMem.second = clSVMAlloc(mOpenCLRuntime->context().get(), flags, length, 0);
if(mMapMem.second == nullptr) {
MNN_PRINT("SVM Alloc Failed\n");
}
}
else
#endif
{
if(mMapMem.first != 0) {
free(mMapMem.second);
mMapMem.second = nullptr;
}
mMapMem.second = malloc(length);
}
mMapMem.first = length;
return mMapMem.second;
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
}
void* OpenCLBackend::onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) {
auto needSize = srcTensor->size();
2023-12-27 17:26:44 +08:00
clearRecord();
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
#ifdef MNN_OPENCL_SVM_ENABLE
auto svm_cap_ = mOpenCLRuntime->getSvmCapabilities();
bool use_svm = (svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER);//support fine grain svm
use_svm |= ((svm_cap_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) && mOpenCLRuntime->getGpuType() == ADRENO);//support coarse grain svm and adreno gpu
mUseSvm = (mOpenCLRuntime->getCLVersion() > 1.99f && use_svm);
if(mUseSvm) {// CL version beyond 2.0 & support svm
svmPtr = allocMapTensorMemory(needSize, true, svm_cap_);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
if(mtype == Tensor::MAP_TENSOR_READ) {
//tmpTensor alloc
MNN::Tensor tmpTensor(srcTensor, dtype, false);
tmpTensor.buffer().device = (uint64_t)svmPtr;
//Convert format
MNN_DATA_FORMAT format_type = MNN_DATA_FORMAT_NCHW;
if(dtype == MNN::Tensor::TENSORFLOW) {
format_type = MNN_DATA_FORMAT_NHWC;
} else if(dtype == MNN::Tensor::CAFFE_C4) {
format_type = MNN_DATA_FORMAT_NC4HW4;
}
2022-12-30 15:18:58 +08:00
mCLRuntime->convertFromDevice(srcTensor, &tmpTensor, format_type, true);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
if(svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
//Make sure command finished
mOpenCLRuntime->commandQueue().finish();
return svmPtr;
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
auto map_flag = CL_MAP_WRITE;
if(mtype == Tensor::MAP_TENSOR_READ) {
map_flag = CL_MAP_READ;
}
cl_int res = clEnqueueSVMMap(mOpenCLRuntime->commandQueue().get(), true, map_flag, svmPtr, needSize, 0, nullptr, nullptr);
MNN_CHECK_CL_SUCCESS(res, "svm_map")
return svmPtr;
}
#endif
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
/**
Not Support Svm, Use onopyBuffer
*/
svmPtr = allocMapTensorMemory(needSize, false);
if(mtype == Tensor::MAP_TENSOR_READ) {
//tmpTensor alloc
MNN::Tensor tmpTensor(srcTensor, dtype, false);
tmpTensor.buffer().host = (uint8_t *)svmPtr;
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
//use onCopyBuffer
onCopyBuffer(srcTensor, &tmpTensor);
}
return svmPtr;
}
bool OpenCLBackend::onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) {
#ifdef MNN_OPENCL_SVM_ENABLE
auto svm_cap_ = mOpenCLRuntime->getSvmCapabilities();
if(mUseSvm) {// CL version beyond 2.0 & support svm
//If COARSE_SVM, Unmap first
if(!(svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
cl_int res = clEnqueueSVMUnmap(mOpenCLRuntime->commandQueue().get(), svmPtr, 0, nullptr, nullptr);
MNN_CHECK_CL_SUCCESS(res, "svm_unmap")
}
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
if(mtype == Tensor::MAP_TENSOR_WRITE) {
//interTensor alloc
MNN::Tensor interTensor(dstTensor, dtype, false);
interTensor.buffer().device = (uint64_t)svmPtr;
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
//Convert format
MNN_DATA_FORMAT format_type = MNN_DATA_FORMAT_NCHW;
if(dtype == MNN::Tensor::TENSORFLOW) {
format_type = MNN_DATA_FORMAT_NHWC;
} else if(dtype == MNN::Tensor::CAFFE_C4) {
format_type = MNN_DATA_FORMAT_NC4HW4;
}
2022-12-30 15:18:58 +08:00
mCLRuntime->convertToDevice(&interTensor, dstTensor, format_type, true);
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
}
mOpenCLRuntime->commandQueue().finish();
return true;
}
#endif
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
/**
Not Support Svm, Use onopyBuffer
*/
if(mtype == Tensor::MAP_TENSOR_WRITE) {
//srcTensor alloc
MNN::Tensor srcTensor(dstTensor, dtype, false);
srcTensor.buffer().host = (uint8_t *)svmPtr;
[MNN:Sync] Sync internal github Commits: 8148ae75c 弗人 bugfix 14cb8ec7f 弗人 [Converter:Bugfix] bugfix for onnx depthwise convtranspose 476fbcd90 雁行 [MNN:Feature] Open AVX cast and bugfix for contentCFG. 5e26b9fd3 雁行 [Test:Feature] Add android test. 37e147b25 雁行 [MNN:Bugfix] Bugfix for floordiv. 144c185f5 tianbu.xsw hangxing fix hiai b4fd429d6 tianbu.xsw updateCacheFile bugfix -- update cache size d4ba572a8 雁行 [MNN:Bugfix] Support int8 in AVX2 and some Bugfix. 43061f07e xiaying [MNN:Bugfix] Fix bug for module mode run part of model 398cc5ab6 tianhang.yth refactor demo 736380600 xiaying [Express:Bugfix] Fix memory leak for copy branch b8dab0a27 tianhang.yth MNNFloat2Int8 sizeQuad=0 crash fix 94b95bfed ghz [BugFix]1.Better method for fast pack valid check 6a921f85e xiaying [Converter:Bugfix] Fix bug for Fuseconsttosubgraph 5f77ae889 tianhang.yth numThread bugfix a807ef879 tianhang.yth add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix ad05409d3 xiaying [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode 9d81b8299 xiaying [MNN:Bugfix] Fix bug for Unique op for output size = 1 03b15e9af xiaying [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert c944a76ee tianhang.yth add auto backend and getSessionInfo @tianbu 91fa7267b ghz [BugFix]1.fix the error in eP check bf0041f77 ghz [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error 693871672 雁行 [CPU:Bugfix] rm adrp instruction for clang compiler bug. 1b8f6b3d8 ghz 1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process. feb7ecc4c 弗人 modify log of python offline quant 040c04811 ghz [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version 609f37db8 弗人 add log for python quant, python convert 5511dd30a ghz [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter. a93ff9280 tianhang.yth add tf.Unique op support 9729ff773 allen.lk [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead. 297c1ad14 雁行 [Expr:Bugfix] bugfix for tensor content used by shape compute. ef8c369e3 弗人 catch exception 07c2dd670 弗人 add dependence to setup, base64 encode url, add time log 177e590c1 弗人 [Python:Feature] add aliyun log for python quant tool 40a7928cf allen.lk [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution 3bdea84a1 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. c3c6fbdbd allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. bc590eee4 雁行 [Converter:Bugfix] bugfix for onnx instancenormalization convert. d8918593f tianhang.yth add auto backend and getSessionInfo @tianbu 83a198ed7 杭行 update d0dd3e09b 杭行 update 99540202e xiaying [Converter:Optimize] Opt the tensor convert insert 333d8db82 allen.lk [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64. db5994672 杭行 merge 6293de7b8 tianbu.xsw fix pymnn updateCacheFile 5c2e11cb1 tianbu.xsw do updateCache in createSession 6e7641ff4 tianbu.xsw do not limit cacheFile for a model 5287a65e4 tianbu.xsw bugfix 52ba53a91 tianbu.xsw revert pymnn api 60284d830 tianbu.xsw bugfix 6d8077490 tianbu.xsw rename updateCacheFile api params 3cb172710 tianhang.yth updateCacheFile API size default value is 0 c5b69aabf tianbu.xsw updateCacheFile python api fix 5d5da7aa5 tianbu.xsw reflector code 5707877a4 雁行 [MNN:Speed] Speedup for softmax in x86 and arm. 2a211825c tianbu.xsw reflector code for updateCacheFile 76db3a835 tianbu.xsw [Cache Feature]: Add updateCacheFile API for increment cache b06b0fd43 allen.lk [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd. Avoid to use some registers as arm restriction. e68bfa495 雁行 [Converter:Feature] Add UUID when model convert. a9cb935dc xiaying [MNN:Speed] Support c4nhwc for more fastblit 019f40353 xiaying [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G -> 1G) d2a6d3d05 xiaying [MNN:Bugfix] Fix bug for identity output not find 604d0801b xiaying [Converter:Bugfix] Fix bug for FuseGeLu 4bada2367 xiaying [MNN:Refractor] SegmentMean rewrite as segment 82070e708 xiaying [MNN:Bugfix] Fix bug for GeometryBinary e8ea4266e xiaying Fix bug for ShapeTensorConvert compute for dim = 1 error 1f1cf1991 xiaying [Tools:Bugfix] Fix system compability for fastTestOnnx 6f422efe2 xiaying [Tools:Bugfix] Remove color for checkDir for easy to dump 968f7ec88 xiaying [MNN:Speed] Support turn broadcast binary to loop 3e7aaf46f xiaying [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr 1f65ab163 xiaying [MNN:Bugfix] Fix bug for mini mnn can't convert model d65953d47 xiaying [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82 8b68be45c xiaying [MNN:Feature] Add segment 8a8f264f5 xiaying [Vulkan:Bugfix] Remove unuseful print 025bb0fda xiaying [Converter:Bugfix] Fix bug for oneof don't support 43900251e tianbu.xsw enable setCacheFile python API ebfb05c74 tianbu.xsw [Metal Feature] support metallib obtain from walle transfer task 9665c0a79 弗人 add check for path in json file c66fef224 xiaying [Converter:Bugfix] Fix bug for oneof don't support 42f192852 xiaying [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs 1b95354ff 雁行 [Feature]: Support shape compute for SetDiff1D, and null input for Prod. 83966d043 xiaying [Test:Feature] Add test for static module 42d1be933 xiaying [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model 9067531c3 xiaying [Converter:Refractor] formatLicence 99558bed9 xiaying [Converter:Bugfix] Count the op for unuseful and controlflow 4f6da0fa7 allen.lk [Feature:GRUMultiOutput] fix multi output dimension type c6b219bce xiaying [Converter:Feature] Turn torch converter to object dd4e68a37 xiaying [Converter:Feature] Support dump supported ops 80b6a60a3 xiaying [Converter:Info] If has output name, print output name instead of computed 015278fc3 xiaying [MNN:Refractor] Revert IfModule's debug info 23ac967c4 xiaying Don't transform for multi-input convolution/deconvolution b02b0d4de xiaying Fix bug for multi-input for conv1d 254d8b1d4 xiaying Fix bug for Conv1dSqueezeMove for multi input convolution 1d d47d0b9ca xiaying Fix bug for CPURaster's fuse nc4hw4 357c5bd33 xiaying Fix ConvBiasAdd for conv's inputs op > 1 55b1f0c9c xiaying [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution 1902a30f5 xiaying [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d c23fe617b xiaying [MNN:Bugfix] Fix bug for multi-input for conv1d 8ff018426 xiaying [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4 d4e8cd602 xiaying [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1 846266b42 tianbu.xsw return when program and tune both nullptr fd67c76a9 xiaying [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite e77a242c4 xiaying [Converter:Feature] Support tflite's half pixel be054c377 tianbu.xsw [OpenCL Bugfix] do not rewrite cache when binary program is produced 51e65aa35 xiaying [Converter:Feature] Support tflite for fp16 and multi-input convolution 1ccdfdeb5 tianbu.xsw redefine svm macro name 31234d372 tianbu.xsw [OpenCL SVM] add macro for only use wrapper d739e35da xiaying [MNN:Bugfix] Fix compile bug for grid op 24ab13c79 Joker feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying) 7b142978e xiaying [AVX512:Speed] Optimize for e <= 8 5f6febe7b tianbu.xsw code refactor 998d91b57 xiaying [Express:Speed] Merge submodule for speed 22c89146f tianhang.yth fix alpha div by zero bug and arm server compile bug 8f829a170 tianbu.xsw [OpenCL Pad] unify conv/deconv pad computing 4a28f603e xiaying [Express:Speed] Shared Const for All Submodule c74cf28f3 xiaying [MNN:Refractor] Seperate Const init and schedule 2a1eebb7a xiaying [Tools:Bugfix] Fix bug for modelTest.py count size 72f04008c xiaying [MNN:Refractor] Delete unuseful const op 1e735d03c xiaying [Converter:Bugfix] Fix bug for static module gen 4dfadbc6e xiaying [MNN:Refractor] Rewrite const init mode 1fcf0417a xiaying [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch 41d429cfd xiaying [Train:Bugfix] Revert convert NCHW for mnistTrain f947a5f01 xiaying [Test:Feature] Add testTrain dad59b6f6 tianbu.xsw move realize code from Backend.hpp to Tensor.cpp cf4473ad1 xiaying [Train:Bugfix] Support pad for GeometryPoolGrad 91ab13734 xiaying [MNN:Bugfix] Fix compile bug for avx512 742e80f47 xiaying [MNN:Refractor] Opt the logic for checknan judge 12543b841 xiaying [ARM82:Bugfix] Fix compile bug for ios 3a2b0a49f xiaying [ARM82:Speed] Opt Pack / Unpack for armv8 c0f1995cd xiaying [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm e0fc77dcf xiaying [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it 584bec578 xiaying [MNN:Bugfix] Fix bug for format set error for onnx d5bd4148d xiaying [MNN:Bugfix] Fix bug for format set error for onnx b00265841 xiaying [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor bb09188ac xiaying [Test:Bugfix] Fix bug for run into sparse auto 426d1babd xiaying [MNN:Refractor] Small bugfix for Group convolution and pack 7d0ea1c46 tianbu.xsw [testModel Feature] support testModel.out input resize 4169c54ce xiaying [MNN:Bugfix] Fix bug for checkNAN for origin 412a82222 xiaying [Test:Bugfix] Fix bug for CheckNAN's error of matmul 319b1d425 xiaying [MNN:Bugfix] Fix bug for multi-batch for ConvInt8 050b728a6 xiaying [Test:Bugfix] Use NCHW for ConvInt8Test 7db3423a1 xiaying [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4 adcec6a7f xiaying [Vulkan:Bugfix] Fix bug for invalid tensor size limit d2a7cf4e9 xiaying [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4 557bebdd3 xiaying [MNN:Bugfix] Fix bug for BF16-ARM32 bbe186649 tianbu.xsw [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority 6deb23439 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size b137590e4 xiaying [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size 7003558ea xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case b5f8cae5a xiaying [Converter:Bugfix] Fix bug for onnx pad for serveral case 29b09e125 xiaying [MNN:Bugfix] Fix bug for arm64-bf16 42ce00770 xiaying [MNN:Bugfix] Fix bug for ARM64 - float a2d89fc18 雁行 [Converter:Feature] Support Binary Unary for Torch. 7f1c0deb1 xiaying [MNN:Bugfix] Fix bug for Raster for Int8 8335a6f18 tianbu.xsw [OpenCL Shared Memory] modify data_format method b359e031b xiaying [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8 24bf3fc88 雁行 [Convert:Feature] Support LayerNormFuse without gamma beta. 3e629624b xiaying [MNN:Bugfix] Fix bug for float - armv7a 2b7908ec7 tianbu.xsw modify workItemSize 3cee0d413 xiaying [MNN:Bugfix] test wrong clear 9cbbfb998 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 2d7a44484 xiaying [MNN:Bugfix] fix compile bug for c++ < 14 eb7d0cb53 xiaying [Test:Bugfix] Don't test for NC4HW4 directly 7b40ca8d1 xiaying [MNN:Bugfix] Fix bug for ConvolutionGroup 2694d8a91 xiaying [MNN:Bugfix] Fix bug for CPUGridSample f89af60f6 xiaying [MNN:Bugfix] Fix compile bug for arm a151abcdd xiaying [MNN:Bugfix] Fix bug for convert for int8 / int16 b254dbe61 雁行 [MNN:Bugfix] Bugfix for Conv onClone. d08150631 xiaying [MNN:Bugfix] Fix bug for fast rcnn e5568a0df xiaying [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit 128318933 雁行 [Raster:Bugfix] bugfix for Raster merge onResize. 03caacbea xiaying [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow e1e3c245c xiaying [MNN:Bugfix] Fix bug for ConvolutionWinograd 2524cbc6d xiaying [MNN:Bugfix] Fix bug for CPUSoftmax 44ec79b8f xiaying [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW 21ae956ce xiaying [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor 09a5069c7 xiaying [MNN:Speed] Add offset for src and dst 6776c6784 xiaying [MNN:Bugfix] Fix bug for trainable model cc83ae30b xiaying [MNN:Bugfix] Fix bug for trainable model
2021-07-29 11:46:59 +08:00
//use onCopyBuffer
onCopyBuffer(&srcTensor, dstTensor);
}
return true;
}
2019-12-27 22:16:57 +08:00
bool OpenCLBackend::addCreator(std::pair<OpType, GpuMemObject> t, Creator* c) {
auto map = gCreator();
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type, %d GpuMemObject has be added\n", t.first, t.second);
return false;
}
map->insert(std::make_pair(t, c));
return true;
2019-04-17 10:49:11 +08:00
}
2021-01-29 11:35:13 +08:00
// -----------------------------------------------------------------------------
2020-11-05 16:41:56 +08:00
// Runtime Register
2021-01-29 11:35:13 +08:00
// -----------------------------------------------------------------------------
2020-11-05 16:41:56 +08:00
class CLRuntimeCreator : public RuntimeCreator {
virtual Runtime* onCreate(const Backend::Info& info) const {
#ifdef MNN_USE_LIB_WRAPPER
2019-04-17 10:49:11 +08:00
OpenCLSymbolsOperator::createOpenCLSymbolsOperatorSingleInstance();
if (nullptr == OpenCLSymbolsOperator::getOpenclSymbolsPtr()) {
MNN_PRINT("OpenCL init error, fallback ... \n");
return nullptr;
}
if (true == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isError()) {
MNN_PRINT("Parsing OpenCL symbols error !!! \n");
2019-04-17 10:49:11 +08:00
return nullptr;
}
2020-11-05 16:41:56 +08:00
#endif
2023-07-31 14:24:48 +08:00
int platform_id = 0;
2023-06-16 09:42:45 +08:00
int device_id = 0;
2023-07-31 14:24:48 +08:00
int platform_size = 0;
2024-03-13 14:55:54 +08:00
void *context_ptr = nullptr;
void *glShared = nullptr;
2023-06-16 09:42:45 +08:00
if (nullptr != info.user) {
if (info.user->sharedContext != nullptr) {
2023-07-31 14:24:48 +08:00
platform_id = ((MNNDeviceContext*)info.user->sharedContext)->platformId;
device_id = ((MNNDeviceContext*)info.user->sharedContext)->deviceId;
platform_size = ((MNNDeviceContext*)info.user->sharedContext)->platformSize;
2024-03-13 14:55:54 +08:00
context_ptr = (((MNNDeviceContext*)info.user->sharedContext)->contextPtr);
glShared = (((MNNDeviceContext*)info.user->sharedContext)->glShared);
2023-06-16 09:42:45 +08:00
}
}
2024-03-13 14:55:54 +08:00
auto rt = new CLRuntime(info, platform_size, platform_id, device_id, context_ptr, glShared);
if(rt->isCLRuntimeError() == true) {
2020-11-18 16:51:33 +08:00
delete rt;
return nullptr;
}
return rt;
2020-11-05 16:41:56 +08:00
}
virtual bool onValid(Backend::Info& info) const {
return true;
2019-04-17 10:49:11 +08:00
}
};
2024-09-12 12:57:57 +08:00
DataType OpenCLBackend::getDataType(const Tensor* tensor) const{
2023-10-18 10:31:02 +08:00
auto des = TensorUtils::getDescribe(tensor);
if (nullptr == des->quantAttr.get()) {
return DataType_DT_FLOAT;
}
return des->type;
}
2023-12-27 17:26:44 +08:00
cl_channel_type OpenCLBackend::fpType() {
if (getOpenCLRuntime()->isSupportedFP16() &&
mPrecision != BackendConfig::Precision_High) {
return CL_HALF_FLOAT;
}
return CL_FLOAT;
}
int OpenCLBackend::fpBytes() {
return (fpType() == CL_FLOAT ? sizeof(float) : sizeof(half_float::half));
}
void OpenCLBackend::clearRecord() const{
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(mUseRecordQueue && mDevideOpRecord){
for(int i = 0; i < mRecordings.size(); ++i){
2024-05-11 19:17:02 +08:00
std::vector<cl_array_arg_qcom> update_kernel_args;
std::vector<cl_workgroup_qcom> update_global_size;
std::vector<cl_workgroup_qcom> update_local_size;
for (int j = 0; j < mRecordings[i].updateInfo.size(); ++j){
for(int k = 0; k < mRecordings[i].updateInfo[j]->update_kernel_args.size(); ++k){
update_kernel_args.emplace_back(mRecordings[i].updateInfo[j]->update_kernel_args[k]);
update_kernel_args.back().dispatch_index = j;
}
for(int k = 0; k < mRecordings[i].updateInfo[j]->update_global_size.size(); ++k){
update_global_size.emplace_back(mRecordings[i].updateInfo[j]->update_global_size[k]);
update_global_size.back().dispatch_index = j;
}
for(int k = 0; k < mRecordings[i].updateInfo[j]->update_local_size.size(); ++k){
update_local_size.emplace_back(mRecordings[i].updateInfo[j]->update_local_size[k]);
update_local_size.back().dispatch_index = j;
}
}
cl_int res = mOpenCLRuntime->commandQueue().EnqueueRecordingQCOM(mRecordings[i].record, update_kernel_args.size(), update_kernel_args.data(), 0, nullptr,
update_global_size.size(), update_global_size.data(), update_local_size.size(), update_local_size.data(), 0, nullptr, nullptr);
2023-12-27 17:26:44 +08:00
MNN_CHECK_CL_SUCCESS(res, "EnqueueRecordingQCOM");
}
mOpenCLRuntime->commandQueue().finish();
mRecordings.clear();
}
#endif
}
void OpenCLBackend::enqeueRecord() const{
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(mUseRecordQueue && !mDevideOpRecord){
for(int i = 0; i < mRecordings.size(); ++i){
2024-05-11 19:17:02 +08:00
std::vector<cl_array_arg_qcom> update_kernel_args;
std::vector<cl_workgroup_qcom> update_global_size;
std::vector<cl_workgroup_qcom> update_local_size;
for (int j = 0; j < mRecordings[i].updateInfo.size(); ++j){
for(int k = 0; k < mRecordings[i].updateInfo[j]->update_kernel_args.size(); ++k){
update_kernel_args.emplace_back(mRecordings[i].updateInfo[j]->update_kernel_args[k]);
}
for(int k = 0; k < mRecordings[i].updateInfo[j]->update_global_size.size(); ++k){
update_global_size.emplace_back(mRecordings[i].updateInfo[j]->update_global_size[k]);
}
for(int k = 0; k < mRecordings[i].updateInfo[j]->update_local_size.size(); ++k){
update_local_size.emplace_back(mRecordings[i].updateInfo[j]->update_local_size[k]);
}
}
cl_int res = mOpenCLRuntime->commandQueue().EnqueueRecordingQCOM(mRecordings[i].record, update_kernel_args.size(), update_kernel_args.data(), 0, nullptr,
update_global_size.size(), update_global_size.data(), update_local_size.size(), update_local_size.data(), 0, nullptr, nullptr);
2023-12-27 17:26:44 +08:00
MNN_CHECK_CL_SUCCESS(res, "EnqueueRecordingQCOM");
}
mOpenCLRuntime->commandQueue().finish();
}
#endif
}
void OpenCLBackend::releaseRecord(){
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(mUseRecordQueue && !mDevideOpRecord){
for(int i = 0; i < mRecordings.size(); ++i){
2024-05-11 19:17:02 +08:00
cl_int res = clReleaseRecordingQCOM(mRecordings[i].record);
2023-12-27 17:26:44 +08:00
MNN_CHECK_CL_SUCCESS(res, "clReleaseRecordingQCOM");
}
mRecordings.clear();
}
#endif
}
void OpenCLBackend::startRecord(cl_recording_qcom &recording){
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(!mUseRecordQueue){
return;
}
#ifdef LOG_VERBOSE
MNN_PRINT("start startRecord !\n");
#endif
cl_int res = CL_SUCCESS;
if(mDevideOpRecord){
if(recording != NULL){
clReleaseRecordingQCOM(recording);
}
recording = mOpenCLRuntime->recordableQueue().NewRecordingQCOM(&res);
MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM");
}
#ifdef LOG_VERBOSE
MNN_PRINT("end startRecord !\n");
#endif
#endif //ENABLE_OPENCL_TIME_PROFILER
}
void OpenCLBackend::endRecord(cl_recording_qcom &recording, bool flag){
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(!mUseRecordQueue){
return;
}
#ifdef LOG_VERBOSE
MNN_PRINT("start endRecord !\n");
#endif
if(mDevideOpRecord){
cl_int res = CL_SUCCESS;
res = clEndRecordingQCOM(recording);
MNN_CHECK_CL_SUCCESS(res, "clEndRecordingQCOM");
} else if(flag) {
2024-05-11 19:17:02 +08:00
// endRecord for last kernel be recorded when record mode is MNN_GPU_RECORD_BATCH
2023-12-27 17:26:44 +08:00
if(!mRecordings.empty()){
2024-05-11 19:17:02 +08:00
cl_int res = clEndRecordingQCOM(mRecordings.back().record);
2023-12-27 17:26:44 +08:00
mRecordNums = 0;
MNN_CHECK_CL_SUCCESS(res, "clEndRecordingQCOM");
}
}
#ifdef LOG_VERBOSE
MNN_PRINT("end endRecord !\n");
#endif
#endif //ENABLE_OPENCL_TIME_PROFILER
}
2024-05-11 19:17:02 +08:00
void OpenCLBackend::addRecord(cl_recording_qcom &record, std::vector<RecordUpdateInfo *>updateInfo){
if(mDevideOpRecord){
RecordInfo info;
info.record = record;
for(int i = 0; i < updateInfo.size(); ++i) {
info.updateInfo.emplace_back(updateInfo[i]);
}
mRecordings.emplace_back(info);
}
}
void OpenCLBackend::recordKernel2d(const std::shared_ptr<KernelWrap> &kernelW, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws, RecordUpdateInfo *updateInfo) {
2023-12-27 17:26:44 +08:00
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(!mUseRecordQueue){
return;
}
2024-04-19 11:58:21 +08:00
auto kernel = kernelW->get();
2023-12-27 17:26:44 +08:00
#ifdef LOG_VERBOSE
MNN_PRINT("start record2dKernel !\n");
#endif
cl_int res = CL_SUCCESS;
if(!mDevideOpRecord){
2024-05-11 19:17:02 +08:00
RecordInfo info;
2025-02-12 11:14:19 +08:00
int recordNum = mRecordNums == mUseRecordableQueueSize ? 0 : mRecordNums;
2024-05-11 19:17:02 +08:00
if(updateInfo != nullptr){
for(int i = 0; i < updateInfo->update_kernel_args.size(); ++i){
2025-02-12 11:14:19 +08:00
updateInfo->update_kernel_args[i].dispatch_index = recordNum;
2024-05-11 19:17:02 +08:00
}
for(int i = 0; i < updateInfo->update_global_size.size(); ++i){
2025-02-12 11:14:19 +08:00
updateInfo->update_global_size[i].dispatch_index = recordNum;
2024-05-11 19:17:02 +08:00
}
for(int i = 0; i < updateInfo->update_local_size.size(); ++i){
2025-02-12 11:14:19 +08:00
updateInfo->update_local_size[i].dispatch_index = recordNum;
2024-05-11 19:17:02 +08:00
}
info.updateInfo.emplace_back(updateInfo);
}
2023-12-27 17:26:44 +08:00
if(mRecordNums == 0){
cl_recording_qcom recording = mOpenCLRuntime->recordableQueue().NewRecordingQCOM(&res);
MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM");
2024-05-11 19:17:02 +08:00
info.record = recording;
mRecordings.emplace_back(info);
2023-12-27 17:26:44 +08:00
}else if(mRecordNums == mUseRecordableQueueSize){
2024-05-11 19:17:02 +08:00
res = clEndRecordingQCOM(mRecordings.back().record);
2023-12-27 17:26:44 +08:00
MNN_CHECK_CL_SUCCESS(res, "clEndRecordingQCOM");
cl_recording_qcom recording = mOpenCLRuntime->recordableQueue().NewRecordingQCOM(&res);
MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM");
2024-05-11 19:17:02 +08:00
info.record = recording;
mRecordings.emplace_back(info);
2023-12-27 17:26:44 +08:00
mRecordNums = 0;
2024-05-11 19:17:02 +08:00
} else if(updateInfo != nullptr){
auto &lastInfo = mRecordings.back();
lastInfo.updateInfo.emplace_back(updateInfo);
2023-12-27 17:26:44 +08:00
}
mRecordNums++;
}
std::vector<uint32_t> internalGlobalWS = gws;
for (size_t i = 0; i < 2; ++i) {
internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
}
if(lws[0]==0 || lws[1]==0){
res = mOpenCLRuntime->recordableQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange, nullptr, nullptr);
}else{
res = mOpenCLRuntime->recordableQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NDRange(lws[0], lws[1]), nullptr, nullptr);
}
MNN_CHECK_CL_SUCCESS(res, "recordKernel2d");
#ifdef LOG_VERBOSE
MNN_PRINT("end record2dKernel !\n");
#endif
#endif //ENABLE_OPENCL_TIME_PROFILER
}
2024-05-11 19:17:02 +08:00
void OpenCLBackend::recordKernel3d(const std::shared_ptr<KernelWrap> &kernelW, const std::vector<uint32_t> &gws, const std::vector<uint32_t> &lws, RecordUpdateInfo *updateInfo) {
2023-12-27 17:26:44 +08:00
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
if(!mUseRecordQueue){
return;
}
2024-04-19 11:58:21 +08:00
auto kernel = kernelW->get();
2023-12-27 17:26:44 +08:00
#ifdef LOG_VERBOSE
MNN_PRINT("start record3dKernel !\n");
#endif
cl_int res = CL_SUCCESS;
std::vector<uint32_t> internalGlobalWS = gws;
for (size_t i = 0; i < 3; ++i) {
internalGlobalWS[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
}
if(!mDevideOpRecord){
2024-05-11 19:17:02 +08:00
RecordInfo info;
2025-02-12 11:14:19 +08:00
int recordNum = mRecordNums == mUseRecordableQueueSize ? 0 : mRecordNums;
2024-05-11 19:17:02 +08:00
if(updateInfo != nullptr){
for(int i = 0; i < updateInfo->update_kernel_args.size(); ++i){
2025-02-12 11:14:19 +08:00
updateInfo->update_kernel_args[i].dispatch_index = recordNum;
2024-05-11 19:17:02 +08:00
}
for(int i = 0; i < updateInfo->update_global_size.size(); ++i){
2025-02-12 11:14:19 +08:00
updateInfo->update_global_size[i].dispatch_index = recordNum;
2024-05-11 19:17:02 +08:00
}
for(int i = 0; i < updateInfo->update_local_size.size(); ++i){
2025-02-12 11:14:19 +08:00
updateInfo->update_local_size[i].dispatch_index = recordNum;
2024-05-11 19:17:02 +08:00
}
info.updateInfo.emplace_back(updateInfo);
}
2023-12-27 17:26:44 +08:00
if(mRecordNums == 0){
cl_recording_qcom recording = mOpenCLRuntime->recordableQueue().NewRecordingQCOM(&res);
MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM");
2024-05-11 19:17:02 +08:00
info.record = recording;
mRecordings.emplace_back(info);
2023-12-27 17:26:44 +08:00
}else if(mRecordNums == mUseRecordableQueueSize){
2024-05-11 19:17:02 +08:00
res = clEndRecordingQCOM(mRecordings.back().record);
2023-12-27 17:26:44 +08:00
MNN_CHECK_CL_SUCCESS(res, "clEndRecordingQCOM");
cl_recording_qcom recording = mOpenCLRuntime->recordableQueue().NewRecordingQCOM(&res);
MNN_CHECK_CL_SUCCESS(res, "clNewRecordingQCOM");
2024-05-11 19:17:02 +08:00
info.record = recording;
mRecordings.emplace_back(info);
2023-12-27 17:26:44 +08:00
mRecordNums = 0;
2024-05-11 19:17:02 +08:00
} else if(updateInfo != nullptr){
auto &lastInfo = mRecordings.back();
lastInfo.updateInfo.emplace_back(updateInfo);
2023-12-27 17:26:44 +08:00
}
mRecordNums++;
}
if(lws[0]==0 || lws[1]==0 || lws[2]==0){
res = mOpenCLRuntime->recordableQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), cl::NullRange, nullptr, nullptr);
}else{
res = mOpenCLRuntime->recordableQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, nullptr);
}
MNN_CHECK_CL_SUCCESS(res, "recordKernel3d");
#ifdef LOG_VERBOSE
MNN_PRINT("end record3dKernel !\n");
#endif
#endif //ENABLE_OPENCL_TIME_PROFILER
}
#ifdef MNN_OPENCL_SEP_BUILD
bool placeholder = []() {
static std::once_flag createOnce;
std::call_once(createOnce, []() {
2024-02-29 16:21:40 +08:00
MNNInsertExtraRuntimeCreator(MNN_FORWARD_OPENCL, new CLRuntimeCreator, true);
2023-12-27 17:26:44 +08:00
});
return true;
}();
#else
void registerOpenCLRuntimeCreator() {
registerOpenCLOps();
2024-02-29 16:21:40 +08:00
MNNInsertExtraRuntimeCreator(MNN_FORWARD_OPENCL, new CLRuntimeCreator, true);
2023-12-27 17:26:44 +08:00
}
#endif
2019-04-17 10:49:11 +08:00
} // namespace OpenCL
2023-12-27 17:26:44 +08:00
2019-04-17 10:49:11 +08:00
} // namespace MNN