| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | //
 | 
					
						
							|  |  |  | //  OpCommonUtils.cpp
 | 
					
						
							|  |  |  | //  MNN
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | //  Created by MNN on 2020/03/08.
 | 
					
						
							|  |  |  | //  Copyright © 2018, Alibaba Group Holding Limited
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-03-08 10:20:18 +08:00
										 |  |  | #include "OpCommonUtils.hpp"
 | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  | #include "core/Execution.hpp"
 | 
					
						
							| 
									
										
										
										
											2020-03-09 18:02:01 +08:00
										 |  |  | #include "MNN_generated.h"
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | #include "Macro.h"
 | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-03-08 10:20:18 +08:00
										 |  |  | namespace MNN { | 
					
						
							| 
									
										
										
										
											2025-05-23 15:21:41 +08:00
										 |  |  | bool OpCommonUtils::checkNet(const void* buffer, size_t length) { | 
					
						
							|  |  |  |     // For mnn build mini, skip check buffer, it will reduce 100k size
 | 
					
						
							|  |  |  | #ifndef MNN_SKIPBUILD_GEOMETRY
 | 
					
						
							|  |  |  |     flatbuffers::Verifier verify((const uint8_t*)buffer, length); | 
					
						
							|  |  |  |     if (false == VerifyNetBuffer(verify)) { | 
					
						
							|  |  |  |         MNN_PRINT("Invalidate buffer to create MNN Module\n"); | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     // Check Auto Inputs and Outputs
 | 
					
						
							|  |  |  |     auto net = GetNet(buffer); | 
					
						
							|  |  |  |     if (nullptr == net->oplists() || nullptr == net->tensorName()) { | 
					
						
							|  |  |  |         MNN_ERROR("Invalid net, for null oplist or tensorName\n"); | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  | Tensor::DimensionType OpCommonUtils::convertDimType(MNN_DATA_FORMAT dimensionFormat) { | 
					
						
							|  |  |  |     auto dimType = Tensor::CAFFE; | 
					
						
							|  |  |  |     switch (dimensionFormat) { | 
					
						
							|  |  |  |         case MNN_DATA_FORMAT_NCHW: | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case MNN_DATA_FORMAT_NC4HW4: | 
					
						
							|  |  |  |             dimType = Tensor::CAFFE_C4; | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case MNN_DATA_FORMAT_NHWC: | 
					
						
							|  |  |  |             dimType = Tensor::TENSORFLOW; | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         default: | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return dimType; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  | void OpCommonUtils::loadBlobData(FileLoader* loader, const Op* op, char* ptr, int size) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     if (OpParameter_Blob != op->main_type()) { | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |         return; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     } | 
					
						
							|  |  |  |     auto b       = op->main_as_Blob(); | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |     if (USE_EXTERNAL_DATA(b)) { | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  |         if (op->externalPath() != nullptr) { | 
					
						
							|  |  |  |             loader = new FileLoader(op->externalPath()->c_str()); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         loadExternalDatas(loader, { ptr }, b->external()->data()); | 
					
						
							|  |  |  |         if (op->externalPath() != nullptr) { | 
					
						
							|  |  |  |             delete loader; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     void* result = nullptr; | 
					
						
							|  |  |  |     switch (b->dataType()) { | 
					
						
							|  |  |  |         case DataType_DT_FLOAT: | 
					
						
							|  |  |  |             result = (void*)b->float32s()->Data(); | 
					
						
							|  |  |  |             break; | 
					
						
							| 
									
										
										
										
											2023-09-20 20:16:25 +08:00
										 |  |  |         case DataType_DT_BFLOAT16: | 
					
						
							|  |  |  |             result = (void*)b->uint8s()->Data(); | 
					
						
							|  |  |  |             break; | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         case DataType_DT_INT32: | 
					
						
							|  |  |  |             result = (void*)b->int32s()->Data(); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case DataType_DT_QUINT8: | 
					
						
							| 
									
										
										
										
											2021-09-18 15:52:30 +08:00
										 |  |  |         case DataType_DT_UINT8: | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |             result = (void*)b->uint8s()->Data(); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             break; | 
					
						
							|  |  |  |         case DataType_DT_INT8: | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |             result = (void*)b->int8s()->Data(); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             break; | 
					
						
							|  |  |  |         default: | 
					
						
							|  |  |  |             MNN_ASSERT(false); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |     ::memcpy(ptr, result, size); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static std::tuple<int, int, int> _split(int offset, int axisL, int area) { | 
					
						
							|  |  |  |     int inside  = offset % area; | 
					
						
							|  |  |  |     int temp    = offset / area; | 
					
						
							|  |  |  |     int axis    = temp % axisL; | 
					
						
							|  |  |  |     int outside = temp / axisL; | 
					
						
							|  |  |  |     return std::make_tuple(inside, axis, outside); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  | static std::tuple<bool, bool, bool> _computeAxisFused(const std::tuple<int, int, int>& dstTup) { | 
					
						
							|  |  |  |     bool ncFused = std::get<1>(dstTup) > 0 && std::get<2>(dstTup) > 0; | 
					
						
							|  |  |  |     bool nwFused = std::get<0>(dstTup) > 0 && std::get<2>(dstTup) > 0; | 
					
						
							|  |  |  |     bool cwFused = std::get<1>(dstTup) > 0 && std::get<0>(dstTup) > 0; | 
					
						
							|  |  |  |     return std::make_tuple(ncFused, cwFused, nwFused); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  | static std::tuple<int, int, int> _computeStride(const std::tuple<int, int, int>& srcTup, const std::tuple<int, int, int>& srcSplit, int step, bool swapnc, int stride) { | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |     int inside  = std::get<0>(srcTup) / step; | 
					
						
							|  |  |  |     int axis    = std::get<1>(srcTup) / step; | 
					
						
							|  |  |  |     int outside = std::get<2>(srcTup) / step; | 
					
						
							|  |  |  |     auto fuse = _computeAxisFused(srcTup); | 
					
						
							|  |  |  |     if (std::get<0>(fuse)) { | 
					
						
							|  |  |  |         // nc fused
 | 
					
						
							|  |  |  |         if (swapnc) { | 
					
						
							|  |  |  |             axis = 0; | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |             outside = stride / std::get<0>(srcSplit); | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |         } else { | 
					
						
							|  |  |  |             outside = 0; | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |             axis = stride / std::get<0>(srcSplit); | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } else if (std::get<2>(fuse)) { | 
					
						
							|  |  |  |         // nw fused
 | 
					
						
							|  |  |  |         outside = 0; | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |         inside = stride; | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |     } else if (std::get<1>(fuse)) { | 
					
						
							|  |  |  |         // cw fused
 | 
					
						
							|  |  |  |         axis = 0; | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |         inside = stride; | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |     } | 
					
						
							|  |  |  |     return std::make_tuple(inside, axis, outside); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  | static bool _checkFuseValid(const OpCommonUtils::SPLITS& srcTup, const OpCommonUtils::SPLITS& srcSplits, bool swapnc, bool swapcw, const std::tuple<bool,bool,bool>& valid) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |     auto srcFused = _computeAxisFused(srcTup); | 
					
						
							|  |  |  |     if (swapnc) { | 
					
						
							|  |  |  |         // cw can't be fused if n > 1, because layout is c, n, w
 | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         if (std::get<1>(srcFused) && std::get<2>(valid)) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (std::get<0>(srcFused)) { | 
					
						
							|  |  |  |             // nc fuse but n is not full, don't support fuse
 | 
					
						
							|  |  |  |             if (std::get<2>(srcTup) + 1 != std::get<2>(srcSplits)) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } else if (swapcw) { | 
					
						
							|  |  |  |         // nc can't be fused if w > 1
 | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         if (std::get<0>(srcFused) && std::get<0>(valid)) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (std::get<1>(srcFused)) { | 
					
						
							|  |  |  |             // cw fuse but c is not full, don't support fuse
 | 
					
						
							|  |  |  |             if (std::get<1>(srcTup) + 1 != std::get<1>(srcSplits)) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         // nw can't be fused if c > 1
 | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         if (std::get<2>(srcFused) && std::get<1>(valid)) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         // nc can't be fused if w > 1
 | 
					
						
							|  |  |  |         if (std::get<0>(srcFused) && std::get<0>(valid)) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         // cw can't be fused if n > 1, because layout is c, n, w
 | 
					
						
							|  |  |  |         if (std::get<1>(srcFused) && std::get<2>(valid)) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | bool OpCommonUtils::canBlitFast(const Tensor::InsideDescribe::Region& region, const SPLITS& srcSplits, | 
					
						
							| 
									
										
										
										
											2022-09-30 10:02:52 +08:00
										 |  |  |                                 const SPLITS& dstSplits, int pack, bool swapnc, bool swapcw) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     int srcCOffset = (region.src.offset / std::get<0>(srcSplits)) % std::get<1>(srcSplits); | 
					
						
							|  |  |  |     if (srcCOffset % pack != 0) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int dstCOffset = (region.dst.offset / std::get<0>(dstSplits)) % std::get<1>(dstSplits); | 
					
						
							|  |  |  |     if (dstCOffset % pack != 0) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |     auto wValid = std::get<0>(srcSplits) > 1 || std::get<0>(dstSplits) > 1; | 
					
						
							|  |  |  |     auto cValid = std::get<1>(srcSplits) > 1 || std::get<1>(dstSplits) > 1; | 
					
						
							|  |  |  |     auto nValid = std::get<2>(srcSplits) > 1 || std::get<2>(dstSplits) > 1; | 
					
						
							|  |  |  |     auto valid = std::make_tuple(wValid, cValid, nValid); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     // Check Dst stride
 | 
					
						
							|  |  |  |     for (int i = 0; i < 3; ++i) { | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         if (region.size[i] <= 1) { | 
					
						
							|  |  |  |             continue; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         int dstStride  = (region.size[i] - 1) * region.dst.stride[i]; | 
					
						
							|  |  |  |         auto srcStride = region.src.stride[i] * (region.size[i] - 1); | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         auto dstTup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits)); | 
					
						
							|  |  |  |         auto srcTup = _split(srcStride, std::get<1>(srcSplits), std::get<0>(srcSplits)); | 
					
						
							|  |  |  |         if (std::get<1>(dstTup) != std::get<1>(srcTup)) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         if (!_checkFuseValid(srcTup, srcSplits, swapnc, swapcw, valid)) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             return false; | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         if (!_checkFuseValid(dstTup, dstSplits, swapnc, swapcw, valid)) { | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |             return false; | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     } | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | void OpCommonUtils::turnToPackRegion(const Tensor::InsideDescribe::Region& region, | 
					
						
							|  |  |  |                                      Tensor::InsideDescribe::Region& c4Region, const SPLITS& srcSplits, | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |                                      const SPLITS& dstSplits, int pack, bool swapnc) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     int srcAxisC4  = UP_DIV(std::get<1>(srcSplits), pack); | 
					
						
							|  |  |  |     auto dstAxisC4 = UP_DIV(std::get<1>(dstSplits), pack); | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |     auto fuseSrc = std::get<0>(srcSplits) * std::get<2>(srcSplits); | 
					
						
							|  |  |  |     auto fuseDst = std::get<0>(dstSplits) * std::get<2>(dstSplits); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for (int i = 0; i < 3; ++i) { | 
					
						
							|  |  |  |         int dstStride = (region.size[i] - 1) * region.dst.stride[i]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Get Last Point's inside, axis, outside postion
 | 
					
						
							|  |  |  |         auto tup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits)); | 
					
						
							|  |  |  |         if (std::get<1>(tup) > 0) { | 
					
						
							|  |  |  |             // The size has axis offset, divide the axis and mul axisC4 instead
 | 
					
						
							|  |  |  |             auto midC4       = UP_DIV(std::get<1>(tup) + 1, pack); | 
					
						
							|  |  |  |             c4Region.size[i] = region.size[i] / (std::get<1>(tup) + 1) * midC4; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     for (int i = 0; i < 3; ++i) { | 
					
						
							| 
									
										
										
										
											2021-08-03 14:54:18 +08:00
										 |  |  |         if (region.size[i] <= 1) { | 
					
						
							|  |  |  |             // No need compute stride
 | 
					
						
							|  |  |  |             c4Region.src.stride[i] = 0; | 
					
						
							|  |  |  |             c4Region.dst.stride[i] = 0; | 
					
						
							|  |  |  |             continue; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         int step = region.size[i] - 1; | 
					
						
							|  |  |  |         int dstStride  = region.dst.stride[i] * step; | 
					
						
							|  |  |  |         auto srcStride = region.src.stride[i] * step; | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         auto dstTup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits)); | 
					
						
							|  |  |  |         auto srcTup = _split(srcStride, std::get<1>(srcSplits), std::get<0>(srcSplits)); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         { | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |             auto tup = _computeStride(srcTup, srcSplits, step, swapnc, region.src.stride[i]); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             int inside  = std::get<0>(tup); | 
					
						
							|  |  |  |             int axis    = std::get<1>(tup); | 
					
						
							|  |  |  |             int outside = std::get<2>(tup); | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |             if (swapnc) { | 
					
						
							|  |  |  |                 c4Region.src.stride[i] = | 
					
						
							|  |  |  |                     outside * std::get<0>(srcSplits) + axis * std::get<0>(srcSplits) * std::get<2>(srcSplits) + inside; | 
					
						
							|  |  |  |             } else { | 
					
						
							|  |  |  |                 c4Region.src.stride[i] = | 
					
						
							|  |  |  |                     outside * srcAxisC4 * std::get<0>(srcSplits) + axis * std::get<0>(srcSplits) + inside; | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |         { | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |             auto tup = _computeStride(dstTup, dstSplits, step, swapnc, region.dst.stride[i]); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |             int inside  = std::get<0>(tup); | 
					
						
							|  |  |  |             int axis    = std::get<1>(tup); | 
					
						
							|  |  |  |             int outside = std::get<2>(tup); | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |             if (swapnc) { | 
					
						
							|  |  |  |                 c4Region.dst.stride[i] = | 
					
						
							|  |  |  |                     outside * std::get<0>(dstSplits) + axis * std::get<0>(dstSplits) * std::get<2>(dstSplits) + inside; | 
					
						
							|  |  |  |             } else { | 
					
						
							|  |  |  |                 c4Region.dst.stride[i] = | 
					
						
							|  |  |  |                     outside * dstAxisC4 * std::get<0>(dstSplits) + axis * std::get<0>(dstSplits) + inside; | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         // Origin offset is compute as NCHW
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         auto offsetTup      = _split(region.src.offset, std::get<1>(srcSplits), std::get<0>(srcSplits)); | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         if (swapnc) { | 
					
						
							|  |  |  |             // New offset is compute as C4NHW
 | 
					
						
							|  |  |  |             c4Region.src.offset = std::get<2>(offsetTup) * pack * std::get<0>(srcSplits) | 
					
						
							|  |  |  |                 + std::get<1>(offsetTup) * std::get<0>(srcSplits) * std::get<2>(srcSplits) | 
					
						
							|  |  |  |                 + std::get<0>(offsetTup) * pack; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             c4Region.src.offset = std::get<2>(offsetTup) * srcAxisC4 * pack * std::get<0>(srcSplits) + | 
					
						
							|  |  |  |                                   std::get<1>(offsetTup) * std::get<0>(srcSplits) + std::get<0>(offsetTup) * pack; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     } | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         // Origin offset is compute as NCHW
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |         auto offsetTup      = _split(region.dst.offset, std::get<1>(dstSplits), std::get<0>(dstSplits)); | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         if (swapnc) { | 
					
						
							|  |  |  |             // New offset is compute as C4NHW
 | 
					
						
							|  |  |  |             c4Region.dst.offset = std::get<2>(offsetTup) * pack * std::get<0>(dstSplits) | 
					
						
							|  |  |  |                 + std::get<1>(offsetTup) * std::get<0>(dstSplits) * std::get<2>(dstSplits) | 
					
						
							|  |  |  |                 + std::get<0>(offsetTup) * pack; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             c4Region.dst.offset = std::get<2>(offsetTup) * dstAxisC4 * pack * std::get<0>(dstSplits) + | 
					
						
							|  |  |  |                                   std::get<1>(offsetTup) * std::get<0>(dstSplits) + std::get<0>(offsetTup) * pack; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  | #ifdef MNN_DEBUG_BLIT
 | 
					
						
							|  |  |  |     MNN_PRINT("Src WCN: %d-%d-%d, Dst WCN:%d-%d-%d\n", std::get<0>(srcSplits), std::get<1>(srcSplits), std::get<2>(srcSplits), std::get<0>(dstSplits), std::get<1>(dstSplits), std::get<2>(dstSplits)); | 
					
						
							|  |  |  |     MNN_PRINT("Origin:%d, %d, %d, %d, src: %d - %d, %d, %d, dst: %d - %d, %d, %d\n", pack, | 
					
						
							|  |  |  |     region.size[0],region.size[1], region.size[2], region.src | 
					
						
							|  |  |  |               .offset, region.src.stride[0], region.src.stride[1], region.src.stride[2], region.dst.offset, | 
					
						
							|  |  |  |               region.dst.stride[0], region.dst .stride[1], region.dst.stride[2]); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     MNN_PRINT("Pack:%d, %d, %d, %d, src: %d - %d, %d, %d, dst: %d - %d, %d, %d\n", pack, | 
					
						
							|  |  |  |      c4Region.size[0],c4Region.size[1], c4Region.size[2], c4Region.src | 
					
						
							|  |  |  |                .offset, c4Region.src.stride[0], c4Region.src.stride[1], c4Region.src.stride[2], c4Region.dst.offset, | 
					
						
							|  |  |  |                c4Region.dst.stride[0], c4Region.dst .stride[1], c4Region.dst.stride[2]); | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-30 10:02:52 +08:00
										 |  |  | bool OpCommonUtils::canBlitFast(const Tensor::InsideDescribe::Region& region, const Tensor* dest, int pack, bool swapnc, bool swapcw) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     auto src    = region.origin; | 
					
						
							|  |  |  |     int srcArea = 1; | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |     // FIXME: Support dimensions = 1
 | 
					
						
							|  |  |  |     if (src->dimensions() == 1 || dest->dimensions() == 1) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     for (int i = 2; i < src->dimensions(); ++i) { | 
					
						
							|  |  |  |         srcArea *= src->length(i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int dstArea = 1; | 
					
						
							|  |  |  |     for (int i = 2; i < dest->dimensions(); ++i) { | 
					
						
							|  |  |  |         dstArea *= dest->length(i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int inputBatch   = 1; | 
					
						
							|  |  |  |     int inputChannel = 1; | 
					
						
							|  |  |  |     if (src->dimensions() > 0) { | 
					
						
							|  |  |  |         inputBatch = src->length(0); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (src->dimensions() > 1) { | 
					
						
							|  |  |  |         inputChannel = src->length(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int dstBatch   = 1; | 
					
						
							|  |  |  |     int dstChannel = 1; | 
					
						
							|  |  |  |     if (dest->dimensions() > 0) { | 
					
						
							|  |  |  |         dstBatch = dest->length(0); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (dest->dimensions() > 1) { | 
					
						
							|  |  |  |         dstChannel = dest->length(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return canBlitFast(region, std::make_tuple(srcArea, inputChannel, inputBatch), | 
					
						
							| 
									
										
										
										
											2022-09-30 10:02:52 +08:00
										 |  |  |                        std::make_tuple(dstArea, dstChannel, dstBatch), pack, swapnc, swapcw); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void OpCommonUtils::turnToPackRegion(const Tensor::InsideDescribe::Region& region, | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |                                      Tensor::InsideDescribe::Region& c4Region, const Tensor* dest, int pack, bool swapnc) { | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     c4Region    = region; | 
					
						
							|  |  |  |     auto src    = region.origin; | 
					
						
							|  |  |  |     int srcArea = 1; | 
					
						
							|  |  |  |     for (int i = 2; i < src->dimensions(); ++i) { | 
					
						
							|  |  |  |         srcArea *= src->length(i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int dstArea = 1; | 
					
						
							|  |  |  |     for (int i = 2; i < dest->dimensions(); ++i) { | 
					
						
							|  |  |  |         dstArea *= dest->length(i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int inputBatch   = 1; | 
					
						
							|  |  |  |     int inputChannel = 1; | 
					
						
							|  |  |  |     if (src->dimensions() > 0) { | 
					
						
							|  |  |  |         inputBatch = src->length(0); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (src->dimensions() > 1) { | 
					
						
							|  |  |  |         inputChannel = src->length(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int dstBatch   = 1; | 
					
						
							|  |  |  |     int dstChannel = 1; | 
					
						
							|  |  |  |     if (dest->dimensions() > 0) { | 
					
						
							|  |  |  |         dstBatch = dest->length(0); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (dest->dimensions() > 1) { | 
					
						
							|  |  |  |         dstChannel = dest->length(1); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     turnToPackRegion(region, c4Region, std::make_tuple(srcArea, inputChannel, inputBatch), | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |                      std::make_tuple(dstArea, dstChannel, dstBatch), pack, swapnc); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | void OpCommonUtils::broastCastComputeDim(int* dims, int* stride, int* iStride0, int* iStride1, const Tensor* input0, | 
					
						
							|  |  |  |                                          const Tensor* input1, const Tensor* output) { | 
					
						
							|  |  |  |     for (int i = MNN_MAX_TENSOR_DIM - 1; i >= 0; --i) { | 
					
						
							| 
									
										
										
										
											2020-03-08 10:20:18 +08:00
										 |  |  |         dims[i]     = 1; | 
					
						
							|  |  |  |         stride[i]   = 0; | 
					
						
							|  |  |  |         iStride0[i] = 0; | 
					
						
							|  |  |  |         iStride1[i] = 0; | 
					
						
							|  |  |  |         int input0I = i - (output->dimensions() - input0->dimensions()); | 
					
						
							|  |  |  |         int input1I = i - (output->dimensions() - input1->dimensions()); | 
					
						
							|  |  |  |         if (i < output->dimensions()) { | 
					
						
							|  |  |  |             dims[i]   = output->length(i); | 
					
						
							|  |  |  |             stride[i] = output->stride(i); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (input0I >= 0 && input0->length(input0I) != 1) { | 
					
						
							|  |  |  |             iStride0[i] = input0->stride(input0I); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (input1I >= 0 && input1->length(input1I) != 1) { | 
					
						
							|  |  |  |             iStride1[i] = input1->stride(input1I); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
// Flatten a reduction over arbitrary axes into a list of
// (outsideSize, axisSize, insideSize) loops, where axisSize is the extent
// being reduced and outside/inside are the products of the surrounding
// non-reduced dimensions. Axes may come from a second input tensor
// (dynamic axes) or from the op's ReductionParam.
std::vector<std::tuple<int, int, int>> OpCommonUtils::computeReduceDims(const std::vector<Tensor*>& inputs,
                                                                        const Op* op) {
    // Compute axises
    std::vector<int> axises;
    if (inputs.size() >= 2) {
        // Dynamic case: axes are supplied as the second input's content.
        auto size = inputs[1]->elementSize();
        auto dims = inputs[1]->host<int32_t>();
        for (int i = 0; i < size; ++i) {
            axises.emplace_back(dims[i]);
        }
    } else {
        // Static case: axes are baked into the op parameter.
        auto reduct = op->main_as_ReductionParam();
        if (nullptr != reduct->dim()) {
            for (int i = 0; i < reduct->dim()->size(); ++i) {
                axises.emplace_back(reduct->dim()->data()[i]);
            }
        }
    }
    auto totalSize = TensorUtils::getRawSize(inputs[0]);
    // No axes: reduce over everything, as a single (1, total, 1) loop.
    if (axises.empty()) {
        return {std::make_tuple(1, totalSize, 1)};
    }
    for (int i = 0; i < axises.size(); ++i) {
        if (axises[i] < 0) {
            // Normalize negative axes relative to the input's rank.
            axises[i] = inputs[0]->dimensions() + axises[i];
            // Still out of range: treat as a full reduction (fallback).
            if (axises[i] < 0) {
                return {std::make_tuple(1, totalSize, 1)};
            }
        }
    }
    // Cache for input's dims
    std::vector<int> lengths(inputs[0]->dimensions());
    for (int i = 0; i < lengths.size(); ++i) {
        lengths[i] = inputs[0]->length(i);
    }
    // Each entry is (startAxis, runLength) for a run of consecutive axes.
    std::vector<std::pair<int, int>> groupAxises;
    {
        // Merge adj axis
        // Sort, then coalesce consecutive axes (e.g. {1,2,3}) into one group
        // so they can be reduced as a single flattened dimension.
        std::sort(axises.begin(), axises.end());
        int lastAxis = axises[0];
        int length   = 1;
        int start    = axises[0];
        for (int i = 1; i < axises.size(); ++i) {
            // MNN_PRINT("%d - %d\n", axises[i], lastAxis);
            if (axises[i] - lastAxis == 1) {
                length++;
            } else {
                groupAxises.emplace_back(std::make_pair(start, length));
                length = 1;
                start  = axises[i];
            }
            lastAxis = axises[i];
        }
        groupAxises.emplace_back(std::make_pair(start, length));
    }

    // Compute inside-outside-axis
    std::vector<std::tuple<int, int, int>> result;

    for (int i = 0; i < groupAxises.size(); ++i) {
        int outsideSize = 1;
        int insideSize  = 1;
        int axisSize    = 1;
        auto start      = groupAxises[i].first;
        auto length     = groupAxises[i].second;
        // Groups are sorted; once a start falls past the rank, all later ones do too.
        if (start >= (int)lengths.size()) {
            break;
        }
        for (int j = 0; j < start; ++j) {
            outsideSize *= lengths[j];
        }
        for (int j = start; j < start + length; ++j) {
            if (j >= (int)lengths.size()) {
                break;
            }
            axisSize *= lengths[j];
            // Mark this axis as already reduced so later groups see length 1.
            lengths[j] = 1;
        }
        for (int j = start + length; j < lengths.size(); ++j) {
            insideSize *= lengths[j];
        }
        // Reducing over a length-1 extent is a no-op; skip it.
        if (1 == axisSize) {
            continue;
        }
        result.emplace_back(std::make_tuple(outsideSize, axisSize, insideSize));
    }
    // FUNC_PRINT(result.size());
    // Every group degenerated to a no-op: emit one identity pass over the data.
    if (result.empty()) {
        result.emplace_back(std::make_tuple(1, 1, totalSize));
    }
    return result;
}
					
						
							| 
									
										
										
										
											2021-11-30 10:10:53 +08:00
										 |  |  | void OpCommonUtils::unravelIndexHelper(int32_t* coordinate, const int32_t* mod, int size, | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |                                        int indice) { | 
					
						
							|  |  |  |     int value = indice; | 
					
						
							|  |  |  |     for (int i = 0; i < size; ++i) { | 
					
						
							|  |  |  |         coordinate[i] = value / mod[i]; | 
					
						
							|  |  |  |         value         = value % mod[i]; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-03-08 10:20:18 +08:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | int OpCommonUtils::computeStride(int32_t* strides, const int* shape, int length) { | 
					
						
							|  |  |  |     if (length <= 0) { | 
					
						
							|  |  |  |         return 1; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int stride = 1; | 
					
						
							|  |  |  |     for (int i = length - 1; i >= 0; --i) { | 
					
						
							|  |  |  |         strides[i] = stride; | 
					
						
							|  |  |  |         stride *= shape[i]; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return stride; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-27 17:26:44 +08:00
										 |  |  | bool OpCommonUtils::opNeedContent(const MNN::Op* op, int index) { | 
					
						
							|  |  |  |     int type = op->type(); | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |     switch (type) { | 
					
						
							|  |  |  |         case OpType_ZerosLike: | 
					
						
							|  |  |  |         case OpType_ZeroGrad: | 
					
						
							|  |  |  |         case OpType_Shape: | 
					
						
							|  |  |  |         case OpType_Rank: | 
					
						
							|  |  |  |         case OpType_Const: | 
					
						
							|  |  |  |         case OpType_Size: | 
					
						
							|  |  |  |         case OpType_PriorBox: | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |         case OpType_Interp: | 
					
						
							|  |  |  |         case OpType_Crop: | 
					
						
							|  |  |  |         case OpType_Reshape: | 
					
						
							|  |  |  |         case OpType_Reduction: | 
					
						
							|  |  |  |         case OpType_Resize: | 
					
						
							|  |  |  |             if (1 == index) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             break; | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  |         case OpType_GridSample: | 
					
						
							|  |  |  |             if (2 == index) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  | #ifdef MNN_SUPPORT_RENDER
 | 
					
						
							|  |  |  |         case OpType_RasterAndInterpolate: | 
					
						
							| 
									
										
										
										
											2023-12-27 17:26:44 +08:00
										 |  |  |         { | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  |             if (0 == index) { | 
					
						
							| 
									
										
										
										
											2023-12-27 17:26:44 +08:00
										 |  |  |                 int type = 4; | 
					
						
							|  |  |  |                 if (op->main_type() == OpParameter_Extra) { | 
					
						
							|  |  |  |                     auto extra = op->main_as_Extra(); | 
					
						
							|  |  |  |                     if (nullptr != extra->attr()) { | 
					
						
							|  |  |  |                         for (int i=0; i<extra->attr()->size(); ++i) { | 
					
						
							|  |  |  |                             auto attr = extra->attr()->GetAs<Attribute>(i); | 
					
						
							|  |  |  |                             if (attr->key()->str() == "primitiveType") { | 
					
						
							|  |  |  |                                 type = attr->i(); | 
					
						
							|  |  |  |                                 break; | 
					
						
							|  |  |  |                             } | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 if (type <= 4) { | 
					
						
							|  |  |  |                     return false; | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |             break; | 
					
						
							| 
									
										
										
										
											2023-12-27 17:26:44 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |         default: | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  | bool OpCommonUtils::opCompabilityForLowp(const Op* op, int bytes) { | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |     switch (op->type()) { | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  |         case OpType_While: | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             if (bytes == 4) { | 
					
						
							|  |  |  |                 return true; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             if (op->main_type() != OpParameter_LoopParam) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             // Check fuse
 | 
					
						
							|  |  |  |             auto loop = op->main_as_LoopParam(); | 
					
						
							|  |  |  |             if (nullptr != loop->initCommand()) { | 
					
						
							|  |  |  |                 for (int i=0; i<loop->initCommand()->size(); ++i) { | 
					
						
							|  |  |  |                     auto cmd = loop->initCommand()->GetAs<RegionCommand>(i); | 
					
						
							|  |  |  |                     if (cmd->fuse() >= 0) { | 
					
						
							|  |  |  |                         return false; | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             if (nullptr != loop->commands()) { | 
					
						
							|  |  |  |                 for (int i=0; i<loop->commands()->size(); ++i) { | 
					
						
							|  |  |  |                     auto cmd = loop->commands()->GetAs<RegionCommand>(i); | 
					
						
							|  |  |  |                     if (cmd->fuse() >= 0) { | 
					
						
							|  |  |  |                         return false; | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             return true; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |         case OpType_Scale: | 
					
						
							|  |  |  |         case OpType_Convolution: | 
					
						
							|  |  |  |         case OpType_ConvolutionDepthwise: | 
					
						
							|  |  |  |         case OpType_Deconvolution: | 
					
						
							|  |  |  |         case OpType_DeconvolutionDepthwise: | 
					
						
							|  |  |  |         case OpType_MatMul: | 
					
						
							|  |  |  |         case OpType_BatchMatMul: | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |         case OpType_BinaryOp: | 
					
						
							|  |  |  |         case OpType_Eltwise: | 
					
						
							|  |  |  |         case OpType_UnaryOp: | 
					
						
							|  |  |  |         case OpType_Pooling: | 
					
						
							| 
									
										
											  
											
												[MNN:Sync] Sync internal github
Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model
											
										 
											2021-07-29 11:46:59 +08:00
										 |  |  |         case OpType_Raster: | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |         case OpType_ReLU: | 
					
						
							|  |  |  |         case OpType_ReLU6: | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  |         case OpType_Select: | 
					
						
							| 
									
										
										
										
											2021-06-11 17:17:13 +08:00
										 |  |  |         case OpType_PReLU: | 
					
						
							| 
									
										
										
										
											2021-06-23 14:10:31 +08:00
										 |  |  |         case OpType_GridSample: | 
					
						
							| 
									
										
										
										
											2022-01-29 18:03:25 +08:00
										 |  |  |         case OpType_ROIPooling: | 
					
						
							|  |  |  |         case OpType_ROIAlign: | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  |         case OpType_RNNSequenceGRU: | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  |         case OpType_DynamicQuant: | 
					
						
							| 
									
										
										
										
											2024-05-11 19:17:02 +08:00
										 |  |  |         case OpType_Attention: | 
					
						
							| 
									
										
										
										
											2024-06-03 20:09:34 +08:00
										 |  |  |         case OpType_LayerNorm: | 
					
						
							|  |  |  |         case OpType_Softmax: | 
					
						
							| 
									
										
										
										
											2021-04-08 15:34:23 +08:00
										 |  |  |             return true; | 
					
						
							|  |  |  |         default: | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return false; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  | void OpCommonUtils::rasterInputReset(const std::vector<Tensor *> &inputs, Tensor *output) { | 
					
						
							|  |  |  |     auto outputDes = TensorUtils::getDescribe(output); | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  |     // For output with copy, the region's size may not be the same as input's size
 | 
					
						
							|  |  |  |     outputDes->regions.resize(inputs.size()); | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |     for (int i=0; i<outputDes->regions.size(); ++i) { | 
					
						
							|  |  |  |         outputDes->regions[i].origin = inputs[i]; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
// Rebuild an op whose weights live in an external file: unpack `origin`,
// read the externally-stored tensors back into the op table, and serialize
// the self-contained result into `builder`.
//
// @param external  default file loader for external data; may be replaced by a
//                  temporary loader when the op carries its own externalPath.
// @param origin    the serialized op referencing external data.
// @param builder   receives the repacked, self-contained op on success.
// @return false only when `external` is nullptr; true otherwise.
//
// NOTE(review): read() results are not checked, so a truncated external file
// would silently leave buffers partially filled — confirm callers tolerate this.
static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flatbuffers::FlatBufferBuilder& builder) {
    if (nullptr == external) {
        MNN_ERROR("Can't rebuild external op because external is nullptr\n");
        return false;
    }
    builder.Clear();
    bool externalTmp = false;
    // An op-level externalPath overrides the shared loader; the temporary
    // loader is deleted at the end of this function.
    if (nullptr != origin->externalPath()) {
        external = new FileLoader(origin->externalPath()->c_str());
        externalTmp = true;
    }
    std::shared_ptr<MNN::OpT> op(origin->UnPack());
    // Layout convention: external[0] = byte offset of the op's first blob,
    // external[1..] = byte sizes of the successive blobs.
    switch (op->main.type) {
        case OpParameter_Scale:
        {
            auto scale = op->main.AsScale();
            int outputCount = static_cast<int>(scale->external[1] / sizeof(float));
            scale->scaleData.resize(outputCount);
            external->offset(scale->external[0]);
            external->read((char*)scale->scaleData.data(), scale->external[1]);
            if (scale->external.size() > 2) {
                // Optional bias follows the scale data.
                // assumes external[2] == external[1] (same element count) — TODO confirm
                scale->biasData.resize(outputCount);
                external->read((char*)scale->biasData.data(), scale->external[2]);
            }
            break;
        }
        case OpParameter_LayerNorm:
        {
            auto layer_norm_param = op->main.AsLayerNorm();
            int32_t size = static_cast<int32_t>(layer_norm_param->external[1]);
            // gamma and beta have the same element count; sized from external[1].
            layer_norm_param->gamma.resize(size / sizeof(float));
            layer_norm_param->beta.resize(size / sizeof(float));
            external->offset(layer_norm_param->external[0]);
            external->read((char*)layer_norm_param->gamma.data(), layer_norm_param->external[1]);
            external->read((char*)layer_norm_param->beta.data(), layer_norm_param->external[2]);
            break;
        }

        case OpParameter_Convolution2D:
        {
            auto param = op->main.AsConvolution2D();
            if (param->quanParameter) {
                bool isSparse = param->sparseParameter.get() != nullptr;
                bool isPTQ = param->quanParameter->scaleIn != 0;
                if (isSparse || isPTQ) {
                    // Sparse / post-training-quantized weights must be loaded now:
                    // external[1] = quantized buffer bytes, external[2] = alpha bytes.
                    external->offset(param->external[0]);
                    if (0 != param->external[1]) {
                        param->quanParameter->buffer.resize(param->external[1]);
                        external->read((char*)param->quanParameter->buffer.data(), param->external[1]);
                    }
                    param->quanParameter->alpha.resize(param->external[2] / sizeof(float));
                    external->read((char*)param->quanParameter->alpha.data(), param->external[2]);
                } else {
                    // skip weight and dequant alpha for load speed
                    // Keep the path so ConvolutionCommon can lazily load later,
                    // and seek past the skipped blobs to reach bias/index data.
                    op->externalPath = external->path();
                    external->offset(param->external[0] + param->external[1] + param->external[2]);
                }
                // external[3] (when present) holds the bias bytes; a zero size
                // means "bias exists but is all zeros" — allocate it zero-filled.
                if (param->bias.empty() && param->external.size() > 3) {
		            if (param->external[3] > 0) {
                       param->bias.resize(param->external[3]/sizeof(float));
                       external->read((char*)param->bias.data(), param->external[3]);
                    } else {
                       param->bias.resize(param->common->outputCount);
		            }
		        }
                // external[4] (when present) holds sparse index data (uint32).
                if (param->quanParameter->index.empty() && param->external.size() > 4) {
                    if (param->external[4] > 0) {
                        param->quanParameter->index.resize(param->external[4]/sizeof(uint32_t));
                        external->read((char*)param->quanParameter->index.data(), param->external[4]);
                    }
                }
            } else {
                // Create quanParameter, will load external weight in ConvolutionCommon::load
                param->quanParameter.reset(new IDSTQuanT);
                param->quanParameter->type = 8;
                op->externalPath = external->path();
                // Only the bias (external[2]) is read eagerly; weights stay on disk.
                param->bias.resize(param->external[2] / sizeof(float));
                external->offset(param->external[0] + param->external[1]);
                external->read((char*)param->bias.data(), param->external[2]);
            }
            break;
        }
        default:
            break;
    }
    if (externalTmp) {
        delete external;
    }
    builder.Finish(Op::Pack(builder, op.get()));
    return true;
}
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
// Create an Execution for `op`, transparently materializing any weights that
// are stored in an external file.
//
// @param backend       backend that creates the execution.
// @param inputs        input tensors passed through to Backend::onCreate.
// @param outputs       output tensors passed through to Backend::onCreate.
// @param op            the (possibly external-weight) op to instantiate.
// @param externalFile  loader for the model's external data file.
// @param tmpstore      receives ownership of the rebuilt op buffer when the
//                      created execution keeps referencing it (clone failed).
// @return the created execution, or nullptr on failure.
Execution* OpCommonUtils::createExecutionWithExternal(Backend* backend, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                              const MNN::Op* op, FileLoader* externalFile, std::shared_ptr<BufferStorage>& tmpstore) {
#ifdef MNN_SKIPBUILD_GEOMETRY
    // Mini build: no external-weight rebuilding; create directly.
    return backend->onCreate(inputs, outputs, op);
#else
    // Only these three op parameter types can carry external weight data.
    bool hasExternal = false;
    switch (op->main_type()) {
        case OpParameter_Convolution2D:
            hasExternal = USE_EXTERNAL_DATA(op->main_as_Convolution2D());
            break;
        case OpParameter_Scale:
            hasExternal = USE_EXTERNAL_DATA(op->main_as_Scale());
            break;
        case OpParameter_LayerNorm:
            hasExternal = USE_EXTERNAL_DATA(op->main_as_LayerNorm());
            break;
        default:
            break;
    }
    if (!hasExternal) {
        return backend->onCreate(inputs, outputs, op);
    }
    flatbuffers::FlatBufferBuilder builder;
    // With cached mmap (hint > 1) the backend reads weights itself, so the op
    // is passed through without rebuilding.
    bool usemmap = false;
    if (backend && backend->getRuntime()) {
        usemmap = (backend->getRuntime()->hint().useCachedMmap > 1);
    }
    Execution* execution;
    if ((!usemmap)) {
        // Rebuild a self-contained op (weights inlined) inside `builder`,
        // then create the execution from that temporary buffer.
        bool res = _RebuildExternalOp(externalFile, op, builder);
        if (!res) {
            MNN_ERROR("Rebuild External Op failed\n");
            return nullptr;
        }
        auto newOp = flatbuffers::GetRoot<MNN::Op>(builder.GetBufferPointer());
        execution  = backend->onCreate(inputs, outputs, newOp);
    } else {
        execution  = backend->onCreate(inputs, outputs, op);
    }
    if (nullptr == execution) {
        return execution;
    }
    if (op->main_type() == OpParameter_Convolution2D) {
        // For convolution / deconvolution, execution will need op info after created, try clone it and remove newOp
        Execution* copyExe = nullptr;
        execution->onClone(backend, op, &copyExe);
        if (nullptr != copyExe) {
            // The clone is bound to the original `op`, so the rebuilt buffer in
            // `builder` can be dropped when this function returns.
            delete execution;
            return copyExe;
        } else {
#ifdef DEBUG
            MNN_ERROR("Clone error for convolution/deconvolution, will Increase memory\n");
#endif
            // Clone unsupported: the execution may still reference `builder`'s
            // memory, so transfer ownership of that buffer to `tmpstore`.
            tmpstore.reset(new BufferStorage);
            tmpstore->storage = builder.ReleaseRaw(tmpstore->allocated_size, tmpstore->offset);
        }
    }
    return execution;
#endif
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void OpCommonUtils::loadExternalDatas(FileLoader* fileloader, std::vector<char*> addrs,  const int64_t* external) { | 
					
						
							|  |  |  |     fileloader->offset(external[0]); | 
					
						
							|  |  |  |     for (int i = 0; i < addrs.size(); i++) { | 
					
						
							|  |  |  |         fileloader->read(addrs[i], external[i+1]); | 
					
						
							| 
									
										
										
										
											2022-12-30 15:18:58 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) { | 
					
						
							|  |  |  |     if (t->dimensions() == 0) { | 
					
						
							|  |  |  |         batch = 1; | 
					
						
							|  |  |  |         channel = 1; | 
					
						
							|  |  |  |         area = 1; | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (t->dimensions() == 1) { | 
					
						
							|  |  |  |         batch = t->length(0); | 
					
						
							|  |  |  |         channel = 1; | 
					
						
							|  |  |  |         area = 1; | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     batch = t->length(0); | 
					
						
							|  |  |  |     channel = t->length(1); | 
					
						
							|  |  |  |     area = 1; | 
					
						
							|  |  |  |     for (int i=2; i<t->dimensions(); ++i) { | 
					
						
							|  |  |  |         area *= t->length(i); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | bool OpCommonUtils::isTranspose(const Tensor::InsideDescribe::Region& region, int& srcOne, int& dstOne) { | 
					
						
							|  |  |  |     srcOne = -1; | 
					
						
							|  |  |  |     dstOne = -1; | 
					
						
							|  |  |  |     for (int i = 0; i < 3; i++) { | 
					
						
							|  |  |  |         if (region.size[i] == 1) { | 
					
						
							|  |  |  |             continue; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (region.src.stride[i] == 1) { | 
					
						
							|  |  |  |             if (srcOne >= 0) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             srcOne = i; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (region.dst.stride[i] == 1) { | 
					
						
							|  |  |  |             if (dstOne >= 0) { | 
					
						
							|  |  |  |                 return false; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             dstOne = i; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2024-04-19 11:58:21 +08:00
										 |  |  | bool OpCommonUtils::supportDynamicInputMemory(MNNForwardType type) { | 
					
						
							|  |  |  |     if (type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_VULKAN) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  | 
 | 
					
						
// Classify whether a raster region between tensors of different layouts can be
// executed as a whole-tensor format conversion instead of a generic raster.
//
// Result via info.type:
//   0 - not a pure conversion (default),
//   1 - straight copy between layouts covering the whole tensor,
//   2 - transpose matching an NCHW<->NHWC style relayout.
// info.batch/channel/area are filled from the NC4HW4-side tensor whenever one
// side is NC4HW4 (even when type stays 0).
void OpCommonUtils::turnRegion2Convert(const Tensor::InsideDescribe::Region& region, const Tensor* dest, OpCommonUtils::TensorConvertParameter& info) {
    auto origin = region.origin;
    auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat;
    auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat;
    info.type = 0;
    // Same layout: nothing to convert.
    if (srcFormat == dstFormat) {
        return;
    }
    // Only conversions involving NC4HW4 on one side are handled.
    if (srcFormat != MNN_DATA_FORMAT_NC4HW4 && dstFormat != MNN_DATA_FORMAT_NC4HW4) {
        return;
    }
    // Identify which side is NC4HW4; batch/channel/area come from that side.
    const Tensor* nc4hw4Tensor = origin;
    const Tensor* originTensor = dest;
    if (dstFormat == MNN_DATA_FORMAT_NC4HW4) {
        nc4hw4Tensor = dest;
        originTensor = origin;
    }
    getBatchChannelArea(nc4hw4Tensor, info.batch, info.channel, info.area);
    // Offsets would mean only a sub-tensor is touched: not a whole-tensor convert.
    if (0 != region.src.offset || 0 != region.dst.offset) {
        return;
    }
    if (TensorUtils::isCopyRegion(region)) {
        // Copy region covering every element -> plain layout conversion.
        if (info.batch * info.channel * info.area == region.size[0] * region.size[1] * region.size[2]) {
            info.type = 1;
            return;
        }
        return;
    }
    int srcOne, dstOne;
    if (isTranspose(region, srcOne, dstOne)) {
        // Find the axis untouched by the transpose; it must carry the batch.
        int keepDim = -1;
        for (int i = 0; i < 3; i++) {
            if (i != srcOne && i != dstOne) {
                keepDim = i;
                break;
            }
        }
        if (info.batch == region.size[keepDim]) {
            if ((info.channel == region.size[srcOne] && info.area == region.size[dstOne]) // NCHW
               || (info.area == region.size[srcOne] && info.channel == region.size[dstOne])) {// NHWC
                // Require both tensors and the region to cover exactly the same
                // element count before treating this as a full relayout.
                auto srcSize = TensorUtils::getRawSize(originTensor);
                auto dstSize = TensorUtils::getRawSize(nc4hw4Tensor);
                auto regionSize = region.size[0] * region.size[1] * region.size[2];
                if (srcSize != dstSize || regionSize != srcSize) {
                    return;
                }
                info.type = 2;
                return;
            }
            return;
        }
    }
    return;
}
					
						
							| 
									
										
										
										
											2023-12-04 11:12:20 +08:00
										 |  |  | bool OpCommonUtils::computeMatMulSize(bool transposeA, bool transposeB, const Tensor* A, const Tensor* B, int& e, int& l, int& h) { | 
					
						
							|  |  |  |     auto i0Dim = A->dimensions(); | 
					
						
							|  |  |  |     auto i1Dim = B->dimensions(); | 
					
						
							|  |  |  |     if (i0Dim < 1 || i1Dim < 1) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     int w0, h0; | 
					
						
							|  |  |  |     int w1, h1; | 
					
						
							|  |  |  |     if (i0Dim == 1) { | 
					
						
							|  |  |  |         w0 = A->length(0); | 
					
						
							|  |  |  |         h0 = 1; | 
					
						
							|  |  |  |         transposeA = false; | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         w0 = A->length(i0Dim - 1); | 
					
						
							|  |  |  |         h0 = A->length(i0Dim - 2); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (i1Dim == 1) { | 
					
						
							|  |  |  |         w1 = 1; | 
					
						
							|  |  |  |         h1 = B->length(0); | 
					
						
							|  |  |  |         transposeB = false; | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         w1 = B->length(i1Dim - 1); | 
					
						
							|  |  |  |         h1 = B->length(i1Dim - 2); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (transposeA) { | 
					
						
							|  |  |  |         auto t = w0; | 
					
						
							|  |  |  |         w0     = h0; | 
					
						
							|  |  |  |         h0     = t; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (transposeB) { | 
					
						
							|  |  |  |         auto t = w1; | 
					
						
							|  |  |  |         w1     = h1; | 
					
						
							|  |  |  |         h1     = t; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (w0 != h1) { | 
					
						
							|  |  |  |         return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     e = h0; | 
					
						
							|  |  |  |     l = w0; | 
					
						
							|  |  |  |     h = w1; | 
					
						
							|  |  |  |     return true; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-04-18 18:54:46 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-18 14:37:45 +08:00
										 |  |  | DataType OpCommonUtils::convertDataType(halide_type_t type) { | 
					
						
							|  |  |  |     if (type.code == halide_type_float) { | 
					
						
							|  |  |  |         return DataType_DT_FLOAT; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (type.code == halide_type_uint && type.bits == 8) { | 
					
						
							|  |  |  |         return DataType_DT_UINT8; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (type.code == halide_type_int && type.bits == 8) { | 
					
						
							|  |  |  |         return DataType_DT_INT8; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (type.code == halide_type_int && type.bits == 32) { | 
					
						
							|  |  |  |         return DataType_DT_INT32; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return DataType_DT_INVALID; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | } // namespace MNN
 |