| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | //
 | 
					
						
							|  |  |  | //  CPUTFQuantizedConv2D.cpp
 | 
					
						
							|  |  |  | //  MNN
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | //  Created by MNN on 2018/08/02.
 | 
					
						
							|  |  |  | //  Copyright © 2018, Alibaba Group Holding Limited
 | 
					
						
							|  |  |  | //
 | 
					
						
							| 
									
										
											  
											
												- build:
	- unify schema building in core and converter;
	- add more build script for android;
	- add linux build script for python;
- ops impl:
	- add floor mod support in binary;
	- use eltwise impl in add/max/sub/mul binary for optimization;
	- remove fake double support in cast;
	- fix 5d support for concat;
	- add adjX and adjY support for batch matmul;
	- optimize conv2d back prop filter;
	- add pad mode support for conv3d;
	- fix bug in conv2d & conv depthwise with very small feature map;
	- optimize binary without broadcast;
	- add data types support for gather;
	- add gather ND support;
	- use uint8 data type in gather v2;
	- add transpose support for matmul;
	- add matrix band part;
	- add dim != 4 support for padding, reshape & tensor convert;
	- add pad type support for pool3d;
	- make ops based on TensorFlow Lite quantization optional;
	- add all & any support for reduction;
	- use type in parameter as output type in reduction;
	- add int support for unary;
	- add variable weight support for conv2d;
	- fix conv2d depthwise weights initialization;
	- fix type support for transpose;
	- fix grad outputs count for reduce grad and reshape grad;
	- fix priorbox & detection output;
	- fix metal softmax error;
- python:
	- add runSessionWithCallBackInfo interface;
	- add max nodes limit (1400) for visualization tool;
	- fix save error in python3;
	- align default dim;
- convert:
	- add extra design for optimization;
	- add more post converting optimizers;
	- add caffe v1 weights blob support;
	- add cast, unary, conv transpose support for onnx model;
	- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
	- add cos/sin/atan/tan support for unary for tensorflow model;
	- add any/all support for reduction for tensorflow model;
	- add elu, conv3d, pool3d support for tensorflow model;
	- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
	- fix size computer lock;
	- fix thread pool deadlock;
	- add express & parameters in express;
	- rewrite blitter chooser without static map;
	- add tests for expr;
											
										 
											2019-10-29 13:37:26 +08:00
										 |  |  | #ifdef MNN_SUPPORT_TFLITE_QUAN
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "backend/cpu/CPUTFQuantizedConv2D.hpp"
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #include <math.h>
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "backend/cpu/CPUBackend.hpp"
 | 
					
						
							|  |  |  | #include "backend/cpu/CPUFixedPoint.hpp"
 | 
					
						
							|  |  |  | #include "backend/cpu/CPUQuantizationUtils.hpp"
 | 
					
						
							|  |  |  | #include "backend/cpu/compute/CommonOptFunction.h"
 | 
					
						
							|  |  |  | #include "core/Concurrency.h"
 | 
					
						
							|  |  |  | #include "core/Macro.h"
 | 
					
						
							|  |  |  | #include "core/TensorUtils.hpp"
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  | #include <arm_neon.h>
 | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define UNIT 4
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  | #define SRC_UNIT 16
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | //SRC_UNIT/UNIT
 | 
					
						
							|  |  |  | #define SRC_C4_UNIT 4
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  | // ugly macro compatible with MNNGemmInt8ToFloat32_XX
 | 
					
						
							|  |  |  | #ifdef DST_XUNIT
 | 
					
						
							|  |  |  | #undef DST_XUNIT
 | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | // One Tile Compute DST_XUNIT * outputChannel 's number
 | 
					
						
							|  |  |  | #ifdef __aarch64__
 | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  | #define DST_XUNIT 4
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							|  |  |  | #define DST_XUNIT 2
 | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | extern "C" { | 
					
						
							|  |  |  | void MNNQuanToDestUint8(uint8_t* outputInTile, const int32_t* gemmOutputAddr, const int32_t* biasData, size_t ocUnit, | 
					
						
							|  |  |  |                         size_t realDstCount, size_t dstZStep, size_t srcZstep, | 
					
						
							|  |  |  |                         const MNN::CPUTFQuantizedConv2D::QuanParameter* parameter); | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  | void MNNLoadU8AndSum(int32_t* inputSum, int8_t* colAddr, const uint8_t* inputOrigin, size_t srcZStep, size_t icDiv8, | 
					
						
							|  |  |  |                         size_t realDstCount, size_t mFilterOffset); | 
					
						
							|  |  |  | void MNNGemmint8to32_8x4_Unit(int32_t* dst, const int8_t* src, const int8_t* weight, const int32_t* inputSummer, size_t src_depth_quad, | 
					
						
							|  |  |  |                                   size_t dst_step, size_t dst_depth_quad); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #ifndef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  | void MNNGemmint8to32_8x4_Unit(int32_t* dst, const int8_t* src, const int8_t* weight, const int32_t* inputSummer, size_t src_depth_quad, | 
					
						
							|  |  |  |                               size_t dst_step, size_t dst_depth_quad) { | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |     for (int dz = 0; dz < dst_depth_quad; ++dz) { | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |         auto weight_dz = weight + src_depth_quad * dz * SRC_UNIT * UNIT; | 
					
						
							|  |  |  |         auto dst_z     = dst + dz * dst_step / sizeof(int32_t); | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |         for (int w = 0; w < DST_XUNIT; ++w) { | 
					
						
							|  |  |  |             auto dst_x = dst_z + 4 * w; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             ::memset(dst_x, 0, UNIT * sizeof(int32_t)); | 
					
						
							|  |  |  |             auto src_x = src + SRC_UNIT * w; | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |             for (int sz = 0; sz < src_depth_quad; ++sz) { | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |                 auto weight_sz = weight_dz +SRC_UNIT * UNIT * sz; | 
					
						
							|  |  |  |                 auto src_z     = src_x + sz * DST_XUNIT * SRC_UNIT; | 
					
						
							|  |  |  |                 for (int j = 0; j < UNIT; ++j) { | 
					
						
							|  |  |  |                     auto weight_j = weight_sz + j * SRC_UNIT; | 
					
						
							|  |  |  |                     for (int i = 0; i < SRC_UNIT; ++i) { | 
					
						
							|  |  |  |                         auto s0 = (int32_t)(src_z[i+0]); | 
					
						
							|  |  |  |                         auto s1 = (int32_t)(weight_j[i+0]); | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |                         dst_x[j] += s0 * s1; | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             for (int j = 0; j < UNIT; ++j) { | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |                 dst_x[j] -= inputSummer[w]; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  | void MNNLoadU8AndSum(int32_t* inputSum, int8_t* colAddr, const uint8_t* inputOrigin, size_t srcZStep, size_t icDiv8, | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                      size_t realDstCount, size_t mFilterOffset) { | 
					
						
							|  |  |  |     for (int i = 0; i < realDstCount; ++i) { | 
					
						
							|  |  |  |         inputSum[i]   = 0; | 
					
						
							|  |  |  |         auto colAddrI = colAddr + SRC_UNIT * i; | 
					
						
							|  |  |  |         auto inputK   = inputOrigin + UNIT * i; | 
					
						
							|  |  |  |         for (int sz = 0; sz < icDiv8; ++sz) { | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             auto inputZ0      = inputK + srcZStep * (SRC_C4_UNIT * sz + 0); | 
					
						
							|  |  |  |             auto inputZ1      = inputK + srcZStep * (SRC_C4_UNIT * sz + 1); | 
					
						
							|  |  |  |             auto inputZ2      = inputK + srcZStep * (SRC_C4_UNIT * sz + 2); | 
					
						
							|  |  |  |             auto inputZ3      = inputK + srcZStep * (SRC_C4_UNIT * sz + 3); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             auto indexOutside = sz; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             auto dstK0 = colAddrI + indexOutside * SRC_UNIT * DST_XUNIT; | 
					
						
							|  |  |  |             auto dstK1 = dstK0 + UNIT; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             auto dstK2 = dstK1 + UNIT; | 
					
						
							|  |  |  |             auto dstK3 = dstK2 + UNIT; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             for (int u = 0; u < UNIT; ++u) { | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |                 dstK0[u] = (int)inputZ0[u] - 128; | 
					
						
							|  |  |  |                 dstK1[u] = (int)inputZ1[u] - 128; | 
					
						
							|  |  |  |                 dstK2[u] = (int)inputZ2[u] - 128; | 
					
						
							|  |  |  |                 dstK3[u] = (int)inputZ3[u] - 128; | 
					
						
							|  |  |  |                 inputSum[i] += ((int32_t)dstK0[u] + (int32_t)dstK1[u] + (int32_t)dstK2[u] + (int32_t)dstK3[u]) * mFilterOffset; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | void MNNQuanToDestUint8(uint8_t* outputInTile, const int32_t* gemmOutputAddr, const int32_t* biasData, size_t ocUnit, | 
					
						
							|  |  |  |                         size_t realDstCount, size_t dstZStep, size_t srcZstep, | 
					
						
							|  |  |  |                         const MNN::CPUTFQuantizedConv2D::QuanParameter* parameter) { | 
					
						
							|  |  |  |     dstZStep = dstZStep / sizeof(uint8_t); | 
					
						
							|  |  |  |     srcZstep = srcZstep / sizeof(int32_t); | 
					
						
							|  |  |  |     for (int dz = 0; dz < ocUnit; ++dz) { | 
					
						
							|  |  |  |         auto dstZ  = outputInTile + dz * dstZStep; | 
					
						
							|  |  |  |         auto srcZ  = gemmOutputAddr + dz * srcZstep; | 
					
						
							|  |  |  |         auto biasZ = biasData + dz * UNIT; | 
					
						
							|  |  |  |         for (int x = 0; x < realDstCount; ++x) { | 
					
						
							|  |  |  |             auto dstX = dstZ + x * UNIT; | 
					
						
							|  |  |  |             auto srcX = srcZ + x * UNIT; | 
					
						
							|  |  |  |             for (int i = 0; i < UNIT; i++) { | 
					
						
							|  |  |  |                 int result = srcX[i]; | 
					
						
							|  |  |  |                 int acc    = result + biasZ[i]; | 
					
						
							|  |  |  |                 acc        = MNN::RoundingDivideByPOT( | 
					
						
							|  |  |  |                     MNN::SaturatingRoundingDoublingHighMul(acc * (1 << parameter->mOutputShiftBefore), | 
					
						
							|  |  |  |                                                            parameter->mOutputMultiplier), | 
					
						
							|  |  |  |                     -parameter->mOutputShiftAfter); | 
					
						
							|  |  |  |                 acc += parameter->mOutputOffset; | 
					
						
							|  |  |  |                 acc     = std::max(acc, parameter->mOutputActivationMin); | 
					
						
							|  |  |  |                 acc     = std::min(acc, parameter->mOutputActivationMax); | 
					
						
							|  |  |  |                 dstX[i] = static_cast<uint8_t>(acc); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | namespace MNN { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | CPUTFQuantizedConv2D::CPUTFQuantizedConv2D(Backend* backend, const Op* TFQuantizedConv2DOp) : Execution(backend) { | 
					
						
							|  |  |  |     mTfQuantizedConv2D_param = TFQuantizedConv2DOp->main_as_TfQuantizedConv2D(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // Input filter is of the following dimensions:
 | 
					
						
							|  |  |  |     // [ filter_rows, filter_cols, in_depth, out_depth]
 | 
					
						
							|  |  |  |     auto outputChannel               = mTfQuantizedConv2D_param->common()->outputCount(); | 
					
						
							|  |  |  |     auto kx                          = mTfQuantizedConv2D_param->common()->kernelX(); | 
					
						
							|  |  |  |     auto ky                          = mTfQuantizedConv2D_param->common()->kernelY(); | 
					
						
							|  |  |  |     int inputChannel                 = mTfQuantizedConv2D_param->weight()->size() / outputChannel / kx / ky; | 
					
						
							|  |  |  |     auto outputChannelUnit           = UP_DIV(outputChannel, UNIT); | 
					
						
							|  |  |  |     auto inputChannelUnit            = UP_DIV(inputChannel, UNIT); | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |     mIm2ColParamter                  = new ConvolutionCommon::Im2ColParameter; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     mIm2ColParamter->dilateX         = mTfQuantizedConv2D_param->common()->dilateX(); | 
					
						
							|  |  |  |     mIm2ColParamter->dilateY         = mTfQuantizedConv2D_param->common()->dilateY(); | 
					
						
							|  |  |  |     mIm2ColParamter->strideX         = mTfQuantizedConv2D_param->common()->strideX(); | 
					
						
							|  |  |  |     mIm2ColParamter->strideY         = mTfQuantizedConv2D_param->common()->strideY(); | 
					
						
							|  |  |  |     mIm2ColParamter->kernelX         = mTfQuantizedConv2D_param->common()->kernelX(); | 
					
						
							|  |  |  |     mIm2ColParamter->kernelY         = mTfQuantizedConv2D_param->common()->kernelY(); | 
					
						
							|  |  |  |     mIm2ColParamter->padX            = mTfQuantizedConv2D_param->common()->padX(); | 
					
						
							|  |  |  |     mIm2ColParamter->padY            = mTfQuantizedConv2D_param->common()->padY(); | 
					
						
							|  |  |  |     mIm2ColParamter->icDiv4          = inputChannelUnit; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |     mIm2ColParamter->kernelCountUnit = UP_DIV(inputChannelUnit * kx * ky, SRC_C4_UNIT); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     mQuanParameter = new QuanParameter; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     float inputScale  = mTfQuantizedConv2D_param->inputQuantizedParam()->scale(); | 
					
						
							|  |  |  |     float filterScale = mTfQuantizedConv2D_param->filterQuantizedParam()->scale(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         double realMultiplier          = 0.0; | 
					
						
							|  |  |  |         const double inputProductScale = inputScale * filterScale; | 
					
						
							|  |  |  |         const double outputScale       = mTfQuantizedConv2D_param->outputQuantizedParam()->scale(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         MNN_ASSERT(inputProductScale >= 0); | 
					
						
							|  |  |  |         realMultiplier = inputProductScale / outputScale; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         MNN_ASSERT(realMultiplier < 1.0); | 
					
						
							|  |  |  |         int shift = 0; | 
					
						
							|  |  |  |         QuantizeMultiplierSmallerThanOne(realMultiplier, &mQuanParameter->mOutputMultiplier, &shift); | 
					
						
							|  |  |  |         shift = -shift; | 
					
						
							|  |  |  |         if (shift < 0) { | 
					
						
							|  |  |  |             mQuanParameter->mOutputShiftBefore = 0; | 
					
						
							|  |  |  |             mQuanParameter->mOutputShiftAfter  = shift; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             mQuanParameter->mOutputShiftBefore = shift; | 
					
						
							|  |  |  |             mQuanParameter->mOutputShiftAfter  = 0; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         mFusedActivationFunction = mTfQuantizedConv2D_param->activationType(); | 
					
						
							|  |  |  |         CalculateActivationRangeUint8(mFusedActivationFunction, | 
					
						
							|  |  |  |                                       mTfQuantizedConv2D_param->outputQuantizedParam()->zeroPoint(), | 
					
						
							|  |  |  |                                       mTfQuantizedConv2D_param->outputQuantizedParam()->scale(), | 
					
						
							|  |  |  |                                       &mQuanParameter->mOutputActivationMin, &mQuanParameter->mOutputActivationMax); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     mQuanParameter->mOutputOffset = mTfQuantizedConv2D_param->outputQuantizedParam()->zeroPoint(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     auto src                = mTfQuantizedConv2D_param->weight()->data(); | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |     int32_t offsetFilter    = mTfQuantizedConv2D_param->filterQuantizedParam()->zeroPoint() - 128; | 
					
						
							|  |  |  |     auto totalKernelCountD8 = UP_DIV(inputChannelUnit * kx * ky, SRC_C4_UNIT); | 
					
						
							|  |  |  |     mWeight.reset(Tensor::create<int8_t>(std::vector<int>{outputChannelUnit, totalKernelCountD8, UNIT, SRC_UNIT})); | 
					
						
							|  |  |  |     ::memset(mWeight->host<int8_t>(), (int8_t)offsetFilter, mWeight->size()); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     std::shared_ptr<Tensor> mWeightSum; | 
					
						
							|  |  |  |     mWeightSum.reset(Tensor::create<int32_t>(std::vector<int>{outputChannelUnit, 4})); | 
					
						
							|  |  |  |     ::memset(mWeightSum->host<int32_t>(), 0, mWeightSum->size()); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |     mQuanParameter->mFilterOffset = offsetFilter; | 
					
						
							|  |  |  |     mQuanParameter->mInputOffset  = mTfQuantizedConv2D_param->inputQuantizedParam()->zeroPoint() - 128; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     mQuanParameter->mOffsetAdd = | 
					
						
							|  |  |  |         mQuanParameter->mFilterOffset * mQuanParameter->mInputOffset * totalKernelCountD8 * SRC_UNIT; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |     auto dst        = mWeight->host<int8_t>(); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     int kernelCount = kx * ky; | 
					
						
							|  |  |  |     auto weightSum  = mWeightSum->host<int32_t>(); | 
					
						
							|  |  |  |     for (int i = 0; i < outputChannel; ++i) { | 
					
						
							|  |  |  |         weightSum[i] = (int32_t)offsetFilter * totalKernelCountD8 * SRC_UNIT; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // weight format : hwio -> oc/4, (hw ic/4) / 2, oc4, (hw ic/4) % 2 ic4
 | 
					
						
							|  |  |  |     for (int k = 0; k < kernelCount; ++k) { | 
					
						
							|  |  |  |         auto srcK = src + k * inputChannel * outputChannel; | 
					
						
							|  |  |  |         for (int y = 0; y < inputChannel; ++y) { | 
					
						
							|  |  |  |             int yOutSide    = y / UNIT; | 
					
						
							|  |  |  |             int yInside     = y % UNIT; | 
					
						
							|  |  |  |             int yIndex      = yOutSide + k * inputChannelUnit; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             int ySubOutside = yIndex / SRC_C4_UNIT; | 
					
						
							|  |  |  |             int ySubInside  = yIndex % SRC_C4_UNIT; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |             auto dstY = dst + ySubOutside * UNIT * SRC_UNIT + ySubInside * UNIT + yInside; | 
					
						
							|  |  |  |             auto srcY = srcK + y * outputChannel; | 
					
						
							|  |  |  |             for (int x = 0; x < outputChannel; ++x) { | 
					
						
							|  |  |  |                 int xOutSide = x / UNIT; | 
					
						
							|  |  |  |                 int xInside  = x % UNIT; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 auto dstX = dstY + xOutSide * mWeight->stride(0) + xInside * SRC_UNIT; | 
					
						
							|  |  |  |                 auto srcX = srcY + x; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |                 dstX[0] = (int)srcX[0] - 128; | 
					
						
							|  |  |  |                 if (dstX[0] == -128) { | 
					
						
							|  |  |  |                     dstX[0] = -127; | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |                 weightSum[x] += ((int32_t)dstX[0] - (int32_t)offsetFilter); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     auto originBiasData = mTfQuantizedConv2D_param->bias()->data(); | 
					
						
							|  |  |  |     mBias.reset(outputChannelUnit * 4); | 
					
						
							|  |  |  |     auto biasData = mBias.get(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // Sum[0, kx*ky*sz](x-x0)*(w-w0) = Sum(xw) - Sum(x)*w0 - Sum(w)*x0 + x0w0*(kx*ky*sz)
 | 
					
						
							|  |  |  |     // Let bias[oz] = bias[oz] - Sum[0, kx*ky*sz](w)*x0 + x0w0*(kx*ky*sz)
 | 
					
						
							|  |  |  |     for (int i = 0; i < outputChannel; ++i) { | 
					
						
							|  |  |  |         biasData[i] = originBiasData[i] - weightSum[i] * mQuanParameter->mInputOffset + mQuanParameter->mOffsetAdd; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | CPUTFQuantizedConv2D::~CPUTFQuantizedConv2D() { | 
					
						
							|  |  |  |     delete mQuanParameter; | 
					
						
							|  |  |  |     delete mIm2ColParamter; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ErrorCode CPUTFQuantizedConv2D::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) { | 
					
						
							|  |  |  |     auto input        = inputs[0]; | 
					
						
							|  |  |  |     auto output       = outputs[0]; | 
					
						
							|  |  |  |     auto outputWidth  = output->width(); | 
					
						
							|  |  |  |     auto outputHeight = output->height(); | 
					
						
							|  |  |  |     auto inputWidth   = input->width(); | 
					
						
							|  |  |  |     auto inputHeight  = input->height(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     auto common       = mTfQuantizedConv2D_param->common(); | 
					
						
							|  |  |  |     auto strideX      = common->strideX(); | 
					
						
							|  |  |  |     auto strideY      = common->strideY(); | 
					
						
							|  |  |  |     auto filterWidth  = common->kernelX(); | 
					
						
							|  |  |  |     auto filterHeight = common->kernelY(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (common->padMode() == PadMode::PadMode_VALID) { | 
					
						
							|  |  |  |         mIm2ColParamter->padX = ((outputWidth - 1) * strideX + filterWidth - inputWidth + 1) / 2; | 
					
						
							|  |  |  |         mIm2ColParamter->padY = ((outputHeight - 1) * strideY + filterHeight - inputHeight + 1) / 2; | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         mIm2ColParamter->padX = ((outputWidth - 1) * strideX + filterWidth - inputWidth) / 2; | 
					
						
							|  |  |  |         mIm2ColParamter->padY = ((outputHeight - 1) * strideY + filterHeight - inputHeight) / 2; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     int outputChannel = common->outputCount(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     auto outputChannelUnit = UP_DIV(outputChannel, UNIT); | 
					
						
							|  |  |  |     auto kernelCountUnit   = mIm2ColParamter->kernelCountUnit; | 
					
						
							|  |  |  |     mIm2ColParamter->iw    = inputWidth; | 
					
						
							|  |  |  |     mIm2ColParamter->ih    = inputHeight; | 
					
						
							|  |  |  |     mIm2ColParamter->ow    = outputWidth; | 
					
						
							|  |  |  |     mIm2ColParamter->oh    = outputHeight; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     int tileCount = UP_DIV(outputWidth * outputHeight, DST_XUNIT); | 
					
						
							|  |  |  |     mThreadNumber = std::max(((CPUBackend*)backend())->threadNumber(), 1); | 
					
						
							|  |  |  |     mThreadNumber = std::min(mThreadNumber, tileCount); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |     mTempBuffer.buffer().type          = halide_type_of<int8_t>(); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     mTempBuffer.buffer().dimensions    = 3; | 
					
						
							|  |  |  |     mTempBuffer.buffer().dim[0].extent = mThreadNumber; | 
					
						
							|  |  |  |     mTempBuffer.buffer().dim[1].extent = DST_XUNIT; | 
					
						
							|  |  |  |     mTempBuffer.buffer().dim[2].extent = kernelCountUnit * SRC_UNIT; | 
					
						
							|  |  |  |     TensorUtils::setLinearLayout(&mTempBuffer); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mTempDstBuffer.buffer().type          = halide_type_of<int32_t>(); | 
					
						
							|  |  |  |     mTempDstBuffer.buffer().dimensions    = 3; | 
					
						
							|  |  |  |     mTempDstBuffer.buffer().dim[0].extent = mThreadNumber; | 
					
						
							|  |  |  |     mTempDstBuffer.buffer().dim[1].extent = DST_XUNIT; | 
					
						
							|  |  |  |     mTempDstBuffer.buffer().dim[2].extent = outputChannelUnit * UNIT; | 
					
						
							|  |  |  |     TensorUtils::setLinearLayout(&mTempDstBuffer); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mTempInputSum.buffer().type          = halide_type_of<int32_t>(); | 
					
						
							|  |  |  |     mTempInputSum.buffer().dimensions    = 2; | 
					
						
							|  |  |  |     mTempInputSum.buffer().dim[0].extent = mThreadNumber; | 
					
						
							|  |  |  |     mTempInputSum.buffer().dim[1].extent = DST_XUNIT; | 
					
						
							|  |  |  |     TensorUtils::setLinearLayout(&mTempInputSum); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC); | 
					
						
							|  |  |  |     backend()->onAcquireBuffer(&mTempDstBuffer, Backend::DYNAMIC); | 
					
						
							|  |  |  |     backend()->onAcquireBuffer(&mTempInputSum, Backend::DYNAMIC); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); | 
					
						
							|  |  |  |     backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC); | 
					
						
							|  |  |  |     backend()->onReleaseBuffer(&mTempInputSum, Backend::DYNAMIC); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  | static void _im2ColCommon(int32_t* inputSum, int8_t* colAddr, const uint8_t* inputOrigin, | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                           const CPUTFQuantizedConv2D::QuanParameter* quanParamter, | 
					
						
							| 
									
										
										
										
											2020-11-05 16:41:56 +08:00
										 |  |  |                           const ConvolutionCommon::Im2ColParameter* im2ColParameter, size_t xIndexStart, | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                           size_t realDstCount) { | 
					
						
							|  |  |  |     int colBufferSize = im2ColParameter->kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(uint8_t); | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |     ::memset(colAddr, (int8_t)quanParamter->mInputOffset, colBufferSize); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     auto ih        = im2ColParameter->ih; | 
					
						
							|  |  |  |     auto iw        = im2ColParameter->iw; | 
					
						
							|  |  |  |     auto kh        = im2ColParameter->kernelY; | 
					
						
							|  |  |  |     auto kw        = im2ColParameter->kernelX; | 
					
						
							|  |  |  |     auto dilateX   = im2ColParameter->dilateX; | 
					
						
							|  |  |  |     auto dilateY   = im2ColParameter->dilateY; | 
					
						
							|  |  |  |     auto icDiv4    = im2ColParameter->icDiv4; | 
					
						
							|  |  |  |     auto srcZStep  = iw * ih * UNIT; | 
					
						
							|  |  |  |     int countSumC8 = im2ColParameter->kernelCountUnit; | 
					
						
							|  |  |  |     for (int i = 0; i < realDstCount; ++i) { | 
					
						
							|  |  |  |         int xIndex = (int)xIndexStart + i; | 
					
						
							|  |  |  |         int ox     = xIndex % im2ColParameter->ow; | 
					
						
							|  |  |  |         int oy     = xIndex / im2ColParameter->ow; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         int sx = ox * im2ColParameter->strideX - im2ColParameter->padX; | 
					
						
							|  |  |  |         int sy = oy * im2ColParameter->strideY - im2ColParameter->padY; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         int sfy = ALIMAX(0, (UP_DIV(-sy, im2ColParameter->dilateX))); | 
					
						
							|  |  |  |         int efy = ALIMIN(kh, UP_DIV(ih - sy, im2ColParameter->dilateY)); | 
					
						
							|  |  |  |         int sfx = ALIMAX(0, (UP_DIV(-sx, im2ColParameter->dilateX))); | 
					
						
							|  |  |  |         int efx = ALIMIN(kw, UP_DIV(iw - sx, im2ColParameter->dilateX)); | 
					
						
							|  |  |  |         int fyC = efy - sfy; | 
					
						
							|  |  |  |         int fxC = efx - sfx; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         auto colAddrI    = colAddr + SRC_UNIT * i; | 
					
						
							|  |  |  |         auto inputOffset = inputOrigin + (sx + sy * iw) * UNIT + (sfx * dilateX) * UNIT + (sfy * dilateY) * iw * UNIT; | 
					
						
							|  |  |  |         auto indexOffset = (sfy * kw + sfx) * icDiv4; | 
					
						
							|  |  |  |         for (int fy = 0; fy < fyC; ++fy) { | 
					
						
							|  |  |  |             for (int fx = 0; fx < fxC; ++fx) { | 
					
						
							|  |  |  |                 auto inputK     = inputOffset + (fx * dilateX) * UNIT + (fy * dilateY) * iw * UNIT; | 
					
						
							|  |  |  |                 auto indexStart = indexOffset + (fy * kw + fx) * icDiv4; | 
					
						
							|  |  |  |                 for (int sz = 0; sz < icDiv4; ++sz) { | 
					
						
							|  |  |  |                     auto inputZ       = inputK + srcZStep * sz; | 
					
						
							|  |  |  |                     auto index        = indexStart + sz; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |                     auto indexInside  = index % SRC_C4_UNIT; | 
					
						
							|  |  |  |                     auto indexOutside = index / SRC_C4_UNIT; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |                     auto dstK         = colAddrI + indexOutside * SRC_UNIT * DST_XUNIT + UNIT * indexInside; | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |                     //TODO Optimize it
 | 
					
						
							|  |  |  |                     for (int j=0; j<UNIT; ++j) { | 
					
						
							|  |  |  |                         dstK[j] = (int32_t)inputZ[j] - 128; | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                     //*((int32_t*)dstK) = *((int32_t*)inputZ);
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         int32_t inputSumValue = 0; | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |         int32x2_t inputSumValueC4 = vmov_n_s32(0); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							|  |  |  |         for (int j = 0; j < countSumC8; ++j) { | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |             auto colAddrIJ = colAddrI + j * SRC_UNIT * DST_XUNIT; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             auto p0 = vld1_s8(colAddrIJ + 0); | 
					
						
							|  |  |  |             auto p1 = vld1_s8(colAddrIJ + 8); | 
					
						
							|  |  |  |             auto q0 = vpaddl_s8(p0); | 
					
						
							|  |  |  |             auto q1 = vpaddl_s8(p1); | 
					
						
							|  |  |  |             inputSumValueC4 += vpaddl_s16(q0); | 
					
						
							|  |  |  |             inputSumValueC4 += vpaddl_s16(q1); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |             for (int k = 0; k < SRC_UNIT; ++k) { | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |                 inputSumValue += colAddrIJ[k]; | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |         inputSumValue = inputSumValueC4[0] + inputSumValueC4[1]; | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-08-22 20:13:46 +08:00
										 |  |  |         inputSum[i] = inputSumValue * quanParamter->mFilterOffset; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
// Runs the quantized convolution: per batch, tiles the output pixels into
// DST_XUNIT-wide groups, im2col's each tile, runs the int8->int32 GEMM, then
// requantizes the int32 accumulators back to uint8 into the output tensor.
// Tiles are distributed round-robin across mThreadNumber threads.
ErrorCode CPUTFQuantizedConv2D::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor* input = inputs[0];

    const int strideX = mIm2ColParamter->strideX;
    const int strideY = mIm2ColParamter->strideY;
    auto batchs       = input->batch();
    auto ic           = input->channel();
    auto iw           = input->width();
    auto ih           = input->height();
    auto output       = outputs[0];
    auto oc           = output->channel();
    auto oh           = output->height();
    auto ow           = output->width();

    auto ocUnit = UP_DIV(oc, UNIT);       // output channels in UNIT-sized groups
    int icDiv4  = UP_DIV(ic, UNIT);       // input channels in UNIT-sized groups
    int kh      = mIm2ColParamter->kernelY;
    int kw      = mIm2ColParamter->kernelX;

    auto kernelCountUnit = mIm2ColParamter->kernelCountUnit;
    int outputCount      = ow * oh;                            // output pixels per batch
    int outputCountTile  = UP_DIV(outputCount, DST_XUNIT);     // tiles of DST_XUNIT pixels

    // 1x1 / stride-1 / no-padding convolutions with channel count divisible by
    // SRC_C4_UNIT can skip the generic im2col and use the fused load+sum kernel.
    bool fastMode = kw == 1 && kh == 1 && strideX == 1 && strideY == 1 && mIm2ColParamter->padY == 0 &&
                    mIm2ColParamter->padX == 0 && icDiv4 % SRC_C4_UNIT == 0;
    auto gemmFunction = MNNGemmint8to32_8x4_Unit;
    const int* biasData = mBias.get();

    for (int batchIndex = 0; batchIndex < batchs; ++batchIndex) {
        auto inputOrigin  = input->host<uint8_t>() + batchIndex * input->stride(0);
        auto weightOrigin = mWeight->host<int8_t>();
        auto outputOrigin = output->host<uint8_t>() + batchIndex * output->stride(0);

        MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) {
            // Per-thread scratch slices: dim[0] of each temp tensor is indexed by thread id.
            auto colAddr        = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
            auto gemmOutputAddr = mTempDstBuffer.host<int32_t>() + tId * mTempDstBuffer.buffer().dim[0].stride;
            auto inputSum       = mTempInputSum.host<int32_t>() + mTempInputSum.stride(0) * tId;

            // Round-robin tile assignment: thread tId handles tiles tId, tId+N, ...
            for (int tIndex = (int)tId; tIndex < outputCountTile; tIndex += mThreadNumber) {
                int xIndexStart  = tIndex * DST_XUNIT;
                // Last tile may be partial.
                int realDstCount = ALIMIN(outputCount - xIndexStart, DST_XUNIT);
                /*Im2Col Begin*/
                if (fastMode) {
                    MNNLoadU8AndSum(inputSum, colAddr, inputOrigin + UNIT * xIndexStart, iw * ih * UNIT, icDiv4 / SRC_C4_UNIT,
                                    realDstCount, mQuanParameter->mFilterOffset);
                } else {
                    _im2ColCommon(inputSum, colAddr, inputOrigin, mQuanParameter, mIm2ColParamter, xIndexStart,
                                  realDstCount);
                }

                /*Im2Col End*/

                // GEMM
                gemmFunction(gemmOutputAddr, colAddr, weightOrigin, inputSum, kernelCountUnit, UNIT * DST_XUNIT * sizeof(int32_t),
                                          ocUnit);

                /*Copy Data to Real Output*/
                // Requantize int32 accumulators (+bias, +offsets) to uint8 and
                // scatter the tile back into the output tensor.
                auto outputInTile = outputOrigin + xIndexStart * UNIT;
                MNNQuanToDestUint8(outputInTile, gemmOutputAddr, biasData, ocUnit, realDstCount,
                                   ow * oh * UNIT * sizeof(uint8_t), DST_XUNIT * UNIT * sizeof(int32_t),
                                   mQuanParameter);
            }
        }

        MNN_CONCURRENCY_END();
    }

    return NO_ERROR;
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class CPUTFQuantizedConv2DCreator : public CPUBackend::Creator { | 
					
						
							|  |  |  | public: | 
					
						
							|  |  |  |     virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, | 
					
						
							|  |  |  |                                 const MNN::Op* op, Backend* backend) const { | 
					
						
							|  |  |  |         return new CPUTFQuantizedConv2D(backend, op); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | }; | 
					
						
// Register the creator with the CPU backend for the TfQuantizedConv2D op type.
REGISTER_CPU_OP_CREATOR(CPUTFQuantizedConv2DCreator, OpType_TfQuantizedConv2D);
					
						
							|  |  |  | } // namespace MNN
 | 
					
						
							| 
									
										
											  
											
												- build:
	- unify schema building in core and converter;
	- add more build script for android;
	- add linux build script for python;
- ops impl:
	- add floor mod support in binary;
	- use eltwise impl in add/max/sub/mul binary for optimization;
	- remove fake double support in cast;
	- fix 5d support for concat;
	- add adjX and adjY support for batch matmul;
	- optimize conv2d back prop filter;
	- add pad mode support for conv3d;
	- fix bug in conv2d & conv depthwise with very small feature map;
	- optimize binary without broacast;
	- add data types support for gather;
	- add gather ND support;
	- use uint8 data type in gather v2;
	- add transpose support for matmul;
	- add matrix band part;
	- add dim != 4 support for padding, reshape & tensor convert;
	- add pad type support for pool3d;
	- make ops based on TensorFlow Lite quantization optional;
	- add all & any support for reduction;
	- use type in parameter as output type in reduction;
	- add int support for unary;
	- add variable weight support for conv2d;
	- fix conv2d depthwise weights initialization;
	- fix type support for transpose;
	- fix grad outputs count for  reduce grad and reshape grad;
	- fix priorbox & detection output;
	- fix metal softmax error;
- python:
	- add runSessionWithCallBackInfo interface;
	- add max nodes limit (1400) for visualization tool;
	- fix save error in python3;
	- align default dim;
- convert:
	- add extra design for optimization;
	- add more post converting optimizers;
	- add caffe v1 weights blob support;
	- add cast, unary, conv transpose support for onnx model;
	- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
	- add cos/sin/atan/tan support for unary for tensorflow model;
	- add any/all support for reduction for tensorflow model;
	- add elu, conv3d, pool3d support for tensorflow model;
	- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
	- fix size computer lock;
	- fix thread pool deadlock;
	- add express & parameters in express;
	- rewrite blitter chooser without static map;
	- add tests for expr;
											
										 
											2019-10-29 13:37:26 +08:00
										 |  |  | #endif
 |