| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | //
 | 
					
						
							|  |  |  | //  CPUPool.cpp
 | 
					
						
							|  |  |  | //  MNN
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | //  Created by MNN on 2018/07/15.
 | 
					
						
							|  |  |  | //  Copyright © 2018, Alibaba Group Holding Limited
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "backend/cpu/CPUPool.hpp"
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #include <float.h>
 | 
					
						
							|  |  |  | #include <math.h>
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "core/Macro.h"
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  | #include <arm_neon.h>
 | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | #include "core/Concurrency.h"
 | 
					
						
							|  |  |  | #include "math/Vec4.hpp"
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | using Vec4 = MNN::Math::Vec4; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | static void pooling_max_pad(const float *channelInput, float *offsetOutput, int inputWidth, int inputHeight, | 
					
						
							|  |  |  |                             int inputStep4, int inputSize4, int kernelWidth, int kernelHeight, int iw, int ih) { | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |     float32x4_t max = vdupq_n_f32(-FLT_MAX); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |     float max0 = -FLT_MAX; | 
					
						
							|  |  |  |     float max1 = -FLT_MAX; | 
					
						
							|  |  |  |     float max2 = -FLT_MAX; | 
					
						
							|  |  |  |     float max3 = -FLT_MAX; | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const float *bottomLine = channelInput + inputSize4 - inputStep4; | 
					
						
							|  |  |  |     for (int kh = 0; kh < kernelHeight; kh++) { | 
					
						
							|  |  |  |         const int h                  = ih + kh; | 
					
						
							|  |  |  |         const float *paddedLineInput = nullptr; | 
					
						
							|  |  |  |         if (h < 0) { // top replicate
 | 
					
						
							|  |  |  |             paddedLineInput = channelInput; | 
					
						
							|  |  |  |         } else if (h >= inputHeight) { // bottom replicate
 | 
					
						
							|  |  |  |             paddedLineInput = bottomLine; | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |             paddedLineInput = channelInput + h * inputStep4; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         const float *rightEdge = paddedLineInput + inputStep4 - 4; | 
					
						
							|  |  |  |         for (int kw = 0; kw < kernelWidth; kw++) { | 
					
						
							|  |  |  |             const int w              = iw + kw; | 
					
						
							|  |  |  |             const float *cursorInput = nullptr; | 
					
						
							|  |  |  |             if (w < 0) { // left replicate
 | 
					
						
							|  |  |  |                 cursorInput = paddedLineInput; | 
					
						
							|  |  |  |             } else if (w >= inputWidth) { // right replicate
 | 
					
						
							|  |  |  |                 cursorInput = rightEdge; | 
					
						
							|  |  |  |             } else { | 
					
						
							|  |  |  |                 cursorInput = paddedLineInput + 4 * w; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |             max = vmaxq_f32(max, vld1q_f32(cursorInput)); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |             max0 = std::max(max0, cursorInput[0]); | 
					
						
							|  |  |  |             max1 = std::max(max1, cursorInput[1]); | 
					
						
							|  |  |  |             max2 = std::max(max2, cursorInput[2]); | 
					
						
							|  |  |  |             max3 = std::max(max3, cursorInput[3]); | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |     vst1q_f32(offsetOutput, max); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |     offsetOutput[0] = max0; | 
					
						
							|  |  |  |     offsetOutput[1] = max1; | 
					
						
							|  |  |  |     offsetOutput[2] = max2; | 
					
						
							|  |  |  |     offsetOutput[3] = max3; | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  | static void poolingMax(const float *channelInput, int inputWidth, int inputHeight, float *channelOutput, | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                        int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                        int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     int padTop    = padHeight <= 0 ? 0 : (padHeight + strideHeight - 1) / strideHeight; | 
					
						
							|  |  |  |     int padBottom = (padHeight + inputHeight - kernelHeight) / strideHeight + 1; | 
					
						
							|  |  |  |     int padLeft   = padWidth <= 0 ? 0 : (padWidth + strideWidth - 1) / strideWidth; | 
					
						
							|  |  |  |     int padRight  = (padWidth + inputWidth - kernelWidth) / strideWidth + 1; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const int inputStep4       = 4 * inputWidth; | 
					
						
							|  |  |  |     const int inputSize4       = inputStep4 * inputHeight; | 
					
						
							|  |  |  |     const int strideInputStep4 = strideHeight * inputStep4; | 
					
						
							|  |  |  |     const int outputStep4      = 4 * outputWidth; | 
					
						
							|  |  |  |     const int strideWidth4     = 4 * strideWidth; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     { // handle paddings top
 | 
					
						
							|  |  |  |         float *lineOutput = channelOutput; | 
					
						
							|  |  |  |         for (int oh = 0, ih = -padHeight; oh < padTop; oh++, ih += strideHeight, lineOutput += outputStep4) { | 
					
						
							|  |  |  |             float *offsetOutput = lineOutput; | 
					
						
							|  |  |  |             for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { | 
					
						
							|  |  |  |                 pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, | 
					
						
							|  |  |  |                                 kernelWidth, kernelHeight, iw, ih); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |         for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4) { | 
					
						
							|  |  |  |             float *offsetOutput = lineOutput; | 
					
						
							|  |  |  |             for (int ow = 0, iw = -padWidth; ow < padLeft; ow++, iw += strideWidth, offsetOutput += 4) { | 
					
						
							|  |  |  |                 pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, | 
					
						
							|  |  |  |                                 kernelWidth, kernelHeight, iw, ih); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |             offsetOutput = lineOutput + padRight * 4; | 
					
						
							|  |  |  |             for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4) { | 
					
						
							|  |  |  |                 pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, | 
					
						
							|  |  |  |                                 kernelWidth, kernelHeight, iw, ih); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |         for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4) { | 
					
						
							|  |  |  |             float *offsetOutput = lineOutput; | 
					
						
							|  |  |  |             for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { | 
					
						
							|  |  |  |                 pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, | 
					
						
							|  |  |  |                                 kernelWidth, kernelHeight, iw, ih); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     { // handle no paddings
 | 
					
						
							|  |  |  |         const float *lineInput = | 
					
						
							|  |  |  |             channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; | 
					
						
							|  |  |  |         float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |         for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { | 
					
						
							|  |  |  |             const float *offsetInput = lineInput; | 
					
						
							|  |  |  |             float *offsetOutput      = lineOutput; | 
					
						
							|  |  |  |             for (int ow = padLeft, iw = -padWidth + ow * strideWidth; ow < padRight; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 float32x4_t max = vdupq_n_f32(-FLT_MAX); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 float max0 = -FLT_MAX; | 
					
						
							|  |  |  |                 float max1 = -FLT_MAX; | 
					
						
							|  |  |  |                 float max2 = -FLT_MAX; | 
					
						
							|  |  |  |                 float max3 = -FLT_MAX; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 const float *kernelInput = offsetInput; | 
					
						
							|  |  |  |                 for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { | 
					
						
							|  |  |  |                     const float *cursorInput = kernelInput; | 
					
						
							|  |  |  |                     for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                         max = vmaxq_f32(max, vld1q_f32(cursorInput)); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                         max0 = std::max(max0, cursorInput[0]); | 
					
						
							|  |  |  |                         max1 = std::max(max1, cursorInput[1]); | 
					
						
							|  |  |  |                         max2 = std::max(max2, cursorInput[2]); | 
					
						
							|  |  |  |                         max3 = std::max(max3, cursorInput[3]); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							|  |  |  |                     } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 vst1q_f32(offsetOutput, max); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 offsetOutput[0] = max0; | 
					
						
							|  |  |  |                 offsetOutput[1] = max1; | 
					
						
							|  |  |  |                 offsetOutput[2] = max2; | 
					
						
							|  |  |  |                 offsetOutput[3] = max3; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void poolingAvgPad(const float *offsetInput, float *offsetOutput, int inputWidth, int inputHeight, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                           int kernelWidth, int kernelHeight, int inputStep4, int iw, int ih, int padWidth, | 
					
						
							|  |  |  |                           int padHeight, MNN::PoolPadType padType) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |     float32x4_t sum = vdupq_n_f32(0); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |     float sum0 = 0; | 
					
						
							|  |  |  |     float sum1 = 0; | 
					
						
							|  |  |  |     float sum2 = 0; | 
					
						
							|  |  |  |     float sum3 = 0; | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     const int khs = 0 < -ih ? -ih : 0;                                                 // max
 | 
					
						
							|  |  |  |     const int khe = kernelHeight < inputHeight - ih ? kernelHeight : inputHeight - ih; // min
 | 
					
						
							|  |  |  |     const int kws = 0 < -iw ? -iw : 0;                                                 // max
 | 
					
						
							|  |  |  |     const int kwe = kernelWidth < inputWidth - iw ? kernelWidth : inputWidth - iw;     // min
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     // sum
 | 
					
						
							|  |  |  |     int count = 0; | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |     if (padType == MNN::PoolPadType_CAFFE) { | 
					
						
							|  |  |  |         count = (ALIMIN(ih + kernelHeight, inputHeight + padHeight) - ih) * | 
					
						
							|  |  |  |                 (ALIMIN(iw + kernelWidth, inputWidth + padWidth) - iw); | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  |         count = (khe - khs) * (kwe - kws); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     const float *kernelInput = offsetInput + khs * inputStep4; | 
					
						
							|  |  |  |     for (int kh = khs; kh < khe; kh++, kernelInput += inputStep4) { | 
					
						
							|  |  |  |         const float *cursorInput = kernelInput + kws * 4; | 
					
						
							|  |  |  |         for (int kw = kws; kw < kwe; kw++, cursorInput += 4) { | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |             sum += vld1q_f32(cursorInput); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |             sum0 += cursorInput[0]; | 
					
						
							|  |  |  |             sum1 += cursorInput[1]; | 
					
						
							|  |  |  |             sum2 += cursorInput[2]; | 
					
						
							|  |  |  |             sum3 += cursorInput[3]; | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // avg
 | 
					
						
							|  |  |  |     if (count > 0) { | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |         vst1q_f32(offsetOutput, sum / vdupq_n_f32(count)); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |         offsetOutput[0] = sum0 / (float)count; | 
					
						
							|  |  |  |         offsetOutput[1] = sum1 / (float)count; | 
					
						
							|  |  |  |         offsetOutput[2] = sum2 / (float)count; | 
					
						
							|  |  |  |         offsetOutput[3] = sum3 / (float)count; | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  |     } else { | 
					
						
							|  |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							|  |  |  |         vst1q_f32(offsetOutput, vdupq_n_f32(0)); | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |         offsetOutput[0] = 0; | 
					
						
							|  |  |  |         offsetOutput[1] = 0; | 
					
						
							|  |  |  |         offsetOutput[2] = 0; | 
					
						
							|  |  |  |         offsetOutput[3] = 0; | 
					
						
							|  |  |  | #endif
 | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  | static void poolingAvg(const float *channelInput, int inputWidth, int inputHeight, float *channelOutput, | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                        int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                        int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     int padTop    = padHeight <= 0 ? 0 : (padHeight + strideHeight - 1) / strideHeight; | 
					
						
							|  |  |  |     int padBottom = (padHeight + inputHeight - kernelHeight) / strideHeight + 1; | 
					
						
							|  |  |  |     int padLeft   = padWidth <= 0 ? 0 : (padWidth + strideWidth - 1) / strideWidth; | 
					
						
							|  |  |  |     int padRight  = (padWidth + inputWidth - kernelWidth) / strideWidth + 1; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const int inputStep4       = 4 * inputWidth; | 
					
						
							|  |  |  |     const int strideInputStep4 = strideHeight * inputStep4; | 
					
						
							|  |  |  |     const int outputStep4      = 4 * outputWidth; | 
					
						
							|  |  |  |     const int strideWidth4     = 4 * strideWidth; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     { // handle paddings
 | 
					
						
							|  |  |  |         const float *lineInput = channelInput - padHeight * inputStep4 - padWidth * 4; | 
					
						
							|  |  |  |         float *lineOutput      = channelOutput; | 
					
						
							|  |  |  |         for (int oh = 0, ih = -padHeight; oh < padTop; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { | 
					
						
							|  |  |  |             const float *offsetInput = lineInput; | 
					
						
							|  |  |  |             float *offsetOutput      = lineOutput; | 
					
						
							|  |  |  |             for (int ow = 0, iw = -padWidth; ow < outputWidth; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { | 
					
						
							|  |  |  |                 poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                               iw, ih, padWidth, padHeight, padType); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |         for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { | 
					
						
							|  |  |  |             const float *offsetInput = lineInput; | 
					
						
							|  |  |  |             float *offsetOutput      = lineOutput; | 
					
						
							|  |  |  |             for (int ow = 0, iw = -padWidth; ow < padLeft; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { | 
					
						
							|  |  |  |                 poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                               iw, ih, padWidth, padHeight, padType); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |             offsetInput  = lineInput + padRight * strideWidth * 4; | 
					
						
							|  |  |  |             offsetOutput = lineOutput + padRight * 4; | 
					
						
							|  |  |  |             for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { | 
					
						
							|  |  |  |                 poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                               iw, ih, padWidth, padHeight, padType); | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { | 
					
						
							|  |  |  |             const float *offsetInput = lineInput; | 
					
						
							|  |  |  |             float *offsetOutput      = lineOutput; | 
					
						
							|  |  |  |             for (int ow = 0, iw = -padWidth; ow < outputWidth; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { | 
					
						
							|  |  |  |                 poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |                               iw, ih, padWidth, padHeight, padType); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     { // handle no paddings
 | 
					
						
							|  |  |  |         const float *lineInput = | 
					
						
							|  |  |  |             channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; | 
					
						
							|  |  |  |         float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |         for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; | 
					
						
							|  |  |  |              oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { | 
					
						
							|  |  |  |             const float *offsetInput = lineInput; | 
					
						
							|  |  |  |             float *offsetOutput      = lineOutput; | 
					
						
							|  |  |  |             for (int ow = padLeft, iw = -padWidth + ow * strideWidth; ow < padRight; | 
					
						
							|  |  |  |                  ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 float32x4_t sum = vdupq_n_f32(0); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 float sum0 = 0; | 
					
						
							|  |  |  |                 float sum1 = 0; | 
					
						
							|  |  |  |                 float sum2 = 0; | 
					
						
							|  |  |  |                 float sum3 = 0; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 // sum
 | 
					
						
							|  |  |  |                 int count                = 0; | 
					
						
							|  |  |  |                 const float *kernelInput = offsetInput; | 
					
						
							|  |  |  |                 for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { | 
					
						
							|  |  |  |                     const float *cursorInput = kernelInput; | 
					
						
							|  |  |  |                     for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                         sum += vld1q_f32(cursorInput); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                         sum0 += cursorInput[0]; | 
					
						
							|  |  |  |                         sum1 += cursorInput[1]; | 
					
						
							|  |  |  |                         sum2 += cursorInput[2]; | 
					
						
							|  |  |  |                         sum3 += cursorInput[3]; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                         count++; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |                     } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 } | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 // avg
 | 
					
						
							|  |  |  |                 if (count > 0) { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                     vst1q_f32(offsetOutput, sum / vdupq_n_f32(count)); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                     offsetOutput[0] = sum0 / (float)count; | 
					
						
							|  |  |  |                     offsetOutput[1] = sum1 / (float)count; | 
					
						
							|  |  |  |                     offsetOutput[2] = sum2 / (float)count; | 
					
						
							|  |  |  |                     offsetOutput[3] = sum3 / (float)count; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                 } else { | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #ifdef MNN_USE_NEON
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                     vst1q_f32(offsetOutput, vdupq_n_f32(0)); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #else
 | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |                     offsetOutput[0] = 0; | 
					
						
							|  |  |  |                     offsetOutput[1] = 0; | 
					
						
							|  |  |  |                     offsetOutput[2] = 0; | 
					
						
							|  |  |  |                     offsetOutput[3] = 0; | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | #endif
 | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | namespace MNN { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | CPUPool::CPUPool(Backend *b, const Pool *parameter) : MNN::Execution(b), mParameter(parameter) { | 
					
						
							|  |  |  |     // nothing to do
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ErrorCode CPUPool::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { | 
					
						
							|  |  |  |     auto layer       = mParameter; | 
					
						
							|  |  |  |     int strideWidth  = layer->strideX(); | 
					
						
							|  |  |  |     int strideHeight = layer->strideY(); | 
					
						
							|  |  |  |     int padWidth     = layer->padX(); | 
					
						
							|  |  |  |     int padHeight    = layer->padY(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // edit const if global
 | 
					
						
							|  |  |  |     auto input       = inputs[0]; | 
					
						
							|  |  |  |     auto output      = outputs[0]; | 
					
						
							|  |  |  |     int kernelWidth  = std::min(layer->kernelX(), input->width()); | 
					
						
							|  |  |  |     int kernelHeight = std::min(layer->kernelY(), input->height()); | 
					
						
							|  |  |  |     if (layer->isGlobal()) { | 
					
						
							|  |  |  |         kernelWidth  = input->width(); | 
					
						
							|  |  |  |         kernelHeight = input->height(); | 
					
						
							|  |  |  |         strideWidth  = input->width(); | 
					
						
							|  |  |  |         strideHeight = input->height(); | 
					
						
							|  |  |  |         padWidth     = 0; | 
					
						
							|  |  |  |         padHeight    = 0; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     if (layer->padType() == PoolPadType_SAME) { | 
					
						
							|  |  |  |         int padNeededWidth  = (output->width() - 1) * strideWidth + kernelWidth - input->width(); | 
					
						
							|  |  |  |         int padNeededHeight = (output->height() - 1) * strideHeight + kernelHeight - input->height(); | 
					
						
							|  |  |  |         padWidth            = padNeededWidth > 0 ? padNeededWidth / 2 : 0; | 
					
						
							|  |  |  |         padHeight           = padNeededHeight > 0 ? padNeededHeight / 2 : 0; | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |     } else if (layer->padType() == PoolPadType_VALID) { | 
					
						
							|  |  |  |         padWidth = padHeight = 0; | 
					
						
							| 
									
										
										
										
											2019-05-05 20:27:57 +08:00
										 |  |  |     } | 
					
						
							|  |  |  |     auto poolType      = layer->type(); | 
					
						
							|  |  |  |     auto planeFunction = poolingMax; | 
					
						
							|  |  |  |     if (poolType == PoolType_AVEPOOL) { | 
					
						
							|  |  |  |         planeFunction = poolingAvg; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     auto totalDepth        = input->batch() * UP_DIV(input->channel(), 4); | 
					
						
							|  |  |  |     auto inputData         = input->host<float>(); | 
					
						
							|  |  |  |     auto outputData        = output->host<float>(); | 
					
						
							|  |  |  |     auto inputPlaneStride  = 4 * input->width() * input->height(); | 
					
						
							|  |  |  |     auto outputPlaneStride = 4 * output->width() * output->height(); | 
					
						
							|  |  |  |     int threadNumber       = ((CPUBackend *)backend())->threadNumber(); | 
					
						
							| 
									
										
										
										
											2019-06-24 11:32:41 +08:00
										 |  |  |     auto padType           = layer->padType(); | 
					
						
							| 
									
										
										
										
											2020-02-26 09:57:17 +08:00
										 |  |  |     if (layer->pads() != nullptr && padType == PoolPadType_CAFFE) { | 
					
						
							|  |  |  |         padType = PoolPadType_VALID; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-07-03 16:47:16 +08:00
										 |  |  |     mFunction              = std::make_pair(threadNumber, [=](int tId) { | 
					
						
							|  |  |  |         for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { | 
					
						
							|  |  |  |             // run
 | 
					
						
							|  |  |  |             planeFunction(inputData + channel * inputPlaneStride, input->width(), input->height(), | 
					
						
							|  |  |  |                           outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, | 
					
						
							|  |  |  |                           kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2019-07-03 16:47:16 +08:00
										 |  |  |     }); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ErrorCode CPUPool::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { | 
					
						
							| 
									
										
										
										
											2019-07-03 16:47:16 +08:00
										 |  |  |     MNN_CONCURRENCY_BEGIN(tId, mFunction.first) { | 
					
						
							| 
									
										
										
										
											2019-07-19 17:09:09 +08:00
										 |  |  |         mFunction.second((int)tId); | 
					
						
							| 
									
										
										
										
											2019-07-03 16:47:16 +08:00
										 |  |  |     } | 
					
						
							|  |  |  |     MNN_CONCURRENCY_END(); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class CPUPoolCreator : public CPUBackend::Creator { | 
					
						
							|  |  |  | public: | 
					
						
							|  |  |  |     virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, | 
					
						
							|  |  |  |                                 const MNN::Op *op, Backend *backend) const override { | 
					
						
							|  |  |  |         return new CPUPool(backend, op->main_as_Pool()); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | REGISTER_CPU_OP_CREATOR(CPUPoolCreator, OpType_Pooling); | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  | CPUPool3D::CPUPool3D(Backend *b, const Pool3D *param) : MNN::Execution(b) { | 
					
						
							| 
									
										
											  
											
												- build:
	- unify schema building in core and converter;
	- add more build script for android;
	- add linux build script for python;
- ops impl:
	- add floor mod support in binary;
	- use eltwise impl in add/max/sub/mul binary for optimization;
	- remove fake double support in cast;
	- fix 5d support for concat;
	- add adjX and adjY support for batch matmul;
	- optimize conv2d back prop filter;
	- add pad mode support for conv3d;
	- fix bug in conv2d & conv depthwise with very small feature map;
	- optimize binary without broacast;
	- add data types support for gather;
	- add gather ND support;
	- use uint8 data type in gather v2;
	- add transpose support for matmul;
	- add matrix band part;
	- add dim != 4 support for padding, reshape & tensor convert;
	- add pad type support for pool3d;
	- make ops based on TensorFlow Lite quantization optional;
	- add all & any support for reduction;
	- use type in parameter as output type in reduction;
	- add int support for unary;
	- add variable weight support for conv2d;
	- fix conv2d depthwise weights initialization;
	- fix type support for transpose;
	- fix grad outputs count for  reduce grad and reshape grad;
	- fix priorbox & detection output;
	- fix metal softmax error;
- python:
	- add runSessionWithCallBackInfo interface;
	- add max nodes limit (1400) for visualization tool;
	- fix save error in python3;
	- align default dim;
- convert:
	- add extra design for optimization;
	- add more post converting optimizers;
	- add caffe v1 weights blob support;
	- add cast, unary, conv transpose support for onnx model;
	- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
	- add cos/sin/atan/tan support for unary for tensorflow model;
	- add any/all support for reduction for tensorflow model;
	- add elu, conv3d, pool3d support for tensorflow model;
	- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
	- fix size computer lock;
	- fix thread pool deadlock;
	- add express & parameters in express;
	- rewrite blitter chooser without static map;
	- add tests for expr;
											
										 
											2019-10-29 13:37:26 +08:00
										 |  |  |     mType = param->type(); | 
					
						
							|  |  |  |     mPadType = param->padType(); | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     for (auto kernel: *param->kernels()) { | 
					
						
							|  |  |  |         mKernels.push_back(kernel); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     for (auto stride: *param->strides()) { | 
					
						
							|  |  |  |         mStrides.push_back(stride); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
											  
											
												- build:
	- unify schema building in core and converter;
	- add more build script for android;
	- add linux build script for python;
- ops impl:
	- add floor mod support in binary;
	- use eltwise impl in add/max/sub/mul binary for optimization;
	- remove fake double support in cast;
	- fix 5d support for concat;
	- add adjX and adjY support for batch matmul;
	- optimize conv2d back prop filter;
	- add pad mode support for conv3d;
	- fix bug in conv2d & conv depthwise with very small feature map;
	- optimize binary without broacast;
	- add data types support for gather;
	- add gather ND support;
	- use uint8 data type in gather v2;
	- add transpose support for matmul;
	- add matrix band part;
	- add dim != 4 support for padding, reshape & tensor convert;
	- add pad type support for pool3d;
	- make ops based on TensorFlow Lite quantization optional;
	- add all & any support for reduction;
	- use type in parameter as output type in reduction;
	- add int support for unary;
	- add variable weight support for conv2d;
	- fix conv2d depthwise weights initialization;
	- fix type support for transpose;
	- fix grad outputs count for  reduce grad and reshape grad;
	- fix priorbox & detection output;
	- fix metal softmax error;
- python:
	- add runSessionWithCallBackInfo interface;
	- add max nodes limit (1400) for visualization tool;
	- fix save error in python3;
	- align default dim;
- convert:
	- add extra design for optimization;
	- add more post converting optimizers;
	- add caffe v1 weights blob support;
	- add cast, unary, conv transpose support for onnx model;
	- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
	- add cos/sin/atan/tan support for unary for tensorflow model;
	- add any/all support for reduction for tensorflow model;
	- add elu, conv3d, pool3d support for tensorflow model;
	- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
	- fix size computer lock;
	- fix thread pool deadlock;
	- add express & parameters in express;
	- rewrite blitter chooser without static map;
	- add tests for expr;
											
										 
											2019-10-29 13:37:26 +08:00
										 |  |  |     if (mPadType != PoolPadType_SAME) { | 
					
						
							|  |  |  |         for (auto pad: *param->pads()) { | 
					
						
							|  |  |  |             mPads.push_back(pad); | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  | ErrorCode CPUPool3D::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { | 
					
						
							|  |  |  |     auto input = inputs[0]; | 
					
						
							|  |  |  |     auto output = outputs[0]; | 
					
						
							| 
									
										
											  
											
												- build:
	- unify schema building in core and converter;
	- add more build script for android;
	- add linux build script for python;
- ops impl:
	- add floor mod support in binary;
	- use eltwise impl in add/max/sub/mul binary for optimization;
	- remove fake double support in cast;
	- fix 5d support for concat;
	- add adjX and adjY support for batch matmul;
	- optimize conv2d back prop filter;
	- add pad mode support for conv3d;
	- fix bug in conv2d & conv depthwise with very small feature map;
	- optimize binary without broacast;
	- add data types support for gather;
	- add gather ND support;
	- use uint8 data type in gather v2;
	- add transpose support for matmul;
	- add matrix band part;
	- add dim != 4 support for padding, reshape & tensor convert;
	- add pad type support for pool3d;
	- make ops based on TensorFlow Lite quantization optional;
	- add all & any support for reduction;
	- use type in parameter as output type in reduction;
	- add int support for unary;
	- add variable weight support for conv2d;
	- fix conv2d depthwise weights initialization;
	- fix type support for transpose;
	- fix grad outputs count for  reduce grad and reshape grad;
	- fix priorbox & detection output;
	- fix metal softmax error;
- python:
	- add runSessionWithCallBackInfo interface;
	- add max nodes limit (1400) for visualization tool;
	- fix save error in python3;
	- align default dim;
- convert:
	- add extra design for optimization;
	- add more post converting optimizers;
	- add caffe v1 weights blob support;
	- add cast, unary, conv transpose support for onnx model;
	- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
	- add cos/sin/atan/tan support for unary for tensorflow model;
	- add any/all support for reduction for tensorflow model;
	- add elu, conv3d, pool3d support for tensorflow model;
	- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
	- fix size computer lock;
	- fix thread pool deadlock;
	- add express & parameters in express;
	- rewrite blitter chooser without static map;
	- add tests for expr;
											
										 
											2019-10-29 13:37:26 +08:00
										 |  |  |     if (mPadType == PoolPadType_SAME) { | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |         mPads.clear(); | 
					
						
							| 
									
										
											  
											
												- build:
	- unify schema building in core and converter;
	- add more build script for android;
	- add linux build script for python;
- ops impl:
	- add floor mod support in binary;
	- use eltwise impl in add/max/sub/mul binary for optimization;
	- remove fake double support in cast;
	- fix 5d support for concat;
	- add adjX and adjY support for batch matmul;
	- optimize conv2d back prop filter;
	- add pad mode support for conv3d;
	- fix bug in conv2d & conv depthwise with very small feature map;
	- optimize binary without broacast;
	- add data types support for gather;
	- add gather ND support;
	- use uint8 data type in gather v2;
	- add transpose support for matmul;
	- add matrix band part;
	- add dim != 4 support for padding, reshape & tensor convert;
	- add pad type support for pool3d;
	- make ops based on TensorFlow Lite quantization optional;
	- add all & any support for reduction;
	- use type in parameter as output type in reduction;
	- add int support for unary;
	- add variable weight support for conv2d;
	- fix conv2d depthwise weights initialization;
	- fix type support for transpose;
	- fix grad outputs count for  reduce grad and reshape grad;
	- fix priorbox & detection output;
	- fix metal softmax error;
- python:
	- add runSessionWithCallBackInfo interface;
	- add max nodes limit (1400) for visualization tool;
	- fix save error in python3;
	- align default dim;
- convert:
	- add extra design for optimization;
	- add more post converting optimizers;
	- add caffe v1 weights blob support;
	- add cast, unary, conv transpose support for onnx model;
	- optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model;
	- add cos/sin/atan/tan support for unary for tensorflow model;
	- add any/all support for reduction for tensorflow model;
	- add elu, conv3d, pool3d support for tensorflow model;
	- optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model;
- others:
	- fix size computer lock;
	- fix thread pool deadlock;
	- add express & parameters in express;
	- rewrite blitter chooser without static map;
	- add tests for expr;
											
										 
											2019-10-29 13:37:26 +08:00
										 |  |  |         for (unsigned int i = 0; i < output->dimensions() - 2; ++i) { | 
					
						
							|  |  |  |             const int inputLength = input->length(i + 2), outputLength = output->length(i + 2); | 
					
						
							|  |  |  |             const int inputLengthNeed = (outputLength - 1) * mStrides[i] + mKernels[i]; | 
					
						
							|  |  |  |             mPads.push_back((inputLengthNeed - inputLength) / 2); | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     if (mKernels[0] != 1 || mStrides[0] != 1) { | 
					
						
							|  |  |  |         const int batch = input->length(0), channel = input->length(1), inputDepth = input->length(2); | 
					
						
							|  |  |  |         const int outputHeight = output->length(3), outputWidth = output->length(4); | 
					
						
							|  |  |  |         mTempStorage.reset(Tensor::createDevice<float>({batch, channel, inputDepth, outputHeight, outputWidth}, Tensor::CAFFE_C4)); | 
					
						
							|  |  |  |         backend()->onAcquireBuffer(mTempStorage.get(), Backend::DYNAMIC); | 
					
						
							|  |  |  |         backend()->onReleaseBuffer(mTempStorage.get(), Backend::DYNAMIC); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  | ErrorCode CPUPool3D::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { | 
					
						
							|  |  |  |     auto input = inputs[0]; | 
					
						
							|  |  |  |     auto output = outputs[0]; | 
					
						
							|  |  |  |     MNN_ASSERT(input->dimensions() == 5); | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     const int kernelDepth = mKernels[0], kernelHeight = mKernels[1], kernelWidth = mKernels[2]; | 
					
						
							|  |  |  |     const int strideDepth = mStrides[0], strideHeight = mStrides[1], strideWidth = mStrides[2]; | 
					
						
							|  |  |  |     const int outputDepth = output->length(2), outputHeight = output->length(3), outputWidth = output->length(4); | 
					
						
							|  |  |  |     const int inputDepth = input->length(2), inputHeight = input->length(3), inputWidth = input->length(4); | 
					
						
							|  |  |  |     const int channel = input->length(1), batch = input->length(0); | 
					
						
							|  |  |  |     const int padDepth = mPads[0], padHeight = mPads[1], padWidth = mPads[2]; | 
					
						
							|  |  |  |     const int threadNumber = ((CPUBackend*)backend())->threadNumber(); | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     { | 
					
						
							|  |  |  |         auto planeFunction = poolingMax; | 
					
						
							|  |  |  |         if (mType == PoolType_AVEPOOL) { | 
					
						
							|  |  |  |             planeFunction = poolingAvg; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         auto srcData           = input->host<float>(); | 
					
						
							|  |  |  |         auto dstData           = mTempStorage.get() != nullptr ? mTempStorage->host<float>() : output->host<float>(); | 
					
						
							|  |  |  |         auto inputPlaneStride  = 4 * inputHeight * inputWidth; | 
					
						
							|  |  |  |         auto outputPlaneStride = 4 * outputHeight * outputWidth; | 
					
						
							|  |  |  |         auto padType           = mPadType; | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |         auto planeFunc = [=](int tId) { | 
					
						
							|  |  |  |             for (int o = tId; o < batch * UP_DIV(channel, 4) * inputDepth; o += threadNumber) { | 
					
						
							|  |  |  |                 planeFunction(srcData + o * inputPlaneStride, inputWidth, inputHeight, | 
					
						
							|  |  |  |                               dstData + o * outputPlaneStride, outputWidth, outputHeight, kernelWidth, | 
					
						
							|  |  |  |                               kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |         MNN_CONCURRENCY_BEGIN(tId, threadNumber) { | 
					
						
							|  |  |  |             planeFunc((int)tId); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         MNN_CONCURRENCY_END(); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     if (mTempStorage.get() != nullptr) { | 
					
						
							|  |  |  |         using InnerFuncType = std::function<void(float*, const float*, int, int)>; | 
					
						
							|  |  |  |         InnerFuncType innerFunc = [=](float* dst, const float* src, int step, int kernel) { | 
					
						
							|  |  |  |             Vec4 max = Vec4::load(src); | 
					
						
							|  |  |  |             for (int i = 1; i < kernel; ++i) { | 
					
						
							|  |  |  |                 max = Vec4::max(max, Vec4::load(src + i * step)); | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             Vec4::save(dst, max); | 
					
						
							|  |  |  |         }; | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |         if (mType == PoolType_AVEPOOL) { | 
					
						
							|  |  |  |             innerFunc = [=](float* dst, const float* src, int step, int kernel) { | 
					
						
							|  |  |  |                 Vec4 sum = Vec4::load(src); | 
					
						
							|  |  |  |                 for (int i = 1; i < kernel; ++i) { | 
					
						
							|  |  |  |                     sum = sum + Vec4::load(src + i * step); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 Vec4::save(dst, sum * ((float)1 / kernel)); | 
					
						
							|  |  |  |             }; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |         const float* srcData = mTempStorage->host<float>(); | 
					
						
							|  |  |  |         float* dstData = output->host<float>(); | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |         auto reduceDepthFunc = [=, &innerFunc](int tId) { | 
					
						
							|  |  |  |             const int outputPlaneStride = outputHeight * outputWidth * 4; | 
					
						
							|  |  |  |             for (int o = tId; o < batch * UP_DIV(channel, 4); o += threadNumber) { | 
					
						
							|  |  |  |                 auto srcZData = srcData + o * inputDepth * outputPlaneStride; | 
					
						
							|  |  |  |                 auto dstZData = dstData + o * outputDepth * outputPlaneStride; | 
					
						
							|  |  |  |                 for (int i = 0; i < outputHeight * outputWidth; ++i) { | 
					
						
							|  |  |  |                     for (int d = 0; d < outputDepth; ++d) { | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  |                         int dRawSrc = d * strideDepth - padDepth; | 
					
						
							|  |  |  |                         int dSrc = ALIMAX(dRawSrc, 0); | 
					
						
							|  |  |  |                         int kernel = ALIMIN(dRawSrc + kernelDepth, inputDepth) - dSrc; | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |                         if (kernel == 0) { | 
					
						
							|  |  |  |                             Vec4::save(dstZData + d * outputPlaneStride + i * 4, Vec4((float)0)); | 
					
						
							|  |  |  |                             continue; | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                         innerFunc(dstZData + d * outputPlaneStride + i * 4, srcZData + dSrc * outputPlaneStride + i * 4, | 
					
						
							|  |  |  |                                   outputPlaneStride, kernel); | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |         MNN_CONCURRENCY_BEGIN(tId, threadNumber) { | 
					
						
							|  |  |  |             reduceDepthFunc((int)tId); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         MNN_CONCURRENCY_END(); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  |     return NO_ERROR; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2019-12-27 22:16:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
											
												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes
											
										 
											2019-09-26 21:02:07 +08:00
										 |  |  | class CPUPool3DCreator : public CPUBackend::Creator { | 
					
						
							|  |  |  | public: | 
					
						
							|  |  |  |     virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, | 
					
						
							|  |  |  |                                 const MNN::Op *op, Backend *backend) const override { | 
					
						
							|  |  |  |         return new CPUPool3D(backend, op->main_as_Pool3D()); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | REGISTER_CPU_OP_CREATOR(CPUPool3DCreator, OpType_Pooling3D); | 
					
						
							| 
									
										
										
										
											2019-04-17 10:49:11 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | } // namespace MNN
 |