[MNN:Sync] Sync internal gitlab
@@ -357,3 +357,6 @@ project/ios/ios_32
project/ios/MNN.framework
pymnn_build/

# mnncompress generated
MNN_compression_pb2.py
@@ -1,28 +1,17 @@
cmake_minimum_required(VERSION 3.0)
# Versioning stuff
if(NOT DEFINED MNN_VERSION_MAJOR)
set(MNN_VERSION_MAJOR 0)
endif()
if(NOT DEFINED MNN_VERSION_MINOR)
set(MNN_VERSION_MINOR 2)
endif()
if(NOT DEFINED MNN_VERSION_PATCH)
set(MNN_VERSION_PATCH 1)
endif()
if(NOT DEFINED MNN_VERSION_BUILD)
set(MNN_VERSION_BUILD 5)
endif()
if(NOT DEFINED MNN_VERSION_SUFFIX)
set(MNN_VERSION_SUFFIX git)
endif()
if (NOT PACKAGE_VERSION)
set(PACKAGE_VERSION
"${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH}.${MNN_VERSION_BUILD}${MNN_VERSION_SUFFIX}")
endif()
add_definitions("-DMNN_VERSION=\"${PACKAGE_VERSION}\"")
add_definitions("-DMNN_VERSION_MAJOR=${MNN_VERSION_MAJOR}")
add_definitions("-DMNN_VERSION_MINOR=${MNN_VERSION_MINOR}")
add_definitions("-DMNN_VERSION_PATCH=${MNN_VERSION_PATCH}")
file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/MNN/MNNDefine.h" MNN_DEFINE)
string(REGEX MATCH "MNN_VERSION_MAJOR [0-9]+" MNN_VERSION_MAJOR_DEFINE ${MNN_DEFINE})
string(REGEX MATCH "[0-9]+" MNN_VERSION_MAJOR ${MNN_VERSION_MAJOR_DEFINE})
string(REGEX MATCH "MNN_VERSION_MINOR [0-9]+" MNN_VERSION_MINOR_DEFINE ${MNN_DEFINE})
string(REGEX MATCH "[0-9]+" MNN_VERSION_MINOR ${MNN_VERSION_MINOR_DEFINE})
string(REGEX MATCH "MNN_VERSION_PATCH [0-9]+" MNN_VERSION_PATCH_DEFINE ${MNN_DEFINE})
string(REGEX MATCH "[0-9]+" MNN_VERSION_PATCH ${MNN_VERSION_PATCH_DEFINE})
set(MNN_VERSION ${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH})
# add_definitions("-DMNN_VERSION=\"${MNN_VERSION}\"")
# add_definitions("-DMNN_VERSION_MAJOR=${MNN_VERSION_MAJOR}")
# add_definitions("-DMNN_VERSION_MINOR=${MNN_VERSION_MINOR}")
# add_definitions("-DMNN_VERSION_PATCH=${MNN_VERSION_PATCH}")

# Clear VERSION variables when no VERSION is given to project()
if(POLICY CMP0048)
@@ -32,7 +21,7 @@ endif()
if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()
project(MNN VERSION ${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH}.${MNN_VERSION_BUILD} LANGUAGES C CXX ASM)
project(MNN VERSION ${MNN_VERSION} LANGUAGES C CXX ASM)
# compiler options
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)
@@ -225,11 +214,11 @@ ELSE()
set(MNN_HIDDEN TRUE)
ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)

message(STATUS ">>>>>>>>>>>>>")
message(STATUS "MNN BUILD INFO:")
message(STATUS "\tSystem: ${CMAKE_SYSTEM_NAME}")
message(STATUS "\tProcessor: ${CMAKE_SYSTEM_PROCESSOR}")
message(STATUS "\tVersion: ${MNN_VERSION}")
message(STATUS "\tMetal: ${MNN_METAL}")
message(STATUS "\tOpenCL: ${MNN_OPENCL}")
message(STATUS "\tOpenGL: ${MNN_OPENGL}")
README.md
@@ -5,7 +5,7 @@
[MNN Homepage](http://www.mnn.zone)

## Intro
MNN is a highly efficient and lightweight deep learning framework. It supports inference and training of deep learning models, and has industry-leading performance for on-device inference and training. At present, MNN has been integrated into more than 20 apps of Alibaba Inc, such as Taobao, Tmall, Youku, DingTalk and Xianyu, covering more than 70 usage scenarios such as live broadcast, short video capture, search recommendation, product searching by image, interactive marketing, equity distribution and security risk control. In addition, MNN is also used on embedded devices, such as IoT.
MNN is a highly efficient and lightweight deep learning framework. It supports inference and training of deep learning models, and has industry-leading performance for on-device inference and training. At present, MNN has been integrated into more than 30 apps of Alibaba Inc, such as Taobao, Tmall, Youku, DingTalk and Xianyu, covering more than 70 usage scenarios such as live broadcast, short video capture, search recommendation, product searching by image, interactive marketing, equity distribution and security risk control. In addition, MNN is also used on embedded devices, such as IoT.

The design principles and performance data of MNN have been published in an MLSys 2020 paper [here](https://arxiv.org/pdf/2002.12418.pdf). Please cite MNN in your publications if it helps your research:

@@ -16,44 +16,89 @@ The design principles and performance data of MNN has been published in an MLSys
year = {2020}
}

## Documentation and Tools


## Documentation and Workbench
MNN's docs are placed in the [Yuque docs here](https://www.yuque.com/mnn/en).

MNN Workbench can be downloaded from [MNN's homepage](http://www.mnn.zone); it provides pretrained models, visualized training tools, and one-click deployment of models to devices.

## Key Features
### High performance
- Implements core computing with lots of optimized assembly code to make full use of the ARM CPU.
- For iOS, GPU acceleration (Metal) can be turned on, which is faster than Apple's native CoreML.
- For Android, `OpenCL`, `Vulkan`, and `OpenGL` are available and deeply tuned for mainstream GPUs (`Adreno` and `Mali`).
- Convolution and transposed convolution algorithms are efficient and stable. The Winograd convolution algorithm is widely used to speed up symmetric convolutions such as 3x3 -> 7x7.
- Twice the speed on the new ARM v8.2 architecture with FP16 half-precision calculation support.

### Lightweight
- Optimized for devices, no dependencies, can be easily deployed to mobile devices and a variety of embedded devices.
- iOS platform: static library size for armv7+arm64 platforms is about 5MB, size increase of linked executables is about 620KB, and the metallib file is about 600KB.
- Android platform: core so size is about 400KB, OpenCL so is about 400KB, Vulkan so is about 400KB.
- iOS platform: static library size with full options enabled for armv7+arm64 platforms is about 12MB; the size increase of linked executables is about 2MB.
- Android platform: core so size is about 800KB (armv7a - c++_shared).
- Using MNN_BUILD_MINI can reduce the package size by about 25%, with the limitation that model input sizes must be fixed.
- Supports FP16 / Int8 quantization, which can reduce model size by 50%-70%.

### Versatility
- Supports `Tensorflow`, `Caffe`, `ONNX`, and supports common neural networks such as `CNN`, `RNN`, `GAN`.
- MNN model converter supports 149 `Tensorflow` OPs, 58 `TFLite` OPs, 47 `Caffe` OPs and 74 `ONNX` OPs; number of OPs by different MNN hardware backends: 111 for CPU, 6 for ARM V8.2, 55 for Metal, 43 for OpenCL, and 32 for Vulkan.
- Supports `Tensorflow`, `Caffe`, `ONNX`, `Torchscripts` and common neural networks such as `CNN`, `RNN`, `GAN`, `Transformer`.
- Supports AI models with multiple inputs or outputs, every kind of dimension format, dynamic inputs, and control flow.
- MNN supports approximately the full set of OPs used in AI models. The converter supports 178 `Tensorflow` OPs, 52 `Caffe` OPs, 163 `Torchscripts` OPs, and 158 `ONNX` OPs.
- Supports iOS 8.0+, Android 4.3+ and embedded devices with a POSIX interface.
- Supports hybrid computing on multiple devices. Currently supports CPU and GPU.


### High performance
- Implements core computing with lots of optimized assembly code to make full use of the ARM / x64 CPU.
- Uses Metal / OpenCL / Vulkan to support GPU inference on mobile devices.
- Uses CUDA and Tensor Cores on NVIDIA GPUs for better performance.
- Convolution and transposed convolution algorithms are efficient and stable. The Winograd convolution algorithm is widely used to speed up symmetric convolutions such as 3x3, 4x4, 5x5, 6x6 and 7x7.
- Twice the speed on the new ARM v8.2 architecture with FP16 half-precision calculation support, and about 2.5x faster when using sdot (ARM v8.2) or VNNI.

### Ease of use
- Efficient image processing module, speeding up affine transform and color space transform without libyuv or opencv.
- Provides callbacks throughout the workflow to extract data or control the execution precisely.
- Provides options for selecting inference branches and parallelizing branches on CPU and GPU.
- (BETA) MNN Python API helps ML engineers to easily use MNN to build a model, train it and quantize it, without dipping their toes in C++ code.
- Supports using MNN's OPs for numerical computing, similar to numpy (see the sketch after this list).
- Provides a lightweight, OpenCV-like image processing module that is only about 100 KB.
- Supports building and training models on PC / mobile.
- The MNN Python API helps ML engineers easily use MNN for inference, training and image processing, without dipping their toes in C++ code.
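A minimal sketch of the numpy-like usage from C++, assuming the Express API in `MNN/expr` (including the `_CumSum` / `_CumProd` operators added in this change); header paths and build setup are illustrative only:

```cpp
// Sketch only: numpy-style computation with MNN Express ops.
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <cstdio>

using namespace MNN::Express;

int main() {
    // A constant tensor holding [1, 2, 3, 4].
    float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
    VARP x = _Const(data, {4}, NHWC);

    VARP s = _CumSum(x, 0);   // running sum: 1, 3, 6, 10
    VARP p = _CumProd(x, 0);  // running product: 1, 2, 6, 24

    auto sum  = s->readMap<float>();
    auto prod = p->readMap<float>();
    for (int i = 0; i < 4; ++i) {
        printf("%f %f\n", sum[i], prod[i]);
    }
    return 0;
}
```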
- S: Supported and works well, deeply optimized, recommended
- A: Supported and works well, can be used
- B: Supported but buggy or not optimized, not recommended
- C: Not supported

| Architecture / Precision | | Normal | FP16 | BF16 | Int8 |
| --- | --- | --- | --- | --- | --- |
| CPU | Native | B | C | B | B |
| | x86/x64-SSE4.1 | A | B | B | A |
| | x86/x64-AVX2 | S | B | B | A |
| | x86/x64-AVX512 | S | B | B | S |
| | ARMv7a | S | S (ARMv8.2) | S | S |
| | ARMv8 | S | S (ARMv8.2) | S | S |
| GPU | OpenCL | A | S | C | C |
| | Vulkan | A | A | C | C |
| | Metal | A | S | C | C |
| | CUDA | A | S | C | C |
| NPU | CoreML | B | C | C | C |
| | HIAI | B | C | C | B |


## Architecture


MNN can be divided into two parts: Converter and Interpreter.
MNN can be divided into two parts: Inference Engine and Tools.

Converter consists of Frontends and Graph Optimize. The former is responsible for supporting different training frameworks. MNN currently supports Tensorflow, Tensorflow Lite, Caffe and ONNX (PyTorch/MXNet); the latter optimizes graphs by operator fusion, operator substitution, and layout adjustment.
### Inference Engine

Interpreter consists of Engine and Backends. The former is responsible for the loading of the model and the scheduling of the calculation graph; the latter includes the memory allocation and the Op implementation under each computing device. In Engine and Backends, MNN applies a variety of optimization schemes, including applying the Winograd algorithm in convolution and deconvolution, applying the Strassen algorithm in matrix multiplication, low-precision calculation, Neon optimization, hand-written assembly, multi-thread optimization, memory reuse, heterogeneous computing, etc.
The input to the Inference Engine, an AI model, is a Directed Acyclic Graph (DAG); each node in the model is an operator that describes a tensor compute function. The Inference Engine loads and executes the graph, which can be separated into schedule and execute phases (a minimal session sketch follows the list below):


- Schedule: load the graph and preprocess it
  - Decompose OPs to reduce the number of OP kinds
  - Search for the best compute strategy
  - Find the best resource allocation
- Execute: implement the OPs, using algorithms and hardware features to optimize
  - Algorithms: Winograd convolution, Strassen matrix multiplication, low-precision computation
  - Hardware: SIMD for CPU (SSE/NEON/AVX), GPU APIs (OpenCL / CUDA / Metal)
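For orientation, a hedged sketch of driving the Inference Engine from C++ through the session API (mirroring the Python demos later in this change); the model path and error handling are placeholders:

```cpp
// Sketch only: load an MNN model and run one inference with the session API.
#include <MNN/Interpreter.hpp>
#include <cstdio>

int main() {
    MNN::Interpreter* net = MNN::Interpreter::createFromFile("model.mnn");  // placeholder path
    if (nullptr == net) {
        return -1;
    }

    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_CPU;   // backend selection, see MNNForwardType.h
    config.numThread = 4;
    auto session = net->createSession(config);   // schedule / pre-inference happens here

    MNN::Tensor* input = net->getSessionInput(session, nullptr);
    // ... fill input with preprocessed data, e.g. via input->host<float>() ...

    net->runSession(session);                    // execute / inference

    MNN::Tensor* output = net->getSessionOutput(session, nullptr);
    printf("output element count: %d\n", output->elementSize());
    return 0;
}
```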
### Tools
- MNN-Converter: converts other model formats (Tensorflow(lite), Caffe, ONNX, Torchscripts) into MNN models, and performs graph optimization to reduce computation.
- MNN-Compress: compresses models to reduce size and improve performance / speed.
- MNN-Express: supports models with control flow, and uses MNN's OPs for general-purpose computing.
- MNN-CV: an OpenCV-like library, but based on MNN and much more lightweight.
- MNN-Train: supports training MNN models.

## How to Discuss and Get Help From MNN Community

README_CN.md
@@ -2,67 +2,109 @@

[English Version](README.md)

[MNN official website](http://www.mnn.zone)
[MNN Homepage](http://www.mnn.zone)

## Introduction
MNN is an efficient and lightweight deep learning framework. It supports inference and training of deep models, and its on-device inference and training performance is industry-leading. At present, MNN is used in more than 20 Alibaba apps such as Mobile Taobao, Mobile Tmall, Youku, DingTalk and Xianyu, covering more than 70 scenarios including live streaming, short-video capture, search recommendation, product search by image, interactive marketing, equity distribution and security risk control. It is also used in several IoT scenarios.
[MNN](https://github.com/alibaba/MNN) is a lightweight deep neural network inference engine that loads deep neural network models for inference. It is suitable for servers, personal computers, phones and embedded devices. At present, MNN is used in more than 30 Alibaba apps such as Mobile Taobao, Mobile Tmall and Youku, covering scenarios including live streaming, short-video capture, search recommendation, product search by image, interactive marketing, equity distribution and security risk control.

The design principles and performance data of MNN were published at MLSys 2020; the paper is [here](https://arxiv.org/pdf/2002.12418.pdf). If MNN helps your research, please cite the MNN paper:

@inproceedings{alibaba2020mnn,
@inproceedings{alibaba2020mnn,
author = {Jiang, Xiaotang and Wang, Huan and Chen, Yiliu and Wu, Ziqi and Wang, Lichuan and Zou, Bin and Yang, Yafeng and Cui, Zongyang and Cai, Yu and Yu, Tianhang and Lv, Chengfei and Wu, Zhihua},
title = {MNN: A Universal and Efficient Inference Engine},
booktitle = {MLSys},
year = {2020}
}

## Documentation and Tools


## Documentation and Workbench
MNN's documentation is hosted on Yuque; please see the [Yuque docs](https://www.yuque.com/mnn/cn).

The MNN Workbench, a new tool from the MNN team, can be downloaded from the [MNN homepage](http://www.mnn.zone); it offers ready-to-use models, visual training tools, and one-click deployment to multiple devices.

## MNN Highlights
## Overall Features

### Lightweight

- The core functionality (CPU + GPU model inference) has no dependencies and the code is compact, so it can be conveniently deployed to mobile devices and a variety of embedded devices.
- iOS: with all features enabled, the MNN static library for armv7+arm64 is about 12MB, and linking it increases the executable size by about 2MB; with the core features trimmed, the static library is about 6.1MB and the executable size increase is about 600KB.
- Android: the core armv7a - c++_shared shared library is about 800KB.
- A Mini build option can further reduce the package size by roughly another 25% on top of the library sizes above.
- Supports FP16/Int8 model compression and quantization, which can reduce model size by 50% - 75%.

### Versatility

- Supports mainstream model file formats such as Tensorflow, Caffe, ONNX and Torchscripts, and mainstream network structures such as CNN / RNN / GAN / Transformer.
- Supports multiple inputs and outputs, inputs and outputs of any dimension, dynamic inputs (variable input sizes), and models with control flow.
- Rich operator coverage: 178 Tensorflow Ops, 52 Caffe Ops, 163 Torchscripts Ops and 158 ONNX Ops are supported (ONNX is almost fully covered).
- Supports servers / personal computers / phones and embedded devices with a POSIX interface; supports computing on the device CPU / GPU, and NPU computing on some devices (iOS 11 + CoreML / Huawei + HIAI).
- Supports Windows / iOS 8.0+ / Android 4.3+ / Linux and other operating systems with a POSIX interface.

### High Performance
- Does not rely on any third-party compute library; core computation is implemented with a large amount of hand-written assembly to make full use of the ARM CPU.
- GPU acceleration (Metal) can be enabled on iOS devices, and it is faster than Apple's native CoreML on common models.
- On Android, `OpenCL`, `Vulkan` and `OpenGL` backends are provided to cover as many devices as possible, deeply tuned for mainstream GPUs (`Adreno` and `Mali`).
- Convolution and transposed convolution algorithms are efficient and stable, running efficiently for convolutions of any shape; the Winograd convolution algorithm is widely applied, with efficient implementations for symmetric convolutions such as 3x3 -> 7x7.
- Additional optimizations target the new ARM v8.2 architecture; newer devices can get a 2x speedup from FP16 half-precision computation.

### Lightweight
- Deeply customized and trimmed for on-device use, with no dependencies; it can be conveniently deployed to mobile devices and a variety of embedded devices.
- iOS: the armv7+arm64 static library is about 5MB, linking it increases the executable size by about 620KB, and the metallib file is about 600KB.
- Android: the core so is about 400KB, the OpenCL library about 400KB, and the Vulkan library about 400KB.

### Versatility
- Supports mainstream model formats such as `Tensorflow`, `Caffe` and `ONNX`, and common networks such as `CNN`, `RNN` and `GAN`.
- The converter supports 149 `Tensorflow` OPs, 58 `TFLite` OPs, 47 `Caffe` OPs and 74 `ONNX` OPs; the number of MNN OPs supported per backend: 111 for CPU, 6 for ARM V8.2, 55 for Metal, 43 for OpenCL and 32 for Vulkan.
- Supports iOS 8.0+, Android 4.3+ and embedded devices with a POSIX interface.
- Supports hybrid computing across heterogeneous devices, currently CPU and GPU.
- Adapted to the CPU architectures of iOS / Android / PC / Server, using SIMD code or hand-written assembly for the core computation to make full use of the CPU; common CV models run close to the device's peak compute on a single thread.
- Supports GPU inference on mobile devices via Metal / OpenCL / Vulkan.
- Supports faster inference on PC / Server NVIDIA GPUs via CUDA.
- Widely applies the Winograd convolution algorithm to improve convolution performance, and was the first in industry engineering practice to apply Winograd optimization to transposed convolution and Strassen optimization to matrix multiplication, with measurable speedups.
- Supports low-precision computation (int8 / fp16 / bf16) to improve inference performance, with instruction-level adaptation for ARMv8.2 and AVX512, which see even better speedups on those architectures.

### Ease of Use
- An efficient image processing module covers common warping and conversion needs; in most cases there is no need to bring in libyuv or opencv for image handling.
- Supports callbacks that can be inserted while the network runs, to extract data or steer execution.
- Supports running only part of a network, or running parts in parallel across CPU and GPU.
- (BETA) The MNN Python API lets algorithm engineers easily use MNN to build graphs, train, and do quantization-aware training without writing C++.

- Supports using MNN's operators for common numerical computation, covering commonly used numpy functionality.
- Provides the MNN CV module (the MNN_CV library) for image affine transforms, normalization and other common image processing, under 100 KB on armv7a (see the sketch after this list).
- Supports model training on all platforms, especially training on mobile devices.
- Supports invocation from Python.
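As a rough illustration of that image-processing path, a sketch assuming MNN's `CV::ImageProcess` helper and a caller-provided RGBA buffer; the exact config field and enum names may differ between versions:

```cpp
// Sketch only: color conversion + normalization without libyuv or OpenCV.
#include <MNN/ImageProcess.hpp>
#include <MNN/Tensor.hpp>
#include <cstring>

void preprocess(const unsigned char* rgba, int width, int height, MNN::Tensor* input) {
    MNN::CV::ImageProcess::Config config;
    config.sourceFormat = MNN::CV::RGBA;   // format of the raw buffer
    config.destFormat   = MNN::CV::BGR;    // format the model expects
    // Per-channel normalization: (x - mean) * normal
    const float mean[3]   = {103.94f, 116.78f, 123.68f};
    const float normal[3] = {0.017f, 0.017f, 0.017f};
    ::memcpy(config.mean, mean, sizeof(mean));
    ::memcpy(config.normal, normal, sizeof(normal));

    // Ownership / cleanup is omitted in this sketch.
    auto* process = MNN::CV::ImageProcess::create(config);
    // Convert, normalize and write directly into the network's input tensor.
    process->convert(rgba, width, height, 0, input);
}
```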
The hardware architectures and precisions supported by MNN are listed below:

- S: Supported, deeply optimized and already used in production; recommended.
- A: Supported, with initial optimization or existing use cases; usable.
- B: Supported, but unoptimized or experimental; not recommended.
- C: Not supported.

| Architecture / Precision | | Normal | FP16 | BF16 | Int8 |
| --- | --- | --- | --- | --- | --- |
| CPU | Native | B | C | B | B |
| | x86/x64-SSE4.1 | A | B | B | A |
| | x86/x64-AVX2 | S | B | B | A |
| | x86/x64-AVX512 | S | B | B | S |
| | ARMv7a | S | S (ARMv8.2) | S | S |
| | ARMv8 | S | S (ARMv8.2) | S | S |
| GPU | OpenCL | A | S | C | C |
| | Vulkan | A | A | C | C |
| | Metal | A | S | C | C |
| | CUDA | A | S | C | C |
| NPU | CoreML | B | C | C | C |
| | HIAI | B | C | C | B |


## Architecture Design


MNN can be divided into two parts: Converter and Interpreter.


Converter consists of Frontends and Graph Optimize. The former supports different training frameworks; MNN currently supports Tensorflow (Lite), Caffe and ONNX (PyTorch/MXNet models can first be converted to ONNX and then to MNN). The latter optimizes the graph through operator fusion, operator substitution and layout adjustment.
MNN can be divided into two major parts: the core (inference engine) and the tools.

Interpreter consists of Engine and Backends. The former handles model loading and computation-graph scheduling; the latter contains the memory allocation and Op implementations for each compute device. Within Engine and Backends, MNN applies a variety of optimizations, including the Winograd algorithm for convolution and deconvolution, the Strassen algorithm for matrix multiplication, low-precision computation, NEON optimization, hand-written assembly, multi-threading, memory reuse and heterogeneous computing.
### Core
The input to MNN (an AI inference model) is a directed acyclic graph (DAG); each node in the graph is called an operator and describes a tensor compute function. The inference engine loads and executes this graph, split into a scheduling (pre-inference) layer and an execution (inference) layer (a short backend-configuration sketch follows the list below):


## Community and Feedback
DingTalk discussion groups:
- Scheduling: load the computation graph and preprocess it so that execution is efficient
  - Preprocess the operators in the model to reduce the number of operator kinds
  - Search for the optimal compute strategy
  - Allocate resources
- Execution: implement the operators, optimized with various algorithms and the parallel interfaces offered by different hardware, to reduce execution time
  - Algorithm level: Winograd convolution, Strassen matrix multiplication, piecewise-linear approximation, low precision, etc.
  - Hardware level: CPU SIMD instructions (SSE / NEON / AVX / AVX512) and the various GPU compute APIs
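A minimal sketch of how a caller configures that scheduling layer (backend, threads, precision) before a session is created; it assumes the `ScheduleConfig` / `BackendConfig` structures from the public headers (see `MNNForwardType.h` later in this change), and the chosen values are placeholders:

```cpp
// Sketch only: choose backend, thread count and precision for a session.
#include <MNN/Interpreter.hpp>
#include <MNN/MNNForwardType.h>

MNN::Session* createTunedSession(MNN::Interpreter* net) {
    MNN::BackendConfig backendConfig;
    backendConfig.precision = MNN::BackendConfig::Precision_Low;  // e.g. allow FP16

    MNN::ScheduleConfig config;
    config.type          = MNN_FORWARD_OPENCL;  // preferred backend
    config.backupType    = MNN_FORWARD_CPU;     // fallback if unavailable
    config.numThread     = 4;
    config.backendConfig = &backendConfig;

    // The scheduling (pre-inference) layer selects kernels and plans memory here.
    return net->createSession(config);
}
```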
Group 1 (full): 23329087
### Tools

Group 2 (full): 23350225
- MNN-Converter: the model conversion tool, consisting of Frontends and Graph Optimize. The former supports different training frameworks; MNN currently supports Tensorflow (Lite), Caffe, ONNX (PyTorch/MXNet models can first be converted to ONNX and then to MNN) and Torchscripts. The latter optimizes the graph through operator fusion, operator substitution and layout adjustment, and usually runs offline.
- MNN-Compress: the model compression tool; within an allowed accuracy loss, it compresses MNN models to reduce their size and improve runtime performance.
- MNN-Express: runs models with control flow, and allows calling MNN's operators for custom computation.
- MNN-CV: an image-processing library similar to OpenCV, with its core computation implemented on top of MNN.
- MNN-Train: MNN's training module, supporting training on all platforms.

Group 3: https://h5.dingtalk.com/circle/healthCheckin.html?dtaction=os&corpId=ding2c1d5c85a81030b9a483726330e8af54&574b2bb2-c53a-4=497bad6b-25a5-4&cbdbhh=qwertyuiop

## License
Apache 2.0

@@ -71,6 +113,7 @@ Apache 2.0
MNN contributors: Taobao Technology Department, the Search Engineering team, DAMO Academy, Youku and other Alibaba Group employees.

MNN references and draws on the following projects:

- [Caffe](https://github.com/BVLC/caffe)
- [flatbuffer](https://github.com/google/flatbuffers)
- [gemmlowp](https://github.com/google/gemmlowp)

@@ -91,3 +134,4 @@ MNN references and draws on the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
(Binary image assets changed in this commit: three images removed, one image replaced, and two images added.)
|
@ -73,14 +73,6 @@ void Executor::Profiler::addFlops(const std::string& opType, float flops) {
|
|||
}
|
||||
#endif
|
||||
|
||||
struct Executor::Cache{
|
||||
AutoStorage<uint8_t> modelBuffer;
|
||||
AutoStorage<uint8_t> cacheBuffer;
|
||||
size_t cacheOffset = 0;
|
||||
std::string cacheFile;
|
||||
size_t lastCacheSize = 0;
|
||||
};
|
||||
|
||||
void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
|
||||
std::lock_guard<std::mutex> _l(mMutex);
|
||||
mFirstType = std::make_pair(type, numberThread);
|
||||
|
@ -237,7 +229,7 @@ static std::pair<const void*, size_t> getCache(std::shared_ptr<Runtime> &rt) {
|
|||
return std::make_pair(nullptr, 0);
|
||||
}
|
||||
|
||||
static void writeCacheFile(std::shared_ptr<Executor::Cache> cache, std::pair<const void*, size_t> buffer) {
|
||||
static void writeCacheFile(std::shared_ptr<Cache> cache, std::pair<const void*, size_t> buffer) {
|
||||
auto verifyInfo = std::make_pair((const void*)cache->modelBuffer.get(), cache->cacheOffset);
|
||||
bool res = FileLoader::write(cache->cacheFile.c_str(), buffer);
|
||||
if (!res) {
|
||||
|
@ -273,9 +265,9 @@ bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void*
|
|||
switch (code) {
|
||||
case Interpreter::MEMORY: {
|
||||
auto dst = (float*)ptr;
|
||||
float summer = mRuntime.second->onGetMemoryInMB();
|
||||
for (auto& r : mRuntime.first) {
|
||||
if (r.second.get() != mRuntime.second.get()) {
|
||||
float summer = mInside->mRuntime.second->onGetMemoryInMB();
|
||||
for (auto& r : mInside->mRuntime.first) {
|
||||
if (r.second.get() != mInside->mRuntime.second.get()) {
|
||||
summer += r.second->onGetMemoryInMB();
|
||||
}
|
||||
}
|
||||
|
@ -326,21 +318,33 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
|
|||
}
|
||||
originRt.insert(std::make_pair(std::make_pair(compute.type, compute.numThread), std::shared_ptr<Runtime>(newBn)));
|
||||
}
|
||||
res->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
|
||||
res->mRuntime.first.insert(std::make_pair(compute.type, originRt[std::make_pair(compute.type, compute.numThread)]));
|
||||
res->mInfo = originRt[std::make_pair(compute.type, compute.numThread)];
|
||||
res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
|
||||
res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[std::make_pair(compute.type, compute.numThread)]));
|
||||
res->mInside->mInfo = originRt[std::make_pair(compute.type, compute.numThread)];
|
||||
if (nullptr != config.backendConfig) {
|
||||
res->mInside->mConfig = *config.backendConfig;
|
||||
res->mInside->mUserConfig = true;
|
||||
} else {
|
||||
res->mInside->mUserConfig = false;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
BackendConfig* Executor::RuntimeManager::getBnConfig() {
|
||||
if (mInside->mUserConfig) {
|
||||
return &mInside->mConfig;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
void Executor::RuntimeManager::setCache(std::string cacheName) {
|
||||
mCache.reset(new Cache);
|
||||
mCache->cacheFile = cacheName;
|
||||
if (nullptr == mCache->cacheFile.c_str()) {
|
||||
mInside->mCache.reset(new Cache);
|
||||
mInside->mCache->cacheFile = cacheName;
|
||||
if (nullptr == mInside->mCache->cacheFile.c_str()) {
|
||||
MNN_ERROR("Empty cacheFile\n");
|
||||
return;
|
||||
}
|
||||
std::unique_ptr<FileLoader> loader(new FileLoader(mCache->cacheFile.c_str()));
|
||||
std::unique_ptr<FileLoader> loader(new FileLoader(mInside->mCache->cacheFile.c_str()));
|
||||
if (!loader->valid()) {
|
||||
MNN_ERROR("Load Cache file error.\n");
|
||||
return;
|
||||
|
@ -354,36 +358,36 @@ void Executor::RuntimeManager::setCache(std::string cacheName) {
|
|||
MNN_ERROR("Load Cache file error.\n");
|
||||
return;
|
||||
}
|
||||
bool success = loader->merge(mCache->cacheBuffer);
|
||||
bool success = loader->merge(mInside->mCache->cacheBuffer);
|
||||
if (!success) {
|
||||
MNN_ERROR("Alloc memory for Cache error.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// load cache
|
||||
bool valid = loadCache(mInfo, mCache->cacheBuffer.get() + mCache->cacheOffset,
|
||||
mCache->cacheBuffer.size() - mCache->cacheOffset);
|
||||
bool valid = loadCache(mInside->mInfo, mInside->mCache->cacheBuffer.get() + mInside->mCache->cacheOffset,
|
||||
mInside->mCache->cacheBuffer.size() - mInside->mCache->cacheOffset);
|
||||
if(!valid) {
|
||||
// Reset cache
|
||||
loadCache(mInfo, nullptr, 0);
|
||||
loadCache(mInside->mInfo, nullptr, 0);
|
||||
MNN_PRINT("Cache invalid, will be reset\n");
|
||||
}
|
||||
|
||||
mCache->lastCacheSize = mCache->cacheBuffer.size() - mCache->cacheOffset;
|
||||
mInside->mCache->lastCacheSize = mInside->mCache->cacheBuffer.size() - mInside->mCache->cacheOffset;
|
||||
}
|
||||
|
||||
void Executor::RuntimeManager::updateCache() {
|
||||
mInfo->waitAsyncWork();
|
||||
auto buffer = getCache(mInfo);
|
||||
mInside->mInfo->waitAsyncWork();
|
||||
auto buffer = getCache(mInside->mInfo);
|
||||
|
||||
//When current cacheSize bigger than previous, update
|
||||
if (buffer.first != nullptr && buffer.second > mCache->lastCacheSize) {
|
||||
MNN_PRINT("Update cache to %s, size = %zu\n", mCache->cacheFile.c_str(), buffer.second);
|
||||
writeCacheFile(mCache, buffer);
|
||||
mCache->lastCacheSize = buffer.second;
|
||||
if (buffer.first != nullptr && buffer.second > mInside->mCache->lastCacheSize) {
|
||||
MNN_PRINT("Update cache to %s, size = %zu\n", mInside->mCache->cacheFile.c_str(), buffer.second);
|
||||
writeCacheFile(mInside->mCache, buffer);
|
||||
mInside->mCache->lastCacheSize = buffer.second;
|
||||
}
|
||||
// Reset cache
|
||||
loadCache(mInfo, nullptr, 0);
|
||||
loadCache(mInside->mInfo, nullptr, 0);
|
||||
}
|
||||
|
||||
std::vector<bool> Executor::RuntimeManager::isBackendSupport(const std::vector<MNNForwardType> types) {
|
||||
|
|
|
@ -978,6 +978,9 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
|
|||
}
|
||||
}
|
||||
}
|
||||
// add version number
|
||||
dest->extraInfo.reset(new ExtraInfoT);
|
||||
dest->extraInfo->version = MNN_VERSION;
|
||||
}
|
||||
void Variable::save(const std::vector<VARP>& vars, const char* fileName) {
|
||||
std::unique_ptr<NetT> net(new NetT);
|
||||
|
|
|
@ -1221,5 +1221,26 @@ VARP _Mod(VARP x, VARP y) {
|
|||
return _Binary(x, y, BinaryOpOperation_MOD);
|
||||
}
|
||||
|
||||
VARP _CumSum(VARP x, int axis, bool exclusive, bool reverse) {
|
||||
std::unique_ptr<OpT> op(new OpT);
|
||||
op->type = OpType_CumSum;
|
||||
op->main.type = OpParameter_CumSum;
|
||||
auto param = new CumSumT;
|
||||
param->exclusive = exclusive;
|
||||
param->reverse = reverse;
|
||||
op->main.value = param;
|
||||
return (Variable::create(Expr::create(std::move(op), {x ,_Scalar(axis)})));
|
||||
}
|
||||
|
||||
VARP _CumProd(VARP x, int axis) {
|
||||
std::unique_ptr<OpT> op(new OpT);
|
||||
op->type = OpType_CumProd;
|
||||
op->main.type = OpParameter_Axis;
|
||||
auto param = new AxisT;
|
||||
param->axis = axis;
|
||||
op->main.value = param;
|
||||
return (Variable::create(Expr::create(std::move(op), {x})));
|
||||
}
|
||||
|
||||
} // namespace Express
|
||||
} // namespace MNN
|
||||
|
|
|
@@ -3,8 +3,21 @@
#include "core/Session.hpp"
namespace MNN{
namespace Express {
struct Cache {
    AutoStorage<uint8_t> modelBuffer;
    AutoStorage<uint8_t> cacheBuffer;
    size_t cacheOffset = 0;
    std::string cacheFile;
    size_t lastCacheSize = 0;
};
struct RuntimeAttr {
    Session::ModeGroup modes;
    RuntimeInfo mRuntime;
    std::shared_ptr<Runtime> mInfo;
    std::shared_ptr<Cache> mCache;
    RuntimeAttr* mInside;
    BackendConfig mConfig;
    bool mUserConfig;
};
};
};
@@ -12,6 +12,7 @@
#include "core/FileLoader.hpp"
#include "MNN_generated.h"
#include "Utils.hpp"
#include "RuntimeAttr.hpp"

#ifdef MNN_INTERNAL_ENABLED
#include "internal/auth/ModelAuth.hpp"

@@ -252,7 +253,7 @@ Module* Module::load(const std::vector<std::string>& inputs, const std::vector<s

static Module* loadInternal(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> _rtMgr, const Module::Config* config, bool enforceAuth) {
    // Check if runtime is valid
    if (nullptr != _rtMgr && _rtMgr->getRuntimeInfo().first.empty()) {
    if (nullptr != _rtMgr && _rtMgr->getInside()->mRuntime.first.empty()) {
        MNN_ERROR("Invalid runtime\n");
        return nullptr;
    }
@ -175,14 +175,16 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
|
|||
sche_config.backendConfig = moduleconfig.backend->config;
|
||||
rtMgr.reset(Executor::RuntimeManager::createRuntimeManager(sche_config));
|
||||
}
|
||||
const BackendConfig* userConfig = nullptr;
|
||||
if (nullptr == rtMgr) {
|
||||
rt = Executor::getRuntime();
|
||||
} else {
|
||||
mResource->mModes = rtMgr->getInside()->modes;
|
||||
rt = rtMgr->getRuntimeInfo();
|
||||
rt = rtMgr->getInside()->mRuntime;
|
||||
userConfig = &rtMgr->getInside()->mConfig;
|
||||
}
|
||||
if (moduleconfig.rearrange) {
|
||||
mResourceBackend.reset(rt.first.begin()->second->onCreate());
|
||||
mResourceBackend.reset(rt.first.begin()->second->onCreate(userConfig));
|
||||
if (mResourceBackend->type() == MNN_FORWARD_CPU) {
|
||||
mBackupResourceBackend = mResourceBackend;
|
||||
} else {
|
||||
|
@ -236,6 +238,7 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
|
|||
mResource->mConfig.path.outputs = outputs;
|
||||
mResource->mConfig.saveTensors = outputs;
|
||||
mResource->mConfig.path.inputs = inputs;
|
||||
mResource->mConfig.backendConfig = (BackendConfig*)userConfig;
|
||||
Schedule::ScheduleInfo scheduleInfo;
|
||||
// Copy Const
|
||||
if (nullptr != mResource->mSharedConst) {
|
||||
|
|
|
@@ -96,6 +96,12 @@ typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
typedef std::pair<std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;

/**
 * @brief get mnn version info.
 * @return mnn version string.
 */
MNN_PUBLIC const char* getVersion();

/** net data holder. multiple sessions could share same net. */
class MNN_PUBLIC Interpreter {
public:

@@ -240,6 +246,13 @@ public:
 */
std::pair<const void*, size_t> getModelBuffer() const;

/**
 * @brief Get the model's version info.
 * @return const char* of the model's version info, like "2.0.0";
 * if the model is not loaded or has no version info, returns "version info not found".
 */
const char* getModelVersion() const;

/**
 * @brief update Session's Tensor to model's Const Op
 * @param session given session.
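A hedged sketch of how the two version APIs above can be used together with the MNN_VERSION macro from MNNDefine.h (the model path is a placeholder):

```cpp
// Sketch only: compare the library version with the version recorded in a model.
#include <MNN/Interpreter.hpp>
#include <MNN/MNNDefine.h>
#include <cstdio>

int main() {
    // Library version: getVersion() at runtime, MNN_VERSION at compile time.
    printf("MNN library version: %s (compiled against %s)\n", MNN::getVersion(), MNN_VERSION);

    // Model version: written into the model file when it is saved or converted.
    auto* net = MNN::Interpreter::createFromFile("model.mnn");  // placeholder path
    if (nullptr != net) {
        printf("model version: %s\n", net->getModelVersion());
    }
    return 0;
}
```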
@@ -65,5 +65,10 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#else
#define MNN_PUBLIC __attribute__((visibility("default")))
#endif

#define STR_IMP(x) #x
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 0
#define MNN_VERSION_PATCH 0
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
@@ -104,7 +104,6 @@ struct BackendConfig {
    STATUS_COUNT
};


}; // namespace MNN
#endif
#endif /* MNNForwardType_h */
@@ -69,7 +69,6 @@ public:
    const DebugTools* getDebugTools() const {
        return mDebug.get();
    }
    struct Cache;
    class MNN_PUBLIC RuntimeManager {
    public:
        ~RuntimeManager();

@@ -100,22 +99,18 @@ public:
         */
        void updateCache();
        std::vector<bool> isBackendSupport(const std::vector<MNNForwardType> type);
        RuntimeInfo getRuntimeInfo() {
            return mRuntime;
        }
        friend class Executor;
        void setMode(Interpreter::SessionMode mode);
        void setHint(Interpreter::HintMode mode, int value);
        bool getInfo(Interpreter::SessionInfoCode code, void* ptr);
        BackendConfig* getBnConfig();
        const RuntimeAttr* getInside() const {
            return mInside;
        }
    private:
        RuntimeManager();
        RuntimeInfo mRuntime;
        std::shared_ptr<Runtime> mInfo;
        std::shared_ptr<Cache> mCache;
        RuntimeAttr* mInside;
        friend class StaticModule;
        RuntimeManager();
    };
private:
    void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
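The refactor above moves the runtime, cache and backend-config state behind RuntimeAttr (mInside) while keeping the public cache workflow the same. A hedged caller-side sketch, assuming the RuntimeManager and Module APIs shown in this change (input/output names and paths are placeholders), mirroring the rt.set_cache() / rt.update_cache() Python demo later on:

```cpp
// Sketch only: create a RuntimeManager, attach a tuning-cache file,
// run a module once, then persist the cache if it grew.
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/Module.hpp>
#include <memory>

using namespace MNN;
using namespace MNN::Express;

void runWithCache(const char* modelPath) {
    ScheduleConfig config;
    config.type = MNN_FORWARD_OPENCL;  // caching matters most for tuned GPU backends
    config.numThread = 4;

    std::shared_ptr<Executor::RuntimeManager> rtMgr(
        Executor::RuntimeManager::createRuntimeManager(config));
    rtMgr->setCache(".cachefile");     // load previous tuning results if present

    // "input" / "output" are placeholder tensor names for the model in question.
    std::shared_ptr<Module> net(
        Module::load({"input"}, {"output"}, modelPath, rtMgr, nullptr));
    // ... run net->onForward(...) with real inputs here ...

    rtMgr->updateCache();              // write the cache back if it grew
}
```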
@@ -130,6 +130,8 @@ MNN_PUBLIC VARP _BroadcastTo(VARP a, VARP shape);
MNN_PUBLIC VARP _LinSpace(VARP start, VARP stop, VARP num);

MNN_PUBLIC VARP _RandomUnifom(VARP shape, halide_type_t dtype, float low = 0.0f, float high = 1.0f, int seed0 = 0, int seed1 = 0);
MNN_PUBLIC VARP _CumSum(VARP x, int axis, bool exclusive = false, bool reverse = false);
MNN_PUBLIC VARP _CumProd(VARP x, int axis);

}; // namespace Express
}; // namespace MNN
|
|
|
@ -11,6 +11,12 @@
|
|||
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */; };
|
||||
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */; };
|
||||
11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */; };
|
||||
19D0FE6F28534C4500B74B1A /* MetalSoftmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 19D0FE6C28534C4500B74B1A /* MetalSoftmax.metal */; };
|
||||
19D0FE7028534C4500B74B1A /* MetalSoftmax.mm in Sources */ = {isa = PBXBuildFile; fileRef = 19D0FE6D28534C4500B74B1A /* MetalSoftmax.mm */; };
|
||||
19D0FE7128534C4500B74B1A /* MetalSoftmax.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 19D0FE6E28534C4500B74B1A /* MetalSoftmax.hpp */; };
|
||||
19D0FE75285C66F200B74B1A /* MetalLayerNorm.metal in Sources */ = {isa = PBXBuildFile; fileRef = 19D0FE72285C66F200B74B1A /* MetalLayerNorm.metal */; };
|
||||
19D0FE76285C66F200B74B1A /* MetalLayerNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 19D0FE73285C66F200B74B1A /* MetalLayerNorm.hpp */; };
|
||||
19D0FE77285C66F200B74B1A /* MetalLayerNorm.mm in Sources */ = {isa = PBXBuildFile; fileRef = 19D0FE74285C66F200B74B1A /* MetalLayerNorm.mm */; };
|
||||
1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F722397BA5A004E8721 /* HalideRuntime.h */; settings = {ATTRIBUTES = (Public, ); }; };
|
||||
1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F732397BA5A004E8721 /* MNNDefine.h */; settings = {ATTRIBUTES = (Public, ); }; };
|
||||
1F501F812397BA5B004E8721 /* AutoTime.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F742397BA5A004E8721 /* AutoTime.hpp */; settings = {ATTRIBUTES = (Public, ); }; };
|
||||
|
@ -301,6 +307,8 @@
|
|||
4AF4FB2A269ED244005BA97B /* MNNPackedSparseQuantMatMulEpx4.S in Sources */ = {isa = PBXBuildFile; fileRef = 4AF4FB28269ED244005BA97B /* MNNPackedSparseQuantMatMulEpx4.S */; };
|
||||
4AF4FB2D269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx1.S in Sources */ = {isa = PBXBuildFile; fileRef = 4AF4FB2B269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx1.S */; };
|
||||
4AF4FB2E269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx4.S in Sources */ = {isa = PBXBuildFile; fileRef = 4AF4FB2C269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx4.S */; };
|
||||
4D0C80E32862FC4100C7CAD6 /* CoreMLOPRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D0C80E22862FC4100C7CAD6 /* CoreMLOPRegister.cpp */; };
|
||||
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4D0C80E42862FC4700C7CAD6 /* CoreMLRaster.metal */; };
|
||||
4D4CF4672760946500A36D9F /* miscellaneous.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D4CF4622760946500A36D9F /* miscellaneous.cpp */; };
|
||||
4D4CF4682760946500A36D9F /* geometric.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D4CF4632760946500A36D9F /* geometric.cpp */; };
|
||||
4D4CF4692760946500A36D9F /* filter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D4CF4642760946500A36D9F /* filter.cpp */; };
|
||||
|
@ -795,6 +803,12 @@
|
|||
11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Float.S; sourceTree = "<group>"; };
|
||||
11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Float.S; sourceTree = "<group>"; };
|
||||
11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Int32.S; sourceTree = "<group>"; };
|
||||
19D0FE6C28534C4500B74B1A /* MetalSoftmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalSoftmax.metal; sourceTree = "<group>"; };
|
||||
19D0FE6D28534C4500B74B1A /* MetalSoftmax.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalSoftmax.mm; sourceTree = "<group>"; };
|
||||
19D0FE6E28534C4500B74B1A /* MetalSoftmax.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalSoftmax.hpp; sourceTree = "<group>"; };
|
||||
19D0FE72285C66F200B74B1A /* MetalLayerNorm.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalLayerNorm.metal; sourceTree = "<group>"; };
|
||||
19D0FE73285C66F200B74B1A /* MetalLayerNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalLayerNorm.hpp; sourceTree = "<group>"; };
|
||||
19D0FE74285C66F200B74B1A /* MetalLayerNorm.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalLayerNorm.mm; sourceTree = "<group>"; };
|
||||
1F501F722397BA5A004E8721 /* HalideRuntime.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HalideRuntime.h; path = MNN/HalideRuntime.h; sourceTree = "<group>"; };
|
||||
1F501F732397BA5A004E8721 /* MNNDefine.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNDefine.h; path = MNN/MNNDefine.h; sourceTree = "<group>"; };
|
||||
1F501F742397BA5A004E8721 /* AutoTime.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = AutoTime.hpp; path = MNN/AutoTime.hpp; sourceTree = "<group>"; };
|
||||
|
@ -1084,6 +1098,8 @@
|
|||
4AF4FB28269ED244005BA97B /* MNNPackedSparseQuantMatMulEpx4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackedSparseQuantMatMulEpx4.S; sourceTree = "<group>"; };
|
||||
4AF4FB2B269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx1.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackedSparseQuantMatMulEpx1.S; sourceTree = "<group>"; };
|
||||
4AF4FB2C269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackedSparseQuantMatMulEpx4.S; sourceTree = "<group>"; };
|
||||
4D0C80E22862FC4100C7CAD6 /* CoreMLOPRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CoreMLOPRegister.cpp; sourceTree = "<group>"; };
|
||||
4D0C80E42862FC4700C7CAD6 /* CoreMLRaster.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = CoreMLRaster.metal; sourceTree = "<group>"; };
|
||||
4D4CF4622760946500A36D9F /* miscellaneous.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = miscellaneous.cpp; sourceTree = "<group>"; };
|
||||
4D4CF4632760946500A36D9F /* geometric.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = geometric.cpp; sourceTree = "<group>"; };
|
||||
4D4CF4642760946500A36D9F /* filter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = filter.cpp; sourceTree = "<group>"; };
|
||||
|
@ -2008,6 +2024,12 @@
|
|||
489D7A152550FDC800AD896A /* metal */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
19D0FE73285C66F200B74B1A /* MetalLayerNorm.hpp */,
|
||||
19D0FE72285C66F200B74B1A /* MetalLayerNorm.metal */,
|
||||
19D0FE74285C66F200B74B1A /* MetalLayerNorm.mm */,
|
||||
19D0FE6E28534C4500B74B1A /* MetalSoftmax.hpp */,
|
||||
19D0FE6C28534C4500B74B1A /* MetalSoftmax.metal */,
|
||||
19D0FE6D28534C4500B74B1A /* MetalSoftmax.mm */,
|
||||
48925F302744AA4000919B37 /* MetalCache_generated.h */,
|
||||
4838EA802611C00B0027232C /* MetalGridSample.hpp */,
|
||||
4838EA812611C00B0027232C /* MetalGridSample.metal */,
|
||||
|
@ -2217,6 +2239,8 @@
|
|||
4D9A933526255BDA00F9B43C /* backend */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
4D0C80E42862FC4700C7CAD6 /* CoreMLRaster.metal */,
|
||||
4D0C80E22862FC4100C7CAD6 /* CoreMLOPRegister.cpp */,
|
||||
4D4DAE67263905390060D37E /* CoreMLDefine.h */,
|
||||
4DDE2018263809920085AC8F /* CoreMLExecutorWrapper.h */,
|
||||
4DDE2017263809920085AC8F /* CoreMLExecutorWrapper.mm */,
|
||||
|
@ -2778,6 +2802,7 @@
|
|||
1F501F892397BA5B004E8721 /* MNNForwardType.h in Headers */,
|
||||
92FF027323AA0B5A00AC97F6 /* CPUPoolInt8.hpp in Headers */,
|
||||
1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */,
|
||||
19D0FE76285C66F200B74B1A /* MetalLayerNorm.hpp in Headers */,
|
||||
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */,
|
||||
1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */,
|
||||
92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */,
|
||||
|
@ -2836,6 +2861,7 @@
|
|||
EBECA39524643D320062C7A3 /* Arm82Backend.hpp in Headers */,
|
||||
92FF04C323AA0BFB00AC97F6 /* Session.hpp in Headers */,
|
||||
48FA474423AA127B00172C3B /* MergeOptimizer.hpp in Headers */,
|
||||
19D0FE7128534C4500B74B1A /* MetalSoftmax.hpp in Headers */,
|
||||
92FF039F23AA0B5A00AC97F6 /* CommonOptFunction.h in Headers */,
|
||||
4D9A935A26255BDA00F9B43C /* Parameters.pb-c.h in Headers */,
|
||||
4896D36D25FE2A3D00717702 /* Arm82Vec.hpp in Headers */,
|
||||
|
@ -3161,6 +3187,7 @@
|
|||
92FF036423AA0B5A00AC97F6 /* CPUUnravelIndex.cpp in Sources */,
|
||||
92FF02C623AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */,
|
||||
4D9A935F26255BDA00F9B43C /* NeuralNetwork.pb-c.c in Sources */,
|
||||
4D0C80E32862FC4100C7CAD6 /* CoreMLOPRegister.cpp in Sources */,
|
||||
92FF02BE23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
|
||||
4A224A0B27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.cpp in Sources */,
|
||||
48608B52250632EC00CB1D71 /* GeometryComputerUtils.cpp in Sources */,
|
||||
|
@ -3387,6 +3414,7 @@
|
|||
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
|
||||
4AF4FB29269ED244005BA97B /* MNNPackedSparseQuantMatMulEpx1.S in Sources */,
|
||||
4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */,
|
||||
19D0FE75285C66F200B74B1A /* MetalLayerNorm.metal in Sources */,
|
||||
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
|
||||
92FF035323AA0B5A00AC97F6 /* CPUScatterNd.cpp in Sources */,
|
||||
48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
|
||||
|
@ -3410,6 +3438,7 @@
|
|||
92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */,
|
||||
92FF044923AA0B7100AC97F6 /* ShapeGatherND.cpp in Sources */,
|
||||
489D7AB32550FDC900AD896A /* MetalPReLU.mm in Sources */,
|
||||
19D0FE7028534C4500B74B1A /* MetalSoftmax.mm in Sources */,
|
||||
4AF4FB24269ED235005BA97B /* SparseConvInt8TiledExecutor.cpp in Sources */,
|
||||
489D7AB12550FDC900AD896A /* MetalDefine.metal in Sources */,
|
||||
48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */,
|
||||
|
@ -3446,6 +3475,7 @@
|
|||
4D6D7FD72656896D00F80814 /* SparseConvolutionTiledExecutor.cpp in Sources */,
|
||||
92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */,
|
||||
4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */,
|
||||
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */,
|
||||
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
|
||||
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
|
||||
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */,
|
||||
|
@ -3520,6 +3550,7 @@
|
|||
4888772B215B639F0079B12E /* Matrix.cpp in Sources */,
|
||||
92FF045823AA0B7100AC97F6 /* ShapeReduction.cpp in Sources */,
|
||||
92FF026D23AA0B5A00AC97F6 /* CPUMatrixBandPart.cpp in Sources */,
|
||||
19D0FE6F28534C4500B74B1A /* MetalSoftmax.metal in Sources */,
|
||||
92FF02A323AA0B5A00AC97F6 /* CPUQuantizedLogistic.cpp in Sources */,
|
||||
4838EA852611C00B0027232C /* MetalGridSample.mm in Sources */,
|
||||
489D7AAF2550FDC900AD896A /* MetalConvolutionWinograd.mm in Sources */,
|
||||
|
@ -3561,6 +3592,7 @@
|
|||
92FF036323AA0B5A00AC97F6 /* CPUScale.cpp in Sources */,
|
||||
92FF02FE23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */,
|
||||
92FF039B23AA0B5A00AC97F6 /* CommonOptFunction.cpp in Sources */,
|
||||
19D0FE77285C66F200B74B1A /* MetalLayerNorm.mm in Sources */,
|
||||
92FF02BC23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */,
|
||||
92FF02DD23AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseUint8.S in Sources */,
|
||||
92FF026323AA0B5A00AC97F6 /* CPUFloatToInt8.cpp in Sources */,
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
import sys
|
||||
import MNN
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
def createTensor(tensor):
|
||||
shape = tensor.getShape()
|
||||
|
@ -28,12 +29,28 @@ def modelTest(modelPath):
|
|||
session = net.createSession(config)
|
||||
|
||||
print("Run on backendtype: %d \n" % net.getSessionInfo(session, 2))
|
||||
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
#change to rgb format
|
||||
image = cv2.resize(image, (224, 224))
|
||||
#resize to mobile_net tensor size
|
||||
image = image - (103.94, 116.78, 123.68)
|
||||
image = image * (0.017, 0.017, 0.017)
|
||||
#preprocess it
|
||||
image = image.transpose((2, 0, 1))
|
||||
#change numpy data type as np.float32 to match tensor's format
|
||||
image = image.astype(np.float32)
|
||||
#cv2 read shape is NHWC, Tensor's need is NCHW,transpose it
|
||||
tmp_input = MNN.Tensor((1, 3, 224, 224), MNN.Halide_Type_Float,\
|
||||
image, MNN.Tensor_DimensionType_Caffe)
|
||||
|
||||
allInput = net.getSessionInputAll(session)
|
||||
# input
|
||||
inputTensor = net.getSessionInput(session)
|
||||
inputHost = createTensor(inputTensor)
|
||||
inputTensor.copyFrom(inputHost)
|
||||
net.resizeTensor(inputTensor, (1, 3, 224, 224))
|
||||
net.resizeSession(session)
|
||||
inputTensor.copyFrom(tmp_input)
|
||||
# infer
|
||||
net.runSession(session)
|
||||
outputTensor = net.getSessionOutput(session)
|
||||
|
@ -43,6 +60,8 @@ def modelTest(modelPath):
|
|||
outputTensor.copyToHostTensor(outputHost)
|
||||
|
||||
net.updateCacheFile(session, 0)
|
||||
print("expect 983")
|
||||
print("output belong to class: {}".format(np.argmax(outputHost.getData())))
|
||||
|
||||
if __name__ == '__main__':
|
||||
modelName = sys.argv[1] # model path
|
||||
|
|
|
@ -5,15 +5,20 @@ from __future__ import print_function
|
|||
import numpy as np
|
||||
import MNN
|
||||
import cv2
|
||||
import sys
|
||||
|
||||
def inference():
|
||||
""" inference mobilenet_v1 using a specific picture """
|
||||
interpreter = MNN.Interpreter("mobilenet_v1.mnn")
|
||||
interpreter = MNN.Interpreter(sys.argv[1])
|
||||
interpreter.setCacheFile('.tempcache')
|
||||
config = {}
|
||||
config['precision'] = 'low'
|
||||
session = interpreter.createSession()
|
||||
input_tensor = interpreter.getSessionInput(session)
|
||||
image = cv2.imread('0000.jpg')
|
||||
interpreter.resizeTensor(input_tensor, (1, 3, 224, 224))
|
||||
interpreter.resizeSession(session)
|
||||
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
#change to rgb format
|
||||
|
|
|
@ -5,9 +5,11 @@ from __future__ import print_function
|
|||
import numpy as np
|
||||
import MNN
|
||||
import cv2
|
||||
import sys
|
||||
|
||||
def inference():
|
||||
""" inference mobilenet_v1 using a specific picture """
|
||||
interpreter = MNN.Interpreter("mobilenet_v1.mnn")
|
||||
interpreter = MNN.Interpreter(sys.argv[1])
|
||||
interpreter.setCacheFile('.tempcache')
|
||||
config = {}
|
||||
config['precision'] = 'low'
|
||||
|
@ -23,7 +25,10 @@ def inference():
|
|||
print('backend_info: %d' % interpreter.getSessionInfo(session, 2))
|
||||
|
||||
input_tensor = interpreter.getSessionInput(session)
|
||||
image = cv2.imread('ILSVRC2012_val_00049999.JPEG')
|
||||
interpreter.resizeTensor(input_tensor, (1, 3, 224, 224))
|
||||
interpreter.resizeSession(session)
|
||||
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
#change to rgb format
|
||||
|
@ -46,5 +51,6 @@ def inference():
|
|||
output_tensor.copyToHostTensor(tmp_output)
|
||||
print("expect 983")
|
||||
print("output belong to class: {}".format(np.argmax(tmp_output.getData())))
|
||||
|
||||
if __name__ == "__main__":
|
||||
inference()
|
||||
|
|
|
@ -4,13 +4,15 @@ from __future__ import print_function
|
|||
import numpy as np
|
||||
import MNN
|
||||
import sys
|
||||
import cv2
|
||||
|
||||
def inference():
|
||||
""" inference mobilenet_v1 using a specific picture """
|
||||
|
||||
config = {}
|
||||
config['precision'] = 'low'
|
||||
config['backend'] = 3
|
||||
config['numThread'] = 4
|
||||
config['numThread'] = 4
|
||||
|
||||
rt = MNN.nn.create_runtime_manager((config,))
|
||||
rt.set_cache(".cachefile")
|
||||
|
@ -19,13 +21,26 @@ def inference():
|
|||
# set_hint(type, value) //type 0 for "tune_num"
|
||||
rt.set_hint(0, 20)
|
||||
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["data"], ["prob"], runtime_manager=rt)
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"], runtime_manager=rt)
|
||||
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
#change to rgb format
|
||||
image = cv2.resize(image, (224, 224))
|
||||
#resize to mobile_net tensor size
|
||||
image = image - (103.94, 116.78, 123.68)
|
||||
image = image * (0.017, 0.017, 0.017)
|
||||
#change numpy data type as np.float32 to match tensor's format
|
||||
image = image.astype(np.float32)
|
||||
input_var = MNN.expr.placeholder([1, 224, 224, 3], MNN.expr.NHWC)
|
||||
input_var.write(image)
|
||||
input_var = MNN.expr.convert(input_var, MNN.expr.NC4HW4)
|
||||
#inference
|
||||
output_var = net.forward(input_var)
|
||||
#output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
|
||||
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
|
||||
print("expect 983")
|
||||
print("output belong to class: {}".format(np.argmax(output_var.read())))
|
||||
|
||||
rt.update_cache()
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -6,9 +6,10 @@ import MNN.numpy as np
|
|||
import MNN
|
||||
import MNN.cv as cv2
|
||||
import sys
|
||||
|
||||
def inference():
|
||||
""" inference mobilenet_v1 using a specific picture """
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["data"], ["prob"])
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"])
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
|
@ -27,6 +28,8 @@ def inference():
|
|||
output_var = net.forward(input_var)
|
||||
#the output from net may be NC4HW4, turn to linear layout
|
||||
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
|
||||
print("expect 983")
|
||||
print("output belong to class: {}".format(np.argmax(output_var)))
|
||||
|
||||
if __name__ == "__main__":
|
||||
inference()
|
||||
|
|
|
@ -6,9 +6,10 @@ import numpy as np
|
|||
import MNN
|
||||
import cv2
|
||||
import sys
|
||||
|
||||
def inference():
|
||||
""" inference mobilenet_v1 using a specific picture """
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["data"], ["prob"])
|
||||
net = MNN.nn.load_module_from_file(sys.argv[1], ["input"], ["MobilenetV1/Predictions/Reshape_1"])
|
||||
image = cv2.imread(sys.argv[2])
|
||||
#cv2 read as bgr format
|
||||
image = image[..., ::-1]
|
||||
|
@ -28,6 +29,8 @@ def inference():
|
|||
output_var = net.forward(input_var)
|
||||
#the output from net may be NC4HW4, turn to linear layout
|
||||
output_var = MNN.expr.convert(output_var, MNN.expr.NHWC)
|
||||
print("expect 983")
|
||||
print("output belong to class: {}".format(np.argmax(output_var.read())))
|
||||
|
||||
if __name__ == "__main__":
|
||||
inference()
|
||||
|
|
|
@ -0,0 +1,200 @@
|
|||
from __future__ import print_function
|
||||
import time
|
||||
import argparse
|
||||
import numpy as np
|
||||
import tqdm
|
||||
import os
|
||||
import MNN
|
||||
from PIL import Image
|
||||
|
||||
nn = MNN.nn
|
||||
F = MNN.expr
|
||||
|
||||
|
||||
# adapted from pycaffe
|
||||
def load_image(filename, color=True):
|
||||
"""
|
||||
Load an image converting from grayscale or alpha as needed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : string
|
||||
color : boolean
|
||||
flag for color format. True (default) loads as RGB while False
|
||||
loads as intensity (if image is already grayscale).
|
||||
|
||||
Returns
|
||||
-------
|
||||
image : an image with type np.float32 in range [0, 1]
|
||||
of size (H x W x 3) in RGB or
|
||||
of size (H x W x 1) in grayscale.
|
||||
"""
|
||||
img = Image.open(filename)
|
||||
img = np.array(img)
|
||||
if img.ndim == 2:
|
||||
img = img[:, :, np.newaxis]
|
||||
if color:
|
||||
img = np.tile(img, (1, 1, 3))
|
||||
elif img.shape[2] == 4:
|
||||
img = img[:, :, :3]
|
||||
return img
|
||||
|
||||
|
||||
def center_crop(image_data, crop_factor):
|
||||
height, width, channels = image_data.shape
|
||||
|
||||
h_size = int(height * crop_factor)
|
||||
h_start = int((height - h_size) / 2)
|
||||
h_end = h_start + h_size
|
||||
|
||||
w_size = int(width * crop_factor)
|
||||
w_start = int((width - w_size) / 2)
|
||||
w_end = w_start + w_size
|
||||
|
||||
cropped_image = image_data[h_start:h_end, w_start:w_end, :]
|
||||
|
||||
return cropped_image
|
||||
|
||||
|
||||
def resize_image(image, shape):
|
||||
im = Image.fromarray(image)
|
||||
im = im.resize(shape)
|
||||
resized_image = np.array(im)
|
||||
|
||||
return resized_image
|
||||
|
||||
|
||||
class CalibrationDataset(MNN.data.Dataset):
|
||||
'''
|
||||
This is demo for Imagenet calibration dataset. like pytorch, you need to overload __getiterm__ and __len__ methods
|
||||
__getiterm__ should return a sample in F.const, and you should not use batch dimension here
|
||||
__len__ should return the number of total samples in the calibration dataset
|
||||
'''
|
||||
def __init__(self, image_folder):
|
||||
super(CalibrationDataset, self).__init__()
|
||||
self.image_folder = image_folder
|
||||
self.image_list = os.listdir(image_folder)[0:64]
|
||||
|
||||
def __getitem__(self, index):
|
||||
image_name = os.path.join(self.image_folder, self.image_list[index].split(' ')[0])
|
||||
|
||||
|
||||
# preprocess your data here, the following code are for tensorflow mobilenets
|
||||
image_data = load_image(image_name)
|
||||
image_data = center_crop(image_data, 0.875)
|
||||
image_data = resize_image(image_data, (224, 224))
|
||||
image_data = (image_data - 127.5) / 127.5
|
||||
|
||||
|
||||
# after preprocessing the data, convert it to MNN data structure
|
||||
dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC, F.dtype.float)
|
||||
|
||||
'''
|
||||
first list for inputs, and may have many inputs, so it's a list
|
||||
if your model have more than one inputs, add the preprocessed MNN const data to the input list
|
||||
|
||||
second list for targets, also, there may be more than one targets
|
||||
for calibration dataset, we don't need labels, so leave it blank
|
||||
|
||||
Note that, the input order in the first list should be the same in your 'config.yaml' file.
|
||||
'''
|
||||
|
||||
return [dv], []
|
||||
|
||||
def __len__(self):
|
||||
# size of the dataset
|
||||
return len(self.image_list)
|
||||
|
||||
|
||||
def get_mnn_format(format_str):
|
||||
fmt = str.lower(format_str)
|
||||
if fmt == 'nchw':
|
||||
return F.NCHW
|
||||
elif fmt == 'nhwc':
|
||||
return F.NHWC
|
||||
elif fmt == 'nc4hw4':
|
||||
return F.NC4HW4
|
||||
else:
|
||||
raise ValueError("unknown format:", format_str)
|
||||
|
||||
def quant_func(net, dataloader, opt):
|
||||
net.train(True)
|
||||
dataloader.reset()
|
||||
|
||||
t0 = time.time()
|
||||
for i in tqdm.trange(dataloader.iter_number):
|
||||
example = dataloader.next()
|
||||
input_data = example[0]
|
||||
predicts = net.forward(input_data)
|
||||
# fake update
|
||||
opt.step(F.const([0.0], []))
|
||||
for predict in predicts:
|
||||
predict.read()
|
||||
|
||||
t1 = time.time()
|
||||
cost = t1 - t0
|
||||
print("Epoch cost: %.3f s." % cost)
|
||||
|
||||
return cost
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--mnn_model", type=str, required=True,\
|
||||
help="original float MNN model file")
|
||||
parser.add_argument("--quant_imgs", type=str, required=True, \
|
||||
help="path of quant images")
|
||||
parser.add_argument("--quant_model", type=str, required=True, \
|
||||
help="name of quantized model to save")
|
||||
parser.add_argument("--batch_size", type=int, required=False, default=32,\
|
||||
help="calibration batch size")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
mnn_model = args.mnn_model
|
||||
quant_imgs = args.quant_imgs
|
||||
quant_model = args.quant_model
|
||||
batch_size = args.batch_size
|
||||
|
||||
calibration_dataset = CalibrationDataset(image_folder=quant_imgs)
|
||||
|
||||
dataloader = MNN.data.DataLoader(calibration_dataset, batch_size=batch_size, shuffle=True)
|
||||
|
||||
m = F.load_as_dict(mnn_model)
|
||||
|
||||
inputs_outputs = F.get_inputs_and_outputs(m)
|
||||
for key in inputs_outputs[0].keys():
|
||||
print('input names:\t', key)
|
||||
for key in inputs_outputs[1].keys():
|
||||
print('output names:\t', key)
|
||||
|
||||
# set inputs and outputs
|
||||
inputs = [m['input']]
|
||||
outputs = [m['MobilenetV2/Predictions/Reshape_1']]
|
||||
input_placeholders = []
|
||||
for i in range(len(inputs)):
|
||||
shape = [1, 3, 224, 224]
|
||||
fmt = 'nchw'
|
||||
mnn_format = get_mnn_format(fmt)
|
||||
placeholder = F.placeholder(shape, mnn_format)
|
||||
placeholder.name = 'input'
|
||||
input_placeholders.append(placeholder)
|
||||
|
||||
net = nn.load_module(inputs, outputs, True)
|
||||
|
||||
# the optimizer is not used for real training; it only drives the fake update step in quant_func
|
||||
opt = MNN.optim.SGD(net, 0.01, 0.9, 0.0005)
|
||||
|
||||
nn.compress.train_quant(net, quant_bits=8)
|
||||
|
||||
used_time = quant_func(net, dataloader, opt)
|
||||
|
||||
# save model
|
||||
net.train(False)
|
||||
predicts = net.forward(input_placeholders)
|
||||
print("quantized model save to " + quant_model)
|
||||
F.save(predicts, quant_model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -6,7 +6,6 @@ from dataset import MnistDataset
|
|||
nn = MNN.nn
|
||||
F = MNN.expr
|
||||
|
||||
|
||||
class Net(nn.Module):
|
||||
"""construct a lenet 5 model"""
|
||||
def __init__(self):
|
||||
|
@ -51,7 +50,6 @@ def test_func(net, test_dataloader):
|
|||
|
||||
print("test acc: ", correct * 100.0 / test_dataloader.size, "%")
|
||||
|
||||
|
||||
def train_func(net, train_dataloader, opt):
|
||||
"""train function"""
|
||||
net.train(True)
|
||||
|
@ -99,8 +97,7 @@ def demo():
|
|||
opt = MNN.optim.SGD(model, 0.01, 0.9, 0.0005)
|
||||
|
||||
F.set_thread_number(4)
|
||||
|
||||
for epoch in range(0, 10):
|
||||
for epoch in range(0, 1):
|
||||
opt.learning_rate = learning_rate_scheduler(opt.learning_rate, epoch)
|
||||
train_func(model, train_dataloader, opt)
|
||||
|
||||
|
|
|
@ -204,6 +204,7 @@ static PyObject* PyMNNInterpreter_setSessionHint(PyMNNInterpreter *self, PyObjec
|
|||
static PyObject* PyMNNInterpreter_cache(PyMNNInterpreter *self, PyObject *args);
|
||||
static PyObject* PyMNNInterpreter_removeCache(PyMNNInterpreter *self, PyObject *args);
|
||||
static PyObject* PyMNNInterpreter_updateSessionToModel(PyMNNInterpreter *self, PyObject *args);
|
||||
static PyObject* PyMNNInterpreter_getModelVersion(PyMNNInterpreter *self, PyObject *args);
|
||||
static PyObject* PyMNNInterpreter_new(struct _typeobject *type, PyObject *args, PyObject *kwds);
|
||||
static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObject *kwds);
|
||||
static void PyMNNInterpreter_dealloc(PyMNNInterpreter *);
|
||||
|
@ -232,6 +233,7 @@ static PyMethodDef PyMNNInterpreter_methods[] = {
|
|||
{"cache", (PyCFunction)PyMNNInterpreter_cache, METH_VARARGS, "cache current net instance"},
|
||||
{"removeCache", (PyCFunction)PyMNNInterpreter_removeCache, METH_VARARGS, "remove cache with given path"},
|
||||
{"updateSessionToModel", (PyCFunction)PyMNNInterpreter_updateSessionToModel, METH_VARARGS, "updateSessionToModel"},
|
||||
{"getModelVersion", (PyCFunction)PyMNNInterpreter_getModelVersion, METH_VARARGS, "getModelVersion"},
|
||||
#ifdef PYMNN_INTERNAL_SERVING
|
||||
{"createSessionWithToken", (PyCFunction)PyMNNInterpreter_createSessionWithToken, METH_VARARGS, "create session with token"},
|
||||
#endif
|
||||
|
@ -1410,6 +1412,10 @@ static PyObject* PyMNNInterpreter_updateSessionToModel(PyMNNInterpreter *self, P
|
|||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
static PyObject* PyMNNInterpreter_getModelVersion(PyMNNInterpreter *self, PyObject *args) {
|
||||
return toPyObj(self->interpreter->getModelVersion());
|
||||
}
|
||||
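With `getModelVersion` exposed on the interpreter and `version` registered on the module, the Python side can presumably query both the version written into a model file and the version of the runtime. A hedged sketch (the model path is a placeholder):

```python
import MNN

interp = MNN.Interpreter("your_model.mnn")  # placeholder path
print(interp.getModelVersion())             # version recorded in the model, presumably from its ExtraInfo
print(MNN.version())                        # version of the installed MNN runtime
```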
|
||||
static void PyMNNInterpreter_dealloc(PyMNNInterpreter *self) {
|
||||
if (!self->modelPath) {
|
||||
return;
|
||||
|
@ -2522,7 +2528,7 @@ static void PyMNNOpInfo_dealloc(PyMNNOpInfo *self) {
|
|||
}
|
||||
|
||||
#ifdef PYMNN_TRAIN_API
|
||||
static PyObject* PyMN_get_model_uuid(PyObject *self, PyObject *args) {
|
||||
static PyObject* PyMNN_get_model_uuid(PyObject *self, PyObject *args) {
|
||||
const char* modelFile;
|
||||
if (!PyArg_ParseTuple(args, "s", &modelFile)) {
|
||||
printf("PyArg_ParseTuple Error\n");
|
||||
|
@ -2531,11 +2537,15 @@ static PyObject* PyMN_get_model_uuid(PyObject *self, PyObject *args) {
|
|||
return toPyObj(HelperFuncs::getModelUUID(modelFile));
|
||||
}
|
||||
#endif
|
||||
static PyObject* PyMNN_version(PyObject *self, PyObject *args) {
|
||||
return toPyObj(MNN::getVersion());
|
||||
}
|
||||
/// module init
|
||||
static PyMethodDef module_methods[] = {
|
||||
#ifdef PYMNN_TRAIN_API
|
||||
{"get_model_uuid", (PyCFunction)PyMN_get_model_uuid, METH_VARARGS, "get model's uuid"},
|
||||
{"get_model_uuid", (PyCFunction)PyMNN_get_model_uuid, METH_VARARGS, "get model's uuid"},
|
||||
#endif
|
||||
{"version", (PyCFunction)PyMNN_version, METH_VARARGS, "get MNN version number"},
|
||||
{NULL, NULL, 0, NULL}
|
||||
};
|
||||
|
||||
|
|
|
@ -7,21 +7,23 @@ def_enum(Scale_Update_Method, NN::ScaleUpdateMethod,
|
|||
NN::Maximum, "MAXIMUM",
|
||||
NN::MovingAverage, "MOVING_AVERAGE"
|
||||
)
|
||||
static PyObject* PyMNNCompress_train_quant(PyMNNOptimizer *self, PyObject *args) {
|
||||
PyObject *module,
|
||||
*feature_scale_method = toPyObj(NN::PerTensor),
|
||||
*scale_update_method = toPyObj(NN::MovingAverage);
|
||||
static PyObject* PyMNNCompress_train_quant(PyMNNOptimizer *self, PyObject *args, PyObject *kwargs) {
|
||||
PyObject *module = nullptr,
|
||||
*feature_scale_method = nullptr /* PerTensor */,
|
||||
*scale_update_method = nullptr /* MovingAverage */;
|
||||
int quant_bits = 8;
|
||||
if (!PyArg_ParseTuple(args, "O|iOO", &module, &quant_bits,
|
||||
&feature_scale_method, &scale_update_method)) {
|
||||
Py_RETURN_NONE;
|
||||
static char *kwlist[] = { "module", "quant_bits", "feature_scale_method", "scale_update_method", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iOO", kwlist, &module, &quant_bits, &feature_scale_method, &scale_update_method)) {
|
||||
PyMNN_ERROR("train_quant require args: (Module, |int, Feature_Scale_Method, Scale_Update_Method)");
|
||||
}
|
||||
auto feature_scale_method_ = toEnum<NN::FeatureScaleStatMethod>(feature_scale_method);
|
||||
auto scale_update_method_ = toEnum<NN::ScaleUpdateMethod>(scale_update_method);
|
||||
return toPyObj(NN::turnQuantize(to_Module(module), quant_bits, feature_scale_method_, scale_update_method_));
|
||||
auto feature_scale_method_ = feature_scale_method == nullptr ? NN::PerTensor :
|
||||
toEnum<NN::FeatureScaleStatMethod>(feature_scale_method);
|
||||
auto scale_update_method_ = scale_update_method == nullptr ? NN::MovingAverage :
|
||||
toEnum<NN::ScaleUpdateMethod>(scale_update_method);
|
||||
return toPyObj(NN::turnQuantize(to_Module(module)->get(), quant_bits, feature_scale_method_, scale_update_method_));
|
||||
}
|
||||
static PyMethodDef PyMNNCompress_methods[] = {
|
||||
register_methods(Compress,
|
||||
register_methods_kw(Compress,
|
||||
train_quant, "train_quant"
|
||||
)
|
||||
};
|
||||
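Since `train_quant` now goes through `PyArg_ParseTupleAndKeywords`, the quantization options can be passed by name. A sketch under the assumption that the enums are exposed under `MNN.nn.compress` with the names declared in the `def_enum` above; when the two enum arguments are omitted, the binding falls back to PerTensor and MovingAverage:

```python
import MNN
nn = MNN.nn

# keyword names follow the kwlist: module, quant_bits, feature_scale_method, scale_update_method;
# the enum attribute paths below are an assumption
nn.compress.train_quant(net, quant_bits=8,
                        feature_scale_method=nn.compress.Feature_Scale_Method.PER_TENSOR,
                        scale_update_method=nn.compress.Scale_Update_Method.MOVING_AVERAGE)
```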
|
|
137
pymnn/src/data.h
|
@ -7,26 +7,118 @@ static PyObject* toPyObj(Example example) {
|
|||
PyList_SetItem(ret, 1, toPyObj<VARP, toPyObj>(example.second));
|
||||
return ret;
|
||||
}
|
||||
def_class_start(Dataset, Dataset)
|
||||
def_class_without_getset(Dataset)
|
||||
def_class_methods(Dataset,
|
||||
__getitem__, "get item: []",
|
||||
__len__, "get length: len()"
|
||||
)
|
||||
def_class_end(Dataset, Dataset)
|
||||
// class DataSet impl
|
||||
class_basic_new_impl(Dataset)
|
||||
static PyObject* PyMNNDataset___getitem__(PyMNNDataset *self, PyObject *args) {
|
||||
int index;
|
||||
if (!PyArg_ParseTuple(args, "i", &index)) {
|
||||
Py_RETURN_NONE;
|
||||
class DatasetWrapper : public Dataset {
|
||||
public:
|
||||
using Dataset::Dataset;
|
||||
DatasetWrapper(PyObject* py_dataset) {
|
||||
Py_INCREF(py_dataset);
|
||||
this->py_dataset = py_dataset;
|
||||
}
|
||||
return toPyObj(self->ptr->get(index));
|
||||
~DatasetWrapper() {
|
||||
if (py_dataset) {
|
||||
Py_DECREF(py_dataset);
|
||||
}
|
||||
}
|
||||
Example get(size_t index) override {
|
||||
auto getfunc = PyObject_GetAttrString(py_dataset, "__getitem__");
|
||||
auto arg = PyTuple_New(1);
|
||||
PyTuple_SetItem(arg, 0, PyLong_FromLong(index));
|
||||
auto res = PyEval_CallObject(getfunc, arg);
|
||||
Py_DECREF(arg);
|
||||
Py_DECREF(getfunc);
|
||||
// res to Example
|
||||
auto py_example = PyTuple_GetItem(res, 0);
|
||||
auto py_example_second = PyTuple_GetItem(res, 1);
|
||||
auto example = std::make_pair(
|
||||
toVars(py_example),
|
||||
toVars(py_example_second)
|
||||
);
|
||||
Py_DECREF(res);
|
||||
return example;
|
||||
}
|
||||
size_t size() override {
|
||||
auto sizefunc = PyObject_GetAttrString(py_dataset, "__len__");
|
||||
auto res = PyEval_CallObject(sizefunc, NULL);
|
||||
Py_DECREF(sizefunc);
|
||||
auto size = toInt(res);
|
||||
Py_DECREF(res);
|
||||
return size;
|
||||
}
|
||||
private:
|
||||
PyObject *py_dataset = nullptr;
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
std::shared_ptr<Dataset>* ptr;
|
||||
} PyMNNDataset;
|
||||
|
||||
static PyObject* PyMNNDataset_new(struct _typeobject *type, PyObject *args, PyObject *kwds) {
|
||||
PyMNNDataset* self = (PyMNNDataset *)type->tp_alloc(type, 0);
|
||||
return (PyObject*)self;
|
||||
}
|
||||
static PyObject* PyMNNDataset___len__(PyMNNDataset *self, PyObject *args) {
|
||||
return toPyObj((int)self->ptr->size());
|
||||
|
||||
static int PyMNNDataset_init(PyMNNDataset *self, PyObject *args, PyObject *kwds) {
|
||||
self->ptr = new std::shared_ptr<Dataset>(new DatasetWrapper((PyObject*)self));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void PyMNNDataset_dealloc(PyMNNDataset *self) {
|
||||
if (self->ptr) {
|
||||
// delete self->ptr;
|
||||
self->ptr->reset();
|
||||
}
|
||||
Py_TYPE(self)->tp_free((PyObject *)self);
|
||||
}
|
||||
|
||||
static PyTypeObject PyMNNDatasetType = {
|
||||
PyVarObject_HEAD_INIT(NULL, 0)
|
||||
"Dataset", /*tp_name*/
|
||||
sizeof(PyMNNDataset), /*tp_basicsize*/
|
||||
0, /*tp_itemsize*/
|
||||
(destructor)PyMNNDataset_dealloc, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
0, /*tp_getattr*/
|
||||
0, /*tp_setattr*/
|
||||
0, /*tp_compare*/
|
||||
0, /*tp_repr*/
|
||||
0, /*tp_as_number*/
|
||||
0, /*tp_as_sequence*/
|
||||
0, /*tp_as_mapping*/
|
||||
0, /*tp_hash */
|
||||
0, /*tp_call*/
|
||||
0, /*tp_str*/
|
||||
0, /*tp_getattro*/
|
||||
0, /*tp_setattro*/
|
||||
0, /*tp_as_buffer*/
|
||||
// Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE, /*tp_flags*/
|
||||
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
|
||||
"Dataset", /* tp_doc */
|
||||
0, /* tp_traverse */
|
||||
0, /* tp_clear */
|
||||
0, /* tp_richcompare */
|
||||
0, /* tp_weaklistoffset */
|
||||
0, /* tp_iter */
|
||||
0, /* tp_iternext */
|
||||
0, /* tp_methods */
|
||||
0, /* tp_members */
|
||||
0, /* tp_getset */
|
||||
0, /* tp_base */
|
||||
0, /* tp_dict */
|
||||
0, /* tp_descr_get */
|
||||
0, /* tp_descr_set */
|
||||
0, /* tp_dictoffset */
|
||||
(initproc)PyMNNDataset_init, /* tp_init */
|
||||
0, /* tp_alloc */
|
||||
PyMNNDataset_new, /* tp_new */
|
||||
};
|
||||
|
||||
static std::shared_ptr<Dataset> toDataset(PyObject* m) {
|
||||
return *((PyMNNDataset*)m)->ptr;
|
||||
}
|
||||
|
||||
def_class_register(Dataset)
|
||||
|
||||
// class DataLoader def
|
||||
def_class_start(DataLoader, DataLoader)
|
||||
def_class_getset(
|
||||
|
@ -40,14 +132,17 @@ def_class_methods(DataLoader,
|
|||
)
|
||||
def_class_end(DataLoader, DataLoader)
|
||||
// class DataLoader impl
|
||||
static PyObject* PyMNNDataLoader_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
|
||||
PyObject* dataset;
|
||||
class_basic_call_impl(DataLoader)
|
||||
class_basic_init_impl(DataLoader)
|
||||
static PyObject* PyMNNDataLoader_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
|
||||
PyObject* dataset = nullptr;
|
||||
int batch_size, num_workers = 0;
|
||||
int shuffle = 1;
|
||||
if (!PyArg_ParseTuple(args, "Oi|ii", &dataset, &batch_size, &shuffle, &num_workers)) {
|
||||
Py_RETURN_NONE;
|
||||
static char *kwlist[] = { "dataset", "batch_size", "shuffle", "num_workers", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi|ii", kwlist, &dataset, &batch_size, &shuffle, &num_workers)) {
|
||||
PyMNN_ERROR("DataLoader require args: Dataset, int, |int, int)");
|
||||
}
|
||||
std::shared_ptr<Dataset> dataset_(toDataset(dataset));
|
||||
std::shared_ptr<Dataset> dataset_ = std::move(toDataset(dataset));
|
||||
PyMNNDataLoader *self = (PyMNNDataLoader *)type->tp_alloc(type, 0);
|
||||
self->ptr = DataLoader::makeDataLoader(dataset_, batch_size, true, shuffle, num_workers);
|
||||
return (PyObject*)self;
|
||||
|
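The `DataLoader` constructor now also accepts keywords, with names taken from the kwlist above. A usage sketch, reusing the calibration dataset from the offline-quant example earlier:

```python
dataloader = MNN.data.DataLoader(calibration_dataset, batch_size=32, shuffle=True, num_workers=0)
```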
|
|
@ -630,8 +630,11 @@ static PyObject* PyMNNVar_repr(PyObject *self) {
|
|||
#else
|
||||
auto content = PyMNNVar_read_as_tuple((PyMNNVar*)self, NULL);
|
||||
#endif
|
||||
auto repr = PyObject_GetAttrString(content, "__repr__");
|
||||
return PyEval_CallObject(repr, NULL);
|
||||
auto reprfunc = PyObject_GetAttrString(content, "__repr__");
|
||||
auto str = PyEval_CallObject(reprfunc, NULL);
|
||||
Py_DECREF(content);
|
||||
Py_DECREF(reprfunc);
|
||||
return str;
|
||||
}
|
||||
// PyMNNVar getter/setter functions impl
|
||||
static PyObject* PyMNNVar_getshape(PyMNNVar *self, void *closure) {
|
||||
|
@ -921,6 +924,22 @@ static PyObject* PyMNNVar_write(PyMNNVar *self, PyObject *args) {
|
|||
Py_RETURN_NONE;
|
||||
}
|
||||
// Expr methods
|
||||
static PyObject* PyMNNExpr_set_thread_number(PyObject *self, PyObject *args) {
|
||||
int numberThread;
|
||||
if (!PyArg_ParseTuple(args, "i", &numberThread)) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
if (numberThread < 1) {
|
||||
numberThread = 1;
|
||||
}
|
||||
if (numberThread > 8) {
|
||||
numberThread = 8;
|
||||
}
|
||||
auto exe = Executor::getGlobalExecutor();
|
||||
BackendConfig config;
|
||||
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, numberThread);
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
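On the Python side this is reached as `F.set_thread_number`; note the binding clamps the value into the range 1..8 before configuring the global executor:

```python
F.set_thread_number(4)    # use 4 CPU threads for expression evaluation
F.set_thread_number(100)  # values above 8 are clamped to 8 by the binding
```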
static PyObject* PyMNNExpr_load_as_list(PyObject *self, PyObject *args) {
|
||||
const char *fileName;
|
||||
if (!PyArg_ParseTuple(args, "s", &fileName)) {
|
||||
|
@ -1558,6 +1577,7 @@ static PyMethodDef PyMNNExpr_methods[] = {
|
|||
)
|
||||
register_methods(Expr,
|
||||
// Var methods
|
||||
set_thread_number, "set threan number of expr.",
|
||||
load_as_list, "load file as var list.",
|
||||
save, "save vars to file.",
|
||||
load_as_dict, "load file as var dict.",
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#endif
|
||||
|
||||
// NN Module Start
|
||||
def_class_start(_Module, Module)
|
||||
def_class_smart_start(_Module, Module)
|
||||
def_class_getset(
|
||||
_Module,
|
||||
name, 0,
|
||||
|
@ -28,7 +28,7 @@ def_class_methods(_Module,
|
|||
_register_submodules, "register submodules",
|
||||
_add_parameter, "add parameter"
|
||||
)
|
||||
def_class_end(_Module, Module)
|
||||
def_class_smart_end(_Module, Module)
|
||||
|
||||
// NN RuntimeManager Start
|
||||
def_class_smart_start(RuntimeManager, Executor::RuntimeManager)
|
||||
|
@ -40,6 +40,7 @@ def_class_methods(RuntimeManager,
|
|||
)
|
||||
def_class_without_getset(RuntimeManager)
|
||||
def_class_smart_end(RuntimeManager, Executor::RuntimeManager)
|
||||
class_basic_call_impl(RuntimeManager)
|
||||
|
||||
static PyObject* load_module(PyObject *runtime_manager, PyObject *inputs, PyObject *outputs,
|
||||
MNNForwardType backend, MemoryMode memory_mode, PowerMode power_mode, PrecisionMode precision_mode,
|
||||
|
@ -84,9 +85,10 @@ static PyObject* load_module(PyObject *runtime_manager, PyObject *inputs, PyObje
|
|||
return toPyObj(m_ptr);
|
||||
}
|
||||
|
||||
class_basic_init_impl(_Module)
|
||||
static PyObject* PyMNN_Module_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
|
||||
PyMNN_Module *self = (PyMNN_Module *)type->tp_alloc(type, 0);
|
||||
self->ptr = Module::createEmpty({});
|
||||
self->ptr = new std::shared_ptr<Module>(Module::createEmpty({}));
|
||||
return (PyObject*)self;
|
||||
}
|
||||
|
||||
|
@ -99,19 +101,19 @@ static PyObject* PyMNNRuntimeManager_new(PyTypeObject *type, PyObject *args, PyO
|
|||
// PyMNN_Module getter/setter impl
|
||||
static PyObject* PyMNN_Module_getname(PyMNN_Module *self, void *closure) {
|
||||
if (self->ptr) {
|
||||
return toPyObj(self->ptr->name());
|
||||
return toPyObj((*(self->ptr))->name());
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
static PyObject* PyMNN_Module_getis_training(PyMNN_Module *self, void *closure) {
|
||||
if (self->ptr) {
|
||||
return toPyObj(self->ptr->getIsTraining());
|
||||
return toPyObj((*(self->ptr))->getIsTraining());
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
static PyObject* PyMNN_Module_getparameters(PyMNN_Module *self, void *closure) {
|
||||
if (self->ptr) {
|
||||
return toPyObj<VARP, toPyObj>(self->ptr->parameters());
|
||||
return toPyObj<VARP, toPyObj>((*(self->ptr))->parameters());
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
@ -134,7 +136,7 @@ static PyObject* PyMNN_Module_forward(PyMNN_Module *self, PyObject *args) {
|
|||
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward");
|
||||
return toPyObj<VARP, toPyObj>(vars);
|
||||
#else
|
||||
return toPyObj<VARP, toPyObj>(self->ptr->onForward(toVars(input)));
|
||||
return toPyObj<VARP, toPyObj>((*(self->ptr))->onForward(toVars(input)));
|
||||
#endif
|
||||
}
|
||||
if (isVar(input)) {
|
||||
|
@ -145,7 +147,7 @@ static PyObject* PyMNN_Module_forward(PyMNN_Module *self, PyObject *args) {
|
|||
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward");
|
||||
return toPyObj(var);
|
||||
#else
|
||||
return toPyObj(self->ptr->forward(toVar(input)));
|
||||
return toPyObj((*(self->ptr))->forward(toVar(input)));
|
||||
#endif
|
||||
}
|
||||
PyMNN_ERROR("PyMNN_Module_forward: args must be Var/[Var].");
|
||||
|
@ -158,25 +160,29 @@ static PyObject* PyMNN_Module_onForward(PyMNN_Module *self, PyObject *args) {
|
|||
#ifdef PYMNN_INTERNAL_SERVING
|
||||
int status = 0;
|
||||
Timer timer;
|
||||
auto vars = self->ptr->onForward(toVars(inputs));
|
||||
auto vars = (*(self->ptr))->onForward(toVars(inputs));
|
||||
if (vars.empty()) {
|
||||
PyMNN_ERROR("module onForward occur error.");
|
||||
status = -1;
|
||||
}
|
||||
|
||||
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_onForward");
|
||||
(void) MonitorService::GetInstance().EventTrack(self->ptr->get(), timer, status, "PyMNN_Module_onForward");
|
||||
return toPyObj<VARP, toPyObj>(vars);
|
||||
#else
|
||||
return toPyObj<VARP, toPyObj>(self->ptr->onForward(toVars(inputs)));
|
||||
return toPyObj<VARP, toPyObj>((*(self->ptr))->onForward(toVars(inputs)));
|
||||
#endif
|
||||
}
|
||||
|
||||
static PyObject* PyMNN_Module_call(PyObject *self, PyObject *args, PyObject *kwds) {
|
||||
return PyMNN_Module_forward((PyMNN_Module*)self, args);
|
||||
}
|
||||
|
||||
static PyObject* PyMNN_Module_set_name(PyMNN_Module *self, PyObject *args) {
|
||||
const char* name;
|
||||
if (!PyArg_ParseTuple(args, "s", &name)) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
self->ptr->setName(name);
|
||||
(*(self->ptr))->setName(name);
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
static PyObject* PyMNN_Module_train(PyMNN_Module *self, PyObject *args) {
|
||||
|
@ -184,7 +190,7 @@ static PyObject* PyMNN_Module_train(PyMNN_Module *self, PyObject *args) {
|
|||
if (!PyArg_ParseTuple(args, "i", &isTraining)) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
self->ptr->setIsTraining(isTraining);
|
||||
(*(self->ptr))->setIsTraining(isTraining);
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
static PyObject* PyMNN_Module_load_parameters(PyMNN_Module *self, PyObject *args) {
|
||||
|
@ -192,21 +198,21 @@ static PyObject* PyMNN_Module_load_parameters(PyMNN_Module *self, PyObject *args
|
|||
if (!PyArg_ParseTuple(args, "O", ¶meters)) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
return toPyObj(self->ptr->loadParameters(toVars(parameters)));
|
||||
return toPyObj((*(self->ptr))->loadParameters(toVars(parameters)));
|
||||
}
|
||||
static PyObject* PyMNN_Module_clear_cache(PyMNN_Module *self, PyObject *args) {
|
||||
self->ptr->clearCache();
|
||||
(*(self->ptr))->clearCache();
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
std::shared_ptr<Module> toSharedModule(PyObject* obj) {
|
||||
return std::shared_ptr<Module>(to_Module(obj), [](Module*){});
|
||||
return *to_Module(obj);
|
||||
}
|
||||
static PyObject* PyMNN_Module__register_submodules(PyMNN_Module *self, PyObject *args) {
|
||||
PyObject *children;
|
||||
if (!PyArg_ParseTuple(args, "O", &children)) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
self->ptr->registerModel(toVec<std::shared_ptr<Module>, toSharedModule>(children));
|
||||
(*(self->ptr))->registerModel(toVec<std::shared_ptr<Module>, toSharedModule>(children));
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
static PyObject* PyMNN_Module__add_parameter(PyMNN_Module *self, PyObject *args) {
|
||||
|
@ -214,7 +220,7 @@ static PyObject* PyMNN_Module__add_parameter(PyMNN_Module *self, PyObject *args)
|
|||
if (!PyArg_ParseTuple(args, "O", ¶meter)) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
return toPyObj(self->ptr->addParameter(toVar(parameter)));
|
||||
return toPyObj((*(self->ptr))->addParameter(toVar(parameter)));
|
||||
}
|
||||
// NN methods
|
||||
static PyObject* PyMNNNN_load_module(PyObject *self, PyObject *args) {
|
||||
|
@ -465,7 +471,7 @@ static PyObject* PyMNNNN_load_module_from_file_with_token(PyObject *self, PyObje
|
|||
#endif
|
||||
|
||||
#ifdef PYMNN_TRAIN_API
|
||||
static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) {
|
||||
static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args, PyObject* kwargs) {
|
||||
INTS default_1 = {1, 1}, default_0 = {0, 0};
|
||||
int in_channel, out_channel;
|
||||
PyObject *kernel_size,
|
||||
|
@ -474,10 +480,11 @@ static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) {
|
|||
*dilation = nullptr /* default_1 */,
|
||||
*padding_mode = nullptr /* PaddingMode::VALID */;
|
||||
int depthwise = 0, bias = 1;
|
||||
if (!PyArg_ParseTuple(args, "iiO|OOOiiO", &in_channel, &out_channel, &kernel_size,
|
||||
static char *kwlist[] = { "in_channels", "out_channels", "kernel_size", "stride", "padding",
|
||||
"dilation", "depthwise", "bias", "padding_mode", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "iiO|OOOiiO", kwlist, &in_channel, &out_channel, &kernel_size,
|
||||
&stride, &padding, &dilation, &depthwise, &bias, &padding_mode)) {
|
||||
printf("PyArg_ParseTuple Error\n");
|
||||
return NULL;
|
||||
PyMNN_ERROR("conv require args: int, int, [int], |[int], [int], [int], bool, bool, PaddingMode)");
|
||||
}
|
||||
NN::ConvOption option;
|
||||
option.channel = {in_channel, out_channel};
|
||||
|
@ -498,29 +505,29 @@ static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) {
|
|||
option.depthwise = depthwise;
|
||||
return toPyObj(NN::Conv(std::move(option), bias));
|
||||
}
|
||||
static PyObject* PyMNNNN_linear(PyObject *self, PyObject *args) {
|
||||
static PyObject* PyMNNNN_linear(PyObject *self, PyObject *args, PyObject* kwargs) {
|
||||
int in_channel, out_channel;
|
||||
int bias = 1;
|
||||
if (!PyArg_ParseTuple(args, "ii|i", &in_channel, &out_channel, &bias)) {
|
||||
printf("PyArg_ParseTuple Error\n");
|
||||
return NULL;
|
||||
static char *kwlist[] = { "in_channels", "out_channels", "bias", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ii|i", kwlist, &in_channel, &out_channel, &bias)) {
|
||||
PyMNN_ERROR("linear require args: int, int, |bool)");
|
||||
}
|
||||
return toPyObj(NN::Linear(in_channel, out_channel, bias));
|
||||
}
|
||||
static PyObject* PyMNNNN_batch_norm(PyObject *self, PyObject *args) {
|
||||
static PyObject* PyMNNNN_batch_norm(PyObject *self, PyObject *args, PyObject* kwargs) {
|
||||
int channels, dims = 4;
|
||||
float momentum = 0.99, epsilon = 1e-5;
|
||||
if (!PyArg_ParseTuple(args, "i|iff", &channels, &dims, &momentum, &epsilon)) {
|
||||
printf("PyArg_ParseTuple Error\n");
|
||||
return NULL;
|
||||
static char *kwlist[] = { "channels", "dims", "momentum", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|iff", kwlist, &channels, &dims, &momentum, &epsilon)) {
|
||||
PyMNN_ERROR("batch_norm require args: int, |int, float, float)");
|
||||
}
|
||||
return toPyObj(NN::BatchNorm(channels, dims, momentum, epsilon));
|
||||
}
|
||||
static PyObject* PyMNNNN_dropout(PyObject *self, PyObject *args) {
|
||||
static PyObject* PyMNNNN_dropout(PyObject *self, PyObject *args, PyObject* kwargs) {
|
||||
float dropout_ratio;
|
||||
if (!PyArg_ParseTuple(args, "f", &dropout_ratio)) {
|
||||
printf("PyArg_ParseTuple Error\n");
|
||||
return NULL;
|
||||
static char *kwlist[] = { "dropout_ratio", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "f", kwlist, &dropout_ratio)) {
|
||||
PyMNN_ERROR("dropout require args: float)");
|
||||
}
|
||||
return toPyObj(NN::Dropout(dropout_ratio));
|
||||
}
|
||||
|
@ -540,7 +547,7 @@ static PyMethodDef PyMNNNN_methods[] = {
|
|||
)
|
||||
#endif
|
||||
#ifdef PYMNN_TRAIN_API
|
||||
register_methods(NN,
|
||||
register_methods_kw(NN,
|
||||
conv, "conv Module",
|
||||
linear, "linear Module",
|
||||
batch_norm, "batch_norm Module",
|
||||
|
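With keyword parsing in place, the training-API constructors can be called by name. Parameter names below come from the kwlist arrays in this hunk; the defaults noted are the C-side defaults, so this is a sketch rather than canonical API documentation:

```python
nn = MNN.nn  # PYMNN_TRAIN_API build

conv3x3 = nn.conv(in_channels=3, out_channels=16, kernel_size=[3, 3],
                  stride=[1, 1], padding=[1, 1], bias=True)
fc = nn.linear(in_channels=16, out_channels=10, bias=True)
bn = nn.batch_norm(channels=16)         # dims=4, momentum=0.99, epsilon=1e-5 by default
drop = nn.dropout(dropout_ratio=0.5)
```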
|
|
@ -23,6 +23,8 @@ def_class_methods(Optimizer,
|
|||
def_class_end(Optimizer, ParameterOptimizer)
|
||||
// impl
|
||||
class_basic_new_impl(Optimizer)
|
||||
class_basic_init_impl(Optimizer)
|
||||
class_basic_call_impl(Optimizer)
|
||||
// PyMNNOptimizer getter/setter functions impl
|
||||
static PyObject* PyMNNOptimizer_getlearning_rate(PyMNNOptimizer *self, void *closure) {
|
||||
if (self->ptr) {
|
||||
|
@ -110,33 +112,35 @@ static PyObject* PyMNNOptimizer_step(PyMNNOptimizer *self, PyObject *args) {
|
|||
}
|
||||
return toPyObj(self->ptr->step(toVar(loss)));
|
||||
}
|
||||
static PyObject* PyMNNOptim_SGD(PyObject *self, PyObject *args) {
|
||||
PyObject *module, *method = toPyObj(RegularizationMethod::L2);
|
||||
static PyObject* PyMNNOptim_SGD(PyObject *self, PyObject *args, PyObject *kwargs) {
|
||||
PyObject *module = nullptr, *method = nullptr /* L2 */;
|
||||
float learning_rate = 1e-3, momentum = 0.9, weight_decay = 0.0;
|
||||
if (!PyArg_ParseTuple(args, "O|fffO", &module, &learning_rate,
|
||||
static char *kwlist[] = { "module", "learning_rate", "momentum", "weight_decay", "regularization_method", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|fffO", kwlist, &module, &learning_rate,
|
||||
&momentum, &weight_decay, &method)) {
|
||||
return NULL;
|
||||
PyMNN_ERROR("SGD require args: Module, |float, float, float, RegularizationMethod)");
|
||||
}
|
||||
auto method_ = toEnum<RegularizationMethod>(method);
|
||||
std::shared_ptr<Module> m(to_Module(module));
|
||||
auto method_ = method == nullptr ? RegularizationMethod::L2 : toEnum<RegularizationMethod>(method);
|
||||
std::shared_ptr<Module> m = *to_Module(module);
|
||||
return toPyObj(ParameterOptimizer::createSGD(m, learning_rate, momentum,
|
||||
weight_decay, method_));
|
||||
}
|
||||
static PyObject* PyMNNOptim_ADAM(PyObject *self, PyObject *args) {
|
||||
PyObject *module, *method = toPyObj(RegularizationMethod::L2);
|
||||
static PyObject* PyMNNOptim_ADAM(PyObject *self, PyObject *args, PyObject *kwargs) {
|
||||
PyObject *module = nullptr, *method = nullptr /* L2 */;
|
||||
float learning_rate = 1e-3, momentum = 0.9, momentum2 = 0.999,
|
||||
weight_decay = 0.0, eps = 1e-8;
|
||||
if (!PyArg_ParseTuple(args, "O|fffffO", &module, &learning_rate, &momentum,
|
||||
static char *kwlist[] = { "module", "learning_rate", "momentum", "momentum2", "weight_decay", "eps", "regularization_method", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|fffffO", kwlist, &module, &learning_rate, &momentum,
|
||||
&momentum2, &weight_decay, &eps, &method)) {
|
||||
return NULL;
|
||||
PyMNN_ERROR("ADAM require args: Module, |float, float, float, float, float, RegularizationMethod)");
|
||||
}
|
||||
auto method_ = toEnum<RegularizationMethod>(method);
|
||||
std::shared_ptr<Module> m(to_Module(module));
|
||||
auto method_ = method == nullptr ? RegularizationMethod::L2 : toEnum<RegularizationMethod>(method);
|
||||
std::shared_ptr<Module> m = *to_Module(module);
|
||||
return toPyObj(ParameterOptimizer::createADAM(m, learning_rate, momentum, momentum2,
|
||||
weight_decay, eps, method_));
|
||||
}
|
||||
static PyMethodDef PyMNNOptim_methods[] = {
|
||||
register_methods(Optim,
|
||||
register_methods_kw(Optim,
|
||||
SGD, "SGD Optimizer",
|
||||
ADAM, "ADAM Optimizer"
|
||||
)
|
||||
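The optimizers accept the same keyword style; names follow the kwlist arrays above, and `regularization_method` falls back to L2 when omitted. A sketch, where `model` is any loaded training module:

```python
sgd = MNN.optim.SGD(model, learning_rate=0.01, momentum=0.9, weight_decay=0.0005)
adam = MNN.optim.ADAM(model, learning_rate=1e-3, momentum=0.9, momentum2=0.999,
                      weight_decay=0.0, eps=1e-8)
```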
|
|
|
@ -1037,6 +1037,8 @@ static PyMethodDef PyMNN##NAME##_methods[] = { \
|
|||
};
|
||||
#define def_class_end(NAME, TYPE) \
|
||||
static PyObject* PyMNN##NAME##_new(PyTypeObject *type, PyObject *args, PyObject *kwds); \
|
||||
static int PyMNN##NAME##_init(PyTypeObject *self, PyObject *args, PyObject *kwds); \
|
||||
static PyObject* PyMNN##NAME##_call(PyObject *self, PyObject *args, PyObject *kwds); \
|
||||
static void PyMNN##NAME##_dealloc(PyMNN##NAME *self) { \
|
||||
if (self->ptr) { \
|
||||
delete self->ptr; \
|
||||
|
@ -1058,7 +1060,7 @@ static PyTypeObject PyMNN##NAME##Type = { \
|
|||
0, /*tp_as_sequence*/\
|
||||
0, /*tp_as_mapping*/\
|
||||
0, /*tp_hash*/\
|
||||
0, /*tp_call*/\
|
||||
PyMNN##NAME##_call, /*tp_call*/\
|
||||
0, /*tp_str*/\
|
||||
0, /*tp_getattro*/\
|
||||
0, /*tp_setattro*/\
|
||||
|
@ -1079,7 +1081,7 @@ static PyTypeObject PyMNN##NAME##Type = { \
|
|||
0, /*tp_descr_get*/\
|
||||
0, /*tp_descr_set*/\
|
||||
0, /*tp_dictoffset*/\
|
||||
0, /*tp_init*/\
|
||||
(initproc)PyMNN##NAME##_init, /*tp_init*/\
|
||||
0, /*tp_alloc*/\
|
||||
PyMNN##NAME##_new /*tp_new*/\
|
||||
};\
|
||||
|
@ -1107,6 +1109,14 @@ static PyObject* PyMNN##NAME##_new(PyTypeObject *type, PyObject *args, PyObject
|
|||
PyMNN##NAME *self = (PyMNN##NAME *)type->tp_alloc(type, 0); \
|
||||
return (PyObject*)self; \
|
||||
}
|
||||
#define class_basic_init_impl(NAME) \
|
||||
static int PyMNN##NAME##_init(PyTypeObject *self, PyObject *args, PyObject *kwds) { \
|
||||
return 0; \
|
||||
}
|
||||
#define class_basic_call_impl(NAME) \
|
||||
static PyObject* PyMNN##NAME##_call(PyObject *self, PyObject *args, PyObject *kwds) { \
|
||||
return (PyObject*)self; \
|
||||
}
|
||||
// ------------------------ class start ------------------------
|
||||
// ------------------------ capsule start ------------------------
|
||||
|
||||
|
@ -1118,6 +1128,7 @@ typedef struct { \
|
|||
} PyMNN##NAME;
|
||||
#define def_class_smart_end(NAME, TYPE) \
|
||||
static PyObject* PyMNN##NAME##_new(PyTypeObject *type, PyObject *args, PyObject *kwds); \
|
||||
static PyObject* PyMNN##NAME##_call(PyObject *self, PyObject *args, PyObject *kwds); \
|
||||
static void PyMNN##NAME##_dealloc(PyMNN##NAME *self) { \
|
||||
Py_TYPE(self)->tp_free((PyObject *) self); \
|
||||
} \
|
||||
|
@ -1136,7 +1147,7 @@ static PyTypeObject PyMNN##NAME##Type = { \
|
|||
0, /*tp_as_sequence*/\
|
||||
0, /*tp_as_mapping*/\
|
||||
0, /*tp_hash*/\
|
||||
0, /*tp_call*/\
|
||||
PyMNN##NAME##_call, /*tp_call*/\
|
||||
0, /*tp_str*/\
|
||||
0, /*tp_getattro*/\
|
||||
0, /*tp_setattro*/\
|
||||
|
|
|
@ -1,4 +1,38 @@
|
|||
Copy all AliNNModel models and test data into the MNN workbench project
|
||||
# Pymnn Test Cases
|
||||
|
||||
# 1. Unit Test
|
||||
```bash
|
||||
python3 unit_test.py
|
||||
```
|
||||
|
||||
# 2. Model Test
|
||||
```bash
|
||||
python3 model_test.py ~/AliNNModel
|
||||
```
|
||||
|
||||
# 3. Train Test
|
||||
```bash
|
||||
./train_test.sh
|
||||
```
|
||||
|
||||
# 4. Quant Test
|
||||
```bash
|
||||
python3 ../examples/MNNQuant/test_mnn_offline_quant.py \
|
||||
--mnn_model your_model.mnn \
|
||||
--quant_imgs <path_to_imgs> \
|
||||
--quant_model ./quant_model.mnn
|
||||
```
|
||||
|
||||
# 5. Benchmark MNN.numpy
|
||||
```bash
|
||||
pip install prettytable
|
||||
python3 benchmark.py
|
||||
```
|
||||
|
||||
# 6. Playground Test (internal use only)
|
||||
```bash
|
||||
# Copy all AliNNModel models and test data into the MNN workbench project
|
||||
python scripts/pullTestModel.py --alinnmodel_path ../../../AliNNModel --playground_path playground/playground
|
||||
Copy the specified AliNNModel models (mobilenet, Ranfa) and test data into the MNN workbench project
|
||||
# Copy the specified AliNNModel models (mobilenet, Ranfa) and test data into the MNN workbench project
|
||||
python scripts/pullTestModel.py --alinnmodel_path ../../../AliNNModel --playground_path playground/playground --models mobilenet Ranfa
|
||||
```
|
||||
|
|
|
@ -0,0 +1,143 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
import os
|
||||
os.environ["MKL_NUM_THREADS"] = "1"
|
||||
os.environ["NUMEXPR_NUM_THREADS"] = "1"
|
||||
os.environ["OMP_NUM_THREADS"] = "1"
|
||||
import time
|
||||
import MNN.numpy as mp
|
||||
import numpy as np
|
||||
from prettytable import PrettyTable
|
||||
|
||||
res = PrettyTable()
|
||||
res.field_names = ["function", "numpy", "MNN.numpy"]
|
||||
x = {'shape':[64000],'dtype':'float32'}
|
||||
|
||||
def gen_data(args):
|
||||
np_args = []
|
||||
mp_args = []
|
||||
for arg in args:
|
||||
if type(arg) == type({'a':1}):
|
||||
shape = arg['shape']
|
||||
dtype = arg['dtype']
|
||||
np_x = np.random.rand(*shape).astype(dtype)
|
||||
mp_x = mp.random.random(shape).astype(getattr(mp, dtype))
|
||||
mp_x.fix_as_const()
|
||||
else:
|
||||
np_x = arg
|
||||
mp_x = arg
|
||||
np_args.append(np_x)
|
||||
mp_args.append(mp_x)
|
||||
return np_args, mp_args
|
||||
|
||||
def np_eval(func, args, loop, mode):
|
||||
if mode == 3:
|
||||
t1 = time.time()
|
||||
for i in range(loop):
|
||||
np_res = func(args)
|
||||
np_res.__str__()
|
||||
t2 = time.time()
|
||||
else:
|
||||
t1 = time.time()
|
||||
for i in range(loop):
|
||||
np_res = func(*args)
|
||||
np_res.__str__()
|
||||
t2 = time.time()
|
||||
return round((t2 - t1) * 1000 / loop, 3)
|
||||
|
||||
def mnn_eval(func, args, loop, mode):
|
||||
if mode == 0:
|
||||
t1 = time.time()
|
||||
for i in range(loop):
|
||||
mp_res = func(*args)
|
||||
mp_res.__str__()
|
||||
t2 = time.time()
|
||||
elif mode == 1:
|
||||
t1 = time.time()
|
||||
for i in range(loop):
|
||||
mp_res = func(*args)
|
||||
t2 = time.time()
|
||||
elif mode == 2:
|
||||
t1 = time.time()
|
||||
for i in range(loop):
|
||||
mp_res = func(*args)
|
||||
for r in mp_res: r.__str__()
|
||||
t2 = time.time()
|
||||
elif mode == 3:
|
||||
t1 = time.time()
|
||||
for i in range(loop):
|
||||
mp_res = func(args)
|
||||
for r in mp_res: r.__str__()
|
||||
t2 = time.time()
|
||||
return round((t2 - t1) * 1000 / loop, 3)
|
||||
|
||||
def bench_funcs(funcs, args, mode=0):
|
||||
loop = 10
|
||||
np_args, mp_args = gen_data(args)
|
||||
for func in funcs:
|
||||
np_func = getattr(np, func)
|
||||
mp_func = getattr(mp, func)
|
||||
np_time = np_eval(np_func, np_args, loop, mode)
|
||||
mp_time = mnn_eval(mp_func, mp_args, loop, mode)
|
||||
# np_sum += np_time
|
||||
# mp_sum += mp_time
|
||||
# count += 1
|
||||
res.add_row([func, np_time, mp_time])
|
||||
|
||||
def unary():
|
||||
inputs = [x]
|
||||
maths = ['sin', 'cos', 'tan', 'arcsin', 'arccos', 'arctan', 'sinh', 'cosh', 'tanh', 'arcsinh', 'arccosh', 'arctanh', 'around',
|
||||
'floor', 'ceil', 'trunc', 'exp', 'expm1', 'exp2', 'log', 'log2', 'log10', 'log1p', 'sinc', 'signbit', 'positive', 'cbrt',
|
||||
'negative', 'reciprocal', 'sqrt', 'cbrt', 'square', 'sign', 'argwhere', 'flatnonzero', 'sort', 'argsort', 'copy']
|
||||
bench_funcs(maths, inputs)
|
||||
bench_funcs(['modf'], inputs, 2)
|
||||
|
||||
def binary():
|
||||
inputs = [x] * 2
|
||||
funcs = ['greater', 'greater_equal', 'less', 'less_equal', 'equal', 'not_equal', 'multiply', 'add', 'divide', 'power',
|
||||
'subtract', 'true_divide', 'floor_divide', 'mod', 'maximum', 'minimum', 'hypot', 'logaddexp', 'logaddexp2',
|
||||
'copysign' ]
|
||||
bench_funcs(funcs, inputs)
|
||||
bench_funcs(['divmod'], inputs, 2)
|
||||
bench_funcs(['ldexp'], [x, 2], 1)
|
||||
bench_funcs(['dot', 'vdot', 'inner', 'matmul'], [{'shape':[1024, 1024], 'dtype':'float32'}]*2)
|
||||
bench_funcs(['array_equal', 'array_equiv'], [x, x], 1)
|
||||
bench_funcs(['bitwise_and', 'bitwise_or', 'bitwise_xor'], [{'shape':[64000], 'dtype':'int32'}]*2)
|
||||
bench_funcs(['where'], [{'shape':[64000], 'dtype':'int32'}, x, x])
|
||||
|
||||
def reduce():
|
||||
inputs = [x]
|
||||
reduce = ['prod', 'sum', 'argmax', 'nonzero', 'count_nonzero', 'max', 'min', 'ptp', 'mean', 'var', 'std']
|
||||
bench_funcs(reduce, inputs, 1)
|
||||
bench_funcs(['all', 'any'], [{'shape':[64000], 'dtype':'int32'}], 1)
|
||||
|
||||
def memory():
|
||||
y = {'shape':[4, 16, 10, 100],'dtype':'float32'}
|
||||
bench_funcs(['reshape'], [y, [10, 64, 100]])
|
||||
bench_funcs(['ravel', 'transpose', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'squeeze'], [y])
|
||||
bench_funcs(['moveaxis', 'rollaxis', 'swapaxes'], [y, 0, 3])
|
||||
bench_funcs(['broadcast_to'], [y, [3, 4, 16, 10, 100]])
|
||||
bench_funcs(['expand_dims'], [y, 0])
|
||||
bench_funcs(['concatenate', 'stack', 'vstack', 'hstack', 'dstack', 'column_stack', 'row_stack'], [y, y], 3)
|
||||
bench_funcs(['split', 'dsplit', 'hsplit', 'vsplit'], [y, 2], 2)
|
||||
bench_funcs(['pad'], [x, 2])
|
||||
bench_funcs(['tile', 'repeat'], [x, 2])
|
||||
|
||||
def all():
|
||||
unary()
|
||||
binary()
|
||||
reduce()
|
||||
memory()
|
||||
|
||||
def log():
|
||||
np_sum = 0
|
||||
mp_sum = 0
|
||||
count = len(res.rows)
|
||||
for row in res.rows:
|
||||
np_sum += row[1]
|
||||
mp_sum += row[2]
|
||||
res.add_row(['avg', round(np_sum/count, 3), round(mp_sum/count, 3)])
|
||||
print(res)
|
||||
|
||||
if __name__ == '__main__':
|
||||
all()
|
||||
log()
|
|
@ -0,0 +1,4 @@
|
|||
python3 ../examples/MNNTrain/mnist/train_mnist.py
|
||||
rm ./0.mnist.mnn
|
||||
train_wrong=$[$? > 0]
|
||||
printf "TEST_NAME_TRAIN_TEST: pymnn训练测试\nTEST_CASE_AMOUNT_TRAIN_TEST: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n" $train_wrong $[1 - $train_wrong]
|
|
@ -19,6 +19,7 @@ struct ExtraInfoT : public flatbuffers::NativeTable {
|
|||
typedef ExtraInfo TableType;
|
||||
std::vector<int8_t> buffer;
|
||||
std::string name;
|
||||
std::string version;
|
||||
ExtraInfoT() {
|
||||
}
|
||||
};
|
||||
|
@ -34,12 +35,17 @@ struct ExtraInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
|
|||
const flatbuffers::String *name() const {
|
||||
return GetPointer<const flatbuffers::String *>(6);
|
||||
}
|
||||
const flatbuffers::String *version() const {
|
||||
return GetPointer<const flatbuffers::String *>(8);
|
||||
}
|
||||
bool Verify(flatbuffers::Verifier &verifier) const {
|
||||
return VerifyTableStart(verifier) &&
|
||||
VerifyOffset(verifier, 4) &&
|
||||
verifier.VerifyVector(buffer()) &&
|
||||
VerifyOffset(verifier, 6) &&
|
||||
verifier.VerifyString(name()) &&
|
||||
VerifyOffset(verifier, 8) &&
|
||||
verifier.VerifyString(version()) &&
|
||||
verifier.EndTable();
|
||||
}
|
||||
ExtraInfoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
|
@ -56,6 +62,9 @@ struct ExtraInfoBuilder {
|
|||
void add_name(flatbuffers::Offset<flatbuffers::String> name) {
|
||||
fbb_.AddOffset(6, name);
|
||||
}
|
||||
void add_version(flatbuffers::Offset<flatbuffers::String> version) {
|
||||
fbb_.AddOffset(8, version);
|
||||
}
|
||||
explicit ExtraInfoBuilder(flatbuffers::FlatBufferBuilder &_fbb)
|
||||
: fbb_(_fbb) {
|
||||
start_ = fbb_.StartTable();
|
||||
|
@ -71,8 +80,10 @@ struct ExtraInfoBuilder {
|
|||
inline flatbuffers::Offset<ExtraInfo> CreateExtraInfo(
|
||||
flatbuffers::FlatBufferBuilder &_fbb,
|
||||
flatbuffers::Offset<flatbuffers::Vector<int8_t>> buffer = 0,
|
||||
flatbuffers::Offset<flatbuffers::String> name = 0) {
|
||||
flatbuffers::Offset<flatbuffers::String> name = 0,
|
||||
flatbuffers::Offset<flatbuffers::String> version = 0) {
|
||||
ExtraInfoBuilder builder_(_fbb);
|
||||
builder_.add_version(version);
|
||||
builder_.add_name(name);
|
||||
builder_.add_buffer(buffer);
|
||||
return builder_.Finish();
|
||||
|
@ -91,6 +102,7 @@ inline void ExtraInfo::UnPackTo(ExtraInfoT *_o, const flatbuffers::resolver_func
|
|||
(void)_resolver;
|
||||
{ auto _e = buffer(); if (_e) { _o->buffer.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->buffer[_i] = _e->Get(_i); } } };
|
||||
{ auto _e = name(); if (_e) _o->name = _e->str(); };
|
||||
{ auto _e = version(); if (_e) _o->version = _e->str(); };
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<ExtraInfo> ExtraInfo::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ExtraInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
|
@ -103,23 +115,27 @@ inline flatbuffers::Offset<ExtraInfo> CreateExtraInfo(flatbuffers::FlatBufferBui
|
|||
struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ExtraInfoT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
|
||||
auto _buffer = _o->buffer.size() ? _fbb.CreateVector(_o->buffer) : 0;
|
||||
auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
|
||||
auto _version = _o->version.empty() ? 0 : _fbb.CreateString(_o->version);
|
||||
return MNN::CreateExtraInfo(
|
||||
_fbb,
|
||||
_buffer,
|
||||
_name);
|
||||
_name,
|
||||
_version);
|
||||
}
|
||||
|
||||
inline const flatbuffers::TypeTable *ExtraInfoTypeTable() {
|
||||
static const flatbuffers::TypeCode type_codes[] = {
|
||||
{ flatbuffers::ET_CHAR, 1, -1 },
|
||||
{ flatbuffers::ET_STRING, 0, -1 },
|
||||
{ flatbuffers::ET_STRING, 0, -1 }
|
||||
};
|
||||
static const char * const names[] = {
|
||||
"buffer",
|
||||
"name"
|
||||
"name",
|
||||
"version"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names
|
||||
flatbuffers::ST_TABLE, 3, type_codes, nullptr, nullptr, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
|
|
@ -230,6 +230,7 @@ enum OpType {
|
|||
OpType_EyeLike = 147,
|
||||
OpType_CumSum = 148,
|
||||
OpType_Det = 149,
|
||||
OpType_CumProd = 150,
|
||||
OpType_Plugin = 256,
|
||||
OpType_Select = 257,
|
||||
OpType_ZerosLike = 258,
|
||||
|
@ -258,7 +259,7 @@ enum OpType {
|
|||
OpType_MAX = OpType_GridSample
|
||||
};
|
||||
|
||||
inline const OpType (&EnumValuesOpType())[168] {
|
||||
inline const OpType (&EnumValuesOpType())[169] {
|
||||
static const OpType values[] = {
|
||||
OpType_AbsVal,
|
||||
OpType_QuantizedAdd,
|
||||
|
@ -404,6 +405,7 @@ inline const OpType (&EnumValuesOpType())[168] {
|
|||
OpType_EyeLike,
|
||||
OpType_CumSum,
|
||||
OpType_Det,
|
||||
OpType_CumProd,
|
||||
OpType_Plugin,
|
||||
OpType_Select,
|
||||
OpType_ZerosLike,
|
||||
|
@ -584,7 +586,7 @@ inline const char * const *EnumNamesOpType() {
|
|||
"EyeLike",
|
||||
"CumSum",
|
||||
"Det",
|
||||
"",
|
||||
"CumProd",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
|
@ -7394,12 +7396,13 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
|
|||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 }
|
||||
};
|
||||
static const flatbuffers::TypeFunction type_refs[] = {
|
||||
OpTypeTypeTable
|
||||
};
|
||||
static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 };
|
||||
static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 };
|
||||
static const char * const names[] = {
|
||||
"AbsVal",
|
||||
"QuantizedAdd",
|
||||
|
@ -7545,6 +7548,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
|
|||
"EyeLike",
|
||||
"CumSum",
|
||||
"Det",
|
||||
"CumProd",
|
||||
"Plugin",
|
||||
"Select",
|
||||
"ZerosLike",
|
||||
|
@ -7571,7 +7575,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
|
|||
"GridSample"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_ENUM, 168, type_codes, type_refs, values, names
|
||||
flatbuffers::ST_ENUM, 169, type_codes, type_refs, values, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
|
|
@ -11,4 +11,5 @@ namespace MNN;
|
|||
table ExtraInfo {
|
||||
buffer:[int8];
|
||||
name: string;
|
||||
version: string;
|
||||
}
|
||||
|
|
|
@ -161,6 +161,7 @@ enum OpType : int {
|
|||
EyeLike = 147,
|
||||
CumSum = 148,
|
||||
Det = 149,
|
||||
CumProd = 150,
|
||||
|
||||
Plugin = 256, //The Type load from plugin
|
||||
//Training Op Start from 257
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
extern bool isAvailable();
|
||||
namespace MNN {
|
||||
void registerCoreMLOps();
|
||||
static inline std::map<OpType, CoreMLBackend::Creator*>* getCreatorMap() {
|
||||
static std::once_flag of;
|
||||
static std::map<OpType, CoreMLBackend::Creator*>* ret = nullptr;
|
||||
|
@ -298,6 +299,7 @@ namespace MNN {
|
|||
if (!isAvailable()) {
|
||||
return;
|
||||
}
|
||||
registerCoreMLOps();
|
||||
MNNInsertExtraRuntimeCreator(MNN_FORWARD_NN, new CoreMLBackendCreator, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -146,6 +146,12 @@ namespace MNN {
|
|||
}
|
||||
};
|
||||
|
||||
#define REGISTER_COREML_OP_CREATOR(name, opType) \
|
||||
void ___##name##__##opType##__() { \
|
||||
static TypedCreator<name> _temp;\
|
||||
CoreMLBackend::addCreator(opType, &_temp); \
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif //MNN_COREMLBACKEND_H
|
||||
|
|
|
@ -156,32 +156,37 @@ id<MTLComputePipelineState> getRasterPipeline() {
|
|||
if (_model == nil) {
|
||||
return NO;
|
||||
}
|
||||
NSError* error = nil;
|
||||
MultiArrayFeatureProvider* inputFeature = [[MultiArrayFeatureProvider alloc] initWithInputs:&inputs coreMlVersion:[self coreMlVersion]];
|
||||
if (inputFeature == nil) {
|
||||
NSLog(@"inputFeature is not initialized.");
|
||||
return NO;
|
||||
}
|
||||
MLPredictionOptions* options = [[MLPredictionOptions alloc] init];
|
||||
// options.usesCPUOnly = true;
|
||||
id<MLFeatureProvider> outputFeature = [_model predictionFromFeatures:inputFeature
|
||||
options:options
|
||||
error:&error];
|
||||
if (error != nil) {
|
||||
NSLog(@"Error executing model: %@", [error localizedDescription]);
|
||||
return NO;
|
||||
}
|
||||
NSSet<NSString*>* outputFeatureNames = [outputFeature featureNames];
|
||||
for (auto& output : outputs) {
|
||||
NSString* outputName = [NSString stringWithCString:output.second.c_str()
|
||||
encoding:[NSString defaultCStringEncoding]];
|
||||
MLFeatureValue* outputValue = [outputFeature featureValueForName:[outputFeatureNames member:outputName]];
|
||||
auto* data = [outputValue multiArrayValue];
|
||||
float* outputData = (float*)data.dataPointer;
|
||||
if (outputData == nullptr) {
|
||||
@autoreleasepool {
|
||||
NSError* error = nil;
|
||||
MultiArrayFeatureProvider* inputFeature = [[MultiArrayFeatureProvider alloc] initWithInputs:&inputs coreMlVersion:[self coreMlVersion]];
|
||||
if (inputFeature == nil) {
|
||||
NSLog(@"inputFeature is not initialized.");
|
||||
return NO;
|
||||
}
|
||||
memcpy(output.first->host<float*>(), outputData, output.first->size());
|
||||
MLPredictionOptions* options = [[MLPredictionOptions alloc] init];
|
||||
// options.usesCPUOnly = true;
|
||||
//NSDate* timeStartData = [NSDate date];
|
||||
id<MLFeatureProvider> outputFeature = [_model predictionFromFeatures:inputFeature
|
||||
options:options
|
||||
error:&error];
|
||||
//float deltaTime = [[NSDate date] timeIntervalSinceDate:timeStartData];
|
||||
//NSLog(@"cost time = %f", deltaTime * 1000);
|
||||
if (error != nil) {
|
||||
NSLog(@"Error executing model: %@", [error localizedDescription]);
|
||||
return NO;
|
||||
}
|
||||
NSSet<NSString*>* outputFeatureNames = [outputFeature featureNames];
|
||||
for (auto& output : outputs) {
|
||||
NSString* outputName = [NSString stringWithCString:output.second.c_str()
|
||||
encoding:[NSString defaultCStringEncoding]];
|
||||
MLFeatureValue* outputValue = [outputFeature featureValueForName:[outputFeatureNames member:outputName]];
|
||||
auto* data = [outputValue multiArrayValue];
|
||||
float* outputData = (float*)data.dataPointer;
|
||||
if (outputData == nullptr) {
|
||||
return NO;
|
||||
}
|
||||
memcpy(output.first->host<float*>(), outputData, output.first->size());
|
||||
}
|
||||
}
|
||||
return YES;
|
||||
}
|
||||
|
@ -451,6 +456,13 @@ id<MTLComputePipelineState> getRasterPipeline() {
|
|||
|
||||
- (NSArray<NSArray<NSNumber *> *> *)outputShapesForInputShapes:(NSArray<NSArray<NSNumber *> *> *)inputShapes
|
||||
error:(NSError * _Nullable *)error {
|
||||
for (int i = 0; i < inputShapes.count; i++) {
|
||||
printf("### shape_%d : { ", i);
|
||||
for (int j = 0; j < inputShapes[i].count; j++) {
|
||||
printf("%d, ", inputShapes[i][j].intValue);
|
||||
}
|
||||
printf(" }\n");
|
||||
}
|
||||
return inputShapes;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
// This file is generated by a shell script for op registration
|
||||
namespace MNN {
|
||||
extern void ___CoreMLReduction__OpType_Reduction__();
|
||||
extern void ___CoreMLBinary__OpType_BinaryOp__();
|
||||
extern void ___CoreMLBinary__OpType_Eltwise__();
|
||||
extern void ___CoreMLArgMax__OpType_ArgMax__();
|
||||
extern void ___CoreMLConvolution__OpType_Convolution__();
|
||||
extern void ___CoreMLConvolution__OpType_ConvolutionDepthwise__();
|
||||
extern void ___CoreMLConvolution__OpType_Deconvolution__();
|
||||
extern void ___CoreMLInterp__OpType_Interp__();
|
||||
extern void ___CoreMLUnary__OpType_UnaryOp__();
|
||||
extern void ___CoreMLScale__OpType_Scale__();
|
||||
extern void ___CoreMLPool__OpType_Pooling__();
|
||||
extern void ___CoreMLRaster__OpType_Raster__();
|
||||
extern void ___CoreMLActivation__OpType_ReLU__();
|
||||
extern void ___CoreMLActivation__OpType_ReLU6__();
|
||||
extern void ___CoreMLActivation__OpType_ELU__();
|
||||
extern void ___CoreMLActivation__OpType_PReLU__();
|
||||
extern void ___CoreMLActivation__OpType_Sigmoid__();
|
||||
extern void ___CoreMLActivation__OpType_Softmax__();
|
||||
|
||||
void registerCoreMLOps() {
|
||||
___CoreMLReduction__OpType_Reduction__();
|
||||
___CoreMLBinary__OpType_BinaryOp__();
|
||||
___CoreMLBinary__OpType_Eltwise__();
|
||||
___CoreMLArgMax__OpType_ArgMax__();
|
||||
___CoreMLConvolution__OpType_Convolution__();
|
||||
___CoreMLConvolution__OpType_ConvolutionDepthwise__();
|
||||
___CoreMLConvolution__OpType_Deconvolution__();
|
||||
___CoreMLInterp__OpType_Interp__();
|
||||
___CoreMLUnary__OpType_UnaryOp__();
|
||||
___CoreMLScale__OpType_Scale__();
|
||||
___CoreMLPool__OpType_Pooling__();
|
||||
___CoreMLRaster__OpType_Raster__();
|
||||
___CoreMLActivation__OpType_ReLU__();
|
||||
___CoreMLActivation__OpType_ReLU6__();
|
||||
___CoreMLActivation__OpType_ELU__();
|
||||
___CoreMLActivation__OpType_PReLU__();
|
||||
___CoreMLActivation__OpType_Sigmoid__();
|
||||
___CoreMLActivation__OpType_Softmax__();
|
||||
}
|
||||
}
|
|
@ -41,6 +41,7 @@ ErrorCode CoreMLActivation::onResize(const std::vector<Tensor *> &inputs, const
|
|||
auto reluLayer = mCoreMLBackend->create<CoreML__Specification__NeuralNetworkLayer>();
|
||||
core_ml__specification__neural_network_layer__init(reluLayer);
|
||||
mCoreMLBackend->setLayerName(reluLayer, "relu6-relu");
|
||||
reluLayer->activation = mCoreMLBackend->create<CoreML__Specification__ActivationParams>();
|
||||
reluLayer->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_RE_LU;
|
||||
reluLayer->activation->relu = mCoreMLBackend->create<CoreML__Specification__ActivationReLU>();
|
||||
core_ml__specification__activation_re_lu__init(reluLayer->activation->relu);
|
||||
|
@ -98,12 +99,10 @@ ErrorCode CoreMLActivation::onResize(const std::vector<Tensor *> &inputs, const
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLActivation>> __relu_op(OpType_ReLU);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLActivation>> __relu6_op(OpType_ReLU6);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLActivation>> __elu_op(OpType_ELU);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLActivation>> __prelu_op(OpType_PReLU);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLActivation>> __sigmoid_op(OpType_Sigmoid);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLActivation>> __softmax_op(OpType_Softmax);
|
||||
|
||||
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_ReLU)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_ReLU6)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_ELU)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_PReLU)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_Sigmoid)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_Softmax)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -31,5 +31,5 @@ ErrorCode CoreMLArgMax::onResize(const std::vector<Tensor *> &inputs, const std:
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLArgMax>> __argmax_op(OpType_ArgMax);
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLArgMax, OpType_ArgMax)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -192,8 +192,7 @@ ErrorCode CoreMLBinary::onResize(const std::vector<Tensor *> &inputs, const std:
|
|||
}
|
||||
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLBinary>> __bianry_op(OpType_BinaryOp);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLBinary>> __elemwise_op(OpType_Eltwise);
|
||||
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLBinary, OpType_BinaryOp)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLBinary, OpType_Eltwise)
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
@ -12,6 +12,7 @@ namespace MNN {
|
|||
|
||||
|
||||
CoreMLConvolution::CoreMLConvolution(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : CoreMLCommonExecution(b, op) {
|
||||
isDeconv = op->type() == OpType_Deconvolution;
|
||||
initLayer();
|
||||
}
|
||||
|
||||
|
@ -111,6 +112,7 @@ ErrorCode CoreMLConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
mLayer_->convolution = mCoreMLBackend->create<CoreML__Specification__ConvolutionLayerParams>();
|
||||
core_ml__specification__convolution_layer_params__init(mLayer_->convolution);
|
||||
mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_CONVOLUTION;
|
||||
mLayer_->convolution->isdeconvolution = isDeconv;
|
||||
mLayer_->convolution->ngroups = group;
|
||||
mLayer_->convolution->n_stride = 2;
|
||||
mLayer_->convolution->stride = mCoreMLBackend->create<uint64_t>(mLayer_->convolution->n_stride);
|
||||
|
@ -183,10 +185,7 @@ ErrorCode CoreMLConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLConvolution>> __convolution_op(OpType_Convolution);
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLConvolution>> __convdepwise_op(OpType_ConvolutionDepthwise);
|
||||
// CoreMLCreatorRegister<TypedCreator<CoreMLConvolution>> __deconvolution_op(OpType_Deconvolution);
|
||||
|
||||
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_Convolution)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_ConvolutionDepthwise)
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_Deconvolution)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -27,6 +27,7 @@ private:
|
|||
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
|
||||
const float *weightPtr, *biasPtr;
|
||||
int weightSize, biasSize;
|
||||
bool isDeconv = false;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
||||
|
|
|
@ -72,5 +72,5 @@ ErrorCode CoreMLInterp::onResize(const std::vector<Tensor *> &inputs, const std:
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLInterp>> __interp_op(OpType_Interp);
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLInterp, OpType_Interp)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -15,8 +15,56 @@ CoreMLPool::CoreMLPool(MNN::Backend *b, const MNN::Op *op, const std::vector<Ten
|
|||
initLayer();
|
||||
}
|
||||
|
||||
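// Emits a separate constant-zero Padding layer in front of the pooling op so that
// CAFFE-style asymmetric pads (top/left/bottom/right) can be applied explicitly before
// pooling runs with VALID padding.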
void CoreMLPool::addPadLayer(const Tensor * input, const Pool* common) {
|
||||
MNN_ASSERT(common->padType() == PoolPadType_CAFFE);
|
||||
int top, left, bottom, right;
|
||||
if (nullptr != common->pads()) {
|
||||
MNN_ASSERT(common->pads()->size() >= 4);
|
||||
top = common->pads()->Get(0);
|
||||
left = common->pads()->Get(1);
|
||||
bottom = common->pads()->Get(2);
|
||||
right = common->pads()->Get(3);
|
||||
} else {
|
||||
top = common->padY();
|
||||
left = common->padX();
|
||||
bottom = common->padY();
|
||||
right = common->padX();
|
||||
}
|
||||
if (top == 0 && left == 0 && bottom == 0 && right == 0) {
|
||||
return;
|
||||
}
|
||||
auto paddingLayer = mCoreMLBackend->create<CoreML__Specification__NeuralNetworkLayer>();
|
||||
core_ml__specification__neural_network_layer__init(paddingLayer);
|
||||
paddingLayer->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_PADDING;
|
||||
mCoreMLBackend->setLayerName(paddingLayer, "PoolPadding");
|
||||
paddingLayer->padding = mCoreMLBackend->create<CoreML__Specification__PaddingLayerParams>();
|
||||
core_ml__specification__padding_layer_params__init(paddingLayer->padding);
|
||||
paddingLayer->padding->padding_type_case = CORE_ML__SPECIFICATION__PADDING_LAYER_PARAMS__PADDING_TYPE_CONSTANT;
|
||||
paddingLayer->padding->constant = mCoreMLBackend->create<CoreML__Specification__PaddingLayerParams__PaddingConstant>();
|
||||
core_ml__specification__padding_layer_params__padding_constant__init(paddingLayer->padding->constant);
|
||||
paddingLayer->padding->constant->value = 0;
|
||||
paddingLayer->padding->paddingamounts = mCoreMLBackend->create<CoreML__Specification__BorderAmounts>();
|
||||
core_ml__specification__border_amounts__init(paddingLayer->padding->paddingamounts);
|
||||
paddingLayer->padding->paddingamounts->n_borderamounts = 2;
|
||||
paddingLayer->padding->paddingamounts->borderamounts = mCoreMLBackend->create<CoreML__Specification__BorderAmounts__EdgeSizes*>(2);
|
||||
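// borderamounts[0] carries the height pads (top/bottom), borderamounts[1] the width pads (left/right).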
paddingLayer->padding->paddingamounts->borderamounts[0] = mCoreMLBackend->create<CoreML__Specification__BorderAmounts__EdgeSizes>();
|
||||
core_ml__specification__border_amounts__edge_sizes__init(paddingLayer->padding->paddingamounts->borderamounts[0]);
|
||||
paddingLayer->padding->paddingamounts->borderamounts[0]->startedgesize = top;
|
||||
paddingLayer->padding->paddingamounts->borderamounts[0]->endedgesize = bottom;
|
||||
paddingLayer->padding->paddingamounts->borderamounts[1] = mCoreMLBackend->create<CoreML__Specification__BorderAmounts__EdgeSizes>();
|
||||
core_ml__specification__border_amounts__edge_sizes__init(paddingLayer->padding->paddingamounts->borderamounts[1]);
|
||||
paddingLayer->padding->paddingamounts->borderamounts[1]->startedgesize = left;
|
||||
paddingLayer->padding->paddingamounts->borderamounts[1]->endedgesize = right;
|
||||
auto inputName = mPoolInputName;
|
||||
mPoolInputName = mPoolInputName + "-" + mPoolOutputName + "-Padding";
|
||||
setLayerInputsAndOutputs(paddingLayer, {inputName}, {mPoolInputName});
|
||||
mCoreMLBackend->addLayer(paddingLayer);
|
||||
}
|
||||
|
||||
ErrorCode CoreMLPool::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
MNN_ASSERT(inputs.size() == 1 && outputs.size() == 1);
|
||||
mPoolInputName = mCoreMLBackend->getTensorName(inputs[0]);
|
||||
mPoolOutputName = mCoreMLBackend->getTensorName(outputs[0]);
|
||||
auto pool = mOp->main_as_Pool();
|
||||
auto strideX = pool->strideX();
|
||||
auto strideY = pool->strideY();
|
||||
|
@ -48,10 +96,16 @@ ErrorCode CoreMLPool::onResize(const std::vector<Tensor *> &inputs, const std::v
|
|||
core_ml__specification__valid_padding__init(mLayer_->pooling->valid);
|
||||
break;
|
||||
case PoolPadType_CAFFE:
|
||||
// TODO: deal caffe pad mode
|
||||
mLayer_->pooling->pooling_padding_type_case = CORE_ML__SPECIFICATION__POOLING_LAYER_PARAMS__POOLING_PADDING_TYPE_INCLUDE_LAST_PIXEL;
|
||||
mLayer_->pooling->includelastpixel = mCoreMLBackend->create<CoreML__Specification__PoolingLayerParams__ValidCompletePadding>();
|
||||
core_ml__specification__pooling_layer_params__valid_complete_padding__init(mLayer_->pooling->includelastpixel);
|
||||
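// When explicit pads are present, pre-pad the input via addPadLayer and switch the pooling
// to VALID padding; otherwise fall back to include-last-pixel padding.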
if ((nullptr != pool->pads() && pool->pads()->size() > 0) || pool->padX() > 0 || pool->padY() > 0) {
|
||||
addPadLayer(inputs[0], pool);
|
||||
mLayer_->pooling->pooling_padding_type_case = CORE_ML__SPECIFICATION__POOLING_LAYER_PARAMS__POOLING_PADDING_TYPE_VALID;
|
||||
mLayer_->pooling->valid = mCoreMLBackend->create<CoreML__Specification__ValidPadding>();
|
||||
core_ml__specification__valid_padding__init(mLayer_->pooling->valid);
|
||||
} else {
|
||||
mLayer_->pooling->pooling_padding_type_case = CORE_ML__SPECIFICATION__POOLING_LAYER_PARAMS__POOLING_PADDING_TYPE_INCLUDE_LAST_PIXEL;
|
||||
mLayer_->pooling->includelastpixel = mCoreMLBackend->create<CoreML__Specification__PoolingLayerParams__ValidCompletePadding>();
|
||||
core_ml__specification__pooling_layer_params__valid_complete_padding__init(mLayer_->pooling->includelastpixel);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -62,13 +116,11 @@ ErrorCode CoreMLPool::onResize(const std::vector<Tensor *> &inputs, const std::v
|
|||
} else {
|
||||
mLayer_->pooling->type = CORE_ML__SPECIFICATION__POOLING_LAYER_PARAMS__POOLING_TYPE__MAX;
|
||||
}
|
||||
setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0])}, {mCoreMLBackend->getTensorName(outputs[0])});
|
||||
setLayerInputsAndOutputs(mLayer_, {mPoolInputName}, {mPoolOutputName});
|
||||
mCoreMLBackend->addLayer(mLayer_);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLPool>> __pool_op(OpType_Pooling);
|
||||
|
||||
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLPool, OpType_Pooling)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -19,6 +19,9 @@ public:
|
|||
CoreMLPool(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
|
||||
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
|
||||
virtual ~CoreMLPool() = default;
|
||||
private:
|
||||
void addPadLayer(const Tensor * input, const Pool* common);
|
||||
std::string mPoolInputName, mPoolOutputName;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
||||
|
|
|
@ -366,5 +366,5 @@ ErrorCode CoreMLRaster::onResize(const std::vector<Tensor *> &inputs, const std:
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLRaster>> __raster_op(OpType_Raster);
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLRaster, OpType_Raster)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -67,5 +67,5 @@ ErrorCode CoreMLReduction::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLReduction>> __reduction_op(OpType_Reduction);
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLReduction, OpType_Reduction)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -47,5 +47,5 @@ ErrorCode CoreMLScale::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLScale>> __scale_op(OpType_Scale);
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLScale, OpType_Scale)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -234,5 +234,5 @@ ErrorCode CoreMLUnary::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CoreMLCreatorRegister<TypedCreator<CoreMLUnary>> __unary_op(OpType_UnaryOp);
|
||||
REGISTER_COREML_OP_CREATOR(CoreMLUnary, OpType_UnaryOp)
|
||||
} // namespace MNN
|
||||
|
|
|
@ -220,6 +220,8 @@ Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType sto
|
|||
if (nullptr != originMem) {
|
||||
if (static_cast<CPUMemObj*>(originMem)->getSize() >= size) {
|
||||
return originMem;
|
||||
} else {
|
||||
TensorUtils::getDescribe(dest)->mem.reset(nullptr);
|
||||
}
|
||||
}
|
||||
// MNN_PRINT("Acquire size = %d\n", size);
|
||||
|
|
|
@ -30,9 +30,17 @@ ErrorCode CPUGridSample::onResize(const std::vector<Tensor *> &inputs, const std
|
|||
int numberThread = static_cast<CPUBackend*>(backend())->threadNumber();
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto outputTensor = outputs[0];
|
||||
auto outH = outputTensor->buffer().dim[2].extent;
|
||||
auto outW = outputTensor->buffer().dim[3].extent;
|
||||
mTempCordBuffer.reset(Tensor::createDevice<uint8_t>({1, outH * outW * 2 * core->bytes}));
|
||||
int outD, outH, outW;
|
||||
if (outputTensor->dimensions() == 4) {
|
||||
outH = outputTensor->buffer().dim[2].extent;
|
||||
outW = outputTensor->buffer().dim[3].extent;
|
||||
mTempCordBuffer.reset(Tensor::createDevice<uint8_t>({1, outH * outW * 2 * core->bytes}));
|
||||
} else {
|
||||
outD = outputTensor->buffer().dim[2].extent;
|
||||
outH = outputTensor->buffer().dim[3].extent;
|
||||
outW = outputTensor->buffer().dim[4].extent;
|
||||
mTempCordBuffer.reset(Tensor::createDevice<uint8_t>({1, outD * outH * outW * 3 * core->bytes}));
|
||||
}
|
||||
auto res = backend()->onAcquireBuffer(mTempCordBuffer.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
|
@ -52,35 +60,71 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
|
|||
auto batches = inputTensor->buffer().dim[0].extent;
|
||||
auto channels = inputTensor->buffer().dim[1].extent;
|
||||
auto channelC4 = UP_DIV(channels, core->pack);
|
||||
auto inH = inputTensor->buffer().dim[2].extent;
|
||||
auto inW = inputTensor->buffer().dim[3].extent;
|
||||
auto outH = outputTensor->buffer().dim[2].extent;
|
||||
auto outW = outputTensor->buffer().dim[3].extent;
|
||||
auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
|
||||
auto tileCount = outH;
|
||||
auto inOffset = batches * inH * inW * core->pack;
|
||||
auto outOffset = batches * outH * outW * core->pack;
|
||||
auto cordPtr = mTempCordBuffer->host<uint8_t>();
|
||||
for (auto b = 0; b < batches; ++b) {
|
||||
auto _inputPtr = inputPtr + b * inH * inW * core->pack * core->bytes;
|
||||
auto _gridPtr = gridPtr + b * gridTensor->buffer().dim[0].stride * core->bytes;
|
||||
auto _outputPtr = outputPtr + b * outH * outW * core->pack * core->bytes;
|
||||
core->MNNGridSampleComputeCord((float *)cordPtr, (const float *)_gridPtr, inH, inW, outH, outW, gridTensor->buffer().dim[1].stride, mAlignCorners);
|
||||
// Compute cord
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadCount) {
|
||||
for (int index=tId; index < tileCount; index += threadCount) {
|
||||
auto c = index / outH;
|
||||
auto h = index % outH;
|
||||
auto inputC = _inputPtr + c * inW * inH * batches * core->pack * core->bytes;
|
||||
auto outputC = _outputPtr + c * outW * outH * batches * core->pack * core->bytes;
|
||||
auto cordH = cordPtr + h * outW * 2 * core->bytes;
|
||||
auto outputH = outputC + h * outW * core->pack * core->bytes;
|
||||
core->MNNGridSampleInterp((float *)outputH, (const float *)inputC, (const float *)cordH, inH, inW, outW, channelC4, inOffset, outOffset, (mMode == SampleMode_NEAREST), (mPaddingMode == BorderMode_ZEROS));
|
||||
if (outputs[0]->dimensions() == 4) {
|
||||
auto inH = inputTensor->buffer().dim[2].extent;
|
||||
auto inW = inputTensor->buffer().dim[3].extent;
|
||||
auto outH = outputTensor->buffer().dim[2].extent;
|
||||
auto outW = outputTensor->buffer().dim[3].extent;
|
||||
auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
|
||||
auto tileCount = outH;
|
||||
auto inOffset = batches * inH * inW * core->pack;
|
||||
auto outOffset = batches * outH * outW * core->pack;
|
||||
auto cordPtr = mTempCordBuffer->host<uint8_t>();
|
||||
for (auto b = 0; b < batches; ++b) {
|
||||
auto _inputPtr = inputPtr + b * inH * inW * core->pack * core->bytes;
|
||||
auto _gridPtr = gridPtr + b * gridTensor->buffer().dim[0].stride * core->bytes;
|
||||
auto _outputPtr = outputPtr + b * outH * outW * core->pack * core->bytes;
|
||||
core->MNNGridSampleComputeCord((float *)cordPtr, (const float *)_gridPtr, inH, inW, outH, outW, gridTensor->buffer().dim[1].stride, mAlignCorners);
|
||||
// Compute cord
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadCount) {
|
||||
for (int index=tId; index < tileCount; index += threadCount) {
|
||||
auto c = index / outH;
|
||||
auto h = index % outH;
|
||||
auto inputC = _inputPtr + c * inW * inH * batches * core->pack * core->bytes;
|
||||
auto outputC = _outputPtr + c * outW * outH * batches * core->pack * core->bytes;
|
||||
auto cordH = cordPtr + h * outW * 2 * core->bytes;
|
||||
auto outputH = outputC + h * outW * core->pack * core->bytes;
|
||||
core->MNNGridSampleInterp((float *)outputH, (const float *)inputC, (const float *)cordH, inH, inW, outW, channelC4, inOffset, outOffset, (mMode == SampleMode_NEAREST), (mPaddingMode == BorderMode_ZEROS));
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
} else {
|
||||
auto inD = inputTensor->buffer().dim[2].extent;
|
||||
auto inH = inputTensor->buffer().dim[3].extent;
|
||||
auto inW = inputTensor->buffer().dim[4].extent;
|
||||
auto outD = outputTensor->buffer().dim[2].extent;
|
||||
auto outH = outputTensor->buffer().dim[3].extent;
|
||||
auto outW = outputTensor->buffer().dim[4].extent;
|
||||
auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
|
||||
auto tileCount = outD;
|
||||
auto inOffset = batches * inH * inW * core->pack;
|
||||
auto outOffset = batches * outH * outW * core->pack;
|
||||
auto cordPtr = mTempCordBuffer->host<uint8_t>();
|
||||
for (auto b = 0; b < batches; ++b) {
|
||||
auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes;
|
||||
auto _gridPtr = gridPtr + b * gridTensor->buffer().dim[0].stride * core->bytes;
|
||||
auto _outputPtr = outputPtr + b * outD * outH * outW * core->pack * core->bytes;
|
||||
core->MNNGridSampleComputeCord3D((float *)cordPtr, (const float *)_gridPtr, inD, inH, inW, outD, outH, outW, gridTensor->buffer().dim[1].stride, gridTensor->buffer().dim[2].stride, mAlignCorners);
|
||||
// Compute cord
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadCount) {
|
||||
for (int index=tId; index < tileCount; index += threadCount) {
|
||||
auto c = index / outD;
|
||||
auto d = index % outD;
|
||||
auto inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes;
|
||||
auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes;
|
||||
auto cordD = cordPtr + d * outH * outW * 3 * core->bytes;
|
||||
auto outputD = outputC + d * outH * outW * core->pack * core->bytes;
|
||||
for (int h = 0; h < outH; h++) {
|
||||
auto cordH = cordD + h * outW * 3 * core->bytes;
|
||||
auto outputH = outputD + h * outW * core->pack * core->bytes;
|
||||
core->MNNGridSampleInterp3D((float *)outputH, (const float *)inputC, (const float *)cordH, inD, inH, inW, outW, channelC4, inOffset, outOffset, (mMode == SampleMode_NEAREST), (mPaddingMode == BorderMode_ZEROS));
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
@ -74,12 +74,6 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
|
|||
const int inH = inputs[0]->buffer().dim[2].extent;
|
||||
const int outW = outputs[0]->buffer().dim[3].extent;
|
||||
const int outH = outputs[0]->buffer().dim[2].extent;
|
||||
if (mInit && mResizeType == 2) {
|
||||
backend()->onReleaseBuffer(&mWidthPosition, Backend::STATIC);
|
||||
backend()->onReleaseBuffer(&mWidthFactor, Backend::STATIC);
|
||||
backend()->onReleaseBuffer(&mHeightPosition, Backend::STATIC);
|
||||
backend()->onReleaseBuffer(&mHeightFactor, Backend::STATIC);
|
||||
}
|
||||
const float xScaling = mWidthScale;
|
||||
const float yScaling = mHeightScale;
|
||||
|
||||
|
@ -105,8 +99,6 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
|
|||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
mInit = true;
|
||||
|
||||
auto _wPosition = mWidthPosition.host<int>();
|
||||
auto _wFactor = mWidthFactor.host<float>();
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ public:
|
|||
const std::vector<Tensor*> &outputs) override;
|
||||
|
||||
private:
|
||||
std::vector<int> axis_;
|
||||
int axis_size = 0;
|
||||
int inner_size_ = 1;
|
||||
int outter_size_ = 1;
|
||||
int group_ = 1;
|
||||
|
@ -43,11 +43,7 @@ private:
|
|||
CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
|
||||
: Execution(backend) {
|
||||
const auto* layer_norm_param = op->main_as_LayerNorm();
|
||||
int axis_size = layer_norm_param->axis()->size();
|
||||
axis_.resize(axis_size);
|
||||
for (int i = 0; i < axis_size; ++i) {
|
||||
axis_[i] = layer_norm_param->axis()->Get(i);
|
||||
}
|
||||
axis_size = layer_norm_param->axis()->size();
|
||||
group_ = layer_norm_param->group();
|
||||
epsilon_ = layer_norm_param->epsilon();
|
||||
|
||||
|
@ -104,18 +100,10 @@ ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
|
|||
inner_size_ /= group_;
|
||||
return NO_ERROR;
|
||||
}
|
||||
std::vector<int> axis(axis_.size());
|
||||
for (int i = 0; i < axis_.size(); ++i) {
|
||||
if (axis_[i] < 0) {
|
||||
axis[i] += rank;
|
||||
}
|
||||
}
|
||||
std::sort(axis.begin(), axis.end());
|
||||
|
||||
for (int i = 0; i < rank - axis.size(); ++i) {
|
||||
for (int i = 0; i < rank - axis_size; ++i) {
|
||||
outter_size_ *= inputs.at(0)->length(i);
|
||||
}
|
||||
for (int i = rank - axis.size(); i < rank; ++i) {
|
||||
for (int i = rank - axis_size; i < rank; ++i) {
|
||||
inner_size_ *= inputs.at(0)->length(i);
|
||||
}
|
||||
return NO_ERROR;
|
||||
|
|
|
@ -35,6 +35,15 @@ static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int&
|
|||
channel = t->length(1);
|
||||
area = t->length(2);
|
||||
}
|
||||
} else if (t->dimensions() == 5) {
|
||||
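// 5-D tensors (e.g. NCDHW / NDHWC): fold the extra depth dimension into the spatial area.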
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
|
||||
if (format == MNN_DATA_FORMAT_NHWC) {
|
||||
channel = t->length(4);
|
||||
area = t->length(1) * t->length(2) * t->length(3);
|
||||
} else {
|
||||
channel = t->length(1);
|
||||
area = t->length(2) * t->length(3) * t->length(4);
|
||||
}
|
||||
} else {
|
||||
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
|
||||
if (format == MNN_DATA_FORMAT_NHWC) {
|
||||
|
|
|
@ -153,65 +153,63 @@ void CPUResizeCommon::CPUResizeBilinearC4(halide_buffer_t& input, halide_buffer_
|
|||
const int outW = output.dim[3].extent;
|
||||
const int outH = output.dim[2].extent;
|
||||
|
||||
int depthQuad = UP_DIV(input.dim[1].extent, 4);
|
||||
int depthQuad = UP_DIV(input.dim[1].extent, 4) * batches;
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
auto threadFunction = [&](size_t tId) {
|
||||
for (int n = (int)tId; n < depthQuad; n += threadNumber) {
|
||||
auto _lineBuffer = lineBuffer + 2 * 4 * outW * tId;
|
||||
auto _line0 = _lineBuffer + 4 * outW * 0;
|
||||
auto _line1 = _lineBuffer + 4 * outW * 1;
|
||||
int yUsed[2] = {0, 0};
|
||||
int yCache[2] = {-1, -1};
|
||||
auto threadFunction = [&](size_t tId) {
|
||||
for (int n = (int)tId; n < depthQuad; n += threadNumber) {
|
||||
auto _lineBuffer = lineBuffer + 2 * 4 * outW * tId;
|
||||
auto _line0 = _lineBuffer + 4 * outW * 0;
|
||||
auto _line1 = _lineBuffer + 4 * outW * 1;
|
||||
int yUsed[2] = {0, 0};
|
||||
int yCache[2] = {-1, -1};
|
||||
|
||||
float* yCacheLine[2] = {_line0, _line1};
|
||||
float* const yCacheStorage[2] = {_line0, _line1};
|
||||
float* yCacheLine[2] = {_line0, _line1};
|
||||
float* const yCacheStorage[2] = {_line0, _line1};
|
||||
|
||||
auto bottomData =
|
||||
reinterpret_cast<const float*>(input.host) + b * inputBatchSize + (int)n * 4 * inW * inH;
|
||||
auto topData = reinterpret_cast<float*>(output.host) + b * outputBatchSize + (int)n * 4 * outW * outH;
|
||||
for (int dy = 0; dy < outH; dy++) {
|
||||
int yp[2];
|
||||
yp[0] = heightPosition[2 * dy + 0];
|
||||
yp[1] = heightPosition[2 * dy + 1];
|
||||
// Search cache
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
yUsed[j] = 0;
|
||||
auto bottomData =
|
||||
reinterpret_cast<const float*>(input.host) + (int)n * 4 * inW * inH;
|
||||
auto topData = reinterpret_cast<float*>(output.host) + (int)n * 4 * outW * outH;
|
||||
for (int dy = 0; dy < outH; dy++) {
|
||||
int yp[2];
|
||||
yp[0] = heightPosition[2 * dy + 0];
|
||||
yp[1] = heightPosition[2 * dy + 1];
|
||||
// Search cache
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
yUsed[j] = 0;
|
||||
}
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
int find = 0;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (yp[j] == yCache[k]) {
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
find = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
int find = 0;
|
||||
if (!find) {
|
||||
const float* bottomY0 = bottomData + yp[j] * inW * 4;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (yp[j] == yCache[k]) {
|
||||
if (!yUsed[k]) {
|
||||
yCache[k] = yp[j];
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
find = 1;
|
||||
CPUBilinearSampleC4(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!find) {
|
||||
const float* bottomY0 = bottomData + yp[j] * inW * 4;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (!yUsed[k]) {
|
||||
yCache[k] = yp[j];
|
||||
yUsed[k] = 1;
|
||||
yCacheLine[j] = yCacheStorage[k];
|
||||
CPUBilinearSampleC4(bottomY0, yCacheLine[j], widthPosition, widthFactor, outW);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
auto topY = topData + outW * 4 * dy;
|
||||
// Sample Input
|
||||
CPUBilinearLineC4(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW);
|
||||
}
|
||||
auto topY = topData + outW * 4 * dy;
|
||||
// Sample Input
|
||||
CPUBilinearLineC4(topY, yCacheLine[0], yCacheLine[1], &heightFactor[dy], outW);
|
||||
}
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
threadFunction(tId);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
};
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
threadFunction(tId);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
|
||||
void CPUResizeCommon::CPUResizeNearestneighborRoundC4(halide_buffer_t &input, halide_buffer_t &output, float wScale, float hScale, float wOffset, float hOffset) {
|
||||
|
|
|
@ -18,6 +18,9 @@ void ScatterNdImpl(const Tensor* indices, const Tensor* updates, const Tensor* s
|
|||
auto outputPtr = output->host<T>();
|
||||
const int indicesDimension = indices->dimensions();
|
||||
const int indicesLastDim = indices->length(indicesDimension - 1);
|
||||
if (indicesLastDim == 0) {
|
||||
return;
|
||||
}
|
||||
const int indexes = indices->elementSize() / indicesLastDim;
|
||||
int accNumber = 1;
|
||||
for (int i = indicesDimension - 1; i < updates->dimensions(); ++i) {
|
||||
|
|
|
@ -1543,6 +1543,24 @@ void MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, size_t i
|
|||
}
|
||||
}
|
||||
}
|
||||
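// Maps normalized grid coordinates in [-1, 1] to source-volume coordinates per axis:
// align_corners ? (t + 1) * (size - 1) / 2 : ((t + 1) * size - 1) / 2, for t in {x, y, z}.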
void MNNGridSampleComputeCord3D(float* dst, const float* src, size_t inD, size_t inH, size_t inW, size_t outD, size_t outH, size_t outW, size_t strideD, size_t strideH, bool alignCorners) {
|
||||
float a = alignCorners ? 1.0f : 0.0f;
|
||||
float b = alignCorners ? 0.0f : 1.0f;
|
||||
for (auto d = 0; d < outD; ++d) {
|
||||
for (auto h = 0; h < outH; ++h) {
|
||||
auto __gridPtr = src + d * strideD + h * strideH;
|
||||
auto cordH = dst + (d * outH + h) * outW * 3;
|
||||
for (auto w = 0; w < outW; ++w) {
|
||||
auto x = __gridPtr[3 * w + 0];
|
||||
auto y = __gridPtr[3 * w + 1];
|
||||
auto z = __gridPtr[3 * w + 2];
|
||||
cordH[3 * w + 0] = ((1 + x) * (inW - a) - b) * 0.5f;
|
||||
cordH[3 * w + 1] = ((1 + y) * (inH - a) - b) * 0.5f;
|
||||
cordH[3 * w + 2] = ((1 + z) * (inD - a) - b) * 0.5f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef MNN_USE_SSE
|
||||
void MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
|
||||
|
@ -1585,6 +1603,22 @@ size_t MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padM
|
|||
return h * width * 4 + w * 4;
|
||||
}
|
||||
|
||||
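// Returns the C4-packed element offset of the sample point, or (size_t)-1 when the point
// lies outside the volume and zero padding is requested; otherwise coordinates are clamped.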
size_t MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int height, int width, bool padMode) {
|
||||
if (padMode == true) { //padMode == BorderMode_ZEROS
|
||||
if (h < 0 || h >= height || w < 0 || w >= width || d < 0 || d >= depth) {
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
// Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
|
||||
// For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
|
||||
// the leftover reflections degrade to GridSamplePaddingMode_BORDER
|
||||
d = d < 0 ? 0 : (d > (depth - 1) ? (depth - 1) : d);
|
||||
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
|
||||
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
|
||||
}
|
||||
return ((d * height + h) * width + w) * 4;
|
||||
}
|
||||
|
||||
void MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
|
||||
for (auto ow = 0; ow < outW; ++ow) {
|
||||
auto w = cordPtr[2 * ow + 0];
|
||||
|
@ -1695,6 +1729,70 @@ void MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::vector<
|
|||
}
|
||||
}
|
||||
|
||||
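// Nearest or trilinear sampling over a C4-packed volume; taps that resolve to offset -1
// (outside the volume under zero padding) contribute zero.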
void MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
|
||||
for (auto ow = 0; ow < outW; ++ow) {
|
||||
auto w = cordPtr[3 * ow + 0];
|
||||
auto h = cordPtr[3 * ow + 1];
|
||||
auto d = cordPtr[3 * ow + 2];
|
||||
Vec4 interp;
|
||||
|
||||
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
|
||||
int nd = ::floor(d + 0.5f);
|
||||
int nh = ::floor(h + 0.5f);
|
||||
int nw = ::floor(w + 0.5f);
|
||||
size_t ns = MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode);
|
||||
for (int k = 0; k < channelCUnit; ++k) {
|
||||
interp = ns == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + ns);
|
||||
Vec4::save(outputPtr + k * outOffset + 4 * ow, interp);
|
||||
}
|
||||
} else { //sampleMode == GridSampleMode_BILINEAR
|
||||
int w0_d = ::floor(d);
|
||||
int w0_h = ::floor(h);
|
||||
int w0_w = ::floor(w);
|
||||
int w1_d = ::ceil(d);
|
||||
int w1_h = ::ceil(h);
|
||||
int w1_w = ::ceil(w);
|
||||
auto oneV = Vec4(1.0f);
|
||||
|
||||
auto f0 = Vec4((float)w1_w - w);
|
||||
auto f1 = oneV - f0;
|
||||
auto h0 = Vec4((float)w1_h - h);
|
||||
auto h1 = oneV - h0;
|
||||
auto d0 = Vec4((float)w1_d - d);
|
||||
auto d1 = oneV - d0;
|
||||
|
||||
size_t s000 = MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s001 = MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode);
|
||||
size_t s010 = MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s011 = MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode);
|
||||
size_t s100 = MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s101 = MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode);
|
||||
size_t s110 = MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s111 = MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode);
|
||||
|
||||
for (int k = 0; k < channelCUnit; ++k) {
|
||||
Vec4 i000 = s000 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s000);
|
||||
Vec4 i001 = s001 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s001);
|
||||
Vec4 i010 = s010 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s010);
|
||||
Vec4 i011 = s011 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s011);
|
||||
Vec4 i100 = s100 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s100);
|
||||
Vec4 i101 = s101 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s101);
|
||||
Vec4 i110 = s110 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s110);
|
||||
Vec4 i111 = s111 == -1 ? Vec4(0.f) : Vec4::load(inputPtr + k * inOffset + s111);
|
||||
|
||||
Vec4 i00 = i000 * f0 + i001 * f1;
|
||||
Vec4 i01 = i010 * f0 + i011 * f1;
|
||||
Vec4 i0 = i00 * h0 + i01 * h1;
|
||||
Vec4 i10 = i100 * f0 + i101 * f1;
|
||||
Vec4 i11 = i110 * f0 + i111 * f1;
|
||||
Vec4 i1 = i10 * h0 + i11 * h1;
|
||||
interp = i0 * d0 + i1 * d1;
|
||||
|
||||
Vec4::save(outputPtr + k * outOffset + 4 * ow, interp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MNNPackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area,size_t depth, int* areaOffset) {
|
||||
MNNPackC4Common(dst, src, area, depth, areaOffset);
|
||||
|
@ -2714,6 +2812,8 @@ void MNNCoreFunctionInit() {
|
|||
gCoreFunction->MNNScaleAndAddBias = MNNScaleAndAddBias;
|
||||
gCoreFunction->MNNGridSampleComputeCord = MNNGridSampleComputeCord;
|
||||
gCoreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
|
||||
gCoreFunction->MNNGridSampleComputeCord3D = MNNGridSampleComputeCord3D;
|
||||
gCoreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
|
||||
gCoreFunction->MNNRoiPoolingMax = MNNRoiPoolingMax;
|
||||
gCoreFunction->MNNRoiAlignMax = MNNRoiAlignMax;
|
||||
gCoreFunction->MNNRoiAlignAvg = MNNRoiAlignAvg;
|
||||
|
|
|
@ -149,10 +149,13 @@ void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count);
|
|||
void MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, size_t inW, size_t outH, size_t outW, size_t stride, bool alignCorners);
|
||||
void MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW,
|
||||
size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode);
|
||||
void MNNGridSampleComputeCord3D(float* dst, const float* src, size_t inD, size_t inH, size_t inW, size_t outD, size_t outH, size_t outW, size_t stride, bool alignCorners);
|
||||
void MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode);
|
||||
void MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw);
|
||||
void MNNRoiAlignMax(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
|
||||
void MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
|
||||
|
||||
|
||||
typedef void(*MNNBinaryExecute)(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int elementSize, int broadcastIndex);
|
||||
typedef void(*MNNUnaryExecute)(void* outputRaw, const void* inputRaw, int elementSize);
|
||||
typedef void(*MNNCopyWithStride)(uint8_t* dstO, const uint8_t* srcO, int size, int stride, int ds);
|
||||
|
@ -237,6 +240,8 @@ struct CoreFunctions {
|
|||
void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber);
|
||||
void(*MNNGridSampleComputeCord)(float* dst, const float* src, size_t inH, size_t inW, size_t outH, size_t outW, size_t stride, bool alignCorners);
|
||||
void(*MNNGridSampleInterp)(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode);
|
||||
void(*MNNGridSampleComputeCord3D)(float* dst, const float* src, size_t inD, size_t inH, size_t inW, size_t outD, size_t outH, size_t outW, size_t stride1, size_t stride2, bool alignCorners);
|
||||
void(*MNNGridSampleInterp3D)(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode);
|
||||
void(*MNNRoiPoolingMax)(float* dst, const float* src, int hLen, int wLen, int iw);
|
||||
void(*MNNRoiAlignMax)(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
|
||||
void(*MNNRoiAlignAvg)(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
|
||||
|
|
|
@ -438,6 +438,154 @@ void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::ve
|
|||
}
|
||||
}
|
||||
|
||||
void _AVX_MNNGridSampleComputeCord3D(float* dst, const float* src, size_t inD, size_t inH, size_t inW, size_t outD, size_t outH, size_t outW, size_t strideD, size_t strideH, bool alignCorners) {
|
||||
__m256 zero = _mm256_setzero_ps();
|
||||
__m256 one = _mm256_set1_ps(1);
|
||||
__m256 half = _mm256_set1_ps(0.5f);
|
||||
__m256 a = alignCorners ? one : zero;
|
||||
__m256 b = alignCorners ? zero : one;
|
||||
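// Grid points are (x, y, z) triples laid out contiguously, so each 8-lane register spans
// eight consecutive components; in0/in1/in2 hold the matching W/H/D scale factors with the
// triple pattern rotated across the three registers.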
__m256 in0 = _mm256_set_ps(inH, inW, inD, inH, inW, inD, inH, inW);
|
||||
__m256 in1 = _mm256_set_ps(inW, inD, inH, inW, inD, inH, inW, inD);
|
||||
__m256 in2 = _mm256_set_ps(inD, inH, inW, inD, inH, inW, inD, inH);
|
||||
int area = outD * outH * outW;
|
||||
int areaC4 = area / PACK_UNIT;
|
||||
int areaRemain = area - areaC4 * PACK_UNIT;
|
||||
float buffer[3 * PACK_UNIT] = { 0 };
|
||||
for (int i = 0; i < areaC4; ++i) {
|
||||
__m256 cord0 = _mm256_loadu_ps(src);
|
||||
__m256 cord1 = _mm256_loadu_ps(src + PACK_UNIT);
|
||||
__m256 cord2 = _mm256_loadu_ps(src + PACK_UNIT * 2);
|
||||
cord0 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord0), _mm256_sub_ps(in0, a)), b));
|
||||
cord1 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord1), _mm256_sub_ps(in1, a)), b));
|
||||
cord2 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord2), _mm256_sub_ps(in2, a)), b));
|
||||
_mm256_storeu_ps(dst, cord0);
|
||||
_mm256_storeu_ps(dst + PACK_UNIT, cord1);
|
||||
_mm256_storeu_ps(dst + PACK_UNIT * 2, cord2);
|
||||
src += PACK_UNIT * 3;
|
||||
dst += PACK_UNIT * 3;
|
||||
}
|
||||
|
||||
if (areaRemain > 0) {
|
||||
float flag[PACK_UNIT] = {0.f};
|
||||
__m256i mask;
|
||||
if (areaRemain < 3) {
|
||||
for (int i = 0; i < areaRemain * 3; i++) {
|
||||
flag[i] = -0.f;
|
||||
}
|
||||
mask = _mm256_loadu_si256((__m256i*)flag);
|
||||
__m256 cord0 = _mm256_loadu_ps(src);
|
||||
cord0 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord0), _mm256_sub_ps(in0, a)), b));
|
||||
_mm256_maskstore_ps(dst, mask, cord0);
|
||||
} else if (areaRemain < 6) {
|
||||
for (int i = 0; i < areaRemain * 3 - 8; i++) {
|
||||
flag[i] = -0.f;
|
||||
}
|
||||
mask = _mm256_loadu_si256((__m256i*)flag);
|
||||
__m256 cord0 = _mm256_loadu_ps(src);
|
||||
__m256 cord1 = _mm256_maskload_ps(src + PACK_UNIT, mask);
|
||||
cord0 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord0), _mm256_sub_ps(in0, a)), b));
|
||||
cord1 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord1), _mm256_sub_ps(in1, a)), b));
|
||||
_mm256_storeu_ps(dst, cord0);
|
||||
_mm256_maskstore_ps(dst + PACK_UNIT, mask, cord1);
|
||||
} else {
|
||||
for (int i = 0; i < areaRemain * 3 - 16; i++) {
|
||||
flag[i] = -0.f;
|
||||
}
|
||||
mask = _mm256_loadu_si256((__m256i*)flag);
|
||||
__m256 cord0 = _mm256_loadu_ps(src);
|
||||
__m256 cord1 = _mm256_loadu_ps(src + PACK_UNIT);
|
||||
__m256 cord2 = _mm256_maskload_ps(src + PACK_UNIT * 2, mask);
|
||||
cord0 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord0), _mm256_sub_ps(in0, a)), b));
|
||||
cord1 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord1), _mm256_sub_ps(in1, a)), b));
|
||||
cord2 = _mm256_mul_ps(half, _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(one, cord2), _mm256_sub_ps(in2, a)), b));
|
||||
_mm256_storeu_ps(dst, cord0);
|
||||
_mm256_storeu_ps(dst + PACK_UNIT, cord1);
|
||||
_mm256_maskstore_ps(dst + PACK_UNIT * 2, mask, cord2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int height, int width, bool padMode) {
|
||||
if (padMode == true) { //padMode == BorderMode_ZEROS
|
||||
if (d < 0 || d >= depth || h < 0 || h >= height || w < 0 || w >= width) {
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
// Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
|
||||
// For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
|
||||
// the leftover reflections degrade to GridSamplePaddingMode_BORDER
|
||||
d = d < 0 ? 0 : (d > (depth - 1) ? (depth - 1) : d);
|
||||
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
|
||||
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
|
||||
}
|
||||
return ((d * height + h) * width + w) * PACK_UNIT;
|
||||
}
|
||||
|
||||
void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
|
||||
for (auto ow = 0; ow < outW; ++ow) {
|
||||
auto w = cordPtr[3 * ow + 0];
|
||||
auto h = cordPtr[3 * ow + 1];
|
||||
auto d = cordPtr[3 * ow + 2];
|
||||
__m256 interp;
|
||||
|
||||
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
|
||||
int nd = ::floor(d + 0.5f);
|
||||
int nh = ::floor(h + 0.5f);
|
||||
int nw = ::floor(w + 0.5f);
|
||||
size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode);
|
||||
for (int k = 0; k < channelCUnit; ++k) {
|
||||
interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
|
||||
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
|
||||
}
|
||||
} else { //sampleMode == GridSampleMode_BILINEAR
|
||||
int w0_d = ::floor(d);
|
||||
int w0_h = ::floor(h);
|
||||
int w0_w = ::floor(w);
|
||||
int w1_d = ::ceil(d);
|
||||
int w1_h = ::ceil(h);
|
||||
int w1_w = ::ceil(w);
|
||||
auto oneV = _mm256_set1_ps(1.0f);
|
||||
|
||||
auto f0 = _mm256_set1_ps((float)w1_w - w);
|
||||
auto f1 = _mm256_sub_ps(oneV, f0);
|
||||
auto h0 = _mm256_set1_ps((float)w1_h - h);
|
||||
auto h1 = _mm256_sub_ps(oneV, h0);
|
||||
auto d0 = _mm256_set1_ps((float)w1_d - d);
|
||||
auto d1 = _mm256_sub_ps(oneV, d0);
|
||||
|
||||
size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode);
|
||||
size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode);
|
||||
size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode);
|
||||
size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode);
|
||||
size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode);
|
||||
|
||||
for (int k = 0; k < channelCUnit; ++k) {
|
||||
__m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000);
|
||||
__m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001);
|
||||
__m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010);
|
||||
__m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011);
|
||||
__m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100);
|
||||
__m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101);
|
||||
__m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110);
|
||||
__m256 i111 = s111 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111);
|
||||
|
||||
__m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1));
|
||||
__m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1));
|
||||
__m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1));
|
||||
__m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1));
|
||||
__m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1));
|
||||
__m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1));
|
||||
|
||||
interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1));
|
||||
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
|
||||
size_t bStride, size_t height) {
|
||||
for (int y = 0; y < height; ++y) {
|
||||
|
@ -727,6 +875,8 @@ void _AVX_ExtraInit(void* functions) {
|
|||
coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise;
|
||||
coreFunction->MNNGridSampleComputeCord = _AVX_MNNGridSampleComputeCord;
|
||||
coreFunction->MNNGridSampleInterp = _AVX_MNNGridSampleInterp;
|
||||
coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D;
|
||||
coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D;
|
||||
coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax;
|
||||
coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax;
|
||||
coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg;
|
||||
|
|
|
@ -282,3 +282,58 @@ kernel void blit_int64(const device short4 *in [[buffer(0)]],
|
|||
out[int(dstOffset)] = in[int(srcOffset)];
|
||||
}
|
||||
}
|
||||
|
||||
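// Re-lays one spatial position of one channel-of-4 slice: NHWC offset
// (b * size + x) * channel + 4 * c4  ->  NCHW offset (b * channel + 4 * c4) * size + x,
// with bounds checks so a channel count that is not a multiple of 4 is handled.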
template<typename IType, typename OType>
|
||||
static inline void template_NHWC_to_NCHW(const device IType* in,
|
||||
device OType* out, constant tensor_shape &s, uint2 gid) {
|
||||
int b = gid.y / s.slice;
|
||||
int c4 = gid.y % s.slice;
|
||||
|
||||
auto in_off = (b * s.size + gid.x) * s.channel + c4 * 4;
|
||||
auto out_off = (b * s.channel + c4 * 4) * s.size + gid.x;
|
||||
|
||||
out[out_off] = in[in_off];
|
||||
if(c4 * 4 + 1 < s.channel) {
|
||||
out[out_off + s.size] = in[in_off + 1];
|
||||
}
|
||||
if(c4 * 4 + 2 < s.channel) {
|
||||
out[out_off + s.size * 2] = in[in_off + 2];
|
||||
}
|
||||
if(c4 * 4 + 3 < s.channel) {
|
||||
out[out_off + s.size * 3] = in[in_off + 3];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void upcast_f_NHWC_to_NCHW(const device ftype *in [[buffer(0)]],
|
||||
device float *out [[buffer(1)]],
|
||||
constant tensor_shape &s [[buffer(2)]],
|
||||
uint2 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x < s.size && (int)gid.y < s.batch_slices) template_NHWC_to_NCHW<ftype, float>(in, out, s, gid);
|
||||
}
|
||||
|
||||
template<typename IType, typename OType>
|
||||
static inline void template_NCHW_to_NHWC(const device IType* in,
|
||||
device OType* out, constant tensor_shape &s, uint2 gid) {
|
||||
int b = gid.y / s.slice;
|
||||
int c4 = gid.y % s.slice;
|
||||
|
||||
auto in_off = (b * s.channel + c4 * 4) * s.size + gid.x;
|
||||
auto out_off = (b * s.size + gid.x) * s.channel + c4 * 4;
|
||||
|
||||
out[out_off] = in[in_off];
|
||||
if(c4 * 4 + 1 < s.channel) {
|
||||
out[out_off + 1] = in[in_off + s.size];
|
||||
}
|
||||
if(c4 * 4 + 2 < s.channel) {
|
||||
out[out_off + 2] = in[in_off + s.size * 2];
|
||||
}
|
||||
if(c4 * 4 + 3 < s.channel) {
|
||||
out[out_off + 3] = in[in_off + s.size * 3];
|
||||
}
|
||||
}
|
||||
kernel void downcast_f_NCHW_to_NHWC(const device float *in [[buffer(0)]],
|
||||
device ftype *out [[buffer(1)]],
|
||||
constant tensor_shape &s [[buffer(2)]],
|
||||
uint2 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x < s.size && (int)gid.y < s.batch_slices) template_NCHW_to_NHWC<float, ftype>(in, out, s, gid);
|
||||
}
|
||||
|
|
|
@ -282,7 +282,7 @@ static NSString *kernelForConvert(halide_type_t type, MNN_DATA_FORMAT from, MNN_
|
|||
// from MNN_DATA_FORMAT_NCHW
|
||||
{nil, nil, @"upcast_f_NCHW_to_NC4HW4", nil, nil},
|
||||
// from MNN_DATA_FORMAT_NHWC
|
||||
{nil, nil, @"upcast_f_NHWC_to_NC4HW4", nil, nil},
|
||||
{@"upcast_f_NHWC_to_NCHW", nil, @"upcast_f_NHWC_to_NC4HW4", nil, nil},
|
||||
// from MNN_DATA_FORMAT_NC4HW4
|
||||
{@"upcast_f_NC4HW4_to_NCHW", @"upcast_f_NC4HW4_to_NHWC", nil, nil, nil},
|
||||
// from MNN_DATA_FORMAT_NHWC4
|
||||
|
@ -293,7 +293,7 @@ static NSString *kernelForConvert(halide_type_t type, MNN_DATA_FORMAT from, MNN_
|
|||
// down
|
||||
{
|
||||
// from MNN_DATA_FORMAT_NCHW
|
||||
{nil, nil, @"downcast_f_NCHW_to_NC4HW4", nil, nil},
|
||||
{nil, @"downcast_f_NCHW_to_NHWC", @"downcast_f_NCHW_to_NC4HW4", nil, nil},
|
||||
// from MNN_DATA_FORMAT_NHWC
|
||||
{nil, nil, @"downcast_f_NHWC_to_NC4HW4", nil, nil},
|
||||
// from MNN_DATA_FORMAT_NC4HW4
|
||||
|
|
|
@ -35,7 +35,7 @@ def genSchema():
|
|||
FLATC = shaderPath + "/../../../3rd_party/flatbuffers/tmp/flatc"
|
||||
sourceFile = shaderPath + "/schema/MetalCache.fbs"
|
||||
destFile = shaderPath + "/"
|
||||
cmd = FLATC + " -c " + sourceFile +" --gen-object-api"
|
||||
cmd = FLATC + " -c " + sourceFile +" --gen-object-api" +" --reflect-names"
|
||||
print(cmd)
|
||||
print(os.popen(cmd).read())
|
||||
return
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
//
|
||||
// MetalLayerNorm.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/30.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MetalLayerNorm_hpp
|
||||
#define MetalLayerNorm_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalDefine.h"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalLayerNorm : public Execution {
|
||||
public:
|
||||
MetalLayerNorm(Backend *backend, const LayerNorm *layernorm);
|
||||
virtual ~MetalLayerNorm() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
int mOutside;
|
||||
std::vector<int> mAxis;
|
||||
int mInside;
|
||||
int mGroup = 1;
|
||||
float mEps;
|
||||
|
||||
bool has_gamma_beta_ = false;
|
||||
id<MTLBuffer> mGammaBuffer = nil;
|
||||
id<MTLBuffer> mBetaBuffer = nil;
|
||||
id<MTLBuffer> mShapeBuffer;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
#endif /* MNN_METAL_ENABLED */
|
||||
#endif /* MetalLayerNorm_hpp */
|
|
@ -0,0 +1,99 @@
|
|||
//
|
||||
// MetalLayerNorm.metal
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2022/06/14.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include <metal_stdlib>
|
||||
#include "MetalDefine.metal"
|
||||
|
||||
using namespace metal;
|
||||
|
||||
struct layernorm_constants {
|
||||
int inside;
|
||||
int outside;
|
||||
float eps;
|
||||
int has_gamma_beta;
|
||||
};
|
||||
|
||||
kernel void layernorm_x1(const device ftype *in [[buffer(0)]],
|
||||
device ftype *out [[buffer(1)]],
|
||||
constant layernorm_constants& cst [[buffer(2)]],
|
||||
const device float *gamma [[buffer(3)]],
|
||||
const device float *beta [[buffer(4)]],
|
||||
uint2 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x >= cst.inside || (int)gid.y >= cst.outside) {
|
||||
return;
|
||||
}
|
||||
auto in_data = in + gid.y * cst.inside;
|
||||
auto out_data = out + gid.y * cst.inside;
|
||||
|
||||
float mean;
|
||||
float sum = 0.0f;
|
||||
float square_sum = 0.0f;
|
||||
|
||||
for(int i = 0; i < cst.inside; i++) {
|
||||
sum += in_data[i];
|
||||
}
|
||||
mean = sum / cst.inside;
|
||||
|
||||
for(int i = 0; i < cst.inside; i++) {
|
||||
float dis = (in_data[i] - mean);
|
||||
square_sum += dis * dis;
|
||||
}
|
||||
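// Note: "var" holds 1 / sqrt(variance + eps), i.e. the reciprocal of the standard deviation.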
float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps);
|
||||
|
||||
float norm = var * ((float)in_data[gid.x] - mean);
|
||||
if(cst.has_gamma_beta) {
|
||||
out_data[gid.x] = (ftype)(norm * gamma[gid.x] + beta[gid.x]);
|
||||
} else {
|
||||
out_data[gid.x] = (ftype)(norm);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void layernorm_x4(const device ftype4 *in [[buffer(0)]],
|
||||
device ftype4 *out [[buffer(1)]],
|
||||
constant layernorm_constants& cst [[buffer(2)]],
|
||||
const device float4 *gamma [[buffer(3)]],
|
||||
const device float4 *beta [[buffer(4)]],
|
||||
uint2 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x >= cst.inside/4 || (int)gid.y >= cst.outside) {
|
||||
return;
|
||||
}
|
||||
auto in_data = in + gid.y * cst.inside/4;
|
||||
auto out_data = out + gid.y * cst.inside/4;
|
||||
|
||||
float mean;
|
||||
float sum = 0.0f;
|
||||
float square_sum = 0.0f;
|
||||
|
||||
for(int i = 0; i < cst.inside/4; i++) {
|
||||
sum += in_data[i].x;
|
||||
sum += in_data[i].y;
|
||||
sum += in_data[i].z;
|
||||
sum += in_data[i].w;
|
||||
}
|
||||
mean = sum / cst.inside;
|
||||
|
||||
for(int i = 0; i < cst.inside/4; i++) {
|
||||
float dis = (in_data[i].x - mean);
|
||||
square_sum += dis * dis;
|
||||
dis = (in_data[i].y - mean);
|
||||
square_sum += dis * dis;
|
||||
dis = (in_data[i].z - mean);
|
||||
square_sum += dis * dis;
|
||||
dis = (in_data[i].w - mean);
|
||||
square_sum += dis * dis;
|
||||
}
|
||||
float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps);
|
||||
|
||||
float4 norm = var * ((float4)in_data[gid.x] - mean);
|
||||
if(cst.has_gamma_beta) {
|
||||
out_data[gid.x] = (ftype4)(norm * gamma[gid.x] + beta[gid.x]);
|
||||
} else {
|
||||
out_data[gid.x] = (ftype4)(norm);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,135 @@
|
|||
//
|
||||
// MetalLayerNorm.mm
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2022/06/14.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#import "backend/metal/MetalLayerNorm.hpp"
|
||||
#import "backend/metal/MNNMetalContext.h"
|
||||
#import "backend/metal/MetalBackend.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
MetalLayerNorm::MetalLayerNorm(Backend *backend, const LayerNorm *layernorm)
|
||||
: Execution(backend), mGroup(layernorm->group()),
|
||||
mEps(layernorm->epsilon()) {
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
|
||||
int axis_size = layernorm->axis()->size();
|
||||
mAxis.resize(axis_size);
|
||||
for (int i = 0; i < axis_size; ++i) {
|
||||
mAxis[i] = layernorm->axis()->Get(i);
|
||||
}
|
||||
|
||||
if (layernorm->gamma() && layernorm->beta()) {
|
||||
has_gamma_beta_ = true;
|
||||
int gamma_size = layernorm->gamma()->size();
|
||||
const float* gamma_data = layernorm->gamma()->data();
|
||||
mGammaBuffer =
|
||||
[context newDeviceBuffer:gamma_size * sizeof(float) access:CPUWriteOnly];
|
||||
|
||||
memcpy(mGammaBuffer.contents, (const void *)gamma_data, gamma_size * sizeof(float));
|
||||
|
||||
if (layernorm->beta()->size() != gamma_size) {
|
||||
MNN_ERROR("Size of gamma and beta are not match in MetalLayerNorm.\n");
|
||||
}
|
||||
|
||||
const float* beta_data = layernorm->beta()->data();
|
||||
mBetaBuffer =
|
||||
[context newDeviceBuffer:gamma_size * sizeof(float) access:CPUWriteOnly];
|
||||
memcpy(mBetaBuffer.contents, (const void *)beta_data, gamma_size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();

    auto input = inputs[0], output = outputs[0];

    mOutside = 1;
    mInside = 1;
    int rank = input->dimensions();
    if (mGroup > 1) {
        mOutside = input->length(0) * mGroup;
        for (int i = 1; i < rank; i++) {
            mInside *= input->length(i);
        }
        mInside /= mGroup;
        return NO_ERROR;
    }
    std::vector<int> axis(mAxis.size());
    for (int i = 0; i < mAxis.size(); ++i) {
        if (mAxis[i] < 0) {
            mAxis[i] += rank;
        }
    }
    std::sort(mAxis.begin(), mAxis.end());

    for (int i = 0; i < rank - axis.size(); ++i) {
        mOutside *= input->length(i);
    }
    for (int i = rank - axis.size(); i < rank; ++i) {
        mInside *= input->length(i);
    }

    mShapeBuffer = [context newDeviceBuffer:3 * sizeof(int) + sizeof(float) access:CPUWriteOnly];
    ((int *)mShapeBuffer.contents)[0]   = mInside;
    ((int *)mShapeBuffer.contents)[1]   = mOutside;
    ((float *)mShapeBuffer.contents)[2] = mEps;
    ((int *)mShapeBuffer.contents)[3]   = (int)has_gamma_beta_;

    bool parallel = (mInside > 32) && ((mInside & 3) == 0);
    mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4" : @"layernorm_x1"];

    auto inside = parallel ? mInside/4 : mInside;
    mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)];
    return NO_ERROR;
}

ErrorCode MetalLayerNorm::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();

    if(backend->isCommandEncoderSet()) {
        return NO_ERROR;
    }

    auto func = [=](){
        auto input = inputs[0], output = outputs[0];

        auto encoder = backend->encoder();
        [encoder setComputePipelineState:mPipeline];
        [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
        [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
        [encoder setBuffer:mShapeBuffer offset:0 atIndex:2];
        [encoder setBuffer:mGammaBuffer offset:0 atIndex:3];
        [encoder setBuffer:mBetaBuffer offset:0 atIndex:4];

        [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
        MNN_PRINT_ENCODER(context, encoder);

        auto context = (__bridge MNNMetalContext *)backend->context();
        if(backend->isCmdBufferCommit()) {
            backend->flushEncoder();
            [context commit_net];
        }
    };
    func();
    backend->addOpEncoder(func);
    return NO_ERROR;
}

class MetalLayerNormCreator : public MetalBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *> &outputs) const {
        return new MetalLayerNorm(backend, op->main_as_LayerNorm());
    }
};
REGISTER_METAL_OP_CREATOR(MetalLayerNormCreator, OpType_LayerNorm);
} // namespace MNN
#endif /* MNN_METAL_ENABLED */
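Note on the layer norm dispatch above: the shape buffer packs {inside, outside, eps, has_gamma_beta}, each of the outside rows is normalized over its inside elements, and layernorm_x4 is only chosen when inside is larger than 32 and a multiple of 4 so the kernel can process float4 lanes. A minimal CPU reference of the same computation, as a sketch only (the helper name is mine, not part of the diff):

#include <cmath>

// Reference layer norm over an [outside, inside] view, mirroring the
// {inside, outside, eps, has_gamma_beta} shape buffer written above.
static void layerNormReference(const float* in, float* out, int outside, int inside,
                               float eps, const float* gamma, const float* beta) {
    for (int o = 0; o < outside; ++o) {
        const float* x = in + o * inside;
        float* y       = out + o * inside;
        float mean = 0.f;
        for (int i = 0; i < inside; ++i) mean += x[i];
        mean /= inside;
        float var = 0.f;
        for (int i = 0; i < inside; ++i) var += (x[i] - mean) * (x[i] - mean);
        var /= inside;
        float norm = 1.f / std::sqrt(var + eps);
        for (int i = 0; i < inside; ++i) {
            float v = (x[i] - mean) * norm;
            y[i] = (gamma != nullptr) ? v * gamma[i] + beta[i] : v; // gamma/beta only when has_gamma_beta
        }
    }
}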
@ -2,10 +2,12 @@
#import "backend/metal/MetalDefine.h"
namespace MNN {
#if MNN_METAL_ENABLED
extern void ___MetalSoftmaxCreator__OpType_Softmax__();
extern void ___MetalCastCreator__OpType_Cast__();
extern void ___MetalReductionCreator__OpType_Reduction__();
extern void ___MetalEltwiseCreator__OpType_Eltwise__();
extern void ___MetalConvolutionCreator__OpType_Convolution__();
extern void ___MetalLayerNormCreator__OpType_LayerNorm__();
extern void ___MetalMatMulCreator__OpType_MatMul__();
extern void ___MetalBinaryCreator__OpType_BinaryOp__();
extern void ___MetalConvolutionDepthwiseCreator__OpType_ConvolutionDepthwise__();

@ -24,10 +26,12 @@
extern void ___MetalPReLUCreator__OpType_PReLU__();
extern void ___MetalReLU6Creator__OpType_ReLU6__();
void registerMetalOps() {
    ___MetalSoftmaxCreator__OpType_Softmax__();
    ___MetalCastCreator__OpType_Cast__();
    ___MetalReductionCreator__OpType_Reduction__();
    ___MetalEltwiseCreator__OpType_Eltwise__();
    ___MetalConvolutionCreator__OpType_Convolution__();
    ___MetalLayerNormCreator__OpType_LayerNorm__();
    ___MetalMatMulCreator__OpType_MatMul__();
    ___MetalBinaryCreator__OpType_BinaryOp__();
    ___MetalConvolutionDepthwiseCreator__OpType_ConvolutionDepthwise__();
@ -0,0 +1,34 @@
//
//  MetalSoftmax.hpp
//  MNN
//
//  Created by MNN on 2019/01/30.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MetalSoftmax_hpp
#define MetalSoftmax_hpp

#import "core/Execution.hpp"

#if MNN_METAL_ENABLED
namespace MNN {

class MetalSoftmax : public Execution {
public:
    MetalSoftmax(Backend *backend, int32_t axis);
    virtual ~MetalSoftmax() = default;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    int32_t mAxis;
    int32_t mAxisLen;
    id<MTLBuffer> mShapeBuffer;
    id<MTLComputePipelineState> mPipeline;
    std::pair<MTLSize, MTLSize> mThreads;
};

} // namespace MNN
#endif /* MNN_METAL_ENABLED */
#endif /* MetalSoftmax_hpp */
@ -0,0 +1,117 @@
//
//  MetalSoftmax.metal
//  MNN
//
//  Created by MNN on 2018/08/08.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <metal_stdlib>
#include "MetalDefine.metal"

using namespace metal;

struct softmax_shape {
    int inside_size;
    int axis_length;
    int outside_size;
    int flat_length;
};

static inline float softmax_max4(float4 value) {
    return max(max(value[0], value[1]), max(value[2], value[3]));
}

static inline float softmax_sum4(float4 value) {
    return value[0] + value[1] + value[2] + value[3];
}

static inline float4 softmax_filter(float4 value, int z, int limit) {
    return select(0, value, z * 4 + int4(0, 1, 2, 3) < limit);
}

kernel void softmax_plane(const device ftype *in      [[buffer(0)]],
                          device ftype *out           [[buffer(1)]],
                          constant softmax_shape& s   [[buffer(2)]],
                          uint2 gid                   [[thread_position_in_grid]]) {
    if ((int)gid.x >= s.inside_size || (int)gid.y >= s.outside_size) return;

    auto axis_off = gid.y * s.axis_length * s.inside_size + gid.x;
    auto axis_in  = in + axis_off;
    auto axis_out = out + axis_off;

    // get max
    auto max1 = axis_in[0];
    for (int i = 1; i < s.axis_length; i++) {
        max1 = max(max1, axis_in[i * s.inside_size]);
    }

    // get sum
    float sum1 = 0;
    for (int i = 0; i < s.axis_length; i++) {
        sum1 += float(exp(axis_in[i * s.inside_size] - max1));
    }

    // output
    for (int i = 0; i < s.axis_length; i++) {
        axis_out[i * s.inside_size] = ftype(exp(float(axis_in[i * s.inside_size] - max1)) / sum1);
    }
}

kernel void softmax_on_reorder(const device ftype4 *in     [[buffer(0)]],
                               device ftype4 *out          [[buffer(1)]],
                               constant softmax_shape& s   [[buffer(2)]],
                               uint2 gid                   [[thread_position_in_grid]]) {
    if ((int)gid.x >= s.inside_size || (int)gid.y >= s.outside_size) return;

    auto axis_off = gid.y * s.axis_length * s.inside_size + gid.x;
    auto axis_in  = in + axis_off;
    auto axis_out = out + axis_off;

    // get max
    auto max4 = softmax_filter(float4(axis_in[0]), 0, s.flat_length);
    for (int i = 1; i < s.axis_length; i++) {
        max4 = max(max4, softmax_filter(float4(axis_in[i * s.inside_size]), i, s.flat_length));
    }
    float max1 = softmax_max4(max4);

    // get sum
    float4 sum4 = 0;
    for (int i = 0; i < s.axis_length; i++) {
        sum4 += softmax_filter(exp(float4(axis_in[i * s.inside_size] - max1)), i, s.flat_length);
    }
    float sum1 = softmax_sum4(sum4);

    // output
    for (int i = 0; i < s.axis_length; i++) {
        axis_out[i * s.inside_size] = ftype4(exp(float4(axis_in[i * s.inside_size]) - max1) / sum1);
    }
}

kernel void softmax_off_reorder(const device ftype4 *in    [[buffer(0)]],
                                device ftype4 *out         [[buffer(1)]],
                                constant softmax_shape& s  [[buffer(2)]],
                                uint2 gid                  [[thread_position_in_grid]]) {
    if ((int)gid.x >= s.inside_size || (int)gid.y >= s.outside_size) return;

    auto axis_off = gid.y * s.axis_length * s.inside_size + gid.x;
    auto axis_in  = in + axis_off;
    auto axis_out = out + axis_off;

    // get max
    auto max4 = axis_in[0];
    for (int i = 1; i < s.axis_length; i++) {
        max4 = max(max4, axis_in[i * s.inside_size]);
    }

    // get sum
    float4 sum4 = 0;
    for (int i = 0; i < s.axis_length; i++) {
        sum4 += exp(float4(axis_in[i * s.inside_size] - max4));
    }

    // output
    for (int i = 0; i < s.axis_length; i++) {
        axis_out[i * s.inside_size] = ftype4(exp(float4(axis_in[i * s.inside_size] - max4)) / sum4);
    }
}
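All three kernels above use the max-shifted form of softmax so exp() never overflows; a plain C++ sketch of the same per-slice computation (reference only, not part of the diff):

#include <algorithm>
#include <cmath>

// Softmax over axis_length elements strided by inside_size, for one
// (outside, inside) position, subtracting the max before exp() for stability.
static void softmaxSlice(const float* in, float* out, int axisLength, int insideSize) {
    float maxV = in[0];
    for (int i = 1; i < axisLength; ++i) maxV = std::max(maxV, in[i * insideSize]);
    float sum = 0.f;
    for (int i = 0; i < axisLength; ++i) sum += std::exp(in[i * insideSize] - maxV);
    for (int i = 0; i < axisLength; ++i) out[i * insideSize] = std::exp(in[i * insideSize] - maxV) / sum;
}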
@ -0,0 +1,110 @@
//
//  MetalSoftmax.mm
//  MNN
//
//  Created by MNN on 2019/01/30.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#import "backend/metal/MNNMetalContext.h"
#import "backend/metal/MetalSoftmax.hpp"
#import "core/Macro.h"
#import "backend/metal/MetalBackend.hpp"
#import "core/TensorUtils.hpp"

#if MNN_METAL_ENABLED
namespace MNN {

MetalSoftmax::MetalSoftmax(Backend *backend, int32_t axis) : Execution(backend), mAxis(axis) {
    // nothing to do
}

ErrorCode MetalSoftmax::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();
    auto input = inputs[0], output = outputs[0];
    auto dimensions = input->buffer().dimensions;
    auto realAxis = mAxis < 0 ? dimensions + mAxis : mAxis;
    auto c4 = TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4; // even dims != 4
    auto reorder = realAxis == 1 && c4;
    // shape
    auto inside = 1;
    auto flat = input->length(realAxis);
    auto axis = flat;
    auto outside = 1;

    for (int i = 0; i < realAxis; i++) {
        auto length = input->length(i);
        if (1 == i && c4) {
            length = UP_DIV(length, 4);
        }
        outside *= length;
    }
    for (int i = realAxis + 1; i < input->dimensions(); i++) {
        auto length = input->length(i);
        if (1 == i && c4) {
            length = UP_DIV(length, 4);
        }
        inside *= length;
    }
    if (reorder) {
        axis = UP_DIV(axis, 4);
    }
    mShapeBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
    ((int *)mShapeBuffer.contents)[0] = inside;
    ((int *)mShapeBuffer.contents)[1] = axis;
    ((int *)mShapeBuffer.contents)[2] = outside;
    ((int *)mShapeBuffer.contents)[3] = flat;

    // encode
    auto plane = !(TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4);
    auto kernel = plane ? @"softmax_plane" : reorder ? @"softmax_on_reorder" : @"softmax_off_reorder";
    mPipeline = [context pipelineWithName:kernel];

    mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)outside, 1)];
    return NO_ERROR;
}

ErrorCode MetalSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();

    if(backend->isCommandEncoderSet()) {
        return NO_ERROR;
    }

    auto func = [=](){
        auto input = inputs[0], output = outputs[0];

        auto encoder = backend->encoder();
        [encoder setComputePipelineState:mPipeline];
        [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
        [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
        [encoder setBuffer:mShapeBuffer offset:0 atIndex:2];

        [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
        MNN_PRINT_ENCODER(context, encoder);

        auto context = (__bridge MNNMetalContext *)backend->context();
        if(backend->isCmdBufferCommit()) {
            backend->flushEncoder();
            [context commit_net];
        }
    };
    func();
    backend->addOpEncoder(func);
    return NO_ERROR;
}

class MetalSoftmaxCreator : public MetalBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *> &outputs) const {
        auto softmax = op->main_as_Axis();
        return new MetalSoftmax(backend, softmax->axis());
    }
};
REGISTER_METAL_OP_CREATOR(MetalSoftmaxCreator, OpType_Softmax);
} // namespace MNN
#endif /* MNN_METAL_ENABLED */
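onResize above folds the tensor into an (outside, axis, inside) view around realAxis, rounding the channel dimension up to a multiple of 4 when the layout is NC4HW4. For a plain layout the split is just the product of the dimensions on either side of the axis; a small sketch, with a helper name of my own:

#include <vector>

// Split a shape into outside/axisLen/inside around `axis` (no C4 packing).
// e.g. shape {2, 3, 4, 5}, axis 2 -> outside 6, axisLen 4, inside 5.
static void splitAroundAxis(const std::vector<int>& shape, int axis,
                            int& outside, int& axisLen, int& inside) {
    outside = 1;
    inside  = 1;
    axisLen = shape[axis];
    for (int i = 0; i < axis; ++i) outside *= shape[i];
    for (int i = axis + 1; i < (int)shape.size(); ++i) inside *= shape[i];
}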
@ -42,6 +42,10 @@ struct Content {
    std::string uuid;
};

const char* getVersion() {
    return MNN_VERSION;
}

static void writeCacheFile(const Content *net, std::pair<const void*, size_t> buffer) {
    bool res = FileLoader::write(net->cacheFile.c_str(), buffer);
    if (!res) {

@ -554,6 +558,13 @@ ErrorCode Interpreter::updateSessionToModel(Session* session) {
    return session->updateToModel((Net*)mNet->net);
}

const char* Interpreter::getModelVersion() const {
    if (mNet && mNet->net && mNet->net->extraInfo() && mNet->net->extraInfo()->version()) {
        return mNet->net->extraInfo()->version()->c_str();
    }
    return "version info not found";
}

bool Interpreter::getSessionInfo(const Session* session, SessionInfoCode code, void* ptr) {
    std::unique_lock<std::mutex> _l(mNet->lock);
    if (nullptr == session || nullptr == ptr) {
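The two accessors added above expose the library version string (MNN_VERSION) and the converter version recorded in a model's extraInfo. A typical caller just logs both; sketch only, assuming the declarations live in MNN/Interpreter.hpp and `net` points to a live Interpreter:

#include <cstdio>
#include <MNN/Interpreter.hpp>

// Hypothetical logging helper, not part of the diff.
static void logVersions(const MNN::Interpreter* net) {
    std::printf("MNN library version: %s\n", MNN::getVersion());
    std::printf("model converter version: %s\n", net->getModelVersion());
}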
@ -360,6 +360,9 @@ std::vector<std::tuple<int, int, int>> OpCommonUtils::computeReduceDims(const st
    for (int i = 0; i < axises.size(); ++i) {
        if (axises[i] < 0) {
            axises[i] = inputs[0]->dimensions() + axises[i];
            if (axises[i] < 0) {
                return {std::make_tuple(1, totalSize, 1)};
            }
        }
    }
    // Cache for input's dims
@ -22,7 +22,7 @@ public:
        int outside = std::accumulate(shape.begin(), shape.begin() + axis, 1, [](int a, int b) { return a * b; });
        int inside = std::accumulate(shape.begin() + axis + 1, shape.end(), 1, [](int a, int b) { return a * b; });
        bool exclusive = op->main_as_CumSum()->exclusive(), reverse = op->main_as_CumSum()->reverse();

        auto outDes = TensorUtils::getDescribe(outputs[0]);
        outDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
        if (!exclusive) {

@ -38,7 +38,7 @@ public:
        if (shape[axis] == 1) {
            return true;
        }

        flatbuffers::FlatBufferBuilder builder;
        BinaryOpBuilder binaryOpParamBuilder(builder);
        binaryOpParamBuilder.add_opType(BinaryOpOperation_ADD);

@ -48,7 +48,7 @@ public:
        cmdOpBuilder.add_main(binaryOpParamOffset.Union());
        cmdOpBuilder.add_main_type(OpParameter_BinaryOp);
        auto cmdOpOffset = cmdOpBuilder.Finish();

        auto viewStride = builder.CreateVector(std::vector<int>{shape[axis] * inside, 1, 1});
        int step = inside, offset = inside;
        if (reverse) {

@ -65,13 +65,13 @@ public:
        view1.add_offset(offset - step);
        views[1] = view1.Finish();
        views[2] = views[exclusive ? 1 : 0];

        auto viewsOffset = builder.CreateVector<flatbuffers::Offset<View>>(views);
        auto sizeOffset = builder.CreateVector(std::vector<int>{outside, inside, 1});
        auto stepOffset = builder.CreateVector(std::vector<int>{step, step, step});
        auto iterIndexesOffset = builder.CreateVector(std::vector<int>{-1, -1, -1});
        auto indexesOffset = builder.CreateVector(std::vector<int>{2, 0, 1});

        RegionCommandBuilder cmdBuilder(builder);
        cmdBuilder.add_op(cmdOpOffset);
        cmdBuilder.add_view(viewsOffset);

@ -79,7 +79,7 @@ public:
        cmdBuilder.add_steps(stepOffset);
        cmdBuilder.add_iterIndexes(iterIndexesOffset);
        cmdBuilder.add_indexes(indexesOffset);

        std::vector<flatbuffers::Offset<RegionCommand>> regionCommands;
        regionCommands.emplace_back(cmdBuilder.Finish());
        auto rcmdAllOffset = builder.CreateVector<flatbuffers::Offset<RegionCommand>>(regionCommands);

@ -107,14 +107,111 @@
        builder.Finish(finishBuilder.Finish());
        auto cmd = GeometryComputerUtils::makeCommand(builder, {outputs[0], inputs[0]}, outputs);
        res.command.emplace_back(std::move(cmd));

        return true;
    }
};

class GeometryCumProd : public GeometryComputer {
public:
    virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                           Context& context, CommandBuffer& res) const override {
        auto shape = inputs[0]->shape();
        int axis = op->main_as_Axis()->axis();
        if (axis < 0) {
            axis += shape.size();
        }
        int outside = std::accumulate(shape.begin(), shape.begin() + axis, 1, [](int a, int b) { return a * b; });
        int inside = std::accumulate(shape.begin() + axis + 1, shape.end(), 1, [](int a, int b) { return a * b; });
        auto outDes = TensorUtils::getDescribe(outputs[0]);
        outDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
        outDes->regions.resize(1);
        auto& reg = outDes->regions[0];
        reg.origin = inputs[0];
        reg.src.offset = reg.dst.offset = 0;
        reg.src.stride[0] = reg.dst.stride[0] = inside * shape[axis];
        reg.size[0] = outside;
        reg.size[1] = inside;

        if (shape[axis] == 1) {
            return true;
        }

        flatbuffers::FlatBufferBuilder builder;
        BinaryOpBuilder binaryOpParamBuilder(builder);
        binaryOpParamBuilder.add_opType(BinaryOpOperation_MUL);
        auto binaryOpParamOffset = binaryOpParamBuilder.Finish();
        OpBuilder cmdOpBuilder(builder);
        cmdOpBuilder.add_type(OpType_BinaryOp);
        cmdOpBuilder.add_main(binaryOpParamOffset.Union());
        cmdOpBuilder.add_main_type(OpParameter_BinaryOp);
        auto cmdOpOffset = cmdOpBuilder.Finish();

        auto viewStride = builder.CreateVector(std::vector<int>{shape[axis] * inside, 1, 1});
        int step = inside, offset = inside;
        std::vector<flatbuffers::Offset<View>> views(3);
        ViewBuilder view0(builder);
        view0.add_stride(viewStride);
        view0.add_offset(offset);
        views[0] = view0.Finish();
        ViewBuilder view1(builder);
        view1.add_stride(viewStride);
        view1.add_offset(offset - step);
        views[1] = view1.Finish();
        views[2] = views[0];

        auto viewsOffset = builder.CreateVector<flatbuffers::Offset<View>>(views);
        auto sizeOffset = builder.CreateVector(std::vector<int>{outside, inside, 1});
        auto stepOffset = builder.CreateVector(std::vector<int>{step, step, step});
        auto iterIndexesOffset = builder.CreateVector(std::vector<int>{-1, -1, -1});
        auto indexesOffset = builder.CreateVector(std::vector<int>{2, 0, 1});

        RegionCommandBuilder cmdBuilder(builder);
        cmdBuilder.add_op(cmdOpOffset);
        cmdBuilder.add_view(viewsOffset);
        cmdBuilder.add_size(sizeOffset);
        cmdBuilder.add_steps(stepOffset);
        cmdBuilder.add_iterIndexes(iterIndexesOffset);
        cmdBuilder.add_indexes(indexesOffset);

        std::vector<flatbuffers::Offset<RegionCommand>> regionCommands;
        regionCommands.emplace_back(cmdBuilder.Finish());
        auto rcmdAllOffset = builder.CreateVector<flatbuffers::Offset<RegionCommand>>(regionCommands);
        auto inputIndexesOffset = builder.CreateVector(std::vector<int>{0, 1});
        auto outputIndexesOffset = builder.CreateVector(std::vector<int>{2});
        LoopParamBuilder loopBuilder(builder);
        loopBuilder.add_parallel(false); // cumprod(i) = cumprod(i-1) * x(i), so can't do outside parallel
        loopBuilder.add_commands(rcmdAllOffset);
        loopBuilder.add_loopNumber(shape[axis] - 1);
        loopBuilder.add_tensorNumber(3);
        loopBuilder.add_inputIndexes(inputIndexesOffset);
        loopBuilder.add_outputIndexes(outputIndexesOffset);
        auto loopOffset = loopBuilder.Finish();
        flatbuffers::Offset<flatbuffers::String> nameOffset;
        if (nullptr != op->name()) {
            nameOffset = builder.CreateString(op->name()->c_str());
        }
        OpBuilder finishBuilder(builder);
        finishBuilder.add_main(loopOffset.Union());
        finishBuilder.add_main_type(OpParameter_LoopParam);
        finishBuilder.add_type(OpType_While);
        if (nullptr != op->name()) {
            finishBuilder.add_name(nameOffset);
        }
        builder.Finish(finishBuilder.Finish());
        auto cmd = GeometryComputerUtils::makeCommand(builder, {outputs[0], inputs[0]}, outputs);
        res.command.emplace_back(std::move(cmd));

        return true;
    }
};

static void _create() {
    std::shared_ptr<GeometryComputer> comp(new GeometryCumSum);
    GeometryComputer::registerGeometryComputer(comp, {OpType_CumSum});
    std::shared_ptr<GeometryComputer> comp1(new GeometryCumProd);
    GeometryComputer::registerGeometryComputer(comp1, {OpType_CumProd});
}

REGISTER_GEOMETRY(GeometryCumSum, _create);
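The Loop op built above replays the binary command shape[axis] - 1 times because each step along the axis depends on the previous one (hence add_parallel(false)); the outside and inside positions remain independent. A plain C++ reference of the recurrence the geometry expresses, sketch only:

#include <vector>

// Reference cumulative product over the middle axis of an [outside, axisLen, inside] view.
// cum(0) = x(0); cum(i) = cum(i-1) * x(i), which is why the axis steps cannot
// run in parallel, while outside/inside positions can.
static void cumProdReference(const float* in, float* out, int outside, int axisLen, int inside) {
    for (int o = 0; o < outside; ++o) {
        for (int k = 0; k < inside; ++k) {
            const float* x = in  + o * axisLen * inside + k;
            float* y       = out + o * axisLen * inside + k;
            y[0] = x[0];
            for (int i = 1; i < axisLen; ++i) {
                y[i * inside] = y[(i - 1) * inside] * x[i * inside];
            }
        }
    }
}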
@ -161,8 +161,8 @@ public:
|
|||
enum MID_POSITION {
|
||||
P_constStride = 0,
|
||||
P_reshapeIndice = 1,
|
||||
P_reshapeIndiceFloat = 2,
|
||||
P_indiceFloat = 3,
|
||||
P_broadcastStride = 2,
|
||||
P_mulIndice = 3,
|
||||
P_indiceOneLine = 4,
|
||||
P_MAX
|
||||
};
|
||||
|
@ -236,18 +236,20 @@ public:
|
|||
auto paramSize = params->elementSize();
|
||||
auto constStride = cmd.extras[P_constStride];
|
||||
auto reshapeIndice = cmd.extras[P_reshapeIndice];
|
||||
auto reshapeIndiceFloat = cmd.extras[P_reshapeIndiceFloat];
|
||||
auto indiceFloat = cmd.extras[P_indiceFloat];
|
||||
auto broadcastStride = cmd.extras[P_broadcastStride];
|
||||
auto mulIndice = cmd.extras[P_mulIndice];
|
||||
auto indiceOneLine = cmd.extras[P_indiceOneLine];
|
||||
// Set length
|
||||
bool needAlloc = constStride->length(0) < indiceNd;
|
||||
constStride->setLength(0, indiceNd);
|
||||
reshapeIndice->setLength(0, mSliceN);
|
||||
reshapeIndice->setLength(1, indiceNd);
|
||||
reshapeIndiceFloat->setLength(0, mSliceN);
|
||||
reshapeIndiceFloat->setLength(1, indiceNd);
|
||||
indiceFloat->setLength(0, mSliceN);
|
||||
broadcastStride->setLength(0, mSliceN);
|
||||
broadcastStride->setLength(1, indiceNd);
|
||||
mulIndice->setLength(0, mSliceN);
|
||||
mulIndice->setLength(1, indiceNd);
|
||||
indiceOneLine->setLength(0, mSliceN);
|
||||
indiceOneLine->setLength(1, 1);
|
||||
|
||||
if (needAlloc) {
|
||||
if (!context.allocTensor(constStride.get())) {
|
||||
|
@ -256,9 +258,10 @@ public:
|
|||
}
|
||||
for (int i=0; i<indiceNd; ++i) {
|
||||
int dimCount = paramSize / params->length(i);
|
||||
constStride->host<float>()[i] = (float)dimCount;
|
||||
constStride->host<int>()[i] = dimCount;
|
||||
paramSize = dimCount;
|
||||
}
|
||||
// recompute reshape
|
||||
reshapeIndice->buffer().device = 0;
|
||||
reshapeIndice->buffer().host = 0;
|
||||
auto des = TensorUtils::getDescribe(reshapeIndice.get());
|
||||
|
@ -266,7 +269,21 @@ public:
|
|||
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||
des->backend = nullptr;
|
||||
des->regions = {GeometryComputerUtils::makeRawAddressRef(indice, 0, mSliceN * indiceNd)};
|
||||
|
||||
// recompute broadcast
|
||||
broadcastStride->buffer().device = 0;
|
||||
broadcastStride->buffer().host = 0;
|
||||
des = TensorUtils::getDescribe(broadcastStride.get());
|
||||
des->extra.offset = 0;
|
||||
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||
des->backend = nullptr;
|
||||
des->regions[0].origin = constStride.get();
|
||||
des->regions[0].size[0] = 1;
|
||||
des->regions[0].size[1] = mSliceN;
|
||||
des->regions[0].size[2] = indiceNd;
|
||||
des->regions[0].dst.stride[0] = indiceNd*mSliceN;
|
||||
des->regions[0].dst.stride[1] = indiceNd;
|
||||
des->regions[0].dst.stride[2] = 1;
|
||||
// recompute loop
|
||||
auto loopCmd = cmd.command[cmd.command.size() - 1];
|
||||
auto param = loopCmd->op->main_as_LoopParam();
|
||||
// Reset parameters for last command
|
||||
|
@ -304,14 +321,14 @@ public:
|
|||
}
|
||||
auto paramSize = params->elementSize();
|
||||
std::array<std::shared_ptr<Tensor>, 5> midTensors;
|
||||
std::shared_ptr<Tensor> constStride(Tensor::createDevice<float>({indiceNd, 1}));
|
||||
std::shared_ptr<Tensor> constStride(Tensor::createDevice<int>({indiceNd}));
|
||||
if (!context.allocTensor(constStride.get())) {
|
||||
return false;
|
||||
}
|
||||
midTensors[P_constStride] = constStride;
|
||||
for (int i=0; i<indiceNd; ++i) {
|
||||
int dimCount = paramSize / params->length(i);
|
||||
constStride->host<float>()[i] = (float)dimCount;
|
||||
constStride->host<int>()[i] = dimCount;
|
||||
paramSize = dimCount;
|
||||
}
|
||||
std::shared_ptr<Tensor> reshapeIndice(Tensor::createDevice<int>({mSliceN, indiceNd}));
|
||||
|
@ -321,41 +338,36 @@ public:
|
|||
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||
des->regions = {GeometryComputerUtils::makeRawAddressRef(indice, 0, mSliceN * indiceNd)};
|
||||
}
|
||||
std::shared_ptr<Tensor> reshapeIndiceFloat(Tensor::createDevice<float>({mSliceN, indiceNd}));
|
||||
midTensors[P_reshapeIndiceFloat] = reshapeIndiceFloat;
|
||||
std::shared_ptr<Tensor> broadcastStride(Tensor::createDevice<int>({mSliceN, indiceNd}));
|
||||
midTensors[P_broadcastStride] = broadcastStride;
|
||||
{
|
||||
flatbuffers::FlatBufferBuilder builder;
|
||||
CastParamBuilder builder_(builder);
|
||||
builder_.add_dstT(DataType_DT_FLOAT);
|
||||
auto mainOffset = builder_.Finish().Union();
|
||||
OpBuilder opB(builder);
|
||||
opB.add_type(OpType_Cast);
|
||||
opB.add_main(mainOffset);
|
||||
opB.add_main_type(OpParameter_CastParam);
|
||||
builder.Finish(opB.Finish());
|
||||
auto cmd = GeometryComputerUtils::makeCommand(builder, {reshapeIndice.get()}, {reshapeIndiceFloat.get()});
|
||||
res.command.emplace_back(std::move(cmd));
|
||||
// [D] => [N, D]
|
||||
auto des = TensorUtils::getDescribe(broadcastStride.get());
|
||||
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||
des->regions.resize(1);
|
||||
des->regions[0].origin = constStride.get();
|
||||
des->regions[0].size[0] = 1;
|
||||
des->regions[0].size[1] = mSliceN;
|
||||
des->regions[0].size[2] = indiceNd;
|
||||
des->regions[0].dst.stride[0] = indiceNd*mSliceN;
|
||||
des->regions[0].dst.stride[1] = indiceNd;
|
||||
des->regions[0].dst.stride[2] = 1;
|
||||
des->regions[0].src.stride[0] = 0;
|
||||
des->regions[0].src.stride[1] = 0;
|
||||
des->regions[0].src.stride[2] = 1;
|
||||
}
|
||||
std::shared_ptr<Tensor> indiceFloat(Tensor::createDevice<float>({mSliceN, 1}));
|
||||
midTensors[P_indiceFloat] = indiceFloat;
|
||||
std::shared_ptr<Tensor> mulIndice(Tensor::createDevice<int>({mSliceN, indiceNd}));
|
||||
midTensors[P_mulIndice] = mulIndice;
|
||||
{
|
||||
// MatMul
|
||||
auto cmd = GeometryComputerUtils::makeMatMul(reshapeIndiceFloat.get(), constStride.get(), indiceFloat.get());
|
||||
// [N, D] * [N, D] => [N, D]
|
||||
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_MUL, reshapeIndice.get(), broadcastStride.get(), mulIndice.get());
|
||||
res.command.emplace_back(std::move(cmd));
|
||||
}
|
||||
std::shared_ptr<Tensor> indiceOneLine(Tensor::createDevice<int>({mSliceN, 1}));
|
||||
midTensors[P_indiceOneLine] = indiceOneLine;
|
||||
{
|
||||
flatbuffers::FlatBufferBuilder builder;
|
||||
CastParamBuilder builder_(builder);
|
||||
builder_.add_dstT(DataType_DT_INT32);
|
||||
auto mainOffset = builder_.Finish().Union();
|
||||
OpBuilder opB(builder);
|
||||
opB.add_type(OpType_Cast);
|
||||
opB.add_main(mainOffset);
|
||||
opB.add_main_type(OpParameter_CastParam);
|
||||
builder.Finish(opB.Finish());
|
||||
auto cmd = GeometryComputerUtils::makeCommand(builder, {indiceFloat.get()}, {indiceOneLine.get()});
|
||||
// [N, D] => [N, 1]
|
||||
auto cmd = GeometryComputerUtils::makeReduce(ReductionType_SUM, mulIndice.get(), indiceOneLine.get());
|
||||
res.command.emplace_back(std::move(cmd));
|
||||
}
|
||||
|
||||
|
|
|
@ -24,6 +24,14 @@ public:
            MNN_ERROR("params->dimensions() < 1 || indices->dimensions() < 1\n");
            return false;
        }
        if (indices->elementSize() == 0) {
            outputs[0]->buffer().type = params->buffer().type;
            TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
            outputs[0]->buffer().dimensions = 2;
            outputs[0]->setLength(0, 0);
            outputs[0]->setLength(1, params->shape().back());
            return true;
        }
        auto indiceNd = indices->length(indices->dimensions()-1);
        if (indiceNd > params->dimensions()) {
            MNN_ERROR("indiceNd > params->dimensions()\n");
@ -17,9 +17,11 @@ class GridSampleSizeComputer : public SizeComputer {
        // inputs[0] is input, inputs[1] is grid
        MNN_ASSERT(2 == inputs.size());
        MNN_ASSERT(1 == outputs.size());
        MNN_ASSERT(4 == inputs[0]->buffer().dimensions && 4 == inputs[1]->buffer().dimensions);
        int input_dim = inputs[0]->buffer().dimensions;
        int grid_dim = inputs[1]->buffer().dimensions;
        MNN_ASSERT((4 == input_dim && 4 == grid_dim) || (5 == input_dim && 5 == grid_dim));
        MNN_ASSERT(inputs[0]->buffer().dim[0].extent == inputs[1]->buffer().dim[0].extent);
        MNN_ASSERT(2 == inputs[1]->buffer().dim[3].extent);
        MNN_ASSERT(grid_dim - 2 == inputs[1]->buffer().dim[grid_dim - 1].extent);

        auto &ibInput0 = inputs[0]->buffer();
        auto &ibInput1 = inputs[1]->buffer();

@ -30,6 +32,9 @@ class GridSampleSizeComputer : public SizeComputer {
        ob.dim[1].extent = ibInput0.dim[1].extent;
        ob.dim[2].extent = ibInput1.dim[1].extent;
        ob.dim[3].extent = ibInput1.dim[2].extent;
        if (grid_dim == 5) {
            ob.dim[4].extent = ibInput1.dim[3].extent;
        }

        ob.type = ibInput0.type;
        TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(
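With the 5-D branch above, the output keeps the input's batch and channel extents and takes its spatial extents from the grid. A quick shape sketch (my own helper, illustrative only):

#include <vector>

// Output shape rule implied above: [N, C_input, grid_d1, grid_d2(, grid_d3)].
// e.g. input {1, 3, 32, 32},    grid {1, 16, 16, 2}     -> output {1, 3, 16, 16}
//      input {1, 3, 8, 32, 32}, grid {1, 4, 16, 16, 3}  -> output {1, 3, 4, 16, 16}
static std::vector<int> gridSampleOutShape(const std::vector<int>& input, const std::vector<int>& grid) {
    std::vector<int> out(input.size());
    out[0] = input[0];
    out[1] = input[1];
    for (size_t i = 2; i < out.size(); ++i) {
        out[i] = grid[i - 1];
    }
    return out;
}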
@ -21,21 +21,43 @@ public:
            return false;
        }
        auto axis = flatten->axis();
        auto endAxis = flatten->endAxis();
        auto dim = inputs[0]->dimensions();
        if (axis < 0) {
            axis += dim;
        }
        if (endAxis < 0) {
            endAxis += dim;
        }
        int inside = 1;
        int middle = 1;
        int outside = 1;
        for (int i=0; i<axis; ++i) {
            outside *= inputs[0]->length(i);
        if (endAxis == 0) {
            for (int i=0; i<axis; ++i) {
                outside *= inputs[0]->length(i);
            }
            for (int i=axis; i<dim; ++i) {
                inside *= inputs[0]->length(i);
            }
            outputs[0]->buffer().dimensions = 2;
            outputs[0]->setLength(0, outside);
            outputs[0]->setLength(1, inside);
        } else {
            // [ 0 - axis, 1, endAxis - lastDim]
            outputs[0]->buffer().dimensions = dim - endAxis + axis;
            for (int i = 0; i < axis; ++i) {
                outputs[0]->setLength(i, inputs[0]->length(i));
            }
            for (int i = axis; i <= endAxis; ++i) {
                outside *= inputs[0]->length(i);
            }
            outputs[0]->setLength(axis, outside);
            if (dim > endAxis + 1) {
                for (int i = endAxis + 1; i < dim; ++i) {
                    outputs[0]->setLength(i, inputs[0]->length(i));
                }
            }
        }
        for (int i=axis; i<dim; ++i) {
            inside *= inputs[0]->length(i);
        }
        outputs[0]->buffer().dimensions = 2;
        outputs[0]->setLength(0, outside);
        outputs[0]->setLength(1, inside);
        outputs[0]->buffer().type = inputs[0]->getType();
        TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
        return true;
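The endAxis branch above collapses dims [axis, endAxis] into a single length and keeps the dimensions on both sides; a tiny worked sketch of the resulting shapes (illustrative helper name, not from the diff):

#include <numeric>
#include <vector>

// Flatten dims [axis, endAxis] of `shape` into one dimension, matching the
// endAxis != 0 branch above.
// e.g. {2, 3, 4, 5} with axis=1, endAxis=2 -> {2, 12, 5};
//      the endAxis == 0 branch instead collapses to 2-D: axis=1 -> {2, 60}.
static std::vector<int> flattenShape(const std::vector<int>& shape, int axis, int endAxis) {
    std::vector<int> out(shape.begin(), shape.begin() + axis);
    int merged = std::accumulate(shape.begin() + axis, shape.begin() + endAxis + 1, 1,
                                 [](int a, int b) { return a * b; });
    out.push_back(merged);
    out.insert(out.end(), shape.begin() + endAxis + 1, shape.end());
    return out;
}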
@ -22,7 +22,7 @@ class TopKV2SizeComputer : public SizeComputer {
        MNN_ASSERT(kTensor->getType().code == halide_type_int);
        const int k = kTensor->host<int32_t>()[0];
        const int inputDimension = input->buffer().dimensions;
        const int axis = (inputs.size() == 3 ? inputs[2]->host<int32_t>()[0] : inputDimension - 1);
        int axis = (inputs.size() == 3 ? inputs[2]->host<int32_t>()[0] : inputDimension - 1);
        if (axis < 0) axis += input->dimensions();
        // outputs: 0 --> data, 1 --> index

        auto outputData = outputs[0];
test.sh
@ -293,7 +293,18 @@ pymnn_test() {
        echo '### PYMNN模型测试失败,测试终止!'
        failed
    fi
    # 4. uninstall pymnn
    # 4. train test
    ./train_test.sh
    # 5. quant test
    python3 ../examples/MNNQuant/test_mnn_offline_quant.py \
        --mnn_model ~/AliNNModel/TestQuant/mobilenet_v2_tfpb_train_withBN.mnn \
        --quant_imgs ~/AliNNModel/TestQuant/quant_imgs \
        --quant_model ./quant_model.mnn
    rm ./quant_model.mnn
    quant_wrong=$[$? > 0]
    printf "TEST_NAME_QUANT_TEST: pymnn量化测试\nTEST_CASE_AMOUNT_QUANT_TEST: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n" \
        $quant_wrong $[1 - $quant_wrong]
    # 6. uninstall pymnn
    pip uninstall --yes MNN-Internal
    popd
    popd
@ -0,0 +1,83 @@
//
//  CumTest.cpp
//  MNNTests
//
//  Created by MNN on 2022/05/10.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include "MNNTestSuite.h"
#include "TestUtils.h"

using namespace MNN::Express;
class CumProdTest : public MNNTestCase {
public:
    virtual ~CumProdTest() = default;
    virtual bool run(int precision) {
        auto input = _Input({2, 2, 2}, NCHW);
        input->setName("input_tensor");
        const float inpudata[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
        auto inputPtr = input->writeMap<float>();
        memcpy(inputPtr, inpudata, 8 * sizeof(float));
        auto output0 = _CumProd(input, 0);
        const std::vector<float> expectedOutput0 = {1., 2., 3., 4., 5., 12., 21., 32.};
        auto gotOutput0 = output0->readMap<float>();
        if (!checkVector<float>(gotOutput0, expectedOutput0.data(), 8, 0.01)) {
            MNN_ERROR("CumProdTest axis=0 test failed!\n");
            return false;
        }
        auto output1 = _CumProd(input, 1);
        const std::vector<float> expectedOutput1 = {1., 2., 3., 8., 5., 6., 35., 48.};
        auto gotOutput1 = output1->readMap<float>();
        if (!checkVector<float>(gotOutput1, expectedOutput1.data(), 8, 0.01)) {
            MNN_ERROR("CumProdTest axis=1 test failed!\n");
            return false;
        }
        auto output2 = _CumProd(input, 2);
        const std::vector<float> expectedOutput2 = {1., 2., 3., 12., 5., 30., 7., 56.};
        auto gotOutput2 = output2->readMap<float>();
        if (!checkVector<float>(gotOutput2, expectedOutput2.data(), 8, 0.01)) {
            MNN_ERROR("CumProdTest axis=2 test failed!\n");
            return false;
        }
        return true;
    }
};
MNNTestSuiteRegister(CumProdTest, "op/cumprod");

class CumSumTest : public MNNTestCase {
public:
    virtual ~CumSumTest() = default;
    virtual bool run(int precision) {
        auto input = _Input({2, 2, 2}, NCHW);
        input->setName("input_tensor");
        const float inpudata[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
        auto inputPtr = input->writeMap<float>();
        memcpy(inputPtr, inpudata, 8 * sizeof(float));
        auto output0 = _CumSum(input, 0);
        const std::vector<float> expectedOutput0 = {1., 2., 3., 4., 6., 8., 10., 12.};
        auto gotOutput0 = output0->readMap<float>();
        if (!checkVector<float>(gotOutput0, expectedOutput0.data(), 8, 0.01)) {
            MNN_ERROR("CumSumTest axis=0 test failed!\n");
            return false;
        }
        auto output1 = _CumSum(input, 1);
        const std::vector<float> expectedOutput1 = {1., 2., 4., 6., 5., 6., 12., 14.};
        auto gotOutput1 = output1->readMap<float>();
        if (!checkVector<float>(gotOutput1, expectedOutput1.data(), 8, 0.01)) {
            MNN_ERROR("CumSumTest axis=1 test failed!\n");
            return false;
        }
        auto output2 = _CumSum(input, 2);
        const std::vector<float> expectedOutput2 = {1., 3., 3., 7., 5., 11., 7., 15.};
        auto gotOutput2 = output2->readMap<float>();
        if (!checkVector<float>(gotOutput2, expectedOutput2.data(), 8, 0.01)) {
            MNN_ERROR("CumSumTest axis=2 test failed!\n");
            return false;
        }
        return true;
    }
};
MNNTestSuiteRegister(CumSumTest, "op/cumsum");
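The expected vectors above follow directly from viewing the {2, 2, 2} input as [outside, axisLen, inside]. For axis 0 that is outside 1, axisLen 2, inside 4, so cumprod keeps the first slice {1, 2, 3, 4} and multiplies element-wise into the second: {1*5, 2*6, 3*7, 4*8} = {5, 12, 21, 32}, which is exactly expectedOutput0. Feeding the same layout to the cumProdReference sketch given earlier reproduces it (sketch usage, not part of the test):

float in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
float out[8];
cumProdReference(in, out, /*outside*/ 1, /*axisLen*/ 2, /*inside*/ 4); // -> {1, 2, 3, 4, 5, 12, 21, 32}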
@ -36,8 +36,6 @@ IF(MNN_BUILD_CONVERTER)
  endif()

  file(GLOB COMMON_SRC ${CMAKE_CURRENT_LIST_DIR}/source/common/*.cpp)
  add_executable(MNNDump2Json ${CMAKE_CURRENT_LIST_DIR}/source/MNNDump2Json.cpp)
  add_executable(MNNRevert2Buffer ${CMAKE_CURRENT_LIST_DIR}/source/MNNRevert2Buffer.cpp)
  add_executable(MNNConvert
    ${CMAKE_CURRENT_LIST_DIR}/source/MNNConverter.cpp
  )

@ -74,6 +72,10 @@ IF(MNN_BUILD_CONVERTER)
  if (MNN_BUILD_TORCH)
    target_link_libraries(MNNConvertDeps PUBLIC ${TORCH_LIBRARIES})
  endif()
  add_executable(MNNRevert2Buffer ${CMAKE_CURRENT_LIST_DIR}/source/MNNRevert2Buffer.cpp)
  target_link_libraries(MNNRevert2Buffer MNNConvertDeps)
  add_executable(MNNDump2Json ${CMAKE_CURRENT_LIST_DIR}/source/MNNDump2Json.cpp)
  target_link_libraries(MNNDump2Json MNNConvertDeps)
  add_executable(TestConvertResult ${CMAKE_CURRENT_LIST_DIR}/source/TestConvertResult.cpp)
  target_link_libraries(TestConvertResult MNNConvertDeps)
  add_executable(TestPassManager ${CMAKE_CURRENT_LIST_DIR}/source/TestPassManager.cpp)
@ -17,6 +17,8 @@ public:
    static bool initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv);
    static bool convertModel(modelConfig& modelPath);
    static int testconvert(const std::string& defaultCacheFile, const std::string& directName, float maxErrorRate);
    static bool mnn2json(const char* modelFile, const char* jsonFile, int flag = 3);
    static bool json2mnn(const char* jsonFile, const char* modelFile);
};
};
@ -22,7 +22,7 @@ public:
        benchmarkModel(false),
        saveHalfFloat(false){
    }
    enum MODEL_SOURCE { TENSORFLOW = 0, CAFFE, ONNX, MNN, TFLITE, TORCH, MAX_SOURCE };
    enum MODEL_SOURCE { TENSORFLOW = 0, CAFFE, ONNX, MNN, TFLITE, TORCH, JSON, MAX_SOURCE };

    // MNN model path
    std::string MNNModel;

@ -55,6 +55,7 @@ public:
    std::string authCode = "";
    std::string testDir = "";
    float testThredhold = 0.01;
    bool mnn2json = false;
};

#endif // CONFIG_HPP
@ -7,113 +7,13 @@
|
|||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fstream>
|
||||
#include "MNN_generated.h"
|
||||
#include "flatbuffers/idl.h"
|
||||
#include "flatbuffers/minireflect.h"
|
||||
#include "flatbuffers/util.h"
|
||||
#include <string.h>
|
||||
#include "cli.hpp"
|
||||
|
||||
int main(int argc, const char** argv) {
|
||||
if (argc <= 2) {
|
||||
printf("Usage: ./MNNDump2Json.out XXX.MNN XXX.json\n");
|
||||
return 0;
|
||||
}
|
||||
std::ifstream inputFile(argv[1], std::ios::binary);
|
||||
inputFile.seekg(0, std::ios::end);
|
||||
auto size = inputFile.tellg();
|
||||
inputFile.seekg(0, std::ios::beg);
|
||||
|
||||
char* buffer = new char[size];
|
||||
|
||||
inputFile.read((char*)buffer, size);
|
||||
std::ofstream output(argv[2]);
|
||||
|
||||
if (argc > 3) {
|
||||
printf("Dont't add convweight\n");
|
||||
auto netT = MNN::UnPackNet((void*)buffer);
|
||||
auto treatFunction = [&](MNN::OpT* opParam) {
|
||||
auto type = opParam->main.type;
|
||||
if (type == MNN::OpParameter::OpParameter_Convolution2D) {
|
||||
auto param = opParam->main.AsConvolution2D();
|
||||
param->weight.clear();
|
||||
param->bias.clear();
|
||||
if (param->symmetricQuan) {
|
||||
param->symmetricQuan->weight.clear();
|
||||
}
|
||||
if (param->quanParameter) {
|
||||
param->quanParameter->buffer.clear();
|
||||
}
|
||||
} else if (type == MNN::OpParameter::OpParameter_Blob) {
|
||||
auto blobT = opParam->main.AsBlob();
|
||||
blobT->float32s.clear();
|
||||
blobT->int8s.clear();
|
||||
blobT->uint8s.clear();
|
||||
blobT->int32s.clear();
|
||||
blobT->int64s.clear();
|
||||
} else if (type == MNN::OpParameter::OpParameter_Convolution2D) {
|
||||
opParam->main.AsConvolution2D()->weight.clear();
|
||||
opParam->main.AsConvolution2D()->bias.clear();
|
||||
} else if (type == MNN::OpParameter::OpParameter_MatMul) {
|
||||
opParam->main.AsMatMul()->weight.clear();
|
||||
opParam->main.AsMatMul()->bias.clear();
|
||||
} else if (type == MNN::OpParameter::OpParameter_PRelu) {
|
||||
opParam->main.AsPRelu()->slope.clear();
|
||||
} else if (type == MNN::OpParameter::OpParameter_Extra) {
|
||||
auto extra = opParam->main.AsExtra();
|
||||
extra->info.clear();
|
||||
} else if(type == MNN::OpParameter::OpParameter_LSTM){
|
||||
auto param = opParam->main.AsLSTM();
|
||||
if (param->weightH) {
|
||||
param->weightH->float32s.clear();
|
||||
}
|
||||
if (param->weightI) {
|
||||
param->weightI->float32s.clear();
|
||||
}
|
||||
if (param->bias) {
|
||||
param->bias->float32s.clear();
|
||||
}
|
||||
}
|
||||
};
|
||||
for (int i = 0; i < netT->oplists.size(); ++i) {
|
||||
treatFunction(netT->oplists[i].get());
|
||||
}
|
||||
for (int i = 0; i < netT->subgraphs.size(); ++i) {
|
||||
for (int j=0; j<netT->subgraphs[i]->nodes.size(); ++j) {
|
||||
treatFunction(netT->subgraphs[i]->nodes[j].get());
|
||||
}
|
||||
}
|
||||
if (argc > 4) {
|
||||
printf("Separate dump subgraph\n");
|
||||
for (int i=0; i<netT->subgraphs.size(); ++i) {
|
||||
auto& g = netT->subgraphs[i];
|
||||
flatbuffers::FlatBufferBuilder newBuilder(1024);
|
||||
auto root = MNN::SubGraphProto::Pack(newBuilder, g.get());
|
||||
newBuilder.Finish(root);
|
||||
auto content = newBuilder.GetBufferPointer();
|
||||
char subGraphNameStr[128];
|
||||
sprintf(subGraphNameStr, "%s_%d", argv[2], i);
|
||||
printf("Dump subgraph %s to %s\n", g->name.c_str(), subGraphNameStr);
|
||||
std::ofstream tempOutput(subGraphNameStr);
|
||||
auto s = flatbuffers::FlatBufferToString((const uint8_t*)content, MNN::SubGraphProtoTypeTable());
|
||||
tempOutput << s;
|
||||
}
|
||||
netT->subgraphs.clear();
|
||||
}
|
||||
flatbuffers::FlatBufferBuilder newBuilder(1024);
|
||||
auto root = MNN::Net::Pack(newBuilder, netT.get());
|
||||
MNN::FinishNetBuffer(newBuilder, root);
|
||||
{
|
||||
auto content = newBuilder.GetBufferPointer();
|
||||
auto s = flatbuffers::FlatBufferToString((const uint8_t*)content, MNN::NetTypeTable());
|
||||
output << s;
|
||||
}
|
||||
} else {
|
||||
auto s = flatbuffers::FlatBufferToString((const uint8_t*)buffer, MNN::NetTypeTable());
|
||||
output << s;
|
||||
}
|
||||
|
||||
delete[] buffer;
|
||||
MNN::Cli::mnn2json(argv[1], argv[2], argc);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -7,253 +7,13 @@
|
|||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include "MNN_generated.h"
|
||||
#include "flatbuffers/idl.h"
|
||||
#include "flatbuffers/minireflect.h"
|
||||
#include "flatbuffers/util.h"
|
||||
#include <string.h>
|
||||
#include <MNN/MNNDefine.h>
|
||||
#include "rapidjson/document.h"
|
||||
#include "rapidjson/stringbuffer.h"
|
||||
#include "rapidjson/prettywriter.h"
|
||||
|
||||
#define VECTOR_EXTRACT(FLATBUFFER_TYPE, CPP_TYPE, JSON_TYPE)\
|
||||
case flatbuffers::ET_##FLATBUFFER_TYPE:\
|
||||
{\
|
||||
std::vector<CPP_TYPE> data(array.Size());\
|
||||
for (int i=0; i<array.Size(); ++i) {\
|
||||
data[i] = array[i].JSON_TYPE();\
|
||||
}\
|
||||
indexes[pos].second = builder.CreateVector(data).Union();\
|
||||
break;\
|
||||
}\
|
||||
|
||||
#define SCALAR_EXTRACT(FLATBUFFER_TYPE, CPP_TYPE, JSON_TYPE)\
|
||||
case flatbuffers::ET_##FLATBUFFER_TYPE:\
|
||||
{\
|
||||
builder.AddElement(field, (CPP_TYPE)(iter->value.JSON_TYPE()), (CPP_TYPE)0);\
|
||||
break;\
|
||||
}\
|
||||
|
||||
static flatbuffers::Offset<void> _writeJsonToFlatbuffer(const flatbuffers::TypeTable * table, flatbuffers::FlatBufferBuilder& builder, const rapidjson::GenericObject<false, rapidjson::GenericValue<rapidjson::UTF8<>>>& object) {
|
||||
std::vector<std::pair<int, flatbuffers::Offset<void>>> indexes;
|
||||
// Load union type for easy to use
|
||||
std::map<std::string, int> unionNames;
|
||||
for (int i=0; i<table->num_elems; ++i) {
|
||||
if (table->type_codes[i].sequence_ref == -1) {
|
||||
continue;
|
||||
}
|
||||
const flatbuffers::TypeTable *ref = table->type_refs[table->type_codes[i].sequence_ref]();
|
||||
if (ref->st == flatbuffers::ST_UNION) {
|
||||
unionNames.insert(std::make_pair(std::string(table->names[i]) + "_type", i));
|
||||
}
|
||||
}
|
||||
// Find index and cache
|
||||
std::map<int, int> unionTypes;
|
||||
for (auto iter = object.begin(); iter !=object.end(); iter++) {
|
||||
auto name = iter->name.GetString();
|
||||
int index = -1;
|
||||
for (int i=0; i<table->num_elems; ++i) {
|
||||
if (0 == ::strcmp(table->names[i], name)) {
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto uiter = unionNames.find(name);
|
||||
if (uiter != unionNames.end()) {
|
||||
// Find union type id
|
||||
auto value = iter->value.GetString();
|
||||
int typePos = -1;
|
||||
auto unionIndex = uiter->second;
|
||||
auto ref = table->type_refs[table->type_codes[unionIndex].sequence_ref]();
|
||||
for (int j=0; j<ref->num_elems; ++j) {
|
||||
if (0 == ::strcmp(ref->names[j], value)) {
|
||||
typePos = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (-1 == typePos) {
|
||||
MNN_ERROR("Can't find union type\n");
|
||||
continue;
|
||||
}
|
||||
if (typePos > 0) {
|
||||
// First is None
|
||||
unionTypes.insert(std::make_pair(unionIndex, typePos-1));
|
||||
}
|
||||
}
|
||||
if (index == -1) {
|
||||
MNN_PRINT("Invalid: %s, Skip it\n", name);
|
||||
}
|
||||
indexes.emplace_back(std::make_pair(index, 0));
|
||||
}
|
||||
|
||||
// resolve single object
|
||||
int pos = 0;
|
||||
for (auto iter = object.begin(); iter !=object.end(); iter++, pos++) {
|
||||
int index = indexes[pos].first;
|
||||
if (-1 == index) {
|
||||
continue;
|
||||
}
|
||||
auto code = table->type_codes[index];
|
||||
if (code.is_vector) {
|
||||
continue;
|
||||
}
|
||||
if (code.sequence_ref != -1 && code.base_type == flatbuffers::ET_SEQUENCE) {
|
||||
const flatbuffers::TypeTable *ref = table->type_refs[code.sequence_ref]();
|
||||
if (ref->st == flatbuffers::ST_TABLE) {
|
||||
indexes[pos].second = _writeJsonToFlatbuffer(ref, builder, iter->value.GetObject());
|
||||
} else if (ref->st == flatbuffers::ST_UNION) {
|
||||
auto unionInd = unionTypes.find(index)->second;
|
||||
ref = ref->type_refs[unionInd]();
|
||||
indexes[pos].second = _writeJsonToFlatbuffer(ref, builder, iter->value.GetObject());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve Vector and String
|
||||
pos = 0;
|
||||
for (auto iter = object.begin(); iter !=object.end(); iter++, pos++) {
|
||||
int index = indexes[pos].first;
|
||||
if (-1 == index) {
|
||||
continue;
|
||||
}
|
||||
auto code = table->type_codes[index];
|
||||
if (!code.is_vector) {
|
||||
if (code.base_type == flatbuffers::ET_STRING) {
|
||||
indexes[pos].second = builder.CreateString(iter->value.GetString()).Union();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
auto array = iter->value.GetArray();
|
||||
if (code.sequence_ref != -1) {
|
||||
const flatbuffers::TypeTable *ref = table->type_refs[code.sequence_ref]();
|
||||
std::vector<flatbuffers::Offset<void>> offsets(array.Size());
|
||||
for (int i=0; i<array.Size(); ++i) {
|
||||
offsets[i] = _writeJsonToFlatbuffer(ref, builder, array[i].GetObject());
|
||||
}
|
||||
indexes[pos].second = builder.CreateVector(offsets.data(), offsets.size()).Union();
|
||||
continue;
|
||||
}
|
||||
switch (code.base_type) {
|
||||
VECTOR_EXTRACT(BOOL, bool, GetBool);
|
||||
VECTOR_EXTRACT(CHAR, char, GetInt);
|
||||
VECTOR_EXTRACT(UCHAR, uint8_t, GetInt);
|
||||
VECTOR_EXTRACT(SHORT, int16_t, GetInt);
|
||||
VECTOR_EXTRACT(USHORT, uint16_t, GetInt);
|
||||
VECTOR_EXTRACT(INT, int, GetInt);
|
||||
VECTOR_EXTRACT(UINT, uint32_t, GetUint);
|
||||
VECTOR_EXTRACT(LONG, int64_t, GetInt64);
|
||||
VECTOR_EXTRACT(ULONG, uint64_t, GetUint64);
|
||||
VECTOR_EXTRACT(FLOAT, float, GetFloat);
|
||||
VECTOR_EXTRACT(DOUBLE, double, GetDouble);
|
||||
case flatbuffers::ET_STRING:
|
||||
{
|
||||
std::vector<std::string> data(array.Size());
|
||||
for (int i=0; i<array.Size(); ++i) {
|
||||
data[i] = array[i].GetString();
|
||||
}
|
||||
indexes[pos].second = builder.CreateVectorOfStrings(data).Union();
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve Others
|
||||
pos = 0;
|
||||
auto start = builder.StartTable();
|
||||
for (auto iter = object.begin(); iter !=object.end(); iter++, pos++) {
|
||||
int index = indexes[pos].first;
|
||||
if (-1 == index) {
|
||||
continue;
|
||||
}
|
||||
auto field = 4 + index * 2;
|
||||
if (indexes[pos].second.o != 0) {
|
||||
builder.AddOffset(field, indexes[pos].second);
|
||||
continue;
|
||||
}
|
||||
auto code = table->type_codes[index];
|
||||
if (code.sequence_ref != -1) {
|
||||
const flatbuffers::TypeTable *ref = table->type_refs[code.sequence_ref]();
|
||||
int value = -1;
|
||||
if (ref->st == flatbuffers::ST_UNION || ref->st == flatbuffers::ST_ENUM) {
|
||||
auto type = iter->value.GetString();
|
||||
for (int i=0; i<ref->num_elems; ++i) {
|
||||
if (0 == ::strcmp(type, ref->names[i])) {
|
||||
if (nullptr == ref->values) {
|
||||
value = i;
|
||||
} else {
|
||||
value = ref->values[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
switch (code.base_type) {
|
||||
case flatbuffers::ET_UTYPE:
|
||||
case flatbuffers::ET_UINT:
|
||||
builder.AddElement(field, (uint32_t)value, (uint32_t)0);
|
||||
break;
|
||||
case flatbuffers::ET_INT:
|
||||
builder.AddElement(field, (int32_t)value, (int32_t)-1);
|
||||
break;
|
||||
case flatbuffers::ET_UCHAR:
|
||||
builder.AddElement(field, (uint8_t)value, (uint8_t)0);
|
||||
break;
|
||||
case flatbuffers::ET_CHAR:
|
||||
builder.AddElement(field, (int8_t)value, (int8_t)0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
switch (code.base_type) {
|
||||
SCALAR_EXTRACT(BOOL, bool, GetBool);
|
||||
SCALAR_EXTRACT(CHAR, char, GetInt);
|
||||
SCALAR_EXTRACT(UCHAR, uint8_t, GetInt);
|
||||
SCALAR_EXTRACT(SHORT, int16_t, GetInt);
|
||||
SCALAR_EXTRACT(USHORT, uint16_t, GetInt);
|
||||
SCALAR_EXTRACT(INT, int, GetInt);
|
||||
SCALAR_EXTRACT(UINT, uint32_t, GetUint);
|
||||
SCALAR_EXTRACT(LONG, int64_t, GetInt64);
|
||||
SCALAR_EXTRACT(ULONG, uint64_t, GetUint64);
|
||||
SCALAR_EXTRACT(FLOAT, float, GetFloat);
|
||||
SCALAR_EXTRACT(DOUBLE, double, GetDouble);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return builder.EndTable(start);
|
||||
}
|
||||
#include "cli.hpp"
|
||||
|
||||
int main(int argc, const char** argv) {
|
||||
if (argc <= 2) {
|
||||
printf("Usage: ./MNNRevert2Buffer.out XXX.json XXX.mnn\n");
|
||||
return 0;
|
||||
}
|
||||
rapidjson::Document document;
|
||||
{
|
||||
std::ifstream fileNames(argv[1]);
|
||||
std::ostringstream output;
|
||||
output << fileNames.rdbuf();
|
||||
auto outputStr = output.str();
|
||||
document.Parse(outputStr.c_str());
|
||||
if (document.HasParseError()) {
|
||||
MNN_ERROR("Invalid json\n");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
auto object = document.GetObject();
|
||||
flatbuffers::FlatBufferBuilder builder;
|
||||
builder.ForceDefaults(true);
|
||||
auto table = MNN::NetTypeTable();
|
||||
auto offset = _writeJsonToFlatbuffer(table, builder, object);
|
||||
builder.Finish(offset);
|
||||
std::ofstream outputOs(argv[2]);
|
||||
outputOs.write((char*)builder.GetBufferPointer(), builder.GetSize());
|
||||
MNN::Cli::json2mnn(argv[1], argv[2]);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -26,8 +26,12 @@ int caffe2MNNNet(const std::string prototxtFile, const std::string modelFile, co
    bool succ = read_proto_from_text(prototxtFile.c_str(), &caffeProtxt);
    DCHECK(succ) << "read_proto_from_text failed";

    succ = read_proto_from_binary(modelFile.c_str(), &caffeModel);
    succ &= read_proto_from_binary(modelFile.c_str(), &caffeModel);
    DCHECK(succ) << "read_proto_from_binary failed";
    if (!succ) {
        MNN_ERROR("[ERROR] Model file is not caffe model.\n");
        return 1;
    }
    std::map<std::string, int> tensorName;

    // Load Parameters
@ -38,7 +38,6 @@
|
|||
#include <cmath>
|
||||
#include "common/MemoryFormater.h"
|
||||
namespace MNN {
|
||||
static float gMNNVersion = 1.2f;
|
||||
|
||||
bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv) {
|
||||
cxxopts::Options options("MNNConvert");
|
||||
|
@ -49,9 +48,9 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
|
|||
std::make_pair("v", "version"), "show current version")
|
||||
(std::make_pair("f", "framework"),
|
||||
#ifdef MNN_BUILD_TORCH
|
||||
"model type, ex: [TF,CAFFE,ONNX,TFLITE,MNN,TORCH]",
|
||||
"model type, ex: [TF,CAFFE,ONNX,TFLITE,MNN,TORCH,JSON]",
|
||||
#else
|
||||
"model type, ex: [TF,CAFFE,ONNX,TFLITE,MNN]",
|
||||
"model type, ex: [TF,CAFFE,ONNX,TFLITE,MNN,JSON]",
|
||||
#endif
|
||||
cxxopts::value<std::string>())
|
||||
(
|
||||
|
@ -165,6 +164,11 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
|
|||
"if set test dir, thredhold mean the max rate permit for run MNN model and origin error",
|
||||
cxxopts::value<float>()
|
||||
)
|
||||
(
|
||||
"JsonFile",
|
||||
"if input model is MNN and give jsonfile, while Dump MNN model to the JsonFile.",
|
||||
cxxopts::value<std::string>()
|
||||
)
|
||||
(
|
||||
"alignDenormalizedValue",
|
||||
"if 1, converter would align denormalized float(|x| < 1.18e-38) as zero, because of in ubuntu/protobuf or android/flatbuf, system behaviors are different. default: 1, range: {0, 1}",
|
||||
|
@ -179,7 +183,7 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
|
|||
}
|
||||
|
||||
if (result.count("version")) {
|
||||
std::cout << gMNNVersion << std::endl;
|
||||
std::cout << MNN_VERSION << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -202,13 +206,15 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
|
|||
} else if ("TORCH" == frameWork) {
|
||||
modelPath.model = modelConfig::TORCH;
|
||||
#endif
|
||||
} else if ("JSON" == frameWork) {
|
||||
modelPath.model = modelConfig::JSON;
|
||||
} else {
|
||||
std::cout << "Framework Input ERROR or Not Support This Model Type Now!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
std::cout << options.help({""}) << std::endl;
|
||||
DLOG(INFO) << "framework Invalid, use -f CAFFE/MNN/ONNX/TFLITE/TORCH !";
|
||||
DLOG(INFO) << "framework Invalid, use -f CAFFE/MNN/ONNX/TFLITE/TORCH/JSON !";
|
||||
return false;
|
||||
}
|
||||
if (result.count("OP")) {
|
||||
|
@ -266,6 +272,10 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
|
|||
if (result.count("MNNModel")) {
|
||||
const std::string MNNModelPath = result["MNNModel"].as<std::string>();
|
||||
modelPath.MNNModel = MNNModelPath;
|
||||
} else if (result.count("JsonFile")) {
|
||||
const std::string JsonFilePath = result["JsonFile"].as<std::string>();
|
||||
modelPath.mnn2json = true;
|
||||
modelPath.MNNModel = JsonFilePath;
|
||||
} else {
|
||||
DLOG(INFO) << "MNNModel File Not Set, use --MNNModel XXX.prototxt to set it!";
|
||||
return false;
|
||||
|
@ -340,32 +350,50 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
    if (result.count("thredhold")) {
        modelPath.testThredhold = result["thredhold"].as<float>();
    }

    return true;
}

bool Cli::convertModel(modelConfig& modelPath) {
    std::cout << "Start to Convert Other Model Format To MNN Model..." << std::endl;
    std::unique_ptr<MNN::NetT> netT = std::unique_ptr<MNN::NetT>(new MNN::NetT());
    int parseRes = 1;
    if (modelPath.model == modelConfig::CAFFE) {
        caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, netT);
        parseRes = caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, netT);
    } else if (modelPath.model == modelConfig::TENSORFLOW) {
        tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, netT);
        parseRes = tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, netT);
    } else if (modelPath.model == modelConfig::MNN) {
        addBizCode(modelPath.modelFile, modelPath.bizCode, netT);
        if (modelPath.mnn2json) {
            if (mnn2json(modelPath.modelFile.c_str(), modelPath.MNNModel.c_str())) {
                MNN_PRINT("MNNModel %s has convert to JsonFile %s.\n", modelPath.modelFile.c_str(), modelPath.MNNModel.c_str());
                return true;
            } else {
                MNN_ERROR("[ERROR] MNN to Json failed.\n");
                return false;
            }
        } else {
            parseRes = addBizCode(modelPath.modelFile, modelPath.bizCode, netT);
        }
    } else if (modelPath.model == modelConfig::ONNX) {
        onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, netT);
        parseRes = onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, netT);
    } else if (modelPath.model == modelConfig::TFLITE) {
        tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, netT);
        parseRes = tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, netT);
#ifdef MNN_BUILD_TORCH
    } else if (modelPath.model == modelConfig::TORCH) {
        torch2MNNNet(modelPath.modelFile, modelPath.bizCode, netT, modelPath.customOpLibs);
        parseRes = torch2MNNNet(modelPath.modelFile, modelPath.bizCode, netT, modelPath.customOpLibs);
#endif
    } else if (modelPath.model == modelConfig::JSON) {
        if (json2mnn(modelPath.modelFile.c_str(), modelPath.MNNModel.c_str())) {
            MNN_PRINT("JsonFile %s has convert to MNNModel %s.\n", modelPath.modelFile.c_str(), modelPath.MNNModel.c_str());
            return true;
        } else {
            MNN_ERROR("[ERROR] Json to MNN failed.\n");
            return false;
        }
    } else {
        std::cout << "Not Support Model Type" << std::endl;
        MNN_ERROR("[ERROR] Not Support Model Type.\n");
    }
    if (netT.get() == nullptr) {
        MNN_ERROR("Convert error\n");
    if (netT.get() == nullptr || parseRes) {
        MNN_ERROR("[ERROR] Convert error, please check your file format.\n");
        return false;
    }
    int error = 0;
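// A minimal usage sketch for the two JSON paths dispatched above, assuming the
// MNNConvert options referenced in this file ("-f", "--modelFile", "--JsonFile",
// "--MNNModel"); binary name and file paths are illustrative only:
//   ./MNNConvert -f MNN  --modelFile model.mnn  --JsonFile model.json   (MNN model -> readable JSON, via mnn2json)
//   ./MNNConvert -f JSON --modelFile model.json --MNNModel model.mnn    (JSON -> MNN model, via json2mnn)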
@ -457,12 +485,6 @@ static bool compareOutput(MNN::Express::VARP output, const std::string& directNa
    auto diffAbsMaxV = diffAbsMax->readMap<float>()[0];
    if (absMaxV * maxError < diffAbsMaxV || std::isnan(absMaxV)) {
        MNN_ERROR("TESTERROR %s value error : absMaxV:%f - DiffMax %f\n", name.c_str(), absMaxV, diffAbsMaxV);

        MNN_PRINT("expected value\n");
        formatMatrix(targetValue->readMap<float>(), targetValue->getInfo()->dim);
        MNN_PRINT("real value\n");
        formatMatrix(output->readMap<float>(), output->getInfo()->dim);

        return false;
    }
    return true;
@ -650,6 +672,336 @@ int Cli::testconvert(const std::string& defaultCacheFile, const std::string& dir
    return 0;
}

bool Cli::mnn2json(const char* modelFile, const char* jsonFile, int flag) {
    std::ifstream inputFile(modelFile, std::ios::binary);
    inputFile.seekg(0, std::ios::end);
    auto size = inputFile.tellg();
    inputFile.seekg(0, std::ios::beg);

    char* buffer = new char[size];

    inputFile.read((char*)buffer, size);
    std::ofstream output(jsonFile);

    if (flag > 3) {
        MNN_PRINT("Dont't add convweight\n");
        auto netT = MNN::UnPackNet((void*)buffer);
        auto treatFunction = [&](MNN::OpT* opParam) {
            auto type = opParam->main.type;
            if (type == MNN::OpParameter::OpParameter_Convolution2D) {
                auto param = opParam->main.AsConvolution2D();
                param->weight.clear();
                param->bias.clear();
                if (param->symmetricQuan) {
                    param->symmetricQuan->weight.clear();
                }
                if (param->quanParameter) {
                    param->quanParameter->buffer.clear();
                }
            } else if (type == MNN::OpParameter::OpParameter_Blob) {
                auto blobT = opParam->main.AsBlob();
                blobT->float32s.clear();
                blobT->int8s.clear();
                blobT->uint8s.clear();
                blobT->int32s.clear();
                blobT->int64s.clear();
            } else if (type == MNN::OpParameter::OpParameter_Convolution2D) {
                opParam->main.AsConvolution2D()->weight.clear();
                opParam->main.AsConvolution2D()->bias.clear();
            } else if (type == MNN::OpParameter::OpParameter_MatMul) {
                opParam->main.AsMatMul()->weight.clear();
                opParam->main.AsMatMul()->bias.clear();
            } else if (type == MNN::OpParameter::OpParameter_PRelu) {
                opParam->main.AsPRelu()->slope.clear();
            } else if (type == MNN::OpParameter::OpParameter_Extra) {
                auto extra = opParam->main.AsExtra();
                extra->info.clear();
            } else if(type == MNN::OpParameter::OpParameter_LSTM){
                auto param = opParam->main.AsLSTM();
                if (param->weightH) {
                    param->weightH->float32s.clear();
                }
                if (param->weightI) {
                    param->weightI->float32s.clear();
                }
                if (param->bias) {
                    param->bias->float32s.clear();
                }
            }
        };
        for (int i = 0; i < netT->oplists.size(); ++i) {
            treatFunction(netT->oplists[i].get());
        }
        for (int i = 0; i < netT->subgraphs.size(); ++i) {
            for (int j=0; j<netT->subgraphs[i]->nodes.size(); ++j) {
                treatFunction(netT->subgraphs[i]->nodes[j].get());
            }
        }
        if (flag > 4) {
            printf("Separate dump subgraph\n");
            for (int i=0; i<netT->subgraphs.size(); ++i) {
                auto& g = netT->subgraphs[i];
                flatbuffers::FlatBufferBuilder newBuilder(1024);
                auto root = MNN::SubGraphProto::Pack(newBuilder, g.get());
                newBuilder.Finish(root);
                auto content = newBuilder.GetBufferPointer();
                char subGraphNameStr[128];
                sprintf(subGraphNameStr, "%s_%d", jsonFile, i);
                printf("Dump subgraph %s to %s\n", g->name.c_str(), subGraphNameStr);
                std::ofstream tempOutput(subGraphNameStr);
                auto s = flatbuffers::FlatBufferToString((const uint8_t*)content, MNN::SubGraphProtoTypeTable());
                tempOutput << s;
            }
            netT->subgraphs.clear();
        }
        flatbuffers::FlatBufferBuilder newBuilder(1024);
        auto root = MNN::Net::Pack(newBuilder, netT.get());
        MNN::FinishNetBuffer(newBuilder, root);
        {
            auto content = newBuilder.GetBufferPointer();
            auto s = flatbuffers::FlatBufferToString((const uint8_t*)content, MNN::NetTypeTable());
            output << s;
        }
    } else {
        auto s = flatbuffers::FlatBufferToString((const uint8_t*)buffer, MNN::NetTypeTable());
        output << s;
    }

    delete[] buffer;
    return true;
}
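// Reading of mnn2json above: by default the whole flatbuffer is printed as JSON;
// with flag > 3 convolution/matmul weights, blobs and similar large fields are
// cleared before printing, and with flag > 4 each subgraph is additionally dumped
// to "<jsonFile>_<index>". A JSON produced with flag > 3 is therefore not expected
// to round-trip back into a complete model through json2mnn.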

#define VECTOR_EXTRACT(FLATBUFFER_TYPE, CPP_TYPE, JSON_TYPE)\
    case flatbuffers::ET_##FLATBUFFER_TYPE:\
    {\
        std::vector<CPP_TYPE> data(array.Size());\
        for (int i=0; i<array.Size(); ++i) {\
            data[i] = array[i].JSON_TYPE();\
        }\
        indexes[pos].second = builder.CreateVector(data).Union();\
        break;\
    }\

#define SCALAR_EXTRACT(FLATBUFFER_TYPE, CPP_TYPE, JSON_TYPE)\
    case flatbuffers::ET_##FLATBUFFER_TYPE:\
    {\
        builder.AddElement(field, (CPP_TYPE)(iter->value.JSON_TYPE()), (CPP_TYPE)0);\
        break;\
    }
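// The helper below (a summary of the code that follows, not documentation from the
// diff) walks a rapidjson object against a flatbuffers TypeTable: a first scan maps
// JSON keys to field indexes and caches union types, then nested tables/unions,
// vectors/strings, and finally scalars/enums are written, so every offset exists
// before the table itself is finished. "4 + index * 2" is the standard flatbuffers
// vtable offset for field id `index`.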
static flatbuffers::Offset<void> _writeJsonToFlatbuffer(const flatbuffers::TypeTable * table, flatbuffers::FlatBufferBuilder& builder, const rapidjson::GenericObject<false, rapidjson::GenericValue<rapidjson::UTF8<>>>& object) {
    std::vector<std::pair<int, flatbuffers::Offset<void>>> indexes;
    // Load union type for easy to use
    std::map<std::string, int> unionNames;
    for (int i=0; i<table->num_elems; ++i) {
        if (table->type_codes[i].sequence_ref == -1) {
            continue;
        }
        const flatbuffers::TypeTable *ref = table->type_refs[table->type_codes[i].sequence_ref]();
        if (ref->st == flatbuffers::ST_UNION) {
            unionNames.insert(std::make_pair(std::string(table->names[i]) + "_type", i));
        }
    }
    // Find index and cache
    std::map<int, int> unionTypes;
    for (auto iter = object.begin(); iter !=object.end(); iter++) {
        auto name = iter->name.GetString();
        int index = -1;
        for (int i=0; i<table->num_elems; ++i) {
            if (0 == ::strcmp(table->names[i], name)) {
                index = i;
                break;
            }
        }
        auto uiter = unionNames.find(name);
        if (uiter != unionNames.end()) {
            // Find union type id
            auto value = iter->value.GetString();
            int typePos = -1;
            auto unionIndex = uiter->second;
            auto ref = table->type_refs[table->type_codes[unionIndex].sequence_ref]();
            for (int j=0; j<ref->num_elems; ++j) {
                if (0 == ::strcmp(ref->names[j], value)) {
                    typePos = j;
                    break;
                }
            }
            if (-1 == typePos) {
                MNN_ERROR("Can't find union type\n");
                continue;
            }
            if (typePos > 0) {
                // First is None
                unionTypes.insert(std::make_pair(unionIndex, typePos-1));
            }
        }
        if (index == -1) {
            MNN_PRINT("Invalid: %s, Skip it\n", name);
        }
        indexes.emplace_back(std::make_pair(index, 0));
    }

    // resolve single object
    int pos = 0;
    for (auto iter = object.begin(); iter !=object.end(); iter++, pos++) {
        int index = indexes[pos].first;
        if (-1 == index) {
            continue;
        }
        auto code = table->type_codes[index];
        if (code.is_vector) {
            continue;
        }
        if (code.sequence_ref != -1 && code.base_type == flatbuffers::ET_SEQUENCE) {
            const flatbuffers::TypeTable *ref = table->type_refs[code.sequence_ref]();
            if (ref->st == flatbuffers::ST_TABLE) {
                indexes[pos].second = _writeJsonToFlatbuffer(ref, builder, iter->value.GetObject());
            } else if (ref->st == flatbuffers::ST_UNION) {
                auto unionInd = unionTypes.find(index)->second;
                ref = ref->type_refs[unionInd]();
                indexes[pos].second = _writeJsonToFlatbuffer(ref, builder, iter->value.GetObject());
            }
        }
    }

    // Resolve Vector and String
    pos = 0;
    for (auto iter = object.begin(); iter !=object.end(); iter++, pos++) {
        int index = indexes[pos].first;
        if (-1 == index) {
            continue;
        }
        auto code = table->type_codes[index];
        if (!code.is_vector) {
            if (code.base_type == flatbuffers::ET_STRING) {
                indexes[pos].second = builder.CreateString(iter->value.GetString()).Union();
            }
            continue;
        }
        auto array = iter->value.GetArray();
        if (code.sequence_ref != -1) {
            const flatbuffers::TypeTable *ref = table->type_refs[code.sequence_ref]();
            std::vector<flatbuffers::Offset<void>> offsets(array.Size());
            for (int i=0; i<array.Size(); ++i) {
                offsets[i] = _writeJsonToFlatbuffer(ref, builder, array[i].GetObject());
            }
            indexes[pos].second = builder.CreateVector(offsets.data(), offsets.size()).Union();
            continue;
        }
        switch (code.base_type) {
            VECTOR_EXTRACT(BOOL, bool, GetBool);
            VECTOR_EXTRACT(CHAR, char, GetInt);
            VECTOR_EXTRACT(UCHAR, uint8_t, GetInt);
            VECTOR_EXTRACT(SHORT, int16_t, GetInt);
            VECTOR_EXTRACT(USHORT, uint16_t, GetInt);
            VECTOR_EXTRACT(INT, int, GetInt);
            VECTOR_EXTRACT(UINT, uint32_t, GetUint);
            VECTOR_EXTRACT(LONG, int64_t, GetInt64);
            VECTOR_EXTRACT(ULONG, uint64_t, GetUint64);
            VECTOR_EXTRACT(FLOAT, float, GetFloat);
            VECTOR_EXTRACT(DOUBLE, double, GetDouble);
            case flatbuffers::ET_STRING:
            {
                std::vector<std::string> data(array.Size());
                for (int i=0; i<array.Size(); ++i) {
                    data[i] = array[i].GetString();
                }
                indexes[pos].second = builder.CreateVectorOfStrings(data).Union();
                break;
            }
            default:
                break;
        }
    }

    // Resolve Others
    pos = 0;
    auto start = builder.StartTable();
    for (auto iter = object.begin(); iter !=object.end(); iter++, pos++) {
        int index = indexes[pos].first;
        if (-1 == index) {
            continue;
        }
        auto field = 4 + index * 2;
        if (indexes[pos].second.o != 0) {
            builder.AddOffset(field, indexes[pos].second);
            continue;
        }
        auto code = table->type_codes[index];
        if (code.sequence_ref != -1) {
            const flatbuffers::TypeTable *ref = table->type_refs[code.sequence_ref]();
            int value = -1;
            if (ref->st == flatbuffers::ST_UNION || ref->st == flatbuffers::ST_ENUM) {
                auto type = iter->value.GetString();
                for (int i=0; i<ref->num_elems; ++i) {
                    if (0 == ::strcmp(type, ref->names[i])) {
                        if (nullptr == ref->values) {
                            value = i;
                        } else {
                            value = ref->values[i];
                        }
                    }
                }
                switch (code.base_type) {
                    case flatbuffers::ET_UTYPE:
                    case flatbuffers::ET_UINT:
                        builder.AddElement(field, (uint32_t)value, (uint32_t)0);
                        break;
                    case flatbuffers::ET_INT:
                        builder.AddElement(field, (int32_t)value, (int32_t)-1);
                        break;
                    case flatbuffers::ET_UCHAR:
                        builder.AddElement(field, (uint8_t)value, (uint8_t)0);
                        break;
                    case flatbuffers::ET_CHAR:
                        builder.AddElement(field, (int8_t)value, (int8_t)0);
                        break;
                    default:
                        break;
                }
                continue;
            }
        }
        switch (code.base_type) {
            SCALAR_EXTRACT(BOOL, bool, GetBool);
            SCALAR_EXTRACT(CHAR, char, GetInt);
            SCALAR_EXTRACT(UCHAR, uint8_t, GetInt);
            SCALAR_EXTRACT(SHORT, int16_t, GetInt);
            SCALAR_EXTRACT(USHORT, uint16_t, GetInt);
            SCALAR_EXTRACT(INT, int, GetInt);
            SCALAR_EXTRACT(UINT, uint32_t, GetUint);
            SCALAR_EXTRACT(LONG, int64_t, GetInt64);
            SCALAR_EXTRACT(ULONG, uint64_t, GetUint64);
            SCALAR_EXTRACT(FLOAT, float, GetFloat);
            SCALAR_EXTRACT(DOUBLE, double, GetDouble);
            default:
                break;
        }
    }
    return builder.EndTable(start);
}
bool Cli::json2mnn(const char* jsonFile, const char* modelFile) {
    rapidjson::Document document;
    {
        std::ifstream fileNames(jsonFile);
        std::ostringstream output;
        output << fileNames.rdbuf();
        auto outputStr = output.str();
        document.Parse(outputStr.c_str());
        if (document.HasParseError()) {
            MNN_ERROR("Invalid json\n");
            return 0;
        }
    }
    auto object = document.GetObject();
    flatbuffers::FlatBufferBuilder builder;
    builder.ForceDefaults(true);
    auto table = MNN::NetTypeTable();
    auto offset = _writeJsonToFlatbuffer(table, builder, object);
    builder.Finish(offset);
    std::ofstream outputOs(modelFile);
    outputOs.write((char*)builder.GetBufferPointer(), builder.GetSize());
    return true;
}

};
@ -192,8 +192,9 @@ void converToStaticModel(const Net* net, std::map<std::string,std::vector<int>>&
        auto name = net->tensorName()->GetAsString(i)->str();
        if (inputConfig.find(name) != inputConfig.end()) {
            auto& dims = inputConfig[name];
            allTensors[i]->buffer().dimensions = dims.size();
            for (int j = 0; j < dims.size(); j++) {
                allTensors[i]->buffer().dim[j].extent = dims[j];
                allTensors[i]->setLength(j, dims[j]);
            }
        }
    }
@ -37,8 +37,11 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, c

    addUUID(netT, proto);

    // add version info to model
    netT->extraInfo.reset(new ExtraInfoT);
    netT->extraInfo->version = MNN_VERSION;
    if (!config.authCode.empty()) {
        netT->extraInfo.reset(new ExtraInfoT);
        // add auth code to model
        netT->extraInfo->name = config.authCode;
    }
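// Net effect of the hunk above, as read from the diff: extraInfo is now allocated
// unconditionally and stamped with MNN_VERSION, while the auth code is still only
// attached when config.authCode is non-empty.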