//
//  ConvOpt.h
//  MNN
//
//  Created by MNN on 2018/07/16.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef ConvOpt_h
#define ConvOpt_h

#include <stdint.h>
#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif
// Scans the output rectangle [l, r) x [t, b) with a sliding window, clipping
// the kernel against the source borders and delegating each clipped window to
// MNNConvSlideWindowBorder. The macro expands in the caller's scope: dy/dx and
// every geometry variable it references must already be defined there.
#define CONVOLUVTION_RUN_BASIC(l, t, r, b, TYPE, alpha)                                              \
    for (dy = t; dy < b; ++dy) {                                                                     \
        int srcStartY      = dy * strideY - padY;                                                    \
        float* dst_y       = dst_z + width * 4 * dy;                                                 \
        const TYPE* src_dy = srcOrigin + srcStartY * src_width * 4;                                  \
        int sfy            = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));                               \
        int efy            = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));         \
        for (dx = l; dx < r; ++dx) {                                                                 \
            int srcStartX            = dx * strideX - padX;                                          \
            const TYPE* src_dx       = src_dy + 4 * srcStartX;                                       \
            float* dst_x             = dst_y + 4 * dx;                                               \
            int sfx                  = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));                     \
            int efx                  = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); \
            const TYPE* src_unit     = src_dx + (sfx * dilateX_step + sfy * dilateY_step);           \
            const TYPE* weight_start = weight_dz + (16 * sfx + weight_sy_step * sfy);                \
            MNNConvSlideWindowBorder(dst_x, src_unit, weight_start, src_depth_quad, src_z_step,      \
                                     efx - sfx, efy - sfy, weight_sy_step, weight_sz_step,           \
                                     dilateX_step, dilateY_step, alpha);                             \
        }                                                                                            \
    }
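
/*
 Usage sketch for CONVOLUVTION_RUN_BASIC (illustrative only; every name below
 is an assumption about the caller's scope, not something this header
 declares). Because the macro captures its variables from the enclosing
 scope, a caller looks roughly like:

     int dy, dx;
     // geometry set up beforehand: strideX/strideY, padX/padY,
     // dilateX/dilateY, width, src_width/src_height,
     // kernel_width/kernel_height, dilateX_step/dilateY_step,
     // weight_sy_step/weight_sz_step, src_depth_quad, src_z_step,
     // plus the per-output-channel pointers dst_z, srcOrigin, weight_dz
     CONVOLUVTION_RUN_BASIC(0, 0, width, topBorder, float, postParam);

 Typical use is covering only the border rows/columns this way, while an
 unclipped fast path handles the interior of the output plane.
*/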

// Depthwise convolution kernels: the Unit variant computes a single 4-channel
// output pixel from an fw x fh window; the Line variant produces `width`
// consecutive output pixels per call. Steps and strides are offsets in float
// elements over 4-channel-packed data.
void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
                                size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
                                size_t srcHStep, size_t dstHStep);

// Deconvolution counterparts: note the reversed roles, reading through the
// const dst pointer and writing back into src.
void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh,
                                  size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
                                  size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
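
/*
 Sketch: one 4-channel output pixel of a 3x3, dilation-1 depthwise
 convolution via MNNConvRunForUnitDepthWise. The step arithmetic assumes
 4-channel-packed rows of srcW pixels; all names are illustrative, and the
 exact layout contract should be checked against the implementations:

     const size_t fw = 3, fh = 3;
     MNNConvRunForUnitDepthWise(dstPixel, srcTopLeft, kernel3x3, fw, fh,
                                4 * fw,      // weight_y_step: one kernel row
                                4,           // dilateX_step: next pixel
                                4 * srcW);   // dilateY_step: next source row

 The Deconv variants run the same window with the roles of dst and src
 exchanged.
*/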

// 4x4-block GEMM kernels for 4-channel-packed (NC4HW4) data: Unit_4 is
// specialized for one full output tile, One_4 handles a single output
// position, and Common_4 handles an arbitrary `width` of output positions.
void MNNGemmFloatUnit_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step,
                        size_t dst_depth_quad, size_t weight_depth_offset);
void MNNGemmFloatOne_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step,
                       size_t dst_depth_quad, size_t weight_depth_offset);
void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step,
                          size_t dst_depth_quad, size_t width, size_t weight_depth_offset);
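
/*
 Sketch: a 1x1 convolution expressed through MNNGemmFloatCommon_4. The
 stride unit for dst_step (float elements assumed) and the zero
 weight_depth_offset are assumptions for illustration, not a contract
 stated by this header:

     // icDiv4/ocDiv4: channels / 4; planeSize: output width * height
     MNNGemmFloatCommon_4(dstOrigin, srcOrigin, weightOrigin,
                          icDiv4,         // src_depth_quad
                          planeSize * 4,  // dst_step between output z-slices
                          ocDiv4,         // dst_depth_quad
                          planeSize,      // width: output positions this call
                          0);             // weight_depth_offset
*/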

// Element-wise matrix kernels on 4-channel-packed rows: C[i] = A[i] (+, -,
// max, *) B[i]. widthC4 counts 4-float columns per row; the strides are
// per-row offsets in float elements.
void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                  size_t bStride, size_t height);
void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                  size_t bStride, size_t height);
void MNNMatrixMax(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                  size_t bStride, size_t height);
void MNNMatrixProd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                   size_t bStride, size_t height);
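
/*
 Sketch: summing two tightly packed height x (widthC4 * 4) buffers with
 MNNMatrixAdd (names illustrative; rows assumed contiguous, so every row
 stride is simply the row length in floats):

     MNNMatrixAdd(C, A, B, widthC4,
                  widthC4 * 4,   // cStride
                  widthC4 * 4,   // aStride
                  widthC4 * 4,   // bStride
                  height);
*/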

// Row-wise variants of the element-wise kernels that take the width in plain
// float elements, with no multiple-of-4 packing requirement.
void MNNMatrixAddCommon(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height);
void MNNMatrixSubCommon(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height);
void MNNMatrixMaxCommon(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height);
void MNNMatrixProdCommon(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height);
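
/*
 Sketch (illustrative): the Common form drops the packing requirement, so an
 odd row length is fine:

     MNNMatrixSubCommon(C, A, B,
                        7,        // width: plain float count, odd is fine
                        7, 7, 7,  // row strides in floats (contiguous rows)
                        height);
*/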

// Returns the output-tile width the optimized convolution/GEMM kernels are
// built for on this architecture.
int MNNGetConvolutionTileNumber();
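
/*
 Sketch (an assumption about intended use, not a documented contract):
 partition an output plane into tiles of this width, letting the specialized
 kernel take the full tiles and the generic one the remainder:

     int tile      = MNNGetConvolutionTileNumber();
     int fullTiles = planeSize / tile;
     int remain    = planeSize % tile;
     // run MNNGemmFloatUnit_4 once per full tile, then
     // MNNGemmFloatCommon_4 with width = remain for the tail
*/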

#ifdef __cplusplus
}
#endif

#endif /* ConvOpt_h */