//
// CPUBinary.cpp
// MNN
//
// Created by MNN on 2018/08/02.
// Copyright © 2018, Alibaba Group Holding Limited
//

#include "CPUBinary.hpp"
|
2019-04-17 10:49:11 +08:00
|
|
|
|
#include <math.h>
|
|
|
|
|
#include <algorithm>
|
2020-01-15 13:33:47 +08:00
|
|
|
|
#include "CPUBackend.hpp"
|
|
|
|
|
#include "compute/CommonOptFunction.h"
|
|
|
|
|
#include "compute/ConvOpt.h"
|
2019-12-27 22:16:57 +08:00
|
|
|
|
#include "core/Macro.h"
|
2020-01-15 13:33:47 +08:00
|
|
|
|
#include "core/Concurrency.h"
|
|
|
|
|
#include "CPUEltwise.hpp"
|
2019-04-17 10:49:11 +08:00
|
|
|
|
namespace MNN {

template <typename T>
CPUBinary<T>::CPUBinary(Backend* b, int32_t type) : MNN::Execution(b), mType(type) {
    // nothing to do
}

template <typename T>
ErrorCode CPUBinary<T>::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    MNN_ASSERT(1 == outputs.size());
    const int input0DataCount = inputs[0]->elementSize();
    const int input1DataCount = inputs[1]->elementSize();
    mElementProc  = nullptr;
    mSupportScale = false;
    int maxCount = input0DataCount > input1DataCount ? input0DataCount : input1DataCount;
    if (outputs[0]->getType().code == halide_type_float && maxCount >= 4) {
        if (input1DataCount == input0DataCount) {
            // Equal element counts: use an optimized element-wise kernel where one exists.
            switch (mType) {
                case BinaryOpOperation_MUL:
                    mElementProc = MNNMatrixProdCommon;
                    break;
                case BinaryOpOperation_ADD:
                    mElementProc = MNNMatrixAddCommon;
                    break;
                case BinaryOpOperation_MAXIMUM:
                    mElementProc = MNNMatrixMaxCommon;
                    break;
                case BinaryOpOperation_SUB:
                    mElementProc = MNNMatrixSubCommon;
                    break;
                default:
                    break;
            }
        } else if (input1DataCount == 1 || input0DataCount == 1) {
            // One input is a scalar: MUL/ADD/SUB reduce to a scale-and-bias pass.
            switch (mType) {
                case BinaryOpOperation_MUL:
                case BinaryOpOperation_ADD:
                case BinaryOpOperation_SUB:
                    mSupportScale = true;
                    break;
                default:
                    break;
            }
        }
    }
    return NO_ERROR;
}
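
// Fast-path selection examples (illustrative):
//   - float inputs of equal element count, e.g. (1, 4, 16) ADD (1, 4, 16),
//     run through mElementProc (here MNNMatrixAddCommon);
//   - float tensor MUL/ADD/SUB scalar runs through the scale-and-bias path;
//   - int32 inputs, or inputs with fewer than 4 elements, fall back to the
//     generic _binaryOp template below.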

template <typename Tin, typename Tout, typename Func>
static ErrorCode _binaryOp(Tensor* input0, Tensor* input1, Tensor* output) {
    Func f;

    const int input0DataCount = input0->elementSize();
    const int input1DataCount = input1->elementSize();

    const Tin* input0Data = input0->host<Tin>();
    const Tin* input1Data = input1->host<Tin>();
    Tout* outputData      = output->host<Tout>();

    if (input0DataCount == 1) { // one element does not only mean a scalar input; it may be a tensor of shape (1, 1, ..., 1)
        for (int i = 0; i < input1DataCount; i++) {
            outputData[i] = static_cast<Tout>(f(input0Data[0], input1Data[i]));
        }
    } else if (input1DataCount == 1) {
        for (int i = 0; i < input0DataCount; i++) {
            outputData[i] = static_cast<Tout>(f(input0Data[i], input1Data[0]));
        }
    } else { // both inputs contain more than one element, so neither is a scalar
        bool sameShape = input0->elementSize() == input1->elementSize();
        if (sameShape) { // the two inputs have the same shape: apply the operation element-wise
            for (int i = 0; i < input0DataCount; i++) {
                outputData[i] = static_cast<Tout>(f(input0Data[i], input1Data[i]));
            }
        } else { // different shapes: broadcast
#define MAX_DIM 6
            MNN_ASSERT(output->dimensions() <= MAX_DIM);
            int dims[MAX_DIM];
            int stride[MAX_DIM];
            int iStride0[MAX_DIM];
            int iStride1[MAX_DIM];
            // Each input axis is matched against the corresponding trailing
            // axis of the output; a broadcast axis (length 1 or missing) keeps
            // stride 0 so its pointer does not advance along that axis.
            for (int i = MAX_DIM - 1; i >= 0; --i) {
                dims[i]     = 1;
                stride[i]   = 0;
                iStride0[i] = 0;
                iStride1[i] = 0;
                int input0I = i - (output->dimensions() - input0->dimensions());
                int input1I = i - (output->dimensions() - input1->dimensions());
                if (i < output->dimensions()) {
                    dims[i]   = output->length(i);
                    stride[i] = output->stride(i);
                }
                if (input0I >= 0 && input0->length(input0I) != 1) {
                    iStride0[i] = input0->stride(input0I);
                }
                if (input1I >= 0 && input1->length(input1I) != 1) {
                    iStride1[i] = input1->stride(input1I);
                }
            }
            for (int w = 0; w < dims[5]; ++w) {
                auto ow  = outputData + w * stride[5];
                auto i0w = input0Data + w * iStride0[5];
                auto i1w = input1Data + w * iStride1[5];
#define PTR(x, y, i)                      \
    auto o##x  = o##y + x * stride[i];    \
    auto i0##x = i0##y + x * iStride0[i]; \
    auto i1##x = i1##y + x * iStride1[i]
                for (int v = 0; v < dims[4]; ++v) {
                    PTR(v, w, 4);
                    for (int u = 0; u < dims[3]; ++u) {
                        PTR(u, v, 3);
                        for (int z = 0; z < dims[2]; ++z) {
                            PTR(z, u, 2);
                            for (int y = 0; y < dims[1]; ++y) {
                                PTR(y, z, 1);
                                for (int x = 0; x < dims[0]; ++x) {
                                    PTR(x, y, 0);
                                    *ox = static_cast<Tout>(f(*i0x, *i1x));
                                }
                            }
                        }
                    }
                }
            }
#undef MAX_DIM
#undef PTR
        }
        // broadcast compatibility was already validated during shape computation
    }

    return NO_ERROR;
}
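
// Broadcast walkthrough (illustrative): with output shape (2, 4, 3), an
// input0 of shape (2, 1, 3) gets stride 0 on the axis of length 4, and an
// input1 of shape (4, 1), right-aligned against the output, gets stride 0 on
// both the leading axis of length 2 and the trailing axis of length 3, so
// each source element is re-read instead of being materialized.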

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMax : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return std::max(x, y);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMin : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return std::min(x, y);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMul : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return x * y;
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryAdd : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return x + y;
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinarySub : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return x - y;
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryRealDiv : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return x / y;
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMod : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        // Truncated (C-style) modulo: x - trunc(x / y) * y. This assumes
        // integral operands, whose division already truncates toward zero.
        return x - (x / y) * y;
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryGreater : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x > y) ? 1 : 0);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLess : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x < y) ? 1 : 0);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryGreaterEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x >= y) ? 1 : 0);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLessEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x <= y) ? 1 : 0);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x == y) ? 1 : 0);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryFloorDiv : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        // Cast before dividing so integer operands do not truncate toward
        // zero ahead of the floor (e.g. -3 / 2 must give -2, not -1).
        return floor(static_cast<double>(x) / y);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryFloorMod : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        // Floored modulo: the cast keeps integer operands from truncating
        // before the floor, so the result takes the sign of the divisor.
        return x - floor(static_cast<double>(x) / y) * y;
    }
};
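
// Modulo flavors on negative operands (illustrative): for x = -3, y = 2,
// MOD (truncated) gives -3 - trunc(-1.5) * 2 = -1, while FLOORMOD gives
// -3 - floor(-1.5) * 2 = 1, matching the sign of the divisor.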

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinarySquaredDifference : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (x - y) * (x - y);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryPow : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return pow(x, y);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryAtan2 : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        // atan(x / y) loses the quadrant when y < 0 and divides by zero on
        // the y axis; atan2 covers the full (-pi, pi] range.
        return atan2(x, y);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLogicalOr : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x || y) ? 1 : 0);
    }
};

template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryNotEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)((x != y) ? 1 : 0);
    }
};

template <typename T>
ErrorCode CPUBinary<T>::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto input1 = inputs[1];
    auto output = outputs[0];

    if (nullptr != mElementProc || mSupportScale) {
        auto numberThread = ((CPUBackend*)backend())->threadNumber();
        auto i1Size = input->elementSize();
        auto i2Size = input1->elementSize();
        auto size   = i1Size;
        if (size == 1) {
            size = i2Size;
        }
        // Partition the work into 4-element-aligned slices, one per thread.
        int sizeDivide = size / numberThread;
        sizeDivide     = UP_DIV(sizeDivide, 4) * 4;
        int scheduleNumber = 1;
        if (sizeDivide > 0) {
            scheduleNumber = UP_DIV(size, sizeDivide);
        }
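        // Slice example (illustrative): size = 1000 with 4 threads gives
        // sizeDivide = 250, rounded up to 252 for 4-alignment, so
        // scheduleNumber = 4 and the slices are 252/252/252/244; the last
        // slice absorbs the remainder.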
        if (nullptr != mElementProc) {
            MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
                int start    = sizeDivide * (int)tId;
                int realSize = sizeDivide;
                if (tId == scheduleNumber - 1) {
                    realSize = size - start;
                }
                if (realSize > 0) {
                    mElementProc(output->host<float>() + start, input->host<float>() + start,
                                 input1->host<float>() + start, realSize, 0, 0, 0, 1);
                }
            }
            MNN_CONCURRENCY_END();
        } else {
            // Rewrite the scalar op as output = scale * input + bias. onResize
            // only sets mSupportScale for MUL/ADD/SUB, so the defaults below
            // are never used; they just keep the variables initialized.
            float scale = 1.0f;
            float bias  = 0.0f;
            float scalar;
            float* inputPtr;
            if (i1Size == 1) {
                scalar   = input->host<float>()[0];
                inputPtr = input1->host<float>();
            } else {
                scalar   = input1->host<float>()[0];
                inputPtr = input->host<float>();
            }
            switch (mType) {
                case BinaryOpOperation_MUL:
                    scale = scalar;
                    bias  = 0.0f;
                    break;
                case BinaryOpOperation_ADD:
                    scale = 1.0f;
                    bias  = scalar;
                    break;
                case BinaryOpOperation_SUB:
                    if (1 == i2Size) {
                        scale = 1.0f;
                        bias  = -scalar;
                    } else {
                        scale = -1.0f;
                        bias  = scalar;
                    }
                    break;
                default:
                    break;
            }
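            // Example (illustrative): SUB with a scalar s on the right computes
            // x - s via scale = 1, bias = -s; with the scalar on the left it
            // computes s - x via scale = -1, bias = s.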

            MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
                int start    = sizeDivide * (int)tId;
                int realSize = sizeDivide;
                if (tId == scheduleNumber - 1) {
                    realSize = size - start;
                }
                if (realSize > 0) {
                    MNNScaleAndAddBiasScalar(output->host<float>() + start, inputPtr + start, bias, scale, realSize);
                }
            }
            MNN_CONCURRENCY_END();
        }
        return NO_ERROR;
    }

    switch (mType) {
        case BinaryOpOperation_MUL:
            _binaryOp<T, T, BinaryMul<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_ADD:
            _binaryOp<T, T, BinaryAdd<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_SUB:
            _binaryOp<T, T, BinarySub<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_REALDIV:
            _binaryOp<T, T, BinaryRealDiv<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_MINIMUM:
            _binaryOp<T, T, BinaryMin<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_MAXIMUM:
            _binaryOp<T, T, BinaryMax<T, T, T>>(input, input1, output);
            break;
        // Comparison ops write int32 results, so Tout is int32_t for all of
        // them, matching the int32 output tensor allocated in shape inference.
        case BinaryOpOperation_GREATER:
            _binaryOp<T, int32_t, BinaryGreater<T, T, int32_t>>(input, input1, output);
            break;
        case BinaryOpOperation_LESS:
            _binaryOp<T, int32_t, BinaryLess<T, T, int32_t>>(input, input1, output);
            break;
        case BinaryOpOperation_LESS_EQUAL:
            _binaryOp<T, int32_t, BinaryLessEqual<T, T, int32_t>>(input, input1, output);
            break;
        case BinaryOpOperation_GREATER_EQUAL:
            _binaryOp<T, int32_t, BinaryGreaterEqual<T, T, int32_t>>(input, input1, output);
            break;
        case BinaryOpOperation_EQUAL:
            _binaryOp<T, int32_t, BinaryEqual<T, T, int32_t>>(input, input1, output);
            break;
        case BinaryOpOperation_FLOORDIV:
            _binaryOp<T, T, BinaryFloorDiv<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_FLOORMOD:
            _binaryOp<T, T, BinaryFloorMod<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_POW:
            _binaryOp<T, T, BinaryPow<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_SquaredDifference:
            _binaryOp<T, T, BinarySquaredDifference<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_ATAN2:
            _binaryOp<T, T, BinaryAtan2<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_LOGICALOR:
            _binaryOp<T, T, BinaryLogicalOr<T, T, T>>(input, input1, output);
            break;
        case BinaryOpOperation_NOTEQUAL:
            _binaryOp<T, int32_t, BinaryNotEqual<T, T, int32_t>>(input, input1, output);
            break;
        case BinaryOpOperation_MOD:
            _binaryOp<T, T, BinaryMod<T, T, T>>(input, input1, output);
            break;
        default:
            MNN_ASSERT(false);
            break;
    }
    return NO_ERROR;
}

class CPUBinaryCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        auto dataType = outputs[0]->getType();
        int32_t type  = op->main_as_BinaryOp()->opType();
        if (dataType.bits == 32) {
            if (dataType.code == halide_type_int) {
                return new CPUBinary<int32_t>(backend, type);
            }
            if (dataType.code == halide_type_float) {
                return new CPUBinary<float>(backend, type);
            }
        }
        return nullptr;
    }
};

REGISTER_CPU_OP_CREATOR(CPUBinaryCreator, OpType_BinaryOp);

} // namespace MNN