// MNN/source/backend/cpu/BinaryUtils.hpp

#include <math.h>
#include <algorithm>
#include "compute/CommonOptFunction.h"
#include "MNN_generated.h"

// Functor returning the larger of x and y (via std::max), converted to _ErrorCode.
// std::max takes two arguments of one type, so _Arg1 and _Arg2 must be the same type.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMax {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const _Arg1& larger = std::max(x, y);
        return larger;
    }
};

// Functor returning the smaller of x and y (via std::min), converted to _ErrorCode.
// std::min takes two arguments of one type, so _Arg1 and _Arg2 must be the same type.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMin {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const _Arg1& smaller = std::min(x, y);
        return smaller;
    }
};

// Functor returning the product x * y, converted to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMul {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto product = x * y;
        return product;
    }
};

// Functor returning the sum x + y, converted to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryAdd {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto sum = x + y;
        return sum;
    }
};

// Functor returning the difference x - y, converted to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinarySub {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto difference = x - y;
        return difference;
    }
};

// Functor returning the quotient x / y, converted to _ErrorCode.
// The division is whatever operator/ does for the operand types
// (truncating for built-in integers).
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryRealDiv {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto quotient = x / y;
        return quotient;
    }
};

/**
 Integer modulus whose non-zero result takes the sign of the divisor y.
 Ref from onnxruntime/onnxruntime/core/providers/cpu/math/element_wise_ops.cc :: Modulus

 A zero divisor yields 0 instead of evaluating `x % 0`, which is undefined
 behaviour in C++ (typically a hardware trap that kills the process).
 */
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryModInt {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        if (y == 0) {
            return (_ErrorCode)0;
        }
        // C++ '%' truncates toward zero, so the remainder carries the sign of x.
        auto res = x % y;
        // Shift a remainder whose sign differs from y by one divisor.
        if ((res < 0 && y > 0) || (res > 0 && y < 0)) {
            res += y;
        }
        return (_ErrorCode)res;
    }
};

// Functor returning fmodf(x, y), converted to _ErrorCode.
// Both operands are converted to float because fmodf is the single-precision routine.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryMod {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const float dividend = x;
        const float divisor = y;
        return fmodf(dividend, divisor);
    }
};

// Functor yielding 1 when x > y and 0 otherwise, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryGreater {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const int flag = (x > y) ? 1 : 0;
        return (_ErrorCode)flag;
    }
};
// Functor yielding 1 when x < y and 0 otherwise, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLess {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const int flag = (x < y) ? 1 : 0;
        return (_ErrorCode)flag;
    }
};
// Functor yielding 1 when x >= y and 0 otherwise, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryGreaterEqual {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const int flag = (x >= y) ? 1 : 0;
        return (_ErrorCode)flag;
    }
};
// Functor yielding 1 when x <= y and 0 otherwise, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLessEqual {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const int flag = (x <= y) ? 1 : 0;
        return (_ErrorCode)flag;
    }
};
// Functor yielding 1 when x == y and 0 otherwise, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryEqual {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const int flag = (x == y) ? 1 : 0;
        return (_ErrorCode)flag;
    }
};
// Functor returning floor(x / y), converted to _ErrorCode.
// x is widened to double first so integer operands are divided without truncation.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryFloorDiv {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const double quotient = static_cast<double>(x) / y;
        return floor(quotient);
    }
};

// Functor returning the floor-modulus of x by y: a non-zero result has the sign of y.
//
// For integer operands `x / y` has already truncated toward zero before floor()
// sees it, so `x - floor(x / y) * y` alone is the truncated remainder (sign of x).
// The sign fix-up below turns that into the floored result; it is the same
// correction BinaryModInt applies. For floating-point operands the fix-up only
// fires if rounding left the remainder with the wrong sign.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryFloorMod {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        auto res = x - floor(x / y) * y;
        if ((res < 0 && y > 0) || (res > 0 && y < 0)) {
            res += y;
        }
        return res;
    }
};

// Functor returning (x - y) squared, converted to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinarySquaredDifference {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto delta = x - y;
        return delta * delta;
    }
};

// Functor returning pow(x, y) (x raised to the power y), converted to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryPow {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto raised = pow(x, y);
        return raised;
    }
};

// Functor returning atan2(x, y), converted to _ErrorCode.
// Note the argument order: x is passed as atan2's first argument, y as its second.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryAtan2 {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto angle = atan2(x, y);
        return angle;
    }
};

// Functor yielding 1 when either operand is truthy (x || y) and 0 otherwise,
// cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLogicalOr {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        if (x || y) {
            return (_ErrorCode)1;
        }
        return (_ErrorCode)0;
    }
};

// Functor yielding 1 when exactly one operand is truthy (non-zero) and 0 otherwise,
// cast to _ErrorCode.
//
// The operands are reduced to truth values before being compared. A bitwise
// `x ^ y` on the raw values reports "true" for two different non-zero inputs
// (e.g. 1 and 2), which is not a logical XOR and disagrees with BinaryLogicalOr,
// which treats every non-zero value as true. Results for 0/1 inputs are unchanged.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLogicalXor {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        return (_ErrorCode)(((!x) != (!y)) ? 1 : 0);
    }
};

// Functor yielding 1 when x != y and 0 otherwise, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryNotEqual {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const int flag = (x != y) ? 1 : 0;
        return (_ErrorCode)flag;
    }
};

// Functor returning x << y (x shifted left by y bits), cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryLeftShift {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto shifted = x << y;
        return (_ErrorCode)shifted;
    }
};

// Functor returning the bitwise AND x & y, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryBitwiseAnd {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto bits = x & y;
        return (_ErrorCode)bits;
    }
};

// Functor returning x >> y (x shifted right by y bits), cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryRightShift {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto shifted = x >> y;
        return (_ErrorCode)shifted;
    }
};

// Functor returning the bitwise OR x | y, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryBitwiseOr {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto bits = x | y;
        return (_ErrorCode)bits;
    }
};

// Functor returning the bitwise XOR x ^ y, cast to _ErrorCode.
template <typename _Arg1, typename _Arg2, typename _ErrorCode>
struct BinaryBitwiseXor {
    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
        const auto bits = x ^ y;
        return (_ErrorCode)bits;
    }
};

/**
 Applies the binary functor `Func` element-wise to two float buffers, `pack`
 elements at a time through the vector type `V`.

 V must provide `V::load(const float*)`, `V::save(float*, value)` and a
 constructor from a single float (used to splat the broadcast scalar).

 @param outputRaw          destination buffer, read as float*; elementSize floats are written.
 @param inputRaw0          first operand, read as const float*.
 @param inputRaw1          second operand, read as const float*.
 @param elementSize        number of float elements to produce.
 @param needBroadcastIndex -1: both inputs hold elementSize elements;
                            0: only inputRaw0[0] is read and paired with every element of inputRaw1;
                            any other value: only inputRaw1[0] is read and paired with every element of inputRaw0.

 The last `elementSize % pack` elements go through stack scratch buffers so that
 no load or store touches memory past the end of the caller's buffers. The
 scratch inputs are zero-initialised: only `remainCount` floats are copied in,
 but V::load reads all `pack` of them, and the unused lanes must not hold
 indeterminate values.
 */
template<typename Func, typename V, int pack>
void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int elementSize, int needBroadcastIndex) {
    Func compute;
    const int sizeDivUnit = elementSize / pack;
    const int remainCount = elementSize - sizeDivUnit * pack;
    auto src0 = (const float*)(inputRaw0);
    auto src1 = (const float*)(inputRaw1);
    auto dst = (float*)outputRaw;

    if (-1 == needBroadcastIndex) {
        // No broadcast: walk both inputs in lock-step.
        for (int i = 0; i < sizeDivUnit; ++i) {
            V a = V::load(src0);
            V b = V::load(src1);
            V::save(dst, compute(a, b));
            src0 += pack;
            src1 += pack;
            dst += pack;
        }
        if (remainCount > 0) {
            float tempSrc0[pack] = {0.0f};
            float tempSrc1[pack] = {0.0f};
            float tempDst[pack];
            ::memcpy(tempSrc0, src0, remainCount * sizeof(float));
            ::memcpy(tempSrc1, src1, remainCount * sizeof(float));
            V a = V::load(tempSrc0);
            V b = V::load(tempSrc1);
            V::save(tempDst, compute(a, b));
            ::memcpy(dst, tempDst, remainCount * sizeof(float));
        }
    } else if (0 == needBroadcastIndex) {
        // Input 0 is a single value: splat it once and reuse it for every pack.
        const float srcValue0 = src0[0];
        V a = V(srcValue0);
        for (int i = 0; i < sizeDivUnit; ++i) {
            V b = V::load(src1);
            V::save(dst, compute(a, b));
            src1 += pack;
            dst += pack;
        }
        if (remainCount > 0) {
            float tempSrc1[pack] = {0.0f};
            float tempDst[pack];
            ::memcpy(tempSrc1, src1, remainCount * sizeof(float));
            V b = V::load(tempSrc1);
            V::save(tempDst, compute(a, b));
            ::memcpy(dst, tempDst, remainCount * sizeof(float));
        }
    } else {
        // Input 1 is a single value: splat it once and reuse it for every pack.
        const float srcValue1 = src1[0];
        V b = V(srcValue1);
        for (int i = 0; i < sizeDivUnit; ++i) {
            V a = V::load(src0);
            V::save(dst, compute(a, b));
            src0 += pack;
            dst += pack;
        }
        if (remainCount > 0) {
            float tempSrc0[pack] = {0.0f};
            float tempDst[pack];
            ::memcpy(tempSrc0, src0, remainCount * sizeof(float));
            V a = V::load(tempSrc0);
            V::save(tempDst, compute(a, b));
            ::memcpy(dst, tempDst, remainCount * sizeof(float));
        }
    }
}

// Vector functor returning x + y. Operands are taken by non-const reference.
template<typename Vec>
struct VecBinaryAdd  {
    Vec operator()(Vec& x, Vec& y) const {
        Vec sum = x + y;
        return sum;
    }
};

// Vector functor returning x - y. Operands are taken by non-const reference.
template<typename Vec>
struct VecBinarySub  {
    Vec operator()(Vec& x, Vec& y) const {
        Vec difference = x - y;
        return difference;
    }
};

// Vector functor returning x * y. Operands are taken by non-const reference.
template<typename Vec>
struct VecBinaryMul  {
    Vec operator()(Vec& x, Vec& y) const {
        Vec product = x * y;
        return product;
    }
};

// Vector functor returning Vec::min(x, y); Vec must expose a static min().
template<typename Vec>
struct VecBinaryMin  {
    Vec operator()(Vec& x, Vec& y) const {
        Vec smaller = Vec::min(x, y);
        return smaller;
    }
};

// Vector functor returning Vec::max(x, y); Vec must expose a static max().
template<typename Vec>
struct VecBinaryMax  {
    Vec operator()(Vec& x, Vec& y) const {
        Vec larger = Vec::max(x, y);
        return larger;
    }
};

// Vector functor returning the squared difference (x - y) * (x - y).
template<typename Vec>
struct VecBinarySqd  {
    Vec operator()(Vec& x, Vec& y) const {
        Vec delta = x - y;
        return delta * delta;
    }
};
namespace MNN {
// Scalar fallback: applies Func element-wise, reading Tin inputs and writing Tout outputs.
//
// broadcastIndex selects which operand is a single value repeated for every output:
//   0     -> input 0 (only inputRaw0[0] is read; its data count is 1, which covers
//            shapes such as (1, 1, ..., 1) as well as true scalars)
//   1     -> input 1 (only inputRaw1[0] is read)
//   other -> neither; both inputs supply elementSize elements.
// elementSize is the number of output elements written.
template<typename Tin, typename Tout, typename Func>
void execute(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int elementSize, int broadcastIndex) {
    Func op;
    const Tin* lhs = (const Tin*)inputRaw0;
    const Tin* rhs = (const Tin*)inputRaw1;
    Tout* out = (Tout*)outputRaw;

    switch (broadcastIndex) {
        case 0:
            for (int idx = 0; idx < elementSize; ++idx) {
                out[idx] = (Tout)(op(lhs[0], rhs[idx]));
            }
            break;
        case 1:
            for (int idx = 0; idx < elementSize; ++idx) {
                out[idx] = (Tout)(op(lhs[idx], rhs[0]));
            }
            break;
        default:
            for (int idx = 0; idx < elementSize; ++idx) {
                out[idx] = (Tout)(op(lhs[idx], rhs[idx]));
            }
            break;
    }
}

// Quantized element-wise binary op: dequantize both inputs, apply Func on floats,
// requantize and clamp the result.
//
// Per output element i:
//   in0 = (q0 - zeroPoint) * inputScale0[i]
//   in1 = (q1 - zeroPoint) * inputScale1[i]
//   out = clamp(round(Func(in0, in1) * outputScale[i]) + zeroPoint, minValue, maxValue)
// where q0 / q1 are element i of each input, or element 0 of the input selected by
// needBroadcast (0 -> input 0, 1 -> input 1, anything else -> no broadcast).
// The scale arrays are always indexed by i, including for the broadcast input.
//
// With MNN_USE_SSE the buffers are handled as uint8 with zero point 128 and range
// [0, 255]; otherwise as int8 with zero point 0 and range [-128, 127].
// With MNN_USE_NEON the element count is elementSize * 4.
// Tin and Tout are not used by the body.
template<typename Tin, typename Tout, typename Func>
void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
    Func op;
#ifdef MNN_USE_NEON
    const int count = elementSize * 4;
#else
    const int count = elementSize;
#endif

#ifdef MNN_USE_SSE
    const int zeroPoint = 128;
    const int maxValue = 255;
    const int minValue = 0;
    const uint8_t* lhs = (const uint8_t*)inputRaw0;
    const uint8_t* rhs = (const uint8_t*)inputRaw1;
    uint8_t* out = (uint8_t*)outputRaw;
#else
    const int zeroPoint = 0;
    const int maxValue = 127;
    const int minValue = -128;
    const int8_t* lhs = (const int8_t*)inputRaw0;
    const int8_t* rhs = (const int8_t*)inputRaw1;
    int8_t* out = (int8_t*)outputRaw;
#endif

    const bool lhsIsBroadcast = (needBroadcast == 0);
    const bool rhsIsBroadcast = (needBroadcast == 1);
    for (int i = 0; i < count; ++i) {
        // A broadcast operand always contributes its first element.
        const int lhsIdx = lhsIsBroadcast ? 0 : i;
        const int rhsIdx = rhsIsBroadcast ? 0 : i;
        float inp0 = (lhs[lhsIdx] - zeroPoint) * inputScale0[i];
        float inp1 = (rhs[rhsIdx] - zeroPoint) * inputScale1[i];
        float output = op(inp0, inp1);

        int value = (int)roundf(output * outputScale[i]) + zeroPoint;
        value = std::max(minValue, std::min(value, maxValue));
        out[i] = value;
    }
}

// Maps a BinaryOpOperation_* value to the executeVec instantiation for vector type V
// and lane count `pack`. Covers ADD, SUB, MUL, MINIMUM, MAXIMUM and SquaredDifference;
// every other op type returns nullptr.
template<typename V, int pack>
MNNBinaryExecute selectVector(int type) {
    MNNBinaryExecute kernel = nullptr;
    switch (type) {
        case BinaryOpOperation_ADD:
            kernel = executeVec<VecBinaryAdd<V>, V, pack>;
            break;
        case BinaryOpOperation_SUB:
            kernel = executeVec<VecBinarySub<V>, V, pack>;
            break;
        case BinaryOpOperation_MUL:
            kernel = executeVec<VecBinaryMul<V>, V, pack>;
            break;
        case BinaryOpOperation_MINIMUM:
            kernel = executeVec<VecBinaryMin<V>, V, pack>;
            break;
        case BinaryOpOperation_MAXIMUM:
            kernel = executeVec<VecBinaryMax<V>, V, pack>;
            break;
        case BinaryOpOperation_SquaredDifference:
            kernel = executeVec<VecBinarySqd<V>, V, pack>;
            break;
        default:
            break;
    }
    return kernel;
}
};
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								#include <math.h>
 								#include <algorithm>
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								#include "compute/CommonOptFunction.h"
 								#include "MNN_generated.h"
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryMax {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return std::max(x, y);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryMin {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return std::min(x, y);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryMul {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return x * y;
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryAdd {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return x + y;
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinarySub {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return x - y;
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryRealDiv {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return x / y;
 								    }
 								};
-												[MNN:Sync] A few bugfixes
    1. 支持 Onnx If 空子图的情况（这种情况是条件判断一定为真或假）
    2. 修正 Where 算子在 zeroshape 下维度计算出错的问题
    3. 修正 Reduce 计算 zeroshape 的非 prod 情况
    4. 修正 arch64-linux 上编译错误
    5. 修正 头文件 NNAPI 的注释错误
    6, 部分训练相关问题修正

											
										
										
											2022-12-04 15:17:36 +08:00
+								/**
 								 Ref from onnxruntime/onnxruntime/core/providers/cpu/math/element_wise_ops.cc :: Modulus
 								 */
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal gitlab
Main Feature:
1. Add OpenCV API and Numpy API Support
2. Protobuf move into MNN
3. Add more op for torchscript convert
4. Add recompute to speed up geometry compute
5. Add ModuleBasic Test

											
										
										
											2021-11-30 10:10:53 +08:00
+								struct BinaryModInt {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
-												[MNN:Sync] A few bugfixes
    1. 支持 Onnx If 空子图的情况（这种情况是条件判断一定为真或假）
    2. 修正 Where 算子在 zeroshape 下维度计算出错的问题
    3. 修正 Reduce 计算 zeroshape 的非 prod 情况
    4. 修正 arch64-linux 上编译错误
    5. 修正 头文件 NNAPI 的注释错误
    6, 部分训练相关问题修正

											
										
										
											2022-12-04 15:17:36 +08:00
+								        auto res = x % y;
 								        if ((res < 0 && y > 0) || (res > 0 && y < 0)) {
 								            res += y;
 								        }
 								        return (_ErrorCode)res;
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    }
 								};
-												[MNN:Sync] Sync internal gitlab
Main Feature:
1. Add OpenCV API and Numpy API Support
2. Protobuf move into MNN
3. Add more op for torchscript convert
4. Add recompute to speed up geometry compute
5. Add ModuleBasic Test

											
										
										
											2021-11-30 10:10:53 +08:00
+								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryMod {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return fmodf(x, y);
 								    }
 								};
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryGreater {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x > y) ? 1 : 0);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryLess {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x < y) ? 1 : 0);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryGreaterEqual {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x >= y) ? 1 : 0);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryLessEqual {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x <= y) ? 1 : 0);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryEqual {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x == y) ? 1 : 0);
 								    }
 								};
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryFloorDiv {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								        return floor(static_cast<double>(x) / y);
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    }
 								};
 								// Element-wise floored-modulo functor: evaluates x - floor(x / y) * y and returns it as _ErrorCode.
 								// NOTE(review): x / y uses the operand types' own division, so for integral _Arg1/_Arg2 the
 								// quotient is already truncated before floor() sees it -- presumably this is only
 								// instantiated with floating-point operands; confirm against callers.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryFloorMod {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return x - floor(x / y) * y;
 								    }
 								};
 								// Element-wise squared-difference functor: returns (x - y) * (x - y) as _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinarySquaredDifference {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (x - y) * (x - y);
 								    }
 								};
 								// Element-wise power functor: returns pow(x, y) (x raised to the power y) as _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryPow {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return pow(x, y);
 								    }
 								};
 								// Element-wise two-argument arctangent functor: returns atan2(x, y) as _ErrorCode.
 								// The first operand x is forwarded as atan2's first argument and y as its second.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryAtan2 {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
-												[MNN:Sync] Sync Internal 2.4.1

											
										
										
											2023-03-20 11:32:29 +08:00
+								        return atan2(x, y);
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    }
 								};
 								// Element-wise logical OR functor: returns 1 when x || y evaluates to true, otherwise 0,
 								// converted to _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryLogicalOr {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x || y) ? 1 : 0);
 								    }
 								};
-												[MNN:Sync] Sync internal gitlab

											
										
										
											2022-01-04 10:50:40 +08:00
+								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryLogicalXor {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x ^ y) ? 1 : 0);
 								    }
 								};
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
 								// Element-wise inequality functor: returns 1 when x != y, otherwise 0, converted to _ErrorCode.
+								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
-												[MNN:Sync] Sync internal Gitlab

											
										
										
											2021-04-08 15:34:23 +08:00
+								struct BinaryNotEqual {
-												[PATCH 14/19] [Arm82:Feature] Support All Binary of float for arm82

											
										
										
											2021-01-07 17:08:34 +08:00
+								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)((x != y) ? 1 : 0);
 								    }
 								};
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
-												[MNN:Sync] Sync internal gitlab

											
										
										
											2022-01-04 10:50:40 +08:00
+								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryLeftShift {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        return (_ErrorCode)(x << y);
 								    }
 								};
 								// Element-wise bitwise AND functor: yields x & y converted to _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryBitwiseAnd {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        const auto masked = x & y;
 								        return static_cast<_ErrorCode>(masked);
 								    }
 								};
 								// Element-wise right-shift functor: yields x >> y converted to _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryRightShift {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        const auto shifted = x >> y;
 								        return static_cast<_ErrorCode>(shifted);
 								    }
 								};
 								// Element-wise bitwise OR functor: yields x | y converted to _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryBitwiseOr {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        const auto merged = x | y;
 								        return static_cast<_ErrorCode>(merged);
 								    }
 								};
 								// Element-wise bitwise XOR functor: yields x ^ y converted to _ErrorCode.
 								template <typename _Arg1, typename _Arg2, typename _ErrorCode>
 								struct BinaryBitwiseXor {
 								    _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const {
 								        const auto toggled = x ^ y;
 								        return static_cast<_ErrorCode>(toggled);
 								    }
 								};
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								template<typename Func, typename V, int pack>
 								void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int elementSize, int needBroadcastIndex) {
 								    Func compute;
 								    const int sizeDivUnit = elementSize / pack;
 								    const int remainCount = elementSize - sizeDivUnit * pack;
 								    auto src0 = (const float*)(inputRaw0);
 								    auto src1 = (const float*)(inputRaw1);
 								    auto dst = (float*)outputRaw;
 								    if (-1 == needBroadcastIndex) {
 								        if (sizeDivUnit > 0) {
 								            for (int i = 0; i < sizeDivUnit; ++i) {
 								                V a = V::load(src0);
 								                V b = V::load(src1);
 								                V::save(dst, compute(a, b));
 								                src0 += pack;
 								                src1 += pack;
 								                dst += pack;
 								            }
 								        }
 								        if (remainCount > 0) {
 								            float tempSrc0[pack];
 								            float tempSrc1[pack];
 								            float tempDst[pack];
 								            ::memcpy(tempSrc0, src0, remainCount * sizeof(float));
 								            ::memcpy(tempSrc1, src1, remainCount * sizeof(float));
 								            V a = V::load(tempSrc0);
 								            V b = V::load(tempSrc1);
 								            V::save(tempDst, compute(a, b));
 								            ::memcpy(dst, tempDst, remainCount * sizeof(float));
 								        }
 								    } else if (0 == needBroadcastIndex) {
 								        const float srcValue0 = src0[0];
 								        V a = V(srcValue0);
 								        if (sizeDivUnit > 0) {
 								            for (int i = 0; i < sizeDivUnit; ++i) {
 								                const auto src1Ptr = src1;
 								                auto dstPtr = dst;
 								                V b = V::load(src1Ptr);
 								                V::save(dstPtr, compute(a, b));
 								                src1 += pack;
 								                dst += pack;
 								            }
 								        }
 								        if (remainCount > 0) {
 								            float tempSrc1[pack];
 								            float tempDst[pack];
 								            ::memcpy(tempSrc1, src1, remainCount * sizeof(float));
 								            V b = V::load(tempSrc1);
 								            V::save(tempDst, compute(a, b));
 								            ::memcpy(dst, tempDst, remainCount * sizeof(float));
 								        }
 								    } else {
 								        const float srcValue1 = src1[0];
 								        V b = V(srcValue1);
 								        if (sizeDivUnit > 0) {
 								            for (int i = 0; i < sizeDivUnit; ++i) {
 								                const auto src0Ptr = src0;
 								                auto dstPtr = dst;
 								                V a = V::load(src0Ptr);
 								                V::save(dstPtr, compute(a, b));
 								                src0 += pack;
 								                dst += pack;
 								            }
 								        }
 								        if (remainCount > 0) {
 								            float tempSrc0[pack];
 								            float tempDst[pack];
 								            ::memcpy(tempSrc0, src0, remainCount * sizeof(float));
 								            V a = V::load(tempSrc0);
 								            V::save(tempDst, compute(a, b));
 								            ::memcpy(dst, tempDst, remainCount * sizeof(float));
 								        }
 								    }
 								}
 								// Vector functor: lane-wise sum of x and y, using Vec's operator+.
 								template<typename Vec>
 								struct VecBinaryAdd  {
 								    Vec operator()(Vec& x, Vec& y) const {
 								        Vec sum = x + y;
 								        return sum;
 								    }
 								};
 								// Vector functor: lane-wise difference x - y, using Vec's operator-.
 								template<typename Vec>
 								struct VecBinarySub  {
 								    Vec operator()(Vec& x, Vec& y) const {
 								        Vec difference = x - y;
 								        return difference;
 								    }
 								};
 								// Vector functor: lane-wise product of x and y, using Vec's operator*.
 								template<typename Vec>
 								struct VecBinaryMul  {
 								    Vec operator()(Vec& x, Vec& y) const {
 								        Vec product = x * y;
 								        return product;
 								    }
 								};
 								// Vector functor: forwards to Vec::min(x, y).
 								template<typename Vec>
 								struct VecBinaryMin  {
 								    Vec operator()(Vec& x, Vec& y) const {
 								        Vec smaller = Vec::min(x, y);
 								        return smaller;
 								    }
 								};
 								// Vector functor: forwards to Vec::max(x, y).
 								template<typename Vec>
 								struct VecBinaryMax  {
 								    Vec operator()(Vec& x, Vec& y) const {
 								        Vec larger = Vec::max(x, y);
 								        return larger;
 								    }
 								};
 								// Vector functor: lane-wise squared difference, (x - y) * (x - y).
 								template<typename Vec>
 								struct VecBinarySqd  {
 								    Vec operator()(Vec& x, Vec& y) const {
 								        Vec squared = (x - y) * (x - y);
 								        return squared;
 								    }
 								};
 								namespace MNN {
 								// Scalar fallback: applies Func to elementSize pairs of Tin values and stores each
 								// result as Tout.
 								//   broadcastIndex == 0: inputRaw0 contributes only its first element
 								//   broadcastIndex == 1: inputRaw1 contributes only its first element
 								//   anything else:       both inputs are read element by element
 								// A broadcast input need not be a true scalar; it only has to hold a single element.
 								template<typename Tin, typename Tout, typename Func>
 								void execute(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int elementSize, int broadcastIndex) {
 								    Func op;
 								    const Tin* lhs = static_cast<const Tin*>(inputRaw0);
 								    const Tin* rhs = static_cast<const Tin*>(inputRaw1);
 								    Tout* result   = static_cast<Tout*>(outputRaw);
 								    if (broadcastIndex == 0) {
 								        for (int idx = 0; idx < elementSize; ++idx) {
 								            result[idx] = (Tout)(op(lhs[0], rhs[idx]));
 								        }
 								        return;
 								    }
 								    if (broadcastIndex == 1) {
 								        for (int idx = 0; idx < elementSize; ++idx) {
 								            result[idx] = (Tout)(op(lhs[idx], rhs[0]));
 								        }
 								        return;
 								    }
 								    for (int idx = 0; idx < elementSize; ++idx) {
 								        result[idx] = (Tout)(op(lhs[idx], rhs[idx]));
 								    }
 								}
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
 								// Element-wise binary op on quantized 8-bit buffers. Each operand is dequantized as
 								// (value - zeroPoint) * scale, combined with Func, then requantized as
 								// roundf(result * outputScale[i]) + zeroPoint and clamped to [minValue, maxValue].
 								// needBroadcast == 0 reuses inputData0[0] for every element, needBroadcast == 1 reuses
 								// inputData1[0]; any other value pairs the inputs element by element. The scale arrays
 								// are indexed by i in every branch. Tin and Tout are not referenced in the body.
+								template<typename Tin, typename Tout, typename Func>
 								void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
 								    Func f;
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								    int size = elementSize;
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
 								    // NEON builds iterate over four times elementSize values -- presumably elementSize
 								    // counts 4-wide packs there; TODO confirm against the caller.
+								#ifdef MNN_USE_NEON
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								    size *= 4;
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								#endif
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								    float inp0 = 0, inp1 = 0, output = 0;
 								    // SSE builds view the buffers as unsigned bytes with zero point 128; all other
 								    // builds use signed bytes with zero point 0.
 								#ifdef MNN_USE_SSE
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								    const int zeroPoint = 128;
 								    const int maxValue = 255;
 								    const int minValue = 0;
 								    const uint8_t* inputData0 = (uint8_t*)inputRaw0;
 								    const uint8_t* inputData1 = (uint8_t*)inputRaw1;
 								    uint8_t* outputData = (uint8_t*)outputRaw;
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								#else
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								    const int zeroPoint = 0;
 								    const int maxValue = 127;
 								    const int minValue = -128;
 								    const int8_t* inputData0 = (int8_t*)inputRaw0;
 								    const int8_t* inputData1 = (int8_t*)inputRaw1;
 								    int8_t* outputData = (int8_t*)outputRaw;
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								#endif
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								    for (int i = 0; i < size; ++i) {
 								        if (needBroadcast == 0) {
 								            inp0 = (inputData0[0]- zeroPoint) * inputScale0[i];
 								            inp1 = (inputData1[i]- zeroPoint) * inputScale1[i];
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								            output = f(inp0, inp1);
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								        } else if (needBroadcast == 1) {
 								            inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
 								            inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								            output = f(inp0, inp1);
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								        } else {
 								            inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
 								            inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								            output = f(inp0, inp1);
 								        }
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
 								        // Requantize, then saturate to the representable range before storing.
+								        int value = (int)roundf(output * outputScale[i]) + zeroPoint;
 								        if (value > maxValue) {
 								            value = maxValue;
 								        }
 								        if (value < minValue) {
 								            value = minValue;
 								        }
 								        outputData[i] = value;
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								    }
 								}
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								template<typename V, int pack>
 								MNNBinaryExecute selectVector(int type) {
 								    switch (type) {
 								        case BinaryOpOperation_ADD:
 								            return executeVec<VecBinaryAdd<V>, V, pack>;
 								        case BinaryOpOperation_SUB:
 								            return executeVec<VecBinarySub<V>, V, pack>;
 								        case BinaryOpOperation_MUL:
 								            return executeVec<VecBinaryMul<V>, V, pack>;
 								        case BinaryOpOperation_MINIMUM:
 								            return executeVec<VecBinaryMin<V>, V, pack>;
 								        case BinaryOpOperation_MAXIMUM:
 								            return executeVec<VecBinaryMax<V>, V, pack>;
 								        case BinaryOpOperation_SquaredDifference:
 								            return executeVec<VecBinarySqd<V>, V, pack>;
 								    }
 								    return nullptr;
 								}
 								};