MNN/source/backend/cpu/compute/ResizeFunction.cpp

//
//  ResizeFunction.cpp
//  MNN
//
//  Created by MNN on 2018/07/23.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/compute/ResizeFunction.h"
#include <math.h>
#include "core/AutoStorage.h"
#include "core/Macro.h"
#include "math/Vec.hpp"

using namespace MNN::Math;
using Vec4 = Vec<float, 4>;
using Vec16 = Vec<float, 16>;
using Vec8 = Vec<float, 8>;
// F = -0.5
static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {
    Vec4 a = (B - C) + (B - A) * 0.5f  + (D - C) * 0.5f;
    Vec4 b = C - ((B - A) + (B - C)) - (B + D) * 0.5f;

    Vec4 c = (C - A) * 0.5f;
    Vec4 d = B;
    return ((a * t + b) * t + c) * t + d;
}

// F = -0.75
template<typename T, int pack>
static Vec<T, pack> CubicInterpolation2(Vec<T, pack>& A, Vec<T, pack>& B, Vec<T, pack>& C, Vec<T, pack>& D, float t) {
    float b0 = 1.0f - 2.25f * t * t + 1.25f * t * t * t;
    float c0 = 1.0f - 2.25f * (1.0f - t) * (1.0f - t) + 1.25 * (1.0f - t) * (1.0f - t) * (1.0f - t);
    auto t_a = 1.0f + t;
    auto t_d = 2.0f - t;
    auto a0 = 3.0f - 6.0f * (t_a) + 5.0f * 0.75 * t_a * t_a - 0.75f * t_a * t_a * t_a;
    auto d0 = 3.0f - 6.0f * (t_d) + 5.0f * 0.75 * t_d * t_d - 0.75f * t_d * t_d * t_d;
    
    return A * a0 + B * b0 + C * c0 + D * d0;
}

void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor,
                                size_t number) {
    int pack = 4;
    for (int i = 0; i < number; ++i) {
        float f = factor[i];
        Vec4 df(f);
        Vec4 sf(1.0f - f);
        Vec4 A = Vec4::load(src + position[2 * i] * pack);
        Vec4 B = Vec4::load(src + position[2 * i + 1] * pack);
        Vec4 Result = B * df + A * sf;
        Vec4::save(dst + pack * i, B * df + A * sf);
    }
}

void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) {
    int pack = 4;
    Vec4 df(*t);
    Vec4 sf(1.0f - *t);
    for (int i = 0; i < number; ++i) {
        Vec4 value = Vec4::load(A + pack * i) * sf + Vec4::load(B + pack * i) * df;
        Vec4::save(dst + pack * i, value);
    }
}

void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number) {
    for (int i = 0; i < number; ++i) {
        float f = factor[i];
        auto A        = Vec4::load(src + 4 * position[4 * i + 0]);
        auto B        = Vec4::load(src + 4 * position[4 * i + 1]);
        auto C        = Vec4::load(src + 4 * position[4 * i + 2]);
        auto D        = Vec4::load(src + 4 * position[4 * i + 3]);
        Vec4::save(dst + 4 * i, CubicInterpolation2(A, B, C, D, f));
    }
}

void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,
                    size_t number) {
    float f = *t;
    for (int i = 0; i < number; ++i) {
        auto a = Vec4::load(A + 4 * i);
        auto b = Vec4::load(B + 4 * i);
        auto c = Vec4::load(C + 4 * i);
        auto d = Vec4::load(D + 4 * i);
        Vec4::save(dst + 4 * i, CubicInterpolation2<float, 4>(a, b, c, d, f));
    }
}

#ifndef MNN_USE_NEON
void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) {
    int pack = 16;
    using  Vec16 = Vec<float, 16>;
#ifdef MNN_USE_SSE
    Vec16 zeroPointV(128);
    const uint8_t* srcPtr = (uint8_t*)src;
#else
    Vec16 zeroPointV(0);
    const int8_t* srcPtr = src;
#endif
    for (int i = 0; i < number; ++i) {
        float f = factor[i];
        auto A        = Vec16::load(srcPtr + pack * position[4 * i + 0]) - zeroPointV;
        auto B        = Vec16::load(srcPtr + pack * position[4 * i + 1]) - zeroPointV;
        auto C        = Vec16::load(srcPtr + pack * position[4 * i + 2]) - zeroPointV;
        auto D        = Vec16::load(srcPtr + pack * position[4 * i + 3]) - zeroPointV;
        auto val16 = CubicInterpolation2<float, 16>(A, B, C, D, f);
        Vec16::save(dst + pack * i, CubicInterpolation2<float, 16>(A, B, C, D, f));
    }
}

void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,
                    size_t number) {
    int pack = 16;
    using  Vec16 = Vec<float, 16>;
#ifdef MNN_USE_SSE
    uint8_t* dstPtr = (uint8_t*)dst;
    int offset = 128;
    int minValue = 0;
    int maxValue = 255;
#else
    int8_t* dstPtr = dst;
    int offset = 0;
    int minValue = -128;
    int maxValue = 127;
#endif
    float f = *t;
    for (int i = 0; i < number; ++i) {
        auto a = Vec16::load(A + pack * i);
        auto b = Vec16::load(B + pack * i);
        auto c = Vec16::load(C + pack * i);
        auto d = Vec16::load(D + pack * i);
        auto val16 = CubicInterpolation2<float, 16>(a, b, c, d, f);
        for (int j = 0; j < pack; ++j) {
            int val = (int)roundf(val16[j]) + offset;
            if (val > maxValue) {
                val = maxValue;
            }
            if (val < minValue) {
                val = minValue;
            }
            *(dstPtr + pack * i + j) = val;
        }
    }
}

void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor,
                                size_t number) {
#ifdef MNN_USE_SSE
    int offset = 128;
    const uint8_t* srcPtr = (uint8_t*)src;
#else
    int offset = 0;
    const int8_t* srcPtr = src;
#endif
    int pack = 8;
    for (int i = 0; i < number; ++i) {
        int16_t df = factor[i] * 128;
        int16_t sf = (1 - factor[i]) * 128;
        auto aPtr = srcPtr + position[2 * i] * pack;
        auto bPtr = srcPtr + position[2 * i + 1] * pack;
        for (int j = 0; j < pack; ++j) {
            int a = static_cast<int32_t>(*(aPtr + j) - offset);
            int b = static_cast<int32_t>(*(bPtr + j) - offset);
            int16_t val = static_cast<int16_t>(a * sf + b * df);
            *(dst + pack * i + j) = val;
        }
    }
}

void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) {
#ifdef MNN_USE_SSE
    int offset = 128;
    uint8_t* dstPtr = (uint8_t*)dst;
#else
    int offset = 0;
    int8_t* dstPtr = dst;
#endif
    int pack = 8;
    int16_t df = (*t) * 128;
    int16_t sf = (1 - *t) * 128;
    for (int i = 0; i < number; ++i) {
        auto aPtr = A + pack * i;
        auto bPtr = B + pack * i;
        for (int j = 0; j < pack; ++j) {
            int32_t val = *(aPtr + j) * sf + *(bPtr + j) * df;
            int8_t valOut = (val + (1<<13)) / (1 << 14);
            if (val < 0) {
                valOut = (val - (1 << 13)) / (1 << 14);
            }
            *(dstPtr+ pack * i + j) = valOut+ offset;
        }
    }
}

#endif
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// ResizeFunction.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2018/07/23.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

Update 2019-12-27 22:16:57 +08:00			`#include "backend/cpu/compute/ResizeFunction.h"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`#include <math.h>`
Update 2019-12-27 22:16:57 +08:00			`#include "core/AutoStorage.h"`
			`#include "core/Macro.h"`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`#include "math/Vec.hpp"`
beta 0.1.0 2019-04-17 10:49:11 +08:00
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`using namespace MNN::Math;`
[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00			`using Vec4 = Vec<float, 4>;`
			`using Vec16 = Vec<float, 16>;`
			`using Vec8 = Vec<float, 8>;`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`// F = -0.5`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`static Vec4 CubicInterpolation(Vec4& A, Vec4& B, Vec4& C, Vec4& D, float t) {`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`Vec4 a = (B - C) + (B - A) * 0.5f + (D - C) * 0.5f;`
			`Vec4 b = C - ((B - A) + (B - C)) - (B + D) * 0.5f;`
beta 0.1.0 2019-04-17 10:49:11 +08:00
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`Vec4 c = (C - A) * 0.5f;`
			`Vec4 d = B;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return ((a * t + b) * t + c) * t + d;`
			`}`

Github release 1.1.0 2020-11-05 16:41:56 +08:00			`// F = -0.75`
[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00			`template<typename T, int pack>`
			`static Vec<T, pack> CubicInterpolation2(Vec<T, pack>& A, Vec<T, pack>& B, Vec<T, pack>& C, Vec<T, pack>& D, float t) {`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`float b0 = 1.0f - 2.25f * t * t + 1.25f * t * t * t;`
			`float c0 = 1.0f - 2.25f * (1.0f - t) * (1.0f - t) + 1.25 * (1.0f - t) * (1.0f - t) * (1.0f - t);`
			`auto t_a = 1.0f + t;`
			`auto t_d = 2.0f - t;`
			`auto a0 = 3.0f - 6.0f * (t_a) + 5.0f * 0.75 * t_a * t_a - 0.75f * t_a * t_a * t_a;`
			`auto d0 = 3.0f - 6.0f * (t_d) + 5.0f * 0.75 * t_d * t_d - 0.75f * t_d * t_d * t_d;`

			`return A * a0 + B * b0 + C * c0 + D * d0;`
			`}`

[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00			`void CPUBilinearSampleC4(const float* src, float* dst, const int32_t* position, const float* factor,`
			`size_t number) {`
			`int pack = 4;`
			`for (int i = 0; i < number; ++i) {`
			`float f = factor[i];`
			`Vec4 df(f);`
			`Vec4 sf(1.0f - f);`
			`Vec4 A = Vec4::load(src + position[2 * i] * pack);`
			`Vec4 B = Vec4::load(src + position[2 * i + 1] * pack);`
			`Vec4 Result = B * df + A * sf;`
			`Vec4::save(dst + pack * i, B * df + A * sf);`
			`}`
			`}`

			`void CPUBilinearLineC4(float* dst, const float* A, const float* B, const float* t, size_t number) {`
			`int pack = 4;`
			`Vec4 df(*t);`
			`Vec4 sf(1.0f - *t);`
			`for (int i = 0; i < number; ++i) {`
			`Vec4 value = Vec4::load(A + pack * i) * sf + Vec4::load(B + pack * i) * df;`
			`Vec4::save(dst + pack * i, value);`
			`}`
			`}`

beta 0.1.0 2019-04-17 10:49:11 +08:00			`void MNNCubicSampleC4(const float* src, float* dst, int32_t* position, const float* factor, size_t number) {`
			`for (int i = 0; i < number; ++i) {`
			`float f = factor[i];`
Github release 1.1.0 2020-11-05 16:41:56 +08:00			`auto A = Vec4::load(src + 4 * position[4 * i + 0]);`
			`auto B = Vec4::load(src + 4 * position[4 * i + 1]);`
			`auto C = Vec4::load(src + 4 * position[4 * i + 2]);`
			`auto D = Vec4::load(src + 4 * position[4 * i + 3]);`
			`Vec4::save(dst + 4 * i, CubicInterpolation2(A, B, C, D, f));`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
			`}`

			`void MNNCubicLineC4(float* dst, const float* A, const float* B, const float* C, const float* D, float* t,`
			`size_t number) {`
			`float f = *t;`
			`for (int i = 0; i < number; ++i) {`
Synchronize internal github for version 1.2.0 (#1518) 2021-06-11 17:17:13 +08:00			`auto a = Vec4::load(A + 4 * i);`
			`auto b = Vec4::load(B + 4 * i);`
			`auto c = Vec4::load(C + 4 * i);`
			`auto d = Vec4::load(D + 4 * i);`
[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00			`Vec4::save(dst + 4 * i, CubicInterpolation2<float, 4>(a, b, c, d, f));`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`}`
			`}`
[MNN:Sync] Sync Internal 2.5.3 2023-06-16 09:42:45 +08:00
			`#ifndef MNN_USE_NEON`
			`void MNNCubicSampleC16(const int8_t* src, float* dst, int32_t* position, const float* factor, size_t number) {`
			`int pack = 16;`
			`using Vec16 = Vec<float, 16>;`
			`#ifdef MNN_USE_SSE`
			`Vec16 zeroPointV(128);`
			`const uint8_t* srcPtr = (uint8_t*)src;`
			`#else`
			`Vec16 zeroPointV(0);`
			`const int8_t* srcPtr = src;`
			`#endif`
			`for (int i = 0; i < number; ++i) {`
			`float f = factor[i];`
			`auto A = Vec16::load(srcPtr + pack * position[4 * i + 0]) - zeroPointV;`
			`auto B = Vec16::load(srcPtr + pack * position[4 * i + 1]) - zeroPointV;`
			`auto C = Vec16::load(srcPtr + pack * position[4 * i + 2]) - zeroPointV;`
			`auto D = Vec16::load(srcPtr + pack * position[4 * i + 3]) - zeroPointV;`
			`auto val16 = CubicInterpolation2<float, 16>(A, B, C, D, f);`
			`Vec16::save(dst + pack * i, CubicInterpolation2<float, 16>(A, B, C, D, f));`
			`}`
			`}`

			`void MNNCubicLineC16(int8_t* dst, const float* A, const float* B, const float* C, const float* D, float* t,`
			`size_t number) {`
			`int pack = 16;`
			`using Vec16 = Vec<float, 16>;`
			`#ifdef MNN_USE_SSE`
			`uint8_t* dstPtr = (uint8_t*)dst;`
			`int offset = 128;`
			`int minValue = 0;`
			`int maxValue = 255;`
			`#else`
			`int8_t* dstPtr = dst;`
			`int offset = 0;`
			`int minValue = -128;`
			`int maxValue = 127;`
			`#endif`
			`float f = *t;`
			`for (int i = 0; i < number; ++i) {`
			`auto a = Vec16::load(A + pack * i);`
			`auto b = Vec16::load(B + pack * i);`
			`auto c = Vec16::load(C + pack * i);`
			`auto d = Vec16::load(D + pack * i);`
			`auto val16 = CubicInterpolation2<float, 16>(a, b, c, d, f);`
			`for (int j = 0; j < pack; ++j) {`
			`int val = (int)roundf(val16[j]) + offset;`
			`if (val > maxValue) {`
			`val = maxValue;`
			`}`
			`if (val < minValue) {`
			`val = minValue;`
			`}`
			`(dstPtr + pack i + j) = val;`
			`}`
			`}`
			`}`

			`void MNNBilinearSampleC8(const int8_t* src, int16_t* dst, const int32_t* position, const float* factor,`
			`size_t number) {`
			`#ifdef MNN_USE_SSE`
			`int offset = 128;`
			`const uint8_t* srcPtr = (uint8_t*)src;`
			`#else`
			`int offset = 0;`
			`const int8_t* srcPtr = src;`
			`#endif`
			`int pack = 8;`
			`for (int i = 0; i < number; ++i) {`
			`int16_t df = factor[i] * 128;`
			`int16_t sf = (1 - factor[i]) * 128;`
			`auto aPtr = srcPtr + position[2 * i] * pack;`
			`auto bPtr = srcPtr + position[2 * i + 1] * pack;`
			`for (int j = 0; j < pack; ++j) {`
			`int a = static_cast<int32_t>(*(aPtr + j) - offset);`
			`int b = static_cast<int32_t>(*(bPtr + j) - offset);`
			`int16_t val = static_cast<int16_t>(a * sf + b * df);`
			`(dst + pack i + j) = val;`
			`}`
			`}`
			`}`

			`void MNNBilinearLineC8(int8_t* dst, const int16_t* A, const int16_t* B, const float* t, size_t number) {`
			`#ifdef MNN_USE_SSE`
			`int offset = 128;`
			`uint8_t* dstPtr = (uint8_t*)dst;`
			`#else`
			`int offset = 0;`
			`int8_t* dstPtr = dst;`
			`#endif`
			`int pack = 8;`
			`int16_t df = (t) 128;`
			`int16_t sf = (1 - t) 128;`
			`for (int i = 0; i < number; ++i) {`
			`auto aPtr = A + pack * i;`
			`auto bPtr = B + pack * i;`
			`for (int j = 0; j < pack; ++j) {`
			`int32_t val = (aPtr + j) sf + (bPtr + j) df;`
			`int8_t valOut = (val + (1<<13)) / (1 << 14);`
			`if (val < 0) {`
			`valOut = (val - (1 << 13)) / (1 << 14);`
			`}`
			`(dstPtr+ pack i + j) = valOut+ offset;`
			`}`
			`}`
			`}`

			`#endif`