// MNN/source/backend/cpu/CPUImageProcess.cpp

//
//  CPUImageProcess.cpp
//  MNN
//
//  Created by MNN on 2021/10/27.
//  Copyright © 2018 Alibaba. All rights reserved.
//

#include "backend/cpu/CPUImageProcess.hpp"
#include "compute/ImageProcessFunction.hpp"
#include <string.h>
#include <mutex>
#include "core/Macro.h"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
#ifdef MNN_USE_SSE
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#endif
#include <map>
#include <utility>

namespace MNN {
// Tile width, in pixels, used when a row is processed in pieces; the scratch buffers in this file
// are allocated as 4 * CACHE_SIZE bytes.
#define CACHE_SIZE 256
// Expands to an `if` that returns `func` when (source, dest) equals
// (ImageFormatType_<src>, ImageFormatType_<dst>). Variables named `source` and `dest` must be in
// scope at the expansion site, and the caller supplies the terminating ';'.
#define CHECKFORMAT(src, dst, func) if (source == ImageFormatType_##src && dest == ImageFormatType_##dst) return func
// Same test as CHECKFORMAT, but returns `coreFunctions->func` when `coreFunctions` is non-null and
// falls back to the free function `func` otherwise. The expansion already ends with ';'.
#define CHECKFORMAT_CORE(src, dst, func) if (source == ImageFormatType_##src && dest == ImageFormatType_##dst) return coreFunctions ? coreFunctions->func : func;

// Picks the row converter for a (source, dest) pixel-format pair.
// NV12 and I420 sources are looked up as NV21, so all three share the same converters.
// Returns nullptr when the pair is not listed below.
BLITTER CPUImageProcess::choose(ImageFormatType source, ImageFormatType dest) {
    // The YUV layouts differ only in how they are sampled, not in how they are converted.
    if (source == ImageFormatType_YUV_NV12 || source == ImageFormatType_YUV_I420) {
        source = ImageFormatType_YUV_NV21;
    }

    switch (source) {
        case ImageFormatType_RGBA:
            CHECKFORMAT(RGBA, RGBA, MNNCopyC4);
            CHECKFORMAT_CORE(RGBA, BGRA, MNNRGBAToBGRA);
            CHECKFORMAT(RGBA, BGR, MNNRGBAToBGR);
            CHECKFORMAT(RGBA, RGB, MNNBGRAToBGR);
            CHECKFORMAT(RGBA, GRAY, MNNRGBAToGRAY);
            break;

        case ImageFormatType_BGRA:
            CHECKFORMAT_CORE(BGRA, RGBA, MNNRGBAToBGRA);
            CHECKFORMAT(BGRA, BGRA, MNNCopyC4);
            CHECKFORMAT(BGRA, BGR, MNNBGRAToBGR);
            CHECKFORMAT(BGRA, RGB, MNNRGBAToBGR);
            CHECKFORMAT(BGRA, GRAY, MNNBGRAToGRAY);
            break;

        case ImageFormatType_RGB:
            CHECKFORMAT(RGB, RGB, MNNCopyC3);
            CHECKFORMAT(RGB, BGR, MNNRGBToBGR);
            CHECKFORMAT(RGB, GRAY, MNNRGBToGRAY);
            CHECKFORMAT(RGB, RGBA, MNNC3ToC4);
            CHECKFORMAT(RGB, YCrCb, MNNRGBToCrCb);
            CHECKFORMAT(RGB, YUV, MNNRGBToYUV);
            CHECKFORMAT(RGB, XYZ, MNNRGBToXYZ);
            CHECKFORMAT(RGB, HSV, MNNRGBToHSV);
            CHECKFORMAT(RGB, BGR555, MNNRGBToBGR555);
            CHECKFORMAT(RGB, BGR565, MNNRGBToBGR565);
            CHECKFORMAT(RGB, HSV_FULL, MNNRGBToHSV_FULL);
            break;

        case ImageFormatType_BGR:
            CHECKFORMAT(BGR, BGR, MNNCopyC3);
            CHECKFORMAT(BGR, RGB, MNNRGBToBGR);
            CHECKFORMAT(BGR, GRAY, MNNBRGToGRAY);
            CHECKFORMAT(BGR, BGRA, MNNC3ToC4);
            CHECKFORMAT(BGR, YCrCb, MNNBGRToCrCb);
            CHECKFORMAT(BGR, YUV, MNNBGRToYUV);
            CHECKFORMAT(BGR, XYZ, MNNBGRToXYZ);
            CHECKFORMAT(BGR, HSV, MNNBGRToHSV);
            CHECKFORMAT(BGR, BGR555, MNNBGRToBGR555);
            CHECKFORMAT(BGR, BGR565, MNNBGRToBGR565);
            CHECKFORMAT(BGR, HSV_FULL, MNNBGRToHSV_FULL);
            break;

        case ImageFormatType_GRAY:
            CHECKFORMAT(GRAY, RGBA, MNNGRAYToC4);
            CHECKFORMAT(GRAY, BGRA, MNNGRAYToC4);
            CHECKFORMAT(GRAY, BGR, MNNGRAYToC3);
            CHECKFORMAT(GRAY, RGB, MNNGRAYToC3);
            CHECKFORMAT(GRAY, GRAY, MNNCopyC1);
            break;

        case ImageFormatType_YUV_NV21:
            CHECKFORMAT(YUV_NV21, GRAY, MNNCopyC1);
            CHECKFORMAT_CORE(YUV_NV21, RGB, MNNNV21ToRGB);
            CHECKFORMAT_CORE(YUV_NV21, BGR, MNNNV21ToBGR);
            CHECKFORMAT_CORE(YUV_NV21, RGBA, MNNNV21ToRGBA);
            CHECKFORMAT_CORE(YUV_NV21, BGRA, MNNNV21ToBGRA);
            break;

        default:
            break;
    }
    // No converter registered for this pair.
    return nullptr;
}

// Picks the fill routine keyed by the number of bytes per pixel (4, 3 or 1).
// Any other size yields nullptr.
BLITTER CPUImageProcess::choose(int channelByteSize) {
    if (4 == channelByteSize) {
        return MNNC4blitH;
    }
    if (3 == channelByteSize) {
        return MNNC3blitH;
    }
    if (1 == channelByteSize) {
        return MNNC1blitH;
    }
    return nullptr;
}

// Picks the sampler that reads source pixels for one output tile.
//
//  format   - pixel format of the source image.
//  type     - requested filter; only FilterType_BILINEAR gets a dedicated sampler, every other
//             value (and bilinear on a format without a bilinear sampler, e.g. the YUV formats)
//             falls through to nearest-neighbour.
//  identity - when true, a plain copy sampler is preferred if the format has one.
//
// Returns nullptr (after logging) when no sampler exists for the format, or when a 4-channel
// sampler is needed but `coreFunctions` is null.
SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool identity) {
    if (identity) {
        switch (format) {
            case ImageFormatType_RGBA:
            case ImageFormatType_BGRA:
                return MNNSamplerC4Copy;
            case ImageFormatType_GRAY:
                return MNNSamplerC1Copy;

            case ImageFormatType_RGB:
            case ImageFormatType_BGR:
                return MNNSamplerC3Copy;
            case ImageFormatType_YUV_NV21:
                return MNNSamplerNV21Copy;
            case ImageFormatType_YUV_NV12:
                return MNNSamplerNV12Copy;
            case ImageFormatType_YUV_I420:
                return MNNSamplerI420Copy;
            default:
                break;
        }
    }

    // The 4-channel bilinear / nearest samplers are only reachable through coreFunctions.
    // The format-converter lookup in this file tolerates a null coreFunctions, so guard here as
    // well instead of dereferencing a null pointer.
    const bool needsCoreSampler = (ImageFormatType_RGBA == format) || (ImageFormatType_BGRA == format);
    if (needsCoreSampler && !coreFunctions) {
        MNN_PRINT("Core functions are null, can't select sampler for format:%d, type:%d\n", format, type);
        return nullptr;
    }

    if (FilterType_BILINEAR == type) {
        switch (format) {
            case ImageFormatType_RGBA:
            case ImageFormatType_BGRA:
                return coreFunctions->MNNSamplerC4Bilinear;
            case ImageFormatType_GRAY:
                return MNNSamplerC1Bilinear;

            case ImageFormatType_RGB:
            case ImageFormatType_BGR:
                return MNNSamplerC3Bilinear;
            default:
                // No bilinear sampler for this format: fall back to nearest below.
                break;
        }
    }

    // Nearest
    switch (format) {
        case ImageFormatType_RGBA:
        case ImageFormatType_BGRA:
            return coreFunctions->MNNSamplerC4Nearest;
        case ImageFormatType_GRAY:
            return MNNSamplerC1Nearest;

        case ImageFormatType_RGB:
        case ImageFormatType_BGR:
            return MNNSamplerC3Nearest;
        case ImageFormatType_YUV_NV12:
            return MNNSamplerNV12Nearest;
        case ImageFormatType_YUV_NV21:
            return MNNSamplerNV21Nearest;
        case ImageFormatType_YUV_I420:
            return MNNSamplerI420Nearest;
        default:
            break;
    }
    MNN_PRINT("Don't support sampler for format:%d, type:%d", format, type);
    return nullptr;
}

// Picks the uint8 -> float row converter for pixels already in `format`.
// When dstBpp is 4, 1- and 3-channel formats use the converter that writes 4-channel output;
// otherwise the channel count is kept. Returns nullptr for formats not listed.
BLIT_FLOAT CPUImageProcess::choose(ImageFormatType format, int dstBpp) {
    const bool widenToFour = (4 == dstBpp);
    switch (format) {
        case ImageFormatType_GRAY:
            if (widenToFour) {
                return MNNC1ToFloatRGBA;
            }
            return MNNC1ToFloatC1;

        case ImageFormatType_RGBA:
        case ImageFormatType_BGRA:
            // Already 4 channels: same converter regardless of dstBpp.
            return MNNC4ToFloatC4;

        case ImageFormatType_RGB:
        case ImageFormatType_BGR:
            if (widenToFour) {
                return MNNC3ToFloatRGBA;
            }
            return MNNC3ToFloatC3;

        default:
            break;
    }
    return nullptr;
}

// Out-code bits: one bit per image edge a point may lie beyond. They are combined with bitwise
// OR/AND by the clipping helpers below, so they must stay distinct single-bit values.
// Declared constexpr so they cannot be modified at run time.
static constexpr int LEFT   = 1 << 0;
static constexpr int RIGHT  = 1 << 1;
static constexpr int TOP    = 1 << 2;
static constexpr int BOTTOM = 1 << 3;
// Builds the out-code of point `p` against the rectangle [0, iw - 1] x [0, ih - 1]:
// one bit is set for every edge the point lies strictly beyond; 0 means the point is inside.
inline static uint8_t _encode(const CV::Point& p, int iw, int ih) {
    uint8_t outcode = 0;
    // Each edge is tested independently, so the bits are accumulated rather than chosen exclusively.
    outcode |= (p.fX < 0) ? LEFT : 0;
    outcode |= (p.fX > iw - 1) ? RIGHT : 0;
    outcode |= (p.fY < 0) ? TOP : 0;
    outcode |= (p.fY > ih - 1) ? BOTTOM : 0;
    return outcode;
}
// Clips the segment points[0] -> points[1] (source-image coordinates) against the rectangle
// [0, iw - 1] x [0, ih - 1] using out-codes, and converts the clipped end points back to
// destination columns through `invert`.
//
//  points - two end points; modified in place while clipping.
//  invert - maps a source coordinate back to destination space; only its X result is used.
//  xStart - destination column of the first pixel of the tile.
//  count  - number of pixels in the tile.
//
// Returns (sta, end), tile-relative, with end <= count and sta <= end. When both end points lie
// beyond the same edge, sta == end.
static std::pair<int, int> _computeClip(CV::Point* points, int iw, int ih, const CV::Matrix& invert, int xStart, int count) {
    uint8_t headCode = _encode(points[0], iw, ih);
    uint8_t tailCode = _encode(points[1], iw, ih);
    int sta = 0;
    int end = count;

    // Snapshot of the unclipped segment; the slopes below are derived from it once.
    const float headX = points[0].fX;
    const float tailX = points[1].fX;
    const float headY = points[0].fY;
    const float tailY = points[1].fY;

    // dy/dx, forced to 0 when the segment is (nearly) vertical.
    float yPerX = 0;
    const float rawDx = tailX - headX;
    if (rawDx > 0.01f || rawDx < -0.01f) {
        yPerX = (tailY - headY) / (tailX - headX);
    }
    // dx/dy, forced to 0 when dy/dx is (nearly) zero.
    float xPerY = 0;
    if (yPerX > 0.01f || yPerX < -0.01f) {
        xPerY = (tailX - headX) / (tailY - headY);
    }

    while (headCode != 0 || tailCode != 0) {
        if ((headCode & tailCode) != 0) {
            // Both end points are beyond the same edge: nothing of the segment is inside.
            sta = end;
            break;
        }
        // Always move the head first; the tail is only touched once the head is inside.
        const bool clipHead = (headCode != 0);
        const int outcode   = clipHead ? headCode : tailCode;
        CV::Point& p        = points[clipHead ? 0 : 1];

        if ((LEFT & outcode) != 0) {
            p.fY = p.fY + yPerX * (0 - p.fX);
            p.fX = 0;
        } else if ((RIGHT & outcode) != 0) {
            p.fY = p.fY + yPerX * (iw - 1 - p.fX);
            p.fX = iw - 1;
        } else if ((BOTTOM & outcode) != 0) {
            p.fX = p.fX + xPerY * (ih - 1 - p.fY);
            p.fY = ih - 1;
        } else if ((TOP & outcode) != 0) {
            p.fX = p.fX + xPerY * (0 - p.fY);
            p.fY = 0;
        }

        // Project the moved point back to destination space to find the matching column.
        auto mapped = invert.mapXY(p.fX, p.fY);
        if (clipHead) {
            headCode = _encode(p, iw, ih);
            sta      = (int)::round(mapped.fX) - xStart;
        } else {
            tailCode = _encode(p, iw, ih);
            end      = (int)::floor(mapped.fX) - xStart + 1;
        }
    }

    // Keep the range inside the tile and non-inverted.
    if (end > count) {
        end = count;
    }
    if (sta > end) {
        sta = end;
    }
    return std::make_pair(sta, end);
}

// Caches the input/output geometry and selects the function pointers and scratch buffers that
// onExecute uses.
//
//  inputs[0]  - source image; a 3-dimensional tensor is read as (height, width, channel),
//               anything else through height()/width()/channel().
//  outputs[0] - destination tensor (not read in draw mode).
//
// Returns INPUT_DATA_ERROR when no sampler, format converter, float converter or (in draw mode)
// fill routine exists for the configured formats; NO_ERROR otherwise.
ErrorCode CPUImageProcess::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input = inputs[0];
    if (input->dimensions() == 3) {
        ih = input->length(0);
        iw = input->length(1);
        ic = input->length(2);
    } else {
        ih = input->height();
        iw = input->width();
        ic = input->channel();
    }
    if (draw) {
        // Draw mode only needs a fill routine keyed by bytes per pixel.
        blitter = choose(ic * inputs[0]->getType().bytes());
        if (nullptr == blitter) {
            // Same policy as the lookups below: an unsupported configuration is reported instead
            // of being accepted and then silently skipped at execution time.
            return INPUT_DATA_ERROR;
        }
        return NO_ERROR;
    }
    auto output = outputs[0];
    oh = output->height();
    ow = output->width();
    oc = output->channel();
    dtype = output->getType();

    // A copy sampler may be used only for an identity transform whose source covers the output.
    bool identity = transform.isIdentity() && iw >= ow && ih >= oh;
    // choose sampler
    sampler  = choose(sourceFormat, filterType, identity);
    if (nullptr == sampler) {
        return INPUT_DATA_ERROR;
    }
    // choose blitter
    if (sourceFormat != destFormat) {
        blitter = choose(sourceFormat, destFormat);
        if (nullptr == blitter) {
            return INPUT_DATA_ERROR;
        }
        // Scratch for one sampled tile before format conversion (4 * CACHE_SIZE bytes).
        // NOTE(review): the result of onAcquireBuffer is not checked and no matching release is
        // visible in this file - confirm this is the intended buffer lifetime.
        if (backend()) {
            cacheBuffer.reset(Tensor::createDevice<uint8_t>(std::vector<int>{4 * CACHE_SIZE}));
            backend()->onAcquireBuffer(cacheBuffer.get(), Backend::DYNAMIC);
            samplerDest = cacheBuffer->host<uint8_t>();
        } else {
            samplerBuffer.reset(new uint8_t[4 * CACHE_SIZE]);
            samplerDest = samplerBuffer.get();
        }
    }
    // choose float blitter
    if (dtype.code == halide_type_float) {
        blitFloat = choose(destFormat, oc);
        if (nullptr == blitFloat) {
            return INPUT_DATA_ERROR;
        }
        // Scratch for one converted uint8 tile before it is expanded to float.
        if (backend()) {
            cacheBufferRGBA.reset(Tensor::createDevice<uint8_t>(std::vector<int>{4 * CACHE_SIZE}));
            backend()->onAcquireBuffer(cacheBufferRGBA.get(), Backend::DYNAMIC);
            blitDest = cacheBufferRGBA->host<uint8_t>();
        } else {
            blitBuffer.reset(new uint8_t[4 * CACHE_SIZE]);
            blitDest = blitBuffer.get();
        }
    }
    return NO_ERROR;
}

// Processes the image row by row, each row in tiles of at most CACHE_SIZE pixels, through up to
// three stages: sampler -> samplerDest, blitter (format conversion) samplerDest -> blitDest,
// blitFloat (uint8 -> float) blitDest -> final destination. A stage whose function pointer is
// null is skipped and its buffer is aliased to the next stage's destination.
//
// In draw mode the input image itself is the destination: inputs[1] is read as triples of ints
// (row, first column, last column inclusive) and inputs[2] as the pixel data handed to the
// blitter as its source.
//
// Always returns NO_ERROR.
ErrorCode CPUImageProcess::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    // Default the stride to one packed input row when it was left at 0.
    if (0 == mStride) {
        mStride = iw * ic;
    }
    auto source = inputs[0]->host<uint8_t>();
    void* dest = nullptr;
    CV::Point points[2];
    auto destBytes = dtype.bytes();
    int tileCount = UP_DIV(ow, CACHE_SIZE);
    const int* regions = nullptr;
    if (draw) {
        // change input to output
        dest = source;
        // one loop iteration per region triple
        oh = inputs[1]->length(0);
        ow = iw;
        oc = ic;
        destBytes = inputs[0]->getType().bytes();
        // draw one
        tileCount = 1;
        // src is color
        samplerDest = inputs[2]->host<uint8_t>();
        // get region info ptr
        regions = inputs[1]->host<int>();
    } else {
        dest = outputs[0]->host<void>();
    }
    for (int i = 0; i < oh; ++i) {
        // Destination row: taken from the region triple in draw mode, otherwise the loop index.
        int dy = draw ? regions[3 * i] : i;
        auto dstY = (uint8_t*)dest + dy * destBytes * ow * oc;
        for (int tIndex = 0; tIndex < tileCount; ++tIndex) {
            int xStart    = tIndex * CACHE_SIZE;
            int count     = std::min(CACHE_SIZE, ow - xStart);
            if (draw) {
                // The region's column span replaces the tile span.
                // NOTE(review): count is not checked for <= 0 here - confirm callers guarantee
                // last column >= first column.
                xStart = regions[3 * i + 1];
                count = regions[3 * i + 2] - xStart + 1;
            }
            auto dstStart = dstY + destBytes * oc * xStart;

            // Without a later stage, the earlier stage writes straight into the next destination.
            if (!blitFloat) {
                blitDest = dstStart;
            }
            if (!blitter) {
                samplerDest = blitDest;
            }

            // Sample
            if (!draw) {
                // Compute position
                // Map both ends of the tile into source space; the difference gives the
                // source-space extent covered by this tile.
                points[0].fX = xStart;
                points[0].fY = dy;

                points[1].fX = xStart + count;
                points[1].fY = dy;
                transform.mapPoints(points, 2);
                float deltaY = points[1].fY - points[0].fY;
                float deltaX = points[1].fX - points[0].fX;

                int sta = 0;
                int end = count;

                // FUNC_PRINT(sta);
                if (wrap == WrapType_ZERO) {
                    // Clip: Cohen-Sutherland
                    auto clip    = _computeClip(points, iw, ih, transformInvert, xStart, count);
                    sta          = clip.first;
                    end          = clip.second;
                    // Re-map the first pixel that remains after clipping as the sampling origin.
                    points[0].fX = sta + xStart;
                    points[0].fY = dy;

                    transform.mapPoints(points, 1);
                    // Fill the pixels outside [sta, end) with the padding value.
                    if (sta != 0 || end < count) {
                        if (ic > 0) {
                            if (sta > 0) {
                                ::memset(samplerDest, paddingValue, ic * sta);
                            }
                            if (end < count) {
                                ::memset(samplerDest + end * ic, paddingValue, (count - end) * ic);
                            }
                        } else {
                            // TODO, Only support NV12 / NV21
                            // Fills the first `count` bytes with the padding value and the
                            // following bytes with 128 before sampling.
                            ::memset(samplerDest, paddingValue, count);
                            ::memset(samplerDest + count, 128, UP_DIV(count, 2) * 2);
                        }
                    }
                }
                // points[1] is reused to carry the per-pixel step in source space.
                points[1].fX = (deltaX) / (float)(count);
                points[1].fY = (deltaY) / (float)(count);

                sampler(source, samplerDest, points, sta, end - sta, count, iw, ih, mStride);
            }
            // Convert format
            if (blitter) {
                blitter(samplerDest, blitDest, count);
            }
            // Turn float
            if (blitFloat) {
                blitFloat(blitDest, (float*)dstStart, mean, normal, count);
            }
        }
    }

    return NO_ERROR;
}

// Creator that builds a CPUImageProcess from the op's ImageProcessParam; registered below for
// OpType_ImageProcess. The input and output tensor lists are not used.
class CPUImageProcessCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        return new CPUImageProcess(backend, op->main_as_ImageProcessParam());
    }
};

REGISTER_CPU_OP_CREATOR(CPUImageProcessCreator, OpType_ImageProcess);
} // namespace MNN