//
//  BackendTest.cpp
//  MNNTests
//
//  Created by MNN on 2019/01/10.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <MNN/MNNDefine.h>
#include <math.h>
#include <MNN/Tensor.hpp>
#include "MNNTestSuite.h"
#include "core/Backend.hpp"
#include "core/Macro.h"

using namespace MNN;

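// Host-side reference converters. The tests below use these to build the
// expected result of each tensor-layout conversion, then compare it against
// what Backend::onCopyBuffer actually produced.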
template <typename T>
void NCHW2NHWC(const T* source, T* dest, int b, int h, int w, int c) {
    int sourceBatchSize = h * w * c;
    int destBatchSize   = sourceBatchSize;
    for (int bi = 0; bi < b; ++bi) {
        auto srcBatch = source + bi * sourceBatchSize;
        auto dstBatch = dest + bi * destBatchSize;
        for (int hi = 0; hi < h; ++hi) {
            auto srcHeight = srcBatch + hi * w;
            auto dstHeight = dstBatch + hi * w * c;
            for (int wi = 0; wi < w; ++wi) {
                auto srcWidth = srcHeight + wi;
                auto dstWidth = dstHeight + wi * c;
                for (int ci = 0; ci < c; ++ci) {
                    // In NCHW the channel stride is h * w.
                    dstWidth[ci] = srcWidth[ci * w * h];
                }
            }
        }
    }
}

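// Packs NHWC data into MNN's NC4HW4 layout: channels are grouped into blocks
// of 4, each block stored as a contiguous [area * 4] plane; the tail block is
// zero-padded when the channel count is not a multiple of 4.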
template <typename T>
void MNNTensorConvertNHWCToNC4HW4(T* dst, const T* src, size_t area, size_t depth) {
    int c      = (int)depth;
    int cDiv4  = c / 4;
    int cAlign = cDiv4 * 4;
    for (int hi = 0; hi < area; ++hi) {
        const auto srcHeight = src + hi * c;
        auto dstHeight       = dst + hi * 4;
        for (int ci = 0; ci < cDiv4; ++ci) {
            for (int i = 0; i < 4; ++i) {
                dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i];
            }
        }
    }

    if (cAlign == c) {
        return;
    }

    // Channel remainder (c % 4 != 0): the last C4 block is zero-padded.
    int cRemain   = c - cAlign;
    auto srcAlign = src + cAlign; // remainder channels start at offset cAlign within each NHWC pixel
    auto dstAlign = dst + area * cAlign;

    for (int hi = 0; hi < area; ++hi) {
        const auto srcHeight = srcAlign + hi * c;
        auto dstHeight       = dstAlign + hi * 4;

        for (int i = 0; i < 4; ++i) {
            dstHeight[i] = 0;
        }

        for (int ci = 0; ci < cRemain; ++ci) {
            dstHeight[ci] = srcHeight[ci];
        }
    }
}

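// Inverse of the packer above: unpacks NC4HW4 back to NHWC, dropping the
// zero padding of the tail block.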
template <typename T>
void MNNTensorConvertNC4HW4ToNHWC(T* dst, const T* src, size_t area, size_t depth) {
    int c      = (int)depth;
    int cDiv4  = c / 4;
    int cAlign = cDiv4 * 4;
    for (int hi = 0; hi < area; ++hi) {
        const auto srcHeight = src + hi * 4;
        auto dstHeight       = dst + hi * c;
        for (int ci = 0; ci < cDiv4; ++ci) {
            for (int i = 0; i < 4; ++i) {
                dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i];
            }
        }
    }

    if (cAlign == c) {
        return;
    }

    // Copy back the channel remainder from the padded tail block.
    int cRemain   = c - cAlign;
    auto srcAlign = src + area * cAlign;
    auto dstAlign = dst + cAlign;

    for (int hi = 0; hi < area; ++hi) {
        const auto srcHeight = srcAlign + hi * 4;
        auto dstHeight       = dstAlign + hi * c;

        for (int ci = 0; ci < cRemain; ++ci) {
            dstHeight[ci] = srcHeight[ci];
        }
    }
}

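// Transposes NHWC back to NCHW; the mirror of NCHW2NHWC above.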
template <typename T>
void NHWC2NCHW(const T* source, T* dest, int b, int h, int w, int c) {
    int sourceBatchSize = h * w * c;
    int destBatchSize   = sourceBatchSize;
    for (int bi = 0; bi < b; ++bi) {
        auto srcBatch = source + bi * sourceBatchSize;
        auto dstBatch = dest + bi * destBatchSize;
        for (int hi = 0; hi < h; ++hi) {
            auto srcHeight = srcBatch + hi * w * c;
            auto dstHeight = dstBatch + hi * w;
            for (int wi = 0; wi < w; ++wi) {
                auto dstWidth = dstHeight + wi;
                auto srcWidth = srcHeight + wi * c;
                for (int ci = 0; ci < c; ++ci) {
                    dstWidth[ci * w * h] = srcWidth[ci];
                }
            }
        }
    }
}

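// Round-trips a plain NHWC uint8 tensor host -> device -> host through
// Backend::onCopyBuffer and checks the data survives unchanged.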
bool nhwc_2_nhwc_uint8(std::shared_ptr<Backend> bn) {
    MNN_PRINT("\n ========= check NHWC result ! ========= \n");
    std::shared_ptr<Tensor> hostTensor(Tensor::create<uint8_t>(std::vector<int>{1, 224, 224, 3}));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<uint8_t>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom = i % 255;
        hostData[i]    = flagRandom;
    }

    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<uint8_t>(std::vector<int>{1, 224, 224, 3}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::DYNAMIC_SEPERATE);

    bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());

    std::shared_ptr<Tensor> checkHostTensor(Tensor::create<uint8_t>(std::vector<int>{1, 224, 224, 3}));
    bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

    auto backendCopyData = checkHostTensor->host<uint8_t>();

    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]);
            return false;
        }
    }
    return true;
}

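// Copies an NC4HW4 tensor of integer type T through two device tensors and
// back, then repeats the round trip through an NHWC device tensor.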
template <typename T>
bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr<Backend> bn) {
    MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_IntType result ! ========= \n");

    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<T>(std::vector<int>{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<T>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom = i % 255;
        hostData[i]    = flagRandom;
    }

    bn->onResizeBegin();
    std::shared_ptr<Tensor> deviceTensor_pre(Tensor::createDevice<T>(std::vector<int>{1, 224, 224, 8}, Tensor::CAFFE_C4));
    bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC);
    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<T>(std::vector<int>{1, 224, 224, 8}, Tensor::CAFFE_C4));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC);
    bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get());
    bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get());

    std::shared_ptr<Tensor> checkHostTensor(
        Tensor::create<T>(std::vector<int>{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4));
    bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

    auto backendCopyData = checkHostTensor->host<T>();

    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            // Cast to int so %d stays valid for all instantiated integer types.
            MNN_PRINT("Error for NCHW Mid bn:%d, %d -> %d\n", i, (int)hostData[i], (int)backendCopyData[i]);
            return false;
        }
    }

    std::shared_ptr<Tensor> deviceTensor2(
        Tensor::createDevice<T>(std::vector<int>{1, 8, 224, 224}, Tensor::TENSORFLOW));
    bn->onAcquireBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE);
    bn->onReleaseBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE);
    bn->onResizeEnd();
    bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get());
    bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get());
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for NHWC Mid bn:%d, %d -> %d\n", i, (int)hostData[i], (int)backendCopyData[i]);
            return false;
        }
    }
    return true;
}

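// Fills an NCHW float tensor with position-dependent values, bounces it
// through an NC4HW4 device tensor, and checks the NCHW result within 0.1.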
bool NCHW_NC4HW4_NCHW(std::shared_ptr<Backend> bn, int batch, int width, int height, int channel) {
    std::shared_ptr<Tensor> srcTensor(
        Tensor::create<float>({batch, channel, width, height}, nullptr, Tensor::CAFFE));
    auto host = srcTensor->host<float>();
    for (int b = 0; b < batch; ++b) {
        for (int c = 0; c < channel; ++c) {
            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < width; ++x) {
                    host[0
                         + b * channel * height * width
                         + c * height * width
                         + y * width
                         + x
                    ] = b / (float)batch * 100.f + c / (float)channel * 10.f + y / (float)height * 0.1f + x / (float)width * 0.001f;
                }
            }
        }
    }
    std::shared_ptr<Tensor> dstTensor(
        Tensor::create<float>({batch, channel, width, height}, nullptr, Tensor::CAFFE));
    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>({batch, channel, width, height}, Tensor::CAFFE_C4));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC);
    bn->onCopyBuffer(srcTensor.get(), deviceTensor.get());
    bn->onCopyBuffer(deviceTensor.get(), dstTensor.get());
    int elementSize      = srcTensor->elementSize();
    auto backendCopyData = dstTensor->host<float>();
    auto hostData        = srcTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= 0.1f) {
            MNN_PRINT("Error for bn:%d, %f -> %f, %f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS);
            return false;
        }
    }
    bn->onReleaseBuffer(deviceTensor.get(), Backend::STATIC);
    return true;
}

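// Same double round trip as the integer version above, but with float data:
// host NC4HW4 -> device -> device -> host, then via an NHWC device tensor.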
bool NC4HW4_2_NC4HW4_float(std::shared_ptr<Backend> bn) {
    // MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! ========= \n");
    std::vector<int> nhwc_shape = {1, 32, 12, 13};
    std::vector<int> nchw_shape = {1, 12, 13, 32};
    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<float>(nhwc_shape, nullptr, Tensor::CAFFE_C4));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom = i % 255;
        hostData[i]    = flagRandom;
    }

    bn->onResizeBegin();
    // MNN_PRINT("\nalloc deviceTensor_pre\n");
    std::shared_ptr<Tensor> deviceTensor_pre(Tensor::createDevice<float>(nhwc_shape, Tensor::CAFFE_C4));
    bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC);

    // MNN_PRINT("\nalloc deviceTensor");
    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>(nhwc_shape, Tensor::CAFFE_C4));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC);

    // MNN_PRINT("\ncopy from host to deviceTensor_pre\n");
    bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get());

    // MNN_PRINT("\ncopy from deviceTensor_pre to deviceTensor\n");
    bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get());

    // MNN_PRINT("\ncopy from deviceTensor to new host\n");
    std::shared_ptr<Tensor> checkHostTensor(
        Tensor::create<float>(nhwc_shape, nullptr, Tensor::CAFFE_C4));
    bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

    auto backendCopyData = checkHostTensor->host<float>();

    for (int i = 0; i < elementSize; ++i) {
        if (backendCopyData[i] != hostData[i]) {
            MNN_PRINT("Error for NCHW Mid bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
            return false;
        }
    }

    std::shared_ptr<Tensor> deviceTensor2(
        Tensor::createDevice<float>(nchw_shape, Tensor::TENSORFLOW));
    bn->onAcquireBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE);
    bn->onReleaseBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE);
    bn->onResizeEnd();
    bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get());
    bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get());
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for NHWC Mid bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
            return false;
        }
    }
    return true;
}

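// uint8 variant of the NC4HW4 round trip; prints the first mismatch instead
// of failing (currently unused, see BackendCopyBufferUint8Test below).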
void NC4HW4_2_NC4HW4_uint8(std::shared_ptr<Backend> bn) {
    // MNN_PRINT("\n ========= check NC4HW4 result ! ========= \n");
    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<uint8_t>(std::vector<int>{1, 8, 224, 224}, nullptr, Tensor::CAFFE_C4));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<uint8_t>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom = i % 255;
        hostData[i]    = flagRandom;
    }

    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<uint8_t>(std::vector<int>{1, 224, 224, 8}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::DYNAMIC_SEPERATE);

    bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());

    std::shared_ptr<Tensor> checkHostTensor(
        Tensor::create<uint8_t>(std::vector<int>{1, 8, 224, 224}, nullptr, Tensor::CAFFE_C4));
    bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

    auto backendCopyData = checkHostTensor->host<uint8_t>();

    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], (int32_t)backendCopyData[i]);
            break;
        }
    }
}

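// Round-trips an NHWC float tensor host -> device -> host and reports values
// that drift by more than F32_BF16_MAX_LOSS.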
void nhwc_2_nhwc_float(std::shared_ptr<Backend> bn) {
    // MNN_PRINT("\n ========= check NHWC result ! ========= \n");
    std::shared_ptr<Tensor> hostTensor(Tensor::create<float>(std::vector<int>{1, 224, 224, 3}));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom    = (rand() % 2 == 0);
        float valueRandom = rand() % 255 / 255.f;
        hostData[i]       = ((flagRandom == 1) ? 1.0 : -1.0) * valueRandom;
    }

    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>(std::vector<int>{1, 224, 224, 3}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::DYNAMIC);

    bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());

    std::shared_ptr<Tensor> checkHostTensor(Tensor::create<float>(std::vector<int>{1, 224, 224, 3}));
    bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

    auto backendCopyData = checkHostTensor->host<float>();

    for (int i = 0; i < elementSize; ++i) {
        // Compare absolute error so negative drift is caught as well.
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
        }
    }
}

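// Same round trip with an NCHW host tensor copied through an NHWC device tensor.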
void nchw_2_nchw_float(std::shared_ptr<Backend> bn) {
    // MNN_PRINT("\n ========= check NCHW result ! ========= \n");
    std::shared_ptr<Tensor> hostTensor(Tensor::create<float>(std::vector<int>{1, 7, 224, 224}, nullptr, Tensor::CAFFE));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom    = (rand() % 2 == 0);
        float valueRandom = rand() % 255 / 255.f;
        hostData[i]       = ((flagRandom == 1) ? 1.0 : -1.0) * valueRandom;
    }

    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>(std::vector<int>{1, 224, 224, 7}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::DYNAMIC_SEPERATE);

    bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());

    std::shared_ptr<Tensor> checkHostTensor(
        Tensor::create<float>(std::vector<int>{1, 7, 224, 224}, nullptr, Tensor::CAFFE));
    bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

    auto backendCopyData = checkHostTensor->host<float>();

    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
        }
    }
}

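// Converts NCHW host data to NC4HW4 on the device and back, validating both
// directions against the host-side reference converters.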
void nchw_2_NC4HW4_float(std::shared_ptr<Backend> bn) {
    // MNN_PRINT("\n ========= check NC4HW4 result ! ========= \n");
    int batch   = 1;
    int channel = 12;
    int width   = 20;
    int height  = 20;
    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<float>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom    = (rand() % 2 == 0);
        float valueRandom = rand() % 255 / 255.f;
        hostData[i]       = ((flagRandom == 1) ? 1.0 : -1.0) * valueRandom;
    }

    float* temp = (float*)malloc(hostTensor->size());
    memset(temp, 0, hostTensor->size());
    NCHW2NHWC(hostData, temp, batch, height, width, channel);

    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>(std::vector<int>{batch, height, width, channel}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::DYNAMIC_SEPERATE);
    bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());

    // nhwc -> NC4HW4
    // Note: dst is the first argument, so this overwrites hostData with the
    // NC4HW4 reference built from temp.
    MNNTensorConvertNHWCToNC4HW4(hostData, temp, height * width, channel);
    std::shared_ptr<Tensor> NC4HW4_HostTensor(
        Tensor::create<float>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE_C4));

    bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get());
    auto backendCopyData = NC4HW4_HostTensor->host<float>();

    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
        }
    }

    // NC4HW4 -> nhwc
    MNNTensorConvertNC4HW4ToNHWC(temp, hostData, height * width, channel);

    bn->onCopyBuffer(NC4HW4_HostTensor.get(), deviceTensor.get());
    NHWC2NCHW(temp, backendCopyData, batch, height, width, channel);
    bn->onCopyBuffer(deviceTensor.get(), hostTensor.get());

    // MNN_PRINT("NC4HW4 -> nhwc !\n");
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
        }
    }

    free(temp);
}

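// Minimal NCHW -> NC4HW4 -> NCHW round trip with deterministic data.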
void nchw_2_NC4HW4_2_nchw_float(std::shared_ptr<Backend> bn) {
    // Test NCHW -> NC4HW4 -> NCHW
    {
        std::shared_ptr<Tensor> hostTensor(
            Tensor::create<float>(std::vector<int>{1, 3, 224, 224}, nullptr, Tensor::CAFFE));
        auto elementSize = hostTensor->elementSize();
        auto hostData    = hostTensor->host<float>();
        for (int i = 0; i < elementSize; ++i) {
            hostData[i] = ((i * 67 * 73) % 255);
        }

        std::shared_ptr<Tensor> deviceTensor(
            Tensor::createDevice<float>(std::vector<int>{1, 3, 224, 224}, Tensor::CAFFE_C4));
        bn->onAcquireBuffer(deviceTensor.get(), Backend::DYNAMIC);

        bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());
        std::shared_ptr<Tensor> checkHostTensor(
            Tensor::create<float>(std::vector<int>{1, 3, 224, 224}, nullptr, Tensor::CAFFE));
        bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get());

        auto backendCopyData = checkHostTensor->host<float>();

        for (int i = 0; i < elementSize; ++i) {
            if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
                MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]);
                break;
            }
        }
    }
}

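// Integer-typed NHWC <-> NC4HW4 round trip; instantiated for int8_t, int16_t
// and int32_t by CPUBackendCopyBufferTest.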
template <typename T>
bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr<Backend> bn) {
    // Test NHWC -> NC4HW4 -> NHWC
    MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_inttype result ! ========= \n");
    int batch   = 1;
    int channel = 12;
    int width   = 20;
    int height  = 20;
    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<T>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<T>();
    for (int i = 0; i < elementSize; ++i) {
        hostData[i] = rand() % 255;
    }

    T* temp = (T*)malloc(hostTensor->size());
    memset(temp, 0, hostTensor->size());
    NCHW2NHWC<T>(hostData, temp, batch, height, width, channel);

    std::shared_ptr<Tensor> deviceTensor_pre(Tensor::createDevice<T>(std::vector<int>{batch, height, width, channel}));
    bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC);
    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<T>(std::vector<int>{batch, height, width, channel}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC);
    bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get());
    bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get());

    // nhwc -> NC4HW4
    MNNTensorConvertNHWCToNC4HW4<T>(hostData, temp, height * width, channel);
    std::shared_ptr<Tensor> NC4HW4_HostTensor(
        Tensor::create<T>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE_C4));

    bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get());
    auto backendCopyData = NC4HW4_HostTensor->host<T>();

    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %d -> %d\n", i, (int)hostData[i], (int)backendCopyData[i]);
            free(temp);
            return false;
        }
    }

    // NC4HW4 -> nhwc
    MNNTensorConvertNC4HW4ToNHWC<T>(temp, hostData, height * width, channel);

    bn->onCopyBuffer(NC4HW4_HostTensor.get(), deviceTensor.get());
    NHWC2NCHW(temp, backendCopyData, batch, height, width, channel);
    bn->onCopyBuffer(deviceTensor.get(), hostTensor.get());

    // MNN_PRINT("NC4HW4 -> nhwc !\n");
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("Error for bn:%d, %d -> %d\n", i, (int)hostData[i], (int)backendCopyData[i]);
        }
    }

    free(temp);
    return true;
}

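// Uploads an NCHW tensor, reads it back as NHWC, and checks against the
// host-side NCHW2NHWC reference.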
bool nchwTonhwc(std::shared_ptr<Backend> bn) {
    // Test NCHW -> NHWC
    MNN_PRINT("\n ========= check nchwTonhwc result ! ========= \n");
    int batch   = 2;
    int channel = 12;
    int width   = 21;
    int height  = 5;
    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<float>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom    = (rand() % 2 == 0);
        float valueRandom = rand() % 255 / 255.f;
        hostData[i]       = ((flagRandom == 1) ? 1.0 : -1.0) * valueRandom;
    }
    std::vector<float> tempStorage(hostTensor->elementSize());
    float* temp = tempStorage.data();
    memset(temp, 0, hostTensor->size());
    NCHW2NHWC(hostData, temp, batch, height, width, channel);
    std::shared_ptr<Tensor> deviceTensor_pre(Tensor::createDevice<float>(std::vector<int>{batch, height, width, channel}));
    bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC);
    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>(std::vector<int>{batch, height, width, channel}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC);
    bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get());
    bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get());
    std::shared_ptr<Tensor> hostTensorNHWC(
        Tensor::create<float>(std::vector<int>{batch, height, width, channel}, nullptr, Tensor::TENSORFLOW));
    bn->onCopyBuffer(deviceTensor.get(), hostTensorNHWC.get());
    auto backendCopyData = hostTensorNHWC->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - temp[i]) >= F32_BF16_MAX_LOSS) { // error of converting float32 to bf16 can exceed 0.001
            MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS);
            return false;
        }
    }
    return true;
}

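// Float version of the NHWC <-> NC4HW4 round trip; dumps the full buffer on
// the first mismatch to ease debugging.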
bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr<Backend> bn) {
    // Test NHWC -> NC4HW4 -> NHWC
    MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_float result ! ========= \n");
    int batch   = 1;
    int channel = 12;
    int width   = 3;
    int height  = 2;
    std::shared_ptr<Tensor> hostTensor(
        Tensor::create<float>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE));
    auto elementSize = hostTensor->elementSize();
    auto hostData    = hostTensor->host<float>();
    for (int i = 0; i < elementSize; ++i) {
        int flagRandom    = (rand() % 2 == 0);
        float valueRandom = rand() % 255 / 255.f;
        hostData[i]       = ((flagRandom == 1) ? 1.0 : -1.0) * valueRandom;
    }

    float* temp = (float*)malloc(hostTensor->size());
    memset(temp, 0, hostTensor->size());
    NCHW2NHWC(hostData, temp, batch, height, width, channel);

    std::shared_ptr<Tensor> deviceTensor(Tensor::createDevice<float>(std::vector<int>{batch, height, width, channel}));
    bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC);
    bn->onCopyBuffer(hostTensor.get(), deviceTensor.get());

    // nhwc -> NC4HW4
    MNNTensorConvertNHWCToNC4HW4(hostData, temp, height * width, channel);
    std::shared_ptr<Tensor> NC4HW4_HostTensor(
        Tensor::create<float>(std::vector<int>{batch, channel, height, width}, nullptr, Tensor::CAFFE_C4));

    bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get());
    auto backendCopyData = NC4HW4_HostTensor->host<float>();

    bool res = true;
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { // error of converting float32 to bf16 can exceed 0.001
            MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS);
            res = false;
            break;
        }
    }
    if (!res) {
        for (int i = 0; i < elementSize; ++i) {
            MNN_PRINT("%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS);
        }
        free(temp);
        return false;
    }

    // NC4HW4 -> nhwc
    MNNTensorConvertNC4HW4ToNHWC(temp, hostData, height * width, channel);

    bn->onCopyBuffer(NC4HW4_HostTensor.get(), deviceTensor.get());
    NHWC2NCHW(temp, backendCopyData, batch, height, width, channel);
    bn->onCopyBuffer(deviceTensor.get(), hostTensor.get());

    MNN_PRINT("NC4HW4 -> nhwc !\n");
    for (int i = 0; i < elementSize; ++i) {
        if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) {
            MNN_PRINT("NC4HW4 -> nhwc Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS);
            free(temp);
            return false;
        }
    }

    free(temp);
    return true;
}

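// Runs every float copy test against each available backend, at each of the
// three precision modes (p = 0, 1, 2).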
class BackendCopyBufferFloatTest : public MNNTestCase {
public:
    virtual bool run(int precision) {
        for (int i = 0; i < MNN_FORWARD_ALL; ++i) {
            auto type    = (MNNForwardType)i;
            auto creator = MNNGetExtraRuntimeCreator(type);
            if (nullptr == creator) {
                continue;
            }
            for (int p = 0; p < 3; ++p) {
                MNN::Backend::Info info;
                info.type = type;
                BackendConfig user;
                user.precision = (MNN::BackendConfig::PrecisionMode)p;
                info.user      = &user;
                std::shared_ptr<Runtime> runtime(creator->onCreate(info));
                MNN_PRINT("Test %d Backend for %d \n", type, user.precision);
                std::shared_ptr<Backend> bn(runtime->onCreate(&user));
                auto res = NC4HW4_2_NC4HW4_float(bn);
                FUNC_PRINT(res);
                res = res && nchwTonhwc(bn);
                FUNC_PRINT(res);
                res = res && nhwc_2_NC4HW4_2_nhwc_float(bn);
                FUNC_PRINT(res);
                res = res && NCHW_NC4HW4_NCHW(bn, 3, 16, 17, 19);
                FUNC_PRINT(res);
                res = res && NCHW_NC4HW4_NCHW(bn, 12, 16, 38, 16);
                FUNC_PRINT(res);
                res = res && NCHW_NC4HW4_NCHW(bn, 5, 128, 8, 6);
                FUNC_PRINT(res);
                if (!res) {
                    MNN_ERROR("Error for %d bn\n", i);
                    return false;
                }
            }
        }
        return true;
    }
};

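// CPU-only test covering the integer element types for both the NC4HW4 and
// NHWC round trips.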
class CPUBackendCopyBufferTest : public MNNTestCase {
public:
    virtual bool run(int precision) {
        auto type    = MNN_FORWARD_CPU;
        auto creator = MNNGetExtraRuntimeCreator(type);
        for (int p = 0; p < 3; ++p) {
            MNN::Backend::Info info;
            info.type = type;
            BackendConfig user;
            user.precision = (MNN::BackendConfig::PrecisionMode)p;
            info.user      = &user;
            std::shared_ptr<Runtime> runtime(creator->onCreate(info));
            MNN_PRINT("Test %d Backend for %d \n", type, user.precision);
            std::shared_ptr<Backend> bn(runtime->onCreate(&user));
            auto res = NC4HW4_2_NC4HW4_IntType<int32_t>(bn);
            res      = res && NC4HW4_2_NC4HW4_IntType<int16_t>(bn);
            res      = res && NC4HW4_2_NC4HW4_IntType<int8_t>(bn);
            res      = res && nhwc_2_NC4HW4_2_nhwc_inttype<int32_t>(bn);
            res      = res && nhwc_2_NC4HW4_2_nhwc_inttype<int16_t>(bn);
            res      = res && nhwc_2_NC4HW4_2_nhwc_inttype<int8_t>(bn);
            if (!res) {
                MNN_ERROR("Error for Int Copy\n");
                return false;
            }
        }
        return true;
    }
};

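// uint8 copy test across all backends. Only the plain NHWC case runs; the
// NC4HW4 uint8 case and this test's registration are commented out.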
class BackendCopyBufferUint8Test : public MNNTestCase {
public:
    virtual bool run(int precision) {
        for (int i = 0; i < MNN_FORWARD_ALL; ++i) {
            auto type    = (MNNForwardType)i;
            auto creator = MNNGetExtraRuntimeCreator(type);
            if (nullptr == creator) {
                continue;
            }
            MNN::Backend::Info info;
            info.type = type;
            BackendConfig user;
            user.precision = MNN::BackendConfig::Precision_High;
            info.user      = &user;
            std::shared_ptr<Runtime> runtime(creator->onCreate(info));

            MNN_PRINT("Test %d Backend\n", type);
            std::shared_ptr<Backend> bn(runtime->onCreate());
            // uint8
            auto res = nhwc_2_nhwc_uint8(bn);
            if (!res) {
                MNN_ERROR("Error for %d bn\n", i);
                return false;
            }
            // NC4HW4_2_NC4HW4_uint8(bn);
        }
        return true;
    }
};
MNNTestSuiteRegister(BackendCopyBufferFloatTest, "engine/backend/copy_buffer_float");
// MNNTestSuiteRegister(BackendCopyBufferUint8Test, "engine/backend/copy_buffer_uint8");
MNNTestSuiteRegister(CPUBackendCopyBufferTest, "engine/backend/copy_buffer_cpu");