MNN/source/backend/opencl/core/runtime/OpenCLRuntime.cpp

//
//  OpenCLRuntime.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/opencl/core/runtime/OpenCLRuntime.hpp"
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "core/Macro.h"
#include "OpenCLTuneInfo.hpp"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#include "CLCache_generated.h"
#include "backend/opencl/execution/cl/opencl_source_map.hpp"
//#define ARM_OPENCL_PRINTF_DEBUG
using namespace CLCache;
namespace MNN {

extern const std::map<std::string, const char*> OpenCLProgramMap;
extern std::mutex gCLMutex;

bool OpenCLRuntime::getDeviceSupportsExtension(const cl::Device &device, const char *extensionName) {
    std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
    auto pos               = extensions.find(extensionName);
    return (pos != std::string::npos);
}

#ifdef ARM_OPENCL_PRINTF_DEBUG
static void callback(const char *buffer, size_t length, size_t final, void *user_data)
{
    fwrite(buffer, 1, length, stdout);
}
#endif

OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int platformSize, int platformId, int deviceId, void *contextPtr, void *glShared, const RuntimeHint& hint) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start OpenCLRuntime !\n");
#endif
    mDefaultBuildParams = " -cl-mad-enable";
    std::vector<cl::Platform> platforms;
    cl_int res = cl::Platform::get(&platforms, platformSize);
    MNN_CHECK_CL_SUCCESS(res, "getPlatform");
    if(platforms.size() > 0 && res == CL_SUCCESS) {
        if(platformId >= platforms.size() || platformId < 0) {
            platformId = 0;
        }
        cl::Platform::setDefault(platforms[platformId]);
        std::vector<cl::Device> gpuDevices;

        res = platforms[platformId].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
        if(1 <= gpuDevices.size() && res == CL_SUCCESS) {
            if(deviceId >= gpuDevices.size() || deviceId < 0) {
                deviceId = 0;
            }
            mFirstGPUDevicePtr = std::make_shared<cl::Device>(gpuDevices[deviceId]);
            if(mFirstGPUDevicePtr == nullptr) {
                mIsCreateError = true;
                return;
            }
            const std::string deviceName    = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NAME>();
            mDeviceName = deviceName;
            const std::string deviceVersion = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VERSION>();
            std::map<std::string, std::pair<MNN::MaliAr, MNN::GpuLevel>> maliArMap {
                {"Mali-T860", {MIDGARD, LOW}},
                {"Mali-T880", {MIDGARD, LOW}},
                {"Mali-G31", {BIFROST, LOW}},
                {"Mali-G51", {BIFROST, LOW}},
                {"Mali-G52", {BIFROST, LOW}},
                {"Mali-G71", {BIFROST, LOW}},
                {"Mali-G72", {BIFROST, LOW}},
                {"Mali-G76", {BIFROST, MEDIUM}},
                {"Mali-G57", {VALHALL, LOW}},
                {"Mali-G68", {VALHALL, LOW}},
                {"Mali-G77", {VALHALL, MEDIUM}},
                {"Mali-G78", {VALHALL, MEDIUM}},
                {"Mali-G310", {VALHALL, LOW}},
                {"Mali-G510", {VALHALL, LOW}},
                {"Mali-G610", {VALHALL, LOW}},
                {"Mali-G615", {VALHALL, LOW}},
                {"Mali-G710", {VALHALL, TOP}},
                {"Mali-G715", {VALHALL, TOP}},
            };
        
            const std::string deviceVendor  = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VENDOR>();
            cl_command_queue_properties properties = 0;

        #ifdef ENABLE_OPENCL_TIME_PROFILER
            properties |= CL_QUEUE_PROFILING_ENABLE;
        #endif
            cl_int res;
            // if device is QUALCOMM's and version is 2.0 , set spacial optimized param

            sscanf(deviceVersion.c_str(), "%*s%f%*s", &mCLVersion);
            
        #ifdef MNN_OPENCL_SVM_ENABLE
            if(mCLVersion > 1.99f && (false == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isSvmError())) {
                res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_SVM_CAPABILITIES, &mSvmCapabilities);

                #ifdef LOG_VERBOSE
                if (res != CL_SUCCESS || mSvmCapabilities == 0) {
                    MNN_PRINT("SVM capalibilties: NONE\n");
                } else {
                    if (mSvmCapabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
                        MNN_PRINT("SVM capalibilties: SVM_FINE_GRAIN_BUFFER\n");
                        if (mSvmCapabilities & CL_DEVICE_SVM_ATOMICS) {
                            MNN_PRINT("SVM capalibilties: SVM_ATOMICS\n");
                        }
                    } else if (mSvmCapabilities & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
                        MNN_PRINT("SVM capalibilties: SVM_COARSE_GRAIN_BUFFER\n");
                    }
                }
                #endif
            }
        #endif
            
            if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) {
                mGpuType = ADRENO;
                
                // if device is QUALCOMM's and version is 2.0 , set spacial optimized param
                //if Adreno version is less than Adreno512, donot set WorkGroupAttribute option
                std::string adrenoVersion = deviceVersion.substr(deviceVersion.size()-3);
                // MNN_PRINT("Adreno Version:%s   %s\n", deviceVersion.c_str(), adrenoVersion.c_str());
                if(mCLVersion > 1.99f && adrenoVersion >= "512") {
                    isSetWorkGroupAttribute = true;
                }
                // 8Gen1 and after
                if(adrenoVersion >= "730") {
                    mGpuLevel = TOP;
                }
            } else if (deviceName.find("Mali") != std::string::npos) {
                mGpuType = MALI;
                if(maliArMap.find(deviceName) != maliArMap.end()){
                    mMaliAr = maliArMap[deviceName].first;
                    mGpuLevel = maliArMap[deviceName].second;
                }else{
                    mMaliAr = VALHALL;
                    mGpuLevel = UNDEFINED;
                }
            } else if (deviceVendor.find("Advanced Micro Devices") != std::string::npos) {
                // Radeon series GPU is main product of Advanced Micro Devices (AMD)
                mGpuType = RADEON;
                isSetWorkGroupAttribute = true;
            } 
            else if (deviceVendor.find("Intel") != std::string::npos) {
                mGpuType = INTEL;
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
                const std::string extensions = mFirstGPUDevicePtr->getInfo<CL_DEVICE_EXTENSIONS>();
                if (extensions.find("cl_intel_subgroups") != std::string::npos) {
                    mSupportedIntelSubgroup = true;
                    uint32_t execution_units_count = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
                    uint32_t num_threads_per_eu = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NUM_THREADS_PER_EU_INTEL>();
                    uint32_t maxThreadsPerExecutionUnit = num_threads_per_eu > 0 ? num_threads_per_eu : 7;
                    mMaxThreadsPerDevice =  maxThreadsPerExecutionUnit * execution_units_count;
                }
#endif 
            }
            else {
                mGpuType = OTHER;
            }
            const std::string extensions = platforms[0].getInfo<CL_PLATFORM_EXTENSIONS>();
            bool isPriorityHint = (extensions.find("cl_khr_priority_hints") != std::string::npos);
            std::vector<cl_context_properties> context_properties;
            if(mGpuType == ADRENO && !isPriorityHint){
                context_properties.push_back(CL_CONTEXT_PERF_HINT_QCOM);
                context_properties.push_back(CL_PERF_HINT_HIGH_QCOM);
                context_properties.push_back(CL_CONTEXT_PRIORITY_HINT_QCOM);
                context_properties.push_back(CL_PRIORITY_HINT_LOW_QCOM);
                mIsDeviceSupportedLowPower = true;
            }
            #ifdef ARM_OPENCL_PRINTF_DEBUG
            context_properties.push_back(CL_PRINTF_CALLBACK_ARM);
            context_properties.push_back((cl_context_properties)callback);
            context_properties.push_back(CL_PRINTF_BUFFERSIZE_ARM);
            context_properties.push_back(0x1000);
            #endif
            std::string deviceextensions = mFirstGPUDevicePtr.get()->getInfo<CL_DEVICE_EXTENSIONS>();
#ifdef MNN_USE_LIB_WRAPPER
            mIsSupportAHD = (getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_import_memory_android_hardware_buffer")
                 && mGpuType == MALI && OpenCLSymbolsOperator::getOpenclSymbolsPtr()->getFuncAddress(platforms[platformId](), "clImportMemoryARM"))
                 || (mGpuType == ADRENO && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_android_ahardwarebuffer_host_ptr"));
#endif
            if(nullptr != contextPtr){
                mContext = std::shared_ptr<cl::Context>((cl::Context*)contextPtr, [](void* ptr) {
                    // Do nothing
                });
            }else{
                if(context_properties.size() > 0){
                    context_properties.push_back(0);
                    mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), context_properties.data(), nullptr, nullptr, &res));
                }else{
                    mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), nullptr, nullptr, nullptr, &res));
                }
            }
            MNN_CHECK_CL_SUCCESS(res, "context");
            if (res != CL_SUCCESS) {
                mIsCreateError = true;
                return;
            }
            
            mIsDeviceSupportedLowPower = (mIsDeviceSupportedLowPower || isPriorityHint);
            
            #ifdef MNN_USE_LIB_WRAPPER
            if(isPriorityHint)
            {
                if(true == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isPropError())
                {
                    mIsCreateError = true;
                    return;
                }

                cl_queue_properties prop[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR,
#ifdef ENABLE_OPENCL_TIME_PROFILER
                    CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
#endif
                    0};
                mCommandQueuePtr.reset(new cl::CommandQueue(clCreateCommandQueueWithProperties((*mContext).get(), (*mFirstGPUDevicePtr).get(), prop, &res)));
            }
            else
            #endif
            {
                mCommandQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, properties, &res);
            }
            MNN_CHECK_CL_SUCCESS(res, "commandQueue");
            if (res != CL_SUCCESS) {
                mIsCreateError = true;
                return;
            }
#ifdef ENABLE_OPENCL_TIME_PROFILER
            mCommandQueueTuning = mCommandQueuePtr;
#else
            mCommandQueueTuning = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, CL_QUEUE_PROFILING_ENABLE, &res);
#endif
            mCurrentCommandQueue = mCommandQueuePtr.get();
            mFirstGPUDevicePtr->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &mGPUGlobalMemeryCacheSize);
            mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &mGPUComputeUnits);
            mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &mMaxFreq);
            mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &mMaxMemAllocSize);
            mFirstGPUDevicePtr->getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &mMaxLocalMemSize);
            mMaxWorkGroupSize = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
            
            //set gpu mode, tuning level and memory object
            setGpuMode(cl_mode);
            
            if(mMemType == AUTO) {
                if(mGpuType == MALI || mGpuType == INTEL) {
                    mMemType = BUFFER;
                } else {
                    mMemType = IMAGE;
                }
            }
            setPrecision(precision);
            
            if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_int8")){
                mSupportDotInt8 = true;
            }
            if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_accumulate_int8")){
                mSupportDotAccInt8 = true;
            }
          
#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
            {
                if((false == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isQcomError()) 
                   && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_recordable_queues")
                   && (cl_mode & MNN_GPU_RECORD_OP || cl_mode & MNN_GPU_RECORD_BATCH)){
                    uint32_t MaxRecordableQueueSize = mFirstGPUDevicePtr->getInfo<CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE>();
                    cl_int err;
                    if(MaxRecordableQueueSize > 0){
                        mUseRecordableQueueSize = hint.encorderNumForCommit;
                        mUseRecordableQueueSize = MaxRecordableQueueSize < mUseRecordableQueueSize ? MaxRecordableQueueSize : mUseRecordableQueueSize;
                        mUseRecordQueue = true;
                        mRecordableQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, CL_QUEUE_RECORDABLE_QCOM, &err);
                        if(err != CL_SUCCESS){
                            mIsCreateError = true;
                            return;
                        }
                    }
                }
            }
#endif
            
        }else{
            mIsCreateError = true;
            MNN_ASSERT(1 <= gpuDevices.size());
        }
    }else{
        mIsCreateError = true;
        MNN_ASSERT(platforms.size() > 0);
    }
    if (mIsCreateError) {
        return;
    }
    if (mMemType == IMAGE){
        // Init info
        size_t max_height, max_width;
        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
        MNN_CHECK_CL_SUCCESS(res, "image2Dsize");
        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
        MNN_CHECK_CL_SUCCESS(res, "image2Dsize");
        mMaxImageSize = {max_height, max_width};
    }
    do {
        int dims = 3;
        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &dims);
        MNN_CHECK_CL_SUCCESS(res, "DeviceGetInfo");

        if(dims < 3) {
            std::vector<uint32_t> workItem(3, 8);
            mMaxWorkIterms = workItem;
            break;
        }
        cl::vector<cl::size_type> _workItems(dims, 1);
        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &_workItems);
        MNN_CHECK_CL_SUCCESS(res, "DeviceGetInfo");
        
        std::vector<uint32_t> workItems(dims, 1);
        for (int i = 0; i < dims; ++i) {
            workItems[i] = _workItems[i];
        }
        mMaxWorkIterms = workItems;
    } while(false);

}

void OpenCLRuntime::setGpuMode(const int cl_mode_num) {
    int totalSet = 0;
    bool isSet = (cl_mode_num & MNN_GPU_MEMORY_BUFFER);
    if(isSet) {
        mMemType = BUFFER;
        totalSet++;
    }
    isSet = (cl_mode_num & MNN_GPU_MEMORY_IMAGE);
    if(isSet) {
        mMemType = IMAGE;
        totalSet++;
    }
    if(totalSet > 1) {
        MNN_PRINT("set both BUFFER and IMAGE mode is not permitted, please check cl_mode:%x！\n", cl_mode_num);
    }
    
    totalSet = 0;
    isSet = (cl_mode_num & MNN_GPU_TUNING_NONE);
    if(isSet) {
        mTuneLevel = None;
        totalSet++;
    }
    
    isSet = (cl_mode_num & MNN_GPU_TUNING_FAST);
    if(isSet) {
        mTuneLevel = Fast;
        totalSet++;
    }
    
    isSet = (cl_mode_num & MNN_GPU_TUNING_NORMAL);
    if(isSet) {
        mTuneLevel = Normal;
        totalSet++;
    }
    
    isSet = (cl_mode_num & MNN_GPU_TUNING_HEAVY);
    if(isSet) {
        mTuneLevel = Heavy;
        totalSet++;
    }
    
    isSet = (cl_mode_num & MNN_GPU_TUNING_WIDE);
    if(isSet) {
        mTuneLevel = Wide;
        totalSet++;
    }

    if(totalSet != 1) {
        MNN_PRINT("set multi tuning mode is not permitted, please check cl_mode:%x！\n", cl_mode_num);
    }
    
    totalSet = 0;
    isSet = (cl_mode_num & MNN_GPU_RECORD_OP);
    if(isSet) {
        mDevideOpRecord = true;
        totalSet++;
    }
    
    isSet = (cl_mode_num & MNN_GPU_RECORD_BATCH);
    if(isSet) {
        mDevideOpRecord = false;
        totalSet++;
    }
    
    if(totalSet > 1) {
        MNN_PRINT("set multi record kernel mode is not permitted, please check cl_mode:%x！\n", cl_mode_num);
    }
}

void OpenCLRuntime::setCommandQueueProfileEnable() {
    mCurrentCommandQueue->finish();
    mCurrentCommandQueue = mCommandQueueTuning.get();
}

void OpenCLRuntime::setCommandQueueProfileDisable() {
    mCurrentCommandQueue->finish();
    mCurrentCommandQueue = mCommandQueuePtr.get();
}

unsigned int OpenCLRuntime::getQueueNum() {
    mQueueCount++;
    return mQueueCount;
}

std::map<std::string, uint32_t>& OpenCLRuntime::preParamsMap(){
    return mPreParams;
}

std::map<std::vector<uint32_t>, std::vector<uint32_t>>& OpenCLRuntime::tunedGemmParamsMap() {
    return mTunedGemmParams;
}

std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& OpenCLRuntime::tunedLwsMap() {
    return mTunedLws;
}
    
std::map<std::string, std::vector<std::pair<std::vector<uint32_t>, std::pair<std::vector<uint32_t>, uint32_t>>>>& OpenCLRuntime::getTuneLwsMap() {
    return mTuneLws;
}

OpenCLRuntime::~OpenCLRuntime() {
#ifdef LOG_VERBOSE
    MNN_PRINT("start ~OpenCLRuntime !\n");
#endif
    clearEvent();
    mBuildProgramMap.clear();
    mCommandQueuePtr.reset();
    mCommandQueueTuning.reset();
    mRecordableQueuePtr.reset();
    mContext.reset();
    mFirstGPUDevicePtr.reset();
#ifdef LOG_VERBOSE
    MNN_PRINT("end ~OpenCLRuntime !\n");
#endif
}

std::vector<size_t> OpenCLRuntime::getMaxImage2DSize() {
    return mMaxImageSize;
}

bool OpenCLRuntime::isSupportedFP16() const {
    return mIsSupportedFP16;
}

bool OpenCLRuntime::isDeviceSupportedFP16() const {
    return mIsDeviceSupportedFP16;
}

bool OpenCLRuntime::isDeviceSupportedLowPower() const {
    return mIsDeviceSupportedLowPower;
}

bool OpenCLRuntime::isSupportedDotInt8() const {
    return mSupportDotInt8;
}

bool OpenCLRuntime::isSupportedDotAccInt8() const {
    return mSupportDotAccInt8;
}

bool OpenCLRuntime::isSupportedIntelSubgroup() const {
    return mSupportedIntelSubgroup;
 }
cl::Context &OpenCLRuntime::context() {
    return *mContext;
}

cl::CommandQueue &OpenCLRuntime::commandQueue() {
    return *mCurrentCommandQueue;
}

cl::CommandQueue &OpenCLRuntime::recordableQueue(){
    return *mRecordableQueuePtr;
}

uint64_t OpenCLRuntime::deviceGlobalMemeryCacheSize() const {
    return mGPUGlobalMemeryCacheSize;
}

uint32_t OpenCLRuntime::deviceComputeUnits() const {
    return mGPUComputeUnits;
}

uint32_t OpenCLRuntime::MaxThreadsPerDevice() const {
    return mMaxThreadsPerDevice;
}
uint32_t OpenCLRuntime::MaxWorkGroupSize() const {
    return mMaxWorkGroupSize;
}
uint32_t OpenCLRuntime::getPrecisionLevel() const {
    return mPrecisionLevel;
}
uint32_t OpenCLRuntime::maxFreq() const {
    return mMaxFreq;
}

uint64_t OpenCLRuntime::maxAllocSize() const {
    return mMaxMemAllocSize;
}

void OpenCLRuntime::setPrecision(const BackendConfig::PrecisionMode precision){
    cl_device_fp_config fpConfig;
    auto success = mFirstGPUDevicePtr->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fpConfig);
    mIsDeviceSupportedFP16     = CL_SUCCESS == success && fpConfig > 0;
    bool checkFp16Exetension = getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_khr_fp16");
    mIsDeviceSupportedFP16 = (mIsDeviceSupportedFP16 && checkFp16Exetension);
    mPrecisionLevel = 1;
    if (mIsDeviceSupportedFP16) {
        if (precision == BackendConfig::Precision_Low) {
            mPrecisionLevel = 2;
        } else if (precision == BackendConfig::Precision_Normal && mMemType == BUFFER) {
            mPrecisionLevel = 0;
        }
    }
    
    // Is supported fp16 IO storage
    mIsSupportedFP16 = (mPrecisionLevel == 2 || mPrecisionLevel == 0);
}

bool OpenCLRuntime::loadProgram(const std::string &programName, cl::Program *program) {
    std::lock_guard<std::mutex> lck(gCLMutex);
    auto it_source = OpenCLProgramMap.find(programName);
    if (it_source != OpenCLProgramMap.end()) {
        cl::Program::Sources sources;
        std::string source(it_source->second);
        sources.push_back(source);
        *program = cl::Program(context(), sources);
        return true;
    } else {
        MNN_PRINT("Can't find kernel source !\n");
        return false;
    }
}

bool OpenCLRuntime::buildProgram(const std::string &buildOptionsStr, cl::Program *program) {
    AUTOTIME;
    cl_int ret = program->build({*mFirstGPUDevicePtr}, buildOptionsStr.c_str());
    if (ret != CL_SUCCESS) {
        if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*mFirstGPUDevicePtr) == CL_BUILD_ERROR) {
            std::string buildLog = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(*mFirstGPUDevicePtr);
            MNN_PRINT("Program build log: %s \n", buildLog.c_str());
        }
        MNN_PRINT("Build program failed, err:%d ! \n", ret);
        return false;
    }
    return true;
}


std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernel(const std::string &programName, const std::string &kernelName,
                                      const std::set<std::string> &buildOptions, const Tensor *input, const Tensor *output) {
    auto kwp = buildKernelWithCache(programName, kernelName, buildOptions, input, output, true);
    return kwp;
}

std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernelWithCache(const std::string &programName, const std::string &kernelName,
                                      const std::set<std::string> &buildOptions, const Tensor *input, const Tensor *output, bool useCache) {
    std::string buildOptionsStr;
    if (mPrecisionLevel == 2) {// Fp16 Memory and fp16 compute
        buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=half  -DCOMPUTE_FLOAT2=half2 -DCOMPUTE_FLOAT3=half3 -DCOMPUTE_FLOAT4=half4 -DCOMPUTE_FLOAT8=half8 -DCOMPUTE_FLOAT16=half16 -DCONVERT_COMPUTE_FLOAT=convert_half -DCONVERT_COMPUTE_FLOAT2=convert_half2 -DCONVERT_COMPUTE_FLOAT3=convert_half3 -DCONVERT_COMPUTE_FLOAT4=convert_half4 -DCONVERT_COMPUTE_FLOAT8=convert_half8 -DCONVERT_COMPUTE_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT=convert_half  -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT3=convert_half3 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DMNN_SUPPORT_FP16";
    } else if (mPrecisionLevel == 0) {// Fp16 Memory and fp32 compute
        buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=float  -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT=convert_float -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT3=convert_float3 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DCONVERT_FLOAT=convert_half  -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT3=convert_half3 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DMNN_SUPPORT_FP16";
    } else {// Fp32 Memory and fp32 compute
        buildOptionsStr = "-DFLOAT=float -DFLOAT2=float2 -DFLOAT3=float3 -DFLOAT4=float4 -DFLOAT8=float8 -DFLOAT16=float16 -DCOMPUTE_FLOAT=float  -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT=convert_float  -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT3=convert_float3 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT=convert_float  -DCONVERT_FLOAT2=convert_float2 -DCONVERT_FLOAT3=convert_float3 -DCONVERT_FLOAT4=convert_float4 -DCONVERT_FLOAT8=convert_float8 -DCONVERT_FLOAT16=convert_float16";
    }
    
    if(nullptr != input){
        if(input->getType().code == halide_type_int) {
            buildOptionsStr += " -DINPUT_TYPE_I=int";
            buildOptionsStr += " -DINPUT_TYPE_I4=int4";
            if(input->getType().bits == 8){
                buildOptionsStr += " -DINPUT_TYPE=char";
                buildOptionsStr += " -DINPUT_TYPE4=char4";
                buildOptionsStr += " -DRI_DATA=read_imagei";
            } else if(input->getType().bits == 32){
                buildOptionsStr += " -DINPUT_TYPE=int";
                buildOptionsStr += " -DINPUT_TYPE4=int4";
                buildOptionsStr += " -DRI_DATA=read_imagei";
            } else {
                MNN_PRINT("opencl input datatype not support, bit:%d\n", input->getType().bits);
                MNN_ASSERT(false);
            }
        } else if(input->getType().code == halide_type_uint){
            buildOptionsStr += " -DINPUT_TYPE_I=uint";
            buildOptionsStr += " -DINPUT_TYPE_I4=uint4";
            if(input->getType().bits == 8){
                buildOptionsStr += " -DINPUT_TYPE=uchar";
                buildOptionsStr += " -DINPUT_TYPE4=uchar4";
                buildOptionsStr += " -DRI_DATA=read_imageui";
            } else if(input->getType().bits == 32){
                buildOptionsStr += " -DINPUT_TYPE=uint";
                buildOptionsStr += " -DINPUT_TYPE4=uint4";
                buildOptionsStr += " -DRI_DATA=read_imageui";
            } else {
                MNN_PRINT("opencl input datatype not support, bit:%d\n", input->getType().bits);
                MNN_ASSERT(false);
            }
        } else {
            if(mIsSupportedFP16){
                buildOptionsStr += " -DINPUT_TYPE_I=half";
                buildOptionsStr += " -DINPUT_TYPE_I4=half4";
                buildOptionsStr += " -DINPUT_TYPE=half";
                buildOptionsStr += " -DINPUT_TYPE4=half4";
                buildOptionsStr += " -DINPUT_TYPE16=half16";
                buildOptionsStr += " -DRI_DATA=read_imageh";
            }else{
                buildOptionsStr += " -DINPUT_TYPE_I=float";
                buildOptionsStr += " -DINPUT_TYPE_I4=float4";
                buildOptionsStr += " -DINPUT_TYPE=float";
                buildOptionsStr += " -DINPUT_TYPE4=float4";
                buildOptionsStr += " -DINPUT_TYPE16=float16";
                buildOptionsStr += " -DRI_DATA=read_imagef";
            }
        }
    }
    
    if(nullptr != output){
        if(output->getType().code == halide_type_int) {
            buildOptionsStr += " -DOUTPUT_TYPE_I=int";
            buildOptionsStr += " -DOUTPUT_TYPE_I4=int4";
            buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_int4";
            if(output->getType().bits == 8){
                buildOptionsStr += " -DOUTPUT_TYPE=char";
                buildOptionsStr += " -DOUTPUT_TYPE4=char4";
                buildOptionsStr += " -DOUTPUT_TYPE16=char16";
                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_char4";
                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_char16";
                buildOptionsStr += " -DWI_DATA=write_imagei";
            } else if(output->getType().bits == 32){
                buildOptionsStr += " -DOUTPUT_TYPE=int";
                buildOptionsStr += " -DOUTPUT_TYPE4=int4";
                buildOptionsStr += " -DOUTPUT_TYPE16=int16";
                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_int4";
                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_int16";
                buildOptionsStr += " -DWI_DATA=write_imagei";
            } else {
                MNN_PRINT("opencl output datatype not support, bit:%d\n", output->getType().bits);
                MNN_ASSERT(false);
            }
        } else if(output->getType().code == halide_type_uint){
            buildOptionsStr += " -DOUTPUT_TYPE_I=uint";
            buildOptionsStr += " -DOUTPUT_TYPE_I4=uint4";
            buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_uint4";
            if(output->getType().bits == 8){
                buildOptionsStr += " -DOUTPUT_TYPE=uchar";
                buildOptionsStr += " -DOUTPUT_TYPE4=uchar4";
                buildOptionsStr += " -DOUTPUT_TYPE16=uchar16";
                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_uchar4";
                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_uchar16";
                buildOptionsStr += " -DWI_DATA=write_imageui";
            } else if(output->getType().bits == 32){
                buildOptionsStr += " -DOUTPUT_TYPE=uint";
                buildOptionsStr += " -DOUTPUT_TYPE4=uint4";
                buildOptionsStr += " -DOUTPUT_TYPE16=uint16";
                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_uint4";
                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_uint16";
                buildOptionsStr += " -DWI_DATA=write_imageui";
            } else {
                MNN_PRINT("opencl output datatype not support, bit:%d\n", output->getType().bits);
                MNN_ASSERT(false);
            }
        } else {
            if(mIsSupportedFP16){
                buildOptionsStr += " -DOUTPUT_TYPE_I=half";
                buildOptionsStr += " -DOUTPUT_TYPE_I4=half4";
                buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_half4";
                buildOptionsStr += " -DOUTPUT_TYPE=half";
                buildOptionsStr += " -DOUTPUT_TYPE4=half4";
                buildOptionsStr += " -DOUTPUT_TYPE16=half16";
                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_half4";
                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_half16";
                buildOptionsStr += " -DWI_DATA=write_imageh";
            }else{
                buildOptionsStr += " -DOUTPUT_TYPE_I=float";
                buildOptionsStr += " -DOUTPUT_TYPE_I4=float4";
                buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_float4";
                buildOptionsStr += " -DOUTPUT_TYPE=float";
                buildOptionsStr += " -DOUTPUT_TYPE4=float4";
                buildOptionsStr += " -DOUTPUT_TYPE16=float16";
                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_float4";
                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_float16";
                buildOptionsStr += " -DWI_DATA=write_imagef";
            }
        }
    }
    
    if(isSetWorkGroupAttribute) {
        buildOptionsStr += " -DSET_ATTRIBUTE=true";
    } else {
        buildOptionsStr += " -DSET_ATTRIBUTE=false";
    }
    for (auto &option : buildOptions) {
        buildOptionsStr += " " + option;
    }
    buildOptionsStr += mDefaultBuildParams;
    auto key = std::make_tuple(programName, buildOptionsStr);

    auto buildProgramInter = mBuildProgramMap.find(key);
    cl::Program program;
    if (buildProgramInter != mBuildProgramMap.end()) {
        program = buildProgramInter->second.program;
    } else {
        this->loadProgram(programName, &program);
        auto status = this->buildProgram(buildOptionsStr, &program);
        if (!status) {
            FUNC_PRINT_ALL(programName.c_str(), s);
            return nullptr;
        }
        ProgramWithKernel pwk;
        pwk.program = program;
        mBuildProgramMap.emplace(key, pwk);
        buildProgramInter = mBuildProgramMap.find(key);
    }
    auto kiter = buildProgramInter->second.kernels.find(kernelName);
    std::shared_ptr<cl::Kernel> kernel;
    bool firstCreate = false;
    if (kiter == buildProgramInter->second.kernels.end()) {
        KernelPool pool;
        buildProgramInter->second.kernels.insert(std::make_pair(kernelName, pool));
        kiter = buildProgramInter->second.kernels.find(kernelName);
        firstCreate = true;
    }
    if (kiter->second.recycle.empty()) {
        cl_int res;
        kernel.reset(new cl::Kernel(program, kernelName.c_str(), &res));
        if(res != CL_SUCCESS) {
            MNN_ERROR("getKernel: %s error, res:%d\n", kernelName.c_str(), res);
            return nullptr;
        }
        if (firstCreate) {
            kernel->getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WORK_GROUP_SIZE, &kiter->second.maxWorkGroupSize);
        }
    } else {
        kernel = kiter->second.recycle.front();
        kiter->second.recycle.pop();
    }
    std::shared_ptr<KernelWrap> kw(new KernelWrap(kernel, &kiter->second));
    return kw;
}

std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernelFromSource(const std::string& source, const std::string &kernelName,
                                                const std::set<std::string> &buildOptions) {
    std::string buildOptionsStr;
    if (mIsSupportedFP16) {
        buildOptionsStr = "-DFLOAT=half -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT4=convert_half4 -DMNN_SUPPORT_FP16";
    } else {
        buildOptionsStr = "-DFLOAT=float -DFLOAT4=float4 -DFLOAT8=float8 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT4=convert_float4";
    }
    
    if(isSetWorkGroupAttribute) {
        buildOptionsStr += " -DSET_ATTRIBUTE=true";
    } else {
        buildOptionsStr += " -DSET_ATTRIBUTE=false";
    }
    for (auto &option : buildOptions) {
        buildOptionsStr += " " + option;
    }
    buildOptionsStr += mDefaultBuildParams;
    
    cl::Program::Sources sources;
    sources.push_back(source);
    cl::Program program = cl::Program(context(), sources);
    auto status = this->buildProgram(buildOptionsStr, &program);
    if (!status) {
        FUNC_PRINT_ALL(kernelName.c_str(), s);
    }
    // mBuildProgramMap.emplace(key, program);

    cl_int res;
    std::shared_ptr<cl::Kernel> kernel;
    kernel.reset(new cl::Kernel(program, kernelName.c_str(), &res));
    MNN_CHECK_CL_SUCCESS(res, "getKernel");
    std::shared_ptr<KernelWrap> kw(new KernelWrap(kernel, nullptr));
    return kw;
}


uint64_t OpenCLRuntime::getMaxWorkGroupSize(std::shared_ptr<KernelWrap> kernel) {
    if (nullptr != kernel->mRecycle) {
        return kernel->mRecycle->maxWorkGroupSize;
    }
    uint64_t maxWorkGroupSize = 0;
    kernel->get().getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WORK_GROUP_SIZE, &maxWorkGroupSize);
    return maxWorkGroupSize;
}

uint64_t OpenCLRuntime::GetKernelWaveSize(std::shared_ptr<KernelWrap> kernel) {
    uint64_t kernelWaveSize = 0;
    kernel->get().getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WAVE_SIZE_QCOM, &kernelWaveSize);
    return kernelWaveSize;
}

std::vector<uint32_t> OpenCLRuntime::getMaxWorkItemSizes() {
    return mMaxWorkIterms;
}

uint64_t OpenCLRuntime::getMaxLocalMem() const {
    return mMaxLocalMemSize;
}
double OpenCLRuntime::getCostTime(const cl::Event *event){
    //cl_int res = mCommandQueuePtr->finish();
    cl_int res = event->wait();
    MNN_CHECK_CL_SUCCESS(res, "clEvent");
    mStartNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_START>();
    mStopNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_END>();
    mKernelTime += (unsigned int)((mStopNanos - mStartNanos) / 1000.0);
    return (mStopNanos - mStartNanos) / 1000.0;
}

double OpenCLRuntime::getQueuedTime(const cl::Event *event){
    //cl_int res = mCommandQueuePtr->finish();
    cl_int res = event->wait();
    MNN_CHECK_CL_SUCCESS(res, "clEvent");
    return (event->getProfilingInfo<CL_PROFILING_COMMAND_START>() - event->getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>()) / 1000.0;
}

double OpenCLRuntime::getSubmitTime(const cl::Event *event){
    //cl_int res = mCommandQueuePtr->finish();
    cl_int res = event->wait();
    MNN_CHECK_CL_SUCCESS(res, "clEvent");
    return (event->getProfilingInfo<CL_PROFILING_COMMAND_START>() - event->getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()) / 1000.0;
}


std::pair<const void*, size_t> OpenCLRuntime::makeCache(void* tuneInfo) {
    auto tune = reinterpret_cast<MNN::OpenCL::TuneInfo*>(tuneInfo);
    std::unique_ptr<CacheT> cache(new CacheT);
    for (auto& p : tune->mInfos) {
        cache->tuned.emplace_back(std::move(p));
    }
    tune->mInfos.clear();
    // Get All program's binary
    for (auto& iter : mBuildProgramMap) {
        std::unique_ptr<ShaderT> pro(new ShaderT);
        auto program = iter.second.program;
        auto bufferSize = iter.second.BufferSize;
        // Only use first one
        pro->program = std::get<0>(iter.first);
        pro->buildInfo = std::get<1>(iter.first);
        
        //MNN_PRINT("%s - %s - %s\n", pro->program.c_str(), pro->kernel.c_str(), pro->buildInfo.c_str());
        if(bufferSize != 0){
            pro->buffer.resize(bufferSize);
            ::memcpy(pro->buffer.data(), iter.second.Buffer.get(), bufferSize);
            cache->programs.emplace_back(std::move(pro));
            continue;
        }
        auto devicesNumber = program.getInfo<CL_PROGRAM_NUM_DEVICES>();
        auto devices = program.getInfo<CL_PROGRAM_DEVICES>();
        auto binSizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
        if (binSizes.empty() || devices.empty()) {
            MNN_ERROR("Can't load binary, binarySize:%lu, deviceSize:%lu\n", binSizes.size(), devices.size());
            continue;
        }
        
        pro->buffer.resize(binSizes[0]);
        auto proRaw = program.get();
        auto c = pro->buffer.data();
        clGetProgramInfo(proRaw, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &c, nullptr);
        cache->programs.emplace_back(std::move(pro));
    }
    // Get All Autotuning cache
    for (auto& iter : mTunedLws) {
        std::unique_ptr<AutotuningT> tuning(new AutotuningT);
        tuning->gloablSize = iter.first.second;
        tuning->localSize = iter.second.first;
        tuning->timeCost = iter.second.second;
        tuning->key = iter.first.first;
        cache->tunings.emplace_back(std::move(tuning));
    }

    // Get All GemmInfo cache
    for (auto& iter : mTunedGemmParams) {
        std::unique_ptr<GemmInfoT> tuning(new GemmInfoT);
        tuning->gemmSize = iter.first;
        tuning->paramInfo = iter.second;
        cache->gemm.emplace_back(std::move(tuning));
    }
    
    // Get All PreParam cache
    for(auto& iter : mPreParams){
        std::unique_ptr<PreParamInfoT> info(new PreParamInfoT);
        info->preParamName = iter.first;
        info->preParamData = iter.second;
        cache->preParam.emplace_back(std::move(info));
    }
    
    flatbuffers::FlatBufferBuilder builder;
    auto lastOffset = Cache::Pack(builder, cache.get());
    builder.Finish(lastOffset);
    mBuffer.resize(builder.GetSize());
    ::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
    return std::make_pair(mBuffer.data(), mBuffer.size());
}

bool OpenCLRuntime::setCache(std::pair<const void*, size_t> cache) {
    if (nullptr == cache.first) {
        mBuffer.clear();
        return true;
    }
    
    auto cacheBuffer = GetCache(cache.first);
    
    if(nullptr == cacheBuffer->programs() && nullptr == cacheBuffer->tunings() && nullptr == cacheBuffer->gemm()) {
        return false;
    }
    
    // Load Program
    if (nullptr != cacheBuffer->programs()) {
        auto programs = cacheBuffer->programs();
        for (int i=0; i<programs->size(); ++i) {
            auto shaderInfo = programs->GetAs<Shader>(i);
            if (nullptr == shaderInfo->program()|| nullptr == shaderInfo->buildInfo() || nullptr == shaderInfo->buffer()) {
                MNN_ERROR("Invalid Cache\n");
                return false;
            }
            auto program = shaderInfo->program()->str();
            // Builder Info
            std::string buildinfo = shaderInfo->buildInfo()->str();
            
            auto buffer = shaderInfo->buffer()->data();
            size_t bufferSize = shaderInfo->buffer()->size();
            auto deviceId = mFirstGPUDevicePtr->get();
            auto programRaw = clCreateProgramWithBinary(context().get(), 1, &deviceId, &bufferSize, (const unsigned char**)(&buffer), nullptr, nullptr);
            if (!programRaw) {
                MNN_ERROR("Can't load %s - %s load program\n", program.c_str(), buildinfo.c_str());
                return false;
            }
            auto pro = cl::Program(programRaw);
            auto res = buildProgram(buildinfo, &pro);
            if (!res) {
                MNN_ERROR("Can't build %s - %s load program\n", program.c_str(), buildinfo.c_str());
                return false;
            }
            ProgramWithKernel pwk;
            pwk.program = pro;
            pwk.Buffer.reset(new char[bufferSize]);
            pwk.BufferSize = bufferSize;
            ::memcpy(pwk.Buffer.get(), buffer, bufferSize);
            mBuildProgramMap.insert(std::make_pair(std::make_tuple(program, buildinfo), pwk));
        }
    }

    // Load Auto Tuning Info
    if (nullptr != cacheBuffer->tunings()) {
        auto tuningInfo = cacheBuffer->tunings();
        for (int i=0; i<tuningInfo->size(); ++i) {
            auto tun = tuningInfo->GetAs<Autotuning>(i);
            if (nullptr == tun->gloablSize() || nullptr == tun->localSize() || nullptr == tun->key()) {
                MNN_ERROR("Error tunning info\n");
                return false;
            }
            std::vector<uint32_t> glo(tun->gloablSize()->size());
            for (int v=0; v<glo.size(); ++v) {
                glo[v] = tun->gloablSize()->data()[v];
            }
            std::vector<uint32_t> loc(tun->localSize()->size());
            for (int v=0; v<loc.size(); ++v) {
                loc[v] = tun->localSize()->data()[v];
            }
            uint32_t cost = tun->timeCost();
            mTunedLws.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_pair(loc, cost)));
            mTuneLws[tun->key()->str()].push_back(std::make_pair(glo, std::make_pair(loc, cost)));
        }
    }
    
    // Load Gemm Info
    if (nullptr != cacheBuffer->gemm()) {
        auto tuningInfo = cacheBuffer->gemm();
        for (int i=0; i<tuningInfo->size(); ++i) {
            auto tun = tuningInfo->GetAs<GemmInfo>(i);
            if (nullptr == tun->gemmSize() || nullptr == tun->paramInfo()) {
                MNN_ERROR("Error tunning gemm info\n");
                return false;
            }
            MNN_ASSERT(tun->gemmSize()->size() == 7);
            std::vector<uint32_t> info(tun->gemmSize()->size());
            for (int v=0; v<info.size(); ++v) {
                info[v] = tun->gemmSize()->data()[v];
            }
            MNN_ASSERT(tun->paramInfo()->size() == 14);
            std::vector<uint32_t> params(tun->paramInfo()->size());
            for (int v=0; v<params.size(); ++v) {
                params[v] = tun->paramInfo()->data()[v];
            }
            mTunedGemmParams.insert(std::make_pair(info, params));
            mTuneLws["Xgemm_tune"].push_back(std::make_pair(info, std::make_pair(params, 0)));
        }
    }
    
    //Load PreParam Info
    if(nullptr != cacheBuffer->preParam()){
        auto preParamInfo = cacheBuffer->preParam();
        for(int i = 0; i < preParamInfo->size(); ++i){
            auto info = preParamInfo->GetAs<PreParamInfo>(i);
            if (nullptr == info->preParamName()) {
                MNN_ERROR("Error preParam info\n");
                return false;
            }
            mPreParams.insert(std::make_pair(info->preParamName()->str(), info->preParamData()));
        }
    }
    return true;
}

void OpenCLRuntime::printEventTime(){
#ifdef ENABLE_OPENCL_TIME_PROFILER
    if(mEvents.empty()){
        return;
    }
    int raster_num = 0, raster_time = 0;
    unsigned int conv_time = 0, loop_bg_time = 0, loop_bg_gemm_time = 0, loop_softmax_time = 0, ori_softmax_time = 0;
    unsigned int conv_gemm2_buf_time = 0, conv_gemm1_buf_time = 0;
    unsigned int conv_1x1_buf_time = 0, conv_ori_buf_time = 0, wino_gemm_time = 0;

    std::vector<std::pair<std::string, int>> kernels(mEvents.size());
    for(int i = 0; i < mEvents.size(); ++i){
        auto event = &mEvents[i].second;
        cl_int res = event->wait();
        MNN_CHECK_CL_SUCCESS(res, "clEvent");
        auto StartNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_START>();
        auto StopNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_END>();
        auto kernel_time = (unsigned int)((StopNanos - StartNanos) / 1000.0);
        mKernelTime += kernel_time;
        if (mEvents[i].first.length() >= 15 && mEvents[i].first.substr(0, 15) == "ConvBuf2D-gemm2") {
            conv_gemm2_buf_time += kernel_time;
            conv_time += kernel_time;
        } else if (mEvents[i].first.length() >= 15 && mEvents[i].first.substr(0, 15) == "ConvBuf2D-gemm1") {
            conv_gemm1_buf_time += kernel_time;
            conv_time += kernel_time;
        } else if (mEvents[i].first.length() >= 17 && mEvents[i].first.substr(0, 17) == "ConvBuf2D-conv1x1") {
            conv_1x1_buf_time += kernel_time;
            conv_time += kernel_time;
        } else if (mEvents[i].first.length() >= 13 && mEvents[i].first.substr(0, 13) == "ConvBuf2D-ori") {
            conv_ori_buf_time += kernel_time;
            conv_time += kernel_time;
        } else if (mEvents[i].first.length() >= 11 && mEvents[i].first.substr(0, 11) == "Convolution") {
            conv_time += kernel_time;
        } else if (mEvents[i].first.length() >= 8 && mEvents[i].first.substr(0, 8) == "Strassen") {
            conv_time += kernel_time;
        }
        if((mEvents[i].first.length() >= 10 && mEvents[i].first.substr(0, 10) == "While-gemm")) {
            loop_bg_time += kernel_time;
        }
        if((mEvents[i].first.length() >= 20 && mEvents[i].first.substr(0, 20) == "While-gemm-batchgemm")) {
            loop_bg_gemm_time += kernel_time;
        }
        if((mEvents[i].first.length() >= 18 && mEvents[i].first.substr(0, 18) == "While-gemm-softmax")) {
            loop_softmax_time += kernel_time;
        }
        if((mEvents[i].first.length() >= 7 && mEvents[i].first.substr(0, 7) == "Softmax")) {
            ori_softmax_time += kernel_time;
        }
        if((mEvents[i].first.length() >= 23 && mEvents[i].first.substr(0, 23) == "Conv-winograd-batchgemm")) {
            wino_gemm_time += kernel_time;
            conv_time += kernel_time;
        }
        if((mEvents[i].first.length() >= 6 && mEvents[i].first.substr(0, 6) == "Raster")) {
            raster_num++;
            raster_time += kernel_time;
        }
        
        kernels[i] = std::make_pair(mEvents[i].first, kernel_time);
    }
#ifdef SORT_PROFILE_TIME
    for(int i = 0; i < mEvents.size(); i++) {
        for(int j = i+1; j < mEvents.size(); j++) {
            if(kernels[i].second > kernels[j].second) {
                auto tmp = kernels[i];
                kernels[i].first = kernels[j].first;
                kernels[i].second = kernels[j].second;
                kernels[j].first = tmp.first;
                kernels[j].second = tmp.second;
            }
        }
    }
#endif
    for(int i = 0; i < mEvents.size(); i++) {
        MNN_PRINT("kernel time = %d    us %s\n", kernels[i].second, kernels[i].first.c_str());
    }
    mEvents.clear();
    MNN_PRINT("total kernel time = %d  us, conv time = %d us (gemm2:%d us, gemm1:%d us, 1x1:%d us, ori:%d us, wino: %d us, other: %d us), while gemm time = %d us (core gemm time: %d us, softmax:%d us), ori softmax: %d us, raster[%d] time: %d us\n", mKernelTime, conv_time, conv_gemm2_buf_time, conv_gemm1_buf_time, conv_1x1_buf_time, conv_ori_buf_time, wino_gemm_time, conv_time-conv_gemm2_buf_time-conv_gemm1_buf_time-conv_1x1_buf_time-conv_ori_buf_time-wino_gemm_time, loop_bg_time, loop_bg_gemm_time, loop_softmax_time, ori_softmax_time, raster_num, raster_time);
#endif
}
} // namespace MNN
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								//
 								//  OpenCLRuntime.cpp
 								//  MNN
 								//
 								//  Created by MNN on 2019/02/28.
 								//  Copyright © 2018, Alibaba Group Holding Limited
 								//
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								#include "backend/opencl/core/runtime/OpenCLRuntime.hpp"
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								#include <cstdlib>
 								#include <memory>
 								#include <string>
 								#include <utility>
 								#include <vector>
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								#include "core/Macro.h"
-												[MNN:Sync] Sync internal gitlab

											
										
										
											2022-01-04 10:50:40 +08:00
+								#include "OpenCLTuneInfo.hpp"
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								//#define MNN_OPEN_TIME_TRACE
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								#include <MNN/AutoTime.hpp>
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								#include "CLCache_generated.h"
-												MNN:Sync: Sync Internal 3.0.0

											
										
										
											2024-11-18 14:37:45 +08:00
+								#include "backend/opencl/execution/cl/opencl_source_map.hpp"
 								//#define ARM_OPENCL_PRINTF_DEBUG
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								using namespace CLCache;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								namespace MNN {
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								extern const std::map<std::string, const char*> OpenCLProgramMap;
-												[MNN:Sync] Sync Internal 2.3.1

											
										
										
											2023-02-15 10:30:27 +08:00
+								extern std::mutex gCLMutex;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
 								bool OpenCLRuntime::getDeviceSupportsExtension(const cl::Device &device, const char *extensionName) {
 								    std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
 								    auto pos               = extensions.find(extensionName);
 								    return (pos != std::string::npos);
 								}
-												MNN:Sync: Sync Internal 3.0.0

											
										
										
											2024-11-18 14:37:45 +08:00
+								#ifdef ARM_OPENCL_PRINTF_DEBUG
 								static void callback(const char *buffer, size_t length, size_t final, void *user_data)
 								{
 								    fwrite(buffer, 1, length, stdout);
 								}
 								#endif
-												MNN:Sync: Sync Internal 3.0.5

											
										
										
											2025-02-12 11:14:19 +08:00
+								OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int platformSize, int platformId, int deviceId, void *contextPtr, void *glShared, const RuntimeHint& hint) {
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								#ifdef LOG_VERBOSE
 								    MNN_PRINT("start OpenCLRuntime !\n");
 								#endif
 								    mDefaultBuildParams = " -cl-mad-enable";
 								    std::vector<cl::Platform> platforms;
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								    cl_int res = cl::Platform::get(&platforms, platformSize);
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								    MNN_CHECK_CL_SUCCESS(res, "getPlatform");
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								    if(platforms.size() > 0 && res == CL_SUCCESS) {
 								        if(platformId >= platforms.size() || platformId < 0) {
 								            platformId = 0;
-												[MNN:Sync] Sync Internal 2.5.3

											
										
										
											2023-06-16 09:42:45 +08:00
+								        }
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								        cl::Platform::setDefault(platforms[platformId]);
 								        std::vector<cl::Device> gpuDevices;
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								        res = platforms[platformId].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
 								        if(1 <= gpuDevices.size() && res == CL_SUCCESS) {
 								            if(deviceId >= gpuDevices.size() || deviceId < 0) {
 								                deviceId = 0;
 								            }
 								            mFirstGPUDevicePtr = std::make_shared<cl::Device>(gpuDevices[deviceId]);
 								            if(mFirstGPUDevicePtr == nullptr) {
 								                mIsCreateError = true;
 								                return;
 								            }
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            const std::string deviceName    = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NAME>();
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								            mDeviceName = deviceName;
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            const std::string deviceVersion = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VERSION>();
-												MNN:Sync: Sync Internal 3.0.4

											
										
										
											2025-01-22 14:47:50 +08:00
+								            std::map<std::string, std::pair<MNN::MaliAr, MNN::GpuLevel>> maliArMap {
 								                {"Mali-T860", {MIDGARD, LOW}},
 								                {"Mali-T880", {MIDGARD, LOW}},
 								                {"Mali-G31", {BIFROST, LOW}},
 								                {"Mali-G51", {BIFROST, LOW}},
 								                {"Mali-G52", {BIFROST, LOW}},
 								                {"Mali-G71", {BIFROST, LOW}},
 								                {"Mali-G72", {BIFROST, LOW}},
 								                {"Mali-G76", {BIFROST, MEDIUM}},
 								                {"Mali-G57", {VALHALL, LOW}},
 								                {"Mali-G68", {VALHALL, LOW}},
 								                {"Mali-G77", {VALHALL, MEDIUM}},
 								                {"Mali-G78", {VALHALL, MEDIUM}},
 								                {"Mali-G310", {VALHALL, LOW}},
 								                {"Mali-G510", {VALHALL, LOW}},
 								                {"Mali-G610", {VALHALL, LOW}},
 								                {"Mali-G615", {VALHALL, LOW}},
 								                {"Mali-G710", {VALHALL, TOP}},
 								                {"Mali-G715", {VALHALL, TOP}},
-												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes

											
										
										
											2019-09-26 21:02:07 +08:00
+								            };
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
-												beta 0.2.0.9
- fix quantization tool compiling on Windows
- fix converter compiling on Windows
- fix eltwise optimization on Windows
- separate sse & avx for Windows
- add LeakyReLU support for TensorFlow
- fix reshape, const for TensorFlow
- fix dimension format error for ONNX ops
- optimize winograd, ReLU for OpenCL
- add fp16 availability & dimensions size check-up for OpenCL
- optimize GEMM for arm32
- fix ExpandDims shape calculation when inputs size == 1

											
										
										
											2019-09-01 19:25:26 +08:00
+								            const std::string deviceVendor  = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VENDOR>();
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            cl_command_queue_properties properties = 0;
-												add macro ENABLE_OPENCL_TIME_PROFILER

											
										
										
											2020-06-23 18:04:00 +08:00
+								        #ifdef ENABLE_OPENCL_TIME_PROFILER
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            properties |= CL_QUEUE_PROFILING_ENABLE;
 								        #endif
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								            cl_int res;
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            // if device is QUALCOMM's and version is 2.0 , set spacial optimized param
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								            sscanf(deviceVersion.c_str(), "%*s%f%*s", &mCLVersion);
 								        #ifdef MNN_OPENCL_SVM_ENABLE
 								            if(mCLVersion > 1.99f && (false == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isSvmError())) {
 								                res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_SVM_CAPABILITIES, &mSvmCapabilities);
-												[Sync] Sync Internal 2.2.2

											
										
										
											2022-11-18 22:35:31 +08:00
+								                #ifdef LOG_VERBOSE
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								                if (res != CL_SUCCESS || mSvmCapabilities == 0) {
 								                    MNN_PRINT("SVM capalibilties: NONE\n");
 								                } else {
 								                    if (mSvmCapabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
 								                        MNN_PRINT("SVM capalibilties: SVM_FINE_GRAIN_BUFFER\n");
 								                        if (mSvmCapabilities & CL_DEVICE_SVM_ATOMICS) {
 								                            MNN_PRINT("SVM capalibilties: SVM_ATOMICS\n");
 								                        }
 								                    } else if (mSvmCapabilities & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
 								                        MNN_PRINT("SVM capalibilties: SVM_COARSE_GRAIN_BUFFER\n");
 								                    }
 								                }
-												[Sync] Sync Internal 2.2.2

											
										
										
											2022-11-18 22:35:31 +08:00
+								                #endif
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								            }
 								        #endif
-												MNN:Sync: Sync Internal 2.9.6

											
										
										
											2024-10-14 19:26:28 +08:00
+								            if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) {
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								                mGpuType = ADRENO;
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								                // if device is QUALCOMM's and version is 2.0 , set spacial optimized param
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
+								                //if Adreno version is less than Adreno512, donot set WorkGroupAttribute option
 								                std::string adrenoVersion = deviceVersion.substr(deviceVersion.size()-3);
-												MNN:Sync: sync internal 3.0.3

											
										
										
											2024-12-31 15:34:08 +08:00
+								                // MNN_PRINT("Adreno Version:%s   %s\n", deviceVersion.c_str(), adrenoVersion.c_str());
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								                if(mCLVersion > 1.99f && adrenoVersion >= "512") {
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								                    isSetWorkGroupAttribute = true;
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
+								                }
-												MNN:Sync: sync internal 3.0.3

											
										
										
											2024-12-31 15:34:08 +08:00
+								                // 8Gen1 and after
 								                if(adrenoVersion >= "730") {
 								                    mGpuLevel = TOP;
 								                }
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            } else if (deviceName.find("Mali") != std::string::npos) {
 								                mGpuType = MALI;
-												[MNN:Sync] Sync Internal 2.4.1

											
										
										
											2023-03-20 11:32:29 +08:00
+								                if(maliArMap.find(deviceName) != maliArMap.end()){
-												MNN:Sync: Sync Internal 3.0.4

											
										
										
											2025-01-22 14:47:50 +08:00
+								                    mMaliAr = maliArMap[deviceName].first;
 								                    mGpuLevel = maliArMap[deviceName].second;
-												[MNN:Sync] Sync Internal 2.4.1

											
										
										
											2023-03-20 11:32:29 +08:00
+								                }else{
 								                    mMaliAr = VALHALL;
-												MNN:Sync: Sync Internal 3.0.4

											
										
										
											2025-01-22 14:47:50 +08:00
+								                    mGpuLevel = UNDEFINED;
-												[MNN:Sync] Sync Internal 2.4.1

											
										
										
											2023-03-20 11:32:29 +08:00
+								                }
-												beta 0.2.0.9
- fix quantization tool compiling on Windows
- fix converter compiling on Windows
- fix eltwise optimization on Windows
- separate sse & avx for Windows
- add LeakyReLU support for TensorFlow
- fix reshape, const for TensorFlow
- fix dimension format error for ONNX ops
- optimize winograd, ReLU for OpenCL
- add fp16 availability & dimensions size check-up for OpenCL
- optimize GEMM for arm32
- fix ExpandDims shape calculation when inputs size == 1

											
										
										
											2019-09-01 19:25:26 +08:00
+								            } else if (deviceVendor.find("Advanced Micro Devices") != std::string::npos) {
 								                // Radeon series GPU is main product of Advanced Micro Devices (AMD)
 								                mGpuType = RADEON;
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								                isSetWorkGroupAttribute = true;
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								            }
 								            else if (deviceVendor.find("Intel") != std::string::npos) {
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								                mGpuType = INTEL;
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								#ifdef MNN_SUPPORT_INTEL_SUBGROUP
-												[MNN:Sync] Sync Internal Gitlab: 2.5.1

											
										
										
											2023-05-18 19:11:50 +08:00
+								                const std::string extensions = mFirstGPUDevicePtr->getInfo<CL_DEVICE_EXTENSIONS>();
 								                if (extensions.find("cl_intel_subgroups") != std::string::npos) {
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								                    mSupportedIntelSubgroup = true;
 								                    uint32_t execution_units_count = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
 								                    uint32_t num_threads_per_eu = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NUM_THREADS_PER_EU_INTEL>();
 								                    uint32_t maxThreadsPerExecutionUnit = num_threads_per_eu > 0 ? num_threads_per_eu : 7;
 								                    mMaxThreadsPerDevice =  maxThreadsPerExecutionUnit * execution_units_count;
 								                }
-												[MNN:Sync] Sync Internal 2.6.2

											
										
										
											2023-07-31 14:24:48 +08:00
+								#endif
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								            }
 								            else {
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								                mGpuType = OTHER;
 								            }
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								            const std::string extensions = platforms[0].getInfo<CL_PLATFORM_EXTENSIONS>();
-												[MNN:Sync] Sync Internal 2.3.0

											
										
										
											2022-12-30 15:18:58 +08:00
+								            bool isPriorityHint = (extensions.find("cl_khr_priority_hints") != std::string::npos);
-												MNN:Sync: Sync Interal 3.0.2

											
										
										
											2024-12-19 16:20:00 +08:00
+								            std::vector<cl_context_properties> context_properties;
 								            if(mGpuType == ADRENO && !isPriorityHint){
 								                context_properties.push_back(CL_CONTEXT_PERF_HINT_QCOM);
 								                context_properties.push_back(CL_PERF_HINT_HIGH_QCOM);
 								                context_properties.push_back(CL_CONTEXT_PRIORITY_HINT_QCOM);
 								                context_properties.push_back(CL_PRIORITY_HINT_LOW_QCOM);
 								                mIsDeviceSupportedLowPower = true;
 								            }
 								            #ifdef ARM_OPENCL_PRINTF_DEBUG
 								            context_properties.push_back(CL_PRINTF_CALLBACK_ARM);
 								            context_properties.push_back((cl_context_properties)callback);
 								            context_properties.push_back(CL_PRINTF_BUFFERSIZE_ARM);
 								            context_properties.push_back(0x1000);
 								            #endif
 								            std::string deviceextensions = mFirstGPUDevicePtr.get()->getInfo<CL_DEVICE_EXTENSIONS>();
 								#ifdef MNN_USE_LIB_WRAPPER
 								            mIsSupportAHD = (getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_import_memory_android_hardware_buffer")
 								                 && mGpuType == MALI && OpenCLSymbolsOperator::getOpenclSymbolsPtr()->getFuncAddress(platforms[platformId](), "clImportMemoryARM"))
 								                 || (mGpuType == ADRENO && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_android_ahardwarebuffer_host_ptr"));
 								#endif
-												[MNN:Sync] Sync Internal 2.8.3

											
										
										
											2024-03-13 14:55:54 +08:00
+								            if(nullptr != contextPtr){
-												MNN:Sync: Sync Interal 3.0.2

											
										
										
											2024-12-19 16:20:00 +08:00
+								                mContext = std::shared_ptr<cl::Context>((cl::Context*)contextPtr, [](void* ptr) {
 								                    // Do nothing
 								                });
-												- dynamic computation graph (beta)
	- add supports (/express)
	- add tests
	- add benchmarks with it (/benchmark/exprModels)
- Python
	- MNN engine and tools were submitted to pip
	- available on Windows/macOS/Linux
- Engine/Converter
	- add supports for each op benchmarking
	- refactor optimizer by separating steps
- CPU
	- add supports for Conv3D, Pool3D, ELU, ReverseSequence
	- fix ArgMax, Permute, Scale, BinaryOp, Slice, SliceTf
- OpenCL
	- add half transform in CPU
	- add broadcast supports for binary
	- optimize Conv2D, Reshape, Eltwise, Gemm, etc.
- OpenGL
	- add sub, real div supports for binary
	- add supports for unary
	- optimize Conv2D, Reshape
- Vulkan
	- add max supports for eltwise
- Metal
	- fix metallib missing problem
- Train/Quantization
	- use express to refactor training codes

											
										
										
											2019-09-26 21:02:07 +08:00
+								            }else{
-												MNN:Bugfix: Fix bug for mac os opencl init error

											
										
										
											2024-12-20 11:14:30 +08:00
+								                if(context_properties.size() > 0){
 								                    context_properties.push_back(0);
 								                    mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), context_properties.data(), nullptr, nullptr, &res));
 								                }else{
 								                    mContext = std::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>({*mFirstGPUDevicePtr}), nullptr, nullptr, nullptr, &res));
 								                }
-												MNN:Sync: Sync Interal 3.0.2

											
										
										
											2024-12-19 16:20:00 +08:00
+								            }
 								            MNN_CHECK_CL_SUCCESS(res, "context");
 								            if (res != CL_SUCCESS) {
 								                mIsCreateError = true;
 								                return;
-												[MNN:Sync] Sync Internal 2.3.0

											
										
										
											2022-12-30 15:18:58 +08:00
+								            }
 								            mIsDeviceSupportedLowPower = (mIsDeviceSupportedLowPower || isPriorityHint);
 								            #ifdef MNN_USE_LIB_WRAPPER
 								            if(isPriorityHint)
 								            {
 								                if(true == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isPropError())
 								                {
 								                    mIsCreateError = true;
 								                    return;
 								                }
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
-												[MNN:Sync] Sync Internal Gitlab

											
										
										
											2023-02-28 10:41:24 +08:00
+								                cl_queue_properties prop[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR,
 								#ifdef ENABLE_OPENCL_TIME_PROFILER
 								                    CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
 								#endif
 };
-												[MNN:Sync] Sync Internal 2.3.0

											
										
										
											2022-12-30 15:18:58 +08:00
+								                mCommandQueuePtr.reset(new cl::CommandQueue(clCreateCommandQueueWithProperties((*mContext).get(), (*mFirstGPUDevicePtr).get(), prop, &res)));
 								            }
 								            else
 								            #endif
 								            {
 								                mCommandQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, properties, &res);
 								            }
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								            MNN_CHECK_CL_SUCCESS(res, "commandQueue");
-												[MNN:Sync] Sync Internal 2.3.0

											
										
										
											2022-12-30 15:18:58 +08:00
+								            if (res != CL_SUCCESS) {
 								                mIsCreateError = true;
 								                return;
 								            }
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								#ifdef ENABLE_OPENCL_TIME_PROFILER
 								            mCommandQueueTuning = mCommandQueuePtr;
 								#else
 								            mCommandQueueTuning = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, CL_QUEUE_PROFILING_ENABLE, &res);
 								#endif
 								            mCurrentCommandQueue = mCommandQueuePtr.get();
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								            mFirstGPUDevicePtr->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &mGPUGlobalMemeryCacheSize);
 								            mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &mGPUComputeUnits);
 								            mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &mMaxFreq);
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								            mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &mMaxMemAllocSize);
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								            mFirstGPUDevicePtr->getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &mMaxLocalMemSize);
 								            mMaxWorkGroupSize = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
 								            //set gpu mode, tuning level and memory object
 								            setGpuMode(cl_mode);
 								            if(mMemType == AUTO) {
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								                if(mGpuType == MALI || mGpuType == INTEL) {
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								                    mMemType = BUFFER;
 								                } else {
 								                    mMemType = IMAGE;
 								                }
 								            }
-												OpenCL:Bugfix: Fix bug for memory mode not valid when create backend

											
										
										
											2024-12-23 15:31:02 +08:00
+								            setPrecision(precision);
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
-.2.1.5

# integration
- add travis CI
- fix building parameters for python

# converter
- add half storage option for MNN converter
- fix op name lost in converter
- fix converter bug for print input output, identity remove output

# ops
- add quantized Convolution & Deconvolution support on OpenCL
- add more expression supports
- add DetectionPostProcess Op for TensorFlow Lite (ssd is supported directly now)
- add supports for LSTM & ELU for ONNX
- add support for Convolution that weights is not constant for ONNX
- fix Unary Op compile error on Linux
- fix Metal backend buffer reuse after resize
- fix Metal raw memory access after model releasing
- fix redundant transpose in Winograd generater

											
										
										
											2019-11-15 14:22:45 +08:00
+								            if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_int8")){
 								                mSupportDotInt8 = true;
 								            }
 								            if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_accumulate_int8")){
 								                mSupportDotAccInt8 = true;
 								            }
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
-												[MNN:Sync] Sync Internal 2.5.3

											
										
										
											2023-06-16 09:42:45 +08:00
+								#if !defined(ENABLE_OPENCL_TIME_PROFILER) && defined(MNN_USE_LIB_WRAPPER)
 								            {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                if((false == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isQcomError())
 								                   && getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_qcom_recordable_queues")
 								                   && (cl_mode & MNN_GPU_RECORD_OP || cl_mode & MNN_GPU_RECORD_BATCH)){
-												[MNN:Sync] Sync Internal 2.6.3

											
										
										
											2023-08-21 14:51:54 +08:00
+								                    uint32_t MaxRecordableQueueSize = mFirstGPUDevicePtr->getInfo<CL_DEVICE_RECORDABLE_QUEUE_MAX_SIZE>();
-												[MNN:Sync] Sync Internal 2.5.3

											
										
										
											2023-06-16 09:42:45 +08:00
+								                    cl_int err;
-												[MNN:Sync] Sync Internal 2.8.1

											
										
										
											2023-12-27 17:26:44 +08:00
+								                    if(MaxRecordableQueueSize > 0){
-												MNN:Sync: Sync Internal 3.0.5

											
										
										
											2025-02-12 11:14:19 +08:00
+								                        mUseRecordableQueueSize = hint.encorderNumForCommit;
-												[MNN:Sync] Sync Internal 2.6.3

											
										
										
											2023-08-21 14:51:54 +08:00
+								                        mUseRecordableQueueSize = MaxRecordableQueueSize < mUseRecordableQueueSize ? MaxRecordableQueueSize : mUseRecordableQueueSize;
-												[MNN:Sync] Sync Internal 2.5.3

											
										
										
											2023-06-16 09:42:45 +08:00
+								                        mUseRecordQueue = true;
 								                        mRecordableQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, CL_QUEUE_RECORDABLE_QCOM, &err);
 								                        if(err != CL_SUCCESS){
 								                            mIsCreateError = true;
 								                            return;
 								                        }
 								                    }
 								                }
 								            }
 								#endif
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								        }else{
 								            mIsCreateError = true;
-												MNN:Sync: sync internal 3.0.3

											
										
										
											2024-12-31 15:34:08 +08:00
+								            MNN_ASSERT(1 <= gpuDevices.size());
-												beta 0.2.0.2
- CPU
  - add padding support
  - fix bug in permute when channel % 4 != 0
  - fix bug in exp with extreme value
- OpenCL
  - add protecting logics
- OpenGL
  - add protecting logics
  - support NCHW format in Squeeze and Reshape
- Converter
  - add ShuffleChannel support for Caffe
  - add Clip/Transpose/Unary/Pad supports for ONNX

											
										
										
											2019-07-02 18:01:08 +08:00
+								        }
 								    }else{
 								        mIsCreateError = true;
-												MNN:Sync: sync internal 3.0.3

											
										
										
											2024-12-31 15:34:08 +08:00
+								        MNN_ASSERT(platforms.size() > 0);
-												OpenCL:Bugfix: Fix bug for getInfo when create Error

											
										
										
											2024-12-22 17:45:01 +08:00
+								    }
 								    if (mIsCreateError) {
 								        return;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    }
-												MNN:Sync: sync internal 3.0.3

											
										
										
											2024-12-31 15:34:08 +08:00
+								    if (mMemType == IMAGE){
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								        // Init info
 								        size_t max_height, max_width;
 								        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
 								        MNN_CHECK_CL_SUCCESS(res, "image2Dsize");
 								        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
 								        MNN_CHECK_CL_SUCCESS(res, "image2Dsize");
 								        mMaxImageSize = {max_height, max_width};
 								    }
 								    do {
 								        int dims = 3;
 								        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &dims);
 								        MNN_CHECK_CL_SUCCESS(res, "DeviceGetInfo");
 								        if(dims < 3) {
 								            std::vector<uint32_t> workItem(3, 8);
 								            mMaxWorkIterms = workItem;
 								            break;
 								        }
 								        cl::vector<cl::size_type> _workItems(dims, 1);
 								        res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &_workItems);
 								        MNN_CHECK_CL_SUCCESS(res, "DeviceGetInfo");
 								        std::vector<uint32_t> workItems(dims, 1);
 								        for (int i = 0; i < dims; ++i) {
 								            workItems[i] = _workItems[i];
 								        }
 								        mMaxWorkIterms = workItems;
 								    } while(false);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								}
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								void OpenCLRuntime::setGpuMode(const int cl_mode_num) {
 								    int totalSet = 0;
 								    bool isSet = (cl_mode_num & MNN_GPU_MEMORY_BUFFER);
 								    if(isSet) {
 								        mMemType = BUFFER;
 								        totalSet++;
 								    }
 								    isSet = (cl_mode_num & MNN_GPU_MEMORY_IMAGE);
 								    if(isSet) {
 								        mMemType = IMAGE;
 								        totalSet++;
 								    }
 								    if(totalSet > 1) {
 								        MNN_PRINT("set both BUFFER and IMAGE mode is not permitted, please check cl_mode:%x！\n", cl_mode_num);
 								    }
 								    totalSet = 0;
 								    isSet = (cl_mode_num & MNN_GPU_TUNING_NONE);
 								    if(isSet) {
 								        mTuneLevel = None;
 								        totalSet++;
 								    }
 								    isSet = (cl_mode_num & MNN_GPU_TUNING_FAST);
 								    if(isSet) {
 								        mTuneLevel = Fast;
 								        totalSet++;
 								    }
 								    isSet = (cl_mode_num & MNN_GPU_TUNING_NORMAL);
 								    if(isSet) {
 								        mTuneLevel = Normal;
 								        totalSet++;
 								    }
 								    isSet = (cl_mode_num & MNN_GPU_TUNING_HEAVY);
 								    if(isSet) {
 								        mTuneLevel = Heavy;
 								        totalSet++;
 								    }
 								    isSet = (cl_mode_num & MNN_GPU_TUNING_WIDE);
 								    if(isSet) {
 								        mTuneLevel = Wide;
 								        totalSet++;
 								    }
 								    if(totalSet != 1) {
 								        MNN_PRINT("set multi tuning mode is not permitted, please check cl_mode:%x！\n", cl_mode_num);
 								    }
-												[MNN:Sync] Sync Internal 2.6.3

											
										
										
											2023-08-21 14:51:54 +08:00
 								    totalSet = 0;
 								    isSet = (cl_mode_num & MNN_GPU_RECORD_OP);
 								    if(isSet) {
 								        mDevideOpRecord = true;
 								        totalSet++;
 								    }
 								    isSet = (cl_mode_num & MNN_GPU_RECORD_BATCH);
 								    if(isSet) {
 								        mDevideOpRecord = false;
 								        totalSet++;
 								    }
 								    if(totalSet > 1) {
 								        MNN_PRINT("set multi record kernel mode is not permitted, please check cl_mode:%x！\n", cl_mode_num);
 								    }
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								}
-												disable CL_QUEUE_PROFILING_ENABLE onExecute

											
										
										
											2020-06-22 11:23:12 +08:00
+								void OpenCLRuntime::setCommandQueueProfileEnable() {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    mCurrentCommandQueue->finish();
 								    mCurrentCommandQueue = mCommandQueueTuning.get();
-												disable CL_QUEUE_PROFILING_ENABLE onExecute

											
										
										
											2020-06-22 11:23:12 +08:00
+								}
 								void OpenCLRuntime::setCommandQueueProfileDisable() {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    mCurrentCommandQueue->finish();
 								    mCurrentCommandQueue = mCommandQueuePtr.get();
-												disable CL_QUEUE_PROFILING_ENABLE onExecute

											
										
										
											2020-06-22 11:23:12 +08:00
+								}
-												revise cl_flush count

											
										
										
											2020-06-23 17:50:24 +08:00
+								unsigned int OpenCLRuntime::getQueueNum() {
 								    mQueueCount++;
 								    return mQueueCount;
 								}
-												MNN:Sync: Sync Internal 2.9.4

											
										
										
											2024-08-24 15:46:21 +08:00
+								std::map<std::string, uint32_t>& OpenCLRuntime::preParamsMap(){
 								    return mPreParams;
 								}
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								std::map<std::vector<uint32_t>, std::vector<uint32_t>>& OpenCLRuntime::tunedGemmParamsMap() {
 								    return mTunedGemmParams;
 								}
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& OpenCLRuntime::tunedLwsMap() {
-												add tuned LWS save

											
										
										
											2020-07-06 17:48:55 +08:00
+								    return mTunedLws;
 								}
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
-												[MNN:Sync] Sync Internal 2.8.0

											
										
										
											2023-12-04 11:12:20 +08:00
+								std::map<std::string, std::vector<std::pair<std::vector<uint32_t>, std::pair<std::vector<uint32_t>, uint32_t>>>>& OpenCLRuntime::getTuneLwsMap() {
 								    return mTuneLws;
 								}
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								OpenCLRuntime::~OpenCLRuntime() {
 								#ifdef LOG_VERBOSE
 								    MNN_PRINT("start ~OpenCLRuntime !\n");
 								#endif
-												修正打印profile性能在mali平台不稳定的问题

											
										
										
											2023-08-31 15:31:45 +08:00
+								    clearEvent();
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    mBuildProgramMap.clear();
 								    mCommandQueuePtr.reset();
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    mCommandQueueTuning.reset();
-												[MNN:Sync] Sync Internal Gitlab

											
										
										
											2023-07-18 09:36:26 +08:00
+								    mRecordableQueuePtr.reset();
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    mContext.reset();
 								    mFirstGPUDevicePtr.reset();
 								#ifdef LOG_VERBOSE
 								    MNN_PRINT("end ~OpenCLRuntime !\n");
 								#endif
 								}
 								std::vector<size_t> OpenCLRuntime::getMaxImage2DSize() {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    return mMaxImageSize;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								}
 								bool OpenCLRuntime::isSupportedFP16() const {
 								    return mIsSupportedFP16;
 								}
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
 								bool OpenCLRuntime::isDeviceSupportedFP16() const {
 								    return mIsDeviceSupportedFP16;
 								}
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
-												[MNN:Sync] Sync Internal 2.3.0

											
										
										
											2022-12-30 15:18:58 +08:00
+								bool OpenCLRuntime::isDeviceSupportedLowPower() const {
 								    return mIsDeviceSupportedLowPower;
 								}
-.2.1.5

# integration
- add travis CI
- fix building parameters for python

# converter
- add half storage option for MNN converter
- fix op name lost in converter
- fix converter bug for print input output, identity remove output

# ops
- add quantized Convolution & Deconvolution support on OpenCL
- add more expression supports
- add DetectionPostProcess Op for TensorFlow Lite (ssd is supported directly now)
- add supports for LSTM & ELU for ONNX
- add support for Convolution that weights is not constant for ONNX
- fix Unary Op compile error on Linux
- fix Metal backend buffer reuse after resize
- fix Metal raw memory access after model releasing
- fix redundant transpose in Winograd generater

											
										
										
											2019-11-15 14:22:45 +08:00
+								bool OpenCLRuntime::isSupportedDotInt8() const {
 								    return mSupportDotInt8;
 								}
 								bool OpenCLRuntime::isSupportedDotAccInt8() const {
 								    return mSupportDotAccInt8;
 								}
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								bool OpenCLRuntime::isSupportedIntelSubgroup() const {
 								    return mSupportedIntelSubgroup;
 								 }
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								cl::Context &OpenCLRuntime::context() {
 								    return *mContext;
 								}
 								cl::CommandQueue &OpenCLRuntime::commandQueue() {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    return *mCurrentCommandQueue;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								}
-												[MNN:Sync] Sync Internal 2.5.3

											
										
										
											2023-06-16 09:42:45 +08:00
+								cl::CommandQueue &OpenCLRuntime::recordableQueue(){
 								    return *mRecordableQueuePtr;
 								}
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								uint64_t OpenCLRuntime::deviceGlobalMemeryCacheSize() const {
 								    return mGPUGlobalMemeryCacheSize;
 								}
 								uint32_t OpenCLRuntime::deviceComputeUnits() const {
 								    return mGPUComputeUnits;
 								}
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								uint32_t OpenCLRuntime::MaxThreadsPerDevice() const {
 								    return mMaxThreadsPerDevice;
 								}
 								uint32_t OpenCLRuntime::MaxWorkGroupSize() const {
 								    return mMaxWorkGroupSize;
 								}
-												MNN:Sync: Sync Internal 2.9.5

											
										
										
											2024-09-12 12:57:57 +08:00
+								uint32_t OpenCLRuntime::getPrecisionLevel() const {
 								    return mPrecisionLevel;
 								}
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								uint32_t OpenCLRuntime::maxFreq() const {
 								    return mMaxFreq;
 								}
 								uint64_t OpenCLRuntime::maxAllocSize() const {
 								    return mMaxMemAllocSize;
 								}
-												OpenCL:Bugfix: Fix bug for memory mode not valid when create backend

											
										
										
											2024-12-23 15:31:02 +08:00
+								void OpenCLRuntime::setPrecision(const BackendConfig::PrecisionMode precision){
 								    cl_device_fp_config fpConfig;
 								    auto success = mFirstGPUDevicePtr->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fpConfig);
 								    mIsDeviceSupportedFP16     = CL_SUCCESS == success && fpConfig > 0;
 								    bool checkFp16Exetension = getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_khr_fp16");
 								    mIsDeviceSupportedFP16 = (mIsDeviceSupportedFP16 && checkFp16Exetension);
 								    mPrecisionLevel = 1;
 								    if (mIsDeviceSupportedFP16) {
 								        if (precision == BackendConfig::Precision_Low) {
 								            mPrecisionLevel = 2;
 								        } else if (precision == BackendConfig::Precision_Normal && mMemType == BUFFER) {
 								            mPrecisionLevel = 0;
 								        }
 								    }
 								    // Is supported fp16 IO storage
 								    mIsSupportedFP16 = (mPrecisionLevel == 2 || mPrecisionLevel == 0);
 								}
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								bool OpenCLRuntime::loadProgram(const std::string &programName, cl::Program *program) {
-												[MNN:Sync] Sync Internal 2.3.1

											
										
										
											2023-02-15 10:30:27 +08:00
+								    std::lock_guard<std::mutex> lck(gCLMutex);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    auto it_source = OpenCLProgramMap.find(programName);
 								    if (it_source != OpenCLProgramMap.end()) {
 								        cl::Program::Sources sources;
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								        std::string source(it_source->second);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								        sources.push_back(source);
 								        *program = cl::Program(context(), sources);
 								        return true;
 								    } else {
 								        MNN_PRINT("Can't find kernel source !\n");
 								        return false;
 								    }
 								}
 								bool OpenCLRuntime::buildProgram(const std::string &buildOptionsStr, cl::Program *program) {
 								    AUTOTIME;
 								    cl_int ret = program->build({*mFirstGPUDevicePtr}, buildOptionsStr.c_str());
 								    if (ret != CL_SUCCESS) {
 								        if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*mFirstGPUDevicePtr) == CL_BUILD_ERROR) {
 								            std::string buildLog = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(*mFirstGPUDevicePtr);
 								            MNN_PRINT("Program build log: %s \n", buildLog.c_str());
 								        }
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								        MNN_PRINT("Build program failed, err:%d ! \n", ret);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								        return false;
 								    }
 								    return true;
 								}
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
 								std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernel(const std::string &programName, const std::string &kernelName,
 								                                      const std::set<std::string> &buildOptions, const Tensor *input, const Tensor *output) {
 								    auto kwp = buildKernelWithCache(programName, kernelName, buildOptions, input, output, true);
 								    return kwp;
 								}
 								std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernelWithCache(const std::string &programName, const std::string &kernelName,
 								                                      const std::set<std::string> &buildOptions, const Tensor *input, const Tensor *output, bool useCache) {
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    std::string buildOptionsStr;
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
+								    if (mPrecisionLevel == 2) {// Fp16 Memory and fp16 compute
-												MNN:Sync: Sync Internal 3.0.1

											
										
										
											2024-12-02 10:12:08 +08:00
+								        buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=half  -DCOMPUTE_FLOAT2=half2 -DCOMPUTE_FLOAT3=half3 -DCOMPUTE_FLOAT4=half4 -DCOMPUTE_FLOAT8=half8 -DCOMPUTE_FLOAT16=half16 -DCONVERT_COMPUTE_FLOAT=convert_half -DCONVERT_COMPUTE_FLOAT2=convert_half2 -DCONVERT_COMPUTE_FLOAT3=convert_half3 -DCONVERT_COMPUTE_FLOAT4=convert_half4 -DCONVERT_COMPUTE_FLOAT8=convert_half8 -DCONVERT_COMPUTE_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT=convert_half  -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT3=convert_half3 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DMNN_SUPPORT_FP16";
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
+								    } else if (mPrecisionLevel == 0) {// Fp16 Memory and fp32 compute
-												MNN:Sync: Sync Internal 3.0.1

											
										
										
											2024-12-02 10:12:08 +08:00
+								        buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=float  -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT=convert_float -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT3=convert_float3 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DCONVERT_FLOAT=convert_half  -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT3=convert_half3 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DMNN_SUPPORT_FP16";
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
+								    } else {// Fp32 Memory and fp32 compute
-												MNN:Sync: Sync Internal 3.0.1

											
										
										
											2024-12-02 10:12:08 +08:00
+								        buildOptionsStr = "-DFLOAT=float -DFLOAT2=float2 -DFLOAT3=float3 -DFLOAT4=float4 -DFLOAT8=float8 -DFLOAT16=float16 -DCOMPUTE_FLOAT=float  -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT=convert_float  -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT3=convert_float3 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT=convert_float  -DCONVERT_FLOAT2=convert_float2 -DCONVERT_FLOAT3=convert_float3 -DCONVERT_FLOAT4=convert_float4 -DCONVERT_FLOAT8=convert_float8 -DCONVERT_FLOAT16=convert_float16";
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    }
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    if(nullptr != input){
 								        if(input->getType().code == halide_type_int) {
 								            buildOptionsStr += " -DINPUT_TYPE_I=int";
 								            buildOptionsStr += " -DINPUT_TYPE_I4=int4";
 								            if(input->getType().bits == 8){
 								                buildOptionsStr += " -DINPUT_TYPE=char";
 								                buildOptionsStr += " -DINPUT_TYPE4=char4";
 								                buildOptionsStr += " -DRI_DATA=read_imagei";
 								            } else if(input->getType().bits == 32){
 								                buildOptionsStr += " -DINPUT_TYPE=int";
 								                buildOptionsStr += " -DINPUT_TYPE4=int4";
 								                buildOptionsStr += " -DRI_DATA=read_imagei";
 								            } else {
 								                MNN_PRINT("opencl input datatype not support, bit:%d\n", input->getType().bits);
 								                MNN_ASSERT(false);
 								            }
 								        } else if(input->getType().code == halide_type_uint){
 								            buildOptionsStr += " -DINPUT_TYPE_I=uint";
 								            buildOptionsStr += " -DINPUT_TYPE_I4=uint4";
 								            if(input->getType().bits == 8){
 								                buildOptionsStr += " -DINPUT_TYPE=uchar";
 								                buildOptionsStr += " -DINPUT_TYPE4=uchar4";
 								                buildOptionsStr += " -DRI_DATA=read_imageui";
 								            } else if(input->getType().bits == 32){
 								                buildOptionsStr += " -DINPUT_TYPE=uint";
 								                buildOptionsStr += " -DINPUT_TYPE4=uint4";
 								                buildOptionsStr += " -DRI_DATA=read_imageui";
 								            } else {
 								                MNN_PRINT("opencl input datatype not support, bit:%d\n", input->getType().bits);
 								                MNN_ASSERT(false);
 								            }
 								        } else {
 								            if(mIsSupportedFP16){
 								                buildOptionsStr += " -DINPUT_TYPE_I=half";
 								                buildOptionsStr += " -DINPUT_TYPE_I4=half4";
 								                buildOptionsStr += " -DINPUT_TYPE=half";
 								                buildOptionsStr += " -DINPUT_TYPE4=half4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DINPUT_TYPE16=half16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DRI_DATA=read_imageh";
 								            }else{
 								                buildOptionsStr += " -DINPUT_TYPE_I=float";
 								                buildOptionsStr += " -DINPUT_TYPE_I4=float4";
 								                buildOptionsStr += " -DINPUT_TYPE=float";
 								                buildOptionsStr += " -DINPUT_TYPE4=float4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DINPUT_TYPE16=float16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DRI_DATA=read_imagef";
 								            }
 								        }
 								    }
 								    if(nullptr != output){
 								        if(output->getType().code == halide_type_int) {
 								            buildOptionsStr += " -DOUTPUT_TYPE_I=int";
 								            buildOptionsStr += " -DOUTPUT_TYPE_I4=int4";
 								            buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_int4";
 								            if(output->getType().bits == 8){
 								                buildOptionsStr += " -DOUTPUT_TYPE=char";
 								                buildOptionsStr += " -DOUTPUT_TYPE4=char4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DOUTPUT_TYPE16=char16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_char4";
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_char16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DWI_DATA=write_imagei";
 								            } else if(output->getType().bits == 32){
 								                buildOptionsStr += " -DOUTPUT_TYPE=int";
 								                buildOptionsStr += " -DOUTPUT_TYPE4=int4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DOUTPUT_TYPE16=int16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_int4";
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_int16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DWI_DATA=write_imagei";
 								            } else {
-												MNN:Sync: Sync Interal 3.0.2

											
										
										
											2024-12-19 16:20:00 +08:00
+								                MNN_PRINT("opencl output datatype not support, bit:%d\n", output->getType().bits);
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                MNN_ASSERT(false);
 								            }
 								        } else if(output->getType().code == halide_type_uint){
 								            buildOptionsStr += " -DOUTPUT_TYPE_I=uint";
 								            buildOptionsStr += " -DOUTPUT_TYPE_I4=uint4";
 								            buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_uint4";
 								            if(output->getType().bits == 8){
 								                buildOptionsStr += " -DOUTPUT_TYPE=uchar";
 								                buildOptionsStr += " -DOUTPUT_TYPE4=uchar4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DOUTPUT_TYPE16=uchar16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_uchar4";
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_uchar16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DWI_DATA=write_imageui";
 								            } else if(output->getType().bits == 32){
 								                buildOptionsStr += " -DOUTPUT_TYPE=uint";
 								                buildOptionsStr += " -DOUTPUT_TYPE4=uint4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DOUTPUT_TYPE16=uint16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_uint4";
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_uint16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DWI_DATA=write_imageui";
 								            } else {
-												MNN:Sync: Sync Interal 3.0.2

											
										
										
											2024-12-19 16:20:00 +08:00
+								                MNN_PRINT("opencl output datatype not support, bit:%d\n", output->getType().bits);
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                MNN_ASSERT(false);
 								            }
 								        } else {
 								            if(mIsSupportedFP16){
 								                buildOptionsStr += " -DOUTPUT_TYPE_I=half";
 								                buildOptionsStr += " -DOUTPUT_TYPE_I4=half4";
 								                buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_half4";
 								                buildOptionsStr += " -DOUTPUT_TYPE=half";
 								                buildOptionsStr += " -DOUTPUT_TYPE4=half4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DOUTPUT_TYPE16=half16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_half4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_half16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DWI_DATA=write_imageh";
 								            }else{
 								                buildOptionsStr += " -DOUTPUT_TYPE_I=float";
 								                buildOptionsStr += " -DOUTPUT_TYPE_I4=float4";
 								                buildOptionsStr += " -DCONVERT_OUTPUT_I4=convert_float4";
 								                buildOptionsStr += " -DOUTPUT_TYPE=float";
 								                buildOptionsStr += " -DOUTPUT_TYPE4=float4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DOUTPUT_TYPE16=float16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT4=convert_float4";
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								                buildOptionsStr += " -DCONVERT_OUTPUT16=convert_float16";
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								                buildOptionsStr += " -DWI_DATA=write_imagef";
 								            }
 								        }
 								    }
-												[MNN:Sync] Sync Internal Github

											
										
										
											2020-07-04 01:21:30 +08:00
+								    if(isSetWorkGroupAttribute) {
 								        buildOptionsStr += " -DSET_ATTRIBUTE=true";
 								    } else {
 								        buildOptionsStr += " -DSET_ATTRIBUTE=false";
 								    }
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    for (auto &option : buildOptions) {
 								        buildOptionsStr += " " + option;
 								    }
 								    buildOptionsStr += mDefaultBuildParams;
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								    auto key = std::make_tuple(programName, buildOptionsStr);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    auto buildProgramInter = mBuildProgramMap.find(key);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    cl::Program program;
 								    if (buildProgramInter != mBuildProgramMap.end()) {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								        program = buildProgramInter->second.program;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    } else {
 								        this->loadProgram(programName, &program);
 								        auto status = this->buildProgram(buildOptionsStr, &program);
 								        if (!status) {
 								            FUNC_PRINT_ALL(programName.c_str(), s);
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								            return nullptr;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								        }
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								        ProgramWithKernel pwk;
 								        pwk.program = program;
 								        mBuildProgramMap.emplace(key, pwk);
 								        buildProgramInter = mBuildProgramMap.find(key);
 								    }
 								    auto kiter = buildProgramInter->second.kernels.find(kernelName);
 								    std::shared_ptr<cl::Kernel> kernel;
 								    bool firstCreate = false;
 								    if (kiter == buildProgramInter->second.kernels.end()) {
 								        KernelPool pool;
 								        buildProgramInter->second.kernels.insert(std::make_pair(kernelName, pool));
 								        kiter = buildProgramInter->second.kernels.find(kernelName);
 								        firstCreate = true;
 								    }
 								    if (kiter->second.recycle.empty()) {
 								        cl_int res;
 								        kernel.reset(new cl::Kernel(program, kernelName.c_str(), &res));
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								        if(res != CL_SUCCESS) {
 								            MNN_ERROR("getKernel: %s error, res:%d\n", kernelName.c_str(), res);
 								            return nullptr;
 								        }
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								        if (firstCreate) {
 								            kernel->getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WORK_GROUP_SIZE, &kiter->second.maxWorkGroupSize);
 								        }
 								    } else {
 								        kernel = kiter->second.recycle.front();
 								        kiter->second.recycle.pop();
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    }
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    std::shared_ptr<KernelWrap> kw(new KernelWrap(kernel, &kiter->second));
 								    return kw;
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								}
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								std::shared_ptr<KernelWrap> OpenCLRuntime::buildKernelFromSource(const std::string& source, const std::string &kernelName,
-												[MNN:Sync] Sync Internal 2.3.1

											
										
										
											2023-02-15 10:30:27 +08:00
+								                                                const std::set<std::string> &buildOptions) {
 								    std::string buildOptionsStr;
 								    if (mIsSupportedFP16) {
 								        buildOptionsStr = "-DFLOAT=half -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT4=convert_half4 -DMNN_SUPPORT_FP16";
 								    } else {
 								        buildOptionsStr = "-DFLOAT=float -DFLOAT4=float4 -DFLOAT8=float8 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT4=convert_float4";
 								    }
 								    if(isSetWorkGroupAttribute) {
 								        buildOptionsStr += " -DSET_ATTRIBUTE=true";
 								    } else {
 								        buildOptionsStr += " -DSET_ATTRIBUTE=false";
 								    }
 								    for (auto &option : buildOptions) {
 								        buildOptionsStr += " " + option;
 								    }
 								    buildOptionsStr += mDefaultBuildParams;
 								    cl::Program::Sources sources;
 								    sources.push_back(source);
 								    cl::Program program = cl::Program(context(), sources);
 								    auto status = this->buildProgram(buildOptionsStr, &program);
 								    if (!status) {
 								        FUNC_PRINT_ALL(kernelName.c_str(), s);
 								    }
 								    // mBuildProgramMap.emplace(key, program);
 								    cl_int res;
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    std::shared_ptr<cl::Kernel> kernel;
 								    kernel.reset(new cl::Kernel(program, kernelName.c_str(), &res));
-												[MNN:Sync] Sync Internal 2.3.1

											
										
										
											2023-02-15 10:30:27 +08:00
+								    MNN_CHECK_CL_SUCCESS(res, "getKernel");
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    std::shared_ptr<KernelWrap> kw(new KernelWrap(kernel, nullptr));
 								    return kw;
-												[MNN:Sync] Sync Internal 2.3.1

											
										
										
											2023-02-15 10:30:27 +08:00
+								}
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								uint64_t OpenCLRuntime::getMaxWorkGroupSize(std::shared_ptr<KernelWrap> kernel) {
 								    if (nullptr != kernel->mRecycle) {
 								        return kernel->mRecycle->maxWorkGroupSize;
 								    }
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    uint64_t maxWorkGroupSize = 0;
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    kernel->get().getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WORK_GROUP_SIZE, &maxWorkGroupSize);
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								    return maxWorkGroupSize;
 								}
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								uint64_t OpenCLRuntime::GetKernelWaveSize(std::shared_ptr<KernelWrap> kernel) {
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								    uint64_t kernelWaveSize = 0;
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    kernel->get().getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WAVE_SIZE_QCOM, &kernelWaveSize);
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								    return kernelWaveSize;
 								}
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								std::vector<uint32_t> OpenCLRuntime::getMaxWorkItemSizes() {
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    return mMaxWorkIterms;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								}
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								uint64_t OpenCLRuntime::getMaxLocalMem() const {
 								    return mMaxLocalMemSize;
 								}
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								double OpenCLRuntime::getCostTime(const cl::Event *event){
-												add opencl kernel profile & revise some info in onExecute to onResize stage

											
										
										
											2020-05-28 19:04:27 +08:00
+								    //cl_int res = mCommandQueuePtr->finish();
 								    cl_int res = event->wait();
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								    MNN_CHECK_CL_SUCCESS(res, "clEvent");
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								    mStartNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_START>();
 								    mStopNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_END>();
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    mKernelTime += (unsigned int)((mStopNanos - mStartNanos) / 1000.0);
-												add opencl kernel profile & revise some info in onExecute to onResize stage

											
										
										
											2020-05-28 19:04:27 +08:00
+								    return (mStopNanos - mStartNanos) / 1000.0;
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								}
 								double OpenCLRuntime::getQueuedTime(const cl::Event *event){
-												add opencl kernel profile & revise some info in onExecute to onResize stage

											
										
										
											2020-05-28 19:04:27 +08:00
+								    //cl_int res = mCommandQueuePtr->finish();
 								    cl_int res = event->wait();
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								    MNN_CHECK_CL_SUCCESS(res, "clEvent");
-												add opencl kernel profile & revise some info in onExecute to onResize stage

											
										
										
											2020-05-28 19:04:27 +08:00
+								    return (event->getProfilingInfo<CL_PROFILING_COMMAND_START>() - event->getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>()) / 1000.0;
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								}
 								double OpenCLRuntime::getSubmitTime(const cl::Event *event){
-												add opencl kernel profile & revise some info in onExecute to onResize stage

											
										
										
											2020-05-28 19:04:27 +08:00
+								    //cl_int res = mCommandQueuePtr->finish();
 								    cl_int res = event->wait();
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								    MNN_CHECK_CL_SUCCESS(res, "clEvent");
-												add opencl kernel profile & revise some info in onExecute to onResize stage

											
										
										
											2020-05-28 19:04:27 +08:00
+								    return (event->getProfilingInfo<CL_PROFILING_COMMAND_START>() - event->getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()) / 1000.0;
-												Update

											
										
										
											2019-12-27 22:16:57 +08:00
+								}
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
-												[MNN:Sync] Sync internal gitlab

											
										
										
											2022-01-04 10:50:40 +08:00
+								std::pair<const void*, size_t> OpenCLRuntime::makeCache(void* tuneInfo) {
 								    auto tune = reinterpret_cast<MNN::OpenCL::TuneInfo*>(tuneInfo);
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    std::unique_ptr<CacheT> cache(new CacheT);
-												[MNN:Sync] Sync internal gitlab

											
										
										
											2022-01-04 10:50:40 +08:00
+								    for (auto& p : tune->mInfos) {
 								        cache->tuned.emplace_back(std::move(p));
 								    }
 								    tune->mInfos.clear();
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    // Get All program's binary
 								    for (auto& iter : mBuildProgramMap) {
 								        std::unique_ptr<ShaderT> pro(new ShaderT);
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								        auto program = iter.second.program;
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								        auto bufferSize = iter.second.BufferSize;
 								        // Only use first one
 								        pro->program = std::get<0>(iter.first);
 								        pro->buildInfo = std::get<1>(iter.first);
 								        //MNN_PRINT("%s - %s - %s\n", pro->program.c_str(), pro->kernel.c_str(), pro->buildInfo.c_str());
 								        if(bufferSize != 0){
 								            pro->buffer.resize(bufferSize);
 								            ::memcpy(pro->buffer.data(), iter.second.Buffer.get(), bufferSize);
 								            cache->programs.emplace_back(std::move(pro));
 								            continue;
 								        }
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								        auto devicesNumber = program.getInfo<CL_PROGRAM_NUM_DEVICES>();
 								        auto devices = program.getInfo<CL_PROGRAM_DEVICES>();
 								        auto binSizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
 								        if (binSizes.empty() || devices.empty()) {
-												[MNN:Sync] Sync internal github

Commits:
        8148ae75c  弗人  bugfix
        14cb8ec7f  弗人  [Converter:Bugfix] bugfix for onnx depthwise convtranspose
        476fbcd90  雁行  [MNN:Feature] Open AVX cast and bugfix for contentCFG.
        5e26b9fd3  雁行  [Test:Feature] Add android test.
        37e147b25  雁行  [MNN:Bugfix] Bugfix for floordiv.
        144c185f5  tianbu.xsw  hangxing fix hiai
        b4fd429d6  tianbu.xsw  updateCacheFile bugfix -- update cache size
        d4ba572a8  雁行  [MNN:Bugfix] Support int8 in AVX2 and some Bugfix.
        43061f07e  xiaying  [MNN:Bugfix] Fix bug for module mode run part of model
        398cc5ab6  tianhang.yth  refactor demo
        736380600  xiaying  [Express:Bugfix] Fix memory leak for copy branch
        b8dab0a27  tianhang.yth  MNNFloat2Int8 sizeQuad=0 crash fix
        94b95bfed  ghz  [BugFix]1.Better method for fast pack valid check
        6a921f85e  xiaying  [Converter:Bugfix] Fix bug for Fuseconsttosubgraph
        5f77ae889  tianhang.yth  numThread bugfix
        a807ef879  tianhang.yth  add createSession(configs, runtimeinfo) API, add pymnn demo, pymnn logcat bugfix
        ad05409d3  xiaying  [MNN:Bugfix] Fix bug for StaticModule's sizecompute overflow, add error print for module mode
        9d81b8299  xiaying  [MNN:Bugfix] Fix bug for Unique op for output size = 1
        03b15e9af  xiaying  [Test:Feature] Add MatMulBConst Test, Fix bug for single Convert
        c944a76ee  tianhang.yth  add auto backend and getSessionInfo @tianbu
        91fa7267b  ghz  [BugFix]1.fix the error in eP check
        bf0041f77  ghz  [BugFix]1.Fix the logic error in eP check. 2.Fix the sp align error
        693871672  雁行  [CPU:Bugfix] rm adrp instruction for clang compiler bug.
        1b8f6b3d8  ghz  1.Fix the wronly use of r13 in arm32 version. 2.Fix the missing callee register save and restore process.
        feb7ecc4c  弗人  modify log of python offline quant
        040c04811  ghz  [BufFix]1.replace platform-related regs. 2.fix the same problem in arm32 version
        609f37db8  弗人  add log for python quant, python convert
        5511dd30a  ghz  [BugFix]1.Add testcases in SparseConv to check all functional code branch. 2. Fix the bug in "MNNPackC4ForMatMul_A.S" in arm64, which is caused by the missing check of eReal parameter.
        a93ff9280  tianhang.yth  add tf.Unique op support
        9729ff773  allen.lk  [Bugfix] Fix one arm32 instruction syntax that clang works but gcc DOES NOT work. use index instruction instead.
        297c1ad14  雁行  [Expr:Bugfix] bugfix for tensor content used by shape compute.
        ef8c369e3  弗人  catch exception
        07c2dd670  弗人  add dependence to setup, base64 encode url, add time log
        177e590c1  弗人  [Python:Feature] add aliyun log for python quant tool
        40a7928cf  allen.lk  [Debug:Sparse] 1.Add group parameter in torchscript converter. 2. Stop split running to avoid memory corruption when check failed in TransformGroupConvolution 3. fix Op split issue in TransformGroupConvolution
        3bdea84a1  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        c3c6fbdbd  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        bc590eee4  雁行  [Converter:Bugfix] bugfix for onnx instancenormalization convert.
        d8918593f  tianhang.yth  add auto backend and getSessionInfo @tianbu
        83a198ed7  杭行  update
        d0dd3e09b  杭行  update
        99540202e  xiaying  [Converter:Optimize] Opt the tensor convert insert
        333d8db82  allen.lk  [Debug:Sparse] Fix All platform-register r9 / x18 issue on arm32 and arm64.
        db5994672  杭行  merge
        6293de7b8  tianbu.xsw  fix pymnn updateCacheFile
        5c2e11cb1  tianbu.xsw  do updateCache in createSession
        6e7641ff4  tianbu.xsw  do not limit cacheFile for a model
        5287a65e4  tianbu.xsw  bugfix
        52ba53a91  tianbu.xsw  revert pymnn api
        60284d830  tianbu.xsw  bugfix
        6d8077490  tianbu.xsw  rename updateCacheFile api params
        3cb172710  tianhang.yth  updateCacheFile API size default value is 0
        c5b69aabf  tianbu.xsw  updateCacheFile python api fix
        5d5da7aa5  tianbu.xsw  reflector code
        5707877a4  雁行  [MNN:Speed] Speedup for softmax in x86 and arm.
        2a211825c  tianbu.xsw  reflector code for updateCacheFile
        76db3a835  tianbu.xsw  [Cache Feature]: Add updateCacheFile API for increment cache
        b06b0fd43  allen.lk  [Debug:Sparse] Fix and warning one kind of segmentfault cause by memory corruption when resize ConvolutionWinograd.  Avoid to use some registers as arm restriction.
        e68bfa495  雁行  [Converter:Feature] Add UUID when model convert.
        a9cb935dc  xiaying  [MNN:Speed] Support c4nhwc for more fastblit
        019f40353  xiaying  [Converter:Refractor] Reduce memory used by MNNConvert(bert from 5G ->         1G)
        d2a6d3d05  xiaying  [MNN:Bugfix] Fix bug for identity output not find
        604d0801b  xiaying  [Converter:Bugfix] Fix bug for FuseGeLu
        4bada2367  xiaying  [MNN:Refractor] SegmentMean rewrite as segment
        82070e708  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary
        e8ea4266e  xiaying  Fix bug for ShapeTensorConvert compute for dim = 1 error
        1f1cf1991  xiaying  [Tools:Bugfix] Fix system compability for fastTestOnnx
        6f422efe2  xiaying  [Tools:Bugfix] Remove color for checkDir for easy to dump
        968f7ec88  xiaying  [MNN:Speed] Support turn broadcast binary to loop
        3e7aaf46f  xiaying  [MNN:Refractor] Set Convolution1x1Strassen support variable input/output ptr
        1f65ab163  xiaying  [MNN:Bugfix] Fix bug for mini mnn can't convert model
        d65953d47  xiaying  [MNN:Bugfix] Fix bug for armv7a - android-14 + ARM82
        8b68be45c  xiaying  [MNN:Feature] Add segment
        8a8f264f5  xiaying  [Vulkan:Bugfix] Remove unuseful print
        025bb0fda  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        43900251e  tianbu.xsw  enable setCacheFile python API
        ebfb05c74  tianbu.xsw  [Metal Feature] support metallib obtain from walle transfer task
        9665c0a79  弗人  add check for path in json file
        c66fef224  xiaying  [Converter:Bugfix] Fix bug for oneof don't support
        42f192852  xiaying  [MNN:Bugfix] Fix bug for not set output / saveTensor into origin Schedule's outputs
        1b95354ff  雁行  [Feature]: Support shape compute for SetDiff1D, and null input for Prod.
        83966d043  xiaying  [Test:Feature] Add test for static module
        42d1be933  xiaying  [Converter:Bugfix] Fix bug for mnn convert and static model add more outputs for origin model
        9067531c3  xiaying  [Converter:Refractor] formatLicence
        99558bed9  xiaying  [Converter:Bugfix] Count the op for unuseful and controlflow
        4f6da0fa7  allen.lk  [Feature:GRUMultiOutput] fix multi output dimension type
        c6b219bce  xiaying  [Converter:Feature] Turn torch converter to object
        dd4e68a37  xiaying  [Converter:Feature] Support dump supported ops
        80b6a60a3  xiaying  [Converter:Info] If has output name, print output name instead of computed
        015278fc3  xiaying  [MNN:Refractor] Revert IfModule's debug info
        23ac967c4  xiaying  Don't transform for multi-input convolution/deconvolution
        b02b0d4de  xiaying  Fix bug for multi-input for conv1d
        254d8b1d4  xiaying  Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        d47d0b9ca  xiaying  Fix bug for CPURaster's fuse nc4hw4
        357c5bd33  xiaying  Fix ConvBiasAdd for conv's inputs op > 1
        55b1f0c9c  xiaying  [Converter:Bugfix] Don't transform for multi-input convolution/deconvolution
        1902a30f5  xiaying  [Converter:Bugfix] Fix bug for Conv1dSqueezeMove for multi input convolution 1d
        c23fe617b  xiaying  [MNN:Bugfix] Fix bug for multi-input for conv1d
        8ff018426  xiaying  [MNN:Bugfix] Fix bug for CPURaster's fuse nc4hw4
        d4e8cd602  xiaying  [Converter:Bugfix] Fix ConvBiasAdd for conv's inputs op > 1
        846266b42  tianbu.xsw  return when program and tune both nullptr
        fd67c76a9  xiaying  [Converter:Bugfix] DepthwiseConvWeightMerge only valid for tflite
        e77a242c4  xiaying  [Converter:Feature] Support tflite's half pixel
        be054c377  tianbu.xsw  [OpenCL Bugfix] do not rewrite cache when binary program is produced
        51e65aa35  xiaying  [Converter:Feature] Support tflite for fp16 and multi-input convolution
        1ccdfdeb5  tianbu.xsw  redefine svm macro name
        31234d372  tianbu.xsw  [OpenCL SVM] add macro for only use wrapper
        d739e35da  xiaying  [MNN:Bugfix] Fix compile bug for grid op
        24ab13c79  Joker  feat(arm82): add GridSample op support in arm82 backend, AVX(by xiaying)
        7b142978e  xiaying  [AVX512:Speed] Optimize for e <= 8
        5f6febe7b  tianbu.xsw  code refactor
        998d91b57  xiaying  [Express:Speed] Merge submodule for speed
        22c89146f  tianhang.yth  fix alpha div by zero bug and arm server compile bug
        8f829a170  tianbu.xsw  [OpenCL Pad] unify conv/deconv pad computing
        4a28f603e  xiaying  [Express:Speed] Shared Const for All Submodule
        c74cf28f3  xiaying  [MNN:Refractor] Seperate Const init and schedule
        2a1eebb7a  xiaying  [Tools:Bugfix] Fix bug for modelTest.py count size
        72f04008c  xiaying  [MNN:Refractor] Delete unuseful const op
        1e735d03c  xiaying  [Converter:Bugfix] Fix bug for static module gen
        4dfadbc6e  xiaying  [MNN:Refractor] Rewrite const init mode
        1fcf0417a  xiaying  [MNN:Bugfix] Fix bug for deconvolutin multi-input for multi-batch
        41d429cfd  xiaying  [Train:Bugfix] Revert convert NCHW for mnistTrain
        f947a5f01  xiaying  [Test:Feature] Add testTrain
        dad59b6f6  tianbu.xsw  move realize code from Backend.hpp to Tensor.cpp
        cf4473ad1  xiaying  [Train:Bugfix] Support pad for GeometryPoolGrad
        91ab13734  xiaying  [MNN:Bugfix] Fix compile bug for avx512
        742e80f47  xiaying  [MNN:Refractor] Opt the logic for checknan judge
        12543b841  xiaying  [ARM82:Bugfix] Fix compile bug for ios
        3a2b0a49f  xiaying  [ARM82:Speed] Opt Pack / Unpack for armv8
        c0f1995cd  xiaying  [ARM82:Speed] Opt MNNPackC8FP16 and MNNUnpackC8FP16 by asm
        e0fc77dcf  xiaying  [MNN:Speed] Fix bug for DeconvolutionWithStride for C4HW4, open it
        584bec578  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        d5bd4148d  xiaying  [MNN:Bugfix] Fix bug for format set error for onnx
        b00265841  xiaying  [MNN:Bugfix] Fix bug for SparseConvolutionTiledExecutor
        bb09188ac  xiaying  [Test:Bugfix] Fix bug for run into sparse auto
        426d1babd  xiaying  [MNN:Refractor] Small bugfix for Group convolution and pack
        7d0ea1c46  tianbu.xsw  [testModel Feature] support testModel.out input resize
        4169c54ce  xiaying  [MNN:Bugfix] Fix bug for checkNAN for origin
        412a82222  xiaying  [Test:Bugfix] Fix bug for CheckNAN's error of matmul
        319b1d425  xiaying  [MNN:Bugfix] Fix bug for multi-batch for ConvInt8
        050b728a6  xiaying  [Test:Bugfix] Use NCHW for ConvInt8Test
        7db3423a1  xiaying  [OpenCL:Bugfix] Fix bug for opencl::image,opencl::buffer for C4HW4
        adcec6a7f  xiaying  [Vulkan:Bugfix] Fix bug for invalid tensor size limit
        d2a7cf4e9  xiaying  [Vulkan:Bugfix] Fix bug for onCopyBuffer of nc4hw4
        557bebdd3  xiaying  [MNN:Bugfix] Fix bug for BF16-ARM32
        bbe186649  tianbu.xsw  [Update AUTO mode]: fix MNN_FORWARD_AUTO choose priority
        6deb23439  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        b137590e4  xiaying  [MNN:Bugfix] Fix bug for GeometryBinary don't care about NC4HW4 same size
        7003558ea  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        b5f8cae5a  xiaying  [Converter:Bugfix] Fix bug for onnx pad for serveral case
        29b09e125  xiaying  [MNN:Bugfix] Fix bug for arm64-bf16
        42ce00770  xiaying  [MNN:Bugfix] Fix bug for ARM64 - float
        a2d89fc18  雁行  [Converter:Feature] Support Binary Unary for Torch.
        7f1c0deb1  xiaying  [MNN:Bugfix] Fix bug for Raster for Int8
        8335a6f18  tianbu.xsw  [OpenCL Shared Memory] modify data_format method
        b359e031b  xiaying  [ARM82:Bugfix] Fix bug for arm82 and speed up pack / unpack c8
        24bf3fc88  雁行  [Convert:Feature] Support LayerNormFuse without gamma beta.
        3e629624b  xiaying  [MNN:Bugfix] Fix bug for float - armv7a
        2b7908ec7  tianbu.xsw  modify workItemSize
        3cee0d413  xiaying  [MNN:Bugfix] test wrong clear
        9cbbfb998  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        2d7a44484  xiaying  [MNN:Bugfix] fix compile bug for c++ < 14
        eb7d0cb53  xiaying  [Test:Bugfix] Don't test for NC4HW4 directly
        7b40ca8d1  xiaying  [MNN:Bugfix] Fix bug for ConvolutionGroup
        2694d8a91  xiaying  [MNN:Bugfix] Fix bug for CPUGridSample
        f89af60f6  xiaying  [MNN:Bugfix] Fix compile bug for arm
        a151abcdd  xiaying  [MNN:Bugfix] Fix bug for convert for int8 / int16
        b254dbe61  雁行  [MNN:Bugfix] Bugfix for Conv onClone.
        d08150631  xiaying  [MNN:Bugfix] Fix bug for fast rcnn
        e5568a0df  xiaying  [MNN:Bugfix] Fix bug for CPURaster treat NC4HW4 fast blit
        128318933  雁行  [Raster:Bugfix] bugfix for Raster merge onResize.
        03caacbea  xiaying  [MNN:Bugfix] fix bug for CPUDeconvolution and Convolution1x1Strassen for iw != ow
        e1e3c245c  xiaying  [MNN:Bugfix] Fix bug for ConvolutionWinograd
        2524cbc6d  xiaying  [MNN:Bugfix] Fix bug for CPUSoftmax
        44ec79b8f  xiaying  [MNN:Bugfix] Fix bug for CPUConvolutionDepthwise / Scale / DeconvolutionDW
        21ae956ce  xiaying  [MNN:Bugfix] Fix bug for Multi-Batch-TiledExecutor
        09a5069c7  xiaying  [MNN:Speed] Add offset for src and dst
        6776c6784  xiaying  [MNN:Bugfix] Fix bug for trainable model
        cc83ae30b  xiaying  [MNN:Bugfix] Fix bug for trainable model

											
										
										
											2021-07-29 11:46:59 +08:00
+								            MNN_ERROR("Can't load binary, binarySize:%lu, deviceSize:%lu\n", binSizes.size(), devices.size());
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								            continue;
 								        }
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								        pro->buffer.resize(binSizes[0]);
 								        auto proRaw = program.get();
 								        auto c = pro->buffer.data();
 								        clGetProgramInfo(proRaw, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &c, nullptr);
 								        cache->programs.emplace_back(std::move(pro));
 								    }
 								    // Get All Autotuning cache
 								    for (auto& iter : mTunedLws) {
 								        std::unique_ptr<AutotuningT> tuning(new AutotuningT);
 								        tuning->gloablSize = iter.first.second;
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								        tuning->localSize = iter.second.first;
 								        tuning->timeCost = iter.second.second;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								        tuning->key = iter.first.first;
 								        cache->tunings.emplace_back(std::move(tuning));
 								    }
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								    // Get All GemmInfo cache
 								    for (auto& iter : mTunedGemmParams) {
 								        std::unique_ptr<GemmInfoT> tuning(new GemmInfoT);
 								        tuning->gemmSize = iter.first;
 								        tuning->paramInfo = iter.second;
 								        cache->gemm.emplace_back(std::move(tuning));
 								    }
-												MNN:Sync: Sync Internal 2.9.4

											
										
										
											2024-08-24 15:46:21 +08:00
+								    // Get All PreParam cache
 								    for(auto& iter : mPreParams){
 								        std::unique_ptr<PreParamInfoT> info(new PreParamInfoT);
 								        info->preParamName = iter.first;
 								        info->preParamData = iter.second;
 								        cache->preParam.emplace_back(std::move(info));
 								    }
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    flatbuffers::FlatBufferBuilder builder;
 								    auto lastOffset = Cache::Pack(builder, cache.get());
 								    builder.Finish(lastOffset);
 								    mBuffer.resize(builder.GetSize());
 								    ::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
 								    return std::make_pair(mBuffer.data(), mBuffer.size());
 								}
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								bool OpenCLRuntime::setCache(std::pair<const void*, size_t> cache) {
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    if (nullptr == cache.first) {
 								        mBuffer.clear();
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								        return true;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    }
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    auto cacheBuffer = GetCache(cache.first);
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								    if(nullptr == cacheBuffer->programs() && nullptr == cacheBuffer->tunings() && nullptr == cacheBuffer->gemm()) {
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								        return false;
 								    }
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    // Load Program
 								    if (nullptr != cacheBuffer->programs()) {
 								        auto programs = cacheBuffer->programs();
 								        for (int i=0; i<programs->size(); ++i) {
 								            auto shaderInfo = programs->GetAs<Shader>(i);
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								            if (nullptr == shaderInfo->program()|| nullptr == shaderInfo->buildInfo() || nullptr == shaderInfo->buffer()) {
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								                MNN_ERROR("Invalid Cache\n");
 								                return false;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								            }
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								            auto program = shaderInfo->program()->str();
 								            // Builder Info
 								            std::string buildinfo = shaderInfo->buildInfo()->str();
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								            auto buffer = shaderInfo->buffer()->data();
 								            size_t bufferSize = shaderInfo->buffer()->size();
 								            auto deviceId = mFirstGPUDevicePtr->get();
 								            auto programRaw = clCreateProgramWithBinary(context().get(), 1, &deviceId, &bufferSize, (const unsigned char**)(&buffer), nullptr, nullptr);
 								            if (!programRaw) {
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								                MNN_ERROR("Can't load %s - %s load program\n", program.c_str(), buildinfo.c_str());
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								                return false;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								            }
 								            auto pro = cl::Program(programRaw);
 								            auto res = buildProgram(buildinfo, &pro);
 								            if (!res) {
-												[MNN:Sync] Sync Internal 2.5.0

											
										
										
											2023-04-27 15:11:05 +08:00
+								                MNN_ERROR("Can't build %s - %s load program\n", program.c_str(), buildinfo.c_str());
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								                return false;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								            }
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								            ProgramWithKernel pwk;
 								            pwk.program = pro;
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								            pwk.Buffer.reset(new char[bufferSize]);
 								            pwk.BufferSize = bufferSize;
 								            ::memcpy(pwk.Buffer.get(), buffer, bufferSize);
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								            mBuildProgramMap.insert(std::make_pair(std::make_tuple(program, buildinfo), pwk));
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								        }
 								    }
-												[BugFix] Fix typo in code comment

Fix typo in code comment
											
										
										
											2021-03-28 17:32:33 +08:00
+								    // Load Auto Tuning Info
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								    if (nullptr != cacheBuffer->tunings()) {
 								        auto tuningInfo = cacheBuffer->tunings();
 								        for (int i=0; i<tuningInfo->size(); ++i) {
 								            auto tun = tuningInfo->GetAs<Autotuning>(i);
 								            if (nullptr == tun->gloablSize() || nullptr == tun->localSize() || nullptr == tun->key()) {
 								                MNN_ERROR("Error tunning info\n");
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								                return false;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								            }
 								            std::vector<uint32_t> glo(tun->gloablSize()->size());
 								            for (int v=0; v<glo.size(); ++v) {
 								                glo[v] = tun->gloablSize()->data()[v];
 								            }
 								            std::vector<uint32_t> loc(tun->localSize()->size());
 								            for (int v=0; v<loc.size(); ++v) {
 								                loc[v] = tun->localSize()->data()[v];
 								            }
-												[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more
auto-turning mode

											
										
										
											2021-03-12 18:41:50 +08:00
+								            uint32_t cost = tun->timeCost();
 								            mTunedLws.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_pair(loc, cost)));
-												[MNN:Sync] Sync Internal 2.8.0

											
										
										
											2023-12-04 11:12:20 +08:00
+								            mTuneLws[tun->key()->str()].push_back(std::make_pair(glo, std::make_pair(loc, cost)));
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								        }
 								    }
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
 								    // Load Gemm Info
 								    if (nullptr != cacheBuffer->gemm()) {
 								        auto tuningInfo = cacheBuffer->gemm();
 								        for (int i=0; i<tuningInfo->size(); ++i) {
 								            auto tun = tuningInfo->GetAs<GemmInfo>(i);
 								            if (nullptr == tun->gemmSize() || nullptr == tun->paramInfo()) {
 								                MNN_ERROR("Error tunning gemm info\n");
 								                return false;
 								            }
-												MNN:Sync: Sync Internal 2.9.3

											
										
										
											2024-07-22 19:51:53 +08:00
+								            MNN_ASSERT(tun->gemmSize()->size() == 7);
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								            std::vector<uint32_t> info(tun->gemmSize()->size());
 								            for (int v=0; v<info.size(); ++v) {
 								                info[v] = tun->gemmSize()->data()[v];
 								            }
-												MNN:Sync: Sync Internal 2.9.3

											
										
										
											2024-07-22 19:51:53 +08:00
+								            MNN_ASSERT(tun->paramInfo()->size() == 14);
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								            std::vector<uint32_t> params(tun->paramInfo()->size());
 								            for (int v=0; v<params.size(); ++v) {
 								                params[v] = tun->paramInfo()->data()[v];
 								            }
 								            mTunedGemmParams.insert(std::make_pair(info, params));
-												MNN:Sync: Sync Internal 2.9.5

											
										
										
											2024-09-12 12:57:57 +08:00
+								            mTuneLws["Xgemm_tune"].push_back(std::make_pair(info, std::make_pair(params, 0)));
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								        }
 								    }
-												MNN:Sync: Sync Internal 2.9.4

											
										
										
											2024-08-24 15:46:21 +08:00
 								    //Load PreParam Info
 								    if(nullptr != cacheBuffer->preParam()){
 								        auto preParamInfo = cacheBuffer->preParam();
 								        for(int i = 0; i < preParamInfo->size(); ++i){
 								            auto info = preParamInfo->GetAs<PreParamInfo>(i);
 								            if (nullptr == info->preParamName()) {
 								                MNN_ERROR("Error preParam info\n");
 								                return false;
 								            }
 								            mPreParams.insert(std::make_pair(info->preParamName()->str(), info->preParamData()));
 								        }
 								    }
-												Synchronize internal github for version 1.2.0 (#1518)


											
										
										
											2021-06-11 17:17:13 +08:00
+								    return true;
-												Github release 1.1.0

											
										
										
											2020-11-05 16:41:56 +08:00
+								}
-												修正打印profile性能在mali平台不稳定的问题

											
										
										
											2023-08-31 15:31:45 +08:00
+								void OpenCLRuntime::printEventTime(){
 								#ifdef ENABLE_OPENCL_TIME_PROFILER
 								    if(mEvents.empty()){
 								        return;
 								    }
-												[MNN:Sync] Sync Internal 2.8.4

											
										
										
											2024-04-19 11:58:21 +08:00
+								    int raster_num = 0, raster_time = 0;
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								    unsigned int conv_time = 0, loop_bg_time = 0, loop_bg_gemm_time = 0, loop_softmax_time = 0, ori_softmax_time = 0;
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								    unsigned int conv_gemm2_buf_time = 0, conv_gemm1_buf_time = 0;
 								    unsigned int conv_1x1_buf_time = 0, conv_ori_buf_time = 0, wino_gemm_time = 0;
 								    std::vector<std::pair<std::string, int>> kernels(mEvents.size());
-												修正打印profile性能在mali平台不稳定的问题

											
										
										
											2023-08-31 15:31:45 +08:00
+								    for(int i = 0; i < mEvents.size(); ++i){
 								        auto event = &mEvents[i].second;
 								        cl_int res = event->wait();
 								        MNN_CHECK_CL_SUCCESS(res, "clEvent");
 								        auto StartNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_START>();
 								        auto StopNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 								        auto kernel_time = (unsigned int)((StopNanos - StartNanos) / 1000.0);
 								        mKernelTime += kernel_time;
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								        if (mEvents[i].first.length() >= 15 && mEvents[i].first.substr(0, 15) == "ConvBuf2D-gemm2") {
 								            conv_gemm2_buf_time += kernel_time;
 								            conv_time += kernel_time;
 								        } else if (mEvents[i].first.length() >= 15 && mEvents[i].first.substr(0, 15) == "ConvBuf2D-gemm1") {
 								            conv_gemm1_buf_time += kernel_time;
 								            conv_time += kernel_time;
 								        } else if (mEvents[i].first.length() >= 17 && mEvents[i].first.substr(0, 17) == "ConvBuf2D-conv1x1") {
 								            conv_1x1_buf_time += kernel_time;
 								            conv_time += kernel_time;
 								        } else if (mEvents[i].first.length() >= 13 && mEvents[i].first.substr(0, 13) == "ConvBuf2D-ori") {
 								            conv_ori_buf_time += kernel_time;
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
+								            conv_time += kernel_time;
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								        } else if (mEvents[i].first.length() >= 11 && mEvents[i].first.substr(0, 11) == "Convolution") {
 								            conv_time += kernel_time;
-												MNN:Sync: Sync Internal 2.9.5

											
										
										
											2024-09-12 12:57:57 +08:00
+								        } else if (mEvents[i].first.length() >= 8 && mEvents[i].first.substr(0, 8) == "Strassen") {
 								            conv_time += kernel_time;
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								        }
 								        if((mEvents[i].first.length() >= 10 && mEvents[i].first.substr(0, 10) == "While-gemm")) {
 								            loop_bg_time += kernel_time;
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
+								        }
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								        if((mEvents[i].first.length() >= 20 && mEvents[i].first.substr(0, 20) == "While-gemm-batchgemm")) {
 								            loop_bg_gemm_time += kernel_time;
-												MNN:Sync Sync Internal 2.9.0

											
										
										
											2024-05-11 19:17:02 +08:00
+								        }
-												MNN:Sync: Sync Internal 2.9.2

											
										
										
											2024-07-04 11:53:45 +08:00
+								        if((mEvents[i].first.length() >= 18 && mEvents[i].first.substr(0, 18) == "While-gemm-softmax")) {
 								            loop_softmax_time += kernel_time;
 								        }
 								        if((mEvents[i].first.length() >= 7 && mEvents[i].first.substr(0, 7) == "Softmax")) {
 								            ori_softmax_time += kernel_time;
 								        }
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
+								        if((mEvents[i].first.length() >= 23 && mEvents[i].first.substr(0, 23) == "Conv-winograd-batchgemm")) {
 								            wino_gemm_time += kernel_time;
 								            conv_time += kernel_time;
 								        }
-												MNN:Sync: Sync Internal 2.9.5

											
										
										
											2024-09-12 12:57:57 +08:00
+								        if((mEvents[i].first.length() >= 6 && mEvents[i].first.substr(0, 6) == "Raster")) {
 								            raster_num++;
 								            raster_time += kernel_time;
 								        }
-												[MNN:Sync] Sync Internal 2.9.1

											
										
										
											2024-06-03 20:09:34 +08:00
 								        kernels[i] = std::make_pair(mEvents[i].first, kernel_time);
 								    }
 								#ifdef SORT_PROFILE_TIME
 								    for(int i = 0; i < mEvents.size(); i++) {
 								        for(int j = i+1; j < mEvents.size(); j++) {
 								            if(kernels[i].second > kernels[j].second) {
 								                auto tmp = kernels[i];
 								                kernels[i].first = kernels[j].first;
 								                kernels[i].second = kernels[j].second;
 								                kernels[j].first = tmp.first;
 								                kernels[j].second = tmp.second;
 								            }
 								        }
 								    }
 								#endif
 								    for(int i = 0; i < mEvents.size(); i++) {
 								        MNN_PRINT("kernel time = %d    us %s\n", kernels[i].second, kernels[i].first.c_str());
-												修正打印profile性能在mali平台不稳定的问题

											
										
										
											2023-08-31 15:31:45 +08:00
+								    }
 								    mEvents.clear();
-												MNN:Sync: Sync Internal 2.9.5

											
										
										
											2024-09-12 12:57:57 +08:00
+								    MNN_PRINT("total kernel time = %d  us, conv time = %d us (gemm2:%d us, gemm1:%d us, 1x1:%d us, ori:%d us, wino: %d us, other: %d us), while gemm time = %d us (core gemm time: %d us, softmax:%d us), ori softmax: %d us, raster[%d] time: %d us\n", mKernelTime, conv_time, conv_gemm2_buf_time, conv_gemm1_buf_time, conv_1x1_buf_time, conv_ori_buf_time, wino_gemm_time, conv_time-conv_gemm2_buf_time-conv_gemm1_buf_time-conv_1x1_buf_time-conv_ori_buf_time-wino_gemm_time, loop_bg_time, loop_bg_gemm_time, loop_softmax_time, ori_softmax_time, raster_num, raster_time);
-												修正打印profile性能在mali平台不稳定的问题

											
										
										
											2023-08-31 15:31:45 +08:00
+								#endif
 								}
-												beta 0.1.0

											
										
										
											2019-04-17 10:49:11 +08:00
+								} // namespace MNN