mirror of https://github.com/alibaba/MNN.git
commit 32f72f4fb9

@@ -59,8 +59,7 @@ bool convertNCHWBufferToImage(const Tensor *input, Tensor *output, cl::Kernel &b
     }

 #ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = (int)runtime->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us inputFormatTransform\n",costTime);
+    runtime->pushEvent({"inputFormatTransform", event});
 #endif
     return true;
 }
@@ -111,8 +110,7 @@ bool convertNHWCBufferToImage(const Tensor *input, Tensor *output, cl::Kernel &b
     }

 #ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = (int)runtime->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us inputFormatTransform\n",costTime);
+    runtime->pushEvent({"inputFormatTransform", event});
 #endif
     return true;
 }
@@ -167,8 +165,7 @@ bool convertImageToNCHWBuffer(const Tensor *input, Tensor *output, cl::Kernel &i
     }

 #ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = (int)runtime->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us outputFormatTransform\n",costTime);
+    runtime->pushEvent({"outputFormatTransform", event});
 #endif
     return true;
 }
@@ -220,8 +217,7 @@ bool convertNC4HW4BufferToImage(const Tensor *input, Tensor *output, cl::Kernel
     }

 #ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = (int)runtime->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us inputFormatTransform\n",costTime);
+    runtime->pushEvent({"inputFormatTransform", event});
 #endif
     return true;
 }
@@ -285,8 +281,7 @@ bool convertImageToNC4HW4Buffer(const Tensor *input, Tensor *output, cl::Kernel
     }

 #ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = (int)runtime->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us outputFormatTransform\n",costTime);
+    runtime->pushEvent({"outputFormatTransform", event});
 #endif
     return true;
 }
@@ -341,8 +336,7 @@ bool convertImageToNHWCBuffer(const Tensor *input, Tensor *output, cl::Kernel &i
     }

 #ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = (int)runtime->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us outputFormatTransform\n",costTime);
+    runtime->pushEvent({"outputFormatTransform", event});
 #endif

     return true;
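
Every hunk in this file makes the same substitution: under ENABLE_OPENCL_TIME_PROFILER the call site no longer times the kernel on the spot but hands the labelled event to the runtime. A minimal before/after sketch, taken from the hunks above; that getCostTime blocks on the event is my reading, implied by the need to defer the measurement:

    // Before: per-kernel timing at the call site (getCostTime presumably
    // waits for the event before reading its timestamps)
    int costTime = (int)runtime->getCostTime(&event);
    MNN_PRINT("kernel cost:%d us inputFormatTransform\n", costTime);

    // After: record only; OpenCLRuntime::printEventTime() waits once and
    // reports every labelled kernel at the end of the inference
    runtime->pushEvent({"inputFormatTransform", event});
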
@@ -514,14 +514,15 @@ void OpenCLBackend::onResizeEnd() {

 void OpenCLBackend::onExecuteBegin() const {
     mOpenCLRuntime->mQueueCount = 0;
-    mOpenCLRuntime->mKernelTime = 0;
     mOpenCLRuntime->clearRecord();
+    mOpenCLRuntime->clearEvent();
 }

 void OpenCLBackend::onExecuteEnd() const {
     mOpenCLRuntime->mQueueCount = 0;
     mOpenCLRuntime->clearRecord();
     mOpenCLRuntime->enqeueRecord();
+    mOpenCLRuntime->printEventTime();
 }

@@ -698,7 +699,7 @@ void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTen
     mOpenCLRuntime->clearRecord();
     //Convert format
     mCLRuntime->convertFromDevice(srcTensor, (const Tensor*)&interTensor, data_format, false);
-
+    mOpenCLRuntime->printEventTime();

 #ifdef ENABLE_OPENCL_TIME_PROFILER
     mOpenCLRuntime->commandQueue().finish();
@@ -743,10 +744,6 @@ void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTen
         hostPtr = nullptr;
     }
 }
-
-#ifdef ENABLE_OPENCL_TIME_PROFILER
-    MNN_PRINT("total kernel time:%d us\n", (int)mOpenCLRuntime->mKernelTime);
-#endif
 }
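
Seen together with the convertor changes, the two hooks above bracket each inference with a collect-then-report window. A hedged sketch of the resulting flow; the method names come from this commit, the comments are my reading of them:

    backend->onExecuteBegin();   // clearEvent(): drop stale events, zero mKernelTime
    // ... run the network; with ENABLE_OPENCL_TIME_PROFILER defined, every
    //     kernel launch now calls runtime->pushEvent({"label", event})
    backend->onExecuteEnd();     // printEventTime(): wait per event, print each cost and the total
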
@@ -364,6 +364,7 @@ OpenCLRuntime::~OpenCLRuntime() {
 #ifdef LOG_VERBOSE
     MNN_PRINT("start ~OpenCLRuntime !\n");
 #endif
+    clearEvent();
     releaseRecord();
     mBuildProgramMap.clear();
     mRecordings.clear();
@@ -779,4 +780,24 @@ void OpenCLRuntime::releaseRecord(){
     }
 #endif
 }
+
+void OpenCLRuntime::printEventTime(){
+#ifdef ENABLE_OPENCL_TIME_PROFILER
+    if(mEvents.empty()){
+        return;
+    }
+    for(int i = 0; i < mEvents.size(); ++i){
+        auto event = &mEvents[i].second;
+        cl_int res = event->wait();
+        MNN_CHECK_CL_SUCCESS(res, "clEvent");
+        auto StartNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_START>();
+        auto StopNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_END>();
+        auto kernel_time = (unsigned int)((StopNanos - StartNanos) / 1000.0);
+        mKernelTime += kernel_time;
+        MNN_PRINT("kernel time = %d us %s\n", kernel_time, mEvents[i].first.c_str());
+    }
+    mEvents.clear();
+    MNN_PRINT("total kernel time = %d us\n", mKernelTime);
+#endif
+}
 } // namespace MNN
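
The timestamp arithmetic in printEventTime is the standard OpenCL event-profiling recipe: CL_PROFILING_COMMAND_START/END are device-side nanosecond counters, populated only when the command queue was created with CL_QUEUE_PROFILING_ENABLE (presumably what setCommandQueueProfileEnable arranges). A standalone sketch, independent of MNN:

    #include <CL/opencl.hpp>   // Khronos C++ bindings; older setups ship <CL/cl2.hpp>

    // Device-side execution time of a finished command, in microseconds.
    static double elapsedUs(const cl::Event& event) {
        event.wait();  // ensure CL_PROFILING_COMMAND_END is available
        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        return (end - start) / 1000.0;  // ns -> us
    }
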
@@ -113,6 +113,14 @@ public:
     std::string getDeviceName() {
         return mDeviceName;
     }
+    void pushEvent(std::pair<std::string, cl::Event> data) {
+        return mEvents.push_back(data);
+    }
+    void printEventTime();
+    void clearEvent(){
+        mKernelTime = 0;
+        mEvents.clear();
+    }
     uint64_t maxAllocSize() const;
     void setCommandQueueProfileEnable();
     void setCommandQueueProfileDisable();
@@ -181,6 +189,7 @@ private:
     GpuType mGpuType;
     MaliAr mMaliAr;
     float mCLVersion = 1.0f;
+    std::vector<std::pair<std::string, cl::Event>> mEvents;

 #ifdef MNN_OPENCL_SVM_ENABLE
     cl_device_svm_capabilities mSvmCapabilities;
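
Storing cl::Event by value in mEvents is cheap and safe: in the C++ bindings an event is a reference-counted wrapper around a cl_event handle, so the copy kept in the vector keeps the profiling record alive until printEventTime reads it. The call-site pattern this header enables, as it appears throughout the rest of the commit:

    cl::Event event;
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime(), &event);
    // label + handle; retained by the runtime until onExecuteEnd()
    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvBuf2D", event});
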
@@ -107,8 +107,7 @@ ErrorCode ArgMaxBufExecution::onExecute(const std::vector<Tensor*>& inputs, cons
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us ArgMax\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ArgMax", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -23,11 +23,6 @@ ErrorCode CastBufExecution::onResize(const std::vector<Tensor*>& inputs, const s
     Tensor* output = outputs[0];
     auto openCLBackend = static_cast<OpenCLBackend*>(backend());
     auto runtime = openCLBackend->getOpenCLRuntime();
-#ifdef MNN_SUPPORT_INTEL_SUBGROUP
-    if (runtime->isSupportedIntelSubgroup()) {
-        return SubgrouponResize(inputs, outputs);
-    }
-#endif /* MNN_SUPPORT_INTEL_SUBGROUP */
     mKernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions);
     mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));

@@ -75,8 +70,7 @@ ErrorCode CastBufExecution::onExecute(const std::vector<Tensor*>& inputs, const
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Cast\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Cast", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -674,8 +674,7 @@ ErrorCode ConvBufExecution::onExecute(const std::vector<Tensor *> &inputs, const
 #ifdef ENABLE_OPENCL_TIME_PROFILER
     cl::Event event;
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us ConvBuf2D\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvBuf2D", event});
 #else
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -570,9 +570,6 @@ ErrorCode ConvBufWinograd::onExecute(const std::vector<Tensor*>& inputs, const s
     auto input = inputs[0];
     auto output = outputs[0];

-#ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = 0;
-#endif
     for (int b = 0; b < input->batch(); ++b) {
         int index = b;
         /*Source Transform*/
@@ -581,10 +578,7 @@ ErrorCode ConvBufWinograd::onExecute(const std::vector<Tensor*>& inputs, const s
         cl::Event event;
         runKernel2D(mSourceTransform[index], mGWS_S[index], mLWS_S[index],
                     mOpenCLBackend->getOpenCLRuntime(), &event);
-
-        int costTime0 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        costTime += costTime0;
-        MNN_PRINT("kernel cost:%d us ConvWino0\n",costTime0);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvWino0", event});
 #else
         runKernel2D(mSourceTransform[index], mGWS_S[index], mLWS_S[index],
                     mOpenCLBackend->getOpenCLRuntime());
@@ -600,10 +594,7 @@ ErrorCode ConvBufWinograd::onExecute(const std::vector<Tensor*>& inputs, const s
         } else {
             runKernel2D(mMatMul[index], mGWS_M[index], mLWS_M[index], mOpenCLBackend->getOpenCLRuntime(), &event);
         }
-
-        int costTime1 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        costTime += costTime1;
-        MNN_PRINT("kernel cost:%d us ConvWino1\n",costTime1);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvWino1", event});
 #else
         if (mUseSubgroup) {
             run3DKernelDefault(mMatMul[index], mGWS_M[index], mLWS_M[index], mOpenCLBackend->getOpenCLRuntime());
@@ -619,19 +610,13 @@ ErrorCode ConvBufWinograd::onExecute(const std::vector<Tensor*>& inputs, const s
         cl::Event event;
         runKernel2D(mDestTransform[index], mGWS_D[index], mLWS_D[index],
                     mOpenCLBackend->getOpenCLRuntime(), &event);
-
-        int costTime2 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        costTime += costTime2;
-        MNN_PRINT("kernel cost:%d us ConvWino2\n",costTime2);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvWino2", event});
 #else
         runKernel2D(mDestTransform[index], mGWS_D[index], mLWS_D[index],
                     mOpenCLBackend->getOpenCLRuntime());
 #endif
         }
     }
-#ifdef ENABLE_OPENCL_TIME_PROFILER
-    MNN_PRINT("kernel cost:%d us ConvWino total\n",costTime);
-#endif

     return NO_ERROR;
 }
@@ -397,8 +397,7 @@ ErrorCode ConvSubgroupBuf::onExecute(const std::vector<Tensor *> &inputs, const

         cl::Event event;
         run3DKernelDefault(mTranseKernel, mTranseGlobalWorkSize, mTranseLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
-        int costTime0 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        MNN_PRINT("kernel cost:%d us ConvSubgroup transe\n", costTime0);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvSubgroup", event});
 #else
         run3DKernelDefault(mTranseKernel, mTranseGlobalWorkSize, mTranseLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -407,8 +406,7 @@ ErrorCode ConvSubgroupBuf::onExecute(const std::vector<Tensor *> &inputs, const
 #ifdef ENABLE_OPENCL_TIME_PROFILER
     cl::Event event;
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us ConvSubgroupBuf2D\n", costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvSubgroupBuf2D", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -177,8 +177,8 @@ ErrorCode DeconvBufExecution::onExecute(const std::vector<Tensor *> &inputs, con
                        mOpenCLBackend->getOpenCLRuntime(),
                        &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us DeconvBuf\n",costTime);
+
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DeconvBuf", event});
 #else
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -307,9 +307,7 @@ ErrorCode DepthwiseConvBufExecution::onExecute(const std::vector<Tensor *> &inpu
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime(),
                 &event);
-
-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us DepthwiseConvBuf\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DepthwiseConvBuf", event});
 #else
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime());
@@ -260,8 +260,8 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onExecute(const std::vector<Tensor

         run3DKernelDefault(mTranseKernel, mTranseGlobalWorkSize, mTranseLocalWorkSize,
                            mOpenCLBackend->getOpenCLRuntime(), &event);
-        int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        MNN_PRINT("kernel cost:%d us DepthwiseConvSubgroup transe\n", costTime);
+
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DepthwiseConvSubgroup transe", event});
 #else
         run3DKernelDefault(mTranseKernel, mTranseGlobalWorkSize, mTranseLocalWorkSize,
                            mOpenCLBackend->getOpenCLRuntime());
@@ -274,8 +274,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onExecute(const std::vector<Tensor
                        mOpenCLBackend->getOpenCLRuntime(),
                        &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us DepthwiseConvSubgroupBuf\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DepthwiseConvSubgroupBuf", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -90,9 +90,8 @@ ErrorCode GridSampleBufExecution::onExecute(const std::vector<Tensor *> &inputs,
     cl::Event event;
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us GridSample\n", costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"GridSample", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -102,8 +102,7 @@ ErrorCode Interp3DBufExecution::onExecute(const std::vector<Tensor *> &inputs, c
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Interp\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Interp", event});
 #else
     run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -96,8 +96,7 @@ ErrorCode InterpBufExecution::onExecute(const std::vector<Tensor *> &inputs, con
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Interp\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Interp", event});
 #else
     run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -175,8 +175,7 @@ ErrorCode LayerNormBufExecution::onExecute(const std::vector<Tensor *> &inputs,
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us LayerNormBuf\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"LayerNormBuf", event});
 #else
     run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -111,9 +111,8 @@ ErrorCode MatMulBufExecution::onExecute(const std::vector<Tensor *> &inputs, con
 #ifdef ENABLE_OPENCL_TIME_PROFILER
     cl::Event event;
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, runtime, &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us MatmulBuf\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"MatmulBuf", event});
 #else
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, runtime, nullptr);
 #endif
@@ -254,8 +254,7 @@ ErrorCode PoolBufExecution::onExecute(const std::vector<Tensor *> &inputs, const
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Pooling\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Pooling", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -69,8 +69,7 @@ ErrorCode RangeBufExecution::onExecute(const std::vector<Tensor*>& inputs, const
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Range\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Range", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -209,8 +209,8 @@ ErrorCode ReductionBufExecution::onExecute(const std::vector<Tensor *> &inputs,
     cl::Event event;
     run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);
-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
+
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Reduct1D", event});
 #else
     run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -160,8 +160,7 @@ ErrorCode ScaleBufExecution::onExecute(const std::vector<Tensor *> &inputs, cons
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Scale\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Scale", event});
 #else
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime());
@@ -70,8 +70,7 @@ ErrorCode SelectBufExecution::onExecute(const std::vector<Tensor*>& inputs, cons
     runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
                 mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Select\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Select", event});
 #else
     runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
                 mOpenCLBackend->getOpenCLRuntime());
@@ -140,8 +140,7 @@ ErrorCode SoftmaxBufExecution::onExecute(const std::vector<Tensor *> &inputs, co
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Softmax\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Softmax", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
 #endif
@@ -136,8 +136,7 @@ ErrorCode UnaryBufExecution::onExecute(const std::vector<Tensor*>& inputs, const
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Unary\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Unary", event});
 #else
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime());
@@ -53,8 +53,7 @@ ErrorCode CommonExecution::onExecute(const std::vector<Tensor *> &inputs, const
                                      &event);
         }

-        int costTime = (int)runtime->getCostTime(&event);
-        MNN_PRINT("kernel cost:%d us %s%d\n",costTime, EnumNameOpType(mOpType), idx++);
+        runtime->pushEvent({EnumNameOpType(mOpType) + std::to_string(idx++), event});
 #else
         if(lws_null == true) {
             res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel,
@@ -562,8 +562,7 @@ ErrorCode ConvExecution::onExecute(const std::vector<Tensor *> &inputs, const st
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    float costTime = mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%f us Conv UseLocalMem\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Conv UseLocalMem", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -583,8 +582,7 @@ ErrorCode ConvExecution::onExecute(const std::vector<Tensor *> &inputs, const st
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Conv2D\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Conv2D", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -352,9 +352,7 @@ ErrorCode ConvWinograd::onExecute(const std::vector<Tensor*>& inputs, const std:
     auto input = inputs[0];
     auto output = outputs[0];

-#ifdef ENABLE_OPENCL_TIME_PROFILER
-    int costTime = 0;
-#else
+#ifndef ENABLE_OPENCL_TIME_PROFILER
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
             mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording);
@@ -368,10 +366,8 @@ ErrorCode ConvWinograd::onExecute(const std::vector<Tensor*>& inputs, const std:
         cl::Event event;
         runKernel2D(mSourceTransform[b], mGWS_S[b], mLWS_S[b],
                     mOpenCLBackend->getOpenCLRuntime(), &event);

-        int costTime0 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        costTime += costTime0;
-        MNN_PRINT("kernel cost:%d us ConvWino0\n",costTime0);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvWino0", event});
 #else
         runKernel2D(mSourceTransform[b], mGWS_S[b], mLWS_S[b],
                     mOpenCLBackend->getOpenCLRuntime());
@@ -384,10 +380,8 @@ ErrorCode ConvWinograd::onExecute(const std::vector<Tensor*>& inputs, const std:
         cl::Event event;
         runKernel2D(mMatMul[b], mGWS_M[b], mLWS_M[b],
                     mOpenCLBackend->getOpenCLRuntime(), &event);

-        int costTime1 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        costTime += costTime1;
-        MNN_PRINT("kernel cost:%d us ConvWino1\n",costTime1);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvWino1", event});
 #else
         runKernel2D(mMatMul[b], mGWS_M[b], mLWS_M[b],
                     mOpenCLBackend->getOpenCLRuntime());
@@ -400,19 +394,14 @@ ErrorCode ConvWinograd::onExecute(const std::vector<Tensor*>& inputs, const std:
         cl::Event event;
         runKernel2D(mDestTransform[b], mGWS_D[b], mLWS_D[b],
                     mOpenCLBackend->getOpenCLRuntime(), &event);

-        int costTime2 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-        costTime += costTime2;
-        MNN_PRINT("kernel cost:%d us ConvWino2\n",costTime2);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvWino2", event});
 #else
         runKernel2D(mDestTransform[b], mGWS_D[b], mLWS_D[b],
                     mOpenCLBackend->getOpenCLRuntime());
 #endif
         }
     }
-#ifdef ENABLE_OPENCL_TIME_PROFILER
-    MNN_PRINT("kernel cost:%d us ConvWino total\n",costTime);
-#endif

     return NO_ERROR;
 }
@@ -178,8 +178,7 @@ ErrorCode DeconvExecution::onExecute(const std::vector<Tensor *> &inputs, const
                        mOpenCLBackend->getOpenCLRuntime(),
                        &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Deconv\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Deconv", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -165,8 +165,7 @@ ErrorCode DepthwiseConvExecution::onExecute(const std::vector<Tensor *> &inputs,
                        mOpenCLBackend->getOpenCLRuntime(),
                        &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us DepthwiseConv\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DepthwiseConv", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -168,8 +168,7 @@ ErrorCode DepthwiseDeconvExecution::onExecute(const std::vector<Tensor *> &input
                        mOpenCLBackend->getOpenCLRuntime(),
                        &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us DepthwiseDeconv\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DepthwiseDeconv", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -83,8 +83,7 @@ ErrorCode FuseExecution::onExecute(const std::vector<Tensor *> &inputs, const st
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Fuse\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Fuse", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -91,9 +91,8 @@ ErrorCode GridSampleExecution::onExecute(const std::vector<Tensor *> &inputs, co
     cl::Event event;
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us GridSample\n", costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"GridSample", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -103,8 +103,7 @@ ErrorCode Interp3DExecution::onExecute(const std::vector<Tensor *> &inputs, cons
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Interp3D\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Interp3D", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -95,8 +95,7 @@ ErrorCode InterpExecution::onExecute(const std::vector<Tensor *> &inputs, const
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Interp\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Interp", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -176,8 +176,7 @@ ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, con
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us LayerNorm\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"LayerNorm", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -115,9 +115,8 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
 #ifdef ENABLE_OPENCL_TIME_PROFILER
     cl::Event event;
     runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, runtime, &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Matmul\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Matmul", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -151,8 +151,7 @@ ErrorCode PoolExecution::onExecute(const std::vector<Tensor *> &inputs, const st
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Pooling\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Pooling", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -200,8 +200,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
 #ifdef ENABLE_OPENCL_TIME_PROFILER
     cl::Event event;
     run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Reduct1D", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -129,8 +129,7 @@ ErrorCode RoiPooling::onExecute(const std::vector<Tensor *> &inputs, const std::
     run3DKernelDefault(mKernel, mGWS, mLWS,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us RoiPooling\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"RoiPooling", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -171,8 +171,7 @@ ErrorCode ScaleExecution::onExecute(const std::vector<Tensor *> &inputs, const s
     cl::Event event;
     run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Softmax\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"scale", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -139,8 +139,7 @@ ErrorCode SoftmaxExecution::onExecute(const std::vector<Tensor *> &inputs, const
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Softmax\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Softmax", event});
 #else
     if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
         if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
@@ -74,8 +74,7 @@ ErrorCode UnaryExecution::onExecute(const std::vector<Tensor*>& inputs, const st
     run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
                        mOpenCLBackend->getOpenCLRuntime(), &event);

-    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
-    MNN_PRINT("kernel cost:%d us Unary\n",costTime);
+    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Unary", event});
 #else
     auto openCLBackend = static_cast<OpenCLBackend*>(backend());
     if(openCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
@@ -210,32 +210,34 @@ public:
         // Check Zero for inputs[2]
         bool zero = false;
         auto type = inputs[2]->getType();
-        switch (type.code) {
-            case halide_type_int:
-            {
-                if (type.bits == 8) {
-                    zero = inputs[2]->host<int8_t>()[0] == 0;
-                } else if (type.bits == 32) {
-                    zero = inputs[2]->host<int32_t>()[0] == 0;
-                }
-            }
-                break;
-            case halide_type_uint:
-            {
-                if (type.bits == 8) {
-                    zero = inputs[2]->host<uint8_t>()[0] == 0;
-                } else if (type.bits == 32) {
-                    zero = inputs[2]->host<uint32_t>()[0] == 0;
-                }
-            }
-                break;
-            case halide_type_float:
-            {
-                zero = inputs[2]->host<float>()[0] == 0.0f;
-            }
-                break;
-            default:
-                break;
-        }
+        if (!TensorUtils::getDescribe(inputs[2])->isMutable && inputs[2]->deviceId() == 0) {
+            switch (type.code) {
+                case halide_type_int:
+                {
+                    if (type.bits == 8) {
+                        zero = inputs[2]->host<int8_t>()[0] == 0;
+                    } else if (type.bits == 32) {
+                        zero = inputs[2]->host<int32_t>()[0] == 0;
+                    }
+                }
+                    break;
+                case halide_type_uint:
+                {
+                    if (type.bits == 8) {
+                        zero = inputs[2]->host<uint8_t>()[0] == 0;
+                    } else if (type.bits == 32) {
+                        zero = inputs[2]->host<uint32_t>()[0] == 0;
+                    }
+                }
+                    break;
+                case halide_type_float:
+                {
+                    zero = inputs[2]->host<float>()[0] == 0.0f;
+                }
+                    break;
+                default:
+                    break;
+            }
+        }
         if (zero) {
             return true;
@@ -294,5 +294,5 @@ public:
     }
 };

-REGISTER_SHAPE_INPUTS(StridedSliceComputer, OpType_StridedSlice, (std::vector<int>{1,2,3}));
+REGISTER_SHAPE_INPUTS(StridedSliceComputer, OpType_StridedSlice, (std::vector<int>{1,2,3,4}));
 } // namespace MNN
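
The added guard matters because host<T>() reads CPU-side memory: the zero shortcut is only trustworthy when the tensor's content is fixed at shape-inference time and actually resident on the host. A hedged restatement of that condition; canReadHostValue is a hypothetical helper, not part of the commit, and the include path is assumed:

    #include "core/TensorUtils.hpp"  // assumed MNN-internal header for TensorUtils

    // Hypothetical helper mirroring the guard above: only dereference host<T>()
    // when the tensor is immutable and has no device-side allocation.
    static bool canReadHostValue(const MNN::Tensor* t) {
        return !MNN::TensorUtils::getDescribe(t)->isMutable && t->deviceId() == 0;
    }
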
@@ -44,5 +44,5 @@ class TopKV2SizeComputer : public SizeComputer {
     }
 };

-REGISTER_SHAPE_INPUTS(TopKV2SizeComputer, OpType_TopKV2, {1});
+REGISTER_SHAPE_INPUTS(TopKV2SizeComputer, OpType_TopKV2, (std::vector<int>{1,2}));
 } // namespace MNN
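
Both registration changes widen the same declaration: the last argument of REGISTER_SHAPE_INPUTS lists the input slots whose contents, not just shapes, the SizeComputer dereferences, so those host buffers can be made readable before shape inference runs; StridedSlice now declares slots 1-4 and TopKV2 slots 1-2. That reading is inferred from the call sites in this commit, not from the macro's definition:

    // Inferred usage (hypothetical op): shape inference for MyOp reads the
    // host contents of input slots 1 and 2, so both must be declared here.
    // REGISTER_SHAPE_INPUTS(MyOpSizeComputer, OpType_MyOp, (std::vector<int>{1,2}));
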
@@ -16,6 +16,9 @@ void CastParamsToHalf(std::unique_ptr<MNN::OpT>& op) {
         case MNN::OpType_Convolution:
         case MNN::OpType_ConvolutionDepthwise: {
             auto param = op->main.AsConvolution2D();
+            if (param->quanParameter != nullptr) {
+                break;
+            }
             const int weightSize = param->weight.size();
             // const int biasSize = param->bias.size();
             std::vector<half_float::half> quantizedFp16Weight;
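
The early break skips the fp32-to-fp16 cast whenever quanParameter is set, i.e. when the weights are presumably stored in quantized/compressed form rather than as plain floats in param->weight. For context, a minimal sketch of the cast the surrounding code performs; it assumes the half_float library bundled with MNN's converter:

    #include <vector>
    #include "half.hpp"  // half_float::half, converts implicitly from float

    // Per-element float -> half cast, as done for param->weight below this guard.
    static std::vector<half_float::half> toHalf(const std::vector<float>& w) {
        return std::vector<half_float::half>(w.begin(), w.end());
    }
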