mirror of https://github.com/alibaba/MNN.git
211 lines
5.9 KiB
C++
211 lines
5.9 KiB
C++
//
|
|
// getPerformance.cpp
|
|
// MNN
|
|
//
|
|
// Created by MNN on 2019/03/12.
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
//
|
|
|
|
#include <string.h>
|
|
#include <chrono>
|
|
#include <cstdint>
|
|
#include <vector>
|
|
#include <stdlib.h>
|
|
#include <MNN/MNNDefine.h>
|
|
#include "core/Macro.h"
|
|
#ifdef MNN_USE_NEON
|
|
#include <arm_neon.h>
|
|
#endif
|
|
|
|
// Minimal stopwatch built on std::chrono::high_resolution_clock.
// Call startTimer() to mark the beginning of an interval, then getCostTimer()
// to obtain the time elapsed since that mark.
class Timer {
public:
    // Records the current time as the start of the measured interval.
    void startTimer() {
        mBegin = Clock::now();
    }

    // Returns the time elapsed since the last startTimer() call.
    // The value is a whole number of MICROSECONDS converted to float
    // (callers divide by 1e6 to obtain seconds).
    float getCostTimer() {
        mEnd = Clock::now();
        const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(mEnd - mBegin);
        return static_cast<float>(elapsed.count());
    }

private:
    using Clock = std::chrono::high_resolution_clock;

    Clock::time_point mBegin, mEnd;
};
|
|
|
|
int getCpuCounts() {
|
|
FILE* fp = fopen("/proc/cpuinfo", "rb");
|
|
if (fp == nullptr) {
|
|
MNN_PRINT("fopen error ! \n");
|
|
return 0;
|
|
}
|
|
int cpuCounts = 0;
|
|
char data[1024];
|
|
while (!feof(fp)) {
|
|
char* a = fgets(data, 1024, fp);
|
|
|
|
if (a == nullptr) {
|
|
break;
|
|
}
|
|
if (memcmp(data, "processor", 9) == 0) {
|
|
cpuCounts++;
|
|
}
|
|
}
|
|
|
|
fclose(fp);
|
|
fp = nullptr;
|
|
return cpuCounts;
|
|
}
|
|
|
|
// 0 max 1 min 2 cur
|
|
void getFreqKhz(int cpuid, std::vector<int>& freqVector) {
|
|
char path[256];
|
|
int freqKhz = -1;
|
|
// max
|
|
sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
|
|
FILE* fp = fopen(path, "rb");
|
|
if (nullptr == fp) {
|
|
MNN_PRINT("cpuinfo_max_freq fopen error ! \n");
|
|
freqVector.emplace_back(0);
|
|
} else {
|
|
fscanf(fp, "%d", &freqKhz);
|
|
fclose(fp);
|
|
freqVector.push_back(freqKhz);
|
|
}
|
|
|
|
// min
|
|
sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq", cpuid);
|
|
fp = fopen(path, "rb");
|
|
if (nullptr == fp) {
|
|
MNN_PRINT("cpuinfo_min_freq fopen error ! \n");
|
|
freqVector.emplace_back(0);
|
|
} else {
|
|
freqKhz = -1;
|
|
fscanf(fp, "%d", &freqKhz);
|
|
fclose(fp);
|
|
freqVector.push_back(freqKhz);
|
|
}
|
|
|
|
// cur
|
|
// sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_cur_freq", cpuid);
|
|
// fp = fopen(path, "rb");
|
|
// if(nullptr == fp){
|
|
// MNN_PRINT("cpuinfo_cur_freq fopen error ! \n");
|
|
// }else{
|
|
// freqKhz = -1;
|
|
// fscanf(fp, "%d", &freqKhz);
|
|
// fclose(fp);
|
|
// freqVector.push_back(freqKhz);
|
|
// }
|
|
}
|
|
|
|
// Executes a tight loop of NEON float multiply-accumulate instructions
// `loopCounts` times. Used as the timed kernel of the throughput benchmark.
//
// - 32-bit ARM : 13 `vmla.f32` instructions per iteration (destinations q15..q3).
// - AArch64    : 25 `fmla` (.4s) instructions per iteration (destinations v31..v7).
// - Without MNN_USE_NEON the function does nothing.
//
// The vector registers are used with whatever values they hold on entry; the
// arithmetic results are discarded.
//
// loopCounts <= 0 returns immediately: the loop decrements before testing for
// zero, so a non-positive count would otherwise wrap around and spin for
// roughly 2^32 iterations.
void cpuFloatMlaTest(int32_t loopCounts) {
    if (loopCounts <= 0) {
        return;
    }
#ifdef MNN_USE_NEON
#ifndef __aarch64__

    __asm__ __volatile__(
        "mov r12, %0\n"
        "0: \n"
        "vmla.f32 q15, q15, d0[0] \n"
        "vmla.f32 q14, q14, d0[1] \n"
        "vmla.f32 q13, q13, d1[0] \n"
        "vmla.f32 q12, q12, d1[1] \n"
        "vmla.f32 q11, q11, d2[0] \n"
        "vmla.f32 q10, q10, d2[1] \n"
        "vmla.f32 q9, q9, d3[0] \n"
        "vmla.f32 q8, q8, d3[1] \n"
        "vmla.f32 q7, q7, d4[0] \n"
        "vmla.f32 q6, q6, d4[1] \n"
        "vmla.f32 q5, q5, d5[0] \n"
        "vmla.f32 q4, q4, d5[1] \n"
        "vmla.f32 q3, q3, d6[0] \n"
        "subs r12, r12, #1 \n"
        "bne 0b \n"
        :
        : "r"(loopCounts)
        // Every register written by the loop (q3..q15, r12, flags) must be
        // declared so the compiler never keeps a live value in one of them.
        : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );

#else
    __asm__ __volatile__(
        "mov w9, %w0\n"
        "0: \n"
        "fmla v31.4s, v31.4s, v0.s[0]\n"
        "fmla v30.4s, v30.4s, v0.s[1]\n"
        "fmla v29.4s, v29.4s, v0.s[2]\n"
        "fmla v28.4s, v28.4s, v0.s[3]\n"
        "fmla v27.4s, v27.4s, v1.s[0]\n"
        "fmla v26.4s, v26.4s, v1.s[1]\n"
        "fmla v25.4s, v25.4s, v1.s[2]\n"
        "fmla v24.4s, v24.4s, v1.s[3]\n"
        "fmla v23.4s, v23.4s, v3.s[0]\n"
        "fmla v22.4s, v22.4s, v3.s[1]\n"
        "fmla v21.4s, v21.4s, v3.s[2]\n"
        "fmla v20.4s, v20.4s, v3.s[3]\n"
        "fmla v19.4s, v19.4s, v4.s[0]\n"
        "fmla v18.4s, v18.4s, v4.s[1]\n"
        "fmla v17.4s, v17.4s, v4.s[2]\n"
        "fmla v16.4s, v16.4s, v4.s[3]\n"
        "fmla v15.4s, v15.4s, v5.s[0]\n"
        "fmla v14.4s, v14.4s, v5.s[1]\n"
        "fmla v13.4s, v13.4s, v5.s[2]\n"
        "fmla v12.4s, v12.4s, v5.s[3]\n"
        "fmla v11.4s, v11.4s, v6.s[0]\n"
        "fmla v10.4s, v10.4s, v6.s[1]\n"
        "fmla v9.4s, v9.4s, v6.s[2]\n"
        "fmla v8.4s, v8.4s, v6.s[3]\n"
        "fmla v7.4s, v7.4s, v2.s[0]\n"
        "subs w9, w9, #1 \n"
        "bne 0b \n"
        :
        : "r"(loopCounts)
        // Every register written by the loop (v7..v31, w9, flags) must be
        // declared so the compiler never keeps a live value in one of them.
        : "cc", "memory", "w9", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
          "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );
#endif
#endif
}
|
|
|
|
void cpuFLOPSPerformance() {
|
|
int32_t loopCounts = 100000000;
|
|
MNN_PRINT("CPU PERFORMANCE -> loopCounts : %d \n", loopCounts);
|
|
|
|
std::vector<int> freqVector;
|
|
for (int i = 0; i < getCpuCounts(); i++) {
|
|
freqVector.clear();
|
|
getFreqKhz(i, freqVector);
|
|
MNN_PRINT("core %d : max : %d, min : %d \n",i, freqVector.at(0), freqVector.at(1));
|
|
}
|
|
|
|
// warm up
|
|
cpuFloatMlaTest(loopCounts);
|
|
|
|
Timer timeInstance;
|
|
timeInstance.startTimer();
|
|
cpuFloatMlaTest(loopCounts);
|
|
#ifdef MNN_USE_NEON
|
|
#ifndef __aarch64__
|
|
auto number = (double)loopCounts * 13;
|
|
#else
|
|
auto number = (double)loopCounts * 25;
|
|
#endif
|
|
#else
|
|
auto number = 0.0;
|
|
#endif
|
|
//FUNC_PRINT(number);
|
|
float costTime_ms = timeInstance.getCostTimer();
|
|
double costTime_s = (double)(costTime_ms) / 1000000.0f;
|
|
// MNN_PRINT("cost time : %f \n", costTime_s);
|
|
double mlaCounts_g = number * 4 / 1000000000.0f;
|
|
float gflops = mlaCounts_g / costTime_s;
|
|
MNN_PRINT(" ======================== float ===============================\n");
|
|
MNN_PRINT("CPU float gflops : %f\n", gflops);
|
|
}
|
|
|
|
// Program entry point: prints a start banner, then runs the CPU float
// throughput benchmark. Command-line arguments are not used.
// Always returns 0.
int main(int argc, const char* argv[]) {
    MNN_PRINT("Start PERFORMANCE !!! \n");

    cpuFLOPSPerformance();
    return 0;
}
|