C++ matrix computation performance comparison: Eigen vs. GPU
Author: guotianqing
This article compares the performance of matrix computation in C++ using the Eigen library and a GPU. It is offered as a reference; corrections for any errors or omissions are welcome.
Generating a random matrix
There are several ways to generate a random matrix. The most straightforward is to loop over the matrix explicitly and assign a random value to each element.
```cpp
#include <iostream>
#include <random>

using namespace std;

// Generate a random double uniformly distributed in [1, 10)
double GenerateRandomRealValue() {
    std::random_device rd;
    std::default_random_engine eng(rd());
    std::uniform_real_distribution<double> distr(1, 10);
    return distr(eng);
}

int main() {
    // 3x3 matrix
    double a[3][3];
    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 3; ++j) {
            a[i][j] = GenerateRandomRealValue();
        }
    }
    return 0;
}
```
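As a side note, `GenerateRandomRealValue` constructs a new `std::random_device` and engine on every call, which is relatively expensive. A common variant (a sketch, not part of the original code; the function name is hypothetical) seeds one engine once and reuses it:

```cpp
#include <random>

// Hypothetical variant: seed one engine once and reuse it across calls
double GenerateRandomRealValueFast() {
    static std::default_random_engine eng{std::random_device{}()};
    static std::uniform_real_distribution<double> distr(1, 10);
    return distr(eng);
}
```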
Another option is the Eigen library, which provides matrix types and the corresponding operations out of the box.
Generating a random matrix with Eigen:
#include "Eigen/Dense" #include <functional> using namespace std; using namespace Eigen; MatrixXd Generate2DMatrixByEigen() { // 直接使用内置的Random,产生均匀分布随机矩阵 MatrixXd m = MatrixXd::Random(3,3); // 也可以调用自定义的随机数生成函数填充数据 // MatrixXd m = MatrixXd::Zero(3,3).unaryExpr(std::bind(GenerateRandomRealValue)); return m; }
Computing the matrix product
Using explicit loops
The code:
```cpp
#include <chrono>
#include <iostream>
#include <vector>

using namespace std;

void CalcMatrixDotForLoop(const vector<vector<double>>& a, const vector<vector<double>>& b) {
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();

    if (a[0].size() != b.size()) {
        cout << "error:" << a[0].size() << "," << b.size() << endl;
        return;
    }

    vector<vector<double>> c;
    for (size_t i = 0; i < a.size(); ++i) {
        // Start each output row from zero; reusing a non-reset row would accumulate stale values
        vector<double> c_row(b[0].size(), 0.0);
        for (size_t j = 0; j < b[0].size(); ++j) {
            for (size_t k = 0; k < b.size(); ++k) {
                c_row[j] += a[i][k] * b[k][j];
            }
        }
        c.emplace_back(std::move(c_row));
    }

    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> time_span = t2 - t1;
    std::cout << "Loop takes " << time_span.count() << " ms\n";

    // cout << "matrix c:\n";
    // for (size_t i = 0; i < c.size(); ++i) {
    //     for (size_t j = 0; j < c[0].size(); ++j) {
    //         cout << c[i][j] << ",";
    //     }
    //     cout << endl;
    // }
}
```
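For reference, a minimal driver for this benchmark (a sketch; it assumes the `GenerateRandomRealValue` helper shown earlier, and the matrix size `n` is arbitrary) might look like this:

```cpp
#include <vector>
using std::vector;

int main() {
    const int n = 500;  // arbitrary benchmark size
    vector<vector<double>> a(n, vector<double>(n));
    vector<vector<double>> b(n, vector<double>(n));
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            a[i][j] = GenerateRandomRealValue();
            b[i][j] = GenerateRandomRealValue();
        }
    }
    CalcMatrixDotForLoop(a, b);
    return 0;
}
```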
Using the Eigen library
The code:
```cpp
#include <chrono>
#include <iostream>
#include "Eigen/Dense"

using namespace std;
using namespace Eigen;

void ModeEigen(const int a_row, const int a_col, const int b_row, const int b_col) {
    // The inputs a and b are not defined in the original fragment; build random
    // matrices here so the snippet is self-contained
    MatrixXd a = MatrixXd::Random(a_row, a_col);
    MatrixXd b = MatrixXd::Random(b_row, b_col);

    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    // Assign to MatrixXd (not auto) so the product is actually evaluated here;
    // with auto, Eigen would only build an unevaluated expression template
    MatrixXd c = a * b;
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double, std::milli> time_span = t2 - t1;
    std::cout << "Eigen takes " << time_span.count() << " ms\n";
    // cout << "matrix c:\n" << c << endl;
}
```
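Calling the benchmark is then straightforward; for example (the sizes below are arbitrary):

```cpp
int main() {
    ModeEigen(500, 500, 500, 500);      // small square matrices
    ModeEigen(2000, 2000, 2000, 2000);  // larger square matrices
    return 0;
}
```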
Using the GPU
A code fragment:
```cpp
auto t_begin = std::chrono::high_resolution_clock::now();

t1 = std::chrono::high_resolution_clock::now();
cudaMalloc((void**)&da, size);
cudaMalloc((void**)&db, size);
cudaMalloc((void**)&dc, size);
t2 = std::chrono::high_resolution_clock::now();
time_span = t2 - t1;
std::cout << "GPU malloc takes " << time_span.count() << " ms\n";

t1 = std::chrono::high_resolution_clock::now();
cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
t2 = std::chrono::high_resolution_clock::now();
time_span = t2 - t1;
std::cout << "cudaMemcpy takes " << time_span.count() << " ms\n";

t1 = std::chrono::high_resolution_clock::now();
dim3 dg(32, 32);                                          // threads per block
dim3 dbs((n + dg.x - 1) / dg.x, (n + dg.y - 1) / dg.y);   // blocks per grid, rounded up
mextix<<<dbs, dg>>>(da, db, dc, n);
cudaDeviceSynchronize();  // kernel launches are asynchronous; wait so the timing covers the computation
t2 = std::chrono::high_resolution_clock::now();
time_span = t2 - t1;
std::cout << "gpu takes " << time_span.count() << " ms\n";

t1 = std::chrono::high_resolution_clock::now();
cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);
t2 = std::chrono::high_resolution_clock::now();
time_span = t2 - t1;
std::cout << "cudaMemcpy back takes " << time_span.count() << " ms\n";

cudaFree(da);
cudaFree(db);
cudaFree(dc);

auto t_end = std::chrono::high_resolution_clock::now();
time_span = t_end - t_begin;
std::cout << "GPU total takes " << time_span.count() << " ms\n";
```
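The fragment launches a kernel named `mextix`, whose definition is not included above. A minimal sketch of such a naive matrix-multiplication kernel, assuming row-major `n x n` matrices of `double` (so that `size` equals `n * n * sizeof(double)`), could look like this:

```cpp
// Naive matrix multiplication: each thread computes one element of c = a * b
__global__ void mextix(const double* a, const double* b, double* c, int n) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < n && col < n) {
        double sum = 0.0;
        for (int k = 0; k < n; ++k) {
            sum += a[row * n + k] * b[k * n + col];
        }
        c[row * n + col] = sum;
    }
}
```

This sketch does no shared-memory tiling; a tuned library such as cuBLAS would be considerably faster, which only strengthens the conclusion below that GPUs win on large workloads.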
Analysis of the results
Testing leads to the following conclusions:
- For matrix computation on the CPU, Eigen is far faster than explicit loops (only a single thread was used here; you could of course try multiple threads, but program complexity rises noticeably).
- For small matrices, Eigen is faster than the GPU, because copying data between host and device consumes most of the total time.
- The GPU only shows its advantage on larger matrices, once the computation time outweighs the copy time; the larger the workload, the more the GPU's parallelism pays off.
In short:
- Avoid explicit loops; use the Eigen library instead.
- For typical applications Eigen is enough to cover most scenarios, and CPU machines are cheaper and more widely available than GPU machines.
- GPUs really come into their own for workloads dominated by large-scale matrix computation, such as machine learning.
Summary
The above reflects personal experience; I hope it serves as a useful reference.