MPI Parallel Computing Study Notes 3: Matrix-Vector Multiplication in the Peer-to-Peer Pattern
1. Environment: VS2017 + MPI
2. Concepts: In the peer-to-peer pattern, every process, including the master process, carries roughly the same computational load. The opposite design is the master-slave pattern, in which a single master process only schedules work: each slave process receives a task from the master, sends the result back when it finishes, and then waits for the next task. The two patterns are not completely independent of each other; a minimal sketch of the master-slave dispatch loop is given below for contrast.
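In the following sketch the tag values, the integer task payload, and the placeholder computation are illustrative assumptions, not part of this article's program:
#include "mpi.h"
// Minimal master-slave sketch: rank 0 only dispatches tasks, the workers compute.
void masterSlave(int myid, int numprocs, int numTasks)
{
    const int TAG_TASK = 1, TAG_STOP = 2, TAG_RESULT = 3;
    if (myid == 0)
    {
        int sent = 0, received = 0;
        // hand one task to every worker to start with
        for (int w = 1; w < numprocs && sent < numTasks; ++w)
        {
            MPI_Send(&sent, 1, MPI_INT, w, TAG_TASK, MPI_COMM_WORLD);
            ++sent;
        }
        // collect results; whichever worker finishes gets the next task
        while (received < sent)
        {
            double result;
            MPI_Status st;
            MPI_Recv(&result, 1, MPI_DOUBLE, MPI_ANY_SOURCE, TAG_RESULT, MPI_COMM_WORLD, &st);
            ++received;
            if (sent < numTasks)
            {
                MPI_Send(&sent, 1, MPI_INT, st.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD);
                ++sent;
            }
        }
        // all tasks done: tell every worker to stop
        for (int w = 1; w < numprocs; ++w)
            MPI_Send(&sent, 0, MPI_INT, w, TAG_STOP, MPI_COMM_WORLD);
    }
    else
    {
        while (true)
        {
            int task;
            MPI_Status st;
            MPI_Recv(&task, 1, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &st);
            if (st.MPI_TAG == TAG_STOP)
                break;
            double result = 2.0 * task;   // placeholder for the real work
            MPI_Send(&result, 1, MPI_DOUBLE, 0, TAG_RESULT, MPI_COMM_WORLD);
        }
    }
}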
3. Program description: C = A * B, where A is a dim × dim matrix and B is a dim-dimensional vector.
- To keep the program short, a very stripped-down matrix class (Matrix) is used; it can be skipped;
- Matrix A is partitioned into blocks of rows. For simplicity each block has part = dim/numprocs rows, and the number of processes is assumed to divide the number of rows evenly;
- Under the previous assumption, the elements of each block are contiguous in memory and every block has the same number of elements, so the collective communication functions of the MPI standard, such as MPI_Scatter and MPI_Gather, can be used; this both simplifies the code and improves efficiency;
- If the number of rows is not divisible by the number of processes, extra all-zero rows can be appended until it is; alternatively, the more general scatter/gather functions (MPI_Scatterv / MPI_Gatherv) allow a different element count for each process, as in the sketch after this list;
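A sketch of the uneven split with MPI_Scatterv / MPI_Gatherv might look as follows; the function and the names (scatterUnevenRows, sendcounts, displs, and so on) are illustrative and do not appear in the program below, which assumes an even split:
#include <vector>
#include "mpi.h"
// Scatter the rows of a dim x dim matrix as evenly as possible when dim is not
// divisible by numprocs, then gather the partial results back to the root.
// fullMatrix and fullResult only need to be valid on the root process.
void scatterUnevenRows(double *fullMatrix, double *localBlock,
                       double *localResult, double *fullResult,
                       int dim, int myid, int numprocs, int root)
{
    std::vector<int> sendcounts(numprocs), displs(numprocs);   // matrix elements per rank
    std::vector<int> recvcounts(numprocs), rdispls(numprocs);  // result elements per rank
    for (int r = 0, elemOff = 0, rowOff = 0; r < numprocs; ++r)
    {
        int rows = dim / numprocs + (r < dim % numprocs ? 1 : 0); // first ranks take one extra row
        sendcounts[r] = rows * dim;  displs[r] = elemOff;  elemOff += rows * dim;
        recvcounts[r] = rows;        rdispls[r] = rowOff;  rowOff += rows;
    }
    // each rank receives sendcounts[myid] contiguous matrix elements (its rows)
    MPI_Scatterv(fullMatrix, sendcounts.data(), displs.data(), MPI_DOUBLE,
                 localBlock, sendcounts[myid], MPI_DOUBLE, root, MPI_COMM_WORLD);
    // ... the local matrix-vector product happens here ...
    // the root collects a different number of result elements from each rank
    MPI_Gatherv(localResult, recvcounts[myid], MPI_DOUBLE,
                fullResult, recvcounts.data(), rdispls.data(), MPI_DOUBLE,
                root, MPI_COMM_WORLD);
}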
4. A small pitfall with an MPI collective communication function
MPI_Scatter(&pC(0, 0), part*dim, MPI_DOUBLE, &A(0, 0), part*dim, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
This call sends the blocks of the matrix pC held by the masterNode process to the matrix A of each process. Only the pC of the masterNode process matters; the pC objects of the other processes are meaningless, but since a function call cannot name an undefined identifier, every process must still have a pC object or the compiler reports an error. To save memory, the first attempt defined a meaningful pC only in the masterNode process and left only a meaningless placeholder in the others, which produced the following code:
Matrix *pC = nullptr;
if (myid == masterNode)
pC = new Matrix(dim, dim);
MPI_Scatter(&(*pC)(0, 0), part*dim, MPI_DOUBLE, &A(0, 0), part*dim, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
This code compiles without errors but fails at run time. Even though the non-master processes never use the contents of pC, the arguments must still be formed completely before they are passed; in those processes pC is nullptr, so evaluating (*pC)(0, 0), or taking its address with &, dereferences a null pointer, which is undefined behavior. The corrected program uses the following workaround instead (in the full code below, 1 is the default argument of the Matrix constructor, so it is omitted):
Matrix pC(1,1);
if (myid == masterNode)
pC = Matrix(dim, dim);
MPI_Scatter(&pC(0, 0), part*dim, MPI_DOUBLE, &A(0, 0), part*dim, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
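An alternative that keeps the pointer version is sketched below. The MPI standard treats the send buffer of MPI_Scatter as significant only at the root, so the non-root processes may pass a null send buffer; the point is simply never to dereference the null pointer while forming the argument (the sendbuf variable here is my addition, not part of the original program):
Matrix *pC = nullptr;
if (myid == masterNode)
pC = new Matrix(dim, dim);
//only the root dereferences pC; the other ranks pass nullptr, which MPI_Scatter ignores for non-root send buffers
double *sendbuf = (myid == masterNode) ? &(*pC)(0, 0) : nullptr;
MPI_Scatter(sendbuf, part*dim, MPI_DOUBLE, &A(0, 0), part*dim, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
With this variant the root should also delete pC once it is no longer needed.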
The complete source code follows:
#include<iostream>
#include"mpi.h"
#include<ctime>
#include<cmath>
using namespace std;
/* Matrix-vector multiplication in the peer-to-peer pattern */
double start, finish;//timing of the whole run
int myid, numprocs;//process rank and total number of processes; globals so every function can access them
int part;//number of row-vector products each process computes; there are dim of them in total
const int dim = 9000;//matrix dimension; watch out for memory limits when running on a PC
class Matrix
{
int row;
int column;
double *memoryPool;
double **p;
public:
Matrix(int scale = 1) :row(scale), column(scale)
{
//square scale x scale matrix, zero-initialized
int num = scale * scale;
memoryPool = new double[num] {0};
p = new double*[scale];
for (int i = 0; i < scale; ++i)
p[i] = memoryPool + i * scale;
}
Matrix(int _row, int _column) :row(_row), column(_column)
{
int num = row * column;
memoryPool = new double[num] {0};
p = new double*[row];
for (int i = 0; i < row; ++i)
p[i] = memoryPool + i * column;
}
~Matrix()
{
if (memoryPool) { delete[]memoryPool; }
if (p) { delete[]p; }
}
double& operator()(int i, int j)const { return p[i][j]; }
friend ostream& operator<<(ostream &out, const Matrix &obj)
{
for (int i = 0; i < obj.row; ++i)
{
for (int j = 0; j < obj.column; ++j)
out << obj(i, j) << '\t';
cout << endl;
}
return out;
}
//move constructor: steal the other matrix's buffers
Matrix(Matrix &&other)
{
row = other.row;
column = other.column;
memoryPool = other.memoryPool;
p = other.p;
other.memoryPool = nullptr;
other.p = nullptr;
}
Matrix(const Matrix &other) :row(0), column(0), memoryPool(nullptr), p(nullptr)
{
//copy constructor: members must be initialized before delegating to operator=
*this = other;
}
Matrix& operator=(const Matrix &obj)
{
if (row != obj.row || column != obj.column)
{
if (memoryPool)delete[]memoryPool;
if (p)delete[]p;
row = obj.row;
column = obj.column;
memoryPool = new double[row*column];
p = new double*[row];
for (int i = 0; i < row; ++i)
p[i] = memoryPool + i * column;
}
for (int i = 0; i < row; ++i)
for (int j = 0; j < column; ++j)
p[i][j] = obj(i, j);
return *this;
}
Matrix& operator=(Matrix &&obj)
{
//move assignment: free our buffers and steal obj's (guard against self-move)
if (this == &obj) return *this;
if (memoryPool) { delete[]memoryPool; }
if (p) { delete[]p; }
row = obj.row;
column = obj.column;
memoryPool = obj.memoryPool;
p = obj.p;
obj.memoryPool = nullptr;
obj.p = nullptr;
return *this;
}
//fill the matrix with pseudo-random values in [0, val/100)
void ranCreate(int val = 200)
{
srand(time(NULL));
for (int i = 0; i < row; ++i)
for (int j = 0; j < column; ++j)
p[i][j] = (rand() % val) / 100.0;
}
//naive triple-loop product of (row x column) and (column x obj.column)
Matrix operator*(const Matrix &obj)
{
Matrix tmp(row, obj.column);
for (int i = 0; i < row; ++i)
for (int j = 0; j < obj.column; ++j)
{
for (int k = 0; k < column; ++k)
tmp(i, j) += p[i][k] * obj(k, j);
}
return tmp;
}
};
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);//initialize the MPI library
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);//get the number of processes
MPI_Comm_rank(MPI_COMM_WORLD, &myid);//get the rank of this process: 0, 1, 2, ..., numprocs - 1
int masterNode = 0;//rank 0 is the master process
double com_start, com_finish;//start and finish times of the main communication section
part = dim / numprocs;//assume dim is divisible by numprocs
Matrix B(dim, 1);//the vector
Matrix A(part, dim);//this process's row block of the matrix
Matrix pC;
if (myid == masterNode)
{
pC = Matrix(dim);
//randomly generate the matrix and the vector; random-number generation takes a sizeable share of the time in this example, so it is excluded from the timing
pC.ranCreate();
B.ranCreate();
start = MPI_Wtime();
if (numprocs == 1)
{
//cout << pC * B << endl;
Matrix result = pC * B;
finish = MPI_Wtime();
cout <<"串行计算时间:"<< finish - start << endl;
MPI_Finalize();
return 0;
}
}
com_start = MPI_Wtime();//record the communication start time
//broadcast the vector to every process
MPI_Bcast(&B(0, 0), dim, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
//send blocks 0, 1, 2, ..., numprocs - 1 of matrix pC to the processes with ranks 0, 1, 2, ..., numprocs - 1 respectively
MPI_Scatter(&pC(0, 0), part*dim, MPI_DOUBLE, &A(0, 0), part*dim, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
com_finish = MPI_Wtime();
Matrix partResult = A*B;//each process's share of the computation
Matrix result;
if (myid == masterNode)
result = Matrix(dim, 1);
//gather, the counterpart of MPI_Scatter
MPI_Gather(&partResult(0, 0), part, MPI_DOUBLE, &result(0,0), part, MPI_DOUBLE, masterNode, MPI_COMM_WORLD);
if (myid == masterNode)
{
//cout << result << endl;
finish = MPI_Wtime();
cout << "通信时间:" << com_finish - com_start << endl;
cout <<"计算与通信时间和:" <<finish - start << endl;
}
MPI_Finalize();
return 0;
}
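For reference, a typical way to launch the program from the command line (assuming MS-MPI's mpiexec; the executable name MatVec.exe is only an example):
mpiexec -n 4 MatVec.exe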
Test results:
Analysis:
- In matrix-vector multiplication every matrix element is used only once, so the amount of arithmetic per transferred element is small;
- The results above were obtained with a matrix dimension of 9000; the communication time already exceeds the computation time, so the parallel version shows no advantage here. A rough estimate follows below.
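As a rough back-of-the-envelope estimate (assuming 8-byte doubles):
- data scattered from the master: dim * dim * 8 bytes = 9000 * 9000 * 8 ≈ 6.5 × 10^8 bytes ≈ 648 MB;
- floating-point work: about 2 * dim * dim ≈ 1.6 × 10^8 operations;
so only about two floating-point operations are performed per eight bytes communicated, which is why the run is communication-bound.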