并行编程实现矩阵乘法优化
程序员文章站
2022-07-12 21:11:10
...
实现四种矩阵乘法程序,并对比运行效率。
1) 串行算法
2) Catch优化
3) SSE版本
4) 分片策略
#include<pmmintrin.h>
#include<cstdlib>
#include<algorithm>
#include<windows.h>
#include<iostream>
#include<ctime>
using namespace std;
const int maxN=1024;
const int T=64;
int n;
float a[maxN][maxN];
float b[maxN][maxN];
float c[maxN][maxN];
long long head,tail,freq;
double mul(int n,float a[][maxN],float b[][maxN],float c[][maxN]){
long long head,tail,freq;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
QueryPerformanceCounter((LARGE_INTEGER*)&head);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
c[i][j]=0.0;
for(int k=0;k<n;++k){
c[i][j]+=a[i][k]*b[k][j];
}
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&tail);
cout<<"mul:"<<(double)(tail-head)/(double)freq<<endl;
return (double)(tail-head)/(double)freq;
}
double trans_mul(int n,float a[][maxN],float b[][maxN],float c[][maxN]){
//transposition
long long head,tail,freq;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
QueryPerformanceCounter((LARGE_INTEGER*)&head);
for(int i=0;i<n;i++)for(int j=0;j<n;j++)swap(b[i][j],b[j][i]);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
c[i][j]=0.0;
for(int k=0;k<n;++k){
c[i][j]+=a[i][k]*b[k][j];
}
}
}
for(int i=0;i<n;i++)for(int j=0;j<n;j++)swap(b[i][j],b[j][i]);
QueryPerformanceCounter((LARGE_INTEGER*)&tail);
cout<<"trans_mul:"<<(double)(tail-head)/(double)freq<<endl;
return (double)(tail-head)/(double)freq;
}
double sse_mul(int n,float a[][maxN],float b[][maxN],float c[][maxN]){
__m128 t1,t2,sum;
long long head,tail,freq;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
QueryPerformanceCounter((LARGE_INTEGER*)&head);
for(int i=0;i<n;i++)for(int j=0;j<n;j++)swap(b[i][j],b[j][i]);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
c[i][j]=0.0;
sum=_mm_setzero_ps();
for(int k=n-4;k>=0;k-=4){
t1=_mm_loadu_ps(a[i]+k);
t2=_mm_loadu_ps(b[j]+k);
t1=_mm_mul_ps(t1,t2);
sum=_mm_add_ps(sum,t1);
}
sum=_mm_hadd_ps(sum,sum);
sum=_mm_hadd_ps(sum,sum);
_mm_store_ss(c[i]+j,sum);
for(int k=(n%4)-1;k>=0;--k){
c[i][j]+=a[i][k]*b[j][k];
}
}
}
for(int i=0;i<n;i++)for(int j=0;j<n;j++)swap(b[i][j],b[j][i]);
QueryPerformanceCounter((LARGE_INTEGER*)&tail);
cout<<"sse_mul:"<<(double)(tail-head)/(double)freq<<endl;
return (double)(tail-head)/(double)freq;
}
double sse_tile(int n,float a[][maxN],float b[][maxN],float c[][maxN]){
__m128 t1, t2, sum;
float t;
long long head,tail,freq;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
QueryPerformanceCounter((LARGE_INTEGER*)&head);
for (int i = 0; i < n; ++i) for (int j = 0; j < i; ++j) swap(b[i][j], b[j][i]);
for (int r = 0; r < n / T; ++r) for (int q = 0; q < n / T; ++q) {
for (int i = 0; i < T; ++i) for (int j = 0; j < T; ++j) c[r * T + i][q * T + j] = 0.0;
for (int p = 0; p < n / T; ++p) {
for (int i = 0; i < T; ++i) for (int j = 0; j < T; ++j) {
sum = _mm_setzero_ps();
for (int k = 0; k < T; k += 4){
t1 = _mm_loadu_ps(a[r * T + i] + p * T + k);
t2 = _mm_loadu_ps(b[q * T + j] + p * T + k);
t1 = _mm_mul_ps(t1, t2);
sum = _mm_add_ps(sum, t1);
}
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, sum);
_mm_store_ss(&t, sum);
c[r * T + i][q * T + j] += t;
}
}
}
for (int i = 0; i < n; ++i) for (int j = 0; j < i; ++j) swap(b[i][j], b[j][i]);
QueryPerformanceCounter((LARGE_INTEGER*)&tail);
cout<<"sse_tile:"<<(double)(tail-head)/(double)freq<<endl;
return (double)(tail-head)/(double)freq;
}
int main(){
n=1024;
srand((int)time(0));
for(int i=0;i<maxN;i++)for(int j=0;j<maxN;j++)a[i][j]=rand()%100;
for(int i=0;i<maxN;i++)for(int j=0;j<maxN;j++)b[i][j]=rand()%100;
double mul_sum = 0;
double trans_mul_sum = 0;
double sse_mul_sum = 0;
double sse_tile_sum = 0;
/*for (int i = 0; i < 10; i++){
for (int j = 0; j < 10; j++){
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
QueryPerformanceCounter((LARGE_INTEGER*)&head);
c[i][j]=rand()%100;
cout<<c[i][j]<<endl;
Sleep(1000);
QueryPerformanceCounter((LARGE_INTEGER*)&tail);
cout<<"ms:"<<(double)(tail-head)/(double)freq<<endl;
}
}
cout<<mul(n,a,b,c)<<endl<<trans_mul(n,a,b,c)<<endl<<sse_mul(n,a,b,c)<<endl<<sse_tile(n,a,b,c);*/
for(int i=0;i<10;i++){
cout<<"第"<<i+1<<"次运行结果:"<<endl;
mul_sum += mul(n,a,b,c);
trans_mul_sum += trans_mul(n,a,b,c);
sse_mul_sum += sse_mul(n,a,b,c);
sse_tile_sum += sse_tile(n,a,b,c);
cout<<endl;
}
cout<<"平均耗时:"<<endl;
cout<<"串行算法耗时:"<<mul_sum/10<<"s"<<endl;
cout<<"Cache优化耗时:"<<trans_mul_sum/10<<"s"<<endl;
cout<<"SSE版本耗时:"<<sse_mul_sum/10<<"s"<<endl;
cout<<"分片策略耗时:"<<sse_tile_sum/10<<"s"<<endl;
system("pause");
return 0;
}
上一篇: DelphiXE中Frame的使用
下一篇: 算法分析与设计-作业8 矩阵链的乘法