cuda编程实现计算两个向量之间的距离
程序员文章站
2022-03-31 23:02:25
...
代码解释:
实现多个向量与一个向量之间距离(平方欧氏距离)的计算:由kernel函数启动一定数量的线程并行完成。代码详细解释如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <iomanip>
#include <limits.h>
#include <stdio.h>
// Number of vectors loaded from the data file and processed on the GPU.
const int N=33*1024;// change this value to adjust the number of vectors
// Number of float components in each vector.
const int D=256;
//const int MAX=10;
#include<sys/time.h>
#include<math.h>
#include<time.h>
#include<iostream>
using namespace std;
/**
 * Computes, for every vector stored in `a`, the squared Euclidean distance to
 * the single query vector `b` (no square root is taken).
 *
 * a: device pointer to N vectors of D floats each, stored back to back
 *    (vector i occupies a[i*D] .. a[i*D + D - 1]).
 * b: device pointer to the D-float query vector.
 * c: device pointer to N floats; c[i] receives sum over k of (a[i*D+k] - b[k])^2.
 * N: number of vectors in `a`; a value <= 0 makes the kernel a no-op.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads, so any launch
 * configuration covers all N vectors. No shared memory is used.
 */
__global__ void MatrixMultiply(const float *a, const float *b, float *c, int N){
    // Index in size_t: with an int index, row * D overflows once row >= 2^31 / D.
    const size_t count = N > 0 ? static_cast<size_t>(N) : 0;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t row = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         row < count;
         row += stride) {
        const float *vec = a + row * D;
        float sum = 0.0f;
        for (int k = 0; k < D; ++k) {
            const float diff = vec[k] - b[k];
            sum += diff * diff;
        }
        c[row] = sum;
    }
}
// Host-side wrapper (defined after main): copies the `size` vectors in `a` and the
// query vector `b` to the GPU, launches the MatrixMultiply kernel, copies the
// per-vector results back into `c`, and returns a CUDA status code.
cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t size);
/**
 * Program entry point.
 *
 * Loads N vectors of D floats from ../data/features_256.txt, uses the vector
 * at index 1 as the query, asks matrixMultiplyWithCuda to compute the distance
 * from every vector to that query, and prints the total wall-clock time.
 *
 * Returns 0 on success and 1 if the data file cannot be read completely or a
 * CUDA call reports an error.
 */
int main()
{
    // Wall-clock timer covering file loading, the GPU computation and cleanup.
    struct timeval start, end;
    gettimeofday(&start, NULL);

    const size_t total = static_cast<size_t>(N) * D;
    float *a = new float[total];  // N vectors of D floats, stored back to back
    float *b = new float[D];      // query vector
    float *c = new float[N];      // one result per vector
    int exitCode = 0;

    // Read all N*D values; stop at the first failed extraction so the GPU is
    // never fed uninitialised data.
    ifstream fin("../data/features_256.txt");
    bool loaded = fin.is_open();
    for (size_t i = 0; loaded && i < total; ++i) {
        if (!(fin >> a[i])) {
            loaded = false;
        }
    }
    fin.close();

    if (!loaded) {
        fprintf(stderr, "failed to read %d x %d values from ../data/features_256.txt\n", N, D);
        exitCode = 1;
    } else {
        // The query is the vector at index 1. It is copied exactly once; it must
        // not sit inside a per-element loop, which would repeat it N*D times.
        for (int j = 0; j < D; ++j) {
            b[j] = a[1 * D + j];
        }

        cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "distance computation failed: %s\n", cudaGetErrorString(cudaStatus));
            exitCode = 1;
        }

        // cudaDeviceReset replaces the deprecated cudaThreadExit.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(cudaStatus));
            exitCode = 1;
        }
    }

    // Release the host buffers.
    delete[] a;
    delete[] b;
    delete[] c;

    gettimeofday(&end, NULL);
    // Accumulate microseconds in 64 bits; an int overflows after about 35 minutes.
    const long long elapsedUs =
        1000000LL * (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec);
    const int timeuse = static_cast<int>(elapsedUs / 1000);
    printf("total time is %d ms\n", timeuse);
    return exitCode;
}
/**
 * Computes on the GPU the distance from each of the N vectors in `a` to the
 * query vector `b`, stores the results in `c`, and prints them to stdout.
 *
 * a: host pointer to N * D floats (N vectors of D floats each).
 * b: host pointer to the D-float query vector.
 * c: host pointer to N floats that receives one result per vector.
 * N: number of vectors; 0 is accepted and does nothing.
 *
 * Returns cudaSuccess on success, cudaErrorInvalidValue if N does not fit the
 * kernel's int count parameter, or the status of the first CUDA call that
 * failed. Device buffers are released on every path.
 */
cudaError_t matrixMultiplyWithCuda(float *a,float *b,float *c,size_t N){
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus = cudaSuccess;
    const size_t queryBytes = D * sizeof(float);
    const size_t resultBytes = N * sizeof(float);
    const size_t inputBytes = N * queryBytes;

    if (N == 0) {
        return cudaSuccess;
    }
    // The kernel takes the vector count as int; refuse values that would be truncated.
    if (N > static_cast<size_t>(INT_MAX)) {
        return cudaErrorInvalidValue;
    }

    // Allocate and upload. The chain short-circuits, so cudaStatus holds the
    // status of the first call that failed instead of being overwritten.
    if ((cudaStatus = cudaMalloc((void**)&dev_a, inputBytes)) != cudaSuccess ||
        (cudaStatus = cudaMalloc((void**)&dev_b, queryBytes)) != cudaSuccess ||
        (cudaStatus = cudaMalloc((void**)&dev_c, resultBytes)) != cudaSuccess ||
        (cudaStatus = cudaMemcpy(dev_a, a, inputBytes, cudaMemcpyHostToDevice)) != cudaSuccess ||
        (cudaStatus = cudaMemcpy(dev_b, b, queryBytes, cudaMemcpyHostToDevice)) != cudaSuccess) {
        printf("Something wrong\n");
        goto Error;
    }

    // Kernel invocation: 256 blocks of 512 threads each.
    MatrixMultiply<<<256, 512>>>(dev_a, dev_b, dev_c, static_cast<int>(N));
    // A launch returns no status: query launch errors explicitly, then wait
    // for the kernel so that execution errors surface here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaDeviceSynchronize();
    }
    if (cudaStatus != cudaSuccess) {
        printf( "Calculate wrong\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(c, dev_c, resultBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        goto Error;
    }
    // N <= INT_MAX was checked above, so the int loop index cannot overflow.
    for (int i = 0; i < static_cast<int>(N); ++i) {
        printf("the %d ge distance is %f\n",i,c[i]);
    }

Error:
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaStatus));
    }
    // cudaFree accepts null pointers, so this is safe after a partial setup.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return cudaStatus;
}