欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

cuda编程实现计算两个向量之间的距离

程序员文章站 2022-03-31 23:02:25
...

代码解释:
实现多个向量与一个向量之间距离(欧氏距离的平方)的计算。由 kernel 函数开设一定数量的线程并行执行,每个线程负责计算若干个向量到目标向量的距离。代码详细解释如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <iomanip>
#include <stdio.h>
const int N=33*1024;//修改向量的个数
const int D=256;
//const int MAX=10;
#include<sys/time.h>
#include<math.h>
#include<time.h>
#include<iostream>
using namespace std;
/*
 * Kernel: squared Euclidean distance from each row vector of `a` to the
 * single reference vector `b`.
 *
 *   a  device pointer to N row vectors of D floats each, stored row after row
 *      (row r occupies a[r*D] .. a[r*D + D-1])
 *   b  device pointer to D floats (the reference vector)
 *   c  device pointer to N floats; c[r] receives sum_k (a[r*D+k] - b[k])^2
 *      (no square root is taken)
 *   N  number of row vectors; values <= 0 make the kernel a no-op
 *
 * Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so every
 * row is handled exactly once even when N exceeds the thread count.
 * No shared memory is used.
 *
 * Precondition: `c` must not overlap `a` or `b` (the pointers are declared
 * __restrict__).
 *
 * NOTE(review): neighbouring threads read addresses D floats apart, so loads
 * from `a` are not contiguous across a warp; changing that would require a
 * different data layout and is left as is.
 */
__global__ void MatrixMultiply(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c, int N){
	if (N <= 0) {
		return;
	}
	// Index arithmetic is done in size_t: the 32-bit products tx*D and
	// blockDim.x*gridDim.x used previously could wrap for large N.
	const size_t total = static_cast<size_t>(N);
	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
	for (size_t row = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	     row < total;
	     row += stride) {
		const float *vec = a + row * D;
		float sum = 0.0f;
		for (int k = 0; k < D; ++k) {
			const float diff = vec[k] - b[k];
			sum += diff * diff;
		}
		c[row] = sum;
	}
}

// Host-side wrapper (defined after main). Per its definition it copies
// `a` (size * D floats) and `b` (D floats) to the device, launches
// MatrixMultiply, copies `size` floats back into `c`, prints them, and
// returns the last CUDA status it recorded.
cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t size);

/*
 * Program entry point.
 *
 * Reads N vectors of D floats from ../data/features_256.txt, uses the vector
 * at index 1 as the reference vector, asks matrixMultiplyWithCuda() for the
 * squared distance of every vector to that reference, and finally prints the
 * total wall-clock time in milliseconds.
 *
 * Returns 0 on success and 1 when the input cannot be read or a CUDA call
 * fails.
 */
int main()
{
	// Wall-clock timing covers file input, the GPU work and the cleanup.
	struct timeval start,end;
	gettimeofday(&start,NULL);

	int exitCode = 0;

	// Value-initialised so a failed read can never leave indeterminate data.
	float *a = new float[N*D]();
	float *b = new float[D]();
	float *c = new float[N]();

	// Read N*D whitespace-separated floats, row after row.
	ifstream fin("../data/features_256.txt");
	if (!fin) {
		fprintf(stderr, "cannot open ../data/features_256.txt\n");
		exitCode = 1;
	}
	for (int i = 0; exitCode == 0 && i < N*D; ++i) {
		if (!(fin >> a[i])) {
			fprintf(stderr, "input too short or malformed: expected %d values, failed at value %d\n", N*D, i);
			exitCode = 1;
		}
	}
	fin.close();

	if (exitCode == 0) {
		// The reference vector is row 1 of the input. This copy used to be
		// the accidental body of two leftover debug loops (their own body
		// had been commented out), so it ran N*D times and N blank lines
		// were written to stdout; it now runs exactly once.
		for (int j = 0; j < D; ++j) {
			b[j] = a[1*D + j];
		}

		cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "matrixMultiplyWithCuda failed: %s\n", cudaGetErrorString(cudaStatus));
			exitCode = 1;
		}

		// cudaDeviceReset() is the supported replacement for the deprecated
		// cudaThreadExit().
		cudaStatus = cudaDeviceReset();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(cudaStatus));
			exitCode = 1;
		}
	}

	// Release the host buffers.
	delete[] a;
	delete[] b;
	delete[] c;

	gettimeofday(&end,NULL);
	// Microseconds are accumulated in 64 bits before converting to ms.
	const long long elapsedUs = 1000000LL*(end.tv_sec-start.tv_sec) + (end.tv_usec-start.tv_usec);
	const int timeuse = static_cast<int>(elapsedUs/1000);
	printf("total time is %d ms\n",timeuse);
	return exitCode;
}
/*
 * Computes, on the GPU, the squared Euclidean distance between each of the N
 * row vectors in `a` and the reference vector `b`, stores the results in `c`
 * and prints them.
 *
 *   a  host buffer with N * D floats (input rows, stored row after row)
 *   b  host buffer with D floats (reference vector)
 *   c  host buffer with room for N floats (output)
 *   N  number of row vectors; must fit in an int because the kernel takes
 *      its count as int
 *
 * Returns cudaSuccess when every step succeeded, cudaErrorInvalidValue for a
 * null buffer or an N that does not fit in an int, otherwise the status of
 * the first CUDA call that failed. Device buffers are always released before
 * returning. `c` is only printed when the whole pipeline succeeded.
 */
cudaError_t matrixMultiplyWithCuda(float *a,float *b,float *c,size_t N){
	if (N == 0) {
		return cudaSuccess;	// nothing to compute
	}
	// Round-trip through int to detect counts the kernel cannot represent.
	const int count = static_cast<int>(N);
	if (!a || !b || !c || count < 0 || static_cast<size_t>(count) != N) {
		printf("Something wrong\n");
		return cudaErrorInvalidValue;
	}

	const size_t matrixBytes = N * D * sizeof(float);
	const size_t vectorBytes = D * sizeof(float);
	const size_t resultBytes = N * sizeof(float);

	float *dev_a = 0;
	float *dev_b = 0;
	float *dev_c = 0;

	// Each step runs only if all previous ones succeeded, so cudaStatus
	// always holds the first failure instead of being overwritten.
	cudaError_t cudaStatus = cudaMalloc((void**)&dev_a, matrixBytes);
	if (cudaStatus == cudaSuccess) {
		cudaStatus = cudaMalloc((void**)&dev_b, vectorBytes);
	}
	if (cudaStatus == cudaSuccess) {
		cudaStatus = cudaMalloc((void**)&dev_c, resultBytes);
	}
	if (cudaStatus == cudaSuccess) {
		cudaStatus = cudaMemcpy(dev_a, a, matrixBytes, cudaMemcpyHostToDevice);
	}
	if (cudaStatus == cudaSuccess) {
		cudaStatus = cudaMemcpy(dev_b, b, vectorBytes, cudaMemcpyHostToDevice);
	}

	if (cudaStatus != cudaSuccess) {
		printf("Something wrong\n");
		fprintf(stderr, "device allocation or upload failed: %s\n", cudaGetErrorString(cudaStatus));
	} else {
		// 256 blocks of 512 threads; the kernel loops over the rows, so this
		// fixed configuration covers any N.
		MatrixMultiply<<<256, 512>>>(dev_a, dev_b, dev_c, count);

		// A launch does not return a status: query it explicitly, then
		// wait for the kernel so that execution errors surface here.
		cudaStatus = cudaGetLastError();
		if (cudaStatus == cudaSuccess) {
			cudaStatus = cudaDeviceSynchronize();
		}
		if (cudaStatus == cudaSuccess) {
			cudaStatus = cudaMemcpy(c, dev_c, resultBytes, cudaMemcpyDeviceToHost);
		}

		if (cudaStatus != cudaSuccess) {
			printf( "Calculate wrong\n");
			fprintf(stderr, "kernel execution or download failed: %s\n", cudaGetErrorString(cudaStatus));
		} else {
			for (int i = 0; i < count; ++i) {
				printf("the %d ge distance is %f\n",i,c[i]);
			}
		}
	}

	// cudaFree accepts null pointers, so this is safe on every path.
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);
	return cudaStatus;
}