CUDA Practice (3): Sorting on the GPU
Generating the data
For simplicity, the generated data contain no duplicates: they are a shuffled permutation of 0..NUM_ELEMENT-1.
#define NUM_ELEMENT 4096
#define NUM_LISTS 32
template<class T> void c_swap(T &x, T &y){ T tmp = x; x = y; y = tmp; }
unsigned int srcData[NUM_ELEMENT];
void gen_and_shuffle(unsigned int * const srcData)
{
for(int i = 0; i < NUM_ELEMENT; i++) //generate distinct values 0..NUM_ELEMENT-1
srcData[i] = i;
for(int i = 0; i < NUM_ELEMENT; i++) //shuffle by swapping each element with a random position
c_swap(srcData[rand()%NUM_ELEMENT], srcData[i]);
return;
}
void print_data(unsigned int * const srcData)
{
for(int i = 0; i < NUM_ELEMENT; i++)
{
printf("%4u", srcData[i]);
if((i+1)%32 == 0)
printf("\n");
}
}
CPU sort
The CPU baseline is simply std::sort from <algorithm>:
sort(srcData, srcData+NUM_ELEMENT);
GPU sort
Merge sort
Merge sort decomposes naturally, so it is well suited to parallel execution; the problem is that in a conventional merge sort only a few threads are still busy during the final merge passes, which is inefficient. The approach here is therefore to first run a per-list radix sort (a 0/1 bucket sort) so that the data are ordered along each column, and then merge all the columns together in one pass instead of merging them pairwise. Throughout, "list" tid means the interleaved column whose k-th element sits at index k*NUM_LISTS + tid.
__global__ void sortincuda(unsigned int * const data, gpu_calc_type type)
{
const unsigned int tid = threadIdx.x;
__shared__ unsigned int sort_tmp[NUM_ELEMENT], sort_tmp_1[NUM_ELEMENT];
copy_data_in_gpu(data, sort_tmp, tid); //the data is read and written repeatedly, so copy it into shared memory to speed things up
radix_sort2(sort_tmp, sort_tmp_1, tid); //per-list radix (0/1 bucket) sort
switch(type)
{
case cpu_sort: break;
case gpu_merge_1: merge_1(sort_tmp, data, tid); break; //single-thread merge
case gpu_merge_all: merge_atomicMin(sort_tmp, data, tid); break; //multi-thread merge
case gpu_merge_reduction: merge_two(sort_tmp, data, tid); break; //pairwise reduction merge
case gpu_merge_reduction_modified: merge_final(sort_tmp, data, tid); break; //block-wise reduction merge
default: break;
}
}
The kernel above runs in a single thread block, with the number of threads equal to the number of lists (columns):
sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
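Each thread handles one "list", i.e. the interleaved column {data[tid], data[tid+NUM_LISTS], data[tid+2*NUM_LISTS], ...}. As a hedged debugging aid (a hypothetical helper, not part of the original code, and only meaningful if the kernel is modified to copy the per-list sorted data back to global memory), the column layout can be checked on the host like this:
// Hypothetical host helper: returns true if every interleaved list/column is
// already sorted, i.e. data[i-NUM_LISTS] <= data[i] within each column t.
bool lists_are_sorted(const unsigned int *data)
{
    for (unsigned int t = 0; t < NUM_LISTS; t++)
        for (unsigned int i = t + NUM_LISTS; i < NUM_ELEMENT; i += NUM_LISTS)
            if (data[i - NUM_LISTS] > data[i])
                return false; // list t is out of order at this position
    return true;
}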
Radix sort
First, each column is sorted with a radix sort so that the data within every column are ordered. The "buckets" here are just 0 and 1: each pass partitions the list by one bit, starting from the least significant bit.
__device__ void radix_sort2(unsigned int * const sort_tmp,
unsigned int * const sort_tmp_1,
const unsigned int tid) //per-list radix (0/1 bucket) sort
{
for(unsigned int bit_mask = 1; bit_mask > 0; bit_mask <<= 1) //one pass per bit, 32 bits in total
{
unsigned int base_cnt_0 = 0;
unsigned int base_cnt_1 = 0;
for (unsigned int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS)
{
if(sort_tmp[i+tid] & bit_mask) //the bit is 1: move the element into sort_tmp_1
{
sort_tmp_1[base_cnt_1+tid] = sort_tmp[i+tid];
base_cnt_1 += NUM_LISTS;
}
else //the bit is 0: compact the element towards the front of sort_tmp
{
sort_tmp[base_cnt_0+tid] = sort_tmp[i+tid];
base_cnt_0 += NUM_LISTS;
}
}
for (unsigned int i = 0; i < base_cnt_1; i+=NUM_LISTS) //append the 1-bucket from sort_tmp_1 behind the 0-bucket
{
sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
}
__syncthreads();
}
}
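To see why iterating over all 32 bits leaves each list sorted, here is a hedged single-threaded host equivalent of the same pass (function and variable names are illustrative only, not part of the original code); each pass is a stable split on one bit, so after the pass for the most significant bit the whole list is ordered:
#include <vector>
// Host-side sketch of the same 0/1 radix pass that radix_sort2 runs per list,
// written for a single std::vector, for intuition only.
void radix_sort_host(std::vector<unsigned int> &v)
{
    std::vector<unsigned int> zeros, ones;
    for (unsigned int bit_mask = 1; bit_mask > 0; bit_mask <<= 1) // 32 passes, LSB first
    {
        zeros.clear();
        ones.clear();
        for (unsigned int x : v)
            (x & bit_mask ? ones : zeros).push_back(x); // stable split on the current bit
        v.assign(zeros.begin(), zeros.end());            // 0-bucket first ...
        v.insert(v.end(), ones.begin(), ones.end());     // ... then the 1-bucket
    }
}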
Single-thread merge
After the radix sort, the data are ordered along every column; the columns now have to be merged.
"Single-thread merge" means a single GPU thread performs the whole merge. It exploits no parallelism, so it is slow: one thread does roughly NUM_ELEMENT x NUM_LISTS comparisons.
__device__ void merge_1(unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid) //single-thread merge
{
__shared__ unsigned int list_index[NUM_LISTS]; //without __shared__ this would live in registers, spilling to (slow) local/global memory if registers run out
list_index[tid] = tid; //initialize the head pointers with all threads
__syncthreads();
if(tid == 0) //a single thread performs the merge
{
for(int i = 0; i < NUM_ELEMENT; i++) //one output element per iteration, NUM_ELEMENT iterations
{
unsigned int min_val = 0xFFFFFFFF;
unsigned int min_idx = 0;
for(int j = 0; j < NUM_LISTS; j++) //scan the head of every list
{
if(list_index[j] >= NUM_ELEMENT) //this list is exhausted, skip it
continue;
if(srcData[list_index[j]] < min_val)
{
min_val = srcData[list_index[j]];
min_idx = j;
}
}
list_index[min_idx] += NUM_LISTS; //advance the head pointer of the winning list
dstData[i] = min_val;
}
}
}
Multi-thread merge
The idea is simple: every thread calls atomicMin() on a shared variable to find the minimum among the current list heads.
__device__ void merge_atomicMin(unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid) //multi-thread merge
{
unsigned int self_index = tid;
for(int i = 0; i < NUM_ELEMENT; i++)
{
__shared__ unsigned int min_val; //note: not explicitly initialized before the first pass; later passes rely on the previous winner resetting it to 0xFFFFFFFF
unsigned int self_data = 0xFFFFFFFF;
if(self_index < NUM_ELEMENT)
{
self_data = srcData[self_index];
}
__syncthreads();
atomicMin(&min_val, self_data); //every thread proposes its current list head
if(min_val == self_data) //only the thread holding the minimum writes the output and advances its list
{
self_index += NUM_LISTS;
dstData[i] = min_val;
min_val = 0xFFFFFFFF;
}
}
}
Reduction
Another way to find the minimum of the list heads: a pairwise (tree) reduction.
__device__ void merge_two(unsigned int * const srcData,
unsigned int * dstData,
const unsigned int tid) //pairwise reduction merge
{
unsigned int self_index = tid;
__shared__ unsigned int data[NUM_LISTS];
__shared__ unsigned int tid_max;
for(int i = 0; i < NUM_ELEMENT; i++)
{
data[tid] = 0xFFFFFFFF;
if(self_index < NUM_ELEMENT)
{
data[tid] = srcData[self_index];
}
if(tid == 0)
{
tid_max = NUM_LISTS >> 1;
}
__syncthreads();
while(tid_max > 0)
{
if(tid < tid_max)
{
if(data[tid] > data[tid + tid_max]) //keep the smaller value in the lower half
{
data[tid] = data[tid + tid_max];
}
}
__syncthreads();
if(tid == 0) //unclear why the book does not restrict this shared-variable update to a single thread -- wouldn't that cause problems?
{
tid_max >>= 1;
}
__syncthreads();
}
if(srcData[self_index] == data[0]) //the thread whose head equals the minimum writes it out and advances (note: this reads past the end of srcData once a list is exhausted)
{
dstData[i] = data[0];
self_index += NUM_LISTS;
}
}
}
Block-wise reduction
Towards the end of a pairwise reduction, again only a few threads are still active. Block-wise reduction addresses this: each group of REDUCTION_SIZE threads first reduces to one partial minimum, giving NUM_LISTS/REDUCTION_SIZE values, which are then reduced to a single value (REDUCTION_SIZE * (NUM_LISTS/REDUCTION_SIZE) = NUM_LISTS). For example, with NUM_LISTS = 32 and REDUCTION_SIZE = 8, the 32 threads first produce 4 partial minima, which are then reduced to one.
#define REDUCTION_SIZE 8
#define REDUCTION_SHIFT 3
__device__ void merge_final(unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid) //block-wise reduction merge
{
__shared__ unsigned int min_val_reduction[NUM_LISTS/REDUCTION_SIZE];
unsigned int s_tid = tid >> REDUCTION_SHIFT;
unsigned int self_index = tid;
__shared__ unsigned int min_val;
for(int i = 0; i < NUM_ELEMENT; i++)
{
unsigned int self_data = 0xFFFFFFFF;
if(self_index < NUM_ELEMENT)
{
self_data = srcData[self_index];
}
if(tid < NUM_LISTS/REDUCTION_SIZE)
{
min_val_reduction[tid] = 0xFFFFFFFF;
}
__syncthreads();
atomicMin(&(min_val_reduction[s_tid]), self_data); //each group of REDUCTION_SIZE threads reduces to one partial minimum
__syncthreads();
if(tid == 0)
{
min_val = 0xFFFFFFFF;
}
__syncthreads();
if(tid < NUM_LISTS/REDUCTION_SIZE)
{
atomicMin(&min_val, min_val_reduction[tid]); //reduce the partial minima to the final minimum
}
__syncthreads();
if(min_val == self_data)
{
dstData[i] = min_val;
self_index += NUM_LISTS;
min_val = 0xFFFFFFFF;
}
}
}
Timing results
| Method | NUM_LISTS = 8 | 16 | 32 | 64 | 128 |
|---|---|---|---|---|---|
| CPU (std::sort) | 0.00050800 | 0.00050700 | 0.00050700 | 0.00050700 | 0.00050800 |
| GPU, single-thread merge | 0.00135200 | 0.00127200 | 0.00169200 | 0.00285700 | 0.00637600 |
| GPU, multi-thread merge (atomicMin) | 0.00122300 | 0.00088400 | 0.00107800 | 0.00100100 | 0.00094100 |
| GPU, pairwise reduction | 0.00340800 | 0.00378200 | 0.00431200 | 0.00493000 | 0.00578400 |
| GPU, block-wise reduction | 0.00186800 | 0.00148400 | 0.00130000 | 0.00124200 | 0.00124200 |
The columns are different values of NUM_LISTS; times are in seconds, measured with clock() as in the source below.
First, it is not obvious why the GPU versions are slower than the CPU; my guess is that the <algorithm> sort itself uses parallel optimizations (it probably also does not help that the kernel runs as a single block, so only one SM of the GPU does any work).
Second, the single-thread merge is indeed slower than the multi-thread versions; among the reductions, the block-wise variant clearly helps, while the plain pairwise reduction is by far the slowest.
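The timings above wrap clock() around the kernel launch plus cudaDeviceSynchronize(), i.e. host wall-clock time including launch overhead. A hedged alternative (not used in the original code) would be to time the kernel itself with CUDA events, for example:
// Sketch only: time one kernel launch with CUDA events (all calls are standard
// CUDA runtime API; error checking omitted for brevity).
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);
cudaEventRecord(ev_start);
sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
cudaEventRecord(ev_stop);
cudaEventSynchronize(ev_stop);
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop); // elapsed time in milliseconds
printf("type %d kernel time %.6f ms\n", type, elapsed_ms);
cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);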
Full source code
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <algorithm>
using namespace std;
#define NUM_ELEMENT 4096
#define NUM_LISTS 128
typedef enum{
cpu_sort = 0,
gpu_merge_1, //radix sort + single-thread merge
gpu_merge_all, //radix sort + multi-thread merge
gpu_merge_reduction, //radix sort + pairwise reduction merge
gpu_merge_reduction_modified, //radix sort + improved block-wise reduction merge
}gpu_calc_type;
template<class T> void c_swap(T &x, T &y){ T tmp = x; x = y; y = tmp; }
unsigned int srcData[NUM_ELEMENT];
void gen_and_shuffle(unsigned int * const srcData)
{
for(int i = 0; i < NUM_ELEMENT; i++) //generate distinct values 0..NUM_ELEMENT-1
srcData[i] = i;
for(int i = 0; i < NUM_ELEMENT; i++) //shuffle by swapping each element with a random position
c_swap(srcData[rand()%NUM_ELEMENT], srcData[i]);
return;
}
void print_data(unsigned int * const srcData)
{
for(int i = 0; i < NUM_ELEMENT; i++)
{
printf("%4u", srcData[i]);
if((i+1)%32 == 0)
printf("\n");
}
}
__device__ void copy_data_in_gpu(const unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid)
{
for(int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS)
dstData[i+tid] = srcData[i+tid]; //copy row by row, one element per thread
__syncthreads();
}
__device__ void radix_sort2(unsigned int * const sort_tmp,
unsigned int * const sort_tmp_1,
const unsigned int tid) //per-list radix (0/1 bucket) sort
{
for(unsigned int bit_mask = 1; bit_mask > 0; bit_mask <<= 1) //one pass per bit, 32 bits in total
{
unsigned int base_cnt_0 = 0;
unsigned int base_cnt_1 = 0;
for (unsigned int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS)
{
if(sort_tmp[i+tid] & bit_mask) //the bit is 1: move the element into sort_tmp_1
{
sort_tmp_1[base_cnt_1+tid] = sort_tmp[i+tid];
base_cnt_1 += NUM_LISTS;
}
else //the bit is 0: compact the element towards the front of sort_tmp
{
sort_tmp[base_cnt_0+tid] = sort_tmp[i+tid];
base_cnt_0 += NUM_LISTS;
}
}
for (unsigned int i = 0; i < base_cnt_1; i+=NUM_LISTS) //append the 1-bucket from sort_tmp_1 behind the 0-bucket
{
sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
}
__syncthreads();
}
}
__device__ void merge_1(unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid) //single-thread merge
{
__shared__ unsigned int list_index[NUM_LISTS]; //without __shared__ this would live in registers, spilling to (slow) local/global memory if registers run out
list_index[tid] = tid; //initialize the head pointers with all threads
__syncthreads();
if(tid == 0) //a single thread performs the merge
{
for(int i = 0; i < NUM_ELEMENT; i++) //one output element per iteration, NUM_ELEMENT iterations
{
unsigned int min_val = 0xFFFFFFFF;
unsigned int min_idx = 0;
for(int j = 0; j < NUM_LISTS; j++) //scan the head of every list
{
if(list_index[j] >= NUM_ELEMENT) //this list is exhausted, skip it
continue;
if(srcData[list_index[j]] < min_val)
{
min_val = srcData[list_index[j]];
min_idx = j;
}
}
list_index[min_idx] += NUM_LISTS; //advance the head pointer of the winning list
dstData[i] = min_val;
}
}
}
__device__ void merge_atomicMin(unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid) //multi-thread merge
{
unsigned int self_index = tid;
for(int i = 0; i < NUM_ELEMENT; i++)
{
__shared__ unsigned int min_val;
unsigned int self_data = 0xFFFFFFFF;
if(self_index < NUM_ELEMENT)
{
self_data = srcData[self_index];
}
__syncthreads();
atomicMin(&min_val, self_data);
if(min_val == self_data)
{
self_index += NUM_LISTS;
dstData[i] = min_val;
min_val = 0xFFFFFFFF;
}
}
}
__device__ void merge_two(unsigned int * const srcData,
unsigned int * dstData,
const unsigned int tid) //pairwise reduction merge
{
unsigned int self_index = tid;
__shared__ unsigned int data[NUM_LISTS];
__shared__ unsigned int tid_max;
for(int i = 0; i < NUM_ELEMENT; i++)
{
data[tid] = 0xFFFFFFFF;
if(self_index < NUM_ELEMENT)
{
data[tid] = srcData[self_index];
}
if(tid == 0)
{
tid_max = NUM_LISTS >> 1;
}
__syncthreads();
while(tid_max > 0)
{
if(tid < tid_max)
{
if(data[tid] > data[tid + tid_max]) //keep the smaller value in the lower half
{
data[tid] = data[tid + tid_max];
}
}
__syncthreads();
if(tid == 0) //unclear why the book does not restrict this shared-variable update to a single thread -- wouldn't that cause problems?
{
tid_max >>= 1;
}
__syncthreads();
}
if(srcData[self_index] == data[0])
{
dstData[i] = data[0];
self_index += NUM_LISTS;
}
}
}
#define REDUCTION_SIZE 8
#define REDUCTION_SHIFT 3
__device__ void merge_final(unsigned int * const srcData,
unsigned int * const dstData,
const unsigned int tid) //block-wise reduction merge
{
__shared__ unsigned int min_val_reduction[NUM_LISTS/REDUCTION_SIZE];
unsigned int s_tid = tid >> REDUCTION_SHIFT;
unsigned int self_index = tid;
__shared__ unsigned int min_val;
for(int i = 0; i < NUM_ELEMENT; i++)
{
unsigned int self_data = 0xFFFFFFFF;
if(self_index < NUM_ELEMENT)
{
self_data = srcData[self_index];
}
if(tid < NUM_LISTS/REDUCTION_SIZE)
{
min_val_reduction[tid] = 0xFFFFFFFF;
}
__syncthreads();
atomicMin(&(min_val_reduction[s_tid]), self_data); //each group of REDUCTION_SIZE threads reduces to one partial minimum
__syncthreads();
if(tid == 0)
{
min_val = 0xFFFFFFFF;
}
__syncthreads();
if(tid < NUM_LISTS/REDUCTION_SIZE)
{
atomicMin(&min_val, min_val_reduction[tid]); //reduce the partial minima to the final minimum
}
__syncthreads();
if(min_val == self_data)
{
dstData[i] = min_val;
self_index += NUM_LISTS;
min_val = 0xFFFFFFFF;
}
}
}
__global__ void sortincuda(unsigned int * const data, gpu_calc_type type)
{
const unsigned int tid = threadIdx.x;
__shared__ unsigned int sort_tmp[NUM_ELEMENT], sort_tmp_1[NUM_ELEMENT];
copy_data_in_gpu(data, sort_tmp, tid); //the data is read and written repeatedly, so copy it into shared memory to speed things up
radix_sort2(sort_tmp, sort_tmp_1, tid); //per-list radix (0/1 bucket) sort
switch(type)
{
case cpu_sort: break;
case gpu_merge_1: merge_1(sort_tmp, data, tid); break; //single-thread merge
case gpu_merge_all: merge_atomicMin(sort_tmp, data, tid); break; //multi-thread merge
case gpu_merge_reduction: merge_two(sort_tmp, data, tid); break; //pairwise reduction merge
case gpu_merge_reduction_modified: merge_final(sort_tmp, data, tid); break; //block-wise reduction merge
default: break;
}
}
int main(void)
{
gen_and_shuffle(srcData);
//print_data(srcData);
//printf("\n\n");
unsigned int *gpu_srcData;
cudaMalloc((void**)&gpu_srcData, sizeof(unsigned int)*NUM_ELEMENT);
cudaMemcpy(gpu_srcData, srcData, sizeof(unsigned int)*NUM_ELEMENT, cudaMemcpyHostToDevice);
clock_t start, end;
for(gpu_calc_type type = cpu_sort; type <= gpu_merge_reduction_modified; type = (gpu_calc_type)(type+1))
{
if(type != cpu_sort) //GPU sort
{
start = clock();
sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
cudaDeviceSynchronize();
end = clock();
printf("type %d use time %.8lf\n", type, (double)(end-start)/CLOCKS_PER_SEC);
}
else //CPU sort
{
start = clock();
sort(srcData, srcData+NUM_ELEMENT);
end = clock();
printf("type %d use time %.8lf\n", type, (double)(end-start)/CLOCKS_PER_SEC);
}
}
cudaMemcpy(srcData, gpu_srcData, sizeof(unsigned int)*NUM_ELEMENT, cudaMemcpyDeviceToHost);
//print_data(srcData);
cudaFree(gpu_srcData);
return 0;
}
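Assuming the listing is saved as, say, sort.cu (a file name chosen here purely for illustration), it can be compiled and run with:
nvcc -o sort sort.cu
./sort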