
CUDA Exercise (3): Sorting on the GPU


Generating the data

For simplicity, the generated data contains no duplicate values (a permutation of 0..NUM_ELEMENT-1).

#define NUM_ELEMENT 4096
#define NUM_LISTS   32

template<class T> void c_swap(T &x, T &y){ T tmp = x; x = y; y = tmp; }

unsigned int srcData[NUM_ELEMENT];

void gen_and_shuffle(unsigned int * const srcData)
{
    for(int i = 0; i < NUM_ELEMENT; i++)    // fill with the distinct values 0..NUM_ELEMENT-1
        srcData[i] = i;
    for(int i = 0; i < NUM_ELEMENT; i++)
        c_swap(srcData[rand()%NUM_ELEMENT], srcData[i]);
    return;
}
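Note that the random-swap loop above does not produce a uniform permutation (swapping position i with rand()%NUM_ELEMENT biases some orderings). For this exercise that does not matter, but if a uniform shuffle were wanted, a minimal C++11 sketch could be used instead (gen_and_shuffle_uniform is just an illustrative name, not part of the original program):

#include <algorithm>
#include <numeric>
#include <random>

void gen_and_shuffle_uniform(unsigned int * const srcData)
{
    std::iota(srcData, srcData + NUM_ELEMENT, 0u);      // fill with 0, 1, 2, ...
    std::mt19937 rng(std::random_device{}());           // seeded random engine
    std::shuffle(srcData, srcData + NUM_ELEMENT, rng);  // unbiased Fisher-Yates shuffle
}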

void print_data(unsigned int * const srcData)
{
    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        printf("%4u", srcData[i]);
        if((i+1)%32 == 0)
            printf("\n");
    }
}

CPU sort

sort(srcData, srcData+NUM_ELEMENT);

GPU sort

Merge sort

Merge sort decomposes naturally, so it is well suited to parallel execution. However, in a conventional merge sort only a few threads are still active during the final merge passes, which is inefficient. The approach here is therefore to first run a radix (bucket) sort so that the data is ordered along each column, and then merge all the columns together in one pass instead of merging them two at a time.

__global__ void sortincuda(unsigned int * const data, gpu_calc_type type)
{
    const unsigned int tid = threadIdx.x;
    __shared__ unsigned int sort_tmp[NUM_ELEMENT], sort_tmp_1[NUM_ELEMENT];

    copy_data_in_gpu(data, sort_tmp, tid);  // the data is read and written repeatedly, so copy it into shared memory for speed

    radix_sort2(sort_tmp, sort_tmp_1, tid); // radix (bucket) sort

    switch(type)
    {
        case cpu_sort:  break;
        case gpu_merge_1: merge_1(sort_tmp, data, tid); break;  // single-thread merge
        case gpu_merge_all: merge_atomicMin(sort_tmp, data, tid); break;    // multi-thread merge (atomicMin)
        case gpu_merge_reduction: merge_two(sort_tmp, data, tid); break;    // pairwise reduction merge
        case gpu_merge_reduction_modified: merge_final(sort_tmp, data, tid); break; // tiled reduction merge
        default: break;
    }
}

The kernel above runs in a single thread block, with one thread per list (NUM_LISTS threads):

sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
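The launch is asynchronous and does not report errors by itself; a minimal error-checking sketch around the same launch (purely illustrative) would be:

sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
cudaError_t err = cudaGetLastError();          // catches launch/configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();             // waits for the kernel and catches runtime errors
if (err != cudaSuccess)
    printf("sortincuda failed: %s\n", cudaGetErrorString(err));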

Radix (bucket) sort

First, a per-column bucket sort makes the data ordered within each column. The bucket sort used here is a binary (0/1) radix sort: one pass per bit, splitting the values into those whose current bit is 0 and those whose bit is 1.

__device__ void radix_sort2(unsigned int * const sort_tmp, 
                            unsigned int * const sort_tmp_1,
                            const unsigned int tid) // radix (bucket) sort
{
    for(unsigned int bit_mask = 1; bit_mask > 0; bit_mask <<= 1)    // one pass per bit, 32 bits in total
    {
        unsigned int base_cnt_0 = 0;
        unsigned int base_cnt_1 = 0;

        for (unsigned int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS) 
        {
            if(sort_tmp[i+tid] & bit_mask)  // bit is 1: move to sort_tmp_1
            {
                sort_tmp_1[base_cnt_1+tid] = sort_tmp[i+tid];
                base_cnt_1 += NUM_LISTS;
            }
            else    // bit is 0: compact toward the front of sort_tmp
            {
                sort_tmp[base_cnt_0+tid] = sort_tmp[i+tid];
                base_cnt_0 += NUM_LISTS;
            }
        }

        for (unsigned int i = 0; i < base_cnt_1; i+=NUM_LISTS)  // append the contents of sort_tmp_1 after the zeros in sort_tmp
        {
            sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
        }
        __syncthreads();
    }
}
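To see what one pass of this 0/1 split does without the threading, the same logic for a single list on the host might look like the sketch below (radix_pass_host is a hypothetical helper, not part of the kernel). Running it once per bit, from the lowest to the highest bit, sorts the list because the partition is stable:

void radix_pass_host(unsigned int *list, unsigned int *tmp1,
                     unsigned int n, unsigned int bit_mask)
{
    unsigned int cnt0 = 0, cnt1 = 0;
    for (unsigned int i = 0; i < n; i++)
    {
        if (list[i] & bit_mask) tmp1[cnt1++] = list[i];   // bit is 1: park in tmp1
        else                    list[cnt0++] = list[i];   // bit is 0: compact in place
    }
    for (unsigned int i = 0; i < cnt1; i++)               // append the ones after the zeros
        list[cnt0 + i] = tmp1[i];
}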

Single-thread merge

After the radix sort the data is ordered within each column; the columns now have to be merged.
Single-thread merge means that a single GPU thread performs the whole merge. Since it exploits no parallelism, it is slow.

__device__ void merge_1(unsigned int * const srcData, 
                        unsigned int * const dstData,
                        const unsigned int tid) // single-thread merge
{
    __shared__ unsigned int list_index[NUM_LISTS];  // without __shared__ the array would be placed in registers and spill to slow off-chip memory once registers run out
    list_index[tid] = tid;    // initialize in parallel, one entry per thread
    __syncthreads();

    if(tid == 0)    // a single thread performs the merge
    {
        for(int i = 0; i < NUM_ELEMENT; i++)    // produce NUM_ELEMENT output elements
        {
            unsigned int min_val = 0xFFFFFFFF;
            unsigned int min_idx = 0;
            for(int j = 0; j < NUM_LISTS; j++)  // scan the head of every list
            {
                if(list_index[j] >= NUM_ELEMENT)    // skip lists that are already exhausted
                    continue;
                if(srcData[list_index[j]] < min_val)    
                {
                    min_val = srcData[list_index[j]];
                    min_idx = j;
                }
            }
            list_index[min_idx] += NUM_LISTS;   // advance the head of the winning list
            dstData[i] = min_val;
        }
    }
}

Multi-thread merge

The idea is simple: in every round, each thread submits the head of its list to atomicMin() on a shared variable to find the overall minimum.

__device__ void merge_atomicMin(unsigned int * const srcData, 
                          unsigned int * const dstData, 
                          const unsigned int tid)   // multi-thread merge (atomicMin)
{
    unsigned int self_index = tid;
    __shared__ unsigned int min_val;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        unsigned int self_data = 0xFFFFFFFF;

        if(self_index < NUM_ELEMENT)
        {
            self_data = srcData[self_index];    // current head of this thread's list
        }

        if(tid == 0)
        {
            min_val = 0xFFFFFFFF;   // reset the shared minimum before this round
        }

        __syncthreads();

        atomicMin(&min_val, self_data);

        __syncthreads();    // wait until every thread's atomicMin has completed

        if(min_val == self_data)    // the thread holding the minimum wins this round
        {
            self_index += NUM_LISTS;    // advance the winning list's head
            dstData[i] = min_val;
        }

        __syncthreads();    // keep min_val stable until everyone has finished comparing
    }
}

Reduction

Another way to find the minimum in each round is a pairwise (tree) reduction.

__device__ void merge_two(unsigned int * const srcData, 
                        unsigned int * dstData, 
                        const unsigned int tid) // pairwise reduction merge
{
    unsigned int self_index = tid;
    __shared__ unsigned int data[NUM_LISTS];
    __shared__ unsigned int tid_max;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        data[tid] = 0xFFFFFFFF;
        
        if(self_index < NUM_ELEMENT)
        {
            data[tid] = srcData[self_index];
        }

        if(tid == 0)
        {
            tid_max = NUM_LISTS >> 1;
        }

        __syncthreads();

        while(tid_max > 0)
        {
            if(tid < tid_max)
            {
                if(data[tid] > data[tid + tid_max])   // keep the smaller value in the front half
                {
                    data[tid] = data[tid + tid_max];
                }
            }
            __syncthreads();
            if(tid == 0)    // unclear why the book does not restrict this shared-variable update to a single thread; wouldn't that cause problems?
            {
                tid_max >>= 1;
            }
            __syncthreads();
        }

        if(self_index < NUM_ELEMENT && srcData[self_index] == data[0]) // this thread supplied the minimum
        {
            dstData[i] = data[0];
            self_index += NUM_LISTS;    // advance the winning list's head
        }

        __syncthreads();    // keep data[] stable until everyone has finished comparing
    }
}

Tiled reduction

Toward the later steps of a pairwise reduction there are again too few active threads. The tiled reduction therefore works in two stages: groups of X threads each reduce their values to one partial minimum, giving Y partial minima in total (X*Y = NUM_LISTS), and those Y values are then reduced to a single minimum. For example, with NUM_LISTS = 32 and REDUCTION_SIZE = 8, the 32 list heads are first reduced to 4 partial minima, which are then reduced to one.

#define REDUCTION_SIZE  8
#define REDUCTION_SHIFT 3

__device__ void merge_final(unsigned int * const srcData, 
                            unsigned int * const dstData, 
                            const unsigned int tid) // tiled reduction merge
{
    __shared__ unsigned int min_val_reduction[NUM_LISTS/REDUCTION_SIZE];
    unsigned int s_tid = tid >> REDUCTION_SHIFT;
    unsigned int self_index = tid;
    __shared__ unsigned int min_val;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        unsigned int self_data = 0xFFFFFFFF;

        if(self_index < NUM_ELEMENT)
        {
            self_data = srcData[self_index];
        }

        if(tid < NUM_LISTS/REDUCTION_SIZE)
        {
            min_val_reduction[tid] = 0xFFFFFFFF;
        }

        __syncthreads();

        atomicMin(&(min_val_reduction[s_tid]), self_data);  // first stage: reduce within each tile of REDUCTION_SIZE lists

        __syncthreads();

        if(tid == 0)
        {
            min_val = 0xFFFFFFFF;
        }

        __syncthreads();

        if(tid < NUM_LISTS/REDUCTION_SIZE)
        {
            atomicMin(&min_val, min_val_reduction[tid]);    // second stage: reduce the per-tile minima to a single value
        }

        __syncthreads();

        if(min_val == self_data)    // the thread holding the minimum wins this round
        {
            dstData[i] = min_val;
            self_index += NUM_LISTS;    // advance the winning list's head; min_val is reset by thread 0 next round
        }

    }
}

Timing results

Method                  NUM_LISTS=8   NUM_LISTS=16  NUM_LISTS=32  NUM_LISTS=64  NUM_LISTS=128
CPU (std::sort)         0.00050800    0.00050700    0.00050700    0.00050700    0.00050800
GPU single-thread       0.00135200    0.00127200    0.00169200    0.00285700    0.00637600
GPU multi-thread        0.00122300    0.00088400    0.00107800    0.00100100    0.00094100
GPU pairwise reduction  0.00340800    0.00378200    0.00431200    0.00493000    0.00578400
GPU tiled reduction     0.00186800    0.00148400    0.00130000    0.00124200    0.00124200

The columns correspond to different values of NUM_LISTS; times are in seconds.
First, it is not clear why the GPU is slower than the CPU here; a guess is that the algorithm library's sort is itself parallel-optimized.
Second, the single-thread merge is indeed slower than the multi-threaded versions. Among the reductions, the tiled reduction clearly helps, while the plain pairwise reduction is by far the slowest.
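As an aside, clock() here measures host time around the launch plus cudaDeviceSynchronize(). For timing the kernel alone, CUDA events are the usual tool; a sketch with the same launch (illustrative only, not the measurement used for the table above) is:

cudaEvent_t start_ev, stop_ev;
cudaEventCreate(&start_ev);
cudaEventCreate(&stop_ev);

cudaEventRecord(start_ev);
sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
cudaEventRecord(stop_ev);
cudaEventSynchronize(stop_ev);                  // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start_ev, stop_ev);   // elapsed GPU time in milliseconds
printf("type %d kernel time %.6f ms\n", type, ms);

cudaEventDestroy(start_ev);
cudaEventDestroy(stop_ev);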

Full source code

#include <iostream>
#include <cstdlib>
#include <time.h>
#include <algorithm>
using namespace std;

#define NUM_ELEMENT 4096
#define NUM_LISTS   128

typedef enum{
    cpu_sort = 0,
    gpu_merge_1,    // radix sort + single-thread merge
    gpu_merge_all,  // radix sort + multi-thread merge
    gpu_merge_reduction,    // radix sort + pairwise reduction merge
    gpu_merge_reduction_modified,   // radix sort + optimized tiled reduction merge
}gpu_calc_type;

template<class T> void c_swap(T &x, T &y){ T tmp = x; x = y; y = tmp; }

unsigned int srcData[NUM_ELEMENT];

void gen_and_shuffle(unsigned int * const srcData)
{
    for(int i = 0; i < NUM_ELEMENT; i++)    // fill with the distinct values 0..NUM_ELEMENT-1
        srcData[i] = i;
    for(int i = 0; i < NUM_ELEMENT; i++)
        c_swap(srcData[rand()%NUM_ELEMENT], srcData[i]);
    return;
}

void print_data(unsigned int * const srcData)
{
    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        printf("%4u", srcData[i]);
        if((i+1)%32 == 0)
            printf("\n");
    }
}

__device__ void copy_data_in_gpu(const unsigned int * const srcData,
                                    unsigned int * const dstData, 
                                    const unsigned int tid)
{
    for(int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS)
        dstData[i+tid] = srcData[i+tid];    // each thread copies one element per row (coalesced copy)
    __syncthreads();
}

__device__ void radix_sort2(unsigned int * const sort_tmp, 
                            unsigned int * const sort_tmp_1,
                            const unsigned int tid) // radix (bucket) sort
{
    for(unsigned int bit_mask = 1; bit_mask > 0; bit_mask <<= 1)    // one pass per bit, 32 bits in total
    {
        unsigned int base_cnt_0 = 0;
        unsigned int base_cnt_1 = 0;

        for (unsigned int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS) 
        {
            if(sort_tmp[i+tid] & bit_mask)  // bit is 1: move to sort_tmp_1
            {
                sort_tmp_1[base_cnt_1+tid] = sort_tmp[i+tid];
                base_cnt_1 += NUM_LISTS;
            }
            else    // bit is 0: compact toward the front of sort_tmp
            {
                sort_tmp[base_cnt_0+tid] = sort_tmp[i+tid];
                base_cnt_0 += NUM_LISTS;
            }
        }

        for (unsigned int i = 0; i < base_cnt_1; i+=NUM_LISTS)  // append the contents of sort_tmp_1 after the zeros in sort_tmp
        {
            sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
        }
        __syncthreads();
    }
}

__device__ void merge_1(unsigned int * const srcData, 
                        unsigned int * const dstData,
                        const unsigned int tid) // single-thread merge
{
    __shared__ unsigned int list_index[NUM_LISTS];  // without __shared__ the array would be placed in registers and spill to slow off-chip memory once registers run out
    list_index[tid] = tid;    // initialize in parallel, one entry per thread
    __syncthreads();

    if(tid == 0)    // a single thread performs the merge
    {
        for(int i = 0; i < NUM_ELEMENT; i++)    // produce NUM_ELEMENT output elements
        {
            unsigned int min_val = 0xFFFFFFFF;
            unsigned int min_idx = 0;
            for(int j = 0; j < NUM_LISTS; j++)  // scan the head of every list
            {
                if(list_index[j] >= NUM_ELEMENT)    // skip lists that are already exhausted
                    continue;
                if(srcData[list_index[j]] < min_val)    
                {
                    min_val = srcData[list_index[j]];
                    min_idx = j;
                }
            }
            list_index[min_idx] += NUM_LISTS;   // advance the head of the winning list
            dstData[i] = min_val;
        }
    }
}

__device__ void merge_atomicMin(unsigned int * const srcData, 
                          unsigned int * const dstData, 
                          const unsigned int tid)   // multi-thread merge (atomicMin)
{
    unsigned int self_index = tid;
    __shared__ unsigned int min_val;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        unsigned int self_data = 0xFFFFFFFF;

        if(self_index < NUM_ELEMENT)
        {
            self_data = srcData[self_index];    // current head of this thread's list
        }

        if(tid == 0)
        {
            min_val = 0xFFFFFFFF;   // reset the shared minimum before this round
        }

        __syncthreads();

        atomicMin(&min_val, self_data);

        __syncthreads();    // wait until every thread's atomicMin has completed

        if(min_val == self_data)    // the thread holding the minimum wins this round
        {
            self_index += NUM_LISTS;    // advance the winning list's head
            dstData[i] = min_val;
        }

        __syncthreads();    // keep min_val stable until everyone has finished comparing
    }
}

__device__ void merge_two(unsigned int * const srcData, 
                        unsigned int * dstData, 
                        const unsigned int tid) // pairwise reduction merge
{
    unsigned int self_index = tid;
    __shared__ unsigned int data[NUM_LISTS];
    __shared__ unsigned int tid_max;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        data[tid] = 0xFFFFFFFF;
        
        if(self_index < NUM_ELEMENT)
        {
            data[tid] = srcData[self_index];
        }

        if(tid == 0)
        {
            tid_max = NUM_LISTS >> 1;
        }

        __syncthreads();

        while(tid_max > 0)
        {
            if(tid < tid_max)
            {
                if(data[tid] > data[tid + tid_max])   // keep the smaller value in the front half
                {
                    data[tid] = data[tid + tid_max];
                }
            }
            __syncthreads();
            if(tid == 0)    // unclear why the book does not restrict this shared-variable update to a single thread; wouldn't that cause problems?
            {
                tid_max >>= 1;
            }
            __syncthreads();
        }

        if(self_index < NUM_ELEMENT && srcData[self_index] == data[0]) // this thread supplied the minimum
        {
            dstData[i] = data[0];
            self_index += NUM_LISTS;    // advance the winning list's head
        }

        __syncthreads();    // keep data[] stable until everyone has finished comparing
    }
}

#define REDUCTION_SIZE  8
#define REDUCTION_SHIFT 3

__device__ void merge_final(unsigned int * const srcData, 
                            unsigned int * const dstData, 
                            const unsigned int tid) // tiled reduction merge
{
    __shared__ unsigned int min_val_reduction[NUM_LISTS/REDUCTION_SIZE];
    unsigned int s_tid = tid >> REDUCTION_SHIFT;
    unsigned int self_index = tid;
    __shared__ unsigned int min_val;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        unsigned int self_data = 0xFFFFFFFF;

        if(self_index < NUM_ELEMENT)
        {
            self_data = srcData[self_index];
        }

        if(tid < NUM_LISTS/REDUCTION_SIZE)
        {
            min_val_reduction[tid] = 0xFFFFFFFF;
        }

        __syncthreads();

        atomicMin(&(min_val_reduction[s_tid]), self_data);  // first stage: reduce within each tile of REDUCTION_SIZE lists

        __syncthreads();

        if(tid == 0)
        {
            min_val = 0xFFFFFFFF;
        }

        __syncthreads();

        if(tid < NUM_LISTS/REDUCTION_SIZE)
        {
            atomicMin(&min_val, min_val_reduction[tid]);    // second stage: reduce the per-tile minima to a single value
        }

        __syncthreads();

        if(min_val == self_data)    // the thread holding the minimum wins this round
        {
            dstData[i] = min_val;
            self_index += NUM_LISTS;    // advance the winning list's head; min_val is reset by thread 0 next round
        }

    }
}

__global__ void sortincuda(unsigned int * const data, gpu_calc_type type)
{
    const unsigned int tid = threadIdx.x;
    __shared__ unsigned int sort_tmp[NUM_ELEMENT], sort_tmp_1[NUM_ELEMENT];

    copy_data_in_gpu(data, sort_tmp, tid);  // the data is read and written repeatedly, so copy it into shared memory for speed

    radix_sort2(sort_tmp, sort_tmp_1, tid); // radix (bucket) sort

    switch(type)
    {
        case cpu_sort:  break;
        case gpu_merge_1: merge_1(sort_tmp, data, tid); break;  // single-thread merge
        case gpu_merge_all: merge_atomicMin(sort_tmp, data, tid); break;    // multi-thread merge (atomicMin)
        case gpu_merge_reduction: merge_two(sort_tmp, data, tid); break;    // pairwise reduction merge
        case gpu_merge_reduction_modified: merge_final(sort_tmp, data, tid); break; // tiled reduction merge
        default: break;
    }
}

int main(void)
{
    gen_and_shuffle(srcData);
    //print_data(srcData);

    //printf("\n\n");

    unsigned int *gpu_srcData;

    cudaMalloc((void**)&gpu_srcData, sizeof(unsigned int)*NUM_ELEMENT);
    cudaMemcpy(gpu_srcData, srcData, sizeof(unsigned int)*NUM_ELEMENT, cudaMemcpyHostToDevice);

    clock_t start, end;

    for(gpu_calc_type type = cpu_sort; type <= gpu_merge_reduction_modified; type = (gpu_calc_type)(type+1))
    {
        if(type != cpu_sort)    // GPU sort
        {
            start = clock();
            sortincuda<<<1, NUM_LISTS>>>(gpu_srcData, type);
            cudaDeviceSynchronize();
            end = clock();
            printf("type %d use time %.8lf\n", type, (double)(end-start)/CLOCKS_PER_SEC);
        }
        else    // CPU sort
        {
            start = clock();
            sort(srcData, srcData+NUM_ELEMENT);
            end = clock();
            printf("type %d use time %.8lf\n", type, (double)(end-start)/CLOCKS_PER_SEC);
        }
    }

    cudaMemcpy(srcData, gpu_srcData, sizeof(unsigned int)*NUM_ELEMENT, cudaMemcpyDeviceToHost);
    //print_data(srcData);

    cudaFree(gpu_srcData);


    return 0;

}