OpenCl加速矩阵运算
程序员文章站
2022-07-12 21:31:38
...
OpenCL运用并行的方法加速矩阵运算,在业界得到广泛运用,博主也试了一试,挺好玩的。
注意:1、OpenCL处理的数据量越大,加速效果越明显
2、本文程序的测试环境:NVIDIA 730显卡、Windows 7,使用CUDA 7.5自带的OpenCL
3、OpenCL SDK位于C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5下
4、头文件位于v7.5\include下的CL文件夹
5、库文件为v7.5\lib下(x64或Win32子文件夹中)的OpenCL.lib
下列程序经博主测试准确无误!
核函数(test2.cl)文件如下:
// Adds 1 to every element of `a` and stores the sum in `result`.
//
// Each work-item handles the single element selected by its global id in
// dimension 0. The kernel performs no bounds check, so the global work size
// must not exceed the number of elements in either buffer.
__kernel void adder(__global const float* a, __global float* result)
{
    // get_global_id returns size_t; keeping that type avoids narrowing the index to int.
    const size_t idx = get_global_id(0);
    result[idx] = a[idx] + 1.0f;
}
主文件main.cpp
//OpenCl加速向量运算
//作者:samylee
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cstdlib>
#include <CL/cl.h>
// Loads an OpenCL C source file and builds it as a program in `context`.
//
// Parameters:
//   context  - OpenCL context the program is created in.
//   filename - path of the kernel source file; it is read in binary mode.
//
// Returns the built program, or 0 when the file cannot be opened or read, or
// when program creation or the build fails. On success the caller owns the
// handle and must release it with clReleaseProgram.
cl_program load_program(cl_context context, const char* filename)
{
    std::ifstream in(filename, std::ios_base::binary);
    if (!in.good()) {
        return 0;
    }

    // Determine the file length. tellg() reports failure as -1, which must not
    // be converted into a (huge) unsigned buffer size.
    in.seekg(0, std::ios_base::end);
    const std::streamoff end_pos = in.tellg();
    if (end_pos < 0) {
        return 0;
    }
    in.seekg(0, std::ios_base::beg);
    const size_t length = static_cast<size_t>(end_pos);

    // Read the whole source, keeping one extra byte for the terminating NUL that
    // clCreateProgramWithSource needs when no explicit lengths are passed.
    std::vector<char> data(length + 1);
    if (length > 0 && !in.read(&data[0], static_cast<std::streamsize>(length))) {
        return 0;
    }
    data[length] = 0;

    // Create and build the program.
    const char* source = &data[0];
    cl_program program = clCreateProgramWithSource(context, 1, &source, 0, 0);
    if (program == 0) {
        return 0;
    }
    if (clBuildProgram(program, 0, 0, 0, 0, 0) != CL_SUCCESS) {
        // The program object exists even though the build failed; release it so
        // the failure path does not leak the handle.
        clReleaseProgram(program);
        return 0;
    }
    return program;
}
int main()
{
cl_int err;
//调用两次clGetPlatformIDs函数,第一次获取可用的平台数量,第二次获取一个可用的平台。
cl_uint num;
err = clGetPlatformIDs(0, 0, &num);
if (err != CL_SUCCESS) {
std::cerr << "Unable to get platforms\n";
return 0;
}
std::vector<cl_platform_id> platforms(num);
err = clGetPlatformIDs(num, &platforms[0], &num);
if (err != CL_SUCCESS) {
std::cerr << "Unable to get platform ID\n";
return 0;
}
//上下文context可能会管理多个设备device。
cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[0]), 0 };
cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, NULL);
if (context == 0) {
std::cerr << "Can't create OpenCL context\n";
return 0;
}
size_t cb;
clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
std::vector<cl_device_id> devices(cb / sizeof(cl_device_id));
clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, &devices[0], 0);
//调用两次clGetDeviceIDs函数,第一次获取可用的设备数量,第二次获取一个可用的设备。
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 0, NULL, &cb);
std::string devname;
devname.resize(cb);
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, cb, &devname[0], 0);
//输出设备名称
std::cout << "Device: " << devname.c_str() << "\n";
//Create a command queue(调用clCreateCommandQueue函数)
//一个设备device对应一个command queue。
//上下文conetxt将命令发送到设备对应的command queue,设备就可以执行命令队列里的命令
cl_command_queue queue = clCreateCommandQueue(context, devices[0], 0, 0);
if (queue == 0) {
std::cerr << "Can't create command queue\n";
clReleaseContext(context);
return 0;
}
//Create device buffers(调用clCreateBuffer函数)
const int DATA_SIZE = 3;
std::vector<float> a(DATA_SIZE), res(DATA_SIZE);
for (int i = 0; i < DATA_SIZE; i++) {
a[i] = i;
}
cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &a[0], NULL);
cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
if (cl_a == 0 || cl_res == 0) {
std::cerr << "Can't create OpenCL buffer\n";
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
//Load kernel function
cl_program program = load_program(context, "test2.cl");
if (program == 0) {
std::cerr << "Can't load or build program\n";
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
//Create kernel function
cl_kernel adder = clCreateKernel(program, "adder", 0);
if (adder == 0) {
std::cerr << "Can't load kernel\n";
clReleaseProgram(program);
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
//设定函数参数
clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_res);
//执行函数
size_t work_size = DATA_SIZE;
err = clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, 0);
if (err == CL_SUCCESS) {
err = clEnqueueReadBuffer(queue, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
}
//验证是否正确
if (err == CL_SUCCESS) {
std::cout << res[1] << std::endl;
}
else {
std::cerr << "Can't run kernel or read back data\n";
}
//release all source
clReleaseKernel(adder);
clReleaseProgram(program);
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(queue);
clReleaseContext(context);
system("pause");
return 0;
}
效果如下:
任何问题请加唯一QQ2258205918(名称samylee)!
上一篇: OpenCL入门概念
下一篇: OpenCl_CPU加速矩阵运算