欢迎您访问程序员文章站,本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

OpenCL加速矩阵运算

程序员文章站 2022-07-12 21:31:38
...

OpenCL 利用并行计算的方法加速矩阵运算,在业界得到广泛应用,博主也试了一试,挺好玩的。

注意:1、OpenCL 处理的数据量越大,加速效果越明显

            2、测试环境:NVIDIA 730 显卡,使用 CUDA 7.5 自带的 OpenCL 文件,Windows 7

            3、OpenCL SDK 位于 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 下

            4、头文件位于 v7.5 下 include 目录中的 CL 文件夹

            5、库文件为 v7.5 下 lib 目录中的 OpenCL.lib


下列程序经博主测试准确无误!

核函数(test2.cl)文件如下:

// Element-wise kernel: each work-item reads one element of `a`, adds 1 to it,
// and stores the sum at the same index in `result`.
// There is no bounds check, so the global work size must not exceed the
// number of elements in either buffer.
__kernel void adder(__global const float* a, __global float* result)
{
	// The work-item's index along dimension 0 selects the element to process.
	const int gid = get_global_id(0);
	const float incremented = a[gid] + 1;
	result[gid] = incremented;
}

主文件main.cpp

//OpenCl加速向量运算
//作者:samylee
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cstdlib>
#include <CL/cl.h>

// Reads an OpenCL C source file and builds it into a program for `context`.
//
// Parameters:
//   context  - OpenCL context the program is created in.
//   filename - path of the kernel source file to load.
//
// Returns the built cl_program on success. Returns 0 if the file cannot be
// opened or fully read, if the program object cannot be created, or if the
// build fails. On success the caller owns the handle and must release it with
// clReleaseProgram.
cl_program load_program(cl_context context, const char* filename)
{
	std::ifstream in(filename, std::ios_base::binary);
	if (!in.good()) {
		return 0;
	}

	// Determine the file length; tellg() yields -1 on failure, which must not
	// be converted to an (enormous) unsigned size.
	in.seekg(0, std::ios_base::end);
	const std::streamoff end_pos = in.tellg();
	if (end_pos < 0) {
		return 0;
	}
	in.seekg(0, std::ios_base::beg);
	const size_t length = static_cast<size_t>(end_pos);

	// Read the whole source, reserving one extra byte for the terminator.
	std::vector<char> data(length + 1);
	in.read(&data[0], static_cast<std::streamsize>(length));
	if (in.gcount() != static_cast<std::streamsize>(length)) {
		return 0;
	}
	// The source is passed without explicit lengths, so it must be
	// null-terminated.
	data[length] = 0;

	// Create the program object from the single source string.
	const char* source = &data[0];
	cl_program program = clCreateProgramWithSource(context, 1, &source, 0, 0);
	if (program == 0) {
		return 0;
	}

	// Build for all devices associated with the program; on failure release
	// the program object so it is not leaked.
	if (clBuildProgram(program, 0, 0, 0, 0, 0) != CL_SUCCESS) {
		clReleaseProgram(program);
		return 0;
	}
	return program;
}

int main()

{
	cl_int err;

	//调用两次clGetPlatformIDs函数,第一次获取可用的平台数量,第二次获取一个可用的平台。
	cl_uint num;
	err = clGetPlatformIDs(0, 0, &num);
	if (err != CL_SUCCESS) {
		std::cerr << "Unable to get platforms\n";
		return 0;
	}

	std::vector<cl_platform_id> platforms(num);
	err = clGetPlatformIDs(num, &platforms[0], &num);
	if (err != CL_SUCCESS) {
		std::cerr << "Unable to get platform ID\n";
		return 0;
	}

	//上下文context可能会管理多个设备device。
	cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[0]), 0 };
	cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, NULL);
	if (context == 0) {
		std::cerr << "Can't create OpenCL context\n";
		return 0;
	}

	size_t cb;
	clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
	std::vector<cl_device_id> devices(cb / sizeof(cl_device_id));
	clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, &devices[0], 0);

	//调用两次clGetDeviceIDs函数,第一次获取可用的设备数量,第二次获取一个可用的设备。
	clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 0, NULL, &cb);
	std::string devname;
	devname.resize(cb);
	clGetDeviceInfo(devices[0], CL_DEVICE_NAME, cb, &devname[0], 0);

	//输出设备名称
	std::cout << "Device: " << devname.c_str() << "\n";

	//Create a command queue(调用clCreateCommandQueue函数)
	//一个设备device对应一个command queue。
	//上下文conetxt将命令发送到设备对应的command queue,设备就可以执行命令队列里的命令
	cl_command_queue queue = clCreateCommandQueue(context, devices[0], 0, 0);
	if (queue == 0) {
		std::cerr << "Can't create command queue\n";
		clReleaseContext(context);
		return 0;
	}

	//Create device buffers(调用clCreateBuffer函数)
	const int DATA_SIZE = 3;
	std::vector<float> a(DATA_SIZE), res(DATA_SIZE);
	for (int i = 0; i < DATA_SIZE; i++) {
		a[i] = i;
	}

	cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &a[0], NULL);
	cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);

	if (cl_a == 0 || cl_res == 0) {
		std::cerr << "Can't create OpenCL buffer\n";
		clReleaseMemObject(cl_a);
		clReleaseMemObject(cl_res);
		clReleaseCommandQueue(queue);
		clReleaseContext(context);
		return 0;
	}

	//Load kernel function
	cl_program program = load_program(context, "test2.cl");
	if (program == 0) {
		std::cerr << "Can't load or build program\n";
		clReleaseMemObject(cl_a);
		clReleaseMemObject(cl_res);
		clReleaseCommandQueue(queue);
		clReleaseContext(context);
		return 0;
	}

	//Create kernel function
	cl_kernel adder = clCreateKernel(program, "adder", 0);
	if (adder == 0) {
		std::cerr << "Can't load kernel\n";
		clReleaseProgram(program);
		clReleaseMemObject(cl_a);
		clReleaseMemObject(cl_res);
		clReleaseCommandQueue(queue);
		clReleaseContext(context);
		return 0;
	}

	//设定函数参数
	clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
	clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_res);

	//执行函数
	size_t work_size = DATA_SIZE;
	err = clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, 0);
	if (err == CL_SUCCESS) {
		err = clEnqueueReadBuffer(queue, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
	}

	//验证是否正确
	if (err == CL_SUCCESS) {
		std::cout << res[1] << std::endl;
	}
	else {
		std::cerr << "Can't run kernel or read back data\n";
	}

	//release all source
	clReleaseKernel(adder);
	clReleaseProgram(program);
	clReleaseMemObject(cl_a);
	clReleaseMemObject(cl_res);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);
	system("pause");
	return 0;
}


效果如下:

OpenCl加速矩阵运算


任何问题请加唯一QQ2258205918(名称samylee)!


相关标签: OpenCl