欢迎您访问程序员文章站，本站旨在为大家分享程序员计算机编程知识！
您现在的位置是: 首页

要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)

程序员文章站 2022-06-11 18:08:32
...

前文链接:

要点初见:OpenCV3中ORB特征提取算法的实现与分析

http://blog.csdn.net/m0_37857300/article/details/79037988

OpenCV实用程序:“OpenCV相机”——获取、保存选定时刻的摄像头图像

http://blog.csdn.net/m0_37857300/article/details/79038894


前文大概介绍了CPU中的ORB特征提取算法的实现方法。其中提到了虽然ORB是专门为CPU设计的特征提取算法,但在OpenCV中的cudafeatures2d里也存在着用CUDA加速的ORB算法库(OpenCV编译时需交叉编译CUDA才可用)。网上关于OpenCV3中GPU加速的ORB算法的实例特别少,博主根据官方的reference介绍,参考CPU版的ORB算法,摸索出了一套CUDA ORB算法的程序并编译运行成功。

本文将给出CPU、GPU版精简化的ORB程序(不输出图像,模板、待测图片已给出,CPU版为单线程),并对二者在相同参数、相同匹配算法的环境下进行运行测试并比较二者各运行1000次所花的时间。运行的机器配置是 i7-4200U CPU、NVIDIA 840M 显卡,运行的环境是Ubuntu16.04,用的OpenCV库版本为3.3。

先上结论:对于分辨率不特别大的图片间的ORB特征匹配,CPU运算得比GPU版的快(由于图像上传到GPU消耗了时间);但对于分辨率较大的图片,或者GPU比CPU好的机器(比如Nvidia Jetson系列),GPU版的ORB算法比CPU版的程序更高效。

用的模板:

要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)

分辨率为170*209,命名为model.jpg;

用的目标检测图片是

要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)

分辨率为1200*668,命名为ORB_test.jpg;

OpenCV3.3中CUDA ORB特征提取算法代码(在项目include目录中需添加/usr/local/cuda/include目录,否则会显示找不到cuda_runtime):

#include <iostream>
#include <signal.h>

#include <opencv2/opencv.hpp>
#include "opencv2/core/core.hpp"
#include "opencv2/cudabgsegm.hpp"
#include "opencv2/core/cuda.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/core/cuda_stream_accessor.hpp"
#include "opencv2/cudafeatures2d.hpp"

#include "opencv2/cudaimgproc.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudafilters.hpp"
#include "opencv2/cudawarping.hpp"

#include "opencv2/features2d.hpp"
#include <vector>

using namespace cv;
using namespace cuda;
using namespace std;


// Flag polled by the benchmark loop in main(); set from the SIGINT handler.
// volatile sig_atomic_t is the only object type the C/C++ standards
// guarantee can be safely written from an asynchronous signal handler
// (a plain bool may also be cached/optimized away in the polling loop).
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl+C) handler: request a clean shutdown of the main loop.
// NOTE(review): iostream output is not async-signal-safe; it is kept only to
// preserve the original user-visible message — a production handler should
// do nothing but set the flag.
void sigIntHandler(int signum)
{
    (void)signum; // unused; required by the signal-handler signature
    stop = 1;
    std::cout << "Honestly, you are out!" << std::endl;
}


int main()
{
	Mat img_1 = imread("model.jpg");
	Mat img_2 = imread("ORB_test.jpg");

	if (!img_1.data || !img_2.data)
	{
		cout << "error reading images " << endl;
		return -1;
	}

	int times = 0;
	double startime = cv::getTickCount();
	signal(SIGINT, sigIntHandler);

	int64 start, end;
	double time;

	vector<Point2f> recognized;
	vector<Point2f> scene;

	for(times = 0;!stop; times++)
	{
		start = getTickCount();

		recognized.resize(500);
		scene.resize(500);

		cuda::GpuMat d_img1, d_img2;
		cuda::GpuMat d_srcL, d_srcR;

		d_img1.upload(img_1); d_img2.upload(img_2);

		Mat img_matches, des_L, des_R;

		cuda::cvtColor(d_img1, d_srcL, COLOR_BGR2GRAY);
		cuda::cvtColor(d_img2, d_srcR, COLOR_BGR2GRAY);

		Ptr<cuda::ORB> d_orb = cuda::ORB::create(500, 1.2f, 6, 31, 0, 2, 0, 31, 20,true);

		cuda::GpuMat d_keypointsL, d_keypointsR;
		cuda::GpuMat d_descriptorsL, d_descriptorsR, d_descriptorsL_32F, d_descriptorsR_32F;

		vector<KeyPoint> keyPoints_1, keyPoints_2;

		Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(NORM_L2);

		std::vector<DMatch> matches;
		std::vector<DMatch> good_matches;

		d_orb -> detectAndComputeAsync(d_srcL, cuda::GpuMat(), d_keypointsL, d_descriptorsL);
		d_orb -> convert(d_keypointsL, keyPoints_1);
		d_descriptorsL.convertTo(d_descriptorsL_32F, CV_32F);

		d_orb -> detectAndComputeAsync(d_srcR, cuda::GpuMat(), d_keypointsR, d_descriptorsR);
		d_orb -> convert(d_keypointsR, keyPoints_2);
		d_descriptorsR.convertTo(d_descriptorsR_32F, CV_32F);

		d_matcher -> match(d_descriptorsL_32F, d_descriptorsR_32F, matches);

		int sz = matches.size();
		double max_dist = 0; double min_dist = 100;

		for (int i = 0; i < sz; i++)
		{
			double dist = matches[i].distance;
			if (dist < min_dist) min_dist = dist;
			if (dist > max_dist) max_dist = dist;
		}

		cout << "\n-- Max dist : " << max_dist << endl;
		cout << "\n-- Min dist : " << min_dist << endl;

		for (int i = 0; i < sz; i++)
		{
			if (matches[i].distance < 0.6*max_dist)
			{
				good_matches.push_back(matches[i]);
			}
		}

		for (size_t i = 0; i < good_matches.size(); ++i)
		{
			scene.push_back(keyPoints_2[ good_matches[i].trainIdx ].pt);
		}

		for(unsigned int j = 0; j < scene.size(); j++)
			cv::circle(img_2, scene[j], 2, cv::Scalar(0, 255, 0), 2);

		//imshow("img_2", img_2);
		//waitKey(1);

		end = getTickCount();
		time = (double)(end - start) * 1000 / getTickFrequency();
		cout << "Total time : " << time << " ms"<<endl;

		if (times == 1000)
		{
			double maxvalue =  (cv::getTickCount() - startime)/cv::getTickFrequency();
			cout <<"zhenshu " << times/maxvalue <<"  zhen"<<endl;
		}
		cout <<"The number of frame is :  " <<times<<endl;
	}

	return 0;
}

CUDA ORB运行1000帧时测得的平均帧率:

要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)

可考虑把GPU版的程序用两个流(CUDA中的Stream)交替、隐藏在CPU之后,可大大加速。

OpenCV3.3中CPU版 ORB特征提取算法代码(前文中程序的简化版,去除了一些影响效率的操作):


#include <iostream>
#include <signal.h>
#include <vector>

#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;


// Flag polled by the benchmark loop in main(); set from the SIGINT handler.
// volatile sig_atomic_t is the only object type the C/C++ standards
// guarantee can be safely written from an asynchronous signal handler
// (a plain bool may also be cached/optimized away in the polling loop).
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl+C) handler: request a clean shutdown of the main loop.
// NOTE(review): iostream output is not async-signal-safe; it is kept only to
// preserve the original user-visible message — a production handler should
// do nothing but set the flag.
void sigIntHandler(int signum)
{
	(void)signum; // unused; required by the signal-handler signature
	stop = 1;
	std::cout << "Honestly, you are out!" << std::endl;
}

int main()
{
	Mat img_1 = imread("model.jpg");
	Mat img_2 = imread("ORB_test.jpg");

	if (!img_1.data || !img_2.data)
	{
		cout << "error reading images " << endl;
		return -1;
	}

	int times = 0;
	double startime = cv::getTickCount();
	signal(SIGINT, sigIntHandler);

	int64 start, end;
	double time;

	vector<Point2f> recognized;
	vector<Point2f> scene;

	for(times = 0;!stop; times++)
	{
		start = getTickCount();

		recognized.resize(500);
		scene.resize(500);

		Mat d_srcL, d_srcR;

		Mat img_matches, des_L, des_R;

		cvtColor(img_1, d_srcL, COLOR_BGR2GRAY);
		cvtColor(img_2, d_srcR, COLOR_BGR2GRAY);

		Ptr<ORB> d_orb = ORB::create(500,1.2f,6,31,0,2);

		Mat d_descriptorsL, d_descriptorsR, d_descriptorsL_32F, d_descriptorsR_32F;

		vector<KeyPoint> keyPoints_1, keyPoints_2;

		Ptr<DescriptorMatcher> d_matcher = DescriptorMatcher::create(NORM_L2);

		std::vector<DMatch> matches;
		std::vector<DMatch> good_matches;

		d_orb -> detectAndCompute(d_srcL, Mat(), keyPoints_1, d_descriptorsL);

		d_orb -> detectAndCompute(d_srcR, Mat(), keyPoints_2, d_descriptorsR);

		d_matcher -> match(d_descriptorsL, d_descriptorsR, matches);

		int sz = matches.size();
		double max_dist = 0; double min_dist = 100;

		for (int i = 0; i < sz; i++)
		{
			double dist = matches[i].distance;
			if (dist < min_dist) min_dist = dist;
			if (dist > max_dist) max_dist = dist;
		}

		cout << "\n-- Max dist : " << max_dist << endl;
		cout << "\n-- Min dist : " << min_dist << endl;

		for (int i = 0; i < sz; i++)
		{
			if (matches[i].distance < 0.6*max_dist)
			{
				good_matches.push_back(matches[i]);
			}
		}

		for (size_t i = 0; i < good_matches.size(); ++i)
		{
			scene.push_back(keyPoints_2[ good_matches[i].trainIdx ].pt);
		}

		for(unsigned int j = 0; j < scene.size(); j++)
			cv::circle(img_2, scene[j], 2, cv::Scalar(0, 255, 0), 2);

		//imshow("img_2", img_2);
		//waitKey(1);

		end = getTickCount();
		time = (double)(end - start) * 1000 / getTickFrequency();
		cout << "Total time : " << time << " ms"<<endl;

		if (times == 1000)
		{
			double maxvalue =  (cv::getTickCount() - startime)/cv::getTickFrequency();
			cout <<"zhenshu " << times/maxvalue <<"  zhen"<<endl;
		}
		cout <<"The number of frame is :  " <<times<<endl;
	}

	return 0;
}

CPU ORB运行1000帧时测得的平均帧率:

要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)

若用多线程改写CPU版的ORB算法,运行所花的时间会更少。

相比二图,可见CPU的ORB算法在当前分辨率的笔记本上运行的速度还是比GPU快的,但博主曾在NVIDIA Jetson TK1上运行过这两个算法的摄像头输入版本,GPU版的1000帧平均20帧每秒,CPU版的却只有平均11帧每秒,可说是各有千秋。

若想将该CPU或GPU的算法移植到摄像头上,仅需把摄像头读入的图像写入 img_2(例如 cap >> img_2;)即可。欢迎交流讨论!


**关于CUDA ORB的一个BUG的解决方法:

若更换模板、目标图片后CUDA ORB程序编译成功,但运行时程序在detectAndComputeAsync处出现

OpenCV Error: Assertion failed (0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows) in GpuMat
的ERROR提示时,需将程序中cuda::ORB::create()括号里的金字塔层数 nlevels(本文中为6)减小,或将边缘阈值 edgeThreshold(本文中为31)减小即可解决(但这两个参数要尽可能大,否则会影响keyPoint的选取)。

这个问题是博主读了OpenCV的源码/opencv-3.3.1/modules/cudafeatures2d/src/orb.cpp后解决的,在CUDA的ORB源码中void ORB_Impl::detectAndComputeAsync()里涉及到一个函数:

void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream)
其中

float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
导致了这个问题。getScale中是scaleFactor的(level-firstLevel)次方的运算。参考cv::cuda::ORB::create()的定义:

Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int nfeatures,
                                         float scaleFactor,
                                         int nlevels,
                                         int edgeThreshold,
                                         int firstLevel,
                                         int WTA_K,
                                         int scoreType,
                                         int patchSize,
                                         int fastThreshold,
                                         bool blurForDescriptor)
可见此处Rect inner的选取方式有可能导致:当detectAndComputeAsync()的输入图像分辨率小,而create设置的nlevels较大、edgeThreshold较大时,inner的区域超出图像本身的大小,从而导致ROI超出图像范围的问题。

而为什么同样的图片在CPU ORB算法中就不会发生这个问题呢?

博主读了CPU ORB的代码,发现其中ROI的选取方式完全不同:

void ORB_Impl::detectAndCompute( InputArray _image, InputArray _mask,
                                 std::vector<KeyPoint>& keypoints,
                                 OutputArray _descriptors, bool useProvidedKeypoints )
中,有如下ROI选取代码:

for( level = 0; level < nLevels; level++ )
    {
        float scale = getScale(level, firstLevel, scaleFactor);
        layerScale[level] = scale;
        Size sz(cvRound(image.cols/scale), cvRound(image.rows/scale));
        Size wholeSize(sz.width + border*2, sz.height + border*2);
        if( level_ofs.x + wholeSize.width > bufSize.width )
        {
            level_ofs = Point(0, level_ofs.y + level_dy);
            level_dy = wholeSize.height;
        }

        Rect linfo(level_ofs.x + border, level_ofs.y + border, sz.width, sz.height);
        layerInfo[level] = linfo;
        layerOfs[level] = linfo.y*bufSize.width + linfo.x;
        level_ofs.x += wholeSize.width;
    }
此处Rect的选取方式不会产生ROI超出范围。应该是CPU版的ORB算法经常更新优化,所以不容易出现ROI的这个问题。