要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)
前文链接:
要点初见:OpenCV3中ORB特征提取算法的实现与分析
http://blog.csdn.net/m0_37857300/article/details/79037988
OpenCV实用程序:“OpenCV相机”——获取、保存选定时刻的摄像头图像
http://blog.csdn.net/m0_37857300/article/details/79038894
前文大概介绍了CPU中的ORB特征提取算法的实现方法。其中提到了虽然ORB是专门为CPU设计的特征提取算法,但在OpenCV中的cudafeatures2d里也存在着用CUDA加速的ORB算法库(OpenCV编译时需交叉编译CUDA才可用)。网上关于OpenCV3中GPU加速的ORB算法的实例特别少,博主根据官方的reference介绍,参考CPU版的ORB算法,摸索出了一套CUDA ORB算法的程序并编译运行成功。
本文将给出CPU、GPU版精简化的ORB程序(不输出图像,模板、待测图片已给出,CPU版为单线程),并对二者在相同参数、相同匹配算法的环境下进行运行测试并比较二者各运行1000次所花的时间。运行的机器配置是 i7-4200U CPU、NVIDIA 840M 显卡,运行的环境是Ubuntu16.04,用的OpenCV库版本为3.3。
先上结论:对于分辨率不特别大的图片间的ORB特征匹配,CPU运算得比GPU版的快(由于图像上传到GPU消耗了时间);但对于分辨率较大的图片,或者GPU比CPU好的机器(比如Nvidia Jetson系列),GPU版的ORB算法比CPU版的程序更高效。
用的模板:
分辨率为170*209,命名为model.jpg;
用的目标检测图片是
分辨率为1200*668,命名为ORB_test.jpg;
OpenCV3.3中CUDA ORB特征提取算法代码(在项目include目录中需添加/usr/local/cuda/include目录,否则会显示找不到cuda_runtime):
#include <iostream>
#include <signal.h>
#include <opencv2/opencv.hpp>
#include "opencv2/core/core.hpp"
#include "opencv2/cudabgsegm.hpp"
#include "opencv2/core/cuda.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/core/cuda_stream_accessor.hpp"
#include "opencv2/cudafeatures2d.hpp"
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudafilters.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/features2d.hpp"
#include <vector>
using namespace cv;
using namespace cuda;
using namespace std;
// Flag raised by the SIGINT handler and polled by main()'s benchmark loop.
// volatile sig_atomic_t is the only object type the C++ standard guarantees
// may be safely written from a signal handler; a plain (non-volatile) bool
// could legally be cached by the optimizer and the loop might never see the
// change.
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl-C) handler.
// The original version also printed a message from here, but iostream is not
// async-signal-safe (undefined behavior if the signal interrupts another
// stream operation), so the handler now only raises the flag and lets the
// main loop terminate cleanly.
void sigIntHandler(int signum)
{
    stop = 1;
}
// CUDA ORB benchmark: repeatedly detect/describe/match ORB features between a
// small template (model.jpg) and a larger scene (ORB_test.jpg) on the GPU,
// printing per-frame timing; after 1000 frames the average frame rate is
// printed.  Ctrl-C (SIGINT) stops the loop.
int main()
{
    // Template (model) and scene image; both must exist in the working directory.
    Mat img_1 = imread("model.jpg");
    Mat img_2 = imread("ORB_test.jpg");
    if (!img_1.data || !img_2.data)
    {
        cout << "error reading images " << endl;
        return -1;
    }

    int times = 0;
    double startime = cv::getTickCount();
    signal(SIGINT, sigIntHandler);  // Ctrl-C ends the benchmark loop

    int64 start, end;
    double time;
    vector<Point2f> scene;  // matched keypoint locations in the scene image

    // Loop-invariant objects: building the ORB extractor and the matcher is
    // expensive and identical on every iteration, so construct them once
    // instead of once per frame.  create() arguments: nfeatures=500,
    // scaleFactor=1.2, nlevels=6, edgeThreshold=31, firstLevel=0, WTA_K=2,
    // scoreType=0, patchSize=31, fastThreshold=20, blurForDescriptor=true.
    Ptr<cuda::ORB> d_orb = cuda::ORB::create(500, 1.2f, 6, 31, 0, 2, 0, 31, 20, true);
    // NOTE(review): ORB descriptors are binary and would normally be matched
    // with NORM_HAMMING.  The original pipeline converts them to CV_32F and
    // matches with NORM_L2; that choice is kept here so the published timing
    // numbers stay comparable — confirm before reusing in production.
    Ptr<cv::cuda::DescriptorMatcher> d_matcher =
        cv::cuda::DescriptorMatcher::createBFMatcher(NORM_L2);

    for (times = 0; !stop; times++)
    {
        start = getTickCount();

        // BUG FIX: the original did scene.resize(500), which filled the
        // vector with 500 default-constructed (0,0) points that were later
        // drawn as circles at the image origin.  clear() keeps the capacity
        // but drops stale contents.
        scene.clear();

        // The host->device upload deliberately stays inside the timed region:
        // the article's conclusion is precisely that this transfer dominates
        // for small images.
        cuda::GpuMat d_img1, d_img2;
        cuda::GpuMat d_srcL, d_srcR;
        d_img1.upload(img_1);
        d_img2.upload(img_2);
        cuda::cvtColor(d_img1, d_srcL, COLOR_BGR2GRAY);
        cuda::cvtColor(d_img2, d_srcR, COLOR_BGR2GRAY);

        cuda::GpuMat d_keypointsL, d_keypointsR;
        cuda::GpuMat d_descriptorsL, d_descriptorsR, d_descriptorsL_32F, d_descriptorsR_32F;
        vector<KeyPoint> keyPoints_1, keyPoints_2;
        std::vector<DMatch> matches;
        std::vector<DMatch> good_matches;

        // Detect + describe on the GPU, download keypoints to the host, and
        // convert descriptors to float for the L2 brute-force matcher.
        d_orb->detectAndComputeAsync(d_srcL, cuda::GpuMat(), d_keypointsL, d_descriptorsL);
        d_orb->convert(d_keypointsL, keyPoints_1);
        d_descriptorsL.convertTo(d_descriptorsL_32F, CV_32F);

        d_orb->detectAndComputeAsync(d_srcR, cuda::GpuMat(), d_keypointsR, d_descriptorsR);
        d_orb->convert(d_keypointsR, keyPoints_2);
        d_descriptorsR.convertTo(d_descriptorsR_32F, CV_32F);

        d_matcher->match(d_descriptorsL_32F, d_descriptorsR_32F, matches);

        // Distance statistics over all matches.
        double max_dist = 0;
        double min_dist = 100;
        for (size_t i = 0; i < matches.size(); i++)
        {
            double dist = matches[i].distance;
            if (dist < min_dist) min_dist = dist;
            if (dist > max_dist) max_dist = dist;
        }
        cout << "\n-- Max dist : " << max_dist << endl;
        cout << "\n-- Min dist : " << min_dist << endl;

        // Keep only matches well below the worst distance.
        for (size_t i = 0; i < matches.size(); i++)
        {
            if (matches[i].distance < 0.6 * max_dist)
            {
                good_matches.push_back(matches[i]);
            }
        }
        for (size_t i = 0; i < good_matches.size(); ++i)
        {
            scene.push_back(keyPoints_2[good_matches[i].trainIdx].pt);
        }

        // Mark matched scene locations (drawing accumulates on img_2 across
        // iterations, as in the original).
        for (size_t j = 0; j < scene.size(); j++)
            cv::circle(img_2, scene[j], 2, cv::Scalar(0, 255, 0), 2);
        //imshow("img_2", img_2);
        //waitKey(1);

        end = getTickCount();
        time = (double)(end - start) * 1000 / getTickFrequency();
        cout << "Total time : " << time << " ms" << endl;

        // After 1000 frames report the average frame rate.
        if (times == 1000)
        {
            double maxvalue = (cv::getTickCount() - startime) / cv::getTickFrequency();
            cout << "zhenshu " << times / maxvalue << " zhen" << endl;
        }
        cout << "The number of frame is : " << times << endl;
    }
    return 0;
}
CUDA ORB运行1000帧时测得的平均帧率:
可考虑把GPU版的程序用两个流(CUDA中的Stream)交替、隐藏在CPU之后,可大大加速。
OpenCV3.3中CPU版 ORB特征提取算法代码(前文中程序的简化版,去除了一些影响效率的操作):
#include <iostream>
#include <signal.h>
#include <vector>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
// Flag raised by the SIGINT handler and polled by main()'s benchmark loop.
// volatile sig_atomic_t is the only object type the C++ standard guarantees
// may be safely written from a signal handler; a plain (non-volatile) bool
// could legally be cached by the optimizer and the loop might never see the
// change.
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl-C) handler.
// The original version also printed a message from here, but iostream is not
// async-signal-safe (undefined behavior if the signal interrupts another
// stream operation), so the handler now only raises the flag and lets the
// main loop terminate cleanly.
void sigIntHandler(int signum)
{
    stop = 1;
}
int main()
{
Mat img_1 = imread("model.jpg");
Mat img_2 = imread("ORB_test.jpg");
if (!img_1.data || !img_2.data)
{
cout << "error reading images " << endl;
return -1;
}
int times = 0;
double startime = cv::getTickCount();
signal(SIGINT, sigIntHandler);
int64 start, end;
double time;
vector<Point2f> recognized;
vector<Point2f> scene;
for(times = 0;!stop; times++)
{
start = getTickCount();
recognized.resize(500);
scene.resize(500);
Mat d_srcL, d_srcR;
Mat img_matches, des_L, des_R;
cvtColor(img_1, d_srcL, COLOR_BGR2GRAY);
cvtColor(img_2, d_srcR, COLOR_BGR2GRAY);
Ptr<ORB> d_orb = ORB::create(500,1.2f,6,31,0,2);
Mat d_descriptorsL, d_descriptorsR, d_descriptorsL_32F, d_descriptorsR_32F;
vector<KeyPoint> keyPoints_1, keyPoints_2;
Ptr<DescriptorMatcher> d_matcher = DescriptorMatcher::create(NORM_L2);
std::vector<DMatch> matches;
std::vector<DMatch> good_matches;
d_orb -> detectAndCompute(d_srcL, Mat(), keyPoints_1, d_descriptorsL);
d_orb -> detectAndCompute(d_srcR, Mat(), keyPoints_2, d_descriptorsR);
d_matcher -> match(d_descriptorsL, d_descriptorsR, matches);
int sz = matches.size();
double max_dist = 0; double min_dist = 100;
for (int i = 0; i < sz; i++)
{
double dist = matches[i].distance;
if (dist < min_dist) min_dist = dist;
if (dist > max_dist) max_dist = dist;
}
cout << "\n-- Max dist : " << max_dist << endl;
cout << "\n-- Min dist : " << min_dist << endl;
for (int i = 0; i < sz; i++)
{
if (matches[i].distance < 0.6*max_dist)
{
good_matches.push_back(matches[i]);
}
}
for (size_t i = 0; i < good_matches.size(); ++i)
{
scene.push_back(keyPoints_2[ good_matches[i].trainIdx ].pt);
}
for(unsigned int j = 0; j < scene.size(); j++)
cv::circle(img_2, scene[j], 2, cv::Scalar(0, 255, 0), 2);
//imshow("img_2", img_2);
//waitKey(1);
end = getTickCount();
time = (double)(end - start) * 1000 / getTickFrequency();
cout << "Total time : " << time << " ms"<<endl;
if (times == 1000)
{
double maxvalue = (cv::getTickCount() - startime)/cv::getTickFrequency();
cout <<"zhenshu " << times/maxvalue <<" zhen"<<endl;
}
cout <<"The number of frame is : " <<times<<endl;
}
return 0;
}
CPU ORB运行1000帧时测得的平均帧率:
对比两图的结果,可见CPU的ORB算法在当前分辨率、当前这台笔记本上运行的速度还是比GPU版快的;但博主曾在NVIDIA Jetson TK1上运行过这两个算法的摄像头输入版本,GPU版的1000帧平均20帧每秒,CPU版的却只有平均11帧每秒,可说是各有千秋。
若想将该CPU或GPU版算法移植到摄像头输入,仅需把摄像头读入的图像帧赋给 img_2(例如 cap >> img_2)即可。欢迎交流讨论!
**关于CUDA ORB的一个BUG的解决方法:
若更换模板、目标图片后CUDA ORB程序编译成功,但运行时程序在detectAndComputeAsync处出现
OpenCV Error: Assertion failed (0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows) in GpuMat
的ERROR提示时,需将程序中 cuda::ORB::create() 的参数 nlevels(本文示例中的 6)减小,或将 edgeThreshold(本文示例中的 31)减小即可解决(但这两个参数要尽可能大,否则会影响keyPoint的选取)。
这个问题是博主读了OpenCV的源码/opencv-3.3.1/modules/cudafeatures2d/src/orb.cpp后解决的,在CUDA的ORB源码中void ORB_Impl::detectAndComputeAsync()里涉及到一个函数:
void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream)
其中
float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
导致了这个问题。getScale中是scaleFactor的(level-firstLevel)次方的运算。参考cv::cuda::ORB::create()的定义:
Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int nfeatures,
float scaleFactor,
int nlevels,
int edgeThreshold,
int firstLevel,
int WTA_K,
int scoreType,
int patchSize,
int fastThreshold,
bool blurForDescriptor)
可见此处Rect inner的选取方式有可能导致:当detectAndComputeAsync()的输入图像分辨率小,而create设置的nlevels较大、edgeThreshold较大时,inner的区域超出图像本身的大小,从而导致ROI超出图像范围的问题。
而为什么同样的图片在CPU ORB算法中就不会发生这个问题呢?
博主读了CPU ORB的代码,发现其中ROI的选取方式完全不同:
在
void ORB_Impl::detectAndCompute( InputArray _image, InputArray _mask,
std::vector<KeyPoint>& keypoints,
OutputArray _descriptors, bool useProvidedKeypoints )
中,有如下ROI选取代码:
for( level = 0; level < nLevels; level++ )
{
float scale = getScale(level, firstLevel, scaleFactor);
layerScale[level] = scale;
Size sz(cvRound(image.cols/scale), cvRound(image.rows/scale));
Size wholeSize(sz.width + border*2, sz.height + border*2);
if( level_ofs.x + wholeSize.width > bufSize.width )
{
level_ofs = Point(0, level_ofs.y + level_dy);
level_dy = wholeSize.height;
}
Rect linfo(level_ofs.x + border, level_ofs.y + border, sz.width, sz.height);
layerInfo[level] = linfo;
layerOfs[level] = linfo.y*bufSize.width + linfo.x;
level_ofs.x += wholeSize.width;
}
此处Rect的选取方式不会产生ROI超出范围。应该是CPU版的ORB算法经常更新优化,所以不容易出现ROI的这个问题。