CUDA Programming: SSD's PriorBox Layer
The implementation of the priorbox layer in SSD:
__global__ void PriKernel(float *top_data, const int layer_height, const int layer_width,
                          const int img_height, const int img_width,
                          const float step_w, const float step_h, const float offset,
                          const float *min_sizes, const int min_sizes_len,
                          const float *max_sizes, const int max_sizes_len,
                          const float *aspts, const int aspts_len,
                          const int clip, const int num_priors)
{
  // Each thread handles one row h of the feature map; this replaces the
  // outer CPU loop "for (int h = 0; h < layer_height; ++h)".
  const int h = blockIdx.x * blockDim.x + threadIdx.x;
  if (h >= layer_height) return;

  // The full output holds layer_height * layer_width * num_priors * 4 values
  // (typically w * h * 6 * 4); this thread starts at the offset of its row.
  int idx = h * layer_width * num_priors * 4;
  for (int w = 0; w < layer_width; ++w) {
    // As in Faster R-CNN, map the feature-map cell back onto the input image;
    // the offset (usually 0.5) centers the prior on the cell, similar to the
    // rounding trick in the Faster R-CNN Python code.
    float center_x = (w + offset) * step_w;
    float center_y = (h + offset) * step_h;
    float box_width, box_height;
    for (int s = 0; s < min_sizes_len; ++s) {  // min_sizes_len is typically 1
      float min_size = min_sizes[s];
      // min_size grows from 60 at fc7_mbox_priorbox to 276 at the last layer,
      // i.e. s_k goes from 0.2 to 0.92 as in the paper.
      // first prior: aspect_ratio = 1, size = min_size
      box_width = box_height = min_size;
      top_data[idx++] = (center_x - box_width / 2.f) / img_width;    // xmin
      top_data[idx++] = (center_y - box_height / 2.f) / img_height;  // ymin
      top_data[idx++] = (center_x + box_width / 2.f) / img_width;    // xmax
      top_data[idx++] = (center_y + box_height / 2.f) / img_height;  // ymax
      if (max_sizes_len > 0) {
        // min_sizes_len == max_sizes_len is verified on the host (CHECK_EQ).
        float max_size = max_sizes[s];
        // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size),
        // matching the per-layer scale s_k chosen in the paper.
        box_width = box_height = sqrtf(min_size * max_size);
        top_data[idx++] = (center_x - box_width / 2.f) / img_width;    // xmin
        top_data[idx++] = (center_y - box_height / 2.f) / img_height;  // ymin
        top_data[idx++] = (center_x + box_width / 2.f) / img_width;    // xmax
        top_data[idx++] = (center_y + box_height / 2.f) / img_height;  // ymax
      }
      // rest of priors: the remaining aspect ratios
      for (int r = 0; r < aspts_len; ++r) {
        float ar = aspts[r];
        if (fabsf(ar - 1.f) < 1e-6f) {
          continue;  // aspect ratio 1 was already handled above
        }
        box_width = min_size * sqrtf(ar);
        box_height = min_size / sqrtf(ar);
        top_data[idx++] = (center_x - box_width / 2.f) / img_width;    // xmin
        top_data[idx++] = (center_y - box_height / 2.f) / img_height;  // ymin
        top_data[idx++] = (center_x + box_width / 2.f) / img_width;    // xmax
        top_data[idx++] = (center_y + box_height / 2.f) / img_height;  // ymax
      }
    }  // end for min_sizes
  }  // end for w
  // All prior boxes for this row are now written: 6 ratios per cell as in the
  // paper, with one s_k (one min_size) per layer.
  // clip the prior's coordinates so that they lie within [0, 1]
  if (clip) {
    int idx2 = h * layer_width * num_priors * 4;
    for (int d = 0; d < layer_width * num_priors * 4; ++d) {
      top_data[idx2 + d] = fminf(fmaxf(top_data[idx2 + d], 0.f), 1.f);
    }
  }
}
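Since each thread handles one row of the feature map, the launch only needs to cover layer_height threads. A minimal launch sketch, assuming the device pointers (d_top_data, d_min_sizes, d_max_sizes, d_aspts) have already been prepared; these names are illustrative and not from the original code:
// Illustrative launch: one thread per feature-map row h.
const int threads = 256;
const int blocks = (layer_height + threads - 1) / threads;
PriKernel<<<blocks, threads>>>(d_top_data, layer_height, layer_width,
                               img_height, img_width, step_w, step_h, 0.5f,
                               d_min_sizes, min_sizes_len,
                               d_max_sizes, max_sizes_len,
                               d_aspts, aspts_len,
                               clip, num_priors);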
The variances are filled in the same way; with a few appropriate parameter changes, this part can also run in parallel on the GPU:
__global__ void PriKernel2(float *top_data, const int layer_height, const int layer_width,
                           const float *variance, const int variance_len, const int num_priors)
{
  // Each thread fills the variances for one row h of the feature map,
  // mirroring the parallelization of PriKernel.
  const int h = blockIdx.x * blockDim.x + threadIdx.x;
  if (h >= layer_height) return;

  int count = h * layer_width * num_priors * 4;
  for (int w = 0; w < layer_width; ++w) {
    for (int i = 0; i < num_priors; ++i) {
      for (int j = 0; j < 4; ++j) {
        // A single variance value is broadcast to all four coordinates;
        // otherwise one value per coordinate is used (as in Caffe).
        top_data[count] = (variance_len == 1) ? variance[0] : variance[j];
        ++count;
      }
    }
  }
}
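In Caffe's PriorBoxLayer the top blob has two channels: the first holds the box coordinates written by PriKernel, the second holds the variances. Assuming that same layout, PriKernel2 would be pointed at the second channel, i.e. at an offset of dim floats; the pointer names below are again illustrative:
// Illustrative launch: write the variances into the second channel of the top blob.
const int dim = layer_height * layer_width * num_priors * 4;
PriKernel2<<<blocks, threads>>>(d_top_data + dim, layer_height, layer_width,
                                d_variance, variance_len, num_priors);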
The third step is to copy the host-side data to the GPU; in Caffe, if a GPU implementation of a layer is provided, it is used by default.
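A rough sketch of this third step, assuming the layer parameters are held in std::vector<float> members (min_sizes_, max_sizes_, aspect_ratios_, variance_) as in the CPU layer, and using plain cudaMalloc/cudaMemcpy rather than Caffe's SyncedMemory; error checking and freeing are omitted:
// Illustrative host-side setup: copy the prior-box parameters to the device.
// In Caffe the top blob itself already lives on the GPU, so only the small
// parameter vectors need an explicit copy.
float *d_min_sizes = nullptr, *d_max_sizes = nullptr;
float *d_aspts = nullptr, *d_variance = nullptr;
cudaMalloc(&d_min_sizes, min_sizes_.size() * sizeof(float));
cudaMalloc(&d_max_sizes, max_sizes_.size() * sizeof(float));
cudaMalloc(&d_aspts, aspect_ratios_.size() * sizeof(float));
cudaMalloc(&d_variance, variance_.size() * sizeof(float));
cudaMemcpy(d_min_sizes, min_sizes_.data(),
           min_sizes_.size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_max_sizes, max_sizes_.data(),
           max_sizes_.size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_aspts, aspect_ratios_.data(),
           aspect_ratios_.size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_variance, variance_.data(),
           variance_.size() * sizeof(float), cudaMemcpyHostToDevice);
float *d_top_data = top[0]->mutable_gpu_data();  // device pointer to the top blob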