欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Fast R-CNN roidb数据准备

程序员文章站 2024-03-14 08:47:16
...

在Faster R-CNN的项目代码上运行Fast R-CNN。关于初始的roidb数据,主要相关的几个文件有pascal_voc.py,imdb.py,roidb.py等。

(1)运行脚本是 fast_rcnn.sh

# ./experiments/scripts/fast_rcnn.sh 0 VGG_CNN_M_1024 pascal_voc --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]" 

运行脚本参数包括gpuID,网络,数据库,其他等参数。主要是调用下面的train_net.py进行网络训练。

# Run the training script under `time` so the total run time is reported.
# GPU_ID, PT_DIR, NET, TRAIN_IMDB, ITERS and EXTRA_ARGS are not defined in this
# excerpt -- presumably fast_rcnn.sh sets them from its command-line arguments.
time ./tools/train_net.py --gpu ${GPU_ID} \
  --solver models/${PT_DIR}/${NET}/fast_rcnn/solver.prototxt \
  --weights data/imagenet_models/${NET}.caffemodel \
  --imdb ${TRAIN_IMDB} \
  --iters ${ITERS} \
  ${EXTRA_ARGS}

 (2)训练脚本 tools/train_net.py,主要包括参数解析,以及创建生成roidb.

    # set up caffe: run on the GPU selected by the parsed --gpu argument
    caffe.set_mode_gpu()
    caffe.set_device(args.gpu_id)

    # Build the dataset object and its roidb (one entry per training image).
    imdb, roidb = combined_roidb(args.imdb_name)
    print '{:d} roidb entries'.format(len(roidb))

    # The output directory is derived from the dataset object.
    output_dir = get_output_dir(imdb)
    print 'Output will be saved to `{:s}`'.format(output_dir)

    # Hand the solver definition, the roidb and the output location to the
    # training routine.
    train_net(args.solver, roidb, output_dir,
              pretrained_model=args.pretrained_model,
              max_iters=args.max_iters)

     1. 调用combined_roidb(..)生成roidb

def combined_roidb(imdb_names):
    """Build the training roidb for one dataset name or several joined by '+'.

    Returns a tuple ``(imdb, roidb)``.  ``roidb`` is the list obtained for the
    first name, extended in place with the lists of any further names.  For a
    single name ``imdb`` comes from ``get_imdb``; for several names it is a
    bare ``datasets.imdb.imdb`` created from the joined name string.
    """
    def roidb_for(single_name):
        # Load one dataset, select its proposal source and fetch its
        # training roidb.
        dataset = get_imdb(single_name)
        print('Loaded dataset `{:s}` for training'.format(dataset.name))
        dataset.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
        print('Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD))
        # The article's author reports 5011 entries before flipping and
        # twice that afterwards for their dataset.
        return get_training_roidb(dataset)

    per_dataset = [roidb_for(name) for name in imdb_names.split('+')]
    roidb = per_dataset[0]
    if len(per_dataset) <= 1:
        return get_imdb(imdb_names), roidb

    for extra in per_dataset[1:]:
        roidb.extend(extra)
    return datasets.imdb.imdb(imdb_names), roidb

   该步骤包括根据选定的生成proposal的方法,生成初始的roidb。 

        imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)

    在cfg文件中设定的是selective search方法。具体实现是在 lib/pascal_voc.py 下的  def selective_search_roidb(self): 方法

  def selective_search_roidb(self):
        """
        Return the database of selective search regions of interest.
        Ground-truth ROIs are also included.


        This function loads/saves from/to a cache file to speed up future calls.
        """
        cache_file = os.path.join(self.cache_path,
                                  self.name + '_selective_search_roidb.pkl')


        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                roidb = cPickle.load(fid)
            print '{} ss roidb loaded from {}'.format(self.name, cache_file)
            return roidb


        if int(self._year) == 2007 or self._image_set != 'test':
            gt_roidb = self.gt_roidb()
            ss_roidb = self._load_selective_search_roidb(gt_roidb)
            #print size(gt_roidb),size(ss_roidb)
            roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
        else:
            roidb = self._load_selective_search_roidb(None)
        with open(cache_file, 'wb') as fid:
            cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
        print 'wrote ss roidb to {}'.format(cache_file)


        return roidb
     该文件对于训练数据,将ground-truth box和SS方法生成候选框 box的信息一起保存到 roidb 。

     1.1   gt box的生成是通过 pascal_voc.py下定义的方法 gt_roidb()

        gt_roidb = self.gt_roidb()
    具体实现是通过pascal_voc.py下的 _load_pascal_annotation.也就是对每一幅图像读取标注的 gt_box信息。
 def gt_roidb(self):
        """
        Return the list of ground-truth annotation records, one per image.

        Records are produced by ``_load_pascal_annotation`` for every entry
        of ``self.image_index``.  The list is pickled to
        ``<cache_path>/<name>_gt_roidb.pkl`` and read back from there on
        later calls.
        """
        cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')

        if not os.path.exists(cache_file):
            records = []
            for index in self.image_index:
                records.append(self._load_pascal_annotation(index))
            with open(cache_file, 'wb') as fid:
                cPickle.dump(records, fid, cPickle.HIGHEST_PROTOCOL)
            print('wrote gt roidb to {}'.format(cache_file))
            return records

        # NOTE: unpickling is only safe for cache files this code wrote.
        with open(cache_file, 'rb') as fid:
            cached = cPickle.load(fid)
        print('{} gt roidb loaded from {}'.format(self.name, cache_file))
        return cached

     可以看下gt_roidb具体保存的信息,长度为num_image的列表,每个元素是下面返回的结构体。

 def _load_pascal_annotation(self, index):
        """
        Load the annotated boxes of one image from its PASCAL VOC XML file.

        Args:
            index: image identifier; the file read is
                ``<self._data_path>/Annotations/<index>.xml``.

        Returns:
            A dict with the keys
              'boxes'       -- (num_objs, 4) uint16 array, one row of
                               0-based [x1, y1, x2, y2] per object
              'gt_classes'  -- (num_objs,) int32 class index per object
              'gt_overlaps' -- sparse (num_objs, num_classes) matrix holding
                               1.0 in the column of each object's class
              'flipped'     -- always False
              'seg_areas'   -- (num_objs,) float32 box area per object

        Objects marked ``difficult`` are dropped unless
        ``self.config['use_diff']`` is true.
        """
        filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')
        if not self.config['use_diff']:
            # Exclude the samples labeled as difficult
            objs = [obj for obj in objs
                    if int(obj.find('difficult').text) == 0]
        num_objs = len(objs)

        boxes = np.zeros((num_objs, 4), dtype=np.uint16)
        gt_classes = np.zeros((num_objs), dtype=np.int32)
        overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
        # "Seg" area for pascal is just the box area
        seg_areas = np.zeros((num_objs), dtype=np.float32)

        # Load object bounding boxes into a data frame.
        for ix, obj in enumerate(objs):
            bbox = obj.find('bndbox')
            # Make pixel indexes 0-based.  A coordinate of 0 in the XML would
            # become -1, which the unsigned ``boxes`` array cannot represent,
            # so clamp at 0.
            x1, y1, x2, y2 = [
                max(float(bbox.find(tag).text) - 1, 0.0)
                for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
            cls = self._class_to_ind[obj.find('name').text.lower().strip()]
            boxes[ix, :] = [x1, y1, x2, y2]
            gt_classes[ix] = cls
            # A ground-truth box overlaps its own class completely.
            overlaps[ix, cls] = 1.0
            seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)

        overlaps = scipy.sparse.csr_matrix(overlaps)

        return {'boxes' : boxes,              # (num_objs, 4): x1, y1, x2, y2
                'gt_classes': gt_classes,     # (num_objs,)
                'gt_overlaps' : overlaps,     # (num_objs, num_classes), sparse
                'flipped' : False,
                'seg_areas' : seg_areas}

    每张图像对应一个结构体,包括:

        boxes,保存该图像所有标注(ground-truth)框的位置信息,从xml文件读取。   

        gt_classes,是读取object下的name,经过类别映射,得到类别(数字表示)。

        gt_overlaps,gt_box的overlaps值赋值为1.

       ....

     可以看下VOC2007中的标注xml信息,其中的object就给出了类别以及box位置。

              Fast R-CNN roidb数据准备

     1.2 通过_load_selective_search_roidb(gt_roidb) 加载从SS算法得到的候选框ss_box的信息。

      ss_roidb = self._load_selective_search_roidb(gt_roidb)

      SS算法有提供VOC候选框的mat文件,其中包括图像的名称,以及每张图像框定的box信息。下面包括读取box列表,以及调用create_roidb_from_box_list()函数生成ss_roidb.

       

def _load_selective_search_roidb(self, gt_roidb):
        """
        Read the precomputed selective-search boxes and turn them into a roidb.

        The boxes come from ``<cfg.DATA_DIR>/selective_search_data/<name>.mat``
        (variable ``boxes``).  Each per-image array is column-reordered,
        shifted by -1, de-duplicated and filtered by
        ``self.config['min_size']`` before all of them are passed, together
        with ``gt_roidb``, to ``create_roidb_from_box_list``.
        """
        mat_path = os.path.join(cfg.DATA_DIR,
                                'selective_search_data',
                                self.name + '.mat')
        filename = os.path.abspath(mat_path)
        assert os.path.exists(filename), \
               'Selective search data not found at: {}'.format(filename)
        raw_data = sio.loadmat(filename)['boxes'].ravel()

        box_list = []
        for raw_boxes in raw_data:
            # Swap column pairs and subtract 1 -- presumably converting
            # 1-based (y1, x1, y2, x2) rows to 0-based (x1, y1, x2, y2);
            # confirm against the .mat layout.
            boxes = raw_boxes[:, (1, 0, 3, 2)] - 1
            boxes = boxes[ds_utils.unique_boxes(boxes), :]
            big_enough = ds_utils.filter_small_boxes(boxes,
                                                     self.config['min_size'])
            box_list.append(boxes[big_enough, :])

        return self.create_roidb_from_box_list(box_list, gt_roidb)

    具体实现是在lib/imdb.py中的方法 create_roidb_from_box_list() 

PS:有一篇博客介绍roidb.py写的很好,这个地方给出了roidb的具体信息。

Fast R-CNN roidb数据准备

   def create_roidb_from_box_list(self, box_list, gt_roidb):
        assert len(box_list) == self.num_images, \
                'Number of boxes must match number of ground-truth images'
        roidb = []
        for i in xrange(self.num_images):
            boxes = box_list[i]
            num_boxes = boxes.shape[0]
            overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)


            if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
                gt_boxes = gt_roidb[i]['boxes']
                gt_classes = gt_roidb[i]['gt_classes']
                gt_overlaps = bbox_overlaps(boxes.astype(np.float),
                                            gt_boxes.astype(np.float))
                argmaxes = gt_overlaps.argmax(axis=1)
                maxes = gt_overlaps.max(axis=1)
                I = np.where(maxes > 0)[0]
                overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]


            overlaps = scipy.sparse.csr_matrix(overlaps)
            roidb.append({
                'boxes' : boxes,
                'gt_classes' : np.zeros((num_boxes,), dtype=np.int32),
                'gt_overlaps' : overlaps,
                'flipped' : False,
                'seg_areas' : np.zeros((num_boxes,), dtype=np.float32),
            })
        return roidb

    这边生成的roidb数据是根据原始的候选框。  对每张图像,计算所有的候选框boxes与gt-box的overlaps(IoU),对每个候选框,将最大的overlaps(选择>0的box)保存在gt_overlaps(对应的class,列值)。此处ss_box的gt_classes全部赋值成0:候选框本身没有标注类别,其类别信息保存在gt_overlaps中,之后由prepare_roidb根据gt_overlaps计算max_classes得到(见1.4)。

    1.3 合并ss_roidb 和gt_roidb 信息为 roidb     

roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)

具体实现是在 lib/imdb.py下的 merge_roidbs()

 def merge_roidbs(a, b):
        """
        Append the per-image records of ``b`` onto those of ``a``, in place.

        Both lists must have the same length.  For every image the 'boxes',
        'gt_classes', 'gt_overlaps' and 'seg_areas' values of ``b`` are
        stacked after the corresponding values of ``a``.  Returns ``a``.
        """
        assert len(a) == len(b)
        for idx, target in enumerate(a):
            extra = b[idx]
            # Row-wise data (boxes, sparse overlaps) is stacked vertically,
            # per-box vectors are concatenated.
            target['boxes'] = np.vstack((target['boxes'], extra['boxes']))
            target['gt_classes'] = np.hstack((target['gt_classes'],
                                              extra['gt_classes']))
            target['gt_overlaps'] = scipy.sparse.vstack(
                [target['gt_overlaps'], extra['gt_overlaps']])
            target['seg_areas'] = np.hstack((target['seg_areas'],
                                             extra['seg_areas']))
        return a

   1.4 对训练图像进行增强,再生成roidb

        roidb = get_training_roidb(imdb) #5011*2,after flipped
     get_training_roidb(imdb)定义在 lib/fast_rcnn/train.py下。实现对原始图像进行镜像增强,保存镜像图像对应的 roidb数据。另外调用lib/roidb.py下的 prepare_roidb(imdb)方法计算所有 roidb的max_classes以及相应overlaps值。
def get_training_roidb(imdb):
    """Prepare ``imdb`` for training and return its roidb.

    When ``cfg.TRAIN.USE_FLIPPED`` is set, horizontally-flipped examples are
    appended to the dataset first.  ``rdl_roidb.prepare_roidb`` is then run
    on the dataset before ``imdb.roidb`` is returned.
    """
    use_flipped = cfg.TRAIN.USE_FLIPPED
    if use_flipped:
        print('Appending horizontally-flipped training examples...')
        imdb.append_flipped_images()
        print('done')

    print('Preparing training data...')
    rdl_roidb.prepare_roidb(imdb)
    print('done')

    prepared = imdb.roidb
    return prepared

    2. 调用Caffe进行训练    

train_net(args.solver, roidb, output_dir,
              pretrained_model=args.pretrained_model,
              max_iters=args.max_iters)

   调用 lib/fast_rcnn/train.py 中的train_net(...)

def train_net(solver_prototxt, roidb, output_dir,
              pretrained_model=None, max_iters=40000):
    """Train a Fast R-CNN network.

    The roidb is passed through ``filter_roidb`` first, then a
    ``SolverWrapper`` is built around it and trained for ``max_iters``
    iterations.  Returns whatever ``SolverWrapper.train_model`` returns.
    """
    usable_roidb = filter_roidb(roidb)
    wrapper = SolverWrapper(solver_prototxt, usable_roidb, output_dir,
                            pretrained_model=pretrained_model)

    print('Solving...')
    trained = wrapper.train_model(max_iters)
    print('done solving')
    return trained

     2.1 过滤掉部分既不存在前景也不存在背景的图像。

    (前景需满足某个阈值范围 [0.5, 1],背景同样需满足某个阈值范围 [0.1, 0.5)。)

      2.2 定义SolverWrapper 类   

class SolverWrapper(object):
    """A thin wrapper around Caffe's solver.

    Wrapping the solver gives us control over the snapshotting process,
    which we use to unnormalize the learned bounding-box regression weights.
    """

    def __init__(self, solver_prototxt, roidb, output_dir,
                 pretrained_model=None):
        """Create the solver, optionally load weights, and attach the roidb.

        ``solver_prototxt`` is read twice: once by Caffe to build the solver
        and once here to keep the parsed solver parameters.  When
        ``pretrained_model`` is given, its weights are copied into the net.
        """
        self.output_dir = output_dir

        train_cfg = cfg.TRAIN
        rpn_with_normalized_targets = (train_cfg.HAS_RPN and
                                       train_cfg.BBOX_REG and
                                       train_cfg.BBOX_NORMALIZE_TARGETS)
        if rpn_with_normalized_targets:
            # RPN can only use precomputed normalization because there are no
            # fixed statistics to compute a priori
            assert train_cfg.BBOX_NORMALIZE_TARGETS_PRECOMPUTED

        if train_cfg.BBOX_REG:
            print('Computing bounding-box regression targets...')
            target_stats = rdl_roidb.add_bbox_regression_targets(roidb)
            self.bbox_means, self.bbox_stds = target_stats
            print('done')

        self.solver = caffe.SGDSolver(solver_prototxt)
        if pretrained_model is not None:
            print(('Loading pretrained model '
                   'weights from {:s}').format(pretrained_model))
            self.solver.net.copy_from(pretrained_model)

        # Keep the parsed solver definition alongside the solver itself.
        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            solver_text = f.read()
            pb2.text_format.Merge(solver_text, self.solver_param)

        # The net's first layer receives the roidb -- presumably it is the
        # data layer that serves training batches; confirm in the prototxt.
        self.solver.net.layers[0].set_roidb(roidb)

      该类可以实现RPN和bbox_regression.对于Fast R-CNN主要实现bbox_regression,通过roidb.py下的add_bbox_regression_targets()方法计算box_targets ,调用_compute_targets(rois,max_overlaps,max_classes)

   对于那些max_overlaps不低于某个阈值(cfg.TRAIN.BBOX_THRESH)的box,才会计算其回归目标,其余box的目标保持为0。

def add_bbox_regression_targets(roidb):
    """Attach box-regression targets to every roidb entry.

    Each entry receives a 'bbox_targets' array computed by
    ``_compute_targets`` from its 'boxes', 'max_overlaps' and 'max_classes'.
    Per-class means and standard deviations of the targets are then either
    taken from the configuration or measured on the data, and, when
    ``cfg.TRAIN.BBOX_NORMALIZE_TARGETS`` is set, used to normalize the
    targets in place.

    Returns the flattened means and standard deviations, which are needed
    later to undo the normalization of predictions.
    """
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    # Infer number of classes from the number of columns in gt_overlaps
    num_classes = roidb[0]['gt_overlaps'].shape[1]
    # Class 0 is left out of every per-class loop below (background).
    foreground = range(1, num_classes)

    for entry in roidb:
        entry['bbox_targets'] = _compute_targets(
            entry['boxes'], entry['max_overlaps'], entry['max_classes'])

    if not cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Measure the statistics on the data, using
        # var(x) = E(x^2) - E(x)^2
        class_counts = np.zeros((num_classes, 1)) + cfg.EPS
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        for entry in roidb:
            targets = entry['bbox_targets']
            for cls in foreground:
                cls_inds = np.where(targets[:, 0] == cls)[0]
                if cls_inds.size == 0:
                    continue
                deltas = targets[cls_inds, 1:]
                class_counts[cls] += cls_inds.size
                sums[cls, :] += deltas.sum(axis=0)
                squared_sums[cls, :] += (deltas ** 2).sum(axis=0)

        means = sums / class_counts
        stds = np.sqrt(squared_sums / class_counts - means ** 2)
    else:
        # Use fixed / precomputed "means" and "stds" instead of empirical
        # values, repeated once per class.
        means = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
        stds = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))

    print('bbox target means:')
    print(means)
    print(means[1:, :].mean(axis=0))  # ignore bg class
    print('bbox target stdevs:')
    print(stds)
    print(stds[1:, :].mean(axis=0))  # ignore bg class

    # Normalize targets
    if not cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
        print("NOT normalizing targets")
    else:
        print("Normalizing targets")
        for entry in roidb:
            targets = entry['bbox_targets']
            for cls in foreground:
                cls_inds = np.where(targets[:, 0] == cls)[0]
                targets[cls_inds, 1:] -= means[cls, :]
                targets[cls_inds, 1:] /= stds[cls, :]

    # These values will be needed for making predictions
    # (the predicts will need to be unnormalized and uncentered)
    return means.ravel(), stds.ravel()

    下面这段代码是来自上面的博客,注释部分写的很清楚了。

def _compute_targets(rois, overlaps, labels):  # rois holds the boxes of the current image only
    """Compute bounding-box regression targets for an image.

    Args:
        rois: array of boxes, one row per ROI.
        overlaps: per-ROI overlap value; ROIs whose value equals 1 are
            treated as the ground-truth boxes.
        labels: per-ROI class label.

    Returns:
        A float32 array of shape (num_rois, 5).  For every ROI whose overlap
        is at least ``cfg.TRAIN.BBOX_THRESH``, column 0 holds its label and
        columns 1-4 hold the ``bbox_transform`` result between the ROI and
        the ground-truth box it overlaps most.  All other rows are zero, and
        the whole array is zero when the image has no ground-truth ROI.
    """
    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        # Bail if the image has no ground-truth ROIs
        return np.zeros((rois.shape[0], 5), dtype=np.float32)
    # Indices of examples for which we try to make predictions: only ROIs
    # whose overlap reaches the threshold are used as regression samples.
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI.
    # np.float64 is the dtype the former ``np.float`` alias resolved to; the
    # alias itself no longer exists in current NumPy releases.
    ex_gt_overlaps = bbox_overlaps(
        np.ascontiguousarray(rois[ex_inds, :], dtype=np.float64),
        np.ascontiguousarray(rois[gt_inds, :], dtype=np.float64))

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target.
    # Rows of ex_gt_overlaps are ex ROIs, columns are gt ROIs.
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    # One gt ROI per ex ROI, so both arrays have the same number of rows.
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    # Column 0 is the label, columns 1-4 the transform from ex ROI to gt ROI.
    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets
相关标签: object detection