FCNs学习笔记01-重新实现作者的结果

程序员文章站 2022-06-12 16:37:38

...

学习思路：重现作者结果，实现自己的任务。

环境：win10 + Caffe的Python接口 Anaconda3+Python3.5 CUDA8.0 Cudnn5.1

caffe编译：官方过程，需要提前安装git

>>git clone https://github.com/BVLC/caffe.git

>>cd caffe

>>git checkout windows

'''Edit any of the options inside build_win.cmd tosuit your needs'''

在scripts文件夹中的build_win.cmd右键notepad打开，修改需要的参数

>>scripts\build_win.cmd

重现作者过程

1。下载FCNs源代码

>>git clone https://github.com/shelhamer/fcn.berkeleyvision.org

得到fcn.berkeleyvision.org文件夹，文件夹内有很多模型源代码。我用的是fcn8s的，数据集用sbdd训练voc验证，对应文件夹为voc-fcn8s。

2。推导infer.py，红色的是改过的，用到的fcn8s-heavy-pascal.caffemodel文件是作者训练好的模型，下载地址在voc-fcn8文件下caffemodel-url文件内，地址为http://dl.caffe.berkeleyvision.org/fcn8s-heavy-pascal.caffemodel 。

import numpy as np
from PIL import Image
#添加caffe编译的目录
import sys
sys.path.append('../../caffe/python')
import caffe
import vis
# the demo image is "2007_000129" from PASCAL VOC
# load image, switch to BGR, subtract mean, and make dims C x H x W for Caffe
im = Image.open('demo/image.jpg')
in_ = np.array(im, dtype=np.float32)
in_ = in_[:,:,::-1]
in_ -= np.array((104.00698793,116.66876762,122.67891434))
in_ = in_.transpose((2,0,1))
# load net
net = caffe.Net('voc-fcn8s/deploy.prototxt', 'voc-fcn8s/fcn8s-heavy-pascal.caffemodel', caffe.TEST)
# shape for input (data blob is N x C x H x W), set data
net.blobs['data'].reshape(1, *in_.shape)
net.blobs['data'].data[...] = in_
# run net and take argmax for prediction
net.forward()
out = net.blobs['score'].data[0].argmax(axis=0)
#保存
# visualize segmentation in PASCAL VOC colors
voc_palette = vis.make_palette(21)
out_im = Image.fromarray(vis.color_seg(out, voc_palette))
out_im.save('demo/output.png')
masked_im = Image.fromarray(vis.vis_seg(im, out, voc_palette))
masked_im.save('demo/visualization.jpg')

3。训练solve.py，红色的是改过的

1)先下载sbdd和voc2012，其中sbdd用于训练，voc用于测试

sbdd是从别人那里下的，地址找不到了，需要dataset文件，没有的话可以用voc作训练集。readme有个网站，如下：

http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/resources.html

voc2012

https://pjreddie.com/projects/pascal-voc-dataset-mirror/

下载Train/ValidationData，解压

2)下载vgg16预训练模型

3)把voc_layers.py、surgery.py、score.py复制到voc-fcn8s文件内，训练需要加载。因为我的是python3，所以需要把print改为print()

FCNs学习笔记01-重新实现作者的结果

进入voc-fcn8s文件内，有如下几个文件(第一个是训练之后出现的)

FCNs学习笔记01-重新实现作者的结果

4)修改net.py，使用net.py生成train.prototxt和val.prototxt

import sys
sys.path.append('D:\\caffeForWINDOWS\\caffe\\python')
import caffe
from caffe import layers as L, params as P
from caffe.coord_map import crop
def conv_relu(bottom, nout, ks=3, stride=1, pad=1):
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
        num_output=nout, pad=pad,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    return conv, L.ReLU(conv, in_place=True)
def max_pool(bottom, ks=2, stride=2):
    return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
def fcn(split):
    n = caffe.NetSpec()
    pydata_params = dict(split=split, mean=(104.00699, 116.66877, 122.67892),
            seed=1337)
    if split == 'train':
        pydata_params['sbdd_dir'] = '../../sbdd/dataset'#下载的用于训练的数据
        pylayer = 'SBDDSegDataLayer'
    else:
        pydata_params['voc_dir'] = '../../VOC2012'#VOC2012数据
        pylayer = 'VOCSegDataLayer'
    n.data, n.label = L.Python(module='voc_layers', layer=pylayer,
            ntop=2, param_str=str(pydata_params))
    # the base net
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
     n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
     n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
     n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)
    # fully conv
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
    n.score_fr = L.Convolution(n.drop7, num_output=21, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.upscore2 = L.Deconvolution(n.score_fr,
        convolution_param=dict(num_output=21, kernel_size=4, stride=2,
            bias_term=False),
        param=[dict(lr_mult=0)])
    n.score_pool4 = L.Convolution(n.pool4, num_output=21, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.score_pool4c = crop(n.score_pool4, n.upscore2)
    n.fuse_pool4 = L.Eltwise(n.upscore2, n.score_pool4c,
            operation=P.Eltwise.SUM)
    n.upscore_pool4 = L.Deconvolution(n.fuse_pool4,
        convolution_param=dict(num_output=21, kernel_size=4, stride=2,
            bias_term=False),
        param=[dict(lr_mult=0)])
    n.score_pool3 = L.Convolution(n.pool3, num_output=21, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.score_pool3c = crop(n.score_pool3, n.upscore_pool4)
    n.fuse_pool3 = L.Eltwise(n.upscore_pool4, n.score_pool3c,
            operation=P.Eltwise.SUM)
    n.upscore8 = L.Deconvolution(n.fuse_pool3,
        convolution_param=dict(num_output=21, kernel_size=16, stride=8,
            bias_term=False),
        param=[dict(lr_mult=0)])
    n.score = crop(n.upscore8, n.data)
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
            loss_param=dict(normalize=False, ignore_label=255))
    return n.to_proto()
def make_net():
    with open('train.prototxt', 'w') as f:
        f.write(str(fcn('train')))
    with open('val.prototxt', 'w') as f:
        f.write(str(fcn('seg11valid')))
if __name__ == '__main__':
    make_net()

这会生成train.prototxt和val.prototxt，需要打开这两个文件，把fc6和fc7改成fc6_new和fc7_new(名字随意起的，和原来的不一样就行)，假如红色的地方没有改，直接在这两个文件里修改param_str就可以了。

5)修改solve.py

import sys
sys.path.append('../../caffe/python')
import caffe
import surgery, score
import numpy as np
import os
import sys
try:
    import setproctitle
    setproctitle.setproctitle(os.path.basename(os.getcwd()))
except:
    pass
weights = '../../vgg16-fcn.caffemodel'#你的预训练权重地址
# init
#caffe.set_device(int(sys.argv[1]))#不知道为什么要注释
#caffe.set_mode_gpu()#显卡不行，只好使用cpu了，或者在做数据集的时候把图片调小一点
solver = caffe.SGDSolver('solver.prototxt')
solver.net.copy_from(weights)
# surgeries
interp_layers = [k for k in solver.net.params.keys() if 'up' in k]
surgery.interp(solver.net, interp_layers)
# scoring
val = np.loadtxt('../../segvalid111.txt', dtype=str)#这个文件是VOC2012/ImageSets/Segmentation文件下的val.txt文件，重命名就行，好像是用来记录测试用到的分割图的名称。
for _ in range(1):#少循环几次试试能不能成
    solver.step(5)
    score.seg_tests(solver, False, val, layer='score')#测试

6)修改solver.prototxt

train_net、test_net地址对应上train.prototxt和val.prototxt，snapshot_prefix对应训练结果保存的地址

train_net: "train.prototxt"
test_net: "val.prototxt"
test_iter: 736
# make test net, but don't invoke it from the solver itself
test_interval: 999999999
display: 1
average_loss: 20
lr_policy: "fixed"
# lr for unnormalized softmax
base_lr: 1e-14
# high momentum
momentum: 0.99
# no gradient accumulation
iter_size: 1
max_iter: 100000
weight_decay: 0.0005
snapshot: 4000
snapshot_prefix: "../../snapshot/"
test_initialization: false

7)score.py只把print改为print()

8)surgery.py不做修改

9)voc_layers.py

import caffe#训练时已经调用了caffe，不用再改
import numpy as np
from PIL import Image
import random
class VOCSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from PASCAL VOC
    one-at-a-time while reshaping the net to preserve dimensions.
    Use this to feed data to a fully convolutional network.
    """
    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:
        - voc_dir: path to PASCAL VOC year dir
        - split: train / val / test
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)
        for PASCAL VOC semantic segmentation.
        example
        params = dict(voc_dir="/path/to/PASCAL/VOC2011",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="val")
        """
        # config
        params = eval(self.param_str)
        self.voc_dir = params['voc_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)
        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")
        # load indices for images and labels
        split_f  = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,
                self.split)#这个要求有./VOC2012/ImageSets/Segmentation/seg11valid.txt文件，同样把val.txt重命名
        self.indices = open(split_f, 'r').read().splitlines()
        self.idx = 0
        # make eval deterministic
        if 'train' not in self.split:
            self.random = False
        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)
    def reshape(self, bottom, top):
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)
    def forward(self, bottom, top):
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label
        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0
    def backward(self, top, propagate_down, bottom):
        pass
    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:
        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_
    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.
        The leading singleton dimension is required by the loss.
        """
        im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))
        label = np.array(im, dtype=np.uint8)
        label = label[np.newaxis, ...]
        return label
class SBDDSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from the SBDD extended labeling
    of PASCAL VOC for semantic segmentation
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """
    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:
        - sbdd_dir: path to SBDD `dataset` dir
        - split: train / seg11valid
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)
        for SBDD semantic segmentation.
        N.B.segv11alid is the set of segval11 that does not intersect with SBDD.
        Find it here: https://gist.github.com/shelhamer/edb330760338892d511e.
        example
        params = dict(sbdd_dir="/path/to/SBDD/dataset",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="valid")
        """
        # config
        params = eval(self.param_str)
        self.sbdd_dir = params['sbdd_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)
        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")
        # load indices for images and labels
        split_f  = '{}/{}.txt'.format(self.sbdd_dir,
                self.split)
        self.indices = open(split_f, 'r').read().splitlines()
        self.idx = 0
        # make eval deterministic
        if 'train' not in self.split:
            self.random = False
        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)
    def reshape(self, bottom, top):
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)
    def forward(self, bottom, top):
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 
    def backward(self, top, propagate_down, bottom):
        pass
    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:
        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_
    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.
        The leading singleton dimension is required by the loss.
        """
        import scipy.io
        mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))
        label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
        label = label[np.newaxis, ...]
        return label

    #def load_label(self, idx):
    #    im = Image.open('{}/cls/{}.png'.format(self.sbdd_dir, idx))
    #    label = np.array(im, dtype=np.uint8)
    #    label = label[np.newaxis, ...]
    #    return label
    #这样可以直接使用分割图片而不用转换为mat图，不过目前还没整明白分割图的制作，没有测试

目前卡在制作自己数据这里，老是说label的number和prediction的number不匹配。

还有就是labelme制作的标签使用skimage里的函数上色永远都是按顺序上色。等做完了再更。。。

能力有限。。。