python生成lmdb格式的文件实例
程序员文章站
2022-06-08 21:58:10
在crnn训练的时候需要用到lmdb格式的数据集,下面是python生成lmdb个是数据集的代码,注意一定要在linux系统下,否则会读入图像的时候出问题,可能遇到的问题都...
在crnn训练的时候需要用到lmdb格式的数据集,下面是python生成lmdb个是数据集的代码,注意一定要在linux系统下,否则会读入图像的时候出问题,可能遇到的问题都在代码里面注释了,看代码即可。
# -*- coding: utf-8 -*-
"""Build an LMDB dataset for CRNN training from images and label files.

Every ``<name>.jpg`` image is expected to sit next to a ``<name>.txt`` file
that holds its ground-truth text. The code runs on both Python 2 and 3.
"""
import glob
import io
import os

import cv2
import lmdb  # third-party: install it first with `pip install lmdb`
import numpy as np

# Size of the LMDB memory map in bytes (8 GiB). The script originally used
# 1 TiB (1099511627776); it was lowered because the larger value failed with
# an "insufficient disk space" error.
DEFAULT_MAP_SIZE = 8589934592


def _to_bytes(value):
    """Return *value* as bytes, UTF-8 encoding it when it is text."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def checkimageisvalid(imagebin):
    """Return True when *imagebin* decodes to a non-empty image.

    Args:
        imagebin: raw encoded image file content (bytes), or None.

    Returns:
        False for None/empty input, undecodable data, or an image whose
        height or width is zero; True otherwise.
    """
    # Reject None and empty buffers up front: there is nothing to decode.
    if not imagebin:
        return False
    imagebuf = np.frombuffer(imagebin, dtype=np.uint8)
    img = cv2.imdecode(imagebuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgh, imgw = img.shape[0], img.shape[1]
    return imgh * imgw != 0


def writecache(env, cache):
    """Write every key/value pair of *cache* to *env* in one transaction.

    Args:
        env: an open LMDB environment.
        cache: dict of entries to store; text keys and values are UTF-8
            encoded because LMDB stores raw bytes.
    """
    with env.begin(write=True) as txn:
        for key, value in cache.items():
            txn.put(_to_bytes(key), _to_bytes(value))


def createdataset(outputpath, imagepathlist, labellist, lexiconlist=None,
                  checkvalid=True, map_size=DEFAULT_MAP_SIZE):
    """Create an LMDB dataset for CRNN training.

    Entries are stored under the keys ``image-%09d``, ``label-%09d`` and
    (optionally) ``lexicon-%09d``, numbered from 1, plus ``num-samples``
    holding the number of samples actually written.

    Args:
        outputpath: LMDB output path.
        imagepathlist: list of image paths.
        labellist: list of the corresponding ground-truth texts.
        lexiconlist: (optional) list of lexicon lists, one per image.
        checkvalid: if True, skip every image that fails to decode.
        map_size: size of the LMDB memory map in bytes.
    """
    assert len(imagepathlist) == len(labellist), (
        'imagepathlist and labellist must have the same length')
    nsamples = len(imagepathlist)
    print('...................')
    env = lmdb.open(outputpath, map_size=map_size)
    try:
        cache = {}
        cnt = 1
        for i, (imagepath, label) in enumerate(zip(imagepathlist, labellist)):
            if not os.path.exists(imagepath):
                print('%s does not exist' % imagepath)
                continue
            # Binary mode is essential: text mode corrupts image data on
            # Windows, which then makes every image look invalid.
            with open(imagepath, 'rb') as f:
                imagebin = f.read()
            if checkvalid and not checkimageisvalid(imagebin):
                print('%s is not a valid image' % imagepath)
                continue

            cache['image-%09d' % cnt] = imagebin
            cache['label-%09d' % cnt] = label
            if lexiconlist:
                cache['lexicon-%09d' % cnt] = ' '.join(lexiconlist[i])
            # Flush periodically so the in-memory cache stays small.
            if cnt % 1000 == 0:
                writecache(env, cache)
                cache = {}
                print('written %d / %d' % (cnt, nsamples))
            cnt += 1
        # cnt is one past the last sample that was actually stored.
        nsamples = cnt - 1
        cache['num-samples'] = str(nsamples)
        writecache(env, cache)
    finally:
        env.close()
    print('created dataset with %d samples' % nsamples)


def read_text(path):
    """Return the content of the text file at *path*, stripped of whitespace.

    The file is decoded as UTF-8; a leading byte-order mark is dropped.
    """
    with io.open(path, encoding='utf-8-sig') as f:
        return f.read().strip()


def main():
    """Collect image/label pairs from the source folder and build the LMDB."""
    # LMDB output directory. Run the script twice, once for the training set
    # and once for the validation set.
    outputpath = 'd:/ruanjianxiazai/tuxiangyangben/fengehou/train'
    # The .txt label files and the .jpg images live in the same folder.
    path = "d:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg"
    imagepathlist = glob.glob(path)
    print('------------ %d ------------' % len(imagepathlist))

    imglabellists = []
    for p in imagepathlist:
        # Swap only the extension; str.replace would also rewrite a '.jpg'
        # that appears earlier in the path.
        labelpath = os.path.splitext(p)[0] + '.txt'
        try:
            imglabellists.append((p, read_text(labelpath)))
        except (IOError, OSError, UnicodeDecodeError) as err:
            # Best effort: an image without a readable label is skipped.
            print('skipping %s: cannot read label (%s)' % (p, err))

    # Sort the samples by label length.
    imglabellist = sorted(imglabellists, key=lambda x: len(x[1]))
    imgpaths = [p[0] for p in imglabellist]
    txtlists = [p[1] for p in imglabellist]

    createdataset(outputpath, imgpaths, txtlists, lexiconlist=None,
                  checkvalid=True)


if __name__ == '__main__':
    main()
以上这篇python生成lmdb格式的文件实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持。
上一篇: 检测远程主机上的某个端口是否开启——telnet命令
下一篇: Jquery UI tabs