欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

python+opencv进行表格识别并写入excel中

程序员文章站 2022-06-26 17:14:39
效果图如下:对于任意图标都不需要自定义模板,直接程序生成,不过需要注意,图中的表格必须是水平的,无法适配倾斜的表格。直接上代码:import cv2import numpy as npimport mathimport xlwtsrc='图片路径'raw = cv2.imread(src, 1)# 灰度图片gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)binary = cv2.adaptiveThreshold(~gray, 255, c...

效果图如下:
python+opencv进行表格识别并写入excel中
python+opencv进行表格识别并写入excel中
对于任意图标都不需要自定义模板,直接程序生成,不过需要注意,图中的表格必须是水平的,无法适配倾斜的表格。

直接上代码:

import cv2
import numpy as np
import math
import xlwt
src='图片路径'

raw = cv2.imread(src, 1)
# 灰度图片
gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, -5)
# 展示图片

rows, cols = binary.shape
scale2=15
scale = 20
# 自适应获取核值
# 识别横线:
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
kernel1 = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale2, 1))

eroded = cv2.erode(binary, kernel, iterations=1)
dilated_col = cv2.dilate(eroded, kernel1, iterations=1)
# cv2.imwrite("横线图.jpg", dilated_col)

# 识别竖线:
# scale = 40#scale越大,越能检测出不存在的线
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (1, rows // scale2))

kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, rows // scale))
eroded = cv2.erode(binary, kernel, iterations=1)
dilated_row = cv2.dilate(eroded, kernel2, iterations=1)
# cv2.imwrite("竖线图.jpg", dilated_row)

# cv2.imwrite("3.png", dilated_row)

# 将识别出来的横竖线合起来
bitwise_and = cv2.bitwise_and(dilated_col, dilated_row)#对二值图进行与操作
# cv2.imwrite("交点二值图.jpg", bitwise_and)

# 标识表格轮廓
merge = cv2.add(dilated_col, dilated_row)
ret,binary = cv2.threshold(merge, 127, 255, cv2.THRESH_BINARY)
_,contours, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
area=[]
for k in range(len(contours)):
	area.append(cv2.contourArea(contours[k]))
max_idx = np.argmax(np.array(area))
m_d_r=[]
m_u_l=[]
max_p=0
min_p=1e6
for  l1 in contours[max_idx]:
	for l2 in l1:
		if sum(l2)>max_p:
			max_p=sum(l2)
			d_r=l2
		if sum(l2)<min_p:
			min_p=sum(l2)
			u_l=l2
m_d_r=d_r
m_u_l=u_l
padding=5
x0=max(m_u_l[0]-padding,0)
x1=min(m_d_r[0]+padding,raw.shape[1])
y0=max(m_u_l[1]-padding,0)
y1=min(m_d_r[1]+padding,raw.shape[0])
bitwise_and_crop=bitwise_and[y0:y1,x0:x1]
merge=merge[y0:y1,x0:x1]
raw=raw[y0:y1,x0:x1]
# # 两张图片进行减法运算,去掉表格框线
# merge2 = cv2.subtract(binary, merge)
# cv2.imwrite("去表格图.jpg", merge2)

# new_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
# erode_image = cv2.morphologyEx(merge2, cv2.MORPH_OPEN, new_kernel)#先腐蚀,再膨胀
# cv2.imwrite('腐蚀膨胀后的图.jpg', erode_image)
# merge3 = cv2.add(erode_image, bitwise_and)
# cv2.imwrite('角点带字.jpg', merge3)

# 将焦点标识取出来
ys, xs = np.where(bitwise_and_crop > 0)
# print(xs)
# # print('---------------------------------')
# print(ys)
# 横纵坐标数组
y_point_arr = []
x_point_arr = []
# 通过排序,排除掉相近的像素点,只取相近值的最后一点
# 这个10就是两个像素点的距离,不是固定的,根据不同的图片会有调整,基本上为单元格表格的高度(y坐标跳变)和长度(x坐标跳变)
i = 0
sort_x_point = np.sort(xs)
# print(sort_x_point)
for i in range(len(sort_x_point) - 1):
    if sort_x_point[i + 1] - sort_x_point[i] > 3:
        x_point_arr.append(sort_x_point[i])
    i = i + 1
# 要将最后一个点加入
x_point_arr.append(sort_x_point[i])
i = 0
sort_y_point = np.sort(ys)
for i in range(len(sort_y_point) - 1):
    if sort_y_point[i + 1] - sort_y_point[i] > 3:
        y_point_arr.append(sort_y_point[i])
    i = i + 1
y_point_arr.append(sort_y_point[i])

h_list=[y_point_arr[i+1]-y_point_arr[i] for i in range(len(y_point_arr)-1)]
w_list=[x_point_arr[i+1]-x_point_arr[i] for i in range(len(x_point_arr)-1)]

col_alpha=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
print(h_list)
print(w_list)

import xlsxwriter
workbook = xlsxwriter.Workbook('chineseQA.xlsx')     #创建工作簿
worksheet = workbook.add_worksheet()
for i in range(len(w_list)):
	worksheet.set_column('{}:{}'.format(col_alpha[i],col_alpha[i]),w_list[i]/6)
for j in range(len(h_list)):
	worksheet.set_row(j,h_list[j])

def islianjie(p1,p2,img):#p的格式是先y后x
	if p1[0]==p2[0]:
		for i in range(min(p1[1],p2[1]),max(p1[1],p2[1])+1):

			if sum([img[j,i] for j in range(max(p1[0]-5,0),min(p1[0]+5,img.shape[0]))])==0:


				return False

		return True
	elif p1[1]==p2[1]:
		for i in range(min(p1[0],p2[0]),max(p1[0],p2[0])+1):
			if sum([img[i,j] for j in range(max(p1[1]-5,0),min(p1[1]+5,img.shape[1]))])==0:

				return False
		return True
	else:
		return False



class cell:
	def __init__(self,lt,rd,belong):
		self.lt=lt
		self.rd=rd
		self.belong=belong


lt_list_x=x_point_arr[:-1]
lt_list_y=y_point_arr[:-1]
rd_list_x=x_point_arr[1:]
rd_list_y=y_point_arr[1:]
d={}
for i in range(len(lt_list_x)):
	for j in range(len(lt_list_y)):
		d['cell_{}_{}'.format(i,j)]=cell([lt_list_x[i],lt_list_y[j]],[rd_list_x[i],rd_list_y[j]],[lt_list_x[i],lt_list_y[j]])
for i in range(len(lt_list_x)):
	for j in range(len(lt_list_y)):
		p1=[d['cell_{}_{}'.format(i,j)].rd[1],d['cell_{}_{}'.format(i,j)].lt[0]]#左下点
		p2=[d['cell_{}_{}'.format(i,j)].rd[1],d['cell_{}_{}'.format(i,j)].rd[0]]#右下点
		p3=[d['cell_{}_{}'.format(i,j)].lt[1],d['cell_{}_{}'.format(i,j)].rd[0]]#右上点

		if not islianjie(p1,p2,merge):
			d['cell_{}_{}'.format(i,j+1)].belong=d['cell_{}_{}'.format(i,j)].belong
		if not islianjie(p2,p3,merge):
			d['cell_{}_{}'.format(i+1,j)].belong=d['cell_{}_{}'.format(i,j)].belong

crop_list={}
for i in range(len(lt_list_x)):
	for j in range(len(lt_list_y)):
		crop_list['{},{}'.format(d['cell_{}_{}'.format(i,j)].belong[0],d['cell_{}_{}'.format(i,j)].belong[1])]=d['cell_{}_{}'.format(i,j)].rd
w_h_list=[]
zmax=0
zmin=1e6
zlt=[]
zrd=[]
for key in crop_list.keys():
	lt=[int(i) for i in key.split(',')]
	rd=crop_list[key]
	# print(lt,rd)
	if sum(rd)>zmax:
		zrd=rd
		zmax=sum(rd)
	if sum(lt)<zmin:
		zlt=lt
		zmin=sum(lt)
	cv2.imwrite('crop/{}.jpg'.format(key),raw[lt[1]:rd[1],lt[0]:rd[0]])


merge_format = workbook.add_format({
    'bold':     True,
    'border':   6,
    'align':    'center',#水平居中
    'valign':   'vcenter',#垂直居中
    'fg_color': '#D7E4BC',#颜色填充
})
for key in crop_list.keys():
	lt=[int(i) for i in key.split(',')]
	rd=crop_list[key]

	lt_=[lt[0]-zlt[0],lt[1]-zlt[1]]
	rd_=[rd[0]-zlt[0],rd[1]-zlt[1]]
	print(lt_)
	print(rd_)
	for i in range(len(w_list)+1):
		if lt_[0]==sum(w_list[:i]):
			lt_col=chr(ord('A')+i)
		if rd_[0]==sum(w_list[:i]):
			rd_col=chr(ord('A')+i-1)
	for i in range(len(h_list)+1):
		if lt_[1]==sum(h_list[:i]):
			lt_row=i+1
		if rd_[1]==sum(h_list[:i]):
			rd_row=i
	if lt_col==rd_col and lt_row==rd_row:
		worksheet.write('{}{}'.format(lt_col,lt_row),'',merge_format)
	else:
		worksheet.merge_range('{}{}:{}{}'.format(lt_col,lt_row,rd_col,rd_row),'',merge_format)



workbook.close()



本文地址:https://blog.csdn.net/sxl1399504891/article/details/110880866

相关标签: opencl cv