基于Python实现PDF区域文本提取工具
程序员文章站
2022-06-10 12:59:13
目录:功能简介、开发代码。功能简介:打开软件后界面如下;点击打开文件按钮打开之前的pdf文件后效果如下;框选区域后,标题栏会自动显示当前框选的区域提取到的文字,还可以左右按钮切换;实际我们需要提取文字的区域可...
功能简介
打开软件后界面如下:
点击打开文件按钮打开之前的pdf文件后效果如下:
框选区域后,标题栏会自动显示当前框选的区域提取到的文字,还可以通过左右按钮切换页面:
实际我们需要提取文字的区域可能不止这一个,所以程序支持多区域框选:
完成区域框选后就可以点击保存文件,将pdf每页提取到的文本保存到一个csv文件中,当前选区的保存结果如下:
可以看到已经按框选顺序依次保存了每一个区域的字符串。
如果选择区域时发现提取结果不准确,可以撤销后重新选择:
保存图片则会将pdf的每页的整体保存为一张图片,未选择区域时,以页码为文件名保存图片:
选择区域后,会自动以最后一个区域提取到的文本作为当前页的文件名:
开发代码
当然这个项目由于本人是第一次使用wxPython,功能非常简约,现在将完整代码开源出来,期待各位大佬的改进。
源码和已编译工具下载地址:
https://codechina.csdn.net/as604049322/python_gui
完整代码:
""" 小小明的代码 csdn主页:https://blog.csdn.net/as604049322 """ __author__ = '小小明' __time__ = '2021/11/24' import csv import wx import os import fitz class mycanvas(wx.panel): def __init__(self, parent): wx.panel.__init__(self, parent) self.parent = parent self.rects = [] self.bind(wx.evt_left_down, self.onleftbuttonevent) self.bind(wx.evt_left_up, self.onleftbuttonevent) self.bind(wx.evt_motion, self.onleftbuttonevent) self.bind(wx.evt_paint, self.dodrawing) b = wx.button(self, -1, "打开文件", (0, 0)) self.bind(wx.evt_button, self.onbutton, b) b = wx.button(self, -1, "保存文件", (75, 0)) self.bind(wx.evt_button, self.save_file, b) b = wx.button(self, -1, "保存图片", (150, 0)) self.bind(wx.evt_button, self.save_img, b) b = wx.button(self, -1, "撤销选区", (225, 0)) self.bind(wx.evt_button, self.back_select, b) b = wx.button(self, -1, "《", (300, 0), size=(25, 25)) self.bind(wx.evt_button, self.previous, b) b = wx.button(self, -1, "》", (325, 0), size=(25, 25)) self.bind(wx.evt_button, self.next, b) self.g1 = wx.gauge(self, -1, 100, (0, 30), (-1, 100), wx.ga_vertical) def previous(self, evt): if not hasattr(self, "pdfdoc"): return if self.i > 0: self.i -= 1 self.change_pdf_page(self.i, false) self.dodrawing(-1) if self.rects: self.parent.settitle(self.path + "|" + self.extract_pdf_text()) def next(self, evt): if not hasattr(self, "pdfdoc"): return if self.i < self.pagecount - 1: self.i += 1 self.change_pdf_page(self.i, false) self.dodrawing(-1) if self.rects: self.parent.settitle(self.path + "|" + self.extract_pdf_text()) def back_select(self, evt): if self.rects: self.rects.pop() self.dodrawing(-1) def onbutton(self, evt): dlg = wx.filedialog( self, message="选择一个pdf文件", defaultdir=os.getcwd(), defaultfile="", wildcard="pdf文件(*.pdf)|*.pdf", style=wx.fd_open | wx.fd_change_dir | wx.fd_file_must_exist | wx.fd_preview ) if dlg.showmodal() == wx.id_ok: self.rects = [] path = dlg.getpath() self.pdfdoc = fitz.open(path) self.i = 0 self.pagecount = self.pdfdoc.pagecount self.change_pdf_page(self.i) 
self.path = os.path.basename(path) self.parent.settitle(self.path) self.dodrawing(-1) dlg.destroy() def change_pdf_page(self, i, move=true): page = self.pdfdoc[i] rect = page.rect print("pdf范围:", rect) mat = fitz.matrix(1, 1) pix = page.get_pixmap(matrix=mat, alpha=false, clip=rect) pix.save("tmp.png") self.change_img("tmp.png", move) def save_filedialog(self, format="csv"): dlg = wx.filedialog( self, message=f"保存一个{format}文件", defaultdir=os.getcwd(), defaultfile="", wildcard=f"{format}文件(*.{format})|*.{format}", style=wx.fd_save | wx.fd_overwrite_prompt ) path = none if dlg.showmodal() == wx.id_ok: path = dlg.getpath() dlg.destroy() return path def save_img(self, evt): if not hasattr(self, "pdfdoc"): return dlg = wx.dirdialog(self, "选择图片保存的文件夹:", style=wx.dd_default_style # | wx.dd_dir_must_exist # | wx.dd_change_dir ) mat = fitz.matrix(1, 1) if dlg.showmodal() == wx.id_ok: path = dlg.getpath() for i in range(self.pdfdoc.pagecount): page = self.pdfdoc[i] clip = page.rect pix = page.get_pixmap(matrix=mat, alpha=false, clip=clip) if self.rects: name = self.extract_pdf_text(page=page, rect=self.rects[-1]) else: name = f"p{i:0>3d}" pix.save(f"{path}/{name}.png") self.g1.setvalue((i + 1) * 100 // self.pdfdoc.pagecount) dlg.destroy() os.system(f"explorer {path}") def save_file(self, evt): if not hasattr(self, "pdfdoc"): return path = self.save_filedialog() if path is none: return data = [] for i in range(self.pdfdoc.pagecount): page = self.pdfdoc[i] row = [self.extract_pdf_text(page, rect) for i, rect in enumerate(self.rects)] data.append(row) with open(path, "w") as f: writer = csv.writer(f, lineterminator="\n") row = [f"区域{i}" for i in range(1, len(row) + 1)] writer.writerow(row) for row in data: writer.writerow(row) os.system(f"cmd /c start {path}") def extract_pdf_text(self, page=none, rect=none): if page is none: page = self.pdfdoc[self.i] if rect is none: rect = self.rects[-1] a, b, c, d = rect clip = fitz.rect(a, b, a + c, b + d) text = 
page.get_text(clip=clip).strip() return text def change_img(self, img_path, move=true): self.bmp = wx.bitmap(img_path) self.setsize(self.bmp.getsize()) self.parent.setsize(self.parent.getbestsize()) if move: self.parent.center() def dodrawing(self, evt): if not hasattr(self, "bmp"): return dc = wx.clientdc(self) dc.drawbitmap(self.bmp, 0, 0, true) dc.setpen(wx.pen('blue')) dc.setbrush(wx.brush('white', wx.brushstyle_transparent)) dc.drawrectanglelist(self.rects) def onleftbuttonevent(self, event): if event.leftdown(): self.x, self.y = event.getposition() self.rects.append([self.x, self.y, 0, 0]) elif event.dragging(): x, y = event.getposition() self.rects[-1][2] = x - self.x self.rects[-1][3] = y - self.y self.dodrawing(-1) elif event.leftup(): print(self.rects) if self.rects[-1][2] < 5 or self.rects[-1][3] < 5: self.rects.pop() else: self.parent.settitle(self.path + "|" + self.extract_pdf_text()) app = wx.app() frm = wx.frame(none) pnl = mycanvas(frm) frm.center() frm.show() frm.settitle("pdf文本提取器") app.mainloop()
以上就是基于python实现pdf区域文本提取工具的详细内容,更多关于python pdf文本提取的资料请关注其它相关文章!