欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

基于Python实现PDF区域文本提取工具

程序员文章站 2022-06-10 12:59:13
目录功能简介开发代码功能简介打开软件后界面如下:点击打开文件按钮打开之前的pdf文件后效果如下:框选区域后,标题栏会自动显示当前框选的区域提取到的文字,还可以左右按钮切换:实际我们需要提取文字的区域可...

功能简介

打开软件后界面如下:

基于Python实现PDF区域文本提取工具

点击打开文件按钮打开之前的pdf文件后效果如下:

基于Python实现PDF区域文本提取工具

框选区域后,标题栏会自动显示当前框选的区域提取到的文字,还可以左右按钮切换:

基于Python实现PDF区域文本提取工具

实际我们需要提取文字的区域可能不止这一个,所以程序支持多区域框选:

基于Python实现PDF区域文本提取工具

完成区域框选后就可以点击保存文件,将pdf每页提取到的文本保存到一个csv文件中,当前选区的保存结果如下:

基于Python实现PDF区域文本提取工具

可以看到已经按框选顺序依次保存了每一个区域的字符串。

如果选择区域时发现提取结果不准确,可以撤销后重新选择:

基于Python实现PDF区域文本提取工具

保存图片则会将pdf的每页的整体保存为一张图片,未选择区域时,以页码为文件名保存图片:

基于Python实现PDF区域文本提取工具

选择区域时,会自动提取最后一个区域提取的文本作为当前页的文件名:

基于Python实现PDF区域文本提取工具

开发代码

当然,由于本人是第一次使用wxPython,这个项目的功能非常简约,现在将完整代码开源出来,期待各位大佬的改进。

源码和已编译工具下载地址:

https://codechina.csdn.net/as604049322/python_gui

完整代码:

"""
小小明的代码
csdn主页:https://blog.csdn.net/as604049322
"""
__author__ = '小小明'
__time__ = '2021/11/24'

import csv

import wx
import os
import fitz


class mycanvas(wx.Panel):
    """Panel that renders one PDF page and lets the user drag out text regions.

    The panel shows the current page as a bitmap, keeps the list of
    rectangles the user has drawn (``self.rects``, each ``[x, y, w, h]`` in
    panel pixels) and offers buttons to open a PDF, export the text found
    inside every rectangle to a CSV file, export every page as a PNG image,
    undo the last rectangle and flip between pages.

    Pages are rendered with an identity matrix, so panel pixel coordinates
    are passed to PyMuPDF unchanged when clipping text.
    """

    # Characters replaced by "_" when extracted text is used as a file name.
    _FILENAME_UNSAFE = '\\/:*?"<>|\r\n\t'

    def __init__(self, parent):
        """Create the toolbar buttons and progress gauge and bind the events.

        Args:
            parent: the window hosting this panel; its title bar is used to
                display the file name and the text of the latest selection.
        """
        wx.Panel.__init__(self, parent)
        self.parent = parent
        self.rects = []
        # Document state. Everything is initialised up front so handlers can
        # test for "no file opened yet" instead of hitting AttributeError.
        self.pdfdoc = None
        self.bmp = None
        self.path = ""
        self.i = 0
        self.pagecount = 0
        # True only between a left-button press and its release on the panel.
        self._selecting = False

        self.Bind(wx.EVT_LEFT_DOWN, self.onleftbuttonevent)
        self.Bind(wx.EVT_LEFT_UP, self.onleftbuttonevent)
        self.Bind(wx.EVT_MOTION, self.onleftbuttonevent)
        self.Bind(wx.EVT_PAINT, self.dodrawing)

        # (label, position, size, handler) for each toolbar button.
        buttons = (
            ("打开文件", (0, 0), wx.DefaultSize, self.onbutton),
            ("保存文件", (75, 0), wx.DefaultSize, self.save_file),
            ("保存图片", (150, 0), wx.DefaultSize, self.save_img),
            ("撤销选区", (225, 0), wx.DefaultSize, self.back_select),
            ("《", (300, 0), (25, 25), self.previous),
            ("》", (325, 0), (25, 25), self.next),
        )
        for label, pos, size, handler in buttons:
            button = wx.Button(self, -1, label, pos, size=size)
            self.Bind(wx.EVT_BUTTON, handler, button)

        # Vertical gauge reporting progress of the image export.
        self.g1 = wx.Gauge(self, -1, 100, (0, 30), (-1, 100), wx.GA_VERTICAL)

    def _show_selection_text(self):
        """Put ``<file name>|<text of the last rectangle>`` in the title bar."""
        self.parent.SetTitle(self.path + "|" + self.extract_pdf_text())

    def _goto_page(self, index):
        """Display page ``index`` if a document is open and the index is valid."""
        if self.pdfdoc is None or not 0 <= index < self.pagecount:
            return
        self.i = index
        self.change_pdf_page(self.i, False)
        self.dodrawing(-1)
        if self.rects:
            self._show_selection_text()

    @staticmethod
    def _open_with_default_app(path):
        """Open ``path`` with the desktop's default handler where supported.

        ``os.startfile`` receives the path as a single argument, so paths
        containing spaces or shell metacharacters are handled correctly. It
        only exists on Windows; elsewhere this is a no-op.
        """
        if hasattr(os, "startfile"):
            os.startfile(path)

    @classmethod
    def _safe_filename(cls, text, fallback):
        """Return ``text`` made usable as a file name, or ``fallback`` if empty."""
        cleaned = "".join(
            "_" if ch in cls._FILENAME_UNSAFE else ch for ch in text
        ).strip(" .")
        return cleaned or fallback

    def previous(self, evt):
        """Button handler: show the page before the current one, if any."""
        self._goto_page(self.i - 1)

    def next(self, evt):
        """Button handler: show the page after the current one, if any."""
        self._goto_page(self.i + 1)

    def back_select(self, evt):
        """Button handler: drop the most recently drawn rectangle and redraw."""
        if self.rects:
            self.rects.pop()
            self.dodrawing(-1)

    def onbutton(self, evt):
        """Button handler: ask for a PDF file, open it and show its first page."""
        dlg = wx.FileDialog(
            self, message="选择一个pdf文件",
            defaultDir=os.getcwd(),
            defaultFile="",
            wildcard="pdf文件(*.pdf)|*.pdf",
            style=wx.FD_OPEN | wx.FD_CHANGE_DIR |
                  wx.FD_FILE_MUST_EXIST | wx.FD_PREVIEW
        )
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            path = dlg.GetPath()
        finally:
            dlg.Destroy()

        # Open the new document first so a failure leaves the old one intact.
        new_doc = fitz.open(path)
        if self.pdfdoc is not None:
            self.pdfdoc.close()
        self.pdfdoc = new_doc
        self.rects = []
        self.i = 0
        self.pagecount = len(self.pdfdoc)
        self.path = os.path.basename(path)
        self.change_pdf_page(self.i)
        self.parent.SetTitle(self.path)
        self.dodrawing(-1)

    def change_pdf_page(self, i, move=True):
        """Render page ``i`` to ``tmp.png`` and load it as the panel bitmap.

        Args:
            i: zero-based page index into the open document.
            move: forwarded to :meth:`change_img`; when true the parent
                window is re-centred after resizing.
        """
        page = self.pdfdoc[i]
        rect = page.rect
        print("pdf范围:", rect)
        # Identity matrix: one PDF point maps to one pixel, which is what
        # lets extract_pdf_text() use panel coordinates directly.
        mat = fitz.Matrix(1, 1)
        pix = page.get_pixmap(matrix=mat, alpha=False, clip=rect)
        # Relative path: the image lands in the current working directory.
        pix.save("tmp.png")
        self.change_img("tmp.png", move)

    def save_filedialog(self, format="csv"):
        """Show a "save as" dialog for files with the given extension.

        Args:
            format: file extension (without dot) used in the prompt and filter.

        Returns:
            The chosen path, or ``None`` if the dialog was cancelled.
        """
        dlg = wx.FileDialog(
            self, message=f"保存一个{format}文件", defaultDir=os.getcwd(),
            defaultFile="", wildcard=f"{format}文件(*.{format})|*.{format}",
            style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT
        )
        try:
            if dlg.ShowModal() == wx.ID_OK:
                return dlg.GetPath()
            return None
        finally:
            dlg.Destroy()

    def save_img(self, evt):
        """Button handler: export every page of the document as a PNG file.

        When at least one rectangle exists, each page's file name is the text
        found inside the last rectangle on that page (sanitised); otherwise,
        or when that text is empty, a page-number based name is used.
        """
        if self.pdfdoc is None:
            return
        dlg = wx.DirDialog(self, "选择图片保存的文件夹:",
                           style=wx.DD_DEFAULT_STYLE)
        try:
            if dlg.ShowModal() != wx.ID_OK:
                # Cancelled: nothing to export and no folder to open.
                return
            path = dlg.GetPath()
        finally:
            dlg.Destroy()

        mat = fitz.Matrix(1, 1)
        total = len(self.pdfdoc)
        for i in range(total):
            page = self.pdfdoc[i]
            pix = page.get_pixmap(matrix=mat, alpha=False, clip=page.rect)
            name = f"p{i:0>3d}"
            if self.rects:
                text = self.extract_pdf_text(page=page, rect=self.rects[-1])
                name = self._safe_filename(text, name)
            pix.save(os.path.join(path, f"{name}.png"))
            self.g1.SetValue((i + 1) * 100 // total)
        self._open_with_default_app(path)

    def save_file(self, evt):
        """Button handler: write the text of every rectangle on every page to CSV.

        The file has one header row (``区域1``, ``区域2``, ...) followed by one
        row per page, with one column per rectangle in drawing order.
        """
        if self.pdfdoc is None:
            return
        path = self.save_filedialog()
        if path is None:
            return
        header = [f"区域{n}" for n in range(1, len(self.rects) + 1)]
        data = [
            [self.extract_pdf_text(page, rect) for rect in self.rects]
            for page in self.pdfdoc
        ]
        # newline="" is what the csv module requires of its file objects;
        # the BOM lets spreadsheet applications detect UTF-8.
        with open(path, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(data)
        self._open_with_default_app(path)

    def extract_pdf_text(self, page=None, rect=None):
        """Return the stripped text found inside ``rect`` on ``page``.

        Args:
            page: a PyMuPDF page; defaults to the page currently displayed.
            rect: ``(x, y, width, height)``; defaults to the last rectangle
                in ``self.rects``.
        """
        if page is None:
            page = self.pdfdoc[self.i]
        if rect is None:
            rect = self.rects[-1]
        left, top, width, height = rect
        clip = fitz.Rect(left, top, left + width, top + height)
        return page.get_text(clip=clip).strip()

    def change_img(self, img_path, move=True):
        """Load ``img_path`` as the page bitmap and resize panel and parent to fit.

        Args:
            img_path: path of the image file to display.
            move: when true, centre the parent window after resizing.
        """
        self.bmp = wx.Bitmap(img_path)
        self.SetSize(self.bmp.GetSize())
        self.parent.SetSize(self.parent.GetBestSize())
        if move:
            self.parent.Center()

    def dodrawing(self, evt):
        """Paint the page bitmap and all selection rectangles.

        Used both as the ``EVT_PAINT`` handler and as a direct redraw call
        (callers pass ``-1`` in that case).
        """
        # wxWidgets requires a PaintDC to be created inside a paint handler,
        # even if nothing is drawn; direct calls must use a ClientDC instead.
        if isinstance(evt, wx.PaintEvent):
            dc = wx.PaintDC(self)
        else:
            dc = wx.ClientDC(self)
        if self.bmp is None:
            return
        dc.DrawBitmap(self.bmp, 0, 0, True)
        dc.SetPen(wx.Pen('blue'))
        dc.SetBrush(wx.Brush('white', wx.BRUSHSTYLE_TRANSPARENT))
        if self.rects:
            dc.DrawRectangleList(self.rects)

    def onleftbuttonevent(self, event):
        """Mouse handler: start, resize and finish a selection rectangle.

        A rectangle narrower or shorter than 5 pixels (which includes one
        dragged up or to the left) is discarded on release.
        """
        if event.LeftDown():
            if self.pdfdoc is None:
                # No page to select on yet.
                return
            self.x, self.y = event.GetPosition()
            self.rects.append([self.x, self.y, 0, 0])
            self._selecting = True
        elif event.Dragging():
            # Ignore drags that did not begin with a left press on the panel;
            # otherwise rects[-1] may not exist or may be an older rectangle.
            if not (self._selecting and event.LeftIsDown()):
                return
            x, y = event.GetPosition()
            self.rects[-1][2] = x - self.x
            self.rects[-1][3] = y - self.y
            self.dodrawing(-1)
        elif event.LeftUp():
            if not self._selecting:
                return
            self._selecting = False
            print(self.rects)
            if self.rects[-1][2] < 5 or self.rects[-1][3] < 5:
                self.rects.pop()
                # Erase whatever outline was drawn while dragging.
                self.dodrawing(-1)
            else:
                self._show_selection_text()

# Build the wx application, host the canvas panel in a plain frame, show the
# frame centred on screen and hand control to the wx event loop.
app = wx.App()
frm = wx.Frame(None)
pnl = mycanvas(frm)
frm.Center()
frm.Show()
frm.SetTitle("pdf文本提取器")
app.MainLoop()

以上就是基于Python实现PDF区域文本提取工具的详细内容,更多关于Python PDF文本提取的资料请关注其它相关文章!