python读取pdf文件代码示例
程序员文章站
2022-03-11 10:13:28
摘要:Python 读取 PDF 文件的包有 pdfminer、pdfminer3k、pdfplumber 等,其他的我没有试过。pdfminer 示例:import io;from pdfminer.pdfparser import PDFParser;from pdfminer.pdfdocument import PDFDocument;from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed;from pdfminer.pdfinterp……
Python 读取 PDF 文件的包有 pdfminer、pdfminer3k、pdfplumber 等,其他的包我没有试过。
pdfminer
import io
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
class PDFUtils:
    """Helper for pulling plain text out of PDF files with pdfminer."""

    def pdf2txt(self, path):
        """Return the text of every page of the PDF at *path* as one string.

        Args:
            path: Filesystem path of the PDF; it is opened in binary mode.

        Returns:
            The concatenation of ``get_text()`` for every layout object
            that provides that method, in page order.

        Raises:
            PDFTextExtractionNotAllowed: If the document reports that it
                is not extractable.
        """
        # Both resources are managed by ``with`` so the buffer is released
        # even when parsing or page interpretation raises.
        with io.StringIO() as output, open(path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument(parser)
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            pdfrm = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(pdfrm, laparams=laparams)
            interpreter = PDFPageInterpreter(pdfrm, device)

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # The aggregator exposes the layout of the page that was
                # just processed.
                layout = device.get_result()
                for element in layout:
                    # Only some layout objects carry text; skip the rest.
                    if hasattr(element, "get_text"):
                        output.write(element.get_text())

            # Read the value before the ``with`` block closes the buffer.
            return output.getvalue()
if __name__ == '__main__':
    # Demo entry point: extract the sample PDF and print its text.
    extracted = PDFUtils().pdf2txt('测试.pdf')
    print(extracted)
pdfminer3k
from io import StringIO
from io import open
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
def read_pdf(pdf):
    """Extract the text of an already-opened PDF using pdfminer3k.

    Args:
        pdf: File object for the PDF, passed unchanged to ``process_pdf``
            (the demo caller opens it in ``"rb"`` mode).

    Returns:
        Everything the text converter wrote, as a single string.
    """
    p_manager = PDFResourceManager()
    # ``with`` guarantees the buffer is closed on every exit path.
    with StringIO() as out_fp:
        device = TextConverter(p_manager, out_fp, laparams=LAParams())
        try:
            process_pdf(p_manager, device, pdf)
        finally:
            # Close the converter even if processing fails part-way.
            device.close()
        # Read the value before the buffer is closed.
        return out_fp.getvalue()
if __name__ == '__main__':
    # Demo entry point: open the sample PDF in binary mode and print its text.
    with open('测试.pdf', "rb") as pdf_file:
        extracted = read_pdf(pdf_file)
        print(extracted)
pdfplumber
import pdfplumber
import pandas as pd
with pdfplumber.open("测试.pdf") as pdf:
    # ``pdf.pages`` is a zero-based list, so the first page is index 0.
    # (Index 1 would be the second page and fails on a one-page PDF.)
    page = pdf.pages[0]
    text = page.extract_text()
    print(text)
    tables = page.extract_tables()
    for rows in tables:
        if not rows:
            # Nothing to use as a header row; skip empty tables.
            continue
        # Each table is a nested list; a DataFrame is easier to inspect
        # and analyse. The first row is used as the column header.
        df = pd.DataFrame(rows[1:], columns=rows[0])
        print(df)
本文地址:https://blog.csdn.net/q389797999/article/details/110230503
下一篇: MySQL优化教程之超大分页查询