欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员、计算机编程方面的知识。
您现在的位置是: 首页

python wkhtmltopdf 错误

程序员文章站 2024-01-28 09:17:22
...

使用wkhtmltopdf将网页转成pdf的时候一直出现错误,无法解决。

错误一:
Warning: A finished ResourceObject received a loading progress signal. This might be an indication of an iframe taking too long to load.

Loading pages (1/6)
[>                                                           ] 0%[======>                                                     ] 10%[==============================>                             ] 50%[============================================================] 100%[======>                                                     ] 10% [========>                                                   ] 14%[===========>                                                ] 19%[=============>                                              ] 23%[===============>                                            ] 26%[================>                                           ] 28%[==================>                                         ] 31%Warning: A finished ResourceObject received a loading progress signal. This might be an indication of an iframe taking too long to load.
Warning: A finished ResourceObject received a loading finished signal. This might be an indication of an iframe taking too long to load.
Counting pages (2/6)
[============================================================] Object 1 of 1Resolving links (4/6)                                                       
[============================================================] Object 1 of 1Loading headers and footers (5/6)                                           
Printing pages (6/6)
[>                                                           ] PreparingDone                                                                    

错误二:
ContentNotFoundError
OSError
IOError

Traceback (most recent call last):
  File "byszh.py", line 96, in <module>
    main()
  File "byszh.py", line 92, in main
    parsePage(url1)
  File "byszh.py", line 61, in parsePage
    savePdf(pure_url,file_path)
  File "byszh.py", line 87, in savePdf
    pdfkit.from_url(url,path,options=options, configuration=config)
  File "D:\Python 3\lib\site-packages\pdfkit\api.py", line 26, in from_url
    return r.to_pdf(output_path)
  File "D:\Python 3\lib\site-packages\pdfkit\pdfkit.py", line 156, in to_pdf
    raise IOError('wkhtmltopdf reported an error:\n' + stderr)
OSError: wkhtmltopdf reported an error:
Loading pages (1/6)
[>                                                           ] 0%[======>                                                     ] 10%[==============================>                             ] 50%[============================================================] 100%[======>                                                     ] 10% Error: Failed to load http://law.shufe.edu.cn/show.aspx?info_lb=10&flag=10&info_id=4356, with network status code 203 and http status code 404 - Error downloading http://law.shufe.edu.cn/show.aspx?info_lb=10&flag=10&info_id=4356 - server replied: Not Found
[==============================>                             ] 50%[============================================================] 100%Counting pages (2/6)                                               
[============================================================] Object 1 of 1Resolving links (4/6)                                                       
[============================================================] Object 1 of 1Loading headers and footers (5/6)                                           
Printing pages (6/6)
[>                                                           ] Preparing[============================================================] Page 1 of 1Done                                                                      
Exit with code 1 due to network error: ContentNotFoundError


出错代码
可以获得url链接和学校名称,但是在savePdf(pure_url,file_path)这一步出错了,一直无法解决。

import requests
from bs4 import BeautifulSoup
import re
import traceback

from PyPDF2 import PdfFileMerger
import pdfkit

import os
import urllib.request
import time

def getHTMLSoup(url,code="UTF-8"):
	"""Download a page and return its body as decoded text.

	Args:
		url: Address of the page to fetch.
		code: Kept for backward compatibility; it is not used. The body is
			decoded with the encoding requests detects from the content
			(``apparent_encoding``).

	Returns:
		The response text, or ``None`` when the request fails (connection
		problem, timeout, or an HTTP error status).
	"""
	headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
	try:
		# Without a timeout a single unresponsive server would hang the run forever.
		r = requests.get(url, headers=headers, timeout=30)
		r.raise_for_status()
	except requests.RequestException:
		# Only network/HTTP failures are treated as "fetch failed"; programming
		# errors and KeyboardInterrupt are no longer swallowed.
		print("爬取失败")
		return None

	r.encoding = r.apparent_encoding
	return r.text

def parsePage(url, base_dir='C:/Users/Administrator/Desktop/test/'):
	"""Save every notice linked from a listing page as a PDF, grouped by school.

	The page is scanned for ``<a>`` tags whose class matches "external" and
	for text nodes ending in ":" (used as titles). The first link and the
	first title are skipped, and the rest are paired by position.
	NOTE(review): presumably the first entries belong to the article header
	rather than to a school -- confirm against the real pages.

	Args:
		url: Address of the listing page.
		base_dir: Directory under which ``<school>/入营名单/`` folders are
			created. Defaults to the path the script originally hard-coded.

	Returns:
		None. PDFs are written to disk; failures are reported on stdout.
	"""
	htmlText = getHTMLSoup(url, 'GB2312')
	if htmlText is None:
		# getHTMLSoup already reported the failure; there is nothing to parse.
		return

	soup = BeautifulSoup(htmlText, "html.parser")
	urlList = soup.find_all('a', class_=re.compile("external"))
	titles = soup.find_all(string=re.compile('.{1,20}.{1,20}:$'))

	# Compiled once instead of on every iteration.
	dash_re = re.compile('-')
	numbering_re = re.compile(r'\d、')
	suffix_re = re.compile('-.{1,20}:$')

	# zip() stops at the shorter list, so a page with fewer links than titles
	# no longer raises IndexError.
	for link, title in zip(urlList[1:], titles[1:]):
		pure_url = link.get('href')
		if not pure_url:
			# An anchor without a target cannot be converted.
			continue

		# Title with the trailing colon, dashes and "N、" numbering removed.
		pure_title = numbering_re.sub('', dash_re.sub('', title.strip(':')))
		# School name: the title with its "-...:" suffix removed.
		school = suffix_re.sub('', title)

		# makedirs creates the base directory and both levels in one call and
		# is a no-op when they already exist.
		school_path = os.path.join(base_dir, school, '入营名单')
		os.makedirs(school_path, exist_ok=True)

		# Write into the school's folder; previously the folder was created but
		# the PDF was saved to the current working directory.
		file_path = os.path.join(school_path, pure_title + '2019年夏令营' + '入营名单.pdf')

		try:
			savePdf(pure_url, file_path)
		except OSError as err:
			# pdfkit raises IOError (an alias of OSError) whenever wkhtmltopdf
			# reports an error, e.g. a 404 while loading. Report it and keep
			# going so one bad link does not abort the remaining notices.
			print("保存PDF失败:", pure_url, err)


def savePdf(url,path):
	"""Render the page at ``url`` to a PDF file at ``path`` via pdfkit.

	The wkhtmltopdf executable is taken from a fixed Windows install path.
	Pages are rendered as Letter size with 0.75in margins on every side,
	UTF-8 encoding, and the ``no-outline`` flag set.

	Args:
		url: Address of the page to convert.
		path: Destination file for the generated PDF.
	"""
	wkhtmltopdf_config = pdfkit.configuration(wkhtmltopdf=r'D:\wkhtmltopdf\bin\wkhtmltopdf.exe')

	margin = '0.75in'
	render_options = {
		'page-size': 'Letter',
		'margin-top': margin,
		'margin-right': margin,
		'margin-bottom': margin,
		'margin-left': margin,
		'encoding': "UTF-8",
		# None marks an option that takes no value.
		'no-outline': None,
	}

	pdfkit.from_url(url, path, options=render_options, configuration=wkhtmltopdf_config)

def main():
	"""Process the two Zhihu listing pages one after the other."""
	listing_pages = (
		"https://zhuanlan.zhihu.com/p/73335262",
		"https://zhuanlan.zhihu.com/p/72063554",
	)
	for page in listing_pages:
		parsePage(page)


# Run only when executed as a script, so importing this module does not
# start downloading pages and writing PDFs.
if __name__ == "__main__":
	main()

相关标签: 无法解决的错误