Crawler: scraping the first ten pages of Qiushibaike (糗事百科)
import requests


class QiuShi:
    def __init__(self):
        """
        Initialize the required parameters and basic settings.
        """
        self.url_base = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",
        }

    def make_url_lists(self):
        """
        Build the list of page URLs to download.
        :return: URLs for pages 1 through 10
        """
        return [self.url_base.format(i) for i in range(1, 11)]

    def download_url(self, url_str):
        """
        Download the given page with requests.get and return its content.
        :param url_str: URL to download
        :return: raw response body (bytes)
        """
        result = requests.get(url_str, headers=self.headers)
        return result.content

    def save_result(self, result, page_num):
        """
        Save the downloaded content to a local HTML file.
        :param result: raw page content (bytes)
        :param page_num: page number used in the file name
        :return:
        """
        file_path = "糗事百科-第{}页.html".format(page_num)
        with open(file_path, "wb") as f:
            f.write(result)

    def run(self):
        """
        Main download routine: build the URL list, download each page,
        and save the result to disk.
        :return:
        """
        # Build the list of URLs to download
        url_lists = self.make_url_lists()
        for p_num, url_str in enumerate(url_lists, start=1):
            # Download the page and get its content
            result_str = self.download_url(url_str)
            # Save the downloaded content
            self.save_result(result_str, p_num)


if __name__ == '__main__':
    qiu_shi = QiuShi()
    qiu_shi.run()
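The script above fires ten requests back to back and assumes every one succeeds. As a possible extension (not part of the original article), a more defensive download helper could add a timeout, raise on non-200 responses, and pause briefly between attempts. The function name, retry count, and delay below are illustrative assumptions, not something the source prescribes.

import time

import requests


def download_url_safe(url_str, headers, retries=3, delay=1.0):
    """Hypothetical, more defensive variant of QiuShi.download_url.

    Retries a few times on network errors; the retry count and delay
    are illustrative defaults.
    """
    for attempt in range(1, retries + 1):
        try:
            # Bound how long each request may hang.
            response = requests.get(url_str, headers=headers, timeout=10)
            # Raise for 4xx/5xx instead of silently saving an error page.
            response.raise_for_status()
            return response.content
        except requests.RequestException as exc:
            print("attempt {} for {} failed: {}".format(attempt, url_str, exc))
            time.sleep(delay)  # pause briefly before retrying
    return None

With such a helper in place, QiuShi.run could skip pages for which it returns None instead of writing an empty or error page to disk.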