Crawling and parsing gushiwen.org with the requests library + re regular expressions
This example uses the requests library to fetch the listing pages of gushiwen.org and the re module (regular expressions) to pull out each poem's title, dynasty, author, and body text.
# requests + re
# requests: fetch the pages
# re: parse the data
import requests
import re


def parse(url):
    # Request headers: a browser User-Agent so the site serves the normal page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    # headers must be passed as a keyword argument, not positionally
    response = requests.get(url, headers=headers)
    text = response.text
    # Core part: parse the data with re
    # A ? placed after a quantifier makes it non-greedy
    # re.DOTALL lets . also match \n; by default . does not match \n
    # (see the short demo after this listing)
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    contents_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for content in contents_tags:
        # Strip any HTML tags left inside the poem body (e.g. <br />)
        content = re.sub(r'<.*?>', '', content)
        contents.append(content.strip())
    poems = []
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        # Append each dict as an element of the list
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('~' * 100)


def main():
    # Crawl the given number of pages
    for page in range(1, 51):
        url = 'https://www.gushiwen.org/default_{}.aspx'.format(page)
        parse(url)


if __name__ == '__main__':
    main()
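The two regex details called out in the comments above are worth seeing in isolation: a ? placed after a quantifier makes the match non-greedy, and re.DOTALL lets . also match newlines. Below is a minimal standalone sketch; the sample HTML string is invented for illustration and is not taken from gushiwen.org.

import re

sample = '<div class="cont">\n<b>静夜思</b></div><b>other</b>'  # invented sample, not real page HTML

# Greedy: .* grabs as much as it can, running to the last </b>
print(re.findall(r'<b>(.*)</b>', sample))
# ['静夜思</b></div><b>other']

# Non-greedy: .*? stops at the first </b>
print(re.findall(r'<b>(.*?)</b>', sample))
# ['静夜思', 'other']

# Without re.DOTALL the . cannot cross the \n, so nothing matches
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', sample))
# []

# With re.DOTALL the . also matches \n, which is how the titles are extracted above
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', sample, re.DOTALL))
# ['静夜思']

# The same non-greedy <.*?> pattern strips leftover tags such as <br /> from the poem body
print(re.sub(r'<.*?>', '', '床前明月光，<br />疑是地上霜。'))
# 床前明月光，疑是地上霜。

Non-greedy matching matters here because each results page contains many <b>, </a>, and </div> tags; a greedy .* would swallow everything between the first and last occurrence instead of one poem's fields.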
Original article: https://blog.csdn.net/qq_39504519/article/details/107084243