python爬虫抓取中关村笑话网站
程序员文章站
2024-01-02 21:20:16
先上代码(完整代码见下文,随后逐步说明各部分的实现)。
先上代码
import _thread
import time
import requests
from lxml import etree
# HTTP headers shared by every request below; a desktop Safari User-Agent
# so the site serves its normal page markup.
heads = {
    'User-Agent': 'Mozilla/5.0 '
                  '(Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 '
                  '(KHTML, like Gecko) Version/5.1 Safari/534.50',
}
def getAllTheme():
    """Fetch the joke-site home page and return the relative URLs of all
    theme categories listed in the classification nav bar."""
    home_html = requests.get(url="http://xiaohua.zol.com.cn/", params={}, headers=heads).text
    tree = etree.HTML(home_html)
    return tree.xpath('//ul[@class="news-list classification-nav clearfix"]/li/a/@href')
def getMaxPage(theme):
    """Probe page 1000 of *theme*; the site clamps an oversized page number
    to the real last page, so the highlighted "now" marker reveals the max.

    Returns the list of matched text nodes (empty when the marker is absent).
    """
    probe_url = "http://xiaohua.zol.com.cn" + theme + '1000.html'
    page_html = requests.get(url=probe_url, params={}, headers=heads).text
    return etree.HTML(page_html).xpath('//div[@class="page"]/span[@class="now"]/text()')
def getJokeWithTheme(theme, maxPage):
    """Print every joke under one theme, page by page.

    theme   -- the theme's relative URL path (as returned by getAllTheme()).
    maxPage -- highest page number of this theme, inclusive.
    """
    # Upper bound is maxPage + 1 so the last page is fetched too; the
    # original range(1, maxPage) silently skipped page number maxPage.
    for index in range(1, maxPage + 1):
        url = "http://xiaohua.zol.com.cn" + theme + str(index) + ".html"
        print(url)
        response = requests.get(url=url, params={}, headers=heads)
        html = etree.HTML(response.text)
        # Relative links to each joke's full-article page on this list page.
        content = html.xpath(
            '//li[@class="article-summary"]/div[@class="article-commentbar articleCommentbar clearfix"]/a/@href')
        for uri in content:
            content_url = "http://xiaohua.zol.com.cn" + uri
            content_response = requests.get(url=content_url, params={}, headers=heads)
            # The joke body is the bare text nodes of the article-text div.
            content_content = etree.HTML(content_response.text).xpath('//div[@class="article-text"]/text()')
            for joke in content_content:
                print(joke.strip())
if __name__ == '__main__':
    allTheme = getAllTheme()
    for theme in allTheme:
        maxPage = getMaxPage(theme)
        # getMaxPage returns a list of text nodes; an empty list means the
        # page marker was not found, so fall back to scraping a single page.
        if not maxPage:
            getJokeWithTheme(theme, 1)
        else:
            getJokeWithTheme(theme, int(maxPage[0]))
    # Removed the trailing `while 1: pass`: no threads are ever started
    # (`_thread` is imported but unused), so the busy-wait only burned a
    # full CPU core and prevented the script from ever exiting.
标题抓取中关村笑话网站的所有笑话
- 解析所有主题
- 解析每个主题一共有多少页面
- 解析每个页面的笑话内容
解析所有主题
笑话内容的标签位置:
使用xpath解析
'//ul[@class="news-list classification-nav clearfix"]/li/a/@href'
获取每个主题的相对链接
然后是获取每个主题的页码, 打开一个主题。 浏览页码部分。
发现在网页源码中并没有显示一共有多少页,无法直接获取最大页码。于是尝试输入 1000 这样的超大页码,观察页面效果后发现:
当输入的页码是 1000、而该主题不足 1000 页时,页面会把实际的最大页码显示出来。利用这个特性,每个主题都通过请求第 1000 页的内容来获取最大页码。代码如下。
# Probe a deliberately huge page number (1000): the site clamps it to the
# real last page, so the highlighted "now" marker holds the maximum page.
def getMaxPage(theme):
    content = requests.get(url="http://xiaohua.zol.com.cn" + theme +'1000.html', params={}, headers=heads).text
    # List of matched text nodes; empty when the marker is missing.
    maxPage = etree.HTML(content).xpath('//div[@class="page"]/span[@class="now"]/text()')
    return maxPage
最后获取每个主题指定pag下的所有笑话
def getJokeWithTheme(theme, maxPage):
    """Print every joke under one theme, page by page."""
    # NOTE(review): range(1, maxPage) stops at maxPage - 1, so the final
    # page is never fetched; range(1, maxPage + 1) would cover all pages.
    for index in range(1, maxPage):
        url = "http://xiaohua.zol.com.cn" + theme + str(index) + ".html"
        print(url)
        response = requests.get(url=url, params={}, headers=heads)
        html = etree.HTML(response.text)
        # Relative links to each joke's full-article page on this list page.
        content = html.xpath(
            '//li[@class="article-summary"]/div[@class="article-commentbar articleCommentbar clearfix"]/a/@href')
        for uri in content:
            content_url = "http://xiaohua.zol.com.cn" + uri;
            content_response = requests.get(url=content_url, params={}, headers=heads)
            # The joke body is the bare text nodes of the article-text div.
            content_content = etree.HTML(content_response.text).xpath('//div[@class="article-text"]/text()')
            for joke in content_content:
                print(joke.strip())
本文地址:https://blog.csdn.net/weixin_43164781/article/details/110606611