python网络爬虫(简单实例)
程序员文章站
2022-05-08 18:29:50
...
python网络爬虫(简单实例)
(内容来自于O’Reilly(人民邮电出版社)的《Python网络爬虫权威指南》此博客仅用于记录学习,方便以后使用)
目前本系列文章(python网络爬虫笔记)更新情况:
第一章:python网络爬虫(第一章)
第二章:python网络爬虫(第二章)
简单实例:本文
欢迎大家查阅,有不足或错误之处不吝赐教
# randomGetLinks: randomly pick one link on a page and follow it, repeatedly
# getAllLinks: depth-first traversal of every link on a page and all deeper pages
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
from urllib import parse
import re
import datetime
import random
# Module-level set of hrefs already visited by getAllLinks, so the
# depth-first crawl never revisits (or infinitely loops on) the same entry.
pages = set()
def getTitle(url):
    """Fetch *url* and return its first ``<body><h1>`` tag, or ``None``.

    Returns ``None`` both when the request fails with an HTTP error status
    and when the page lacks a <body> or <h1> (navigating ``bs.body.h1``
    raises AttributeError when ``bs.body`` is None).
    """
    try:
        html = urlopen(url)
    except HTTPError:
        # 4xx/5xx response: treat the page as having no title.
        return None
    try:
        bs = BeautifulSoup(html.read(), 'html.parser')
        title = bs.body.h1
    except AttributeError:
        # bs.body was None (no <body>), so .h1 raised AttributeError.
        return None
    return title
def getLinks(url):
    """Return the related-entry ``<a>`` tags of a Baidu Baike page, or ``None``.

    Looks inside the ``slider_relations`` <div> for anchors whose href
    matches ``/item/.*/[0-9]*``.  Returns ``None`` on an HTTP error, or when
    the div is absent (``find`` returns None and ``.find_all`` raises
    AttributeError).
    """
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        # Consistent with getTitle: feed the raw response bytes to the parser.
        bs = BeautifulSoup(html.read(), 'html.parser')
        links = bs.find('div', {'id': 'slider_relations'}).find_all(
            'a', href=re.compile(r'/item/.*/[0-9]*'))
    except AttributeError:
        return None
    return links
def getAllLinks(url):
    """Depth-first crawl of Baidu Baike related-entry links starting at *url*.

    For each newly discovered page, prints its title (via ``getTitle``) and
    its URL, records the raw href in the module-level ``pages`` set, then
    recurses into it.  Pages whose href is already in ``pages`` are skipped.

    NOTE(review): the recursion has no depth limit, so a large link graph
    can exhaust Python's recursion limit — TODO confirm acceptable here.
    """
    global pages
    try:
        html = urlopen(url)
    except HTTPError:
        # Was exit(1): killing the whole interpreter from deep inside the
        # recursion aborts the crawl on one bad page.  Skip this page instead.
        return
    try:
        bs = BeautifulSoup(html.read(), 'html.parser')
        for link in bs.find('div', {'id': 'slider_relations'}).find_all(
                'a', href=re.compile(r'/item/.*/[0-9]*')):
            if 'href' not in link.attrs:
                continue
            href = link.attrs['href']
            if href in pages:
                continue
            # Mark as visited *before* recursing so cycles cannot revisit it.
            pages.add(href)
            newPage = 'https://baike.baidu.com' + href
            title = getTitle(newPage)
            print(title)
            print(newPage)
            getAllLinks(newPage)
    except AttributeError:
        # No slider_relations div on this page: dead end, back up.
        return
def randomGetLinks(url):
    """Random walk over Baidu Baike related-entry links, starting at *url*.

    Repeatedly picks one random link from the current page, prints its title
    and URL, and follows it, until a page yields no links (prints 'Wrong!'
    and stops when ``getLinks`` returns None).
    """
    # Bug fix: random.seed(datetime.datetime.now()) raises TypeError on
    # Python 3.11+ (seed() no longer accepts arbitrary objects).  A bare
    # seed() already seeds from OS entropy / current time.
    random.seed()
    links = getLinks(url)
    if links is None:
        print('Wrong!')
        return
    while len(links) > 0:
        newArticle = 'https://baike.baidu.com' + random.choice(links).attrs['href']
        title = getTitle(newArticle)
        print(title)
        print(newArticle)
        links = getLinks(newArticle)
        if links is None:
            print('Wrong!')
            break
# Entry URL: the Baidu Baike page for 周星驰 (Stephen Chow); the non-ASCII
# entry name must be percent-encoded for urlopen to accept the URL.
url = 'https://baike.baidu.com/item/{}'.format(parse.quote('周星驰'))

if __name__ == '__main__':
    # Guard so importing this module does not start a network crawl.
    # Alternative entry point: randomGetLinks(url) for a random walk.
    getAllLinks(url)