BeautifulSoup Usage Notes
1. Basic usage: fetching the h1 tag from a given URL
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError
def gettitle(url):
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        # pass an explicit parser to avoid bs4's "no parser was explicitly specified" warning
        bsobj = BeautifulSoup(html.read(), 'html.parser')
        title = bsobj.body.h1
    except AttributeError:
        # the page has no <body> or no <h1>
        return None
    return title

title = gettitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print('not found')
else:
    print(title)
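Note that HTTPError only covers HTTP-level failures such as 404 or 500. If the server cannot be reached at all, urlopen raises urllib.error.URLError instead; since URLError is in fact the parent class of HTTPError, catching it handles both cases. A minimal sketch of the broader catch:

from urllib.request import urlopen
from urllib.error import URLError

url = 'http://www.pythonscraping.com/pages/page1.html'
try:
    html = urlopen(url)
except URLError:  # also catches HTTPError, its subclass
    html = None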
2. Parsing complex HTML
- Finding tags by their attributes, working with lists of tags, and navigating the parse tree (a navigation sketch follows the code below)
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError
def gettitle(url):
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        bsobj = BeautifulSoup(html.read(), 'html.parser')
        # findAll returns an empty ResultSet, never None, when nothing matches
        namelist = bsobj.findAll('span', {'class': 'green'})
        for name in namelist:
            print(name.get_text())  # get_text() strips the markup and keeps only the text
    except AttributeError:
        return None

gettitle("http://www.pythonscraping.com/pages/warandpeace.html")
find(tag, attributes, recursive, text, keywords)
findAll(tag, attributes, recursive, text, limit, keywords)
tag: the tag name (div, h1, h2, and so on)
attributes: a dict of attribute filters
recursive: set to False to search only the top level of the tree; the default is True, which searches all descendants against the filters
text: match on the tag's text content instead of its attributes; taking len() of the result gives the number of occurrences
limit: cap on the number of results returned (findAll only)
keywords: filter with keyword arguments (because class is a reserved word in Python, write obj.findAll(class_="green")); a short sketch exercising these parameters follows
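A minimal sketch against the warandpeace.html page used above (the results depend on that page's markup):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bsObj = BeautifulSoup(html.read(), 'html.parser')

greens = bsObj.findAll('span', {'class': 'green'})       # attributes as a dict
also_greens = bsObj.findAll(class_='green')              # same filter via keyword
princes = bsObj.findAll(text='the prince')               # match text content, not attributes
print(len(princes))                                      # number of exact occurrences of "the prince"
first_two = bsObj.findAll('span', limit=2)               # stop after two matches
top_level = bsObj.body.findAll('div', recursive=False)   # direct children of <body> only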
3. Crawling a single site: it has many pages, some of which link to each other repeatedly, so collect every link already discovered into one easy-to-query set and only crawl links that are new (this won't work on sites with anti-scraping measures, such as Zhihu or Baidu)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def get_links(pageurl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageurl)
    bsObj = BeautifulSoup(html.read(), 'html.parser')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # a link we have not seen before: record it, then crawl it
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                get_links(newPage)

if __name__ == '__main__':
    get_links('')
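Because get_links calls itself once per new link, Python's default recursion limit (about 1000 frames) will stop the crawl long before a site the size of Wikipedia is exhausted. A minimal iterative sketch of the same idea, replacing the recursion with an explicit to-visit list:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
to_visit = ['']  # '' means start from http://en.wikipedia.org itself
while to_visit:
    pageurl = to_visit.pop()
    html = urlopen('http://en.wikipedia.org' + pageurl)
    bsObj = BeautifulSoup(html.read(), 'html.parser')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        href = link.attrs.get('href')
        if href and href not in pages:
            print(href)
            pages.add(href)
            to_visit.append(href)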
4. Crawling across the internet: external links
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
pages=set()
random.seed(datetime.datetime.now().timestamp())  # seed with the current time (hash-based seeding of arbitrary objects is deprecated)

# Collect a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # internal links either start with "/" or contain the site's own domain
    for link in bsObj.findAll('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks
# Collect a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # external links start with http/https/www and do not contain the current domain
    for link in bsObj.findAll('a', href=re.compile('^(http|www|https)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def splitAddress(address):
    # strip the scheme, then split on "/": addressParts[0] is the domain
    addressParts = address.replace('https://', '').replace('http://', '').split('/')
    return addressParts
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html.read(), 'html.parser')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # no external links on this page: hop to a random internal link and retry
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)  # use the argument rather than a hard-coded URL
    print('Random external link: ' + externalLink)
    followExternalOnly(externalLink)  # the random walk continues until interrupted

followExternalOnly('http://jianshu.com')
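Two caveats about the code above. First, splitAddress extracts the domain by string surgery, which mishandles URLs with ports or unusual schemes. Second, the internal links the crawler falls back on are often relative paths like '/p/123', which urlopen cannot open directly. A minimal sketch addressing both with the standard library's urllib.parse (getDomain and absolutize are illustrative names, not part of the original code):

from urllib.parse import urlparse, urljoin

def getDomain(address):
    # netloc is the host[:port] part, e.g. 'jianshu.com'
    return urlparse(address).netloc

def absolutize(base, href):
    # resolves '/p/123' against the base URL; absolute URLs pass through unchanged
    return urljoin(base, href)

print(getDomain('http://jianshu.com/p/123'))       # jianshu.com
print(absolutize('http://jianshu.com', '/p/123'))  # http://jianshu.com/p/123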