python - BeautifulSoup教程
程序员文章站
2022-03-07 19:50:54
...
BeautifulSoup
BeautifulSoup 是一个用于解析 HTML/XML 的 Python 库,它让网页元素的查找与提取变得简单。
from bs4 import BeautifulSoup
# Sample HTML document used to demonstrate the basic lookup calls below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
<div class="title">
<b>The Dormouse's story总共</b>
<h1>f</h1>
</div>
<div class="story">Once upon a time there were three little sisters; and their names were
<a class="sister0" id="link1">Els<span>f</span>ie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""
# Parse the document with the lxml parser (the lxml package must be installed).
soup = BeautifulSoup(html_doc, features="lxml")
# Find the first <a> tag; returns a single Tag object (None when nothing matches).
tag1 = soup.find(name='a')
# Find every <a> tag; returns a list-like ResultSet of Tag objects.
tag2 = soup.find_all(name='a')
# Find the tag with id="link2" using CSS selector syntax. Note that select()
# returns a list of matches, not a single Tag; use select_one() for one Tag.
tag3 = soup.select('#link2')
使用示例
from bs4 import BeautifulSoup
# NOTE(review): the "..." inside <body> looks like an elision of the sample
# document shown earlier rather than literal content. With this literal text,
# lookups such as soup.find('a') in the examples that follow would find
# nothing. TODO confirm and paste the full sample document here.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
...
</body>
</html>
"""
# Parse the document with the lxml parser.
soup = BeautifulSoup(html_doc, features="lxml")
1. 获取和设置标签名称
# Read and change the name of the first <a> tag.
tag = soup.find('a')
name = tag.name  # get the tag name
print(name)
tag.name = 'span'  # set: rename the tag
print(soup)
2. 获取和设置标签属性
# Read and modify the attributes of the first <a> tag.
tag = soup.find('a')
attrs = tag.attrs  # get the tag's attributes
print(attrs)
tag.attrs = {'ik':123}  # set: replace the whole attribute mapping
tag.attrs['id'] = 'iiiii'  # set: add or overwrite a single attribute
print(soup)
3. 按名称查找某标签
# Look up a tag by its tag name via the soup object.
tag = soup.find('a')
print (tag)
4. 按类查找某标签
# Look up a tag by its class attribute via the soup object.
# NOTE(review): no element in the sample document shown earlier has class
# 'ConsTi', so both lookups presumably print None against it — confirm.
# Option 1: the class_ keyword argument.
tag = soup.find(class_= 'ConsTi')
print(tag)
# Option 2: an attrs dictionary.
tag = soup.find(attrs={'class': 'ConsTi'})
print(tag)
4. 按ID查找某标签
# Look up a tag by its id attribute via the soup object.
# NOTE(review): no element in the sample document shown earlier has id
# 'banner', so both lookups presumably print None against it — confirm.
# Option 1: the id keyword argument.
tag = soup.find(id = 'banner')
print(tag)
# Option 2: an attrs dictionary.
tag = soup.find(attrs = {'id':'banner'})
print(tag)
5. 获取子节点和子孙节点
# Direct children: iterating .children yields both Tag objects and
# NavigableString objects (the latter often hold whitespace such as
# newlines), so test for Tag to tell real elements from plain strings.
from bs4.element import Tag

tags = tag.children
for child in tags:
    # isinstance() is the idiomatic type check and also accepts Tag subclasses.
    if isinstance(child, Tag):
        print(child)
    else:
        print('string type')

# All descendants (children, grandchildren, ...), in depth-first order.
tags = tag.descendants
print(list(tags))
6. 清除节点下内容
# Three ways to remove content under a node.
# NOTE(review): these look like alternatives rather than steps meant to be
# run in sequence on the same tag — confirm.
# 1. Empty the tag but keep the tag itself.
tag.clear()
# 2. Remove the tag and everything inside it.
tag.decompose()
# 3. Remove the tag and everything inside it, returning what was removed.
tag.extract()
7. 将tag转化为字符串
# Serialize a tag object.
tag.decode()  # returns the markup as a str
str(tag)  # same as decode(): returns a str
tag.encode()  # returns the markup as bytes
8. find方法细说
tag = soup.find('a')
print(tag)
# recursive=True (the default) searches all descendants of the tag.
# recursive=False searches only the tag's direct children.
# string= matches on the text inside a tag. It was called text= before
# Beautiful Soup 4.4.0; the old name is deprecated, so string= is used here.
# The two calls below are equivalent ways to filter on the class attribute.
tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, string='Lacie')
tag = soup.find(name='a', class_='sister', recursive=True, string='Lacie')
print(tag)
9. find_all方法细说
上述find()的方法同样适用于find_all()
# A list of values means "match any of these" (logical OR).
v = soup.find_all(name=['a', 'div'])
print(v)
v = soup.find_all(class_=['sister0', 'sister'])
print(v)
# Searching by string alone returns the matching strings themselves
# (NavigableString objects), not the tags that contain them.
v = soup.find_all(string=['Tillie'])
# Guard the index so an empty result does not raise IndexError.
print(v, type(v[0]) if v else None)
v = soup.find_all(id=['link1', 'link2'])
print(v)
# href values are URLs, so filter on the URLs used in the sample document
# (the id values 'link1'/'link2' can never match an href).
v = soup.find_all(href=['http://example.com/lacie', 'http://example.com/tillie'])
print(v)
10.正则查找
# Regular-expression filters.
import re

# A compiled pattern is applied with search(), so an unanchored 'p' would
# match any tag name that merely contains "p" (e.g. "span"). Anchor with ^
# to match only names that start with "p".
rep = re.compile(r'^p')
v = soup.find_all(name=rep)
print(v)
# Match class values containing "sister" (e.g. "sister", "sister0").
rep = re.compile(r'sister.*')
v = soup.find_all(class_=rep)
print(v)
# Match hrefs under the sample document's domain; the dots are escaped so
# they match a literal "." instead of any character.
rep = re.compile(r'http://example\.com/.*')
v = soup.find_all(href=rep)
print(v)
11.其他补充
# Check whether the tag has a given attribute.
tag.has_attr('id')
# Get the text contained in the tag.
v = tag.get_text()
# Get the value of an attribute from the first <a> tag.
tag = soup.find('a')
v = tag.get('id')
print(v)
12.查找索引
# Print the index and content of each item yielded by iterating <body>.
tag = soup.find('body')
for i, v in enumerate(tag):
    print(i, v)
13. 判断自闭合标签
'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
# Check whether the first <br> tag is an empty-element (self-closing) tag.
tag = soup.find('br')
v = tag.is_empty_element
print(v)
14. CSS选择器
# By tag name.
soup.select("title")
# Pseudo-classes need a leading colon; without it the selector is invalid.
soup.select("p:nth-of-type(3)")
# Descendants at any depth.
soup.select("body a")
soup.select("html head title")
# Selector list: matches elements satisfying either selector.
tag = soup.select("span,a")
# Direct children only.
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")
# Siblings: ~ matches any following sibling, + only the next one.
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")
# By CSS class.
soup.select(".sister")
soup.select("[class~=sister]")
# By id.
soup.select("#link1")
上一篇: 轮播图自动翻页功能实现