Python爬虫【解析库之beautifulsoup】
程序员文章站
2023-04-03 14:46:27
解析库的安装 初始化 BeautifulSoup(str,"解析库") html='''
Hello
- 推荐使用lxml解析库,必要时使用html.parser
- 标签选择器的筛选功能较弱,但是速度快
- 建议使用find()、find_all() 查询匹配单个结果或者多个结果
- 如果对css选择器熟悉建议使用select()
- 记住常用的获取属性和文本值的方法
解析库的安装
pip3 install beautifulsoup4
初始化 BeautifulSoup(str,"解析库")
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">foo</li>
<li class="element">bar</li>
<li class="element">jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">foo</li>
<li class="element">bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html,"lxml") # soup = BeautifulSoup(html,"html.parser")
标签选择器
选择元素 soup.e
html = """ <html><head><title>the dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>the dormouse's story</b></p> <p class="story">once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.title) print(type(soup.title)) print(soup.head) print(soup.p) """ 打印结果: <title>the dormouse's story</title> <class 'bs4.element.Tag'> <head><title>the dormouse's story</title></head> <p class="title" name="dromouse"><b>the dormouse's story</b></p> """
获取名称 soup.e.name
html = """ <html><head><title>the dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>the dormouse's story</b></p> <p class="story">once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.title.name) # title
获取属性 soup.e.attrs[ ] or soup.e[ ]
html = """ <html><head><title>the dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>the dormouse's story</b></p> <p class="story">once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.p.attrs['name']) print(soup.p['name']) """ dromouse dromouse """
获取内容 soup.e.string
html = """ <html><head><title>the dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>the dormouse's story</b></p> <p class="story">once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.p.string) """ the dormouse's story """
嵌套选择 soup.e.e
html = """ <html><head><title>the dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>the dormouse's story</b></p> <p class="story">once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.head.title.string) """ the dormouse's story """
子节点 soup.e.contents
html = """ <html> <head> <title>the dormouse's story</title> </head> <body> <p class="story"> once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.p.contents) """ ['\n once upon a time there were three little sisters; and their names were\n ', <a class="sister" href="http://example.com/elsie" id="link1"> <span>elsie</span> </a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">lacie</a>, ' \n and\n ', <a class="sister" href="http://example.com/tillie" id="link3">tillie</a>, '\n and they lived at the bottom of a well.\n '] """
子节点 soup.e.children
html = """ <html> <head> <title>the dormouse's story</title> </head> <body> <p class="story"> once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.p.children) for i, child in enumerate(soup.p.children): print(i, child) """ <list_iterator object at 0x00b116d0> 0 once upon a time there were three little sisters; and their names were 1 <a class="sister" href="http://example.com/elsie" id="link1"> <span>elsie</span> </a> 2 3 <a class="sister" href="http://example.com/lacie" id="link2">lacie</a> 4 and 5 <a class="sister" href="http://example.com/tillie" id="link3">tillie</a> 6 and they lived at the bottom of a well. """
子孙节点 soup.e.descendants 标签里面的文本也属于子孙节点
html = """ <html> <head> <title>the dormouse's story</title> </head> <body> <p class="story"> once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.p.descendants) for i, child in enumerate(soup.p.descendants): print(i, child) """ <generator object descendants at 0x03ebd420> 0 once upon a time there were three little sisters; and their names were 1 <a class="sister" href="http://example.com/elsie" id="link1"> <span>elsie</span> </a> 2 3 <span>elsie</span> 4 elsie 5 6 7 <a class="sister" href="http://example.com/lacie" id="link2">lacie</a> 8 lacie 9 and 10 <a class="sister" href="http://example.com/tillie" id="link3">tillie</a> 11 tillie 12 and they lived at the bottom of a well. """
父节点 soup.e.parent
html = """ <html> <head> <title>the dormouse's story</title> </head> <body> <p class="story"> once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.a.parent)
祖先节点 soup.e.parents
html = """ <html> <head> <title>the dormouse's story</title> </head> <body> <p class="story"> once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(list(enumerate(soup.a.parents)))
兄弟节点 soup.e.next_siblings soup.e.previous_siblings
html = """ <html> <head> <title>the dormouse's story</title> </head> <body> <p class="story"> once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """ from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(list(enumerate(soup.a.next_siblings))) print(list(enumerate(soup.a.previous_siblings)))
标准选择器
find_all( name , attrs , recursive , text , **kwargs )
可根据标签名、属性、内容查找文档
标签名获取 soup.find_all('name')
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.find_all('ul'))
属性获取 soup.find_all(attrs={})
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1" name="elements"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.find_all(attrs={'id': 'list-1'})) print(soup.find_all(attrs={'name': 'elements'}))
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.find_all(id='list-1')) print(soup.find_all(class_='element'))
文本内容获取
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.find_all(text='foo'))
find( name , attrs , recursive , text , **kwargs )
find返回单个元素,find_all返回所有元素
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.find('ul')) print(type(soup.find('ul'))) print(soup.find('page'))
find_parents() find_parent()
find_parents()返回所有祖先节点,find_parent()返回直接父节点。
find_next_siblings() find_next_sibling()
find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点。
find_previous_siblings() find_previous_sibling()
find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点。
find_all_next() find_next()
find_all_next()返回节点后所有符合条件的节点, find_next()返回节点后第一个符合条件的节点
find_all_previous() 和 find_previous()
find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点
css选择器
通过select()直接传入css选择器即可完成选择 soup.select(css选择器)
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') print(soup.select('.panel .panel-heading')) print(soup.select('ul li')) print(soup.select('#list-2 .element')) print(type(soup.select('ul')[0]))
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') for ul in soup.select('ul'): print(ul.select('li'))
获取属性 e.attrs[] or e[]
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') for ul in soup.select('ul'): print(ul['id']) print(ul.attrs['id'])
获取内容 get_text()
html=''' <div class="panel"> <div class="panel-heading"> <h4>hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">foo</li> <li class="element">bar</li> <li class="element">jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">foo</li> <li class="element">bar</li> </ul> </div> </div> ''' from bs4 import beautifulsoup soup = beautifulsoup(html, 'lxml') for li in soup.select('li'): print(li.get_text())
总结
本文代码皆来自崔庆才《Python3网络爬虫开发实战》
上一篇: HTML初学小技巧
下一篇: 雷军:小米将于明年在欧洲发布5G商用手机