爬虫之BeautifulSoup类
程序员文章站
2022-07-06 12:17:14
安装:pip install BeautifulSoup4 下表列出了主要的解析器,以及它们的优缺点:看个人习惯选取自己喜欢的解析方式 1 # 获取html代码 2 import requests 3 r = requests.get('http://www.python123.io/ws/demo ......
安装:pip install beautifulsoup4
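下表列出了主要的解析器,以及它们的优缺点(可根据个人习惯选取自己喜欢的解析方式):

| 解析器 | 使用方法 | 优势 | 劣势 |
| --- | --- | --- | --- |
| Python 标准库 | BeautifulSoup(markup, "html.parser") | Python 内置标准库;执行速度适中;文档容错能力强 | 旧版本 Python(2.7.3 / 3.2.2 之前)中文档容错能力差 |
| lxml HTML 解析器 | BeautifulSoup(markup, "lxml") | 速度快;文档容错能力强 | 需要安装 C 语言库 |
| lxml XML 解析器 | BeautifulSoup(markup, "xml") | 速度快;唯一支持 XML 的解析器 | 需要安装 C 语言库 |
| html5lib | BeautifulSoup(markup, "html5lib") | 容错性最好;以浏览器的方式解析文档;生成 HTML5 格式的文档 | 速度慢;依赖外部 Python 库 |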
# Fetch the demo page and pretty-print its parsed HTML.
import requests
from bs4 import BeautifulSoup  # the class name is case-sensitive

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get('http://www.python123.io/ws/demo.html', timeout=30)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
demo = r.text

soup = BeautifulSoup(demo, 'html.parser')
print(soup.prettify())  # print the tree with standard indentation

# Output of the print() call above:
# <html>
#  <head>
#   <title>
#    this is a python demo page
#   </title>
#  </head>
#  <body>
#   <p class="title">
#    <b>
#     the demo python introduces several python courses.
#    </b>
#   </p>
#   <p class="course">
#    python is a wonderful general-purpose programming language. you can learn python from novice to professional by tracking the following courses:
#    <a class="py1" href="http://www.icourse163.org/course/bit-268001" id="link1">
#     basic python
#    </a>
#    and
#    <a class="py2" href="http://www.icourse163.org/course/bit-1001870001" id="link2">
#     advanced python
#    </a>
#    .
#   </p>
#  </body>
# </html>
简单的浏览结构化数据的方法
# Source of the demo page, inlined so the snippet runs without network access.
html_d = """
<html><head><title>this is a python demo page</title></head>
<body>
<p class="title"><b>the demo python introduces several python courses.</b></p>
<p class="course">python is a wonderful general-purpose programming language. you can learn python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/bit-268001" class="py1" id="link1">basic python</a> and <a href="http://www.icourse163.org/course/bit-1001870001" class="py2" id="link2">advanced python</a>.</p>
</body></html>
"""

from bs4 import BeautifulSoup  # the class name is case-sensitive

soup = BeautifulSoup(html_d, 'html.parser')

# The <title> tag itself
print(soup.title)

# All text in the document, with the markup stripped
print(soup.text)

# Name of the tag
print(soup.title.name)

# Attributes of the tag as a dict
print(soup.title.attrs)

# Direct children of the first <p> tag: .contents is a list,
# .children is an iterator over the same nodes
print(soup.p.contents)
print(soup.p.children)

# Every <a> tag in the document
print(soup.find_all('a'))
常用解析方法
# Source of the demo page, inlined so the snippet runs without network access.
html_d = """
<html><head><title>this is a python demo page</title></head>
<body>
<p class="title"><b>the demo python introduces several python courses.</b></p>
<p class="course">python is a wonderful general-purpose programming language. you can learn python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/bit-268001" class="py1" id="link1">basic python</a> and <a href="http://www.icourse163.org/course/bit-1001870001" class="py2" id="link2">advanced python</a>.</p>
</body></html>
"""

from bs4 import BeautifulSoup  # the class name is case-sensitive

soup = BeautifulSoup(html_d, "lxml")

# --- Going down -----------------------------------------------------------

# All direct children of the first <p>, as a list
print(soup.p.contents)
# Name of the document's first top-level node
print(soup.contents[0].name)

# .children is not a list: it is an iterator over the same direct children
print(soup.p.children)
for child in enumerate(soup.p.children):
    print(child)

# .descendants walks every node below <p>, not just the direct children
print(soup.p.descendants)
for node in enumerate(soup.p.descendants):
    print(node)

# .string returns the text only when the tag has exactly one child;
# with several children it is None
print(soup.title.string)

# .strings yields every text node in the document
for text in soup.strings:
    print(repr(text))

# .stripped_strings does the same but drops whitespace-only strings
for line in soup.stripped_strings:
    print(line)

# --- Going up -------------------------------------------------------------

# Direct parent of the first <a>
print(soup.a.parent)
# Iterator over all ancestors of the first <a>
print(soup.a.parents)

# --- Going sideways -------------------------------------------------------

# Sibling node of the first <a>
print(soup.a.next_sibling)
# Previous sibling
print(soup.a.previous_sibling)
# Next sibling
print(soup.a.next_sibling)
find_all 的用法:find_all(name, attrs, recursive, string, limit, **kwargs)(旧版本中 string 参数名为 text)
import re

from bs4 import BeautifulSoup  # the class name is case-sensitive

# html_d is the demo HTML string defined in the previous snippet.
soup = BeautifulSoup(html_d, "lxml")

# name: a compiled regex is matched against each tag name, so 'b'
# selects every tag whose name contains the letter b
for tag in soup.find_all(re.compile('b')):
    print(tag.name)

# attrs: a plain string as the second argument is treated as a CSS class
print(soup.find_all('p', 'course'))

# keyword: any keyword argument filters on the attribute of that name
print(soup.find_all(id='link1'))

# recursive: with recursive=False only direct children are searched
# print(soup.find_all('a', recursive=False))

# string: match on text content instead of tags
# print(soup.find_all(string=re.compile('python')))
小案例
import requests
from bs4 import BeautifulSoup
import bs4


def gethtmltext(url):
    """Download *url* and return its decoded text, or "" on any request failure.

    The encoding is taken from the response content rather than the headers
    (``apparent_encoding``).
    """
    try:
        r = requests.get(url, timeout=30)
        # Treat 4xx/5xx as failures so an error page is never parsed as data.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort contract: callers get an empty document, not a crash.
        return ""


def fillunivlist(ulist, html):
    """Append ``[rank, name, region, score]`` rows parsed from *html* to *ulist*.

    Rows are read from the first ``<tbody>`` in the document. Nothing is
    appended when the document has no ``<tbody>`` (e.g. the download failed).
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows; skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        if len(tds) < 4:
            continue  # not a full data row
        ulist.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])


def printunivlist(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row is expected to hold four values (rank, name, region, score).
    ``None`` cells are printed as empty strings.
    """
    # NOTE: the columns are padded with ASCII spaces, so CJK text will not line
    # up perfectly; a full-width space (chr(12288)) as the fill character is the
    # usual remedy.
    tplt = "{:^10}\t{:^6}\t{:^10}\t{:^10}"
    print(tplt.format('排名', '学校名称', '地区', '总分'))
    # Slicing never runs past the end of a short list.
    for u in ulist[:max(num, 0)]:
        cells = ["" if cell is None else str(cell) for cell in u[:4]]
        print(tplt.format(*cells))


def main():
    """Fetch the 2019 ranking page and print its first 20 universities."""
    unifo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    html = gethtmltext(url)
    fillunivlist(unifo, html)
    printunivlist(unifo, 20)  # print the top 20


if __name__ == "__main__":
    main()
上一篇: 爆笑,又逗又损的二货