python爬虫_PyQuery库基础
程序员文章站
2022-05-08 18:33:50
...
1.初始化
字符串初始化
from pyquery import PyQuery as pq
html = """
<html><head><title>Demo</title></head>
<body>
<div>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('head'))
<head><title>Demo</title></head>
URL初始化
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))
文件初始化
from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('li'))
2.基础CSS选择器
from pyquery import PyQuery as pq
doc = pq(html)
html = """
<html><head><title>Demo</title></head>
<body>
<div>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('.test.test2 #span'))
<span id="span" class="test4">123</span>
查找元素
子元素(find() 会查找所有后代节点;若只需要直接子节点,请使用 children())
items = doc('#div')
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('#div')
p = items.find('p')
print(p)
父元素
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('#span')
container = items.parent()
containers = items.parents()
print(container)
print('-'*50)
print(containers)
兄弟元素
p = doc('.test.test2')
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test.test2')
print(p.siblings())
<a href="http://www.baidu.com">
<span class="test3">456</span>
</a>
遍历
多个元素(通过 items() 逐个遍历)
from pyquery import PyQuery as pq
doc = pq(html)
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
# items() returns a generator that yields every matched <p> as its own
# PyQuery object, so each one can be processed individually.
p = doc('p').items()
for item in p:
    # The loop body must be indented; without it Python raises IndentationError.
    print(item)
3.获取信息
获取属性、文本、HTML
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.attr('name')) # 属性
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.text()) #文本
Hello Python 123
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.html()) # html
Hello Python
<span id="span" class="test4">123</span>
4.DOM操作
addClass removeClass
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
p.removeClass('test2')
print(p)
p.addClass('active')
print(p)
attr、css
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p>
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
p.attr('age','18')
print(p)
p.css('font-size','14px')
print(p)
remove
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p id="remove">
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#div')
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p id="remove">
Hello World!
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#div')
print(div.find('#remove').remove())
print(div)
伪类选择器
# Sample document for the pseudo-class selector examples.
# The <div> has four children in order: <p>, <a>, <p>, <p>.
# Every id is unique, as HTML requires.
html = """
<html><head><title>Demo</title></head>
<body>
<div id='div'>
<p name="test" class='test test2'>Hello Python
<span id='span' class='test4'>123</span>
</p>
<a href="http://www.baidu.com">
<span class='test3'>456</span>
</a>
<p id="hello1">
Hello World!1
</p>
<p id="hello2">
Hello World!2
</p>
</div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
# CSS structural pseudo-classes test an element's position among ALL children
# of its parent, not only among the <p> elements.
# NOTE(review): in the sample html the <div>'s 2nd child is <a>, so
# 'p:nth-child(2)' is expected to match nothing — confirm by running.
print(doc('p:first-child')) # <p> elements that are the first child of their parent
print(doc('p:last-child')) # <p> elements that are the last child of their parent
print(doc('p:nth-child(2)')) # <p> elements that are the 2nd child of their parent
print(doc('p:nth-child(2n)')) # <p> elements at even child positions (2nd, 4th, ...)