欢迎您访问程序员文章站，本站旨在为大家提供、分享程序员计算机编程知识！
您现在的位置是: 首页

python爬虫_PyQuery库基础

程序员文章站 2022-05-08 18:33:50
...

1.初始化

字符串初始化
from pyquery import PyQuery as pq
html = """
<html><head><title>Demo</title></head>
<body>
 <div>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('head'))
<head><title>Demo</title></head>
URL初始化
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))
文件初始化
from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('li'))

2.基础CSS选择器

from pyquery import PyQuery as pq
doc = pq(html)
html = """
<html><head><title>Demo</title></head>
<body>
 <div>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('.test.test2 #span'))
<span id="span" class="test4">123</span>

查找元素

子元素
items = doc('#div')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('#div')
p = items.find('p')
print(p)
父元素
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('#span')
container = items.parent()
containers = items.parents()
print(container)
print('-'*50)
print(containers)
兄弟元素
p = doc('.test.test2')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test.test2')
print(p.siblings())
<a href="http://www.baidu.com">
     <span class="test3">456</span>
   </a>

遍历

单个元素
	from pyquery import PyQuery as pq
	doc = pq(html)
	html = """
	<html><head><title>Demo</title></head>
	<body>
	 <div id='div'>
	   <p name="test" class='test test2'>Hello Python
	    <span id='span' class='test4'>123</span>
	   </p>
	   <a href="http://www.baidu.com">
	     <span class='test3'>456</span>
	   </a>
	   <p>
	       Hello World!
	   </p>
	 </div>
	</body>
	</html>
	"""
	from pyquery import PyQuery as pq
	doc = pq(html)
	p = doc('p').items()  # 将其变为一个可遍历对象
	for i in p:  
	    print(i)

3.获取信息

获取属性、文本、HTML
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.attr('name')) # 属性

	html = """
	<html><head><title>Demo</title></head>
	<body>
	 <div id='div'>
	   <p name="test" class='test test2'>Hello Python
	    <span id='span' class='test4'>123</span>
	   </p>
	   <a href="http://www.baidu.com">
	     <span class='test3'>456</span>
	   </a>
	   <p>
	       Hello World!
	   </p>
	 </div>
	</body>
	</html>
	"""
	from pyquery import PyQuery as pq
	doc = pq(html) 
	p = doc('.test')  
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.text())  #文本
Hello Python 123
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""

from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.html())   # html
Hello Python
    <span id="span" class="test4">123</span>

4.DOM操作

addClass removeClass
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')

html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
p.removeClass('test2')
print(p)
p.addClass('active')
print(p)
attr、css
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
p.attr('age','18')
print(p)
p.css('font-size','14px')
print(p)
remove
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p id="remove">
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#div')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p id="remove">
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#div')
print(div.find('#remove').remove())
print(div)
伪类选择器
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p id="remove">
       Hello World!1
   </p>
   <p id="remove">
       Hello World!2
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('p:first-child'))  # 作为其父元素第一个子元素的 p
print(doc('p:last-child'))   # 作为其父元素最后一个子元素的 p
print(doc('p:nth-child(2)'))  # 作为其父元素第二个子元素的 p
print(doc('p:nth-child(2n)')) # 在父元素中处于偶数位置的 p
相关标签: python爬虫