Python爬虫之pyquery库的基本使用
程序员文章站
2022-07-02 13:57:35
PyQuery 是一个强大而又灵活的网页解析库。如果你觉得正则表达式写起来太麻烦,觉得 BeautifulSoup 的语法太难记,又恰好熟悉 jQuery 的语法,那么 PyQuery 就是你的绝佳选择。
# PyQuery basics, part 1: building a document, CSS selectors, and walking
# up and down the element tree.
#
# NOTE: the class is spelled `PyQuery`. `from pyquery import pyquery` would
# bind the `pyquery.pyquery` submodule instead, and calling a module raises
# "TypeError: 'module' object is not callable".
from pyquery import PyQuery as pq

# ---------------------------------------------------------------------------
# Initialisation
# ---------------------------------------------------------------------------

# Initialise from an HTML string.
html = '''
<div>
    <ul>
        <li class = "item-0">first item</li>
        <li class = "item-1"><a href = "link2.html">second item</a></li>
        <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
        <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
        <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
    </ul>
</div>
'''
doc = pq(html)
print(doc('li'))

# Initialise from a URL (performs a network request).
doc = pq(url="http://www.baidu.com")
print(doc("head"))

# Initialise from a local file (expects demo.html in the working directory).
doc = pq(filename="demo.html")
print(doc('li'))

# ---------------------------------------------------------------------------
# Basic CSS selectors
# ---------------------------------------------------------------------------

# The same list, now wrapped in an element with an id and given a class so
# that id/class selectors have something to match.
html = '''
<div id = "container">
    <ul class = "list">
        <li class = "item-0">first item</li>
        <li class = "item-1"><a href = "link2.html">second item</a></li>
        <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
        <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
        <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
    </ul>
</div>
'''
doc = pq(html)
# An id is prefixed with "#", a class with "."; a space means "descendant of".
print(doc('#container .list li'))

# ---------------------------------------------------------------------------
# Finding elements
# ---------------------------------------------------------------------------

# Child elements.
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)

# find() takes a selector and searches below the current selection.
lis = items.find('li')
print(type(lis))
print(lis)

# children() without arguments returns the child elements ...
lis = items.children()
print(type(lis))
print(lis)

# ... and accepts a selector to filter them.
lis = items.children('.active')
print(lis)

# Parent element.
doc = pq(html)
items = doc('.list')
container = items.parent()
print(type(container))
print(container)

# Ancestors: add one more wrapping <div> so there is more than one ancestor.
html = '''
<div class = "wrap">
    <div id = "container">
        <ul class = "list">
            <li class = "item-0">first item</li>
            <li class = "item-1"><a href = "link2.html">second item</a></li>
            <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
            <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
            <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
        </ul>
    </div>
</div>
'''
doc = pq(html)
items = doc('.list')

# parents() returns the ancestors of the selection ...
parents = items.parents()
print(type(parents))
print(parents)

# ... and can likewise be filtered with a selector.
parents = items.parents('.wrap')
print(parents)
# PyQuery basics, part 2: siblings, iteration, reading attributes/text/HTML,
# DOM manipulation, and pseudo-class selectors.
#
# NOTE: the class is spelled `PyQuery`. `from pyquery import pyquery` would
# bind the `pyquery.pyquery` submodule instead, and calling a module raises
# "TypeError: 'module' object is not callable".
from pyquery import PyQuery as pq

# The list document used by every section below except the remove() example.
html = '''
<div class = "wrap">
    <div id = "container">
        <ul class = "list">
            <li class = "item-0">first item</li>
            <li class = "item-1"><a href = "link2.html">second item</a></li>
            <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
            <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
            <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
        </ul>
    </div>
</div>
'''

# ---------------------------------------------------------------------------
# Sibling elements
# ---------------------------------------------------------------------------
doc = pq(html)
# ".item-0.active" has no space between the two classes: it matches a single
# element carrying both classes, not an .active descendant of .item-0.
li = doc('.list .item-0.active')
print(li.siblings())

# siblings() also accepts a selector to filter the result.
print(li.siblings('.active'))

# ---------------------------------------------------------------------------
# Iteration
# ---------------------------------------------------------------------------

# A selection that matches a single element can be used directly.
doc = pq(html)
li = doc('.item-0.active')
print(li)

# For several matches, items() yields the elements one at a time.
doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

# ---------------------------------------------------------------------------
# Extracting information
# ---------------------------------------------------------------------------

# Attributes.
doc = pq(html)
a = doc('.item-0.active a')
print(a)
# Two equivalent ways to read an attribute.
print(a.attr('href'))
print(a.attr.href)

# Text content.
print(a.text())

# Inner HTML.
doc = pq(html)
li = doc('.item-0.active')
print(li)
# html() returns the markup inside the <li> tag.
print(li.html())

# ---------------------------------------------------------------------------
# DOM manipulation
# ---------------------------------------------------------------------------

# add_class / remove_class.
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.remove_class('active')
print(li)
li.add_class('active')
print(li)

# attr / css with two arguments set a value instead of reading one.
li.attr('name', 'link')
print(li)
li.css('font-size', '14px')
print(li)

# remove(): drop the <p> so that only the wrapper's own text is left.
# This fixture gets its own name so `html` keeps the list document for the
# pseudo-class section that follows.
paragraph_html = '''
<div class = "wrap">
    hello,world
    <p>this is a paragraph</p>
</div>
'''
doc = pq(paragraph_html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove()
print(wrap.text())

# ---------------------------------------------------------------------------
# Pseudo-class selectors
# ---------------------------------------------------------------------------
doc = pq(html)
# First <li> among its siblings.
li = doc('li:first-child')
print(li)
# Last <li> among its siblings.
li = doc('li:last-child')
print(li)
# Second <li> (nth-child counts from 1).
li = doc('li:nth-child(2)')
print(li)
# Every <li> whose zero-based index is greater than 2.
li = doc('li:gt(2)')
print(li)
# <li> elements at even positions; nth-child is 1-based, so this selects the
# 2nd and 4th items.
li = doc('li:nth-child(2n)')
print(li)
# <li> elements whose text contains "second".
li = doc('li:contains(second)')
print(li)