XPATH应用
程序员文章站
2022-04-19 15:34:20
1 # -*- coding:utf-8 -*- 2 ''' 3 Created on Sep 10, 2018 4 5 @author: SaShuangYiBing 6 ''' 7 from lxml import etree 8 9 html=''' 10 11 12 哈哈测试一下 13 14... ......
# -*- coding:utf-8 -*-
"""Worked examples of XPath queries against a small HTML document using lxml.

Created on Sep 10, 2018

@author: SaShuangYiBing

Each numbered section runs one XPath expression on the parsed fixture and
prints the result, followed by a separator line. The output recorded from a
sample run is listed in the comment block at the end of the script.
"""
from lxml import etree

# HTML fixture queried by every example below.
html = '''
<html>
    <head>
        <title>哈哈测试一下</title>
        <link type="text/css" rel="stylesheet" href="haha.css" />
        <link type="text/css" rel="stylesheet" href="haha1.css" />
        <link type="text/css" rel="stylesheet" href="haha2.css" />
        <script type="text/javascript" src="haha.js"></script>
        <script type="text/javascript" src="haha1.js"></script>
        <script type="text/javascript" src="haha2.js"></script>
    </head>
    <body>
        <div id="id1" class="class1">
            <div id="id2" class="class2">
                <ul class="cls_ul1">
                    <li class="cls_li1">
                        <div class="cls_3">
                            <span>span_text1</span>
                            <span>span_text2</span>
                            <i>text_1</i>
                        </div>
                        <div>
                            <a href="a_1.html">a_1</a>
                            <a href="a_2.html">a_2</a>
                            <a href="a_3.html">a_3</a>
                        </div>
                        <div class="cls_4">
                            <a href="a_4.html">
                                <img href="a_img1.jpg" />
                            </a>
                        </div>
                    </li>
                    <li class="cls_li1">
                        <div class="cls_3">
                            <span>span_text3</span>
                            <span>span_text4</span>
                            <i>text_2</i>
                            <i>text_22</i>
                        </div>
                        <div>
                            <a href="a_4.html">a_4</a>
                            <a href="a_5.html">a_5</a>
                            <a href="a_6.html">a_6</a>
                        </div>
                        <div class="cls_4">
                            <a href="a_5.html">
                                <img href="a_img2.jpg" />
                            </a>
                        </div>
                    </li>
                </ul>
            </div>
            <div id="id3" class="class3">
                <ul class="cls_ul2">
                    <li class="cls_li2">
                        <div class="cls_5">
                            <span>span_text5</span>
                            <span>span_text6</span>
                            <i>text_3</i>
                        </div>
                        <div>
                            <a href="a_1.html">a_1</a>
                            <a href="a_2.html">a_2</a>
                            <a href="a_3.html">a_3</a>
                        </div>
                        <div class="cls_6">
                            <a href="a_4.html">
                                <img href="a_img3.jpg" />
                            </a>
                        </div>
                    </li>
                    <li class="cls_li2">
                        <div class="cls_5">
                            <span>span_text7</span>
                            <span>span_text8</span>
                            <i>text_4</i>
                        </div>
                        <div>
                            <a href="a_4.html">a_4</a>
                            <a href="a_5.html">a_5</a>
                            <a href="a_6.html">a_6</a>
                        </div>
                        <div class="cls_6">
                            <a href="a_5.html">
                                <img href="a_img4.jpg" />
                            </a>
                        </div>
                    </li>
                </ul>
            </div>
        </div>
    </body>
</html>
'''

# etree.HTML (upper-case) parses the string and returns the root <html>
# element; lxml has no lower-case ``etree.html`` function.
html_data = etree.HTML(html)

# 1. Walk the path step by step from the root; text() selects the node's text.
content = html_data.xpath("/html/head/title/text()")
for con in content:
    print(con)
print("~~~~~~~~~这是第一个分隔线~~~~~~~~~")

# 2. Same absolute path, but select the element itself and read its .text.
nodes = html_data.xpath("/html/head/title")
for node in nodes:
    print(node.text)
print("~~~~~~~~~这是第二个分隔线~~~~~~~~~")

# 3. '//' matches the node anywhere in the document, whatever its position.
content = html_data.xpath("//title/text()")
for con in content:
    print(con)
print("~~~~~~~~~这是第三个分隔线~~~~~~~~~")

# 4. Print the id attribute of every div at html/body/div/div.
nodes = html_data.xpath("/html/body/div/div")
for node in nodes:
    content = node.xpath("@id")
    for con in content:
        print(con)
print("~~~~~~~~~这是第四个分隔线~~~~~~~~~")

# 5. Attribute values of a node below body. The path is relative: it is
#    evaluated from html_data, which is the <html> element.
content = html_data.xpath("body/div/div[@id= 'id2']/ul/li[1]/div[2]/a/@href")
for con in content:
    print(con)
print("~~~~~~~~~这是第五个分隔线~~~~~~~~~")

# 6. Attribute values of a node below div[@id='id2'].
content = html_data.xpath("//div[@id = 'id2']/ul/li[1]/div[2]/a/@href")
for con in content:
    print(con)
print("~~~~~~~~~这是第六个分隔线~~~~~~~~~")

# 7. Text content of a node below div[@id='id2'].
content = html_data.xpath("//div[@id= 'id2']/ul/li[1]/div[2]/a/text()")
for con in content:
    print(con)
print("~~~~~~~~~这是第七个分隔线~~~~~~~~~")

# 8. '*' matches any element.
content = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[2]/a/text()")
for con in content:
    print(con)
print("~~~~~~~~~这是第八个分隔线~~~~~~~~~")

# 9. '|' selects the union of several node sets.
nodes = html_data.xpath("//i|//span")
for node in nodes:
    print(node.text)
print("~~~~~~~~~这是第九个分隔线~~~~~~~~~")

# 10. For every li, print the class attribute of all of its child divs.
nodes = html_data.xpath("//li")
for i, node in enumerate(nodes):
    content = node.xpath("div/@class")
    print(i, '=' * 5)
    for con in content:
        print(con)
print("~~~~~~~~~这是第十个分隔线~~~~~~~~~")

# 11. For every li, print the class attribute of its last child div.
nodes = html_data.xpath("//li")
for i, node in enumerate(nodes):
    content = node.xpath("div[last()]/@class")
    print(i, '=' * 5)
    for con in content:
        print(con)
print("~~~~~~~~~这是第十一个分隔线~~~~~~~~~")

# 12. '..' is the parent of the previous step (title -> head); '@' introduces
#     an attribute name, so this reads the src attribute of each script node.
content = html_data.xpath("/html/head/title/../script/@src")
for con in content:
    print(con)
print("~~~~~~~~~这是第十二个分隔线~~~~~~~~~")

# 13. The <i> siblings that follow the span children of div[@class='cls_3'].
nodes = html_data.xpath("//div[@class = 'cls_3']/span/following-sibling::i")
for node in nodes:
    content = node.xpath("./text()")  # text of the current node
    for con in content:
        print(con)
print("~~~~~~~~~这是第十三个分隔线~~~~~~~~~")

# 14. class attribute of the first descendant div of li[@class='cls_li1'].
content = html_data.xpath("//li[@class = 'cls_li1']/descendant::div[1]/@class")
for con in content:
    print(con)
print("~~~~~~~~~这是第十四个分隔线~~~~~~~~~")


# 15. Text of the span descendants of li[@class='cls_li1'].
content = html_data.xpath("//li[@class = 'cls_li1']/descendant::span/text()")
for con in content:
    print(con)
print("~~~~~~~~~这是第十五个分隔线~~~~~~~~~")

# 16. '*' matches any element; not(@class) keeps only divs with no class.
content = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[not(@class)]/a/text()")
for con in content:
    print(con)
print("~~~~~~~~~这是第十六个分隔线~~~~~~~~~")

# 17. Several conditions combined with 'and'.
content = html_data.xpath("//div[@class= 'class1' and @id= 'id1']")
for con in content:
    print(con)
print("~~~~~~~~~这是第十七个分隔线~~~~~~~~~")

# 18. contains() matches on a substring of the attribute value.
content = html_data.xpath("//div[contains(@class,'class2')]")
for con in content:
    print(con)
print("~~~~~~~~~这是第十八个分隔线~~~~~~~~~")


# Output of a sample run (the element addresses in the last two sections
# differ from run to run):
#
# 哈哈测试一下
# ~~~~~~~~~这是第一个分隔线~~~~~~~~~
# 哈哈测试一下
# ~~~~~~~~~这是第二个分隔线~~~~~~~~~
# 哈哈测试一下
# ~~~~~~~~~这是第三个分隔线~~~~~~~~~
# id2
# id3
# ~~~~~~~~~这是第四个分隔线~~~~~~~~~
# a_1.html
# a_2.html
# a_3.html
# ~~~~~~~~~这是第五个分隔线~~~~~~~~~
# a_1.html
# a_2.html
# a_3.html
# ~~~~~~~~~这是第六个分隔线~~~~~~~~~
# a_1
# a_2
# a_3
# ~~~~~~~~~这是第七个分隔线~~~~~~~~~
# a_1
# a_2
# a_3
# ~~~~~~~~~这是第八个分隔线~~~~~~~~~
# span_text1
# span_text2
# text_1
# span_text3
# span_text4
# text_2
# text_22
# span_text5
# span_text6
# text_3
# span_text7
# span_text8
# text_4
# ~~~~~~~~~这是第九个分隔线~~~~~~~~~
# 0 =====
# cls_3
# cls_4
# 1 =====
# cls_3
# cls_4
# 2 =====
# cls_5
# cls_6
# 3 =====
# cls_5
# cls_6
# ~~~~~~~~~这是第十个分隔线~~~~~~~~~
# 0 =====
# cls_4
# 1 =====
# cls_4
# 2 =====
# cls_6
# 3 =====
# cls_6
# ~~~~~~~~~这是第十一个分隔线~~~~~~~~~
# haha.js
# haha1.js
# haha2.js
# ~~~~~~~~~这是第十二个分隔线~~~~~~~~~
# text_1
# text_2
# text_22
# ~~~~~~~~~这是第十三个分隔线~~~~~~~~~
# cls_3
# cls_3
# ~~~~~~~~~这是第十四个分隔线~~~~~~~~~
# span_text1
# span_text2
# span_text3
# span_text4
# ~~~~~~~~~这是第十五个分隔线~~~~~~~~~
# a_1
# a_2
# a_3
# ~~~~~~~~~这是第十六个分隔线~~~~~~~~~
# <Element div at 0x1fb1aeb7c08>
# ~~~~~~~~~这是第十七个分隔线~~~~~~~~~
# <Element div at 0x1fb1aeb7848>
# ~~~~~~~~~这是第十八个分隔线~~~~~~~~~