scrapy与xpath的坑
程序员文章站
2022-05-11 17:32:12
...
用chrome打开http://www.downg.com/new/0_1.html
按F12打开chrome的开发者工具,切换到console标签
输入$x即可测试xpath语句
然后我输入$x("//*[@id='greendown']/div[2]/div[5]/div/div/div[1]/div/div[2]/ul/li/span[3]/a/@href")
直接返回了所要的50个URL链接
但用scrapy的xpath选择器则通不过,代码如下
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.http import Request


class MySpider(Spider):
    """Spider for downg.com list pages, using a Chrome-generated XPath.

    Demonstrates the pitfall discussed in the article: the absolute XPath
    copied from Chrome's console matches in the browser but returns nothing
    in Scrapy, because Scrapy parses the raw HTML while Chrome queries the
    browser-normalized DOM.
    """

    name = "downg"
    allowed_domains = ["downg.com"]
    # List pages 1..6 of the "new" section.
    start_urls = [
        'http://www.downg.com/new/0_%s.html' % x for x in range(1, 7)
    ]

    def parse(self, response):
        """Try to extract detail-page links with the Chrome-generated XPath.

        NOTE(review): this XPath yields 0 results under Scrapy even though
        it works in the Chrome console — kept to illustrate the pitfall.
        """
        # Responses expose .xpath() directly; no Selector wrapper needed.
        urls_list = response.xpath(
            "//*[@id='greendown']/div[2]/div[5]/div/div/div[1]"
            "/div/div[2]/ul/li/span[3]/a/@href"
        ).extract()
        print(len(urls_list), urls_list)
        for url in urls_list:
            # Yielding requests is the idiomatic Scrapy form; the engine
            # accepts a generator just like the old returned list.
            yield Request(url, callback=self.getDetail)

    def getDetail(self, response):
        # Placeholder detail parser: just log the fetched URL.
        print(response.url)
看来chrome自动生成的xpath有时不能直接用于scrapy:浏览器查询的是经过规范化修正的DOM(例如自动补全的tbody、脚本生成的节点),而scrapy解析的是原始HTML,两者结构可能不一致。要么用正则(见上一篇文章),要么自己手写相对xpath,代码如下
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.http import Request


class MySpider(Spider):
    """Working version: a hand-written relative XPath instead of the
    Chrome-generated absolute one, which does not match Scrapy's raw-HTML
    parse tree.
    """

    name = "downg"
    allowed_domains = ["downg.com"]
    # List pages 1..6 of the "new" section.
    start_urls = [
        'http://www.downg.com/new/0_%s.html' % x for x in range(1, 7)
    ]

    def parse(self, response):
        """Select each app-name span, then pull the link out of it."""
        nodes = response.xpath('//span[@class="app-name"]')
        print(len(nodes), nodes)
        for node in nodes:
            # extract_first() returns None instead of raising IndexError
            # when a span unexpectedly carries no <a href>.
            href = node.xpath('a/@href').extract_first()
            if href:
                yield Request(href, callback=self.getDetail)

    def getDetail(self, response):
        # Placeholder detail parser: just log the fetched URL.
        print(response.url)
update:进一步优化版
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.http import Request


class MySpider(Spider):
    """Optimized version: extract the hrefs in a single XPath expression
    rather than selecting the spans first and drilling down per node.
    """

    name = "downg"
    allowed_domains = ["downg.com"]
    # List pages 1..6 of the "new" section.
    start_urls = [
        'http://www.downg.com/new/0_%s.html' % x for x in range(1, 7)
    ]

    def parse(self, response):
        """Extract all detail-page links and schedule a request for each."""
        urls = response.xpath('//span[@class="app-name"]/a/@href').extract()
        print(len(urls), urls)
        for url in urls:
            yield Request(url, callback=self.getDetail)

    def getDetail(self, response):
        # Placeholder detail parser: just log the fetched URL.
        print(response.url)
转载于:https://blog.51cto.com/pcliuyang/1534124
上一篇: samba
下一篇: 中小企业选择云存储服务 五个事项应注意