用chrome打开http://www.downg.com/new/0_1.html

按F12打开chrome的开发者工具,切换到console标签

在Console中输入 $x("你的XPath表达式") 即可直接测试XPath语句的匹配结果


然后我输入$x("//*[@id='greendown']/div[2]/div[5]/div/div/div[1]/div/div[2]/ul/li/span[3]/a/@href")


直接返回了所要的50个URL链接


但同样的XPath放到scrapy的选择器里却取不到任何结果,代码如下

# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

class MySpider(Spider):
    name = "downg"
    allowed_domains = ["downg.com"]
    start_urls = [
        'http://www.downg.com/new/0_%s.html' %x for x in xrange(1,7)
    ]

    def parse(self, response):
        sel=Selector(response)
        urlsReqs=[]
        urls_list=sel.xpath("//*[@id='greendown']/div[2]/div[5]/div/div/div[1]/div/div[2]/ul/li/span[3]/a/@href").extract()
        print len(urls_list),urls_list
        for url in urls_list:
            req=Request(url,self.getDetail)
            urlsReqs.append(req)
        return urlsReqs

    def getDetail(self,response):
        print response.url
        
  
看来chrome有时候自动生成的xpath不能直接用于scrapy,要么用正则(见上一篇文章),要么自己手写吧,代码如下

# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

class MySpider(Spider):
    name = "downg"
    allowed_domains = ["downg.com"]
    start_urls = [
        'http://www.downg.com/new/0_%s.html' %x for x in xrange(1,7)
    ]

    def parse(self, response):
        sel=Selector(response)
        urlsReqs=[]
        urls_list=sel.xpath('//span[@class="app-name"]')
        print len(urls_list),urls_list
        for url in urls_list:
            geturl=url.xpath('a/@href').extract()[0]
            req=Request(geturl,self.getDetail)
            urlsReqs.append(req)
        return urlsReqs

    def getDetail(self,response):
        print response.url


update:进一步优化版

# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

class MySpider(Spider):
    name = "downg"
    allowed_domains = ["downg.com"]
    start_urls = [
        'http://www.downg.com/new/0_%s.html' %x for x in xrange(1,7)
    ]

    def parse(self, response):
        sel=Selector(response)
        urlsReqs=[]
        urls_list=sel.xpath('//span[@class="app-name"]/a/@href').extract()
        print len(urls_list),urls_list
        for url in urls_list:
            req=Request(url,self.getDetail)
            urlsReqs.append(req)
        return urlsReqs

    def getDetail(self,response):
        print response.url