Pitfalls encountered while writing Scrapy crawlers
- Part 1: Scrapy
Starting a project
1 scrapy startproject suning_spider  # create a new project
2 scrapy genspider suning suning.com  # generate a spider from the default template
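For reference, the spider that genspider produces looks roughly like the sketch below (the class name and start URL are derived from the command arguments; the exact output varies a little by Scrapy version):

import scrapy

class SuningSpider(scrapy.Spider):
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['http://suning.com/']

    def parse(self, response):
        pass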
3 class Douban(CrawlSpider): # the built-in spider classes include Spider, CrawlSpider, XMLFeedSpider, CSVFeedSpider and SitemapSpider (a CrawlSpider sketch follows item 4 below)
4 def parse(self, response):
      print response.url
      print response.body  # quick check of what was actually fetched
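A minimal CrawlSpider sketch for item 3; the start URL and the link-extractor pattern are made-up placeholders, not taken from a real Douban crawl:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Douban(CrawlSpider):
    name = 'douban'
    start_urls = ['https://movie.douban.com/top250']  # placeholder start URL

    # follow pagination links and hand every matched page to parse_item
    # (a CrawlSpider should not override parse itself)
    rules = (
        Rule(LinkExtractor(allow=r'\?start=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print response.url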
5 GET and POST
GET requests
class YnGsxtSpider(Spider):
    name = 'yn_gsxt_spider'
    start_urls = [
        'http://yn.gsxt.gov.cn/notice/search/GET/announce?type=0401&mode=all&areaId=&keyword=&pageNo=1',
        'http://sh.gsxt.gov.cn/notice/search/GET/announce?type=0401&mode=all&areaId=&keyword=&pageNo=1',
    ]
    custom_settings = cn_punishments_settings

    def parse(self, response):
        print 'response', response.url
POST requests
class JsgsjGovSpider(Spider):
    name = 'www_jsgsj_spider'
    start_urls = [
        'http://www.jsgsj.gov.cn:58888/province/NoticeServlet.json?queryGscfNoticeList=true&&pageSize=10&curPage=1'
    ]
    post_url = ''
    custom_settings = cn_punishments_settings
6 String concatenation: ''.join(['1', '2'])  # every element passed to join() must be a string
for i in range(1, 400):  # sh 13391
    page = ''.join([url, str(i)])  # join() takes a list, so the [] is required, and non-string items must be converted with str()
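A tiny illustration of that pitfall; the base URL below is just a placeholder:

base = 'http://example.com/list?page='  # placeholder URL
page_urls = [''.join([base, str(i)]) for i in range(1, 4)]
# ''.join([base, 2]) would raise a TypeError, because 2 is an int, not a string
print page_urls  # three page URLs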
import scrapy
from scrapy import Spider, FormRequest

class JsgsjGovSpider(Spider):
    domain = 'http://www.jsgsj.gov.cn:58888/province/NoticeServlet.json?queryGscfNoticeList=true'
    name = 'www_jsgsj_spider'
    start_urls = [
        'http://www.jsgsj.gov.cn:58888/province/NoticeServlet.json?queryGscfNoticeList=true&&pageSize=10&curPage=1'
    ]
    custom_settings = cn_punishments_settings

    def start_requests(self):
        for url in self.start_urls:
            for page in xrange(1, 8):
                data = {
                    'pageSize': '10',
                    'curPage': str(page)
                    # 'ccppListQueryRequest.manaType': 'C',
                    # 'ccppListQueryRequest.casePage.curPage': str(page),
                    # 'ccppListQueryRequest.casePage.pageSize': '20',
                }
                # POST the form data
                yield FormRequest(url, formdata=data, callback=self.parse)
                # plain GET against the same URL; normally you keep only one of the two
                yield scrapy.Request(url, callback=self.parse)
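The endpoint above returns JSON; a hedged sketch of what the parse callback could look like (the 'list', 'title' and 'noticeDate' keys are guesses, not the real response schema):

import json

def parse(self, response):
    data = json.loads(response.text)
    for notice in data.get('list', []):       # assumed key name
        yield {
            'title': notice.get('title'),      # assumed field
            'date': notice.get('noticeDate'),  # assumed field
        }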
7 Using re (regular expressions)
text = re.search(ur'pageCount":\d+', response.text).group(0)  # u marks a unicode literal; r marks a raw string, so backslashes (\r \n \t) are not treated as escapes
total_page = re.search(ur'\d+', text).group(0)
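Putting that to use, a sketch of paging off the extracted count inside the spider's parse method (the pageCount key and the curPage= parameter are carried over from the snippets above; everything else is an assumption):

import re
import scrapy

def parse(self, response):
    match = re.search(ur'pageCount":(\d+)', response.text)
    if match:
        total_page = int(match.group(1))
        for page in xrange(2, total_page + 1):
            # swap the curPage value in the current URL for the next page number
            next_url = re.sub(r'curPage=\d+', 'curPage={}'.format(page), response.url)
            yield scrapy.Request(next_url, callback=self.parse)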
8 String formatting with format()
# urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1, 6, 1)]  # build the page URLs with format()
# for i in range(2, 3):
#     next_url = 'http://www.jsgsj.gov.cn:58888/province/NoticeServlet.json?queryGscfNoticeList=true&&pageSize=10&curPage={}'.format(i)
#     # next_urls.append(next_url)
#     yield Request(next_url, callback=self.parse_detail)
When scraping article body text whose paragraphs are scattered across separate nodes, use string(.):
a = response.xpath("//p")            # select the paragraph nodes
b = a.xpath("string(.)").extract()   # extract the text of each node; the result is a list of strings
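A short sketch of gluing those fragments into one body of text; the //div[@class="content"]//p selector is a placeholder:

paragraphs = response.xpath('//div[@class="content"]//p')  # placeholder selector
texts = paragraphs.xpath('string(.)').extract()
body = '\n'.join(t.strip() for t in texts if t.strip())    # drop empty fragments and join the rest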
9 Preventing garbled (mis-encoded) output
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# change the interpreter's default encoding (Python 2 only; setdefaultencoding is removed from sys after startup, hence the reload)
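A less invasive alternative is to encode explicitly wherever bytes are needed; a minimal sketch (output.txt is just an example file name):

# -*- coding: utf-8 -*-
title = u'职位搜索'  # a unicode string pulled from the page
with open('output.txt', 'a') as f:
    f.write(title.encode('utf-8') + '\n')  # encode explicitly instead of relying on the default encoding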
10 Selectors
Selector has four basic methods; xpath() is the one used most:
- xpath(): takes an XPath expression and returns a SelectorList of every matching node
- extract(): serializes the matched nodes and returns a list of Unicode strings
- css(): takes a CSS expression and returns a SelectorList of every matching node (the same CSS selector syntax BeautifulSoup4's select() accepts)
- re(): extracts data with a regular expression and returns a list of Unicode strings
# extract() returns a list of Unicode strings
response.xpath('//title').extract()
[u'<title>\u804c\u4f4d\u641c\u7d22 | \u793e\u4f1a\u62db\u8058 | Tencent \u817e\u8baf\u62db\u8058</title>']
# print the u'...' string to see the actual characters instead of the \u escapes
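A few more hedged examples of the other Selector methods against the same page (the selectors and the regular expression are assumptions about the page):

# css(): same nodes as the xpath above
response.css('title').extract()

# re(): extract with a regular expression; returns a list of Unicode strings
response.xpath('//title/text()').re(ur'Tencent\s*\S+')

# extract_first(): take the first match, or None if nothing matched
title = response.xpath('//title/text()').extract_first()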
2 The components you need to write: Item, Spider, Settings, Pipeline
(1) Item:
(2) Spider:
    def start_requests(self):
        ...
    def parse(self, response):
        item = DoubanItem()                    # always instantiate the Item before filling it
        for i in range(0, int(get_page) - 1):  # loop over the pages
            url_list = response.meta['list']   # data handed over from start_requests via meta
            url = url_list.format(i)           # fill the {} placeholder with format()
(3) Settings:
(4) Pipeline:
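A minimal sketch of the Item and Pipeline halves; DoubanItem's fields, the output file name and the pipeline path in ITEM_PIPELINES are all assumptions:

# items.py
import scrapy

class DoubanItem(scrapy.Item):
    title = scrapy.Field()   # assumed field
    rating = scrapy.Field()  # assumed field

# pipelines.py
import json

class DoubanPipeline(object):
    def open_spider(self, spider):
        self.f = open('douban.json', 'w')            # assumed output file

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item)) + '\n')  # one JSON object per line
        return item

    def close_spider(self, spider):
        self.f.close()

# settings.py -- enable the pipeline
# ITEM_PIPELINES = {'suning_spider.pipelines.DoubanPipeline': 300}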
If an XPath query comes back as an empty list, a common culprit is tbody: browsers insert <tbody> into the DOM they show you, but it is usually absent from the raw HTML Scrapy receives, so drop it from the expression (or try both):
# trs = response.xpath('ssssss/tbody/tr') or response.xpath('sssss/tr')
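A concrete sketch of that fallback; //table[@id="list"] is a made-up selector:

# an empty SelectorList is falsy, so `or` falls through to the tbody-less expression
trs = response.xpath('//table[@id="list"]/tbody/tr') or response.xpath('//table[@id="list"]/tr')
for tr in trs:
    print tr.xpath('string(.)').extract_first()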