scrapy 一些设置和问题
程序员文章站
2022-05-11 14:12:25
scrapy设置ua池 设置后在setting启用 scrapy设置ip池 scrapy 设置自定义cookie:class LaogouwangSpider(scrapy.Spider): scrapy提供五种日志级别。 ......
scrapy设置ua池
设置后在setting启用
# Enable the custom downloader middlewares in settings.py.
# NOTE: Scrapy setting names are case-sensitive and must be UPPERCASE;
# the class paths must match the CamelCase class names in middlewares.py
# (the scraped original was lowercased and would never be picked up).
# Lower priority number = closer to the engine.
DOWNLOADER_MIDDLEWARES = {
    'laogou.middlewares.LaogouDownloaderMiddleware': 543,
    'laogou.middlewares.RandomUserAgentMiddleware': 400,
    'laogou.middlewares.RandomProxyMiddleware': 400,
}
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RandomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on each request.

    Enable it via DOWNLOADER_MIDDLEWARES in settings.py.
    """

    # Pool of real desktop-browser User-Agent strings.
    # NOTE(review): the scraped original was missing the comma after the first
    # entry, which silently concatenated the first two strings into one —
    # fixed here. Casing is restored; UA strings are sent verbatim to servers.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # setdefault keeps an explicit per-request User-Agent if one was set.
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
scrapy设置ip池
from scrapy.downloadermiddlewares.httpproxy import httpproxymiddleware class randomhttpproxymiddleware(httpproxymiddleware): def __init__(self,ip = ''): self.ip = ip def process_request(self, request, spider): ip = random.choice(self.ip_list) if ip: request.meta['proxy'] = ip ip_list = [ 'https://182.122.176.49:9999', 'https://125.123.141.20:9999' ]
# scrapy 设置自定义 cookie:
class LaogouwangSpider(scrapy.Spider):
    """Spider for lagou.com job listings.

    Demonstrates session-cookie tracking via the 'cookiejar' meta key:
    'cookiejar': 1 starts a cookie session in start_requests, and subsequent
    requests pass response.meta['cookiejar'] along to reuse that session.
    Requires COOKIES_ENABLED = True in settings.py.
    """

    name = 'laogouwang'
    # allowed_domains = ['www.laogou.com']
    # start_urls = ['http://www.laogou.com/']

    def start_requests(self):
        url = 'https://www.lagou.com/'
        # 'cookiejar': 1 — begin tracking cookies in jar #1.
        yield scrapy.Request(url=url, callback=self.parse, meta={'cookiejar': 1})

    def parse(self, response):
        # Request cookies vs. cookies the server just set.
        print(response.request.headers.getlist('Cookie'))
        print(response.headers.getlist('Set-Cookie'))
        # settings.keys / settings.cidy come from the project settings module —
        # presumably the search keyword and the city ('cidy' looks like a typo
        # carried over from the original project; kept as-is — TODO confirm).
        url = ('https://www.lagou.com/jobs/list_' + str(settings.keys)
               + '?city=' + str(settings.cidy)
               + '&cl=false&fromSearch=true&labelWords=&suginput=')
        print(response.meta['cookiejar'])
        # Pass the same cookiejar on so the session cookies follow this request;
        # dont_filter=True bypasses the duplicate-request filter.
        yield scrapy.Request(url=url, callback=self.download,
                             meta={'cookiejar': response.meta['cookiejar'], 'id': 1},
                             dont_filter=True)

    def download(self, response):
        # print(response.text)
        print(response.request.headers.getlist('Cookie'))
        print(response.headers.getlist('Set-Cookie'))
        # Page number; lagou's AJAX API wants first="true" only on page 1.
        i = response.meta.get('id')
        first = 'false'
        if i == 1:
            first = 'true'
        data = {
            "first": first,
            "pn": str(i),
            "kd": str(settings.keys),
        }
        headers_post = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Content-Length': str(len(urllib.parse.urlencode(data))),
            'Connection': 'keep-alive',
            'Referer': str(response.url),
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        }
        print(headers_post)
        print(str(response.url))
        print(data)
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        # dont_redirect + handle_httpstatus_list disables redirect handling for
        # this single request (301/302 are delivered to the callback instead).
        # NOTE(review): self.files is defined elsewhere in the project — not
        # visible in this excerpt.
        yield scrapy.FormRequest(url=url, formdata=data, headers=headers_post,
                                 callback=self.files, dont_filter=True,
                                 meta={'cookiejar': True,
                                       'dont_redirect': True,
                                       'handle_httpstatus_list': [301, 302]})
meta={'cookiejar': 1} 用于启用 cookie 会话记录,在后面的请求中使用 'cookiejar': response.meta['cookiejar'] 可以沿用并更新该会话的 cookie。
注意,需要在 settings.py 中设置 COOKIES_ENABLED = True。
获取请求 cookies 是 response.request.headers.getlist('Cookie'),响应 cookies 是 response.headers.getlist('Set-Cookie')。
禁止请求去重使用 dont_filter=True(注意:这是去重过滤的开关,禁止重定向见下一条)。
在 meta 里使用 'dont_redirect': True, 'handle_httpstatus_list': [301, 302] 可以在当前 scrapy 请求里禁用重定向。
scrapy 使用日志
import datetime
import os

# Timestamped log file under logs/, one per run.
# NOTE(review): the scraped original's format string was lowercased
# ('%y_%m_%h_%m_%s'); restored to year_month_hour_minute_second. The day
# field appears to have been absent in the original — add '%d' if wanted.
time = datetime.datetime.now().strftime('%Y_%m_%H_%M_%S')
# Scrapy setting names must be UPPERCASE to take effect.
LOG_FILE = 'logs' + os.sep + str(time) + '_' + "laogou.log"
LOG_LEVEL = "DEBUG"
# Redirect all stdout/stderr (e.g. print()) into the log as well.
LOG_STDOUT = True
scrapy提供五种日志级别。
1. CRITICAL -- 严重错误
2. ERROR -- 一般级别的错误
3. WARNING -- 警告信息
4. INFO -- 信息消息的日志(建议生产模式使用)
5. DEBUG -- 调试消息的日志(建议开发模式使用)
LOG_FILE 用于日志输出记录的文件名,默认 None
LOG_LEVEL 要记录的最低级别,默认 DEBUG
LOG_STDOUT 如果为 True,则进程的所有标准输出和错误都重定向到日志,例如 print(),默认 False
使用文件启动spider
# laogoustart.py
from laogou.spiders.laogouwang import laogouwangspider from scrapy.crawler import crawlerprocess from scrapy.utils.project import get_project_settings process = crawlerprocess(get_project_settings()) process.crawl(laogouwangspider) process.start()
上一篇: html5新增表单元素
下一篇: html5中新增的表单元素