Scrapy middleware (using fake_useragent + setting an IP proxy)
Random user agent
from fake_useragent import UserAgent

headers = {
    "User-Agent": UserAgent().chrome
}
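As a quick sanity check outside of Scrapy, a headers dict built this way can be passed straight to the requests library. A minimal sketch follows; the target URL is only a placeholder for illustration:

import requests
from fake_useragent import UserAgent

headers = {
    "User-Agent": UserAgent().chrome  # a randomly chosen Chrome UA string
}
# httpbin echoes back the request headers, so we can verify the UA was sent
resp = requests.get("https://httpbin.org/headers", headers=headers)
print(resp.text)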
from scrapy import signals
from fake_useragent import UserAgent
from scrapy.exceptions import IgnoreRequest


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == 'bd':
            # Be sure to pass verify_ssl=False here -- a lesson learned the hard way
            ua = UserAgent(verify_ssl=False)
            request.headers['User-Agent'] = ua.random
            return None
        else:
            raise IgnoreRequest

    def process_response(self, request, response, spider):
        if spider.name == 'bd':
            # print(response)
            print(request.headers["User-Agent"])
            return response
        else:
            raise IgnoreRequest

    def process_exception(self, request, exception, spider):
        pass
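The middleware only takes effect once it is enabled in settings.py. A minimal sketch is shown below; the module path myproject.middlewares is an assumption and should match your own project layout, and the priority 543 is arbitrary:

# settings.py -- register the downloader middleware (hypothetical project path)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}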
IP proxy middleware
import logging

import requests

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):
    def __init__(self, proxy_pool_url):
        self.proxy_pool_url = proxy_pool_url

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            proxy_pool_url=crawler.settings.get('PROXY_POOL_URL')
        )

    def _get_proxy(self):
        try:
            proxy = requests.get(self.proxy_pool_url)
            return proxy.text
        except requests.exceptions.ConnectionError:
            return None

    # To use a proxy, add a "proxy" key to the request's meta.
    # The proxy takes the form: scheme + IP address + port.
    def process_response(self, request, response, spider):
        if response.status != 200:
            logger.warning('Need use proxy ~~~')
            proxy = self._get_proxy()
            if proxy:
                request.meta["proxy"] = 'http://' + proxy
            # mark the retried request so the dupefilter does not drop it
            request.dont_filter = True
            return request
        else:
            return response
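As with the user-agent middleware, the proxy middleware has to be registered in settings.py, and PROXY_POOL_URL must point at a service that returns a single proxy in ip:port form per request. A sketch under those assumptions; both the module path and the pool URL below are placeholders:

# settings.py -- register both middlewares and the proxy-pool endpoint
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    'myproject.middlewares.ProxyMiddleware': 544,
}
# hypothetical proxy-pool service returning one "ip:port" string per call
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'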