How to Build Your Own IP Proxy Pool
While learning web scraping you have almost certainly run into the problem of getting your IP banned. When that happens we turn to proxies to finish the crawl, but after crawling for a while the proxy gets banned too; fine, switch to another one, and a moment later that one is banned as well. There are two ways to deal with this:
Method 1: throttle the crawler so that its request rate is close to that of a human visitor (see the sketch after this list)
Method 2: use a proxy pool
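To give Method 1 a concrete shape, here is a minimal sketch (not from the original post; the delay range is an arbitrary assumption): throttling can be as simple as sleeping a random interval between requests.

import time
import random

import requests


def polite_get(url, headers=None, min_delay=2.0, max_delay=5.0):
    """Fetch a page, then sleep a random interval so the crawl pace looks human."""
    response = requests.get(url, headers=headers)
    time.sleep(random.uniform(min_delay, max_delay))  # assumed 2-5 second pause
    return response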
Here we will only cover the proxy pool. We can use Xici's free proxies: scrape them and store them in a database, so that every time we crawl we just pull an IP from the database.
Below is the code that scrapes the proxies from Xici and stores them:
import requests
import pymongo
import threading
from requests.exceptions import HTTPError
from datetime import datetime
from lxml.html import fromstring


class DownLoad(object):
    def __init__(self, proxy=None, headers=None):
        self.proxy = proxy
        self.headers = headers
        # Replace 'mongodb_url' with your own MongoDB connection string
        self.client = pymongo.MongoClient('mongodb_url')
        self.db = self.client['scrapy_items']

    def __call__(self, url):
        tree = self.downloader(url)
        if tree is None:
            print('HTTP ERROR!')
        else:
            ip_info = self.get_ips(tree)
            for ip in ip_info:
                if ip is None:
                    print('invalid ip and port')
                else:
                    try:
                        self.db['IP'].insert_one(ip)
                    except Exception as e:
                        print(e)

    def close(self):
        self.client.close()

    def downloader(self, url):
        """Download a page and parse it into an lxml tree; return None on failure."""
        try:
            html = requests.get(url, headers=self.headers)
        except HTTPError as err:
            print(err)
        except Exception as e:
            print(e)
        else:
            try:
                tree = fromstring(html.text)
                return tree
            except Exception as e:
                print(e)
        return None

    def get_ips(self, tree):
        """Extract proxy records from the ip_list table, yielding only those that pass verification."""
        table = tree.xpath('//table[@id="ip_list"]//tr[@class]')
        for tr in table:
            ip_info = {}
            try:
                ip_info['ip'] = tr.xpath('.//td[2]/text()')[0]
                ip_info['port'] = tr.xpath('.//td[3]/text()')[0]
                ip_info['status'] = tr.xpath('.//td[5]/text()')[0]
                ip_info['type'] = tr.xpath('.//td[6]/text()')[0]
                ip_info['speed'] = float(tr.xpath('.//td[7]/div/@title')[0].split('秒')[0])
                ip_info['connect_time'] = float(tr.xpath('.//td[8]/div/@title')[0].split('秒')[0])
            except Exception as e:
                print(e)
                yield None
                continue  # skip verification if the row could not be parsed
            if self.verification_ip(ip_info['ip'], ip_info['port'], ip_info['type']):
                ip_info['verification_time'] = datetime.now()
                yield ip_info
            else:
                print(ip_info['ip'], end='')
                yield None

    def verification_ip(self, ip, port, type):
        """Check whether a proxy works by fetching a test page through it."""
        if type == 'HTTP':
            proxy_dict = {
                'http': 'http://%s:%s' % (ip, port),
            }
        else:
            proxy_dict = {
                'https': 'https://%s:%s' % (ip, port),
            }
        try:
            html = requests.get('https://hao.360.com/', headers=self.headers,
                                proxies=proxy_dict, timeout=5)
        except HTTPError as err:
            print(err)
            return False
        except Exception as e:
            print(e)
            return False
        else:
            return 200 <= html.status_code < 300


def runspider(downloader, base_url, start_url, end_url):
    """Crawl the listing pages numbered start_url to end_url (exclusive)."""
    for i in range(start_url, end_url):
        url = base_url + str(i)
        downloader(url)
This covers scraping the IPs, checking whether each one actually works, and saving them to the database (please forgive any rough edges in the code structure; I am still a beginner). The verification step keeps invalid IPs out of the database: these are free proxies, after all, and their stability is nothing to write home about. If you still find the pool a bit small, you can scrape a few other proxy sites as well.
How you actually run it is up to you, so I will not paste the remaining part (just don't crawl too fast or your IP will get banned; I crawled too fast myself and got banned before I even finished).
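Since the driver code is left out, here is only a hedged sketch of how it might be run; the listing URL, the page ranges, and the use of the imported threading module are my assumptions, not the original code.

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    downloader = DownLoad(headers=headers)
    base_url = 'https://www.xicidaili.com/nn/'  # assumed listing URL; pages are base_url + page number
    threads = []
    for start, end in [(1, 3), (3, 5)]:  # two small page ranges, crawled in parallel
        t = threading.Thread(target=runspider, args=(downloader, base_url, start, end))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    downloader.close()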
Next comes pulling a random IP from the database:
class GetIP(object):
    def __init__(self):
        # Replace 'mongodb_url' with your own MongoDB connection string
        self.client = pymongo.MongoClient('mongodb_url')
        self.db = self.client['scrapy_items']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        }

    def judge_ip(self, ip):
        """Re-verify a stored proxy before handing it out."""
        if ip['type'] == 'HTTP':
            proxy_dict = {
                'http': 'http://%s:%s' % (ip['ip'], ip['port']),
            }
        else:
            proxy_dict = {
                'https': 'https://%s:%s' % (ip['ip'], ip['port']),
            }
        try:
            html = requests.get('https://hao.360.com/', headers=self.headers,
                                proxies=proxy_dict, timeout=5)
        except HTTPError as err:
            print(ip['ip'], err)
            return False
        except Exception as e:
            print(ip['ip'], e)
            return False
        else:
            return 200 <= html.status_code < 300

    def get_random_ip(self):
        """Sample one proxy at random; delete it and retry if it no longer works."""
        ip_info = self.db['IP'].aggregate([
            {'$sample': {'size': 1}},
        ])
        for ip in ip_info:
            if self.judge_ip(ip):
                return '%s://%s:%s' % (ip['type'], ip['ip'], ip['port'])
            else:
                self.delete_ip(ip)
                return self.get_random_ip()

    def delete_ip(self, ip):
        # remove() is deprecated in recent pymongo versions; use delete_one instead
        self.db['IP'].delete_one({'ip': ip['ip']})

    def close(self):
        self.client.close()
With that, we have built our own IP proxy pool. We can test it with the following snippet:
get_ip = GetIP()
for i in range(5):
    ip_port = get_ip.get_random_ip()
    print(ip_port)
get_ip.close()
That's it, the IP proxy pool is done. When scraping you may also want a user-agent pool; there is a very convenient third-party library for that, fake_useragent, which you can find with a quick search.
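A minimal example of fake_useragent (install it with pip install fake-useragent):

from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}  # a randomly picked browser User-Agent string each time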
When scraping, it is best to combine proxies with rate limiting; after all, nobody wants their own server trampled at will by someone else.
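As a rough sketch of putting the pool to work (the target URL is a placeholder; pair it with a delay like the one shown near the top of the post), a proxy pulled from GetIP can be passed straight to requests:

get_ip = GetIP()
proxy = get_ip.get_random_ip()            # e.g. 'HTTP://1.2.3.4:8080'
if proxy:
    scheme = proxy.split(':')[0].lower()  # 'http' or 'https'
    try:
        resp = requests.get('http://example.com/', headers=get_ip.headers,
                            proxies={scheme: proxy.lower()}, timeout=10)
        print(resp.status_code)
    except Exception as e:
        print(e)
get_ip.close()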