Build Your Own IP Proxy Pool
Build your own IP proxy pool with the free Xici (xicidaili.com) proxy list.
Required Python libraries
requests
scrapy (for its Selector class)
pymysql
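All three can be installed with pip; note that Selector ships inside the scrapy package:
pip install requests scrapy pymysql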
Database table
create table proxy_ip (
    no BIGINT AUTO_INCREMENT,
    ip VARCHAR(20) UNIQUE NOT NULL,
    port VARCHAR(255) NOT NULL,
    address VARCHAR(20) DEFAULT '',
    proxy_type VARCHAR(5),
    -- bare DECIMAL means DECIMAL(10,0) in MySQL and would truncate speeds to 0
    speed DECIMAL(10, 3) DEFAULT 0,
    PRIMARY KEY (no)
) DEFAULT CHARSET = utf8;
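If you prefer to create the table from Python instead of the MySQL shell, here is a minimal sketch using the same pymysql connection settings as the crawler below (IF NOT EXISTS added so re-runs are harmless):
import pymysql

conn = pymysql.connect(host='localhost', user='username',
                       password='password', database='spider_data',
                       charset='utf8')
with conn.cursor() as cursor:
    # same DDL as above, guarded against re-creation
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS proxy_ip (
            no BIGINT AUTO_INCREMENT,
            ip VARCHAR(20) UNIQUE NOT NULL,
            port VARCHAR(255) NOT NULL,
            address VARCHAR(20) DEFAULT '',
            proxy_type VARCHAR(5),
            speed DECIMAL(10, 3) DEFAULT 0,
            PRIMARY KEY (no)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()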
The code is as follows:
import threading
import requests
import time
from scrapy import Selector
import pymysql
import sys
DB_URL = 'localhost'
DB_USER = 'username'
DB_PASSWORD = 'password'
DB_NAME = 'spider_data'
DB_CHARSET = 'utf8'
class MyProxy():
    # keyword arguments work across pymysql versions, unlike positional ones
    conn = pymysql.connect(host=DB_URL, user=DB_USER, password=DB_PASSWORD,
                           database=DB_NAME, charset=DB_CHARSET)
    cursor = conn.cursor()
    def __init__(self):
        # hand this instance to the cleanup thread; letting the thread build
        # its own MyProxy would spawn new threads recursively
        DeleteIPThread(self).start()
    def get_ip(self):
        '''
        Randomly pick a working proxy from the database; if the pool is
        exhausted, re-crawl the source and try again.
        :return: (ip, port, speed, proxy_type)
        '''
        sql = '''
        select ip,port,speed,proxy_type from proxy_ip order by rand() limit 1;
        '''
        # execute() returns the number of matched rows; cursor.arraysize is
        # only the fetch batch size and is always > 0
        count = self.cursor.execute(sql)
        if count > 0:
            # (ip, port, speed, proxy_type)
            res = self.cursor.fetchone()
            if self.judge_ip(res[0], res[1]):
                return res
            else:
                return self.get_ip()
        self.crawl_ips()
        return self.get_ip()
    def crawl_ips(self):
        '''
        Crawl the free proxy list from xicidaili.com.
        :return: None
        '''
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # NOTE: this session cookie expires; grab a fresh value from your
            # browser if requests start failing
            "Cookie": "_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZjNDNmNjgzZWY5OWQ4ZWRmNTA5MzU3YWJiOGJlYWMwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMVBsU3h6aU0xa25KWlZXZE5qZ0tGd21xYkJtc3J0K2w0YlEwdUhlNjFBN009BjsARg%3D%3D--abe7f4154a205b8515bfb204e3fe924006ae1d68",
            "Host": "www.xicidaili.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"
        }
        url = "http://www.xicidaili.com/nn/1"
        response = None
        for i in range(10):
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.Timeout:
                # retrying inside the except block would raise uncaught
                # timeouts; just move on to the next attempt
                print("Request timed out, retrying (attempt %d)..." % (i + 1))
                continue
            if response.status_code == 200:
                break
        if response is None or response.status_code != 200:
            print("Network too poor, or the address is blocked; all 10 attempts failed")
            return
        s = Selector(text=response.text)
        # the first <tr> is the table header, so skip it once (not twice)
        all_list = s.xpath('//table[@id="ip_list"]/tr')[1:]
        for item in all_list:
            try:
                line = item.xpath('./td')
                ip = line[1].xpath('string(.)').extract_first()
                port = line[2].xpath('string(.)').extract_first()
                address = ''
                if len(line[3].xpath('./a')) > 0:
                    address = line[3].xpath('./a/text()').extract_first()
                address = str(address)
                # avoid shadowing the built-in type()
                proxy_type = line[5].xpath('string(.)').extract_first()
                speed = 0.0
                if len(line[6].xpath('./div/@title')) > 0:
                    # the title looks like "0.123秒"; strip the trailing unit
                    speed_str = line[6].xpath('./div/@title').extract_first()
                    speed = float(speed_str[:-1])
                print(ip, port, address, proxy_type, speed)
                # a parameterized query avoids quoting bugs and SQL injection
                sql = '''
                INSERT
                INTO proxy_ip(ip, port, address, proxy_type, speed)
                VALUES (%s, %s, %s, %s, %s);
                '''
                self.cursor.execute(sql, (ip, port, address, proxy_type, speed))
                self.conn.commit()
            except Exception:
                print(sys.exc_info())
    def judge_ip(self, ip, port):
        '''
        Check whether the given proxy is usable.
        :param ip:
        :param port:
        :return: True if usable, False otherwise
        '''
        http_url = 'https://www.baidu.com/'
        proxy_url = 'http://{0}:{1}'.format(ip, port)
        try:
            # register the proxy for both schemes, otherwise the HTTPS test
            # URL would bypass it entirely
            proxy_dict = {
                'http': proxy_url,
                'https': proxy_url
            }
            print("Testing whether the proxy works => ", proxy_url)
            response = requests.get(http_url, proxies=proxy_dict, timeout=5)
        except Exception:
            print("Proxy", proxy_url, "is unusable and will be deleted from the database")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            # 'or' here would accept every status code; we want the 2xx range
            if 200 <= code < 300:
                print("Proxy => ", proxy_url, "works")
                return True
            else:
                self.delete_ip(ip)
                return False
    def delete_ip(self, ip):
        '''
        Delete an unusable proxy.
        :param ip:
        :return: None
        '''
        sql = '''
        delete from proxy_ip WHERE ip=%s;
        '''
        self.cursor.execute(sql, (ip,))
        self.conn.commit()
class DeleteIPThread(threading.Thread):
    '''Daemon thread that periodically evicts dead proxies from the pool.'''
    def __init__(self, proxy):
        super().__init__()
        self.daemon = True
        # reuse the caller's MyProxy; constructing a new one here would start
        # yet another DeleteIPThread and recurse forever
        self.proxy = proxy
    def run(self):
        conn = pymysql.connect(host=DB_URL, user=DB_USER, password=DB_PASSWORD,
                               database=DB_NAME, charset=DB_CHARSET)
        cursor = conn.cursor()
        sql = "select ip,port from proxy_ip;"
        while True:
            cursor.execute(sql)
            all_list = cursor.fetchall()
            for ip, port in all_list:
                print(ip, port)
                if self.proxy.judge_ip(ip, port) is False:
                    self.proxy.delete_ip(ip)
                time.sleep(1)
            time.sleep(20)
if __name__ == '__main__':
    my_proxy = MyProxy()
    my_proxy.crawl_ips()
    # my_proxy.get_ip()
Usage
After creating a MyProxy object, call crawl_ips() to start crawling proxy IPs.
Call get_ip() to pick a random proxy from the database and verify that it still works. If it is dead, the method recurses until it finds a usable one; once every address in the pool has been exhausted (all unusable), it automatically re-crawls the proxy list.
Creating the object also starts a daemon thread that maintains the pool, weeding out invalid addresses in the background. A minimal end-to-end example follows.
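The sketch below routes a request through a proxy taken from the pool; the test URL httpbin.org/ip is only an illustrative assumption, not part of the original code.
my_proxy = MyProxy()
my_proxy.crawl_ips()                              # fill the pool first
ip, port, speed, proxy_type = my_proxy.get_ip()   # verified working proxy
proxy_url = 'http://{0}:{1}'.format(ip, port)
proxies = {'http': proxy_url, 'https': proxy_url}
# httpbin echoes the caller's IP, which should now be the proxy's address
response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
print(response.text)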