Python学习笔记六(免费获取代理IP)

程序员文章站 2022-05-21 14:48:29

闲得无聊，为了获取网上的免费代理IP，整合了一个脚本：从三个免费代理网站抓取代理IP，目的是在某一代理网站被限制访问时，仍可从其他可以访问的网站上获取代理IP。亲测可用哦！^_^ 仅供大家参考。以下脚本还可以添加函数，让获取到的代理IP自动去访问其他网站。 ......

　　闲得无聊，为了获取网上的免费代理IP，整合了一个脚本：从三个免费代理网站抓取代理IP，目的是在某一代理网站被限制访问时，仍可从其他可以访问的网站上获取代理IP。亲测可用哦！^_^ 仅供大家参考。以下脚本还可以添加函数，让获取到的代理IP自动去访问其他网站。

import random
import re
import time
import urllib.request
# NOTE: FancyURLopener has been deprecated since Python 3.3 and nothing in
# this script uses it; the import is kept only because the original had it.
from urllib.request import FancyURLopener

import requests
from bs4 import BeautifulSoup
  7 
  8 ipregular = r"(([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5]).){3}([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])"
  9 headers = {'user-agent', 'mozilla/5.0 (windows nt 6.1; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/64.0.3282.186 safari/537.36'}
 10 header1 = {'user-agent': 'mozilla/5.0 (x11; linux x86_64) ''applewebkit/537.36 (khtml, like gecko) ''ubuntu chromium/44.0.2403.89 '
 11                          'chrome/44.0.2403.89 ''safari/537.36'}
 12 # 定义用于存放ip的列表
 13 ips = []
 14 proxy_list = []
 15 
 16 
 17 #代理一：
 18 def getiplist1():
 19     for i in range(1, 3):
 20         req = urllib.request.request(url='http://www.xicidaili.com/nt/{0}'.format(i), headers=header1)
 21         r = urllib.request.urlopen(req)
 22         soup = beautifulsoup(r,'html.parser',from_encoding='utf-8')
 23         table = soup.find('table', attrs={'id': 'ip_list'})
 24         tr = table.find_all('tr')[1:]
 25         #解析得到代理ip的地址，端口，和类型
 26         for item in tr:
 27             tds =  item.find_all('td')
 28             temp_dict = {}
 29             kind = "{0}:{1}".format(tds[1].get_text().lower(), tds[2].get_text())
 30             proxy_list.append(kind)
 31     return proxy_list
 32 
 33 #代理二：
 34 def getiplist2():
 35     # 代理网站的地址的格式
 36     # 根据观察url，发现各省的代理ip页面由数字控制
 37     # 所以我们先用占位符{}代替这个数字的位置
 38     url = 'http://www.66ip.cn/areaindex_16/{}.html'
 39     for page in range(10):
 40         # 先填占位符生成一个省的url
 41         url = url.format(page)
 42         # get()方法访问，得到一个response对象
 43         rsp = requests.get(url)
 44         # response对象的text属性得到源码
 45         text = rsp.text
 46         # 用beautifulsoup()方法将源码生成能被解析的lxml格式文件
 47         soup = beautifulsoup(text, 'lxml')
 48         # 用find()找放置ip的表
 49         table = soup.find(name='table', attrs={'border': '2px'})
 50         # 用find_all()找到所以的ip
 51         ip_list = table.find_all(name='tr')
 52         # 循环遍历每个ip
 53         for addr in ip_list:
 54             # 观察源码发现第一个tr里的内容不是ip，所以跳过
 55             if addr == ip_list[0]:
 56                 continue
 57             # 获取ip
 58             ip = addr.find_all(name='td')[0].string
 59             # 获取端口
 60             port = addr.find_all(name='td')[1].string
 61             proxy = '%s:%s' % (ip, port)
 62             proxy_list.append(proxy)
 63     return proxy_list
 64 
 65 #代理三：
 66 def getiplist3():
 67     request_list = []
 68     headers = {
 69         'host': 'www.iphai.com',
 70         'user-agent': 'mozilla/4.0 (compatible; msie 7.0; windows nt 6.0)',
 71         'accept': r'application/json, text/javascript, */*; q=0.01',
 72         'referer': r'http://www.iphai.com',}
 73     request_item = "http://www.iphai.com/free/ng"
 74     request_list.append(request_item)
 75     for req_id in request_list:
 76         req = urllib.request.request(req_id, headers=headers)
 77         response = urllib.request.urlopen(req)
 78         html = response.read().decode('utf-8')
 79 
 80         ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
 81         port_list = re.findall(r'(\s\d+\s)', html)
 82         for i in range(len(ip_list)):
 83             #total_count += 1
 84             ip = ip_list[i]
 85             port = re.sub(r'(^\s*)|(\s*$)/g','',port_list[i])
 86             proxy = '%s:%s' % (ip, port)
 87             proxy_list.append(proxy)
 88     return proxy_list
 89 
 90 
 91 
 92 if __name__ == "__main__":
 93     #选择可以访问的代理
 94     list1 = ['http://www.66ip.cn/','https://www.xicidaili.com','http://www.iphai.com/free/ng']
 95     while 1==1:
 96         for url in list1:
 97             try:
 98                 r = requests.get(url, timeout=5,headers=header1)
 99                 if r.status_code == 200:
100                     if url == list1[0]:
101                         print ("ok 网站访问正常", url)
102                         ips = getiplist2()
103                     elif url == list1[1]:
104                         print ("ok 网站访问正常", url)
105                         ips = getiplist1()
106                     elif url == list1[2]:
107                         print ("ok 网站访问正常", url)
108                         ips = getiplist3()
109                     break
110             except :
111                 print ("error 不能访问！", url)
112                 break
113         print("获取的代理ip为：",ips)
114         time.sleep(10)

上一篇： PHP实现顺时针打印矩阵及螺旋矩阵的方法

下一篇：女性用户看过来这些是最适合你的智能文胸