
Python crawler: batch-scraping proxy IPs, verifying them, and scraping movie/TV information from Douban


This article is intended as a study-notes reference.

【1】Batch-scraping proxy IPs:
Find a third-party proxy-listing site, analyze its page structure, and scrape the IPs in bulk. The scraper lives in Proxies_spider.py, as shown below:

import re
import requests
from retrying import retry


class proxies_spider:
    def __init__(self):
        self.url_temp = "https://www.freeip.top/?page={}&protocol=https"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }
        self.ip_lists = []

    def clear_data(self, res):
        # Each row of the proxy table renders 11 <td> cells; the first two
        # hold the IP and the port, so step through the matches 11 at a time.
        mr = re.findall("<td>(.*?)</td>", res)
        for i in range(0, len(mr), 11):
            self.ip_lists.append(mr[i] + ":" + mr[i + 1])

    @retry(stop_max_attempt_number=3)
    def get_request(self, url):
        # Retry up to 3 times on network errors before giving up on a page
        print("*" * 25)
        res = requests.get(url, headers=self.headers, timeout=3).content.decode("utf-8")
        return res

    def run(self):
        # Build the page URLs in turn, e.g. https://www.freeip.top/?page=1&protocol=https
        num = 1
        while True:
            url = self.url_temp.format(num)
            try:
                res = self.get_request(url)
            except Exception as e:
                print(e)
                break  # all retries failed, so stop paging instead of reusing a stale response

            # Clean the response and append the IPs to the list
            self.clear_data(res)
            # A full page yields 15 proxies; a short page means the last page was reached
            if len(self.ip_lists) % 15 != 0:
                break
            num += 1
            
        print(len(self.ip_lists))
        return self.ip_lists
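
A quick way to smoke-test the spider on its own (a minimal sketch; it assumes the freeip.top table layout is unchanged):

if __name__ == "__main__":
    spider = proxies_spider()
    ips = spider.run()
    print(ips[:5])  # e.g. ["103.28.121.58:3128", ...]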

【2】Verify the proxy IPs and save the working ones to Proxies.txt, as shown below:

import requests
from DouBan_Sprider import Proxies_spider
from retrying import retry
import json


class Proxies_vertify:
    def __init__(self):
        self.url = "https://www.baidu.com"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }
        self.end_ip_lists = []

    @retry(stop_max_attempt_number=2)
    def request_get(self, ip):
        # A proxy passes if the request completes within 2 s; the response body is not needed
        requests.get(self.url, headers=self.headers, proxies=ip, timeout=2)

    def run(self):
        # Scrape a fresh batch of proxies and keep only the ones that respond in time
        proxies_spider = Proxies_spider.proxies_spider()
        ip_list = proxies_spider.run()
        for ip in ip_list:
            try:
                ips = {"https": "https://{}".format(ip)}
                self.request_get(ips)
                self.end_ip_lists.append(ip)
                print("*" * 20)
                print(ip)
            except Exception as e:
                print(e)
        print(self.end_ip_lists)
        # Append the verified batch as one JSON array per line
        with open("Proxies.txt", "a") as f:
            f.write(json.dumps(self.end_ip_lists) + "\n")
        return self.end_ip_lists
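
Since each run appends one JSON array per line, reading the file back means parsing it line by line. A minimal sketch of a loader (load_verified_proxies is a hypothetical helper, not part of the original scripts):

import json


def load_verified_proxies(path="Proxies.txt"):
    # Merge every saved batch into a single de-duplicated list
    proxies = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                proxies.extend(json.loads(line))
    return list(dict.fromkeys(proxies))  # keep order, drop duplicates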

【3】Scrape movie/TV information from Douban, as shown below:

import requests
import json
from DouBan_Sprider import Proxies_vertify
from retrying import retry


class DouBan_spider:
    def __init__(self):
        # URL template for Douban's "hot TV" JSON API; page_start advances in steps of 20
        self.url_list_temp = "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_start={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }

    @retry(stop_max_attempt_number=2)
    def get_request(self, url, proxies):
        # verify=False skips TLS certificate checks through the proxy (urllib3 warns about this)
        res_json = requests.get(url, headers=self.headers, proxies=proxies, timeout=3, verify=False)
        assert res_json.status_code == 200
        return res_json.content.decode("utf-8")

    def write_data(self, res_list):
        # utf-8 is safer than gbk here: some titles contain characters GBK cannot encode
        with open("douban_sprider.txt", "a", encoding="utf-8") as f:
            for data in res_list:
                rate = data["rate"]
                title = data["title"]
                url = data["url"]
                cover = data["cover"]
                f.write("Title: " + title + ", Rating: " + rate + ", URL: " + url + ", Cover: " + cover + "\n")

    def run(self):
        # page_start advances by 20 per page
        num = 0
        count = 0
        Vertify = Proxies_vertify.Proxies_vertify()
        ip_list = Vertify.run()
        while True:
            url = self.url_list_temp.format(num)
            # Rotate through the verified proxies round-robin
            proxies = {"https": "https://{}".format(ip_list[count])}
            count = (count + 1) % len(ip_list)
            try:
                res_json = self.get_request(url, proxies)
                res_list = json.loads(res_json)["subjects"]
                if not res_list:
                    break  # an empty "subjects" array means the last page was reached
                # Write the batch to douban_sprider.txt
                self.write_data(res_list)
                print("Page " + str(int(num / 20) + 1))
                num += 20
            except Exception as e:
                print(e)  # this proxy failed; retry the same page with the next one


if __name__ == '__main__':
    douban_spider = DouBan_spider()
    douban_spider.run()
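
Because get_request passes verify=False, urllib3 emits an InsecureRequestWarning on every call. If that noise is unwanted, it can be silenced near the top of the script; a minimal sketch using urllib3's standard API:

import urllib3

# Certificate verification is disabled on purpose for the proxied requests,
# so acknowledge it once instead of printing a warning per request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)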

Proxy IPs scraped in a single run and verified to respond quickly enough:

["103.28.121.58:3128", "47.99.65.77:3128", "51.254.167.223:3128", "47.100.120.52:3128", "165.227.84.45:3128", "51.158.68.133:8811", "68.183.232.63:3128", "43.255.228.150:3128", "116.196.85.166:3128", "116.196.85.166:3128", "51.158.68.26:8811", "163.172.189.32:8811", "51.158.113.142:8811", "51.158.106.54:8811", "159.203.8.223:3128", "218.60.8.99:3129", "210.22.5.117:3128", "51.158.98.121:8811", "163.172.136.226:8811", "163.172.152.52:8811", "103.28.121.58:80", "128.199.250.116:3128", "136.243.47.220:3128", "89.37.122.216:3129", "210.26.49.88:3128", "51.158.123.35:8811", "128.199.251.126:3128", "51.158.99.51:8811", "180.210.201.57:3130", "178.128.50.214:8080", "138.197.5.192:8888", "121.225.199.78:3128", "121.225.199.78:3128", "138.197.5.192:8888", "163.172.147.94:8811", "118.70.144.77:3128", "128.199.205.90:3128", "167.71.95.30:3128", "51.158.68.68:8811", "47.104.234.32:3128", "120.234.63.196:3128", "122.70.148.66:808", "116.196.85.150:3128", "128.199.227.242:3128", "167.172.135.255:8080", "68.183.123.186:8080", "14.207.26.94:8080", "14.207.124.80:8080", "51.158.108.135:8811", "51.158.111.229:8811", "159.203.44.177:3128", "209.97.174.166:8080", "52.80.58.248:3128"]

You can also scrape several batches, verify them in one pass, write the results to a file, and then cycle through the proxies in the list while scraping the target site, as sketched below.
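
A tidy way to do that rotation is itertools.cycle. A minimal sketch (load_verified_proxies is the hypothetical loader sketched in section 【2】):

from itertools import cycle

proxy_pool = cycle(load_verified_proxies())


def next_proxies():
    # Each call returns the next proxy, wrapping around at the end of the list
    ip = next(proxy_pool)
    return {"https": "https://{}".format(ip)}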
The JSON responses were analyzed with the Chrome extension JSON Viewer.
[Figure: JSON Viewer rendering of the Douban API response]