欢迎您访问程序员文章站！本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页

webMagic 代理池

程序员文章站 2022-05-02 22:12:29
...
package com.example.csdn.bean;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;

import java.io.IOException;
import java.util.*;


/**
 * Scrapes a list of free HTTP proxies with WebMagic and then requests randomly
 * chosen blog posts through each of them in an endless loop.
 *
 * <p>The class doubles as the WebMagic page model: {@link #afterProcess(Page)}
 * is invoked for every crawled proxy-listing page and fills the shared proxy pool.
 * All state is static and the class is not designed for concurrent use.
 */
public class Main implements AfterExtractor {
    // Number of failed requests since the proxy pool was last rebuilt.
    private static int errorCount = 0;
    // Reload threshold: the pool is discarded once errorCount exceeds maxProxySize * reLoadProxy.
    private static final float reLoadProxy = 0.45f;
    // Maximum number of proxies kept in the pool.
    private static final int maxProxySize = 200;
    // Rows scraped from each listing page.
    // NOTE(review): 15 is taken from the original page-count arithmetic (maxProxySize / 15) —
    // confirm against the actual page layout.
    private static final int PROXIES_PER_PAGE = 15;
    // Pause (seconds) before the next round when a round could not issue a single request.
    private static final int IDLE_DELAY_SECONDS = 30;

    // Shared generator; avoids allocating a new Random for every call.
    private static final Random RANDOM = new Random();

    // Proxy pool keyed by proxy address.
    private static Map<String,MyProxy> proxyArr = new HashMap<>();

    // Blog post URLs keyed by consecutive index 0..blogUrlSize-1.
    private static Map<Integer, String> blogUrl = new HashMap<>();

    private static int blogUrlSize = 0;
    private static final String blogHome = "https://blog.csdn.net/qq_36183235";

    private static Logger logger = Logger.getLogger(Main.class);

    /**
     * Runs the crawl/visit loop until the thread is interrupted.
     *
     * <p>Each round refreshes the blog URL list and the proxy pool, then issues one
     * request per live proxy with a random 30-129 second pause between requests.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // sleepThread() restores the interrupt flag, so an interrupt ends the loop
        // instead of turning it into a delay-free hot loop.
        while (!Thread.currentThread().isInterrupted()) {

            loadBlogUrl();
            loadProxy();

            boolean attempted = false;
            if (blogUrlSize > 0) {
                for (MyProxy proxy : proxyArr.values()) {
                    if (Thread.currentThread().isInterrupted()) {
                        break;
                    }
                    // Skip dead proxies but keep trying the remaining ones
                    // (the original 'break' abandoned the whole round here).
                    if (!Boolean.TRUE.equals(proxy.getState())) {
                        continue;
                    }
                    attempted = true;
                    visitThrough(proxy);
                    sleepThread(RANDOM.nextInt(100) + 30);
                }
            } else {
                // Without URLs randomBlogUrl() would throw; wait and retry instead.
                System.err.println("no blog url loaded, retry later");
            }

            if (!attempted) {
                // Nothing could be requested this round; back off before retrying.
                sleepThread(IDLE_DELAY_SECONDS);
            }
        }
    }

    /**
     * Requests one random blog post through the given proxy. On an I/O failure the
     * proxy is marked dead and the global error counter is incremented.
     */
    private static void visitThrough(MyProxy proxy) {
        System.setProperty("https.proxyHost", proxy.getAddr());
        System.setProperty("https.proxyPort", proxy.getPort());
        try {
            Jsoup.connect(blogUrl.get(randomBlogUrl()))
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(5000)
                    .get();
            System.out.println("complete !");
        } catch (IOException e) {
            errorCount++;
            proxy.setState(false);
            e.printStackTrace();
        }
    }

    /**
     * Reloads the blog post URLs from the blog home page. On an I/O failure the
     * previously loaded list is left untouched.
     */
    private static void loadBlogUrl() {
        try {
            // NOTE(review): the page is fetched with POST; a GET looks more natural here — confirm.
            Document doc = Jsoup.connect(blogHome).post();
            Elements h4 = doc.body().getElementsByClass("article-list").select("h4");
            Map<Integer, String> loaded = new HashMap<>();
            for (int i = 0; i < h4.size(); i++) {
                String href = h4.get(i).select("a").attr("href");
                // attr() yields "" when the attribute is missing; such entries cannot be requested.
                if (href != null && !href.trim().isEmpty()) {
                    loaded.put(loaded.size(), href);
                }
            }
            blogUrl = loaded;
            blogUrlSize = loaded.size();
            System.out.println("blog !");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    /**
     * Tops up the proxy pool by crawling the listing pages returned by {@link #links()}.
     *
     * <p>If too many requests have failed since the last rebuild, the pool is
     * discarded first and the failure counter starts over. Nothing is crawled
     * when the pool already holds {@code maxProxySize} entries.
     */
    public static void loadProxy() {
        if (errorCount > maxProxySize * reLoadProxy) {
            proxyArr.clear();
            // Start counting afresh; otherwise every later round would wipe the pool again.
            errorCount = 0;
        }

        if (proxyArr.size() >= maxProxySize) {
            return;
        }

        OOSpider.create(Site.me().setSleepTime(60*1000)
                , Main.class)
                .setIsExtractLinks(false)
                .addUrl(links().toArray(new String [0]))
                .run();
    }

    /**
     * Builds the proxy-listing page URLs to crawl.
     *
     * @return URLs for pages 1..ceil(maxProxySize / PROXIES_PER_PAGE), i.e. enough
     *         pages to be able to fill the pool
     */
    public static List<String> links (){
        // Round up: with plain integer division the crawled pages could never fill the pool.
        int pageCount = (maxProxySize + PROXIES_PER_PAGE - 1) / PROXIES_PER_PAGE;
        List<String> urls = new ArrayList<>(pageCount);
        for (int page = 1; page <= pageCount; page++) {
            urls.add(String.format("https://www.kuaidaili.com/free/inha/%s/", page));
        }
        return urls;
    }

    /**
     * Sleeps for the given number of seconds. If the thread is interrupted the
     * method returns early and leaves the interrupt flag set.
     *
     * @param s seconds to sleep
     */
    public static void sleepThread(int s) {
        try {
            // Multiply as long so large values cannot overflow int.
            long ms = s * 1000L;
            Thread.sleep(ms);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Picks a random index into the loaded blog URLs.
     *
     * @return an index in {@code [0, blogUrlSize)}
     * @throws IllegalArgumentException if no blog URL has been loaded yet
     */
    public static int randomBlogUrl() {
        return RANDOM.nextInt(blogUrlSize);
    }

    /**
     * Extracts address/port pairs from the table rows of a crawled listing page and
     * adds them to the pool as live proxies, until the pool is full. Rows that are
     * missing or have no address/port text are skipped.
     *
     * @param page the crawled proxy-listing page
     */
    @Override
    public void afterProcess(Page page) {

        if (proxyArr.size() >= maxProxySize) {
            return;
        }
        // XPath row positions are 1-based; include the last row of the page.
        for (int row = 1; row <= PROXIES_PER_PAGE; row++) {
            if (proxyArr.size() >= maxProxySize) {
                break;
            }
            String addr = cellText(page, row, 1);
            String port = cellText(page, row, 2);
            if (addr == null || port == null) {
                continue;
            }
            MyProxy proxy = new MyProxy();
            proxy.setAddr(addr);
            proxy.setPort(port);
            proxy.setState(true);
            proxyArr.put(addr, proxy);
        }
        System.out.println("get proxy! Size : "+proxyArr.size());
    }

    /**
     * Reads the text of one table cell of the proxy list.
     *
     * @return the trimmed cell text, or {@code null} when the cell is absent or blank
     */
    private static String cellText(Page page, int row, int column) {
        String raw = String.valueOf(page.getHtml().xpath(
                String.format("//*[@id=\"list\"]/table/tbody/tr[%s]/td[%s]/text()", row, column)));
        if (raw == null) {
            return null;
        }
        String text = raw.trim();
        // String.valueOf turns an absent match into the literal "null".
        if (text.isEmpty() || "null".equals(text)) {
            return null;
        }
        return text;
    }

    /** One proxy entry: address, port and whether it is still considered usable. */
    static class MyProxy {

        private String addr;
        private String port;
        // TRUE while the proxy is usable; set to FALSE after a failed request.
        private Boolean state;

        public String getAddr() {
            return addr;
        }

        public void setAddr(String addr) {
            this.addr = addr;
        }

        public String getPort() {
            return port;
        }

        public void setPort(String port) {
            this.port = port;
        }

        public Boolean getState() {
            return state;
        }

        public void setState(Boolean state) {
            this.state = state;
        }
    }

}

相关标签: JAVA webMagic