webMagic 代理池
程序员文章站
2022-05-02 22:12:29
...
package com.example.csdn.bean;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import java.io.IOException;
import java.util.*;
/**
 * Repeatedly requests random articles of one CSDN blog through free HTTP proxies.
 *
 * <p>Each round reloads the article URL list, tops up the proxy pool by crawling
 * the kuaidaili free-proxy listing with WebMagic, and then issues one jsoup GET
 * per live proxy. A proxy whose request fails is marked dead; once too many
 * requests have failed the whole pool is discarded and crawled again.
 *
 * <p>All state is kept in static fields and no synchronization is performed.
 */
public class Main implements AfterExtractor {
    // Number of failed requests since the proxy pool was last rebuilt.
    private static int errorCount = 0;
    // Re-crawl threshold: the pool is discarded once errorCount exceeds
    // maxProxySize * reLoadProxy.
    private static final float reLoadProxy = 0.45f;
    // Maximum number of proxies kept in the pool.
    private static final int maxProxySize = 200;
    // Rows read from each listing page.
    // NOTE(review): assumes the listing shows 15 proxies per page (the original
    // divided maxProxySize by 15) - confirm. Missing rows are skipped.
    private static final int ROWS_PER_PAGE = 15;
    // Pause, in seconds, before retrying a round in which no request could be made.
    private static final int RETRY_DELAY_SECONDS = 30;
    private static final String HTTPS_PROXY_HOST = "https.proxyHost";
    private static final String HTTPS_PROXY_PORT = "https.proxyPort";
    private static final Random RANDOM = new Random();
    // Proxy pool keyed by proxy address.
    private static Map<String,MyProxy> proxyArr = new HashMap<>();
    // Article URLs keyed by a dense index 0..blogUrlSize-1.
    private static Map<Integer, String> blogUrl = new HashMap<>();
    private static int blogUrlSize = 0;
    private static final String blogHome = "https://blog.csdn.net/qq_36183235";
    private static final Logger logger = Logger.getLogger(Main.class);

    /**
     * Runs request rounds until the main thread is interrupted.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        while (!Thread.currentThread().isInterrupted()) {
            loadBlogUrl();
            loadProxy();
            boolean attempted = false;
            if (blogUrlSize > 0) {
                for (MyProxy proxy : proxyArr.values()) {
                    if (Thread.currentThread().isInterrupted()) {
                        break;
                    }
                    // Skip dead proxies but keep going: a dead entry must not
                    // hide the live proxies that follow it in the map.
                    if (!Boolean.TRUE.equals(proxy.getState())) {
                        continue;
                    }
                    attempted = true;
                    visitThroughProxy(proxy);
                    sleepThread(RANDOM.nextInt(100) + 30);
                }
            } else {
                logger.warn("no blog urls loaded from " + blogHome);
            }
            if (!attempted) {
                // Nothing usable this round; back off instead of spinning.
                sleepThread(RETRY_DELAY_SECONDS);
            }
        }
    }

    /**
     * Requests one random article through the given proxy and marks the proxy
     * dead when the request fails. The JVM-wide HTTPS proxy properties are set
     * only for the duration of the request and restored afterwards, so later
     * direct requests are not routed through a stale proxy.
     */
    private static void visitThroughProxy(MyProxy proxy) {
        String previousHost = System.getProperty(HTTPS_PROXY_HOST);
        String previousPort = System.getProperty(HTTPS_PROXY_PORT);
        System.setProperty(HTTPS_PROXY_HOST, proxy.getAddr());
        System.setProperty(HTTPS_PROXY_PORT, proxy.getPort());
        try {
            Jsoup.connect(blogUrl.get(randomBlogUrl()))
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(5000)
                    .get();
            System.out.println("complete !");
        } catch (IOException e) {
            errorCount++;
            proxy.setState(false);
            logger.warn("request through proxy " + proxy.getAddr() + ":" + proxy.getPort() + " failed", e);
        } finally {
            restoreProperty(HTTPS_PROXY_HOST, previousHost);
            restoreProperty(HTTPS_PROXY_PORT, previousPort);
        }
    }

    /** Puts a system property back to a previously captured value (null means unset). */
    private static void restoreProperty(String key, String previous) {
        if (previous == null) {
            System.clearProperty(key);
        } else {
            System.setProperty(key, previous);
        }
    }

    /**
     * Reloads the article URL list from the blog home page. When the page cannot
     * be fetched or yields no links, the previously loaded list is kept.
     */
    private static void loadBlogUrl() {
        try {
            // NOTE(review): the home page is fetched with POST, as in the
            // original code - confirm that GET would not be more appropriate.
            Document doc = Jsoup.connect(blogHome).post();
            Elements h4 = doc.body().getElementsByClass("article-list").select("h4");
            Map<Integer, String> fresh = new HashMap<>();
            int count = 0;
            for (int i = 0; i < h4.size(); i++) {
                String href = h4.get(i).select("a").attr("href");
                // Headings without a link would later make Jsoup.connect fail.
                if (href == null || href.trim().isEmpty()) {
                    continue;
                }
                fresh.put(count++, href);
            }
            if (count == 0) {
                logger.warn("no article links found on " + blogHome);
                return;
            }
            blogUrl.clear();
            blogUrl.putAll(fresh);
            blogUrlSize = count;
            System.out.println("blog !");
        } catch (IOException e) {
            logger.warn("failed to load blog url list from " + blogHome, e);
        }
    }

    /**
     * Tops up the proxy pool. The pool is discarded first when the failure count
     * exceeds maxProxySize * reLoadProxy; nothing is crawled when the pool is
     * already full.
     */
    public static void loadProxy() {
        if (errorCount > maxProxySize * reLoadProxy) {
            proxyArr.clear();
            // Start counting afresh for the new pool; otherwise every later
            // round would wipe the pool again.
            errorCount = 0;
        }
        if (proxyArr.size() >= maxProxySize) {
            return;
        }
        OOSpider.create(Site.me().setSleepTime(60*1000)
                , Main.class)
                .setIsExtractLinks(false)
                .addUrl(links().toArray(new String[0]))
                .run();
    }

    /**
     * Builds the listing-page URLs to crawl.
     *
     * @return enough page URLs (numbered from 1) to cover maxProxySize proxies
     *         at ROWS_PER_PAGE proxies per page
     */
    public static List<String> links() {
        // Round up so the pool can actually reach maxProxySize.
        int pages = (maxProxySize + ROWS_PER_PAGE - 1) / ROWS_PER_PAGE;
        List<String> urls = new ArrayList<>(pages);
        for (int page = 1; page <= pages; page++) {
            urls.add(String.format("https://www.kuaidaili.com/free/inha/%s/", page));
        }
        return urls;
    }

    /**
     * Sleeps the current thread. When the sleep is interrupted the interrupt
     * flag is set again so callers can observe it.
     *
     * @param s number of seconds to sleep
     */
    public static void sleepThread(int s) {
        try {
            // Multiply as long: s * 1000 in int arithmetic overflows for large s.
            Thread.sleep(s * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("sleep interrupted", e);
        }
    }

    /**
     * Picks a random index into the loaded article URLs.
     *
     * @return an index in the range 0 (inclusive) to blogUrlSize (exclusive)
     * @throws IllegalArgumentException if no article URL has been loaded yet
     */
    public static int randomBlogUrl() {
        return RANDOM.nextInt(blogUrlSize);
    }

    /**
     * Collects the proxies listed on a crawled page into the pool until the
     * pool is full. Rows without an address or port are skipped.
     *
     * @param page the crawled listing page
     */
    @Override
    public void afterProcess(Page page) {
        if (proxyArr.size() >= maxProxySize) {
            return;
        }
        for (int row = 1; row <= ROWS_PER_PAGE && proxyArr.size() < maxProxySize; row++) {
            String addr = cellText(page, row, 1);
            String port = cellText(page, row, 2);
            if (addr == null || port == null) {
                continue;
            }
            MyProxy proxy = new MyProxy();
            proxy.setAddr(addr);
            proxy.setPort(port);
            proxy.setState(true);
            proxyArr.put(addr, proxy);
        }
        System.out.println("get proxy! Size : "+proxyArr.size());
    }

    /**
     * Reads the text of one cell of the proxy table.
     *
     * @return the trimmed cell text, or null when the cell is missing or blank
     */
    private static String cellText(Page page, int row, int column) {
        String text = page.getHtml()
                .xpath(String.format("//*[@id=\"list\"]/table/tbody/tr[%s]/td[%s]/text()", row, column))
                .get();
        if (text == null) {
            return null;
        }
        String trimmed = text.trim();
        return trimmed.isEmpty() ? null : trimmed;
    }

    /** One proxy of the pool: address, port and a flag telling whether it is still usable. */
    static class MyProxy {
        private String addr;
        private String port;
        private Boolean state;

        public String getAddr() {
            return addr;
        }

        public void setAddr(String addr) {
            this.addr = addr;
        }

        public String getPort() {
            return port;
        }

        public void setPort(String port) {
            this.port = port;
        }

        public Boolean getState() {
            return state;
        }

        public void setState(Boolean state) {
            this.state = state;
        }
    }
}
推荐阅读
-
CentOS 32位搭建squid http代理,解决TCP_MISS/503
-
CentOS 32位搭建squid http代理,解决TCP_MISS/503
-
php curl 代理问题
-
MySQL详解(7)-----------MySQL线程池总结(一)_MySQL
-
Java笔记-连接本地代理服务
-
Nginx反向代理Odoo并转为https
-
100行PHP代码实现socks5代理服务器,100行socks5_PHP教程
-
spring 下配置 dbcp,c3p0,proxool 等数据源连接池
-
spring 下配置 dbcp,c3p0,proxool 等数据源连接池
-
Spring 5.x 源码 —Spring AOP源码—代理方法的调用与增强