java代理实现爬取代理IP的示例
程序员文章站
2023-12-09 16:29:51
仅仅使用了一个java文件,运行main方法即可,需要依赖的jar包是com.alibaba.fastjson(版本1.2.28)和jsoup(版本1.10.2)
如果用...
仅仅使用了一个java文件,运行main方法即可,需要依赖的jar包是com.alibaba.fastjson(版本1.2.28)和jsoup(版本1.10.2)
如果使用 Maven,则在 pom.xml 中添加以下两个依赖:
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
完整的代码如下:
package com.tuniu.fcm.facade.ipproxy; import com.alibaba.fastjson.jsonobject; import org.jsoup.jsoup; import org.jsoup.nodes.document; import java.util.arraylist; import java.util.hashmap; import java.util.list; import java.util.map; import java.util.regex.matcher; import java.util.regex.pattern; /** * 获取代理ip,需要 * com.alibaba.fastjson.jsonobject以及jsoup */ public class proxycralwerunusedvpn { threadlocal<integer> localwantednumber = new threadlocal<integer>(); threadlocal<list<proxyinfo>> localproxyinfos = new threadlocal<list<proxyinfo>>(); public static void main(string[] args) { proxycralwerunusedvpn proxycrawler = new proxycralwerunusedvpn(); /** * 想要获取的代理ip个数,由需求方自行指定。(如果个数太多,将导致返回变慢) */ proxycrawler.startcrawler(1); } /** * 暴露给外部模块调用的入口 * @param wantednumber 调用方期望获取到的代理ip个数 */ public string startcrawler(int wantednumber) { localwantednumber.set(wantednumber); kuaidailicom("http://www.xicidaili.com/nn/", 15); kuaidailicom("http://www.xicidaili.com/nt/", 15); kuaidailicom("http://www.xicidaili.com/wt/", 15); kuaidailicom("http://www.kuaidaili.com/free/inha/", 15); kuaidailicom("http://www.kuaidaili.com/free/intr/", 15); kuaidailicom("http://www.kuaidaili.com/free/outtr/", 15); /** * 构造返回数据 */ proxyresponse response = new proxyresponse(); response.setsuccess("true"); map<string, object> datainfomap = new hashmap<string, object>(); datainfomap.put("numfound", localproxyinfos.get().size()); datainfomap.put("pagenum", 1); datainfomap.put("proxy", localproxyinfos.get()); response.setdata(datainfomap); string responsestring = jsonobject.tojson(response).tostring(); system.out.println(responsestring); return responsestring; } private void kuaidailicom(string baseurl, int totalpage) { string ipreg = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}"; pattern ipptn = pattern.compile(ipreg); for (int i = 1; i < totalpage; i++) { if (getcurrentproxynumber() >= localwantednumber.get()) { return; } try { document doc = jsoup.connect(baseurl + i + "/") .header("accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("accept-encoding", "gzip, deflate, sdch") .header("accept-language", "zh-cn,zh;q=0.8,en;q=0.6") .header("cache-control", "max-age=0") .header("user-agent", "mozilla/5.0 (macintosh; intel mac os x 10_11_4) applewebkit/537.36 (khtml, like gecko) chrome/51.0.2704.103 safari/537.36") .header("cookie", "hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=ga1.2.1061361785.1462812244") .header("host", "www.kuaidaili.com") .header("referer", "http://www.kuaidaili.com/free/outha/") .timeout(30 * 1000) .get(); matcher m = ipptn.matcher(doc.text()); while (m.find()) { if (getcurrentproxynumber() >= localwantednumber.get()) { break; } string[] strs = m.group().split(" "); if (checkproxy(strs[0], integer.parseint(strs[1]))) { system.out.println("获取到可用代理ip\t" + strs[0] + "\t" + strs[1]); addproxy(strs[0], strs[1], "http"); } } } catch (exception e) { e.printstacktrace(); } } } private static boolean checkproxy(string ip, integer port) { try { //http://1212.ip138.com/ic.asp 可以换成任何比较快的网页 jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2 * 1000) .proxy(ip, port) .get(); return true; } catch (exception e) { return false; } } private int getcurrentproxynumber() { list<proxyinfo> proxyinfos = localproxyinfos.get(); if (proxyinfos == null) { proxyinfos = new arraylist<proxyinfo>(); localproxyinfos.set(proxyinfos); return 0; } else { return proxyinfos.size(); } } private void addproxy(string ip, string port, string protocol){ list<proxyinfo> proxyinfos = localproxyinfos.get(); if (proxyinfos == null) { proxyinfos = new arraylist<proxyinfo>(); proxyinfos.add(new proxyinfo(ip, port, protocol)); } else { proxyinfos.add(new proxyinfo(ip, port, protocol)); } } } class proxyinfo { private string username = ""; private string ip; private string password = ""; private string type; private string port; private int is_internet = 1; public proxyinfo(string ip, string port, string type) { 
this.ip = ip; this.type = type; this.port = port; } public string getusername() { return username; } public void setusername(string username) { this.username = username; } public string getip() { return ip; } public void setip(string ip) { this.ip = ip; } public string getpassword() { return password; } public void setpassword(string password) { this.password = password; } public string gettype() { return type; } public void settype(string type) { this.type = type; } public string getport() { return port; } public void setport(string port) { this.port = port; } public int getis_internet() { return is_internet; } public void setis_internet(int is_internet) { this.is_internet = is_internet; } } class proxyresponse { private string success; private map<string, object> data; public string getsuccess() { return success; } public void setsuccess(string success) { this.success = success; } public map<string, object> getdata() { return data; } public void setdata(map<string, object> data) { this.data = data; } }
以上这篇java代理实现爬取代理ip的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持。