详解Java实现多种方式的http数据抓取
程序员文章站
2024-03-08 20:27:10
前言:
时下互联网第一波的浪潮已消逝,随之而来的是基于万千数据的物联网时代,因而数据成为企业的重要战略资源之一。基于数据抓取技术,本文介绍了Java相关抓取工具,并附上...
前言:
时下互联网第一波的浪潮已消逝,随之而来的是基于万千数据的物联网时代,因而数据成为企业的重要战略资源之一。基于数据抓取技术,本文介绍了Java相关抓取工具,并附上demo源码供感兴趣的朋友测试!
1)JDK自带HTTP连接,获取页面或JSON
2)JDK自带URL连接,获取页面或JSON
3)HttpClient GET工具,获取页面或JSON
4)commons-io工具,获取页面或JSON
5)jsoup工具(通常用于HTML字段解析),获取页面,非JSON返回格式
--------------------------------------------------------------------------------
完整代码:
package com.yeezhao.common.http;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;

/**
 * Side-by-side comparison of several ways to fetch an HTTP resource as text:
 * the JDK's {@link HttpURLConnection}, {@link URL#openStream()}, Apache Commons
 * HttpClient, Apache Commons IO and jsoup.
 *
 * <p>NOTE(review): the class and method names are all-lowercase, which goes
 * against Java naming conventions. They are kept unchanged so that existing
 * callers keep resolving.
 *
 * @author administrator -> junhong
 *
 * 2016-12-27
 */
public class httpfetchutil {

    /** Timeout, in milliseconds, handed to jsoup when it fetches a page. */
    private static final int JSOUP_TIMEOUT_MILLIS = 2 * 1000;

    /**
     * Returns the HTTP status code the server answers with for the given URL.
     *
     * @param request the URL to request
     * @return the HTTP response code
     * @throws Exception if the URL is malformed or the connection fails
     */
    public static int getresponsecode(String request) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(request).openConnection();
        try {
            return conn.getResponseCode();
        } finally {
            // Release the connection even when reading the status fails.
            conn.disconnect();
        }
    }

    /**
     * 1) JDK built-in HTTP connection: fetches a page or JSON document.
     *
     * <p>A browser-like {@code user-agent} header is sent with the request.
     * Lines of the response are joined with {@code '\n'} (a trailing newline
     * follows the last line).
     *
     * @param request the URL to request
     * @param charset name of the charset used to decode the response body
     * @return the response body, or an empty string when the status is not 200
     * @throws Exception if the URL is malformed, the charset is unsupported or I/O fails
     */
    public static String jdkfetch(String request, String charset) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(request).openConnection();
        try {
            // Emulate a browser.
            conn.setRequestProperty("user-agent", "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36"
                    + " (khtml, like gecko) chrome/45.0.2454.101 safari/537.36");
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return "";
            }
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
            try {
                StringBuilder body = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line).append('\n');
                }
                return body.toString();
            } finally {
                reader.close();
            }
        } finally {
            // Runs on every path, including the non-200 and exception paths.
            conn.disconnect();
        }
    }

    /**
     * 2) JDK built-in URL stream: fetches a page or JSON document.
     *
     * @param request the URL to request
     * @param charset name of the charset used to decode the response body
     * @return the response body
     * @throws Exception if the URL is malformed or I/O fails
     */
    public static String urlfetch(String request, String charset) throws Exception {
        InputStream input = new URL(request).openStream();
        try {
            // Decode with the caller's charset rather than the platform default.
            return IOUtils.toString(input, charset);
        } finally {
            input.close();
        }
    }

    /**
     * 3) Commons HttpClient GET: fetches a page or JSON document.
     *
     * @param url the URL to request
     * @param charset content charset configured on the client
     * @return the response body as a string
     * @throws Exception if the request fails
     */
    public static String httpclientfetch(String url, String charset) throws Exception {
        HttpClient httpClient = new HttpClient();
        httpClient.getParams().setContentCharset(charset);
        HttpMethod method = new GetMethod(url);
        try {
            httpClient.executeMethod(method);
            return method.getResponseBodyAsString();
        } finally {
            // Hand the connection back to the connection manager.
            method.releaseConnection();
        }
    }

    /**
     * 4) Commons IO: fetches a page or JSON document.
     *
     * @param url the URL to request
     * @param charset name of the charset used to decode the response body
     * @return the response body
     * @throws Exception if the URL is malformed or I/O fails
     */
    public static String commonsiofetch(String url, String charset) throws Exception {
        return IOUtils.toString(new URL(url), charset);
    }

    /**
     * 5) jsoup (usually used for parsing HTML fields): fetches a page and
     * returns it as HTML; not meant for JSON responses.
     *
     * @param url the URL to request
     * @return the HTML of the parsed document
     * @throws Exception if the URL is malformed or the fetch fails
     */
    public static String jsoupfetch(String url) throws Exception {
        return Jsoup.parse(new URL(url), JSOUP_TIMEOUT_MILLIS).html();
    }
}
测试代码:
package com.yeezhao.common.http;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Smoke tests for {@code httpfetchutil}. Each test fetches every seed URL and
 * prints the result; nothing is asserted, so the output has to be inspected
 * by hand.
 *
 * <p>Three seed links:
 * <ol>
 *   <li>an encyclopedia web page</li>
 *   <li>interface data fetched while emulating a browser</li>
 *   <li>plain interface data</li>
 * </ol>
 *
 * @author administrator -> junhong
 *
 * 2016-12-27
 */
public class httpfetchutiltest {

    /** URLs every test iterates over. */
    String[] seeds = {
        "http://baike.baidu.com/view/1.htm",
        "http://m.ximalaya.com/tracks/26096131.json",
        "http://remyapi.yeezhao.com/api/query?wd=%e5%91%a8%e6%98%9f%e9%a9%b0%e7%9a%84%e7%94%b5%e5%bd%b1"
    };

    /** Charset name passed to the fetch methods that accept one. */
    final static String default_charset = "utf-8";

    /** No fixture is needed before a test. */
    @Before
    public void setup() throws Exception {
    }

    /** Prints a marker after each test finishes. */
    @After
    public void teardown() throws Exception {
        System.out.println("--- down ---");
    }

    /** Prints the HTTP status code of every seed URL. */
    @Test
    public void testgetresponsecode() throws Exception {
        for (String seed : seeds) {
            int responseCode = httpfetchutil.getresponsecode(seed);
            System.out.println("ret=" + responseCode);
        }
    }

    /** Prints every seed URL's body as fetched through the JDK HTTP connection. */
    @Test
    public void testjdkfetch() throws Exception {
        for (String seed : seeds) {
            String ret = httpfetchutil.jdkfetch(seed, default_charset);
            System.out.println("ret=" + ret);
        }
    }

    /** Prints every seed URL's body as fetched through the JDK URL stream. */
    @Test
    public void testurlfetch() throws Exception {
        for (String seed : seeds) {
            String ret = httpfetchutil.urlfetch(seed, default_charset);
            System.out.println("ret=" + ret);
        }
    }

    /** Prints every seed URL's body as fetched through Commons HttpClient. */
    @Test
    public void testhttpclientfetch() throws Exception {
        for (String seed : seeds) {
            String ret = httpfetchutil.httpclientfetch(seed, default_charset);
            System.out.println("ret=" + ret);
        }
    }

    /** Prints every seed URL's body as fetched through Commons IO. */
    @Test
    public void testcommonsiofetch() throws Exception {
        for (String seed : seeds) {
            String ret = httpfetchutil.commonsiofetch(seed, default_charset);
            System.out.println("ret=" + ret);
        }
    }

    /** Prints every seed URL's HTML as fetched through jsoup. */
    @Test
    public void testjsoupfetch() throws Exception {
        for (String seed : seeds) {
            String ret = httpfetchutil.jsoupfetch(seed);
            System.out.println("ret=" + ret);
        }
    }
}
附:相关jar依赖
... <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.7.3</version> </dependency> <dependency> <groupId>commons-httpclient</groupId> <artifactId>commons-httpclient</artifactId> <version>3.1</version> </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.4</version> </dependency> ...
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。
上一篇: ThinkPHP实现分页功能