java之Httpclient,HtmlUnit获取网页资源
程序员文章站
2022-05-05 14:14:55
...
1.获取网页静态资源(没有运行js和ajax)Httpclient
引入依赖:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
使用 CloseableHttpClient 类;DefaultHttpClient 类(即 new DefaultHttpClient())已经过期(deprecated),不建议再使用。
/**
 * Performs an HTTP GET on {@code url} and returns the response body decoded as UTF-8.
 *
 * <p>Only the raw response is fetched; no JavaScript or Ajax on the page is executed.
 * The response's Content-Type, when present, is printed to standard output.
 *
 * @param url the address to request
 * @return the response body, or {@code null} if the response carries no entity
 *         or an {@link IOException} occurs while executing the request
 */
public static String doGet(String url) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // try-with-resources guarantees the response and the client are closed on
    // every path (null entity, exception, normal return), not only on success.
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        // The Content-Type header is optional; guard against a missing one
        // instead of failing with a NullPointerException.
        if (entity.getContentType() != null) {
            System.out.println("Content-Type:" + entity.getContentType().getValue());
        }
        return EntityUtils.toString(entity, "UTF-8");
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
2.获取网页动态资源HtmlUnit
导入依赖
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.18</version>
</dependency>
public static String htmlUnit(String url){
WebClient webClient = new WebClient(BrowserVersion.CHROME);
try {
WebClient wc = new WebClient(BrowserVersion.CHROME);
//是否使用不安全的SSL
wc.getOptions().setUseInsecureSSL(true);
//启用JS解释器,默认为true
wc.getOptions().setJavaScriptEnabled(true);
//禁用CSS
wc.getOptions().setCssEnabled(false);
//js运行错误时,是否抛出异常
wc.getOptions().setThrowExceptionOnScriptError(false);
//状态码错误时,是否抛出异常
wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
//是否允许使用ActiveX
wc.getOptions().setActiveXNative(false);
//等待js时间
wc.waitForBackgroundJavaScript(600*1000);
//设置Ajax异步处理控制器即启用Ajax支持
wc.setAjaxController(new NicelyResynchronizingAjaxController());
//设置超时时间
wc.getOptions().setTimeout(1000000);
//不跟踪抓取
wc.getOptions().setDoNotTrackEnabled(false);
//模拟浏览器打开一个目标网址
HtmlPage htmlPage = wc.getPage(url);
//为了获取js执行的数据 线程开始沉睡等待
Thread.sleep(1000);//这个线程的等待 因为js加载需要时间的
//以xml形式获取响应文本
String xml = htmlPage.asXml();
System.out.println(xml);
return xml;
} catch (Exception e) {
e.printStackTrace();
return null;
}finally {
webClient.close();
}
上一篇: 分手挽回让人忍受不了的5件事!
下一篇: 如何判断真假性分手 假性分手挽回的全攻略