(娱乐)爬虫工具htmlunit
程序员文章站
2022-05-05 14:21:30
...
package com.example.demo2;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.LogFactory;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.HttpHostConnectException;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @description:
* @author: kaiba
* @date: 2019/12/27
*/
@Component
public class PaChongUtils {
private static WebClient webClient = new WebClient(BrowserVersion.BEST_SUPPORTED);
/**
* openUrl 方法
* 说明:
* 创建人:yangkai
* @param url:
* @return com.gargoylesoftware.htmlunit.html.HtmlPage
* @throws
*/
public static HtmlPage openUrl(String url,ProxyConfig proxyConfig){
webClient.getOptions().setJavaScriptEnabled(true); // 启用JS解释器,默认为true
webClient.getOptions().setCssEnabled(false); // 禁用css支持
webClient.getOptions().setThrowExceptionOnScriptError(false); // js运行错误时,是否抛出异常
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setTimeout(10*1000); // 设置连接超时时间
webClient.getOptions().setUseInsecureSSL(true); // 启用ssl
webClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要,设置支持AJAX
if(proxyConfig!=null){
webClient.getOptions().setProxyConfig(proxyConfig);
}else {
}
webClient.waitForBackgroundJavaScript(500); // 等待js后台执行0.5秒
try {
HtmlPage htmlPage = webClient.getPage(url);
return htmlPage;
}catch (ConnectTimeoutException e){
e.printStackTrace();
return null;
}catch (HttpHostConnectException e){
e.printStackTrace();
return null;
}catch (SocketTimeoutException e){
e.printStackTrace();
return null;
}catch (IOException e){
e.printStackTrace();
return null;
}
}
// 正则匹配
public static List<String> getMatherSubstrs(String str, String regex) {
List<String> list = new ArrayList<String>();
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(str);
while (m.find()) {
list.add(m.group());
}
return list;
}
}
上一篇: Jsoup+HtmlUnit 爬虫