JAVA实现网页抓取(htmlunit)
程序员文章站
2022-05-05 14:51:54
...
准备条件
加入依赖jar包
<!-- HtmlUnit 2.15, the library the crawler code below is written against. -->
<!-- NOTE(review): with scope "provided" Maven does not package this jar with the application; confirm the runtime classpath supplies it. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.15</version>
<scope>provided</scope>
</dependency>
代码示例
/**
 * Creates the {@link WebClient} used for crawling, emulating Chrome.
 * <p>
 * JavaScript execution and CSS handling are both switched off, the connection
 * timeout and the JavaScript timeout are each set to 8000, and script errors
 * are configured not to raise exceptions. The cache and the
 * failing-status-code behaviour are not configured here.
 *
 * @return the configured client
 * @throws IOException declared for callers; no statement in this body throws it explicitly
 */
private WebClient initWc() throws IOException {
    final WebClient client = new WebClient(BrowserVersion.CHROME);

    // Option switches; these setters are independent of each other.
    client.getOptions().setCssEnabled(false);
    client.getOptions().setJavaScriptEnabled(false);
    client.getOptions().setThrowExceptionOnScriptError(false);
    client.getOptions().setTimeout(8000);

    // Script-related settings on the client itself.
    client.setJavaScriptTimeout(8000);
    client.setAjaxController(new NicelyResynchronizingAjaxController());
    // NOTE(review): no page has been loaded yet and JavaScript is disabled above,
    // so this wait presumably returns immediately - confirm whether it is needed.
    client.waitForBackgroundJavaScript(8000);

    return client;
}
/**
 * Crawls listing pages 1 through 97 using one shared {@link WebClient}.
 * <p>
 * Every page is handed to {@code loadPage}, and the DAO is flushed after each
 * page. Page numbers that {@code loadPage} reports as failed are accumulated
 * in a buffer and logged once after the loop. If the client cannot be created
 * the failure is logged and nothing is crawled. The client's windows are
 * always closed before returning.
 */
public void loadData() {
    WebClient wc;
    try {
        wc = initWc();
    } catch (IOException e) {
        // Without a client there is nothing to crawl and nothing to close, so stop
        // here instead of failing on every page and then on closeAllWindows().
        log.warn("init WebClient error! ", e);
        return;
    }

    try {
        // Register the TIFF reader/writer plug-ins; used when recognizing text in images.
        IIORegistry registry = IIORegistry.getDefaultInstance();
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi());
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi());

        // Comma-separated list of page numbers that failed, filled in by loadPage.
        StringBuffer errPage = new StringBuffer();
        final int lastPage = 97;
        for (int pageNo = 1; pageNo <= lastPage; pageNo++) {
            loadPage(pageNo, errPage, wc);
            riskCompanyDao.flush();
        }
        log.info("errPage:" + errPage);
    } catch (Exception e) {
        log.warn("loadData error! ", e);
    } finally {
        wc.closeAllWindows();
    }
}
/**
 * Fetches one listing page and walks the rows of its first table.
 * <p>
 * For each table row, the first cell is read: the text of its first link
 * (company name), the text after the colon in its first and second
 * {@code div} (industry and address), and, when the third {@code div}
 * contains an image, the string that {@code ImageRead.getImgStr} returns for
 * that image's adjusted URL (logged as the mobile number).
 * <p>
 * Any exception while fetching or parsing marks the whole page as failed: the
 * page number is appended to {@code errPage} and a warning is logged.
 * <p>
 * NOTE(review): the company name, industry and address are extracted but not
 * stored anywhere in the visible code - confirm where they should be persisted.
 *
 * @param pageNo  page number substituted into the listing URL
 * @param errPage buffer receiving {@code pageNo} followed by a comma on failure
 * @param wc      client used to issue the request
 */
private void loadPage(int pageNo, StringBuffer errPage, WebClient wc) {
    try {
        URL link = new URL("http://www.kstba.org/minglu-79-" + pageNo + ".html");
        WebRequest request = new WebRequest(link);
        request.setCharset("UTF-8");
        // Set the Referer field of the request header.
        request.setAdditionalHeader("Referer", "http://www.baidu.com/");
        // Set the User-Agent field of the request header.
        request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
        request.setAdditionalHeader("Connection", "keep-alive");
        request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770");

        HtmlPage page = wc.getPage(request);
        HtmlTable table = (HtmlTable) page.getElementsByTagName("table").get(0);
        // The table body is taken as the table's second child node.
        HtmlTableBody body = (HtmlTableBody) table.getChildNodes().get(1);

        for (DomNode node : body.getChildNodes()) {
            if (!(node instanceof HtmlTableRow)) {
                continue;
            }
            List<HtmlTableCell> cells = ((HtmlTableRow) node).getCells();
            HtmlTableCell cell = cells.get(0);
            List<? extends DomNode> divs = cell.getElementsByTagName("div");

            String companyName = cell.getElementsByTagName("a").get(0).getTextContent();
            // Industry and address share the same "label:value" handling; a missing
            // value yields null instead of aborting the rest of the page.
            String industryName = textAfterColon(divs.get(0).getTextContent());
            String addr = textAfterColon(divs.get(1).getTextContent());

            String mobile = null;
            if (divs.size() > 2) {
                List<? extends DomNode> images = cell.getElementsByTagName("div").get(2).getElementsByTagName("img");
                if (!images.isEmpty()) {
                    HtmlImage img = (HtmlImage) images.get(0);
                    String imgStr = img.getAttribute("src");
                    // Drop everything from "&font=" onwards when present; a URL
                    // without that marker is used as-is instead of throwing.
                    int fontParam = imgStr.indexOf("&font=");
                    if (fontParam >= 0) {
                        imgStr = imgStr.substring(0, fontParam);
                    }
                    // Request the image at font size 22 instead of 12
                    // (presumably to make the text easier to recognize).
                    imgStr = imgStr.replace("fontsize=12", "fontsize=22");
                    mobile = ImageRead.getImgStr(imgStr);
                    log.info("mobile:" + mobile);
                }
            }
        }
    } catch (Exception e) {
        errPage.append(pageNo).append(",");
        log.warn("page error :" + pageNo, e);
    }
}

/**
 * Returns the second segment of {@code text} when split on {@code ":"}.
 *
 * @param text text of the form {@code label:value}
 * @return the segment after the first colon, or {@code null} when the split
 *         yields fewer than two segments
 */
private String textAfterColon(String text) {
    String[] parts = text.split(":");
    return parts.length > 1 ? parts[1] : null;
}
注意事项
1、普通的httpConnection容易被拦截,需设置请求报文头,模拟浏览器请求
2、WebClient在请求发起前初始化一次即可
3、不同浏览器版本返回的html代码有一定差异,需单独调试
上一篇: HtmlUnit测试单元做爬虫