欢迎您访问程序员文章站,本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页

JAVA实现网页抓取(htmlunit)

程序员文章站 2022-05-05 14:51:54
...

准备条件

加入依赖jar包

<!-- HtmlUnit 2.15, declared with Maven scope "provided". -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.15</version>
    <scope>provided</scope>
</dependency>

代码示例

/**
 * Creates and configures the HtmlUnit {@code WebClient} used by the scraper.
 *
 * <p>The client emulates Chrome, has JavaScript and CSS processing switched
 * off, does not throw on script errors, and uses 8000 for both the
 * connection timeout and the JavaScript timeout.
 *
 * @return a freshly configured client; the caller is responsible for closing it
 * @throws IOException declared in the signature, so callers must handle it
 */
private WebClient initWc() throws IOException {
    final WebClient client = new WebClient(BrowserVersion.CHROME);

    // Option flags: only static markup is needed, so scripts and styles are off.
    client.getOptions().setJavaScriptEnabled(false);
    client.getOptions().setCssEnabled(false);
    client.getOptions().setThrowExceptionOnScriptError(false);
    client.getOptions().setTimeout(8000);
    // Optional tweak left disabled by the author:
    // client.getOptions().setThrowExceptionOnFailingStatusCode(false)

    // Client-level JavaScript/AJAX settings.
    client.setJavaScriptTimeout(8000);
    client.setAjaxController(new NicelyResynchronizingAjaxController());
    client.waitForBackgroundJavaScript(8000);
    // A custom Cache could be attached here via client.setCache(...); none is installed.

    return client;
}

/**
 * Scrapes listing pages 1 through 97 with a single shared {@code WebClient}.
 *
 * <p>After each page, {@code riskCompanyDao.flush()} is called. Page numbers
 * that {@code loadPage} reports as failed are collected in {@code errPage}
 * and logged once at the end. If the client cannot be created, the failure
 * is logged and nothing is scraped.
 */
public void loadData() {
    WebClient wc;
    try {
        wc = initWc();
    } catch (IOException e) {
        // Without a client no page can be fetched. Bail out here instead of
        // running every page against a null client and then failing with a
        // NullPointerException in the finally block.
        log.warn("initWc error! ", e);
        return;
    }

    try {
        // Register the TIFF reader/writer plugins; used when extracting text from images.
        IIORegistry registry = IIORegistry.getDefaultInstance();
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi());
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi());

        // StringBuffer is kept because loadPage's signature takes one.
        StringBuffer errPage = new StringBuffer();
        for (int i = 1; i <= 97; i++) {
            loadPage(i, errPage, wc);
            riskCompanyDao.flush();
        }
        log.info("errPage:"+errPage);
        // To debug a single page, call loadPage(pageNo, errPage, wc) directly.
    } catch (Exception e) {
        log.warn("loadData error! ", e);
    } finally {
        // wc is guaranteed non-null here because of the early return above.
        wc.closeAllWindows();
    }
}

/**
 * Fetches one listing page and parses every row of its first table.
 *
 * <p>For each row the first cell is read: the company name from its first
 * {@code <a>}, the industry from its first {@code <div>}, the address from
 * its second {@code <div>}, and, when the third {@code <div>} contains an
 * image, the phone number is obtained by passing the adjusted image URL to
 * {@code ImageRead.getImgStr}.
 *
 * <p>NOTE(review): as shown, the parsed fields are only logged (mobile) or
 * left unused; presumably the persistence step was omitted from this listing.
 *
 * @param pageNo  page number inserted into the listing URL
 * @param errPage accumulator; on any failure {@code pageNo} plus a comma is appended
 * @param wc      client used to issue the request
 */
private void loadPage(int pageNo,StringBuffer errPage, WebClient wc){
    try {
        String refer="http://www.baidu.com/";
        URL link=new URL("http://www.kstba.org/minglu-79-"+pageNo+".html");
        WebRequest request=new WebRequest(link);
        request.setCharset("UTF-8");
        // Browser-like headers: Referer, User-Agent, Connection and Cookie.
        request.setAdditionalHeader("Referer", refer);
        request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
        request.setAdditionalHeader("Connection", "keep-alive");
        request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770");

        HtmlPage page = wc.getPage(request);

        HtmlTable tableResult = (HtmlTable) page.getElementsByTagName("table").get(0);
        HtmlTableBody body = (HtmlTableBody) tableResult.getChildNodes().get(1);
        for ( DomNode node : body.getChildNodes() ) {
            if (!(node instanceof HtmlTableRow)) {
                continue;
            }
            HtmlTableRow row = (HtmlTableRow) node;
            List<HtmlTableCell> cells = row.getCells();
            HtmlTableCell cell0=cells.get(0);

            String companyName = cell0.getElementsByTagName("a").get(0).getTextContent();
            // Industry and address are both "label:value"; a missing value yields
            // null for either field instead of aborting the whole page.
            String industryName = loadPageValueAfterColon(cell0.getElementsByTagName("div").get(0).getTextContent());
            String addr = loadPageValueAfterColon(cell0.getElementsByTagName("div").get(1).getTextContent());

            String mobile =null;
            if (cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").size()>0){
                HtmlImage img =(HtmlImage)cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").get(0);
                String imgStr =img.getAttribute("src");
                // Drop "&font=" and everything after it when present; indexOf
                // returns -1 otherwise, which substring would reject.
                int fontParam = imgStr.indexOf("&font=");
                if (fontParam >= 0) {
                    imgStr = imgStr.substring(0, fontParam);
                }
                // Request the image at font size 22 instead of 12.
                imgStr = imgStr.replace("fontsize=12", "fontsize=22");
                mobile = ImageRead.getImgStr(imgStr);
                log.info("mobile:"+mobile);
            }
        }

    } catch (Exception e) {
        errPage.append(pageNo).append(",");
        log.warn("page error :"+pageNo,e);
    }

}

/**
 * Splits {@code labelled} on ':' and returns the second piece.
 *
 * @param labelled text of the form "label:value"
 * @return the piece after the first ':', or {@code null} when the split
 *         produces fewer than two pieces
 */
private String loadPageValueAfterColon(String labelled) {
    String[] parts = labelled.split(":");
    return parts.length > 1 ? parts[1] : null;
}

注意事项

1、普通的HttpURLConnection请求容易被网站拦截,需设置请求报文头,模拟浏览器请求
2、WebClient在请求发起前初始化一次即可
3、不同浏览器版本返回的HTML代码有一定差异,需单独调试

相关标签: java htmlunit