欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

java之Httpclient,HtmlUnit获取网页资源

程序员文章站 2022-05-05 14:14:55
...

1.获取网页静态资源(没有运行js和ajax)Httpclient

	引入依赖:
	 <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.2</version>
    </dependency>
	


使用CloseableHttpClient类;DefaultHttpClient类(即new DefaultHttpClient())已经过期(deprecated),不建议再使用。
/**
 * Performs an HTTP GET on {@code url} and returns the response body decoded as UTF-8.
 *
 * <p>Only the raw response is fetched; no JavaScript or AJAX in the page is executed.
 * The Content-Type header value of the response entity is printed to standard output.
 *
 * @param url the address to request
 * @return the response body as a string, or {@code null} when the response carries no
 *         entity or an {@link IOException} occurs (the stack trace is printed)
 */
public static String doGet(String url) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // try-with-resources closes the response and then the client on every exit path,
    // including the "no entity" return and the exception path.
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        // The Content-Type header may be absent, in which case getContentType() is null.
        String contentType = entity.getContentType() != null
                ? entity.getContentType().getValue()
                : null;
        System.out.println("Content-Type:" + contentType);
        return EntityUtils.toString(entity, "UTF-8");
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

2.获取网页动态资源HtmlUnit

导入依赖
 <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.18</version>
    </dependency>
public static String htmlUnit(String url){
    WebClient webClient = new WebClient(BrowserVersion.CHROME);
    try {
        WebClient wc = new WebClient(BrowserVersion.CHROME);
                 //是否使用不安全的SSL
                 wc.getOptions().setUseInsecureSSL(true);
                 //启用JS解释器,默认为true
                 wc.getOptions().setJavaScriptEnabled(true);
                 //禁用CSS
                 wc.getOptions().setCssEnabled(false);
                 //js运行错误时,是否抛出异常
                 wc.getOptions().setThrowExceptionOnScriptError(false);
                 //状态码错误时,是否抛出异常
                 wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
                 //是否允许使用ActiveX
                 wc.getOptions().setActiveXNative(false);
                 //等待js时间
                wc.waitForBackgroundJavaScript(600*1000);
                 //设置Ajax异步处理控制器即启用Ajax支持
                 wc.setAjaxController(new NicelyResynchronizingAjaxController());
                 //设置超时时间
                 wc.getOptions().setTimeout(1000000);
                 //不跟踪抓取
                 wc.getOptions().setDoNotTrackEnabled(false);
                    //模拟浏览器打开一个目标网址
                    HtmlPage htmlPage = wc.getPage(url);
                     //为了获取js执行的数据 线程开始沉睡等待
                     Thread.sleep(1000);//这个线程的等待 因为js加载需要时间的
                    //以xml形式获取响应文本
                     String xml = htmlPage.asXml();
                     System.out.println(xml);
                 return xml;       
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }finally {
        webClient.close();
    }
相关标签: springboot