欢迎您访问程序员文章站,本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

(娱乐)爬虫工具htmlunit

程序员文章站 2022-05-05 14:21:30
...
package com.example.demo2;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.LogFactory;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.HttpHostConnectException;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small web-crawling helpers built on a single shared HtmlUnit {@link WebClient}.
 *
 * <p>All methods are static and go through the same client instance, so settings such as the
 * proxy are shared between callers. No synchronization is performed here.
 * NOTE(review): HtmlUnit's WebClient is presumably not safe for concurrent use — confirm against
 * the HtmlUnit version in use before calling {@link #openUrl} from several threads.
 *
 * <p>Created 2019/12/27.
 *
 * @author kaiba
 */
@Component
public class PaChongUtils {
    /** Timeout handed to HtmlUnit for establishing connections and reading responses, in ms. */
    private static final int TIMEOUT_MILLIS = 10 * 1000;

    /** Upper bound on the wait for background JavaScript after a page has loaded, in ms. */
    private static final long BACKGROUND_JS_WAIT_MILLIS = 500;

    /** The shared client; configured exactly once by {@link #createWebClient()}. */
    private static final WebClient webClient = createWebClient();

    /**
     * Builds the shared client and applies the options that never change between requests,
     * so they are not re-applied (and the AJAX controller not re-allocated) on every call.
     */
    private static WebClient createWebClient() {
        WebClient client = new WebClient(BrowserVersion.BEST_SUPPORTED);
        client.getOptions().setJavaScriptEnabled(true);                    // run page scripts
        client.getOptions().setCssEnabled(false);                          // skip CSS processing
        client.getOptions().setThrowExceptionOnScriptError(false);         // tolerate broken page scripts
        client.getOptions().setThrowExceptionOnFailingStatusCode(false);   // 4xx/5xx still yield a page
        client.getOptions().setTimeout(TIMEOUT_MILLIS);
        // Accepts any server certificate, valid or not. Convenient for scraping, but it
        // disables TLS certificate validation for every request made through this client.
        client.getOptions().setUseInsecureSSL(true);
        // Re-synchronizes AJAX calls so dynamically loaded content is present in the page.
        client.setAjaxController(new NicelyResynchronizingAjaxController());
        return client;
    }

    /**
     * Loads {@code url} with the shared client and gives background JavaScript up to
     * {@value #BACKGROUND_JS_WAIT_MILLIS} ms to finish before returning the page.
     *
     * <p>Original author of this method: yangkai.
     *
     * @param url         the address to load
     * @param proxyConfig proxy to route requests through; when {@code null} the client's current
     *                    proxy setting is left untouched, which means a proxy installed by an
     *                    earlier call stays in effect
     * @return the loaded page, or {@code null} when the request fails with an
     *         {@link IOException} (connect/read timeout, refused connection, malformed URL, ...)
     *         or when the response is not an HTML page
     */
    public static HtmlPage openUrl(String url,ProxyConfig proxyConfig){
        if (proxyConfig != null) {
            webClient.getOptions().setProxyConfig(proxyConfig);
        }
        try {
            // getPage is generic: assigning its result straight to HtmlPage compiles to an
            // implicit cast that throws ClassCastException for non-HTML responses (JSON, plain
            // text, binary). Hold it untyped and check instead.
            Object page = webClient.getPage(url);
            // The wait only makes sense once the page exists: the scripts it started are what
            // we are waiting for.
            webClient.waitForBackgroundJavaScript(BACKGROUND_JS_WAIT_MILLIS);
            if (page instanceof HtmlPage) {
                return (HtmlPage) page;
            }
            return null;
        } catch (IOException e) {
            // Covers ConnectTimeoutException, HttpHostConnectException and
            // SocketTimeoutException as well; they are all IOException subclasses and were
            // handled identically. The failure is reported on stderr and signalled as null.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Collects every substring of {@code str} matched by {@code regex}, in order of occurrence.
     *
     * @param str   the text to search; {@code null} is treated as "nothing to search"
     * @param regex the regular expression to look for; must be a valid, non-null pattern
     * @return a new mutable list holding the full match ({@code group()}) of each successive
     *         find; empty when there is no match or {@code str} is {@code null}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
     * @throws NullPointerException                   if {@code regex} is {@code null}
     */
    public static List<String> getMatherSubstrs(String str, String regex) {
        // Compile first so an invalid pattern is reported even when there is no text to scan.
        Pattern pattern = Pattern.compile(regex);
        List<String> matches = new ArrayList<String>();
        if (str == null) {
            return matches;
        }
        Matcher matcher = pattern.matcher(str);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}