初学java爬虫,用htmlunit + jsoup解析JavaScript
程序员文章站
2022-05-05 14:48:36
...
在用jsoup爬晋江的时候,模拟登陆后仍然获取不到v章内容。经验证,这是因为jsoup无法执行JavaScript,而v章内容是由js动态加载的。经过查阅资料,最后使用htmlunit+jsoup来实现。
登陆和获取cookies在上一篇已经写了,不再赘述。
/**
 * Downloads a chapter page with HtmlUnit (so that the page's JavaScript is executed),
 * then extracts the text of the first element with class {@code noveltext} via jsoup.
 *
 * <p>The request is sent through the proxy and with the cookies / user agent configured
 * in the enclosing class's fields (not visible in this block).
 *
 * @param chapter  the chapter to fetch; when its URL is {@code null} (locked chapter) the
 *                 chapter's existing content is returned unchanged
 * @param novelUrl URL of the novel's index page, sent as the {@code Referer} header
 * @return the chapter text with the page header/footer removed when both markers are found;
 *         {@code chapter.getContent()} when the URL is malformed, the page cannot be
 *         loaded, or the page has no {@code noveltext} element
 */
public static String getChapterContent(Chapter chapter, String novelUrl) {
    System.out.println("正在获取第"+chapter.getChapterNum()+"章 "+chapter.getChapterTitle());
    // Locked chapter: nothing to download.
    if (chapter.getUrl() == null) {
        return chapter.getContent();
    }

    // Build the request; a malformed URL previously left `request` null and caused an NPE below.
    WebRequest request;
    try {
        request = new WebRequest(new URL(chapter.getUrl()));
    } catch (MalformedURLException e2) {
        e2.printStackTrace();
        return chapter.getContent();
    }
    // Proxy and request headers.
    request.setProxyHost(ip);
    request.setProxyPort(port);
    request.setAdditionalHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    request.setAdditionalHeader("Accept-Encoding", "gzip, deflate");
    request.setAdditionalHeader("Accept-Language", "zh-CN,zh;q=0.9");
    request.setAdditionalHeader("Cache-Control", "max-age=0");
    request.setAdditionalHeader("Host", "www.jjwxc.net");
    request.setAdditionalHeader("Referer", novelUrl);
    request.setAdditionalHeader("User-Agent", USER_AGENT[randomNum]);

    // Silence HtmlUnit / HttpClient logging.
    LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
    java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
    java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

    String html;
    // try-with-resources closes the simulated browser (windows, JS threads) on every path.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // JavaScript must be enabled because the chapter text is loaded dynamically.
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setTimeout(5000);
        webClient.getCookieManager().setCookiesEnabled(true);
        // Replay the login cookies.
        for (String key : cookies.keySet()) {
            Cookie cookie = new Cookie(DOMAIN, key, cookies.get(key));
            webClient.getCookieManager().addCookie(cookie);
        }

        HtmlPage rootPage = webClient.getPage(request);
        // Give background JavaScript some time to run before serializing the DOM.
        webClient.waitForBackgroundJavaScript(500);
        html = rootPage.asXml();
    } catch (FailingHttpStatusCodeException | IOException e1) {
        System.out.println("文章页获取失败");
        e1.printStackTrace();
        // Previously execution continued and dereferenced a null page.
        return chapter.getContent();
    }

    Document doc = Jsoup.parse(html);
    org.jsoup.nodes.Element novelText = doc.getElementsByClass("noveltext").first();
    if (novelText == null) {
        // The page did not contain the chapter container; get(0) would have thrown here.
        System.out.println("文章页获取失败");
        return chapter.getContent();
    }
    // Strip all tags but keep the original line breaks (prettyPrint disabled).
    String content = Jsoup.clean(novelText.html(), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
    return stripHeaderAndFooter(content);
}

/**
 * Removes everything up to and including the first "查看收藏列表" marker, and removes the
 * last "插入书签" marker itself, keeping the text on both sides of it (the text after the
 * marker is presumably the author's note — confirm against a real page).
 *
 * <p>The content is returned unchanged when either marker is missing or when the head
 * marker ends after the start of the tail marker. The previous inline check compared
 * {@code indexOf(...) + length} with -1, which could never be true, so a missing head
 * marker silently chopped the first characters of the text or threw.
 */
private static String stripHeaderAndFooter(String content) {
    final String headMarker = "查看收藏列表";
    final String tailMarker = "插入书签";
    int headIndex = content.indexOf(headMarker);
    int tailIndex = content.lastIndexOf(tailMarker);
    if (headIndex == -1 || tailIndex == -1) {
        return content;
    }
    int beginIndex = headIndex + headMarker.length();
    if (beginIndex > tailIndex) {
        return content;
    }
    return content.substring(beginIndex, tailIndex).trim()
            + content.substring(tailIndex + tailMarker.length()).trim();
}