Java Crawler Tutorial: Fetching novels from Qidian (起点中文网) and saving them as txt files
Goal: use HttpClient to fetch page data and Jsoup to parse it, extracting the data we want:
book title, author, synopsis, and the book contents (saved as a txt file).
Example code: https://github.com/yeahmahao/-ReptilesBook
HttpClient usage: https://blog.csdn.net/baidu_38688646/article/details/108883222
HttpClient connection pool: https://blog.csdn.net/baidu_38688646/article/details/108883458
Jsoup usage: https://blog.csdn.net/baidu_38688646/article/details/108883606
Project structure:
1. HttpUtils: a utility class that wraps HttpClient
package cn.project.jd.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Component
public class HttpUtils {

    // Connection pool manager
    private PoolingHttpClientConnectionManager cm;

    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum total number of connections (pool size)
        cm.setMaxTotal(100);
        // Maximum number of connections per route (host)
        cm.setDefaultMaxPerRoute(10);
    }

    /**
     * Fetch page data with a GET request.
     * @param url the page URL
     * @return the page HTML, or an empty string on failure
     */
    public String doGetHtml(String url) {
        // Build an HttpClient backed by the connection pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        // Create the HttpGet request for the given URL
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," +
                "image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpGet.setHeader("accept-encoding", "gzip, deflate, br");
        httpGet.setHeader("accept-language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("cookie", "");
        httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                "(KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
        // Apply the request configuration (timeouts)
        httpGet.setConfig(this.getConfig());
        CloseableHttpResponse response = null;
        try {
            // Execute the request and get the response
            response = httpClient.execute(httpGet);
            // Parse the response and return the result
            if (response.getStatusLine().getStatusCode() == 200) {
                // Only use EntityUtils if the response entity is not null
                if (response.getEntity() != null) {
                    return EntityUtils.toString(response.getEntity(), "utf8");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

    private RequestConfig getConfig() {
        RequestConfig rf = RequestConfig.custom()
                .setConnectTimeout(1000)            // max time to establish a connection
                .setConnectionRequestTimeout(500)   // max time to obtain a connection from the pool
                .setSocketTimeout(10000)            // max time for data transfer
                .build();
        return rf;
    }
}
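For a quick check of the utility class outside of Spring, it can also be constructed directly and pointed at the listing page. The demo class below is a minimal sketch for illustration only; it is not part of the original project:

package cn.project.jd.util;

// Hypothetical demo class, not part of the original project
public class HttpUtilsDemo {
    public static void main(String[] args) {
        HttpUtils hus = new HttpUtils();
        // doGetHtml returns the page HTML, or an empty string if the request failed
        String html = hus.doGetHtml("https://www.qidian.com/all");
        System.out.println(html.isEmpty() ? "request failed" : "fetched " + html.length() + " chars");
    }
}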
2. textTask: a scheduled task that fetches and parses the page data
Page to crawl: https://www.qidian.com/all
The task crawls all books on the first 2 pages of the suspense (悬疑) category. On Qidian the first page uses page=1 and the second page=2, so a for loop fetches the pages one by one. The example only grabs the first 2 pages, 20 books per page.
// How often to run the task (fixedDelay, in milliseconds)
@Scheduled(fixedDelay = 100000 * 1000)
public void itemTask() throws Exception {
    // Initial URL to parse (other example URLs for reference):
    // https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=1&s=1&click=0
    // https://www.qidian.com/all?chanId=21&action=1&orderId=&page=1&vip=0&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0  (completed fantasy)
    // https://www.qidian.com/all?chanId=22&action=1&orderId=&page=1&vip=0&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0  (completed xianxia)
    // String url = "https://www.qidian.com/all?chanId=21&subCateId=8";
    String url = "https://www.qidian.com/all?chanId=10&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=";
    for (int i = 1; i < 3; i++) {
        // hus is the HttpUtils bean injected into this class
        String html = hus.doGetHtml(url + i);
        // Parse the page data and store it
        this.parse(html);
    }
    System.out.println("数据抓取完成");
}
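The snippet above uses a field named hus that is never declared in the excerpt. Below is a minimal sketch of how the surrounding task class might be wired up; the class name, package, and the parse(...) stub are assumptions, not taken from the original project, and @EnableScheduling must be present on the Spring Boot application class for @Scheduled to fire.

package cn.project.jd.task;

import cn.project.jd.util.HttpUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

// Hypothetical sketch of the scheduled task class described in this section
@Component
public class TextTask {

    // The HttpUtils bean referred to as "hus" in the snippets above
    @Autowired
    private HttpUtils hus;

    @Scheduled(fixedDelay = 100000 * 1000)
    public void itemTask() throws Exception {
        // ...page fetching loop shown above...
    }

    private void parse(String html) throws Exception {
        // ...Jsoup parsing logic shown in steps 1) to 3) below...
    }
}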
1) Get the book title, author, and synopsis
Use F12 (the browser dev tools) to inspect the page structure.
Document doc = Jsoup.parse(html);
Elements els = doc.select("div.all-book-list > div > ul > li");
for (Element lis : els) {
    // Book title
    String bookName = lis.select(".book-mid-info > h4").text();
    System.out.println("书名:《" + bookName + "》");
    // Author
    String author = lis.select(".book-mid-info > .author > [data-eid=qd_B59]").text();
    System.out.println("作者:" + author);
    // Synopsis
    String briefIntroduction = lis.select(".book-mid-info > .intro").text();
    System.out.println("简介:" + briefIntroduction);
}
2) Follow the hyperlink on the book title to get the URL of the book's detail page, then use the utility class to fetch that page as a String
String href = lis.select(".book-mid-info > h4 > a").attr("href");
System.out.println("地址:https:" + href);
// Fetch the single book's detail page
String bookhtml = hus.doGetHtml("https:" + href);
// Parse it into a Document with Jsoup's DOM API
Document bookdoc = Jsoup.parse(bookhtml);
3) From the detail page, get each chapter and its content and write them to the file
Elements boks = bookdoc.select(".wrap > .book-detail-wrap > #j-catalogWrap > .volume-wrap > div");
// Build the output file name from the book title
String txtname = bookName + ".txt";
File file = new File("桌面\\悬疑\\" + txtname);
// Write everything into the txt file
PrintStream ps = new PrintStream(new FileOutputStream(file));
for (Element bok : boks) {
    Elements boklis = bok.select("ul > li");
    for (Element bokli : boklis) {
        // URL of the chapter page
        String bokredurlz = bokli.select("a").attr("href");
        // System.out.println("读书:https:" + bokredurlz);
        String bokhtml = hus.doGetHtml("https:" + bokredurlz);
        Document zdoc = Jsoup.parse(bokhtml);
        // Chapter title
        String zhangjie = zdoc.select(".wrap > .main-read-container > .read-main-wrap > #j_chapterBox > .text-wrap[data-purl] > .main-text-wrap > .text-head > h3").text();
        System.out.println(zhangjie);
        // Blank lines before each chapter heading
        ps.append("\n");
        ps.append("\n");
        ps.append("\n");
        ps.append(zhangjie);
        ps.append("\n");
        ps.append("\n");
        // Chapter content
        String neirong = zdoc.select(".wrap > .main-read-container > .read-main-wrap > #j_chapterBox > .text-wrap[data-purl] > .main-text-wrap > .read-content").text();
        // System.out.println(neirong);
        ps.append(neirong);
    }
}
ps.close();
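Two small refinements worth considering (not in the original code): close the stream with try-with-resources so it is released even if a request throws, and pause briefly between chapter requests so the crawler does not hammer the site. A hedged sketch, reusing the file, boks, and hus variables from above:

// Sketch only: same logic as above, but the stream is closed automatically
// and the crawler waits between chapter requests
try (PrintStream ps = new PrintStream(new FileOutputStream(file))) {
    for (Element bok : boks) {
        for (Element bokli : bok.select("ul > li")) {
            String chapterUrl = "https:" + bokli.select("a").attr("href");
            Document zdoc = Jsoup.parse(hus.doGetHtml(chapterUrl));
            String zhangjie = zdoc.select(".wrap > .main-read-container > .read-main-wrap > #j_chapterBox > .text-wrap[data-purl] > .main-text-wrap > .text-head > h3").text();
            String neirong = zdoc.select(".wrap > .main-read-container > .read-main-wrap > #j_chapterBox > .text-wrap[data-purl] > .main-text-wrap > .read-content").text();
            ps.append("\n\n\n").append(zhangjie).append("\n\n").append(neirong);
            try {
                Thread.sleep(500); // be polite to the server between requests
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
            }
        }
    }
}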