Web Crawler in Practice: Scraping Data
1. Requirements Analysis
- Crawl the data on the home page
- Crawl the paginated data
- Create the database and save the data (the entity class used for saving is sketched below)
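The Article entity that the crawler fills in and saves is not shown in the original post. Below is a minimal sketch of what it could look like; the field names are assumptions inferred from the setters called later in parseArticle() (title, author, createTime, plus sc, pl and zan for the favorite, comment and like counts).

```java
package huxiuSpider;

/**
 * Article entity. A sketch only: the original post does not include this class,
 * and the field names follow the setters used in HuxiuSpider.parseArticle().
 */
public class Article {

    private String title;      // headline
    private String author;     // author name
    private String createTime; // publish time
    private String sc;         // favorite count
    private String pl;         // comment count
    private String zan;        // like count
    private String content;    // article body text

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getAuthor() { return author; }
    public void setAuthor(String author) { this.author = author; }
    public String getCreateTime() { return createTime; }
    public void setCreateTime(String createTime) { this.createTime = createTime; }
    public String getSc() { return sc; }
    public void setSc(String sc) { this.sc = sc; }
    public String getPl() { return pl; }
    public void setPl(String pl) { this.pl = pl; }
    public String getZan() { return zan; }
    public void setZan(String zan) { this.zan = zan; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }

    @Override
    public String toString() {
        return "Article [title=" + title + ", author=" + author + ", createTime=" + createTime
                + ", sc=" + sc + ", pl=" + pl + ", zan=" + zan + "]";
    }
}
```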
2. Create the Project
Create a Maven project and add the following dependencies to the pom.xml:
```xml
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.3</version>
</dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-jdbc</artifactId>
    <version>4.2.6.RELEASE</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.41</version>
</dependency>
<dependency>
    <groupId>c3p0</groupId>
    <artifactId>c3p0</artifactId>
    <version>0.9.1.2</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.31</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.1</version>
</dependency>
```
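The ArticleDao used by the spider is not shown in the post either. The sketch below is one way it could be wired with the c3p0 pool and Spring JdbcTemplate declared above; the JDBC url, the credentials and the huxiu_article table name are placeholders you would adapt to your own database.

```java
package huxiuSpider;

import java.beans.PropertyVetoException;

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * DAO sketch for saving articles. Assumes a local MySQL schema named `spider`
 * with a table `huxiu_article`; adjust url, credentials and columns as needed.
 */
public class ArticleDao {

    // One pooled DataSource for the whole crawl, so connections are not re-created per save
    private static final ComboPooledDataSource DATA_SOURCE = new ComboPooledDataSource();
    private static final JdbcTemplate TEMPLATE;

    static {
        try {
            DATA_SOURCE.setDriverClass("com.mysql.jdbc.Driver");
            DATA_SOURCE.setJdbcUrl("jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8");
            DATA_SOURCE.setUser("root");
            DATA_SOURCE.setPassword("root");
        } catch (PropertyVetoException e) {
            throw new RuntimeException(e);
        }
        TEMPLATE = new JdbcTemplate(DATA_SOURCE);
    }

    public void save(Article article) {
        TEMPLATE.update(
                "INSERT INTO huxiu_article (title, author, create_time, sc, pl, zan, content) VALUES (?, ?, ?, ?, ?, ?, ?)",
                article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getSc(), article.getPl(), article.getZan(), article.getContent());
    }
}
```

Keeping the DataSource in a static initializer matches the comment in save2db() about not recreating the connection pool on every call.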
3. Code Development (Core)
```java
package huxiuSpider;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

/**
 * First problem hit during development: the server returned 500, a simple
 * anti-crawling measure (hence the browser User-Agent header below).
 *
 * @author maoxiangyi
 */
public class HuxiuSpider {

    private static ArticleDao articleDao = new ArticleDao();
    private static ArrayBlockingQueue<String> urls = new ArrayBlockingQueue<String>(500);
    private static ExecutorService pool = Executors.newFixedThreadPool(10);
    public static void main(String[] args) throws Exception {
        // Start ten worker threads; each one parses news detail pages taken from the queue
        for (int i = 0; i < 10; i++) {
            pool.submit(new Runnable() {
                public void run() {
                    while (true) {
                        try {
                            String url = urls.take();
                            // Fetch the single page and get its html
                            String html = getHtmlByGet(url, getHeaders());
                            if (html != null) {
                                // Parse the html document into an Article object
                                Article article = parseArticle(html);
                                // Save it to the database
                                save2db(article);
                            }
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                }
            });
        }
        // 1. Send a GET request to the home page via httpclient
        HashMap<String, String> headers = getHeaders();
        String html = getHtmlByGet("https://www.huxiu.com/", headers);
        // 2. Parse the home page
        parseIndex(headers, html);
        // 3. Get the pagination info (the next pages)
        String last_dateline = getDateLineByIndexHtml(html);
        paging(headers, last_dateline);
    }
    private static void paging(HashMap<String, String> headers, String last_dateline)
            throws UnsupportedEncodingException, IOException, ClientProtocolException, Exception {
        // Page 2
        HttpPost httpPost = getPost(headers, last_dateline, 2);
        String html = getHtmlByPost(httpPost);
        // Parse the json response
        HuxiuResponse res = JSON.parseObject(html, HuxiuResponse.class);
        Document doc = Jsoup.parse(res.getData());
        // Queue the article urls of this page
        Elements alist = doc.select("a[class=transition]");
        for (Element element : alist) {
            urls.put("http://www.huxiu.com" + element.attr("href"));
        }
        // Pages 3, 4, 5, 6 ... up to the total page count; each response carries the next last_dateline
        for (int page = 3; page <= res.getTotal_page(); page++) {
            HttpPost hp = getPost(headers, res.getLast_dateline(), page);
            html = getHtmlByPost(hp);
            res = JSON.parseObject(html, HuxiuResponse.class);
            doc = Jsoup.parse(res.getData());
            // Queue the article urls of this page
            alist = doc.select("a[class=transition]");
            for (Element element : alist) {
                urls.put("http://www.huxiu.com" + element.attr("href"));
            }
            System.out.println("----------------------------- page done ------------------------");
        }
    }
    private static String getHtmlByPost(HttpPost httpPost) throws IOException, ClientProtocolException {
        CloseableHttpClient pagingHttpClient = HttpClients.createDefault();
        // Execute the request
        CloseableHttpResponse paginHtml = pagingHttpClient.execute(httpPost);
        return EntityUtils.toString(paginHtml.getEntity());
    }

    private static HttpPost getPost(HashMap<String, String> headers, String last_dateline, int page)
            throws UnsupportedEncodingException {
        String api = "https://www.huxiu.com/v2_action/article_list";
        HttpPost httpPost = new HttpPost(api);
        // Form parameters
        ArrayList<BasicNameValuePair> paramList = new ArrayList<BasicNameValuePair>();
        paramList.add(new BasicNameValuePair("huxiu_hash_code", "a3bec0c023f9f2481ed8eeddf9c15225"));
        paramList.add(new BasicNameValuePair("page", page + ""));
        paramList.add(new BasicNameValuePair("last_dateline", last_dateline));
        httpPost.setEntity(new UrlEncodedFormEntity(paramList));
        // Request headers
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            httpPost.addHeader(entry.getKey(), entry.getValue());
        }
        return httpPost;
    }
    private static void parseIndex(HashMap<String, String> headers, String html) {
        if (html != null) {
            getArticleListByIndex(html);
            // 2.1 Earlier version: fetch each detail page from the home page in a plain loop.
            // for (String url : articleUrls) {
            //     /*
            //      * Performance question: the for loop is sequential, so with roughly
            //      * articleUrls.size() == 200 pages the crawl takes a long time.
            //      * Java basics: running several threads in one process speeds it up,
            //      * which is why the work now goes through the worker-thread pool.
            //      */
            //     new ProcessPageInfo(url, headers).start();
            // }
        }
    }

    private static String getDateLineByIndexHtml(String html) {
        Document doc = Jsoup.parse(html);
        Elements eles = doc.select("div[data-last_dateline]");
        return eles.get(0).attr("data-last_dateline");
    }
    /**
     * Save a news article to the MySQL database. Connection details (url, user,
     * password) live in the DataSource; persistence can go through MyBatis or,
     * as here, a JdbcTemplate-based DAO.
     *
     * @param article
     */
    public static void save2db(Article article) {
        // Reuse one DAO (and one connection pool) instead of creating them on every call
        articleDao.save(article);
    }

    public static Article parseArticle(String html) {
        Article article = new Article();
        Document doc = Jsoup.parse(html);
        // Title
        Elements titles = doc.select("h1.t-h1");
        article.setTitle(titles.get(0).ownText());
        // Author
        Elements authors = doc.select("span.author-name");
        article.setAuthor(authors.get(0).text());
        // Publish time
        Elements times = doc.select("span[class=article-time pull-left]");
        article.setCreateTime(times.size() == 0 ? new Date().toString() : times.get(0).ownText());
        // Favorites
        Elements scs = doc.select("span[class=article-share pull-left]");
        article.setSc(scs.size() == 0 ? "0" : scs.get(0).ownText().substring(2));
        // Comments
        Elements pls = doc.select("span[class=article-pl pull-left]");
        article.setPl(pls.size() == 0 ? "0" : pls.get(0).ownText().substring(2));
        // Likes
        Elements zans = doc.select("span.num");
        article.setZan(zans.get(0).ownText());
        // Article body
        Elements contents = doc.select("div.article-content-wrap");
        article.setContent(contents.text());
        System.out.println(article);
        return article;
    }
    private static void getArticleListByIndex(String html) {
        Document doc = Jsoup.parse(html);
        // Step 1: locate the article-list area on the home page
        Elements articleContent = doc.select("div.mod-info-flow");
        Elements aTags = articleContent.select("a[class=transition]");
        for (Element element : aTags) {
            String href = element.attr("href");
            if (href.contains("article")) {
                // Step 2: queue the url of each news detail page
                try {
                    urls.put("https://www.huxiu.com" + href);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private static HashMap<String, String> getHeaders() {
        HashMap<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
        return headers;
    }
    /**
     * Fetch a page with an HTTP GET request.
     *
     * @param url
     *            address of the network resource
     * @param headers
     *            request headers
     * @return the page html on HTTP 200, otherwise null
     */
    public static String getHtmlByGet(String url, Map<String, String> headers) {
        String html = null;
        try {
            CloseableHttpClient hc = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(url);
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                httpGet.addHeader(entry.getKey(), entry.getValue());
            }
            CloseableHttpResponse response = hc.execute(httpGet);
            int code = response.getStatusLine().getStatusCode();
            if (200 == code) {
                html = EntityUtils.toString(response.getEntity());
            } else {
                System.out.println(code);
                System.out.println("Failed to request url: " + url);
            }
        } catch (Exception e) {
            System.out.println(url);
            System.out.println("Failed to access the network resource! " + e);
        }
        return html;
    }
}
```
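paging() maps the JSON returned by the article_list endpoint onto a HuxiuResponse bean with fastjson, but that class is not shown in the post. Here is a sketch containing only the three properties the crawler actually reads (data, total_page, last_dateline); any further fields of the real response can be added as needed.

```java
package huxiuSpider;

/**
 * Response of https://www.huxiu.com/v2_action/article_list. A sketch limited to
 * the properties read in HuxiuSpider.paging(); the real API may return more fields.
 */
public class HuxiuResponse {

    private String data;           // html fragment with the article links of this page
    private int total_page;        // total number of pages
    private String last_dateline;  // cursor used to request the next page

    public String getData() { return data; }
    public void setData(String data) { this.data = data; }
    public int getTotal_page() { return total_page; }
    public void setTotal_page(int total_page) { this.total_page = total_page; }
    public String getLast_dateline() { return last_dateline; }
    public void setLast_dateline(String last_dateline) { this.last_dateline = last_dateline; }
}
```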