
Web Crawler in Practice: Scraping Data

1. Requirements Analysis

  • Crawl the data on the home page
  • Crawl the paginated data
  • Create a database and save the crawled data (the record entity is sketched right after this list)
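
The original post never shows the `Article` entity that holds one crawled record before it is written to the database. A minimal sketch, with field names taken from the setters used in `parseArticle` in the listing below (everything else here is an assumption), could look like this:

```java
package huxiuSpider;

/** One crawled article; the field names follow the setters used in parseArticle. */
public class Article {
    private String title;      // headline
    private String author;     // author name
    private String createTime; // publish time as shown on the page
    private String sc;         // favorite count
    private String pl;         // comment count
    private String zan;        // like count
    private String content;    // article body text

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getAuthor() { return author; }
    public void setAuthor(String author) { this.author = author; }
    public String getCreateTime() { return createTime; }
    public void setCreateTime(String createTime) { this.createTime = createTime; }
    public String getSc() { return sc; }
    public void setSc(String sc) { this.sc = sc; }
    public String getPl() { return pl; }
    public void setPl(String pl) { this.pl = pl; }
    public String getZan() { return zan; }
    public void setZan(String zan) { this.zan = zan; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }

    @Override
    public String toString() {
        return "Article [title=" + title + ", author=" + author + ", createTime=" + createTime + "]";
    }
}
```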

2. Create the Project

Create a Maven project and add the following dependencies to `pom.xml`:

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>

3. Code Development (Core)

```java
package huxiuSpider;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

/**
* First problem hit during development: the server returns 500 for a plain request,
* a simple anti-crawler measure (worked around by sending a browser User-Agent header).
*
* @author maoxiangyi
*
*/
public class HuxiuSpider {
private static ArticleDao articleDao = new ArticleDao();
private static ArrayBlockingQueue<String> urls = new ArrayBlockingQueue<String>(500);
private static ExecutorService pool = Executors.newFixedThreadPool(10);

public static void main(String[] args) throws Exception {
    // Start ten worker threads that take article URLs off the queue and parse the detail pages
    for (int i = 0; i < 10; i++) {
        pool.submit(new Runnable() {

            public void run() {
                while (true) {
                    try {
                        String url = urls.take();
                        // Fetch the page and get its HTML
                        String html = getHtmlByGet(url, getHeaders());
                        if (html != null) {
                            // Parse the HTML document into an Article object
                            Article article = parseArticle(html);
                            // Save it to the database
                            save2db(article);
                        }

                    } catch (InterruptedException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }

                }

            }
        });
    }

    // 1. Send a GET request to the front page with HttpClient
    HashMap<String, String> headers = getHeaders();
    String html = getHtmlByGet("https://www.huxiu.com/", headers);
    // 2. Parse the front page
    parseIndex(headers, html);
    // 3. Get the paging info (last_dateline) and crawl the following pages
    String last_dateline = getDateLineByIndexHtml(html);
    paging(headers, last_dateline);

}

private static void paging(HashMap<String, String> headers, String last_dateline) throws Exception {
    // Page 2
    HttpPost httpPost = getPost(headers, last_dateline, 2);
    String html = getHtmlByPost(httpPost);
    // Parse the JSON response
    HuxiuResponse res = JSON.parseObject(html, HuxiuResponse.class);
    Document doc = Jsoup.parse(res.getData());
    // Collect the detail-page URLs from the returned HTML fragment
    Elements alist = doc.select("a[class=transition]");
    for (Element element : alist) {
        urls.put("http://www.huxiu.com" + element.attr("href"));
        // html = getHtmlByGet("http://www.huxiu.com" +
        // element.attr("href"), headers);
        // Article article = parseArticle(html);
        // save2db(article);
    }

    // Pages 3 onwards: total_page in the response says how many pages there are,
    // and each response carries the last_dateline needed for the next request
    for (int page = 3; page <= res.getTotal_page(); page++) {
        HttpPost hp = getPost(headers, res.getLast_dateline(), page);
        html = getHtmlByPost(hp);
        res = JSON.parseObject(html, HuxiuResponse.class);
        doc = Jsoup.parse(res.getData());
        // Collect the detail-page URLs
        alist = doc.select("a[class=transition]");
        for (Element element : alist) {
            urls.put("http://www.huxiu.com" + element.attr("href"));
            // html = getHtmlByGet("http://www.huxiu.com" +
            // element.attr("href"), headers);
            // Article article = parseArticle(html);
            // save2db(article);
        }

        System.out.println("----------------------------- page done ------------------------");

    }
}

private static String getHtmlByPost(HttpPost httpPost) throws IOException, ClientProtocolException {
    CloseableHttpClient pagingHttpClient = HttpClients.createDefault();
    // Execute the POST and read the response body
    CloseableHttpResponse paginHtml = pagingHttpClient.execute(httpPost);
    return EntityUtils.toString(paginHtml.getEntity());
}

private static HttpPost getPost(HashMap<String, String> headers, String last_dateline, int page)
        throws UnsupportedEncodingException {
    String api = "https://www.huxiu.com/v2_action/article_list";
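    // Paging protocol, as inferred from this request and the HuxiuResponse mapping:
    // the form fields "page", "last_dateline" and "huxiu_hash_code" are POSTed to this
    // endpoint, and the JSON response carries an HTML fragment ("data"), the total
    // page count ("total_page") and the "last_dateline" to use for the next page.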
    HttpPost httpPost = new HttpPost(api);
    // Form parameters required by the paging endpoint
    ArrayList<BasicNameValuePair> paramList = new ArrayList<BasicNameValuePair>();
    paramList.add(new BasicNameValuePair("huxiu_hash_code", "a3bec0c023f9f2481ed8eeddf9c15225"));
    paramList.add(new BasicNameValuePair("page", page + ""));
    paramList.add(new BasicNameValuePair("last_dateline", last_dateline));
    httpPost.setEntity(new UrlEncodedFormEntity(paramList));
    // Attach the request headers
    for (Map.Entry<String, String> entry : headers.entrySet()) {
        httpPost.addHeader(entry.getKey(), entry.getValue());
    }
    return httpPost;
}

private static void parseIndex(HashMap<String, String> headers, String html) {
    if (html != null) {
        getArticleListByIndex(html);
        // // 2.1 Fetch the detail page for each URL found on the home page
        // for (String url : articleUrls) {
        // /**
        // * Performance note: the for loop runs sequentially; if articleUrls.size() == 200,
        // * crawling every page takes a long time. How can the crawl be sped up?
        // *
        // * Java basics: multithreading - several threads in one process raise throughput.
        // */
        // new ProcessPageInfo(url, headers).start();
        // }
    }
}

private static String getDateLineByIndexHtml(String html) {
    Document doc = Jsoup.parse(html);
    Elements eles = doc.select("div[data-last_dateline]");
    return eles.get(0).attr("data-last_dateline");
}

/**
 * Save the article to MySQL. This needs a connection URL, credentials and a pooled
 * DataSource; persistence can go through JdbcTemplate (or MyBatis).
 * 
 * @param article
 */
public static void save2db(Article article) {
    // Reuse a single DAO so the connection pool is not rebuilt on every save
    articleDao.save(article);

}

public static Article parseArticle(String html) {
    Article article = new Article();
    Document doc = Jsoup.parse(html);
    // Title
    Elements titles = doc.select("h1.t-h1");
    article.setTitle(titles.get(0).ownText());
    // Author
    Elements authors = doc.select("span.author-name");
    article.setAuthor(authors.get(0).text());
    // Publish time
    Elements times = doc.select("span[class=article-time pull-left]");
    article.setCreateTime(times.size() == 0 ? new Date().toString() : times.get(0).ownText());
    // Favorite count
    Elements scs = doc.select("span[class=article-share pull-left]");
    article.setSc(scs.size() == 0 ? "0" : scs.get(0).ownText().substring(2));
    // Comment count
    Elements pls = doc.select("span[class=article-pl pull-left]");
    article.setPl(pls.size() == 0 ? "0" : pls.get(0).ownText().substring(2));

    // Like count
    Elements zans = doc.select("span.num");
    article.setZan(zans.get(0).ownText());

    // Article body content
    Elements contents = doc.select("div.article-content-wrap");
    article.setContent(contents.text());

    System.out.println(article);
    return article;
}

private static void getArticleListByIndex(String html) {
    Document doc = Jsoup.parse(html);
    // Step 1: locate the article-list region on the page
    Elements articleContent = doc.select("div.mod-info-flow");
    Elements aTags = articleContent.select("a[class=transition]");
    for (Element element : aTags) {
        String href = element.attr("href");
        if (href.contains("article")) {
            // Step 2: collect the URL of each article detail page
            try {
                urls.put("https://www.huxiu.com" + href);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }
}

private static HashMap<String, String> getHeaders() {
    HashMap<String, String> headers = new HashMap<String, String>();
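    // A bare request gets a 500 back (the simple anti-crawler check noted in the
    // class comment); sending a browser User-Agent header is enough to get past it.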
    headers.put("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
    return headers;
}

/**
 * Fetch the content of a web page via an HTTP GET request.
 * 
 * @param url
 *            the URL of the resource to fetch
 * @param headers
 *            request headers
 * @return the response body when the status code is 200, otherwise null
 */
public static String getHtmlByGet(String url, Map<String, String> headers) {
    String html = null;
    try {
        CloseableHttpClient hc = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            httpGet.addHeader(entry.getKey(), entry.getValue());
        }

        CloseableHttpResponse response = hc.execute(httpGet);
        int code = response.getStatusLine().getStatusCode();
        if (200 == code) {
            html = EntityUtils.toString(response.getEntity());
        } else {
            System.out.println(code);
            System.out.println("Failed to request url: " + url);
        }
    } catch (Exception e) {
        System.out.println(url);
        System.out.println("Failed to access the network resource! " + e);
    }
    return html;
}

}
```
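
The listing also references `HuxiuResponse` (the class fastjson maps the paging JSON onto) and `ArticleDao`, neither of which appears in the original post. A minimal sketch, assuming a local MySQL database named `spider` with a table `huxiu_article` whose columns mirror the `Article` fields (database name, table, columns and credentials are all assumptions), could look like this:

```java
package huxiuSpider;

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/** Shape of the paging JSON; fastjson fills it via the setters and ignores extra keys. */
class HuxiuResponse {
    private String data;          // rendered HTML fragment containing the article links
    private int total_page;       // total number of pages
    private String last_dateline; // value to send with the next paging request

    public String getData() { return data; }
    public void setData(String data) { this.data = data; }
    public int getTotal_page() { return total_page; }
    public void setTotal_page(int total_page) { this.total_page = total_page; }
    public String getLast_dateline() { return last_dateline; }
    public void setLast_dateline(String last_dateline) { this.last_dateline = last_dateline; }
}

/** Persists articles through Spring JdbcTemplate on top of a single c3p0 pool. */
class ArticleDao {
    private final JdbcTemplate template;

    public ArticleDao() {
        // Build the pool once and share it; the connection details below are
        // assumptions for a local MySQL instance.
        ComboPooledDataSource ds = new ComboPooledDataSource();
        try {
            ds.setDriverClass("com.mysql.jdbc.Driver");
        } catch (java.beans.PropertyVetoException e) {
            throw new RuntimeException(e);
        }
        ds.setJdbcUrl("jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8");
        ds.setUser("root");
        ds.setPassword("root");
        template = new JdbcTemplate(ds);
    }

    public void save(Article article) {
        template.update(
                "INSERT INTO huxiu_article (title, author, create_time, zan, pl, sc, content) VALUES (?, ?, ?, ?, ?, ?, ?)",
                article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getZan(), article.getPl(), article.getSc(), article.getContent());
    }
}
```

In a real setup the table would be created up front and the credentials would come from configuration rather than being hard-coded.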
