
Web Crawler in Practice: Scraping Data

1. Requirements Analysis

  • Crawl the data on the home page
  • Crawl the paginated data
  • Create a database and save the crawled data (the record entity is sketched right after this list)
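
The original post never shows the `Article` entity that holds one crawled record before it is written to the database. A minimal sketch, with field names taken from the setters used in `parseArticle` in the listing below (everything else here is an assumption), could look like this:

```java
package huxiuSpider;

/** One crawled article; the field names follow the setters used in parseArticle. */
public class Article {
    private String title;      // headline
    private String author;     // author name
    private String createTime; // publish time as shown on the page
    private String sc;         // favorite count
    private String pl;         // comment count
    private String zan;        // like count
    private String content;    // article body text

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getAuthor() { return author; }
    public void setAuthor(String author) { this.author = author; }
    public String getCreateTime() { return createTime; }
    public void setCreateTime(String createTime) { this.createTime = createTime; }
    public String getSc() { return sc; }
    public void setSc(String sc) { this.sc = sc; }
    public String getPl() { return pl; }
    public void setPl(String pl) { this.pl = pl; }
    public String getZan() { return zan; }
    public void setZan(String zan) { this.zan = zan; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }

    @Override
    public String toString() {
        return "Article [title=" + title + ", author=" + author + ", createTime=" + createTime + "]";
    }
}
```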

2. Create the Project

Create a Maven project and add the following dependencies to `pom.xml`:

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>

3. Code Development (Core)

```java
package huxiuSpider;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

/**
* First problem hit during development: the server returns 500 for a plain request,
* a simple anti-crawler measure (worked around by sending a browser User-Agent header).
*
* @author maoxiangyi
*
*/
public class HuxiuSpider {
private static ArticleDao articleDao = new ArticleDao();
private static ArrayBlockingQueue<String> urls = new ArrayBlockingQueue<String>(500);
private static ExecutorService pool = Executors.newFixedThreadPool(10);

public static void main(String[] args) throws Exception {
    // Start ten worker threads that take article URLs off the queue and parse the detail pages
    for (int i = 0; i < 10; i++) {
        pool.submit(new Runnable() {

            public void run() {
                while (true) {
                    try {
                        String url = urls.take();
                        // Fetch the page and get its HTML
                        String html = getHtmlByGet(url, getHeaders());
                        if (html != null) {
                            // Parse the HTML document into an Article object
                            Article article = parseArticle(html);
                            // Save it to the database
                            save2db(article);
                        }

                    } catch (InterruptedException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }

                }

            }
        });
    }

    // 1. Send a GET request to the front page with HttpClient
    HashMap<String, String> headers = getHeaders();
    String html = getHtmlByGet("https://www.huxiu.com/", headers);
    // 2. Parse the front page
    parseIndex(headers, html);
    // 3. Get the paging info (last_dateline) and crawl the following pages
    String last_dateline = getDateLineByIndexHtml(html);
    paging(headers, last_dateline);

}

private static void paging(HashMap<String, String> headers, String last_dateline) throws Exception {
    // Page 2
    HttpPost httpPost = getPost(headers, last_dateline, 2);
    String html = getHtmlByPost(httpPost);
    // Parse the JSON response
    HuxiuResponse res = JSON.parseObject(html, HuxiuResponse.class);
    Document doc = Jsoup.parse(res.getData());
    // Collect the detail-page URLs from the returned HTML fragment
    Elements alist = doc.select("a[class=transition]");
    for (Element element : alist) {
        urls.put("http://www.huxiu.com" + element.attr("href"));
        // html = getHtmlByGet("http://www.huxiu.com" +
        // element.attr("href"), headers);
        // Article article = parseArticle(html);
        // save2db(article);
    }

    // Pages 3 onwards: total_page in the response says how many pages there are,
    // and each response carries the last_dateline needed for the next request
    for (int page = 3; page <= res.getTotal_page(); page++) {
        HttpPost hp = getPost(headers, res.getLast_dateline(), page);
        html = getHtmlByPost(hp);
        res = JSON.parseObject(html, HuxiuResponse.class);
        doc = Jsoup.parse(res.getData());
        // Collect the detail-page URLs
        alist = doc.select("a[class=transition]");
        for (Element element : alist) {
            urls.put("http://www.huxiu.com" + element.attr("href"));
            // html = getHtmlByGet("http://www.huxiu.com" +
            // element.attr("href"), headers);
            // Article article = parseArticle(html);
            // save2db(article);
        }

        System.out.println("----------------------------- page done ------------------------");

    }
}

private static String getHtmlByPost(HttpPost httpPost) throws IOException, ClientProtocolException {
    CloseableHttpClient pagingHttpClient = HttpClients.createDefault();
    // Execute the POST and read the response body
    CloseableHttpResponse paginHtml = pagingHttpClient.execute(httpPost);
    return EntityUtils.toString(paginHtml.getEntity());
}

private static HttpPost getPost(HashMap<String, String> headers, String last_dateline, int page)
        throws UnsupportedEncodingException {
    String api = "https://www.huxiu.com/v2_action/article_list";
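    // Paging protocol, as inferred from this request and the HuxiuResponse mapping:
    // the form fields "page", "last_dateline" and "huxiu_hash_code" are POSTed to this
    // endpoint, and the JSON response carries an HTML fragment ("data"), the total
    // page count ("total_page") and the "last_dateline" to use for the next page.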
    HttpPost httpPost = new HttpPost(api);
    // Form parameters required by the paging endpoint
    ArrayList<BasicNameValuePair> paramList = new ArrayList<BasicNameValuePair>();
    paramList.add(new BasicNameValuePair("huxiu_hash_code", "a3bec0c023f9f2481ed8eeddf9c15225"));
    paramList.add(new BasicNameValuePair("page", page + ""));
    paramList.add(new BasicNameValuePair("last_dateline", last_dateline));
    httpPost.setEntity(new UrlEncodedFormEntity(paramList));
    // Attach the request headers
    for (Map.Entry<String, String> entry : headers.entrySet()) {
        httpPost.addHeader(entry.getKey(), entry.getValue());
    }
    return httpPost;
}

private static void parseIndex(HashMap<String, String> headers, String html) {
    if (html != null) {
        getArticleListByIndex(html);
        // // 2.1 Fetch the detail page for each URL found on the home page
        // for (String url : articleUrls) {
        // /**
        // * Performance note: the for loop runs sequentially; if articleUrls.size() == 200,
        // * crawling every page takes a long time. How can the crawl be sped up?
        // *
        // * Java basics: multithreading - several threads in one process raise throughput.
        // */
        // new ProcessPageInfo(url, headers).start();
        // }
    }
}

private static String getDateLineByIndexHtml(String html) {
    Document doc = Jsoup.parse(html);
    Elements eles = doc.select("div[data-last_dateline]");
    return eles.get(0).attr("data-last_dateline");
}

/**
 * Save the article to MySQL. This needs a connection URL, credentials and a pooled
 * DataSource; persistence can go through JdbcTemplate (or MyBatis).
 * 
 * @param article
 */
public static void save2db(Article article) {
    // Reuse a single DAO so the connection pool is not rebuilt on every save
    articleDao.save(article);

}

public static Article parseArticle(String html) {
    Article article = new Article();
    Document doc = Jsoup.parse(html);
    // Title
    Elements titles = doc.select("h1.t-h1");
    article.setTitle(titles.get(0).ownText());
    // Author
    Elements authors = doc.select("span.author-name");
    article.setAuthor(authors.get(0).text());
    // Publish time
    Elements times = doc.select("span[class=article-time pull-left]");
    article.setCreateTime(times.size() == 0 ? new Date().toString() : times.get(0).ownText());
    // Favorite count
    Elements scs = doc.select("span[class=article-share pull-left]");
    article.setSc(scs.size() == 0 ? "0" : scs.get(0).ownText().substring(2));
    // Comment count
    Elements pls = doc.select("span[class=article-pl pull-left]");
    article.setPl(pls.size() == 0 ? "0" : pls.get(0).ownText().substring(2));

    // Like count
    Elements zans = doc.select("span.num");
    article.setZan(zans.get(0).ownText());

    // Article body content
    Elements contents = doc.select("div.article-content-wrap");
    article.setContent(contents.text());

    System.out.println(article);
    return article;
}

private static void getArticleListByIndex(String html) {
    Document doc = Jsoup.parse(html);
    // Step 1: locate the article-list region on the page
    Elements articleContent = doc.select("div.mod-info-flow");
    Elements aTags = articleContent.select("a[class=transition]");
    for (Element element : aTags) {
        String href = element.attr("href");
        if (href.contains("article")) {
            // Step 2: collect the URL of each article detail page
            try {
                urls.put("https://www.huxiu.com" + href);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }
}

private static HashMap<String, String> getHeaders() {
    HashMap<String, String> headers = new HashMap<String, String>();
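    // A bare request gets a 500 back (the simple anti-crawler check noted in the
    // class comment); sending a browser User-Agent header is enough to get past it.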
    headers.put("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
    return headers;
}

/**
 * Fetch the content of a web page via an HTTP GET request.
 * 
 * @param url
 *            the URL of the resource to fetch
 * @param headers
 *            request headers
 * @return the response body when the status code is 200, otherwise null
 */
public static String getHtmlByGet(String url, Map<String, String> headers) {
    String html = null;
    try {
        CloseableHttpClient hc = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            httpGet.addHeader(entry.getKey(), entry.getValue());
        }

        CloseableHttpResponse response = hc.execute(httpGet);
        int code = response.getStatusLine().getStatusCode();
        if (200 == code) {
            html = EntityUtils.toString(response.getEntity());
        } else {
            System.out.println(code);
            System.out.println("Failed to request url: " + url);
        }
    } catch (Exception e) {
        System.out.println(url);
        System.out.println("Failed to access the network resource! " + e);
    }
    return html;
}

}
```
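
The listing also references `HuxiuResponse` (the class fastjson maps the paging JSON onto) and `ArticleDao`, neither of which appears in the original post. A minimal sketch, assuming a local MySQL database named `spider` with a table `huxiu_article` whose columns mirror the `Article` fields (database name, table, columns and credentials are all assumptions), could look like this:

```java
package huxiuSpider;

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/** Shape of the paging JSON; fastjson fills it via the setters and ignores extra keys. */
class HuxiuResponse {
    private String data;          // rendered HTML fragment containing the article links
    private int total_page;       // total number of pages
    private String last_dateline; // value to send with the next paging request

    public String getData() { return data; }
    public void setData(String data) { this.data = data; }
    public int getTotal_page() { return total_page; }
    public void setTotal_page(int total_page) { this.total_page = total_page; }
    public String getLast_dateline() { return last_dateline; }
    public void setLast_dateline(String last_dateline) { this.last_dateline = last_dateline; }
}

/** Persists articles through Spring JdbcTemplate on top of a single c3p0 pool. */
class ArticleDao {
    private final JdbcTemplate template;

    public ArticleDao() {
        // Build the pool once and share it; the connection details below are
        // assumptions for a local MySQL instance.
        ComboPooledDataSource ds = new ComboPooledDataSource();
        try {
            ds.setDriverClass("com.mysql.jdbc.Driver");
        } catch (java.beans.PropertyVetoException e) {
            throw new RuntimeException(e);
        }
        ds.setJdbcUrl("jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8");
        ds.setUser("root");
        ds.setPassword("root");
        template = new JdbcTemplate(ds);
    }

    public void save(Article article) {
        template.update(
                "INSERT INTO huxiu_article (title, author, create_time, zan, pl, sc, content) VALUES (?, ?, ?, ?, ?, ?, ?)",
                article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getZan(), article.getPl(), article.getSc(), article.getContent());
    }
}
```

In a real setup the table would be created up front and the credentials would come from configuration rather than being hard-coded.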
