A comprehensive crawler case study

A complete demo that crawls articles from Huxiu.com (虎嗅网).

Create a Maven project

  • First, add the required dependencies to the pom.xml:
    <dependencies>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>

    </dependencies>

MySQL is used as the database.

  • Create a database named spider:

    create database spider;
    
  • The SQL for creating the table is as follows:

CREATE TABLE `huxiu_article` (
  `id` varchar(250) DEFAULT NULL,
  `title` varchar(250) DEFAULT NULL,
  `author` varchar(250) DEFAULT NULL,
  `createTime` varchar(250) DEFAULT NULL,
  `zan` varchar(250) DEFAULT NULL,
  `pl` varchar(250) DEFAULT NULL,
  `sc` varchar(250) DEFAULT NULL,
  `content` blob,
  `url` varchar(250) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

The complete code is shown below.

The entity class Article

public class Article {
    private String id;
    private String url;
    private String title;
    private String author;
    private String createTime;
    private String pl;
    private String zan;
    private String sc;
    private String content;

    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    // ... getters and setters for the remaining fields omitted for brevity ...

}

The ArticleDao class for database access

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 data source (it can be set up 1. via a config file or 2. in code; here we use code)
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // 1. jdbc url
        // 2. driver
        // 3. username & password
        dataSource.setUser("root");
        dataSource.setPassword("root");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO `spider`.`huxiu_article` (`id`, `title`, `author`, `createTime`, `zan`, `pl`, `sc`, `content`, `url` ) VALUES( ?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(),article.getTitle(),article.getAuthor(),article.getCreateTime(),article.getZan(),article.getPl(),article.getSc(),article.getContent(),article.getUrl());
    }
}
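
Because ArticleDao extends Spring's JdbcTemplate and wires up its own C3P0 data source in the constructor, it can be used directly without any Spring XML configuration. A minimal usage sketch (ArticleDaoDemo and all field values below are made-up placeholders, not part of the original project):

public class ArticleDaoDemo {
    public static void main(String[] args) {
        ArticleDao articleDao = new ArticleDao();

        // Hypothetical sample values, only to illustrate the save() call
        Article article = new Article();
        article.setId("123456");
        article.setTitle("Sample title");
        article.setAuthor("Sample author");
        article.setCreateTime("2017-09-01");
        article.setZan("10");
        article.setPl("5");
        article.setSc("3");
        article.setContent("Sample content");
        article.setUrl("http://www.huxiu.com/article/123456.html");

        // Executes the INSERT statement defined in ArticleDao.save()
        articleDao.save(article);
    }
}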

The response entity class HuxiuPagingResponse

public class HuxiuPagingResponse {

    private String data;
    private String last_dateline;
    private String msg;
    private String result;
    private String total_page;
    public String getData() {
        return data;
    }
    public void setData(String data) {
        this.data = data;
    }
    // ... getters and setters for the remaining fields omitted for brevity ...

}
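
The paging endpoint returns JSON whose data field carries an html fragment and whose last_dateline field drives the next page request. A minimal sketch of how such a response maps onto this class with Gson (the JSON literal below is a fabricated example that only mimics the shape of the response, not real Huxiu data):

import com.google.gson.Gson;

public class PagingResponseDemo {
    public static void main(String[] args) {
        // Fabricated JSON with the same field names as the paging response
        String jsonText = "{\"result\":\"1\",\"msg\":\"\",\"total_page\":\"1615\","
                + "\"last_dateline\":\"1504224000\","
                + "\"data\":\"<div data-aid=\\\"123456\\\">...</div>\"}";

        Gson gson = new Gson();
        HuxiuPagingResponse response = gson.fromJson(jsonText, HuxiuPagingResponse.class);

        // last_dateline is fed into the next paging request
        System.out.println(response.getLast_dateline());
        // data holds an html fragment that is later parsed with Jsoup for data-aid values
        System.out.println(response.getData());
    }
}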

The program's main entry point HuXiuSpider

public class HuXiuSpider {
    // DAO used to save parsed articles
    public static ArticleDao articleDao = new ArticleDao();
    // dateLine is carried between paging requests
    private static String dateLine = null;
    // Fixed-size thread pool for downloading, parsing and storing
    private static ExecutorService threadPool = Executors.newFixedThreadPool(30);
    // Queue of article ids (aid) parsed from the index page and the paging responses
    public static ArrayBlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(1000);
    // Queue of raw html documents, one per downloaded article
    public static ArrayBlockingQueue<String> articleHtmlQueue = new ArrayBlockingQueue<String>(1000);
    // Queue of parsed Article objects waiting to be saved
    public static ArrayBlockingQueue<Article> articleContentQueue = new ArrayBlockingQueue<Article>(1000);

    public static void main(String[] args) throws Exception {
        // Worker threads that download the page of each article id taken from urlQueue
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessSinglePageRunnable());
        }
        // Worker threads that parse the downloaded pages
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ParseHtmlRunnable());
        }
        // Worker thread that saves parsed articles to the database
        threadPool.execute(new SaveDBRunnable());
        // Collect the article ids from the index page
        getIndexArticleUrlList();
        // Walk through the paging API
        processPaging();
    }
    /**
     * Fetch the article list from the index page.
     * 
     * @throws IOException
     * @throws ClientProtocolException
     */
    private static void getIndexArticleUrlList() throws IOException, ClientProtocolException {
        // 1. The index page url: http://www.huxiu.com
        String indexUrl = "http://www.huxiu.com";
        // 2. Build an HttpGet request
        HttpGet indexHttpGet = new HttpGet(indexUrl);
        // Set the User-Agent header
        indexHttpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

        String html = getHtmlByRequest(indexHttpGet);

        // 5. Parse the html with Jsoup to get the article list and each article's aid
        Document indexDocument = Jsoup.parse(html);

        // Extract the last_dateline value needed by the paging requests
        Elements dateLines = indexDocument.select("[data-last_dateline]");
        dateLine = dateLines.get(0).attr("data-last_dateline");

        // 5.1 Select the divs that carry a data-aid attribute
        Elements aidElements = indexDocument.select("div[data-aid]");
        // 5.2 Collect the aid of each article
        for (Element element : aidElements) {
            String aid = element.attr("data-aid");
            try {
                urlQueue.put(aid);
            } catch (InterruptedException e) {
                System.out.println("Failed to put aid into urlQueue: " + e);
            }
        }
    }

    private static void processPaging() {
        for (int page = 2; page <= 1615; page++) {
            try {
                // The paging endpoint
                String pagingUrl = "https://www.huxiu.com/v2_action/article_list";
                HttpPost httpPost = new HttpPost(pagingUrl);
                // Set the form parameters
                ArrayList<NameValuePair> arrayList = new ArrayList<NameValuePair>();
                arrayList.add(new BasicNameValuePair("huxiu_hash_code", "fb7f7403c58c3e8cb45aa47afc204c10"));
                arrayList.add(new BasicNameValuePair("page", page + ""));
                arrayList.add(new BasicNameValuePair("last_dateline", dateLine));
                httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
                // Execute the request
                String jsonText = getHtmlByRequest(httpPost);
                // Deserialize the json into an object
                Gson gson = new Gson();
                HuxiuPagingResponse huxiuPagingResponse = gson.fromJson(jsonText, HuxiuPagingResponse.class);
                // Every response carries a new last_dateline for the next request
                dateLine = huxiuPagingResponse.getLast_dateline();
                // The data field holds an html fragment
                String htmlData = huxiuPagingResponse.getData();

                Document doc = Jsoup.parse(htmlData);
                // Select the divs that carry a data-aid attribute
                Elements aidElements = doc.select("div[data-aid]");
                // Collect the aid of each article
                for (Element element : aidElements) {
                    String aid = element.attr("data-aid");
                    urlQueue.put(aid);
                }
            } catch (Exception e) {
                // log.error()
                System.out.println(page);
                System.out.println(e);
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

        }
    }

    /**
     * Fetch an html document for the given url.
     * @throws IOException
     * @throws ClientProtocolException
     */
    public static String getHtml(String aidUrl) throws IOException, ClientProtocolException {
        // Build an HttpGet request for the article url
        HttpGet indexHttpGet = new HttpGet(aidUrl);
        return getHtmlByRequest(indexHttpGet);
    }
    private static String getHtmlByRequest(HttpRequestBase request) throws IOException, ClientProtocolException {
        // Set the User-Agent request header
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

        // 3. Execute the request with HttpClient to obtain an entity;
        //    close both the client and the response so connections are not leaked
        String html = null;
        try (CloseableHttpClient indexHttpClient = HttpClients.createDefault();
                CloseableHttpResponse indexResponse = indexHttpClient.execute(request)) {
            if (200 == indexResponse.getStatusLine().getStatusCode()) {
                HttpEntity indexEntity = indexResponse.getEntity();
                // 4. Convert the entity into a string (the html)
                html = EntityUtils.toString(indexEntity, Charset.forName("utf-8"));
            }
        }
        return html;
    }
}
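
Note that getHtmlByRequest builds a new CloseableHttpClient for every call, which is simple but wasteful when thousands of articles are downloaded. A minimal alternative sketch, assuming the same HttpClient 4.5.x API, that shares one client across requests (SharedClientFetcher is an illustrative class, not part of the original project):

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class SharedClientFetcher {
    // One client, backed by a connection pool, reused for every request
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    public static String fetch(HttpRequestBase request) throws IOException {
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // Only the response is closed; the shared client stays open for reuse
        try (CloseableHttpResponse response = CLIENT.execute(request)) {
            if (200 == response.getStatusLine().getStatusCode()) {
                return EntityUtils.toString(response.getEntity(), Charset.forName("utf-8"));
            }
            return null;
        }
    }
}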

ProcessSinglePageRunnable: sends a network request for each article url

public class ProcessSinglePageRunnable  implements Runnable {

    public void run() {
        while (true) {
            try {
                processSingleUrl();
                Thread.sleep(3000);
            } catch (InterruptedException e) {
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        String aid = HuXiuSpider.urlQueue.take();
        String aidUrl = "http://www.huxiu.com/article/" + aid + ".html";
        try {
            /*Article article = new Article();
            article.setId(aid);*/
            // Fetch the html of this single article page
            String aidHtml = HuXiuSpider.getHtml(aidUrl);
            HuXiuSpider.articleHtmlQueue.put(aidHtml);
        } catch (Exception e) {
            System.out.println(aidUrl);
            System.out.println(e);
        }
    }
}

ParseHtmlRunnable: parses each article page

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class ParseHtmlRunnable implements Runnable {

    public void run() {
        while (true) {
            String html = null;
            try {
                html = HuXiuSpider.articleHtmlQueue.take();
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
            // Guard against an interrupted take() leaving html as null
            if (html == null) {
                continue;
            }
            Article article = new Article();
            Document detailDocument = Jsoup.parse(html);

            // Select the div carrying the data-aid attribute
            Elements aidElements = detailDocument.select("div[data-aid]");
            String aid=aidElements.get(0).attr("data-aid");
            article.setId(aid);
            System.out.println(aid+".........");

            // Parse the article title (.t-h1)
            Elements titles = detailDocument.select(".t-h1");
            String title = titles.get(0).text();
            article.setTitle(title);

            // Parse the author (.author-name)
            Elements names = detailDocument.select(".author-name");
            String name = names.get(0).text();
            article.setAuthor(name);
            // Parse the publish time
            Elements dates = detailDocument.select("[class^=article-time]");
            String date = dates.get(0).text();
            article.setCreateTime(date);
            // Parse the favorites count
            Elements shares = detailDocument.select("[class^=article-share]");
            String share = shares.get(0).text();
            article.setSc(share);
            // Parse the comment count
            Elements pls = detailDocument.select("[class^=article-pl]");
            String pl = pls.get(0).text();
            article.setPl(pl);
            // Parse the like count (.num)
            Elements nums = detailDocument.select(".num");
            String num = nums.get(0).text();
            article.setZan(num);
            // Parse the article body text (.article-content-wrap)
            Elements content = detailDocument.select(".article-content-wrap p");
            String contentText = content.text();
            article.setContent(contentText);
            // Rebuild the article url from the aid (the original left this line commented out, so the url column stayed empty)
            article.setUrl("http://www.huxiu.com/article/" + aid + ".html");
            try {
                HuXiuSpider.articleContentQueue.put(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

}
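
The selectors above rely on Jsoup's attribute syntax: div[data-aid] matches divs that carry a data-aid attribute, and [class^=article-share] matches any element whose class attribute starts with article-share. A small self-contained sketch against a fabricated html fragment (SelectorDemo and the markup below only mimic the structure ParseHtmlRunnable expects; they are not real Huxiu html):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SelectorDemo {
    public static void main(String[] args) {
        // Fabricated fragment with the same class and data attributes used by the parser
        String html = "<div data-aid='123456'>"
                + "<h1 class='t-h1'>Sample title</h1>"
                + "<span class='author-name'>Sample author</span>"
                + "<span class='article-time pull-left'>2017-09-01 10:00</span>"
                + "<span class='article-share pull-left'>Favorites 3</span>"
                + "<span class='article-pl pull-left'>Comments 5</span>"
                + "<span class='num'>10</span>"
                + "<div class='article-content-wrap'><p>Body text</p></div>"
                + "</div>";

        Document doc = Jsoup.parse(html);
        // Attribute selector: any div with a data-aid attribute
        System.out.println(doc.select("div[data-aid]").get(0).attr("data-aid"));  // 123456
        // Attribute-starts-with selector: class begins with "article-share"
        System.out.println(doc.select("[class^=article-share]").get(0).text());   // Favorites 3
        // Descendant selector: paragraphs inside the content wrapper
        System.out.println(doc.select(".article-content-wrap p").text());         // Body text
    }
}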

SaveDBRunnable: saves the data to the database

public class SaveDBRunnable implements Runnable {
    public void run() {
        while (true) {
            try {
                Article article = HuXiuSpider.articleContentQueue.take();
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}