A comprehensive crawler case study

A complete demo that crawls articles from Huxiu.com (虎嗅网).

Create a Maven project

  • First, add the required dependencies to the pom.xml:
    <dependencies>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>

    </dependencies>

MySQL is used as the database.

  • Create a database named spider:

    create database spider;
    
  • The SQL for creating the table is as follows:

CREATE TABLE `huxiu_article` (
  `id` varchar(250) DEFAULT NULL,
  `title` varchar(250) DEFAULT NULL,
  `author` varchar(250) DEFAULT NULL,
  `createTime` varchar(250) DEFAULT NULL,
  `zan` varchar(250) DEFAULT NULL,
  `pl` varchar(250) DEFAULT NULL,
  `sc` varchar(250) DEFAULT NULL,
  `content` blob,
  `url` varchar(250) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

The complete code is shown below.

The entity class Article

public class Article {
    private String id;
    private String url;
    private String title;
    private String author;
    private String createTime;
    private String pl;
    private String zan;
    private String sc;
    private String content;

    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    // ... getters and setters for the remaining fields omitted for brevity ...

}

The ArticleDao class for database access

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 data source (it can be set up 1. via a config file or 2. in code; here we use code)
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // 1. jdbc url
        // 2. driver
        // 3. username & password
        dataSource.setUser("root");
        dataSource.setPassword("root");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO `spider`.`huxiu_article` (`id`, `title`, `author`, `createTime`, `zan`, `pl`, `sc`, `content`, `url` ) VALUES( ?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(),article.getTitle(),article.getAuthor(),article.getCreateTime(),article.getZan(),article.getPl(),article.getSc(),article.getContent(),article.getUrl());
    }
}
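
Because ArticleDao extends Spring's JdbcTemplate and wires up its own C3P0 data source in the constructor, it can be used directly without any Spring XML configuration. A minimal usage sketch (ArticleDaoDemo and all field values below are made-up placeholders, not part of the original project):

public class ArticleDaoDemo {
    public static void main(String[] args) {
        ArticleDao articleDao = new ArticleDao();

        // Hypothetical sample values, only to illustrate the save() call
        Article article = new Article();
        article.setId("123456");
        article.setTitle("Sample title");
        article.setAuthor("Sample author");
        article.setCreateTime("2017-09-01");
        article.setZan("10");
        article.setPl("5");
        article.setSc("3");
        article.setContent("Sample content");
        article.setUrl("http://www.huxiu.com/article/123456.html");

        // Executes the INSERT statement defined in ArticleDao.save()
        articleDao.save(article);
    }
}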

The response entity class HuxiuPagingResponse

public class HuxiuPagingResponse {

    private String data;
    private String last_dateline;
    private String msg;
    private String result;
    private String total_page;
    public String getData() {
        return data;
    }
    public void setData(String data) {
        this.data = data;
    }
    // ... getters and setters for the remaining fields omitted for brevity ...

}
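
The paging endpoint returns JSON whose data field carries an html fragment and whose last_dateline field drives the next page request. A minimal sketch of how such a response maps onto this class with Gson (the JSON literal below is a fabricated example that only mimics the shape of the response, not real Huxiu data):

import com.google.gson.Gson;

public class PagingResponseDemo {
    public static void main(String[] args) {
        // Fabricated JSON with the same field names as the paging response
        String jsonText = "{\"result\":\"1\",\"msg\":\"\",\"total_page\":\"1615\","
                + "\"last_dateline\":\"1504224000\","
                + "\"data\":\"<div data-aid=\\\"123456\\\">...</div>\"}";

        Gson gson = new Gson();
        HuxiuPagingResponse response = gson.fromJson(jsonText, HuxiuPagingResponse.class);

        // last_dateline is fed into the next paging request
        System.out.println(response.getLast_dateline());
        // data holds an html fragment that is later parsed with Jsoup for data-aid values
        System.out.println(response.getData());
    }
}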

The program's main entry point HuXiuSpider

public class HuXiuSpider {
    // DAO used to save parsed articles
    public static ArticleDao articleDao = new ArticleDao();
    // dateLine is carried between paging requests
    private static String dateLine = null;
    // Fixed-size thread pool for downloading, parsing and storing
    private static ExecutorService threadPool = Executors.newFixedThreadPool(30);
    // Queue of article ids (aid) parsed from the index page and the paging responses
    public static ArrayBlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(1000);
    // Queue of raw html documents, one per downloaded article
    public static ArrayBlockingQueue<String> articleHtmlQueue = new ArrayBlockingQueue<String>(1000);
    // Queue of parsed Article objects waiting to be saved
    public static ArrayBlockingQueue<Article> articleContentQueue = new ArrayBlockingQueue<Article>(1000);

    public static void main(String[] args) throws Exception {
        // Worker threads that download the page of each article id taken from urlQueue
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessSinglePageRunnable());
        }
        // Worker threads that parse the downloaded pages
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ParseHtmlRunnable());
        }
        // Worker thread that saves parsed articles to the database
        threadPool.execute(new SaveDBRunnable());
        // Collect the article ids from the index page
        getIndexArticleUrlList();
        // Walk through the paging API
        processPaging();
    }
    /**
     * Fetch the article list from the index page.
     * 
     * @throws IOException
     * @throws ClientProtocolException
     */
    private static void getIndexArticleUrlList() throws IOException, ClientProtocolException {
        // 1. The index page url: http://www.huxiu.com
        String indexUrl = "http://www.huxiu.com";
        // 2. Build an HttpGet request
        HttpGet indexHttpGet = new HttpGet(indexUrl);
        // Set the User-Agent header
        indexHttpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

        String html = getHtmlByRequest(indexHttpGet);

        // 5. Parse the html with Jsoup to get the article list and each article's aid
        Document indexDocument = Jsoup.parse(html);

        // Extract the last_dateline value needed by the paging requests
        Elements dateLines = indexDocument.select("[data-last_dateline]");
        dateLine = dateLines.get(0).attr("data-last_dateline");

        // 5.1 Select the divs that carry a data-aid attribute
        Elements aidElements = indexDocument.select("div[data-aid]");
        // 5.2 Collect the aid of each article
        for (Element element : aidElements) {
            String aid = element.attr("data-aid");
            try {
                urlQueue.put(aid);
            } catch (InterruptedException e) {
                System.out.println("Failed to put aid into urlQueue: " + e);
            }
        }
    }

    private static void processPaging() {
        for (int page = 2; page <= 1615; page++) {
            try {
                // The paging endpoint
                String pagingUrl = "https://www.huxiu.com/v2_action/article_list";
                HttpPost httpPost = new HttpPost(pagingUrl);
                // Set the form parameters
                ArrayList<NameValuePair> arrayList = new ArrayList<NameValuePair>();
                arrayList.add(new BasicNameValuePair("huxiu_hash_code", "fb7f7403c58c3e8cb45aa47afc204c10"));
                arrayList.add(new BasicNameValuePair("page", page + ""));
                arrayList.add(new BasicNameValuePair("last_dateline", dateLine));
                httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
                // Execute the request
                String jsonText = getHtmlByRequest(httpPost);
                // Deserialize the json into an object
                Gson gson = new Gson();
                HuxiuPagingResponse huxiuPagingResponse = gson.fromJson(jsonText, HuxiuPagingResponse.class);
                // Every response carries a new last_dateline for the next request
                dateLine = huxiuPagingResponse.getLast_dateline();
                // The data field holds an html fragment
                String htmlData = huxiuPagingResponse.getData();

                Document doc = Jsoup.parse(htmlData);
                // Select the divs that carry a data-aid attribute
                Elements aidElements = doc.select("div[data-aid]");
                // Collect the aid of each article
                for (Element element : aidElements) {
                    String aid = element.attr("data-aid");
                    urlQueue.put(aid);
                }
            } catch (Exception e) {
                // log.error()
                System.out.println(page);
                System.out.println(e);
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

        }
    }

    /**
     * Fetch an html document for the given url.
     * @throws IOException
     * @throws ClientProtocolException
     */
    public static String getHtml(String aidUrl) throws IOException, ClientProtocolException {
        // Build an HttpGet request for the article url
        HttpGet indexHttpGet = new HttpGet(aidUrl);
        return getHtmlByRequest(indexHttpGet);
    }
    private static String getHtmlByRequest(HttpRequestBase request) throws IOException, ClientProtocolException {
        // Set the User-Agent request header
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

        // 3. Execute the request with HttpClient to obtain an entity;
        //    close both the client and the response so connections are not leaked
        String html = null;
        try (CloseableHttpClient indexHttpClient = HttpClients.createDefault();
                CloseableHttpResponse indexResponse = indexHttpClient.execute(request)) {
            if (200 == indexResponse.getStatusLine().getStatusCode()) {
                HttpEntity indexEntity = indexResponse.getEntity();
                // 4. Convert the entity into a string (the html)
                html = EntityUtils.toString(indexEntity, Charset.forName("utf-8"));
            }
        }
        return html;
    }
}
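
Note that getHtmlByRequest builds a new CloseableHttpClient for every call, which is simple but wasteful when thousands of articles are downloaded. A minimal alternative sketch, assuming the same HttpClient 4.5.x API, that shares one client across requests (SharedClientFetcher is an illustrative class, not part of the original project):

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class SharedClientFetcher {
    // One client, backed by a connection pool, reused for every request
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    public static String fetch(HttpRequestBase request) throws IOException {
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // Only the response is closed; the shared client stays open for reuse
        try (CloseableHttpResponse response = CLIENT.execute(request)) {
            if (200 == response.getStatusLine().getStatusCode()) {
                return EntityUtils.toString(response.getEntity(), Charset.forName("utf-8"));
            }
            return null;
        }
    }
}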

ProcessSinglePageRunnable: sends a network request for each article url

public class ProcessSinglePageRunnable  implements Runnable {

    public void run() {
        while (true) {
            try {
                processSingleUrl();
                Thread.sleep(3000);
            } catch (InterruptedException e) {
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        String aid = HuXiuSpider.urlQueue.take();
        String aidUrl = "http://www.huxiu.com/article/" + aid + ".html";
        try {
            /*Article article = new Article();
            article.setId(aid);*/
            // Fetch the html of this single article page
            String aidHtml = HuXiuSpider.getHtml(aidUrl);
            HuXiuSpider.articleHtmlQueue.put(aidHtml);
        } catch (Exception e) {
            System.out.println(aidUrl);
            System.out.println(e);
        }
    }
}

ParseHtmlRunnable: parses each article page

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class ParseHtmlRunnable implements Runnable {

    public void run() {
        while (true) {
            String html = null;
            try {
                html = HuXiuSpider.articleHtmlQueue.take();
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
            // Guard against an interrupted take() leaving html as null
            if (html == null) {
                continue;
            }
            Article article = new Article();
            Document detailDocument = Jsoup.parse(html);

            // Select the div carrying the data-aid attribute
            Elements aidElements = detailDocument.select("div[data-aid]");
            String aid=aidElements.get(0).attr("data-aid");
            article.setId(aid);
            System.out.println(aid+".........");

            // Parse the article title (.t-h1)
            Elements titles = detailDocument.select(".t-h1");
            String title = titles.get(0).text();
            article.setTitle(title);

            // Parse the author (.author-name)
            Elements names = detailDocument.select(".author-name");
            String name = names.get(0).text();
            article.setAuthor(name);
            // Parse the publish time
            Elements dates = detailDocument.select("[class^=article-time]");
            String date = dates.get(0).text();
            article.setCreateTime(date);
            // Parse the favorites count
            Elements shares = detailDocument.select("[class^=article-share]");
            String share = shares.get(0).text();
            article.setSc(share);
            // Parse the comment count
            Elements pls = detailDocument.select("[class^=article-pl]");
            String pl = pls.get(0).text();
            article.setPl(pl);
            // Parse the like count (.num)
            Elements nums = detailDocument.select(".num");
            String num = nums.get(0).text();
            article.setZan(num);
            // Parse the article body text (.article-content-wrap)
            Elements content = detailDocument.select(".article-content-wrap p");
            String contentText = content.text();
            article.setContent(contentText);
            // Rebuild the article url from the aid (the original left this line commented out, so the url column stayed empty)
            article.setUrl("http://www.huxiu.com/article/" + aid + ".html");
            try {
                HuXiuSpider.articleContentQueue.put(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

}
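
The selectors above rely on Jsoup's attribute syntax: div[data-aid] matches divs that carry a data-aid attribute, and [class^=article-share] matches any element whose class attribute starts with article-share. A small self-contained sketch against a fabricated html fragment (SelectorDemo and the markup below only mimic the structure ParseHtmlRunnable expects; they are not real Huxiu html):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SelectorDemo {
    public static void main(String[] args) {
        // Fabricated fragment with the same class and data attributes used by the parser
        String html = "<div data-aid='123456'>"
                + "<h1 class='t-h1'>Sample title</h1>"
                + "<span class='author-name'>Sample author</span>"
                + "<span class='article-time pull-left'>2017-09-01 10:00</span>"
                + "<span class='article-share pull-left'>Favorites 3</span>"
                + "<span class='article-pl pull-left'>Comments 5</span>"
                + "<span class='num'>10</span>"
                + "<div class='article-content-wrap'><p>Body text</p></div>"
                + "</div>";

        Document doc = Jsoup.parse(html);
        // Attribute selector: any div with a data-aid attribute
        System.out.println(doc.select("div[data-aid]").get(0).attr("data-aid"));  // 123456
        // Attribute-starts-with selector: class begins with "article-share"
        System.out.println(doc.select("[class^=article-share]").get(0).text());   // Favorites 3
        // Descendant selector: paragraphs inside the content wrapper
        System.out.println(doc.select(".article-content-wrap p").text());         // Body text
    }
}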

SaveDBRunnable: saves the data to the database

public class SaveDBRunnable implements Runnable {
    public void run() {
        while (true) {
            try {
                Article article = HuXiuSpider.articleContentQueue.take();
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}