node 爬虫demo

程序员文章站 2022-07-14 11:18:02

...

主要是在 getHrefList 函数上卡了比较久：在 map 循环里发送请求，然后在请求结果中还要再循环一次。起初一直拿不到 urlsArray 数组，不是 undefined 就是 []，主要还是异步的问题。目前这个版本虽然拿到了所有链接，但是链接的顺序与页码是否匹配，这方面还没测试；我想是没问题的，毕竟我在获取每一页结构的地方使用了 await 关键字……

代码

const axios = require("axios");
const cheerio = require("cheerio");
const async = require("async");
const Koa = require("koa");
const router = require("koa-router")();
const app = new Koa();
// GET / triggers one crawl run. The crawl is fire-and-forget: start() is not
// awaited, so the HTTP response is sent while the crawl is still in progress.
router.get('/', ctx => {
    start();
    // Koa replies 404 when a handler leaves the body unset, so acknowledge the
    // request explicitly instead of making a successful trigger look like an error.
    ctx.body = "crawl started";
});
// Running count of article pages fetched so far; incremented by itemDo and
// passed to getTitle as the sequence number shown in the log.
let index = 0;
/**
 * Runs one crawl: fetches the list of article URLs with getHrefList, then
 * processes them with itemDo, at most 5 at a time.
 *
 * Failures are reported on stderr instead of being dropped: a rejection from
 * getHrefList, or an error handed to the mapLimit completion callback, is logged
 * and the "finished" message is not printed.
 *
 * @returns {void}
 */
function start() {
    getHrefList()
        .then((urls) => {
            // Cap the number of concurrent requests at 5.
            async.mapLimit(urls, 5, itemDo, (err) => {
                if (err) {
                    console.error("crawl aborted:", err);
                    return;
                }
                console.log("已完毕");
            });
        })
        .catch((err) => {
            // Without this handler a failed listing request would surface only as
            // an unhandled promise rejection.
            console.error("crawl failed:", err);
        });
}

/**
 * Worker passed to async.mapLimit: downloads one article page and reports its title.
 *
 * On success the module-level counter `index` is incremented, the page HTML is
 * handed to getTitle for logging, and `callback(null, html)` is invoked after a
 * random pause. On failure `callback(err)` is invoked so the surrounding
 * mapLimit run can finish instead of waiting forever on this item.
 *
 * @param {string} url - Address of the article page to fetch.
 * @param {function(?Error, string=): void} callback - Node-style completion callback.
 * @returns {void}
 */
function itemDo(url, callback) {
    // Random pause of 0-1999 ms before this item is reported as done.
    // Math.floor is used rather than parseInt because parseInt stringifies its
    // argument first and mis-parses very small floats written in exponent form.
    const delay = Math.floor(Math.random() * 2000);
    axios.get(url).then((res) => {
        index = index + 1;
        // Log the details of this article.
        getTitle(res.data, index, url, delay);
        setTimeout(() => {
            callback(null, res.data);
        }, delay);
    }).catch((err) => {
        // Propagate request (or logging) failures to the caller.
        callback(err);
    });
}

/**
 * Collects article links from the cnblogs AggSitePostList endpoint.
 *
 * One POST request is sent per listing page, and all pages are requested
 * concurrently. The href of every `.titlelnk` element in each response is
 * gathered. The result is ordered by page number, then by position within the
 * page, regardless of the order in which the responses arrive.
 *
 * @param {number} [pageCount=1] - Number of listing pages to fetch, starting at page 1.
 * @returns {Promise<string[]>} Resolves with the collected hrefs; rejects if any
 *     page request fails.
 */
function getHrefList(pageCount = 1) {
    // Build one request payload per page; only PageIndex differs between pages.
    const pageData = [];
    for (let i = 0; i < pageCount; i++) {
        pageData.push({
            CategoryId: 808,
            CategoryType: "SiteHome",
            ItemListActionName: "AggSitePostList",
            PageIndex: i + 1,
            ParentCategoryId: 0,
            TotalPostCount: 4000
        });
    }

    const pageRequests = pageData.map(async (payload) => {
        // Fetch the markup of one listing page.
        const res = await axios({
            url: "https://www.cnblogs.com/AggSite/AggSitePostList",
            method: "POST",
            data: payload
        });
        const $ = cheerio.load(res.data);
        // Collect the href of every a.titlelnk on this page into its own array,
        // so concurrent pages cannot interleave their links.
        const pageUrls = [];
        $('.titlelnk').each((position, link) => {
            pageUrls.push($(link).attr('href'));
        });
        return pageUrls;
    });

    // Promise.all yields results in input order, so flattening keeps page order.
    // Returning the chain directly also lets a failed request reject the result
    // instead of leaving the caller with a promise that never settles.
    return Promise.all(pageRequests).then((pages) => [].concat(...pages));
}
/**
 * Prints one progress line for a fetched article page.
 *
 * The title is the text of the `#cb_post_title_url` element in the given HTML.
 *
 * @param {string} html - Markup of the article page.
 * @param {number} index - Sequence number printed at the start of the line.
 * @param {string} url - Address the page was fetched from.
 * @param {number} delay - Delay in milliseconds reported in the line.
 * @returns {void}
 */
function getTitle(html, index, url, delay) {
    const $ = cheerio.load(html);
    const title = $('#cb_post_title_url').text();
    const titlePart = `标题:${title}`;
    const delayPart = `耗时${delay}毫秒`;
    console.log(index, '正在抓取的是', url, titlePart, delayPart);
}

// Mount the router's route-handling middleware on the Koa application.
app.use(router.routes());

// Start the HTTP server on port 3000 and log once the listen callback fires.
app.listen(3000, function onListening() {
    console.log("app listen port 3000");
});

结果:

node 爬虫demo

node 爬虫demo

Node.js 实现简单的接口服务器的实例代码

详解用node编写自己的cli工具

Node.js对MongoDB数据库实现模糊查询的方法

浅谈Node.js轻量级Web框架Express4.x使用指南

利用node.js写一个爬取知乎妹纸图的小爬虫

详解Node.js串行化流程控制

js加密怎么破解（爬虫破解js加密的流程）

爬虫小项目！适合有基础的！爬取葡萄酒评分！哪个阶段和那种酒呢

Python爬虫实战用 BeautifulSoup 爬取电影网站信息

爬虫实战在 Python 中使用正则表达式