Node实现爬虫
程序员文章站
2022-05-30 12:15:32
...
个人链接
github:https://github.com/webxing
简书:https://www.jianshu.com/u/489662a091fd
爬虫
- 按照一定的规则自动抓取网络信息的程序
反爬虫
- User-Agent,Referer,验证码
- 单位时间访问次数,访问量
- 关键信息图片混淆
- 异步加载
爬虫利器puppeteer
- 中文文档:https://zhaoqize.github.io/puppeteer-api-zh_CN/#/
- 英文文档:https://github.com/GoogleChrome/puppeteer/blob/v1.8.0/docs/api.md
截屏
- https://github.com/GoogleChrome/puppeteer
- 安装puppeteer依赖:
npm i puppeteer
const puppeteer = require('puppeteer');
const { screenshot } = require('./config');
// Open the Baidu home page in headless Chromium and save a screenshot of it
// into the directory exported as `screenshot` by ./config.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://baidu.com');
    // Date.now() gives every capture a distinct file name.
    await page.screenshot({ path: `${screenshot}/${Date.now()}.png` });
  } finally {
    // Always shut the browser down, even when navigation or the capture
    // throws, so a failed run does not leave a Chromium process behind.
    await browser.close();
  }
})();
爬取百度图片
/**
 * Search for `target` on the current page and hand every result image URL
 * to `src2img`.
 *
 * Types `target` into the `#q` input, clicks `#search`, waits for the
 * resulting navigation to finish loading, collects the `src` of every
 * `img.main_img` element and calls `src2img(src, imagesPath)` for each one.
 * `src2img`, `imagesPath` and `chalk` are defined outside this function.
 *
 * @param {object} page - Puppeteer page that already shows the search form.
 * @param {string} target - Search keyword.
 * @returns {Promise<string[]>} The collected image URLs, resolved once every
 *   `src2img` call has settled successfully.
 * @throws Rejects if the navigation times out or any `src2img` call rejects.
 */
async function getImg(page, target) {
  // Optional: reset the viewport size before searching.
  // await page.setViewport({
  //   width: 1920,
  //   height: 1080
  // })
  // console.log(chalk.green('reset viewport'))
  await page.focus('#q');
  await page.keyboard.sendCharacter(target);

  // Start waiting for the navigation BEFORE clicking: a listener attached
  // after the click can miss a fast 'load' event, and a permanent
  // page.on('load') handler would pile up on every call.
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'load' }),
    page.click('#search'),
  ]);
  console.log(chalk.green('go to search list'));
  console.log(chalk.green('page loading done, start fetch...'));

  const imageSources = await page.$$eval('img.main_img', (imgs) => imgs.map((img) => img.src));
  console.log('获取完成');

  // forEach ignores the promises returned by an async callback; Promise.all
  // actually waits for the saves and surfaces their failures to the caller.
  await Promise.all(imageSources.map((src) => src2img(src, imagesPath)));
  return imageSources;
}
获取海词近义词
/**
 * Look up `target` on the current page and collect the related words listed
 * in the result.
 *
 * Waits via `sleep()` (defined elsewhere; presumably a throttling delay —
 * confirm against its definition), types `target` into `#q`, clicks
 * `#search`, waits for the resulting navigation to finish loading, then
 * reads the trimmed link text of every `.rel .nfo li a` element.
 *
 * @param {object} page - Puppeteer page that already shows the search form.
 * @param {string} target - Word to look up.
 * @returns {Promise<string[]>} The trimmed texts of the matched links.
 * @throws Rejects if the navigation times out or a page call fails.
 */
async function getWord(page, target) {
  await sleep();
  await page.focus('#q');
  // The per-key delay makes the input look like human typing.
  await page.keyboard.type(target, { delay: 100 });

  // Start waiting for the navigation BEFORE clicking: a listener attached
  // after the click can miss a fast 'load' event, and a permanent
  // page.on('load') handler would pile up on every call.
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'load' }),
    page.click('#search'),
  ]);
  console.log(chalk.green('go to search list'));
  console.log(chalk.green('page loading done, start fetch...'));

  const relatedWords = await page.$$eval('.rel .nfo li a', (links) => links.map((link) => link.text.trim()));
  console.log(relatedWords);

  // Kept from the original flow: send a single Backspace key press once the
  // results have been read.
  await page.keyboard.press('Backspace');
  return relatedWords;
}
结语
初学node,记录之用。