欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页

node.js爬取国家统计局全国行政区数据

程序员文章站 2022-05-04 18:09:24
...

github地址:https://github.com/username-xu/node-districts

const fs = require('fs');

// 网络请求,文档可参考:https://www.jianshu.com/p/1432e0f29abd
const superagent = require('superagent');

// 设置编码格式,文档:https://www.npmjs.com/package/superagent-charset
require('superagent-charset')(superagent);

// DOM操作,语法类似jquery,文档可参考:https://www.jianshu.com/p/629a81b4e013
const cheerio = require('cheerio');

/**
 * Script entry point: starts the crawl at the province level.
 *
 * getSheng() is async, so its promise is given an explicit rejection handler
 * instead of being left floating. On failure the error is logged and the
 * process is terminated with a non-zero exit code; exiting explicitly also
 * makes sure a still-pending progress timer cannot keep the process alive.
 */
const main = () => {
    getSheng().catch((err) => {
        console.error(err);
        process.exit(1);
    });
}

/**
 * Fetch one page of the statistics site and return its decoded HTML.
 *
 * @param {string} url2 - Path relative to the 2018 index directory
 *     (e.g. 'index.html').
 * @returns {Promise<string>} The response text decoded as gb2312, or an empty
 *     string when the request fails (the error is logged, not rethrown).
 */
const getPage = async (url2) => {

    // Throttle requests so the remote firewall does not block us. The wait is
    // a timer-based, non-blocking delay: a synchronous busy-wait here would
    // freeze the event loop (and any running timers) for the whole second.
    await new Promise((resolve) => setTimeout(resolve, 1000));

    const url1 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
    try {
        const res = await superagent
            .get(url1 + url2)
            .set({
                // Imitate a regular browser request
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate',
                'content-type': 'text/html',
                'Content-Length': Buffer.byteLength(""),
                'Connection': 'keep-alive'
            })
            .buffer(true)
            // Decode the response body with the given charset
            .charset('gb2312');

        return res.text;
    } catch (err) {
        // Best effort: log the failure and let callers parse an empty page.
        console.log(err);
        return '';
    }
}

/**
 * Crawl every province listed on the index page, descend into its cities and
 * districts, and write the resulting tree via output().
 *
 * While the crawl runs, a one-second interval prints the elapsed seconds.
 * The interval is cleared in a `finally` block so that a failure anywhere in
 * the crawl cannot leave the timer running and keep the process alive.
 *
 * @returns {Promise<void>}
 */
const getSheng = async () => {
    const startedAt = Date.now();
    const timer = setInterval(() => {
        console.log('抓取中 '+ Math.floor((Date.now() - startedAt) / 1000));
    }, 1000);

    try {
        const list = [];
        const page = await getPage('index.html');
        const $ = cheerio.load(page);
        const htmlList = $('.provincetr td a');

        for (let i = 0; i < htmlList.length; i++) {
            const link = $(htmlList[i]);
            const url = link.attr('href');

            // An anchor without href carries no province page to follow and
            // would make url.slice() throw; skip it.
            if (!url) {
                continue;
            }

            const item = {
                name: link.text(),
                // The href starts with the two-digit province prefix;
                // pad it to a six-digit code.
                code: url.slice(0, 2) + '0000',
                children: []
            };

            // Provinces are fetched one after another on purpose (throttling).
            list.push(await getShi(url, item));
        }

        output(list);
    } finally {
        clearInterval(timer);
    }
}

/**
 * Fetch a province page and append each of its cities (with their districts)
 * to the province item.
 *
 * @param {string} url - Province page path, relative to the site base used by getPage.
 * @param {{children: Array}} shengItem - Province item; its `children` array
 *     is filled in place.
 * @returns {Promise<Object>} The same `shengItem`, after its children are collected.
 */
const getShi = async (url, shengItem) => {
    const page = await getPage(url);
    const $ = cheerio.load(page);

    for (const row of $('.citytr').toArray()) {
        const cells = $(row).find('td');
        const codeLink = cells.first().find('a');
        const nameLink = cells.last().find('a');
        const itemUrl = codeLink.attr('href');

        // A row without a link has no name/code text and no page to follow;
        // skip it instead of requesting ".../undefined" and recording an
        // empty entry (same treatment getQu gives to empty rows).
        if (!itemUrl) {
            continue;
        }

        const item = {
            name: nameLink.text(),
            // Keep only the first six digits of the code shown in the first cell.
            code: codeLink.text().slice(0, 6),
            children: []
        };

        // Cities are fetched sequentially on purpose (throttling).
        shengItem.children.push(await getQu(itemUrl, item));
    }

    return shengItem;
}

/**
 * Fetch a city page and append each linked district row to the city item.
 *
 * @param {string} itemUrl - City page path, relative to the site base used by getPage.
 * @param {{children: Array}} shiItem - City item; its `children` array is
 *     filled in place.
 * @returns {Promise<Object>} The same `shiItem`, after its children are collected.
 */
const getQu = async (itemUrl, shiItem) => {
    const $ = cheerio.load(await getPage(itemUrl));

    $('.countytr').each((_, row) => {
        const cells = $(row).find('td');
        const codeText = cells.first().find('a').text();
        const nameText = cells.last().find('a').text();

        // Rows whose first or last cell has no link text are not recorded.
        if (!nameText || !codeText) {
            return;
        }

        shiItem.children.push({
            name: nameText,
            code: codeText.slice(0, 6)
        });
    });

    return shiItem;
}

/**
 * Synchronously block the current thread until more than `d` milliseconds
 * have elapsed (busy-wait; nothing else runs in the meantime).
 *
 * @param {number} d - Minimum pause in milliseconds.
 * @returns {void}
 */
const sleep = d => {
    const startedAt = Date.now();
    let elapsed;
    do {
        elapsed = Date.now() - startedAt;
    } while (elapsed <= d);
}

/**
 * Serialize the crawled data as JSON and write it to `data.json` in the
 * current working directory.
 *
 * fs.writeFileSync is synchronous: it takes (file, data, options) and never
 * invokes a callback, so a write failure is reported by a thrown exception
 * that propagates to the caller.
 *
 * @param {*} data - Any JSON-serializable value.
 * @returns {void}
 */
const output = data => {
    fs.writeFileSync('data.json', JSON.stringify(data));
}

// Start the crawl as soon as this script is loaded.
main();