欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

js爬取国家统计局行政区划(广东省)

程序员文章站 2022-05-04 18:09:06
...

整体说明

由于 ajax 不支持 GBK,直接请求国家统计局返回的是乱码,所以用 Java 写了一个转发服务:它只是单纯地把国家统计局返回的内容转成 UTF-8 后再返回。转发服务的代码就不贴了。

代码

<!DOCTYPE HTML
    PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd">
<HTML>

<HEAD>
    <META content="text/html; charset=gb2312" http-equiv=Content-Type>
</HEAD>

<body id="moshiyuan">

</body>

<head>
    <script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
    <script type="text/javascript">
        // Flat list of every { name, code } record collected by the crawl functions.
        let areaData=[];
        // Start the crawl once the DOM is ready. The returned promise is handled
        // explicitly so that a failed request is reported instead of surfacing as
        // an unhandled rejection.
        $(function () {
            getShi().catch((err) => {
                console.error('Area crawl failed:', err);
            });
        });
        /**
         * Crawl the top-level page ("44.html"): record every `.citytr` row and
         * descend into each linked page via getQu.
         *
         * Each row contributes one { name, code } entry to areaData, where `code`
         * is the text of the row's first cell and `name` the text of its last cell.
         * Pages are processed strictly one after another.
         *
         * @returns {Promise<void>} settles after all rows and nested pages are processed.
         */
        const getShi = async () => {
            const response = await getPage("44.html");
            for (const row of Array.from($(response).find('.citytr'))) {
                const cells = $(row).find('td');
                const first = cells.first();
                areaData.push({
                    name: cells.last().text(),
                    code: first.text()
                });
                const href = first.find('a').attr('href');
                if (href) {
                    await getQu(href);
                }
                // Progress output: dump what has been collected so far.
                console.log(areaData);
            }
            console.log(JSON.stringify(areaData));
        };
        /**
         * Crawl one city-level page: record every `.countytr` row and descend
         * into each linked page via getJiedao.
         *
         * If the page has no `.countytr` rows at all, its `.towntr` rows are
         * recorded instead and their links are followed via getshequ, so such
         * pages are not silently skipped.
         *
         * @param {string} url page URL relative to the listing root (e.g. an href
         *   taken from the parent page); also used as the base for child hrefs.
         * @returns {Promise<void>} settles after all rows and nested pages are processed.
         */
        const getQu = async (url) => {
            const page = $(await getPage(url));

            // Extract the record (first cell = code, last cell = name) and the
            // optional child link from a single table row.
            const readRow = (row) => {
                const cells = $(row).find('td');
                const first = cells.first();
                return {
                    item: { name: cells.last().text(), code: first.text() },
                    href: first.find('a').attr('href'),
                };
            };

            const countyRows = Array.from(page.find('.countytr'));
            for (const row of countyRows) {
                const { item, href } = readRow(row);
                areaData.push(item);
                if (href) {
                    await getJiedao(url, href);
                }
            }
            if (countyRows.length > 0) {
                return;
            }

            // NOTE(review): some cities (reportedly Dongguan and Zhongshan) have no
            // county level and list town rows directly on the city page - confirm
            // against the live site. Town links are resolved against this page's URL.
            for (const row of Array.from(page.find('.towntr'))) {
                const { item, href } = readRow(row);
                areaData.push(item);
                if (href) {
                    await getshequ(url, href);
                }
            }
        };
        /**
         * Crawl one page of `.towntr` rows: record every row and descend into
         * each linked page via getshequ.
         *
         * @param {string} url1 URL of the parent page; everything up to and
         *   including its last '/' is used as the base for url2.
         * @param {string} url2 href of the page to fetch, treated as relative to
         *   url1's directory.
         * @returns {Promise<void>} settles after all rows and nested pages are processed.
         */
        const getJiedao = async (url1, url2) => {
            // With no '/' in url1, lastIndexOf gives -1 and the base is the empty string.
            const url = url1.substring(0, url1.lastIndexOf('/') + 1) + url2;
            const response = await getPage(url);
            for (const row of Array.from($(response).find('.towntr'))) {
                const cells = $(row).find('td');
                const first = cells.first();
                areaData.push({
                    name: cells.last().text(),
                    code: first.text()
                });
                const href = first.find('a').attr('href');
                if (href) {
                    await getshequ(url, href);
                }
            }
        };
        /**
         * Crawl one page of `.villagetr` rows and record every row. This is the
         * last level of the crawl: no links are followed from here.
         *
         * @param {string} url1 URL of the parent page; everything up to and
         *   including its last '/' is used as the base for url2.
         * @param {string} url2 href of the page to fetch, treated as relative to
         *   url1's directory.
         * @returns {Promise<void>} settles after all rows have been recorded.
         */
        const getshequ = async (url1, url2) => {
            const url = url1.substring(0, url1.lastIndexOf('/') + 1) + url2;
            const response = await getPage(url);
            for (const row of Array.from($(response).find('.villagetr'))) {
                const cells = $(row).find('td');
                // First cell holds the code, last cell the name; any cells in
                // between are ignored.
                areaData.push({
                    name: cells.last().text(),
                    code: cells.first().text()
                });
            }
        };
        /**
         * Fetch one listing page through the local forwarding endpoint.
         *
         * The target address (listing root + url2) is POSTed as JSON
         * `{"url": ...}` to http://127.0.0.1:9090/v1/test/area, and the
         * endpoint's response body is returned.
         *
         * @param {string} url2 page path relative to the 2020 listing root.
         * @param {number} [delayMs=500] pause before the request, in milliseconds.
         * @returns {Promise<*>} resolves with the response data of the POST;
         *   rejects if the request fails.
         */
        const getPage = async (url2, delayMs = 500) => {
            // Pause before every request so that requesting too fast does not get
            // us blocked by the firewall. A timer is used rather than a busy-wait
            // so the page stays responsive while waiting.
            await new Promise((resolve) => setTimeout(resolve, delayMs));
            const url1 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/';
            return $.ajax({
                url: "http://127.0.0.1:9090/v1/test/area",
                data: JSON.stringify({ url: url1 + url2 }),
                type: "POST",
                contentType: "application/json"
            });
        };
        /**
         * Synchronously block the calling thread by spinning on the wall clock.
         * Returns once more than `d` milliseconds have elapsed since the call.
         *
         * @param {number} d minimum time to block, in milliseconds.
         */
        function sleep (d){
            const startedAt = Date.now();
            let elapsed;
            do {
                elapsed = Date.now() - startedAt;
            } while (elapsed <= d);
        }
    </script>
</head>

</HTML>
相关标签: javascript