js爬取国家统计局行政区划(广东省)
程序员文章站
2022-05-04 18:09:06
...
整体说明
由于 ajax 不支持 GBK 编码，直接请求国家统计局的页面时返回的是乱码，所以用 Java 写了一个转发接口：它只是把国家统计局返回的内容转成 UTF-8 后原样返回，代码就不贴了。
代码
<!DOCTYPE HTML
PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd">
<HTML>
<HEAD>
<META content="text/html; charset=gb2312" http-equiv=Content-Type>
</HEAD>
<body id="moshiyuan">
</body>
<head>
<script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
<script type="text/javascript">
// Flat list of every { name, code } entry collected by the crawl functions.
const areaData = [];

// Start the crawl once the DOM is ready.
$(function () {
  // getShi() returns a promise; handle rejection here so a single failed
  // request is reported instead of becoming an unhandled rejection, and dump
  // whatever was collected so far so a long crawl is not lost.
  getShi().catch((err) => {
    console.error('Area crawl aborted:', err);
    console.log(JSON.stringify(areaData));
  });
});
/**
 * Crawls the province index page ("44.html") and records every city row,
 * descending into each city's sub-page.
 *
 * Every `.citytr` row contributes `{ name, code }` to the shared `areaData`
 * array (code = text of the row's first cell, name = text of its last cell).
 * When the first cell contains a link, that page is crawled with `getQu`
 * before the next row is processed, so requests are issued one at a time.
 * After the last row the accumulated list is printed as a JSON string.
 *
 * Declared as a function (not assigned to an undeclared identifier) so it
 * does not rely on sloppy-mode implicit globals.
 *
 * @returns {Promise<void>} Settles when the whole crawl has finished;
 *   rejects if any page request fails.
 */
async function getShi() {
  const response = await getPage("44.html");
  const rows = $(response).find('.citytr');
  for (let i = 0; i < rows.length; i++) {
    const cells = $(rows[i]).find('td');
    const first = cells.first();
    const last = cells.last();
    const item = {
      name: last.text(),
      code: first.text(),
    };
    areaData.push(item);

    const href = first.find('a').attr('href');
    if (href) {
      await getQu(href);
    }
    // Short progress line instead of dumping the whole growing array
    // after every city.
    console.log(`[${i + 1}/${rows.length}] ${item.name}: ${areaData.length} entries so far`);
  }
  console.log(JSON.stringify(areaData));
}
/**
 * Crawls one city page and records its county rows, descending into each
 * county's sub-page.
 *
 * Every `.countytr` row contributes `{ name, code }` to `areaData`
 * (code = first cell text, name = last cell text); rows whose first cell
 * contains a link are followed with `getJiedao(url, href)`.
 *
 * NOTE(review): some city pages are believed to have no county level and to
 * list `.towntr` rows directly (Dongguan / Zhongshan — confirm against the
 * live pages). Such rows are recorded here and followed with `getshequ` so
 * those cities are not silently left empty. On pages without `.towntr` rows
 * this second pass does nothing.
 *
 * @param {string} url Path of the city page, relative to the site's base
 *   directory (e.g. the href taken from the province page).
 * @returns {Promise<void>} Settles when this city's subtree has been crawled.
 */
async function getQu(url) {
  const response = await getPage(url);
  const page = $(response);

  // Records every row matching `selector` and follows its link with `descend`.
  const crawlRows = async (selector, descend) => {
    const rows = page.find(selector);
    for (let i = 0; i < rows.length; i++) {
      const cells = $(rows[i]).find('td');
      const first = cells.first();
      const last = cells.last();
      areaData.push({
        name: last.text(),
        code: first.text(),
      });
      const href = first.find('a').attr('href');
      if (href) {
        await descend(url, href);
      }
    }
  };

  await crawlRows('.countytr', getJiedao);
  await crawlRows('.towntr', getshequ);
}
/**
 * Crawls one county page and records its town rows, descending into each
 * town's sub-page.
 *
 * The page path is built by replacing the file name of `url1` with `url2`
 * (everything up to and including the last "/" of `url1`, then `url2`).
 * Every `.towntr` row contributes `{ name, code }` to `areaData`
 * (code = first cell text, name = last cell text); rows whose first cell
 * contains a link are followed with `getshequ(url, href)`.
 *
 * @param {string} url1 Path of the parent page the link was found on.
 * @param {string} url2 Href found on that page, relative to its directory.
 * @returns {Promise<void>} Settles when this county's subtree has been crawled.
 */
async function getJiedao(url1, url2) {
  // lastIndexOf returns -1 when url1 has no "/", which makes the prefix empty.
  const parentDir = url1.substring(0, url1.lastIndexOf('/') + 1);
  const url = parentDir + url2;

  const response = await getPage(url);
  const rows = $(response).find('.towntr');
  for (let i = 0; i < rows.length; i++) {
    const cells = $(rows[i]).find('td');
    const first = cells.first();
    const last = cells.last();
    areaData.push({
      name: last.text(),
      code: first.text(),
    });
    const href = first.find('a').attr('href');
    if (href) {
      await getshequ(url, href);
    }
  }
}
/**
 * Crawls one town page and records its village rows. This is the deepest
 * level visited: no links are followed from here.
 *
 * The page path is built by replacing the file name of `url1` with `url2`.
 * Every `.villagetr` row contributes `{ name, code }` to `areaData`
 * (code = first cell text, name = last cell text).
 *
 * @param {string} url1 Path of the parent page the link was found on.
 * @param {string} url2 Href found on that page, relative to its directory.
 * @returns {Promise<void>} Settles when the page has been fetched and recorded.
 */
async function getshequ(url1, url2) {
  const parentDir = url1.substring(0, url1.lastIndexOf('/') + 1);
  const response = await getPage(parentDir + url2);

  const rows = $(response).find('.villagetr');
  for (let i = 0; i < rows.length; i++) {
    const cells = $(rows[i]).find('td');
    areaData.push({
      name: cells.last().text(),
      code: cells.first().text(),
    });
  }
}
/**
 * Fetches one page of the statistics site through the local forwarding
 * endpoint and resolves with the response body.
 *
 * The target address (fixed base URL + `url2`) is sent as the JSON body
 * `{"url": "..."}` in a POST to http://127.0.0.1:9090/v1/test/area.
 *
 * @param {string} url2 Page path relative to the base URL, e.g. "44.html".
 * @returns {Promise<*>} Resolves with the data returned by the endpoint;
 *   rejects if the request fails.
 */
async function getPage(url2) {
  // Pause before every request so we do not hit the site too quickly and get
  // blocked by its firewall. A timer is awaited instead of busy-waiting so
  // the browser's main thread stays responsive during the pause.
  await new Promise((resolve) => setTimeout(resolve, 500));

  const baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/';
  const payload = { "url": baseUrl + url2 };

  // `scriptCharset` was dropped: jQuery applies it only to the "script"
  // transport, so it had no effect on this POST request.
  const html = await $.ajax({
    url: "http://127.0.0.1:9090/v1/test/area",
    data: JSON.stringify(payload),
    type: "POST",
    contentType: "application/json",
  });
  return html;
}
/**
 * Synchronously stalls the caller by spinning on the wall clock until more
 * than `d` milliseconds have passed since the call started.
 *
 * This is a busy-wait: nothing else on the calling thread runs while it
 * spins.
 *
 * @param {number} d Minimum time to stall, in milliseconds.
 * @returns {void}
 */
function sleep (d){
  const startedAt = Date.now();
  let elapsed;
  do {
    elapsed = Date.now() - startedAt;
  } while (elapsed <= d);
}
</script>
</head>
</HTML>
上一篇: Python教程(一)--第一个helloworld程序
下一篇: jar包执行完删除自己