国家地理区域获取(国家统计局抓包)
程序员文章站
2024-01-20 18:49:40
...
国家地理区域获取(国家统计局抓包)
-
国家统计局统计地址
- http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/
-
引入 Java HTML 解析器 Jsoup
-
<!-- Java HTML parser (jsoup), used to fetch and query the statistics pages --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency>
-
-
编写 Java 程序代码
-
@Test public void getAreaTest() throws IOException { //创建文件输出流 FileWriter fileWriter = new FileWriter(new File("d://china_city.sql")); //设置访问url(国际统计局的统计地址) String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/"; //使用jsoup获取页面内容 Document doc = Jsoup.parse(new URL(baseUrl), 100000); //获取页面中<tr>标签且class = "provincetr" ,然后查询所有的a标签元素 Elements provinceTds = doc.select("tr[class=provincetr]").select("a"); String cityName; String parentCode; String baseFormat = "insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ({0},{1},{2},{3});"; //遍历数据 for (Element element : provinceTds) { //获取标签文本内容 cityName = element.text(); //获取省级code String href = element.attr("href"); int index = href.indexOf(".html"); String provinceCode = href.substring(0, index); parentCode = "0"; String provincePath = "''"; //格式化需要输出的字符串 String format = MessageFormat.format(baseFormat, "'" + cityName + "'", "'" + provinceCode + "'", "'" + parentCode + "'", provincePath); //System.out.println(format); //输出流输出内容并刷新 fileWriter.write(format); fileWriter.write("\r\n"); fileWriter.flush(); //获取市级访问url String cityUrl = baseUrl + href; doc = Jsoup.parse(new URL(cityUrl), 100000); 获取页面中<tr>标签且class = "citytr" Elements cityTds = doc.select("tr[class=citytr]"); for (Element cityTd : cityTds) { //获取元素中的所有a标签元素 Elements tds = cityTd.select("a"); //获取城市名 cityName = tds.get(1).text(); parentCode = provinceCode; //获取城市代码 String cityCode = tds.get(0).text(); //获取a标签的访问链接 href = tds.get(0).attr("href"); //格式化字符串 format = MessageFormat.format(baseFormat, "'" + cityName + "'", "'" + cityCode + "'", "'" + parentCode + "'", "'" + parentCode + "'"); fileWriter.write(format); fileWriter.write("\r\n"); fileWriter.flush(); //System.out.println(format); String countryUrl = baseUrl + href; doc = Jsoup.parse(new URL(countryUrl), 100000); Elements countryTds = doc.select("tr[class=countytr]"); for (Element countryTd : countryTds) { tds = countryTd.select("td"); cityName = 
tds.get(1).text(); parentCode = cityCode; String parentPath = provinceCode + "," + parentCode; String countryCode = tds.get(0).text(); format = MessageFormat.format(baseFormat, "'" + cityName + "'", "'" + countryCode + "'", "'" + parentCode + "'", "'" + parentPath + "'"); fileWriter.write(format); fileWriter.write("\r\n"); fileWriter.flush(); //System.out.println(format); } try { Thread.sleep(50); } catch (InterruptedException e) { e.printStackTrace(); } } } fileWriter.close(); }
-
-
获取统计内容(部分)如下
insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('北京市','11','0',''); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('市辖区','110100000000','11','11'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('东城区','110101000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('西城区','110102000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('朝阳区','110105000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('丰台区','110106000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('石景山区','110107000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('海淀区','110108000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('门头沟区','110109000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('房山区','110111000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('通州区','110112000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('顺义区','110113000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('昌平区','110114000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('大兴区','110115000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values 
('怀柔区','110116000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('平谷区','110117000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('密云区','110118000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('延庆区','110119000000','110100000000','11,110100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('天津市','12','0',''); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('市辖区','120100000000','12','12'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('和平区','120101000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河东区','120102000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河西区','120103000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('南开区','120104000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河北区','120105000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('红桥区','120106000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('东丽区','120110000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('西青区','120111000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('津南区','120112000000','120100000000','12,120100000000'); insert into sys_province_city 
(city_name,city_code,parent_code,parent_path) values ('北辰区','120113000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('武清区','120114000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('宝坻区','120115000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('滨海新区','120116000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('宁河区','120117000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('静海区','120118000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('蓟州区','120119000000','120100000000','12,120100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河北省','13','0',''); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('石家庄市','130100000000','13','13'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('市辖区','130101000000','130100000000','13,130100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('长安区','130102000000','130100000000','13,130100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('桥西区','130104000000','130100000000','13,130100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('新华区','130105000000','130100000000','13,130100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('井陉矿区','130107000000','130100000000','13,130100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values 
('裕华区','130108000000','130100000000','13,130100000000'); insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('藁城区','130109000000','130100000000','13,130100000000');
以上抓取方式可能有所拙劣,如果大家有更好的方法,可以留言互相参考!
上一篇: 跟踪外推效果怎么样做好呢?
下一篇: 比网站被降权更可怕的是“头痛医脚”