Java爬虫-2018国家统计局区划和城乡划分代码以及数据库、JSON文件
程序员文章站
2022-03-02 20:53:02
...
国家统计局:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
2018分析:
# 查看省份数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
# 查看 内蒙古 市级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
# 查看 内蒙古 区级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
# 查看 内蒙古 街道级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
# 查看 内蒙古 社区居委会级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html
发现这个是有规律的,15是内蒙古的区划代码,而1509是乌兰察布市的区划代码,前面的http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/这一大串都是一样的,我们就叫commonUrl。
规律就是:
# 获取省的数据 commonUrl + index.html
# 获取市级数据 commonUrl + 对应省级区划代码.html
# 获取县区级数据 commonUrl + 对应省级区划代码 + / + 对应市级区划代码.html
详细代码如下:
实体类
package com.reptile.area.jsoup;
import lombok.*;
import java.util.List;
/**
 * One entry of the administrative-division tree built by the crawler.
 *
 * <p>Accessors, builder and constructors are generated by the Lombok annotations below.
 */
@Data
@ToString
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Node {
// Display name of the division as read from the page.
private String name;
// Division code as read from the page.
private String code;
// Address of the page this entry was extracted from.
private String dataFromUrl;
// Child divisions one level below this one; left unset when none were collected.
private List<Node> nodes;
}
具体实现:
package com.reptile.area.jsoup;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;
/**
 * Crawler for the 2018 administrative-division pages published by the National Bureau of
 * Statistics (province, city, county, town and village levels).
 *
 * <p>Each {@code parseXxx} method downloads one page (decoded with {@link #CHARSET}), reads the
 * table rows carrying the matching CSS class and attaches the resulting {@link Node}s to the
 * parent node it was given. Links to child pages are resolved relative to the page that
 * contains them.
 *
 * @author zhang.xiaoming
 */
public class CityStats {
    /** Root address of the 2018 data set. */
    private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
    /** Charset passed to every HTTP request. */
    private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;
    /** Number of leading characters of the code column kept for a city (e.g. 1509). */
    private static final int CITY_CODE_LENGTH = 4;
    /** Number of leading characters of the code column kept for a county (e.g. 150902). */
    private static final int COUNTY_CODE_LENGTH = 6;
    /** Number of leading characters of the code column kept for a town (e.g. 150902003). */
    private static final int TOWN_CODE_LENGTH = 9;

    private CityStats() {
    }

    /**
     * Resolves a relative {@code href} found on {@code pageUrl} against that page's directory.
     *
     * @param pageUrl address of the page the link was found on
     * @param href    relative link taken from that page
     * @return everything of {@code pageUrl} up to and including its last {@code '/'}, followed
     *         by {@code href}
     */
    private static String resolve(String pageUrl, String href) {
        return pageUrl.substring(0, pageUrl.lastIndexOf('/') + 1) + href;
    }

    /**
     * Crawls the province index page, descends into every province and logs the complete tree
     * as pretty-printed JSON.
     *
     * @param url address of the province index page
     */
    public static void parseProvince(String url) {
        Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
        List<Node> provinces = new LinkedList<Node>();
        // Rows with class 'provincetr' hold the provinces; each link inside is one province.
        for (Element row : document.getElementsByClass("provincetr")) {
            for (Element link : row.getElementsByAttribute("href")) {
                String provinceName = link.text();
                String href = link.attr("href");
                // The first two characters of the link target are used as the province code.
                String provinceCode = href.substring(0, 2);
                StaticLog.info("provinceName: {} , provinceCode: {} .", provinceName, provinceCode);
                Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url).build();
                StaticLog.info("省级数据: {} ", provinceNode);
                parseCity(resolve(url, href), provinceNode);
                provinces.add(provinceNode);
            }
        }
        StaticLog.info(JSONUtil.toJsonPrettyStr(provinces));
    }

    /**
     * Crawls one province page and stores its cities (with their sub-trees) on the province.
     *
     * @param url          address of the province page
     * @param provinceNode node that receives the parsed cities as children
     */
    public static void parseCity(String url, Node provinceNode) {
        Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
        List<Node> cities = new LinkedList<Node>();
        for (Element tr : document.getElementsByClass("citytr")) {
            Elements links = tr.getElementsByTag("a");
            // A usable row has exactly two links: the code and the name.
            if (links.size() != 2) {
                continue;
            }
            String href = links.get(0).attr("href");
            String cityCode = links.get(0).text().substring(0, CITY_CODE_LENGTH);
            String cityName = links.get(1).text();
            Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).build();
            StaticLog.info("	市级数据:  {}  ", cityNode);
            parseCounty(resolve(url, href), cityNode);
            cities.add(cityNode);
        }
        provinceNode.setNodes(cities);
    }

    /**
     * Crawls one city page and stores its counties (with their sub-trees) on the city.
     *
     * @param url      address of the city page
     * @param cityNode node that receives the parsed counties as children
     */
    public static void parseCounty(String url, Node cityNode) {
        Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
        List<Node> counties = new LinkedList<Node>();
        for (Element tr : document.getElementsByClass("countytr")) {
            Elements links = tr.getElementsByTag("a");
            // Rows without both links cannot be followed and are skipped.
            if (links.size() != 2) {
                continue;
            }
            String href = links.get(0).attr("href");
            String countyCode = links.get(0).text().substring(0, COUNTY_CODE_LENGTH);
            String countyName = links.get(1).text();
            Node countyNode = Node.builder().name(countyName).code(countyCode).dataFromUrl(url).build();
            StaticLog.info("		县级数据:  {}  ", countyNode);
            parseTowntr(resolve(url, href), countyNode);
            // The county itself is collected; adding the city here would make the tree cyclic.
            counties.add(countyNode);
        }
        cityNode.setNodes(counties);
    }

    /**
     * Crawls one county page and stores its towns (with their villages) on the county.
     *
     * @param url        address of the county page
     * @param countyNode node that receives the parsed towns as children
     */
    public static void parseTowntr(String url, Node countyNode) {
        Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
        List<Node> towns = new LinkedList<Node>();
        for (Element tr : document.getElementsByClass("towntr")) {
            Elements links = tr.getElementsByTag("a");
            if (links.size() != 2) {
                continue;
            }
            String href = links.get(0).attr("href");
            // Town codes are nine digits long; six digits would only repeat the county code.
            String towntrCode = links.get(0).text().substring(0, TOWN_CODE_LENGTH);
            String towntrName = links.get(1).text();
            Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url).build();
            StaticLog.info("			乡镇级数据:  {}  ", towntrNode);
            // Villages belong to the town, not to the enclosing county.
            parseVillagetr(resolve(url, href), towntrNode);
            towns.add(towntrNode);
        }
        countyNode.setNodes(towns);
    }

    /**
     * Crawls one town page and stores its villages on the given parent node.
     *
     * @param url        address of the town page
     * @param countyNode node that receives the parsed villages as children (the town node when
     *                   called from {@link #parseTowntr(String, Node)})
     */
    public static void parseVillagetr(String url, Node countyNode) {
        Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
        List<Node> villages = new LinkedList<Node>();
        for (Element tr : document.getElementsByClass("villagetr")) {
            Elements tds = tr.getElementsByTag("td");
            // Only three-cell rows are used: first cell is the code, third cell is the name.
            if (tds.size() != 3) {
                continue;
            }
            String villagetrCode = tds.get(0).text();
            String villagetrName = tds.get(2).text();
            Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
            StaticLog.info("				村级数据:  {}  ", villagetrNode);
            villages.add(villagetrNode);
        }
        countyNode.setNodes(villages);
    }

    /**
     * Runs the full crawl from the province index and then each lower level once on a sample
     * page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        /*
         * Page layout, using Inner Mongolia (code 15) as the example:
         *
         * provinces    http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
         * cities       http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
         * counties     http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
         * towns        http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
         * villages     http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html
         *
         * 15 is the code of Inner Mongolia and 1509 the code of Ulanqab; the leading part of
         * every address is identical and is kept in COMMON_URL. The pattern is:
         *
         * provinces: COMMON_URL + index.html
         * cities:    COMMON_URL + <province code>.html
         * counties:  COMMON_URL + <province code> + / + <city code>.html
         */
        String provinceUrl = COMMON_URL + "index.html";
        CityStats.parseProvince(provinceUrl);
        String cityUrl = COMMON_URL + "15.html";
        CityStats.parseCity(cityUrl, new Node());
        String countyUrl = COMMON_URL + "15/1509.html";
        CityStats.parseCounty(countyUrl, new Node());
        String towntrUrl = COMMON_URL + "15/09/150981.html";
        CityStats.parseTowntr(towntrUrl, new Node());
    }
}
####################################################################################################
mysql数据等代码实现
package com.reptile.area.decorator;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.reptile.area.jsoup.Node;
import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;
/**
* *省市县解析器
*/
public class CityParser implements ICityParser {
private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;
public List<Node> parseProvinces(String url) {
return parseProvince(COMMON_URL + "index.html");
}
private List<Node> parseProvince(String url) {
String htmlStr = HttpUtil.get(url, CHARSET);
Document document = Jsoup.parse(htmlStr);
// 获取 class='provincetr' 的元素
Elements elements = document.getElementsByClass("provincetr");
List<Node> provinces = new LinkedList<Node>();
for (Element element : elements) {
// 获取 elements 下属性是 href 的元素
Elements links = element.getElementsByAttribute("href");
for (Element link : links) {
String provinceName = link.text();
String href = link.attr("href");
String provinceCode = href.substring(0, 2);
Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url)
.nodes(parseCity(COMMON_URL + href)).build();
StaticLog.info("省级数据: {} ", provinceNode);
provinces.add(provinceNode);
}
}
return provinces;
}
private List<Node> parseCity(String url) {
String htmlStr = HttpUtil.get(url, CHARSET);
Document document = Jsoup.parse(htmlStr);
Elements trs = document.getElementsByClass("citytr");
List<Node> cities = new LinkedList<Node>();
for (Element tr : trs) {
Elements links = tr.getElementsByTag("a");
String href = links.get(0).attr("href");
String cityCode = links.get(0).text().substring(0, 4);
String cityName = links.get(1).text();
Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url)
.nodes(parseCounty(COMMON_URL + href)).build();
StaticLog.info(" 市级数据: {} ", cityNode);
cities.add(cityNode);
}
return cities;
}
private List<Node> parseCounty(String url) {
String htmlStr = HttpUtil.get(url, CHARSET);
Document document = Jsoup.parse(htmlStr);
Elements trs = document.getElementsByClass("countytr");
List<Node> counties = new LinkedList<Node>();
for (Element tr : trs) {
Elements links = tr.getElementsByTag("a");
if (links == null || links.size() != 2) {
continue;
}
String href = links.get(0).attr("href");
String countyCode = links.get(0).text().substring(0, 6);
String countyName = links.get(1).text();
Node countyNode = Node.builder().code(countyCode).name(countyName).dataFromUrl(url)
.nodes(parseTowntr(COMMON_URL + href.subSequence(2, 5).toString() + "/" + href)).build();
StaticLog.info(" 县级数据: {} ", countyNode);
counties.add(countyNode);
}
return counties;
}
public List<Node> parseTowntr(String url) {
String htmlStr = HttpUtil.get(url, CHARSET);
Document document = Jsoup.parse(htmlStr);
Elements trs = document.getElementsByClass("towntr");
List<Node> counties = new LinkedList<Node>();
for (Element tr : trs) {
Elements links = tr.getElementsByTag("a");
if (links == null || links.size() != 2) {
continue;
}
String href = links.get(0).attr("href");
String towntrCode = links.get(0).text().substring(0, 6);
String towntrName = links.get(1).text();
Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url)
.nodes(parseVillagetr(
COMMON_URL + href.subSequence(2, 5).toString() + "/" + href.substring(5, 7) + "/" + href))
.build();
StaticLog.info(" 乡镇级数据: {} ", towntrNode);
counties.add(towntrNode);
}
return counties;
}
public List<Node> parseVillagetr(String url) {
String htmlStr = HttpUtil.get(url, CHARSET);
Document document = Jsoup.parse(htmlStr);
Elements trs = document.getElementsByClass("villagetr");
List<Node> counties = new LinkedList<Node>();
for (Element tr : trs) {
Elements tds = tr.getElementsByTag("td");
if (tds == null || tds.size() != 3) {
continue;
}
String villagetrCode = tds.get(0).text();
String villagetrName = tds.get(2).text();
Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
StaticLog.info(" 村级数据: {} ", villagetrNode);
counties.add(villagetrNode);
}
return counties;
}
}
package com.reptile.area.decorator;
import java.util.List;
import com.reptile.area.jsoup.Node;
public class CityParserDecorator implements ICityParser {
private ICityParser cityParser;
public CityParserDecorator(ICityParser cityParser) {
this.cityParser = cityParser;
}
public List<Node> parseProvinces(String url) {
return this.cityParser.parseProvinces(url);
}
}
package com.reptile.area.decorator;
import java.util.List;
import com.reptile.area.jsoup.Node;
public interface ICityParser {
/**
 * Parses the province / city / county data.
 *
 * @param url request URL
 * @return the parsed nodes
 */
List<Node> parseProvinces(String url);
}
package com.reptile.area.decorator;
import java.util.List;
import com.reptile.area.jsoup.Node;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.json.JSONUtil;
/**
 * Decorator that, after the wrapped parser has finished, serialises the result to JSON and
 * writes it to {@code F://area.json}.
 */
public class JsonCityParserDecorator extends CityParserDecorator {

    /**
     * @param cityParser parser to wrap
     */
    public JsonCityParserDecorator(ICityParser cityParser) {
        super(cityParser);
    }

    /**
     * Delegates the parsing and dumps the returned tree as a JSON file.
     *
     * @param url request address handed to the wrapped parser
     * @return the unmodified result of the wrapped parser
     */
    @Override
    public List<Node> parseProvinces(String url) {
        final List<Node> parsed = super.parseProvinces(url);
        // Serialise first, then create the target file and write the text into it.
        final String json = JSONUtil.toJsonStr(parsed);
        new FileWriter(FileUtil.touch("F://area.json")).write(json);
        return parsed;
    }
}
package com.reptile.area.decorator;
import java.util.List;
import com.reptile.area.jsoup.Node;
import cn.hutool.log.StaticLog;
/**
 * Decorator that logs a message once the wrapped parser has returned. No coordinates are
 * looked up by this class; only the log line is emitted.
 */
public class LocationCityParserDecorator extends CityParserDecorator {

    /**
     * @param cityParser parser to wrap
     */
    public LocationCityParserDecorator(ICityParser cityParser) {
        super(cityParser);
    }

    /**
     * Delegates the parsing, then logs that the location step ran.
     *
     * @param url request address handed to the wrapped parser
     * @return the unmodified result of the wrapped parser
     */
    @Override
    public List<Node> parseProvinces(String url) {
        final List<Node> result = super.parseProvinces(url);
        StaticLog.info("查询出经纬度了. . . ");
        return result;
    }
}
package com.reptile.area.decorator;
import java.util.ArrayList;
import java.util.List;
import com.github.stuxuhai.jpinyin.PinyinException;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
import com.reptile.area.jsoup.Node;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.log.StaticLog;
/**
 * Decorator that turns the parsed tree into {@code insert} statements for the {@code area}
 * table and writes them, one per line, to {@code F://area.sql}.
 */
public class SqlCityParserDecorator extends CityParserDecorator {
    private static final String SQL = "insert into area(`name`, `code`, full_spell, easy_spell, initial, parent_code, depth, data_from_url) values ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');";
    /** Deepest level that is exported: province=1, city=2, county=3, town=4, village=5. */
    private static final int MAX_DEPTH = 5;

    /**
     * @param cityParser parser to wrap
     */
    public SqlCityParserDecorator(ICityParser cityParser) {
        super(cityParser);
    }

    /**
     * Delegates the parsing and, when at least one statement was produced, writes the SQL file.
     *
     * @param url request address handed to the wrapped parser
     * @return the unmodified result of the wrapped parser
     */
    @Override
    public List<Node> parseProvinces(String url) {
        List<Node> provinces = super.parseProvinces(url);
        List<String> statements = buildSql(provinces);
        if (CollUtil.isNotEmpty(statements)) {
            // Write the SQL statements to the file, one statement per line.
            FileWriter fileWriter = new FileWriter(FileUtil.touch("F://area.sql"));
            fileWriter.writeLines(statements);
        }
        return provinces;
    }

    /**
     * Converts the whole tree into insert statements, parents before their children.
     *
     * @param provinces top-level nodes; may be {@code null} or empty
     * @return the statements in depth-first order; empty (never {@code null}) when there is
     *         nothing to export
     */
    private List<String> buildSql(List<Node> provinces) {
        List<String> sqls = new ArrayList<>();
        appendSql(sqls, provinces, "", 1);
        return sqls;
    }

    /**
     * Appends the statements for {@code nodes} and, down to {@link #MAX_DEPTH}, their children.
     *
     * @param sqls       list the statements are appended to
     * @param nodes      nodes of the current level; may be {@code null} or empty
     * @param parentCode code of the node owning {@code nodes}, empty for provinces
     * @param depth      level written to the {@code depth} column for {@code nodes}
     */
    private void appendSql(List<String> sqls, List<Node> nodes, String parentCode, int depth) {
        if (CollUtil.isNotEmpty(nodes)) {
            for (Node node : nodes) {
                String sql = initSql(node.getName(), node.getCode(), node.getDataFromUrl(), parentCode, depth);
                // A failed pinyin conversion yields no statement; skip it instead of storing null.
                if (sql != null) {
                    sqls.add(sql);
                }
                if (depth < MAX_DEPTH) {
                    appendSql(sqls, node.getNodes(), node.getCode(), depth + 1);
                }
            }
        }
    }

    /**
     * Escapes a value for use inside a single-quoted SQL string literal.
     *
     * <p>Backslashes and single quotes are doubled, which matches MySQL's default string
     * literal rules (i.e. without {@code NO_BACKSLASH_ESCAPES}).
     *
     * @param value raw value, may be {@code null}
     * @return the escaped value, or {@code null} when {@code value} is {@code null}
     */
    private static String escape(String value) {
        if (value == null) {
            return null;
        }
        return value.replace("\\", "\\\\").replace("'", "''");
    }

    /**
     * Builds one insert statement, deriving the pinyin columns from {@code name}.
     *
     * @param name        division name
     * @param code        division code
     * @param dataFromUrl address of the page the entry was read from
     * @param parentCode  code of the parent division, empty for provinces
     * @param depth       level of the division
     * @return the statement, or {@code null} when the pinyin conversion failed
     */
    private String initSql(String name, String code, String dataFromUrl, String parentCode, Integer depth) {
        try {
            String fullSpell = PinyinHelper.convertToPinyinString(name, "", PinyinFormat.WITHOUT_TONE);
            String easySpell = PinyinHelper.getShortPinyin(name);
            // An empty abbreviation has no first letter; substring(0, 1) would throw on it.
            String initial = easySpell.isEmpty() ? "" : easySpell.substring(0, 1);
            String insertSql = StrFormatter.format(SQL, escape(name), escape(code), escape(fullSpell),
                    escape(easySpell), escape(initial), escape(parentCode), depth, escape(dataFromUrl));
            StaticLog.info(insertSql);
            return insertSql;
        } catch (PinyinException e) {
            StaticLog.error("拼音解析失败:{} .", e.getMessage());
            return null;
        }
    }
}
数据库表:
-- ----------------------------
-- Table structure for area
-- Holds one row per administrative division; a row refers to its parent through parent_code
-- and records its level in depth.
-- WARNING: the DROP below discards any existing `area` table before it is recreated.
-- NOTE(review): `name` and `easy_spell` are limited to 16 characters - confirm that no
-- crawled name (or its pinyin abbreviation) is longer before loading the generated inserts.
-- ----------------------------
DROP TABLE IF EXISTS `area`;
CREATE TABLE `area` (
`id` bigint(8) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '主键id',
`name` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '城市名称',
`code` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '城市代码',
`full_spell` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '全拼,北京全拼为beijing',
`easy_spell` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '简拼,北京简拼为bj',
`initial` char(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '首字母,北京首字母为b',
`parent_code` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '父级城市代码',
`depth` char(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '0' COMMENT '等级:省=1,市=2,县区=3,乡镇=4,村=5 ',
`data_from_url` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT NULL COMMENT '数据来源地址',
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 703613 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci COMMENT = '省市县(区)表' ROW_FORMAT = Dynamic;
-- ----------------------------
#######################################-------完结----------###############################################
测试代码
package com.reptile.area;
import java.util.List;
import com.reptile.area.decorator.CityParser;
import com.reptile.area.decorator.ICityParser;
import com.reptile.area.decorator.JsonCityParserDecorator;
import com.reptile.area.decorator.LocationCityParserDecorator;
import com.reptile.area.decorator.SqlCityParserDecorator;
import com.reptile.area.jsoup.Node;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.log.StaticLog;
/**
 * Manual entry point: runs the decorated parser chain once and logs how long it took.
 */
public class CityParserTest {
    /** Province index page handed to the parser chain. */
    private static final String INDEX_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html";

    /**
     * Runs the crawl and logs the elapsed time in milliseconds and in minutes.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final TimeInterval stopwatch = DateUtil.timer();
        cityParserDecorator();
        // Arguments are evaluated left to right: milliseconds first, then minutes.
        StaticLog.info("本次程序执行 花费毫秒数: {} , 花费分钟数:{} . ", stopwatch.interval(), stopwatch.intervalMinute());
    }

    /**
     * Builds the chain parser -> location -> SQL -> JSON (innermost first) and runs it.
     *
     * @return the provinces returned by the outermost decorator
     */
    private static List<Node> cityParserDecorator() {
        ICityParser chain = new JsonCityParserDecorator(
                new SqlCityParserDecorator(
                        new LocationCityParserDecorator(
                                new CityParser())));
        return chain.parseProvinces(INDEX_URL);
    }
}
懒人通道:https://download.csdn.net/download/qq_38765404/11017014