欢迎您访问程序员文章站,本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页

Java爬虫-2018国家统计局区划和城乡划分代码以及数据库、JSON文件

程序员文章站 2022-03-02 20:53:02
...

国家统计局:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

2018分析:

 # 查看省份数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html

 # 查看 内蒙古 市级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
 
 # 查看 内蒙古 区级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
 
 # 查看 内蒙古 街道级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
 
 # 查看 内蒙古 社区居委会级数据 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html

发现这个是有规律的,15是内蒙古的区划代码,而1509是乌兰察布市的区划代码,前面的http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/这一大串都是一样的,我们就叫commonUrl。
    
规律就是:
		 
		 # 获取省的数据 commonUrl + index.html
		 
		 # 获取市级数据 commonUrl + 对应省级区划代码.html
		 
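		 # 获取县区级数据 commonUrl + 对应省级区划代码 + / + 对应市级区划代码.html
		 
		 # 获取乡镇(街道)级数据 commonUrl + 省级区划代码 + / + 市级区划代码第3~4位 + / + 县区级区划代码.html
		 
		 # 获取村(社区居委会)级数据 commonUrl + 省级区划代码 + / + 市级区划代码第3~4位 + / + 县区级区划代码第5~6位 + / + 乡镇级区划代码.html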

详细代码如下:

实体类

package com.reptile.area.jsoup;

import lombok.*;

import java.util.List;

/**
 * One entry of the scraped administrative-division tree.
 * <p>
 * The same type is used for every level; the level is implied by the position
 * in the tree. Accessors, builder, constructors and {@code toString} are
 * generated by Lombok.
 */
@Data
@ToString
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Node {

    /** Display name of the division as shown on the source page. */
    private String name;

    /** Division code extracted from the source page. */
    private String code;
    
    /** URL of the listing page this entry was read from. */
    private String dataFromUrl;

    /** Divisions one level below this one; stays {@code null} when none were attached. */
    private List<Node> nodes;
}

具体实现:

package com.reptile.area.jsoup;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;

/**
 * Crawler for the 2018 division-code listing pages of the National Bureau of
 * Statistics (province, city, county, town and village level).
 * <p>
 * Every {@code parseXxx} method downloads one listing page, converts its table
 * rows into {@link Node} objects, attaches them to the given parent node and
 * descends into the next level. Child page URLs are obtained by resolving the
 * relative {@code href} of a row against the directory of the page it was found
 * on.
 *
 * @author zhang.xiaoming
 */
public class CityStats {

	private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

	private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

	private CityStats() {
	}

	/**
	 * Downloads a page using the charset configured in {@link #CHARSET} and parses it.
	 *
	 * @param url absolute URL of the page
	 * @return the parsed document
	 */
	private static Document fetch(String url) {
		return Jsoup.parse(HttpUtil.get(url, CHARSET));
	}

	/**
	 * Resolves a relative link against the directory of the page that contains it.
	 * <p>
	 * Example: page {@code .../2018/15/1509.html} and href {@code 09/150902.html}
	 * give {@code .../2018/15/09/150902.html}.
	 *
	 * @param pageUrl URL of the page the link was found on
	 * @param href    relative link taken from that page
	 * @return absolute URL of the linked page
	 */
	private static String resolve(String pageUrl, String href) {
		return pageUrl.substring(0, pageUrl.lastIndexOf('/') + 1) + href;
	}

	/**
	 * Parses the province index page, crawls every province recursively and
	 * logs the resulting tree as pretty-printed JSON.
	 *
	 * @param url URL of the province index page
	 */
	public static void parseProvince(String url) {
		Document document = fetch(url);

		// Rows with class 'provincetr' hold the province links.
		Elements elements = document.getElementsByClass("provincetr");
		List<Node> provinces = new LinkedList<Node>();
		for (Element element : elements) {
			// Every element carrying an href attribute is one province.
			Elements links = element.getElementsByAttribute("href");
			for (Element link : links) {
				String provinceName = link.text();
				String href = link.attr("href");
				// The link target is "<2-digit province code>.html".
				String provinceCode = href.substring(0, 2);

				StaticLog.info("provinceName: {} , provinceCode: {} .", provinceName, provinceCode);

				Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url).build();

				StaticLog.info("省级数据:  {}  ", provinceNode);

				parseCity(resolve(url, href), provinceNode);
				provinces.add(provinceNode);
			}
		}
		StaticLog.info(JSONUtil.toJsonPrettyStr(provinces));
	}

	/**
	 * Parses a city listing page and attaches the cities to the province.
	 *
	 * @param url          URL of the city listing page of one province
	 * @param provinceNode province that receives the parsed cities as children
	 */
	public static void parseCity(String url, Node provinceNode) {
		Elements trs = fetch(url).getElementsByClass("citytr");
		List<Node> cities = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// A usable row has exactly a code link and a name link; same guard as the deeper levels.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String cityCode = links.get(0).text().substring(0, 4);
			String cityName = links.get(1).text();

			Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).build();

			StaticLog.info("	市级数据:  {}  ", cityNode);

			parseCounty(resolve(url, href), cityNode);
			cities.add(cityNode);
		}
		provinceNode.setNodes(cities);
	}

	/**
	 * Parses a county listing page and attaches the counties to the city.
	 *
	 * @param url      URL of the county listing page of one city
	 * @param cityNode city that receives the parsed counties as children
	 */
	public static void parseCounty(String url, Node cityNode) {
		Elements trs = fetch(url).getElementsByClass("countytr");
		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// Rows without both links have no child page to follow and are skipped.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String countyCode = links.get(0).text().substring(0, 6);
			String countyName = links.get(1).text();

			Node countyNode = Node.builder().name(countyName).code(countyCode).dataFromUrl(url).build();

			StaticLog.info("		县级数据:  {}  ", countyNode);

			parseTowntr(resolve(url, href), countyNode);
			// Add the county itself; adding the city here would make the city its own child.
			counties.add(countyNode);
		}
		cityNode.setNodes(counties);
	}

	/**
	 * Parses a town listing page and attaches the towns to the county.
	 *
	 * @param url        URL of the town listing page of one county
	 * @param countyNode county that receives the parsed towns as children
	 */
	public static void parseTowntr(String url, Node countyNode) {
		Elements trs = fetch(url).getElementsByClass("towntr");

		List<Node> towns = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			// Town codes are 9 digits long; 6 digits would only repeat the county code.
			String towntrCode = links.get(0).text().substring(0, 9);
			String towntrName = links.get(1).text();

			Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url).build();

			StaticLog.info("		乡镇级数据:  {}  ", towntrNode);

			// Villages belong to the town that links to them, not to the enclosing county.
			parseVillagetr(resolve(url, href), towntrNode);

			towns.add(towntrNode);
		}
		countyNode.setNodes(towns);
	}

	/**
	 * Parses a village listing page and attaches the villages to the given parent.
	 *
	 * @param url        URL of the village listing page of one town
	 * @param countyNode parent node that receives the parsed villages as children
	 *                   (the town node when called from {@link #parseTowntr})
	 */
	public static void parseVillagetr(String url, Node countyNode) {
		Elements trs = fetch(url).getElementsByClass("villagetr");

		List<Node> villages = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements tds = tr.getElementsByTag("td");
			// Expected cells: code, (unused middle cell), name.
			if (tds.size() != 3) {
				continue;
			}
			String villagetrCode = tds.get(0).text();
			String villagetrName = tds.get(2).text();

			Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
			StaticLog.info("		村级数据:  {}  ", villagetrNode);

			villages.add(villagetrNode);

		}
		countyNode.setNodes(villages);
	}

	public static void main(String[] args) {
		/*
		 * Example pages (Inner Mongolia, code 15):
		 *
		 * provinces: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
		 * cities:    http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
		 * counties:  http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
		 * towns:     http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
		 * villages:  http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html
		 *
		 * The URLs follow a pattern: 15 is the code of Inner Mongolia, 1509 the code
		 * of Ulanqab, and the leading part up to ".../2018/" is shared by all pages
		 * (COMMON_URL):
		 *
		 * provinces: COMMON_URL + index.html
		 * cities:    COMMON_URL + <province code>.html
		 * counties:  COMMON_URL + <province code> + / + <city code>.html
		 */

		String provinceUrl = COMMON_URL + "index.html";
		CityStats.parseProvince(provinceUrl);

		String cityUrl = COMMON_URL + "15.html";
		CityStats.parseCity(cityUrl, new Node());

		String countyUrl = COMMON_URL + "15/1509.html";
		CityStats.parseCounty(countyUrl, new Node());

		String towntrUrl = COMMON_URL + "15/09/150981.html";
		CityStats.parseTowntr(towntrUrl, new Node());
	}
}

####################################################################################################

生成 MySQL 数据、JSON 文件等的代码实现

package com.reptile.area.decorator;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.reptile.area.jsoup.Node;

import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;

/**
 * Parser for the province / city / county / town / village listing pages.
 * <p>
 * Each level downloads one listing page, builds a {@link Node} per table row
 * and fills the node's children by parsing the page the row links to. Child
 * page URLs are obtained by resolving the row's relative {@code href} against
 * the directory of the page it was found on.
 */
public class CityParser implements ICityParser {

	private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

	private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

	/**
	 * Parses the whole division tree starting at the province index page.
	 *
	 * @param url URL of the province index page; when {@code null} or blank the
	 *            2018 index page is used
	 * @return the provinces with all lower levels attached
	 */
	public List<Node> parseProvinces(String url) {
		boolean missing = url == null || url.trim().isEmpty();
		return parseProvince(missing ? COMMON_URL + "index.html" : url);
	}

	/**
	 * Downloads a page using the charset configured in {@link #CHARSET} and parses it.
	 */
	private Document fetch(String url) {
		return Jsoup.parse(HttpUtil.get(url, CHARSET));
	}

	/**
	 * Resolves a relative link against the directory of the page that contains it,
	 * e.g. page {@code .../2018/15/1509.html} and href {@code 09/150902.html}
	 * give {@code .../2018/15/09/150902.html}.
	 */
	private String resolve(String pageUrl, String href) {
		return pageUrl.substring(0, pageUrl.lastIndexOf('/') + 1) + href;
	}

	/**
	 * Parses the province index page.
	 *
	 * @param url URL of the province index page
	 * @return one node per province link, children included
	 */
	private List<Node> parseProvince(String url) {
		Document document = fetch(url);

		// Rows with class 'provincetr' hold the province links.
		Elements elements = document.getElementsByClass("provincetr");
		List<Node> provinces = new LinkedList<Node>();
		for (Element element : elements) {
			// Every element carrying an href attribute is one province.
			Elements links = element.getElementsByAttribute("href");
			for (Element link : links) {
				String provinceName = link.text();
				String href = link.attr("href");
				// The link target is "<2-digit province code>.html".
				String provinceCode = href.substring(0, 2);

				Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url)
						.nodes(parseCity(resolve(url, href))).build();

				StaticLog.info("省级数据:  {}  ", provinceNode);

				provinces.add(provinceNode);
			}
		}
		return provinces;
	}

	/**
	 * Parses the city listing page of one province.
	 *
	 * @param url URL of the city listing page
	 * @return one node per city row, children included
	 */
	private List<Node> parseCity(String url) {
		Elements trs = fetch(url).getElementsByClass("citytr");

		List<Node> cities = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// A usable row has exactly a code link and a name link; same guard as the deeper levels.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String cityCode = links.get(0).text().substring(0, 4);
			String cityName = links.get(1).text();

			Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url)
					.nodes(parseCounty(resolve(url, href))).build();

			StaticLog.info("	市级数据:  {}  ", cityNode);

			cities.add(cityNode);
		}
		return cities;
	}

	/**
	 * Parses the county listing page of one city.
	 *
	 * @param url URL of the county listing page
	 * @return one node per linked county row, children included
	 */
	private List<Node> parseCounty(String url) {
		Elements trs = fetch(url).getElementsByClass("countytr");

		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// Rows without both links have no child page to follow and are skipped.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String countyCode = links.get(0).text().substring(0, 6);
			String countyName = links.get(1).text();

			Node countyNode = Node.builder().code(countyCode).name(countyName).dataFromUrl(url)
					.nodes(parseTowntr(resolve(url, href))).build();
			StaticLog.info("		县级数据:  {}  ", countyNode);

			counties.add(countyNode);
		}
		return counties;
	}

	/**
	 * Parses the town listing page of one county.
	 *
	 * @param url URL of the town listing page
	 * @return one node per linked town row, villages included
	 */
	public List<Node> parseTowntr(String url) {
		Elements trs = fetch(url).getElementsByClass("towntr");

		List<Node> towns = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			// Town codes are 9 digits long; 6 digits would only repeat the county code.
			String towntrCode = links.get(0).text().substring(0, 9);
			String towntrName = links.get(1).text();

			Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url)
					.nodes(parseVillagetr(resolve(url, href)))
					.build();

			StaticLog.info("			乡镇级数据:  {}  ", towntrNode);

			towns.add(towntrNode);
		}
		return towns;
	}

	/**
	 * Parses the village listing page of one town.
	 *
	 * @param url URL of the village listing page
	 * @return one leaf node per three-cell village row
	 */
	public List<Node> parseVillagetr(String url) {
		Elements trs = fetch(url).getElementsByClass("villagetr");

		List<Node> villages = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements tds = tr.getElementsByTag("td");
			// Expected cells: code, (unused middle cell), name.
			if (tds.size() != 3) {
				continue;
			}
			String villagetrCode = tds.get(0).text();
			String villagetrName = tds.get(2).text();

			Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
			StaticLog.info("				村级数据:  {}  ", villagetrNode);

			villages.add(villagetrNode);
		}
		return villages;
	}

}
package com.reptile.area.decorator;

import java.util.List;

import com.reptile.area.jsoup.Node;

/**
 * Base decorator: forwards every call to the wrapped parser unchanged.
 * Subclasses override {@link #parseProvinces(String)} to add work around the
 * delegated call.
 */
public class CityParserDecorator implements ICityParser {

	/** The parser being decorated. */
	private ICityParser delegate;

	/**
	 * @param cityParser parser to wrap
	 */
	public CityParserDecorator(ICityParser cityParser) {
		delegate = cityParser;
	}

	/**
	 * Delegates to the wrapped parser.
	 *
	 * @param url request URL handed through as-is
	 * @return whatever the wrapped parser returns
	 */
	public List<Node> parseProvinces(String url) {
		List<Node> parsed = delegate.parseProvinces(url);
		return parsed;
	}
}
package com.reptile.area.decorator;


import java.util.List;

import com.reptile.area.jsoup.Node;

/**
 * Contract shared by the concrete parser and its decorators.
 */
public interface ICityParser {

    /**
     * Parses the province / city / district data.
     *
     * @param url request URL
     * @return the parsed top-level nodes
     */
    List<Node> parseProvinces(String url);
}
package com.reptile.area.decorator;

import java.util.List;

import com.reptile.area.jsoup.Node;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.json.JSONUtil;

/**
 * Decorator that dumps the parsed tree as JSON to a file after delegating.
 */
public class JsonCityParserDecorator extends CityParserDecorator{

    public JsonCityParserDecorator(ICityParser cityParser) {
        super(cityParser);
    }

    /**
     * Runs the wrapped parser, serialises its result to JSON and writes it to
     * {@code F://area.json}.
     *
     * @param url request URL handed to the wrapped parser
     * @return the result of the wrapped parser, unmodified
     */
    @Override
    public List<Node> parseProvinces(String url) {
        final List<Node> parsed = super.parseProvinces(url);
        // Serialise before touching the file, as the target is only created once JSON is ready.
        final String json = JSONUtil.toJsonStr(parsed);
        new FileWriter(FileUtil.touch("F://area.json")).write(json);
        return parsed;
    }
}
package com.reptile.area.decorator;

import java.util.List;

import com.reptile.area.jsoup.Node;

import cn.hutool.log.StaticLog;

/**
 * Decorator reserved for the coordinate lookup step. It currently only logs a
 * message after delegating; the parsed data is passed through untouched.
 */
public class LocationCityParserDecorator extends CityParserDecorator {

	public LocationCityParserDecorator(ICityParser cityParser) {
		super(cityParser);
	}

	/**
	 * Runs the wrapped parser and logs a marker message.
	 *
	 * @param url request URL handed to the wrapped parser
	 * @return the result of the wrapped parser, unmodified
	 */
	@Override
	public List<Node> parseProvinces(String url) {
		final List<Node> result = super.parseProvinces(url);
		StaticLog.info("查询出经纬度了. . . ");
		return result;
	}
}
package com.reptile.area.decorator;

import java.util.ArrayList;
import java.util.List;

import com.github.stuxuhai.jpinyin.PinyinException;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.log.StaticLog;

/**
 * sql打印装饰器
 */
public class SqlCityParserDecorator extends CityParserDecorator {

	private static final String SQL = "insert into area(`name`, `code`, full_spell, easy_spell, initial, parent_code, depth, data_from_url) values ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');";

	public SqlCityParserDecorator(ICityParser cityParser) {
		super(cityParser);
	}

	@Override
	public List<Node> parseProvinces(String url) {
		List<Node> provinces = super.parseProvinces(url);
		
		List<String> buildSql = buildSql(provinces);
		if (CollUtil.isNotEmpty(buildSql)) {
			// json数据写入到文件
			FileWriter fileWriter = new FileWriter(FileUtil.touch("F://area.sql"));
			fileWriter.writeLines(buildSql);
		}
		return provinces;
	}

	/**
	 * *实体转sql数据
	 * 
	 * @param provinces 省市县数据
	 */
	private List<String> buildSql(List<Node> provinces) {
		List<String> sqls = null;
		if (CollUtil.isNotEmpty(provinces)) {
			sqls = new ArrayList<>();
			for (Node province : provinces) {
				sqls.add(initSql(province.getName(), province.getCode(), province.getDataFromUrl(), "", 1));
				buildCitySql(sqls, province.getNodes(), province.getCode());
			}
		}
		return sqls;
	}

	private void buildCitySql(List<String> sqls, List<Node> cities, String parentCode) {
		if (CollUtil.isNotEmpty(cities)) {
			for (Node city : cities) {
				sqls.add(initSql(city.getName(), city.getCode(), city.getDataFromUrl(), parentCode, 2));
				buildCountySql(sqls, city.getNodes(), city.getCode());
			}
		}
	}

	private void buildCountySql(List<String> sqls, List<Node> counties, String parentCode) {
		if (CollUtil.isNotEmpty(counties)) {
			for (Node county : counties) {
				sqls.add(initSql(county.getName(), county.getCode(), county.getDataFromUrl(), parentCode, 3));
				buildTowntrSql(sqls, county.getNodes(), county.getCode());
			}
		}
	}

	private void buildTowntrSql(List<String> sqls, List<Node> towies, String parentCode) {
		if (CollUtil.isNotEmpty(towies)) {
			for (Node towntr : towies) {
				sqls.add(initSql(towntr.getName(), towntr.getCode(), towntr.getDataFromUrl(), parentCode, 4));
				buildVillagetrSql(sqls, towntr.getNodes(), towntr.getCode());
			}
		}
	}

	private void buildVillagetrSql(List<String> sqls, List<Node> vilies, String parentCode) {
		if (CollUtil.isNotEmpty(vilies)) {
			for (Node villagetr : vilies) {
				sqls.add(initSql(villagetr.getName(), villagetr.getCode(), villagetr.getDataFromUrl(), parentCode, 5));
			}
		}
	}

	/**
	 ** 初始化sql语句
	 */
	private String initSql(String name, String code, String dataFromUrl, String parentCode, Integer depth) {
		String insertSql = null;
		try {
			insertSql = StrFormatter.format(SQL, name, code,
					PinyinHelper.convertToPinyinString(name, "", PinyinFormat.WITHOUT_TONE),
					PinyinHelper.getShortPinyin(name), PinyinHelper.getShortPinyin(name).substring(0, 1).toString(),
					parentCode, depth, dataFromUrl);
			StaticLog.info(insertSql);
		} catch (PinyinException e) {
			StaticLog.error("拼音解析失败:{} .", e.getMessage());
		}
		return insertSql;
	}
}

数据库表:

-- ----------------------------
-- Table structure for area
-- ----------------------------
-- Recreate the table from scratch: an existing `area` table is dropped together with its rows.
DROP TABLE IF EXISTS `area`;
-- One row per division of any level. The non-id columns correspond one-to-one to the
-- column list of the insert statement generated by SqlCityParserDecorator.
-- A row refers to its parent through `parent_code`; `depth` stores the level (1-5).
CREATE TABLE `area`  (
  `id` bigint(8) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `name` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '城市名称',
  `code` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '城市代码',
  `full_spell` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '全拼,北京全拼为beijing',
  `easy_spell` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '简拼,北京简拼为bj',
  `initial` char(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '首字母,北京首字母为b',
  `parent_code` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '' COMMENT '父级城市代码',
  `depth` char(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT '0' COMMENT '等级:省=1,市=2,县区=3,乡镇=4,村=5  ',
  `data_from_url` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NULL DEFAULT NULL COMMENT '数据来源地址',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 703613 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci COMMENT = '省市县(区)表' ROW_FORMAT = Dynamic;

-- ----------------------------

 

#######################################-------完结----------###############################################

测试代码

package com.reptile.area;

import java.util.List;

import com.reptile.area.decorator.CityParser;
import com.reptile.area.decorator.ICityParser;
import com.reptile.area.decorator.JsonCityParserDecorator;
import com.reptile.area.decorator.LocationCityParserDecorator;
import com.reptile.area.decorator.SqlCityParserDecorator;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.log.StaticLog;

/**
 * Manual entry point: runs the full decorator chain once and logs how long it took.
 */
public class CityParserTest {
	private static final String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html";

	public static void main(String[] args) {
		TimeInterval stopwatch = DateUtil.timer();

		// The actual work.
		cityParserDecorator();

		// Elapsed time, first in milliseconds, then in minutes.
		long elapsedMillis = stopwatch.interval();
		long elapsedMinutes = stopwatch.intervalMinute();
		StaticLog.info("本次程序执行 花费毫秒数: {} ,   花费分钟数:{} . ", elapsedMillis, elapsedMinutes);
	}

	/**
	 * Builds the chain parser -> location -> sql -> json (innermost first) and runs it.
	 *
	 * @return the parsed provinces as returned by the outermost decorator
	 */
	private static List<Node> cityParserDecorator() {
		ICityParser chain = new JsonCityParserDecorator(
				new SqlCityParserDecorator(
						new LocationCityParserDecorator(
								new CityParser())));
		return chain.parseProvinces(url);
	}
}

懒人通道:https://download.csdn.net/download/qq_38765404/11017014