Java多线程爬虫-Java爬虫-2018国家统计局区划和城乡划分代码以及数据库、json文件
程序员文章站
2022-03-02 21:02:49
...
package com.reptile.area.jsoup;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;
/**
 * Parser for the 2018 administrative-division pages published under {@link #COMMON_URL}.
 *
 * <p>Each {@code parse*} method downloads one page, reads the table rows of one level
 * (city, county, town, village) and recursively parses the page each row links to, so a
 * single call to {@link #parseCity(String, Node)} performs one HTTP request per
 * descendant page.
 *
 * @author zhang.xiaoming
 */
public class CityParserThread {

    public static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
    public static final String INDEX_URL = COMMON_URL + "index.html";
    public static final String LEFT_SLANT = "/";
    public static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

    /** Number of leading digits of the displayed code kept for a city. */
    private static final int CITY_CODE_LENGTH = 4;
    /** Number of leading digits of the displayed code kept for a county. */
    private static final int COUNTY_CODE_LENGTH = 6;
    /**
     * Number of leading digits of the displayed code kept for a town. The original code
     * kept only 6 digits, which made every town carry the same code as its parent county.
     */
    private static final int TOWN_CODE_LENGTH = 9;

    /**
     * Parses the city rows ({@code citytr}) of a province page and attaches them, with
     * their fully parsed subtrees, to the given province node.
     *
     * @param url  address of the province page
     * @param node province node that receives the parsed cities via {@code setNodes}
     * @return the same {@code node} instance
     */
    public static Node parseCity(String url, Node node) {
        List<Node> cities = new LinkedList<Node>();
        for (Element tr : fetchRows(url, "citytr")) {
            Elements links = tr.getElementsByTag("a");
            // Same guard as the county/town parsers: rows without a code link and a name
            // link used to fail on links.get(...).
            if (!isUsableRow(links, CITY_CODE_LENGTH, 0, url)) {
                continue;
            }
            String href = links.get(0).attr("href");
            String cityCode = links.get(0).text().substring(0, CITY_CODE_LENGTH);
            String cityName = links.get(1).text();
            // City hrefs are appended directly to the common base URL.
            String childUrl = COMMON_URL + href;
            Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).childNodeUrl(childUrl)
                    .nodes(parseCounty(childUrl)).build();
            StaticLog.info(" 市级数据: {} - {} ", Thread.currentThread().getName(), cityNode);
            cities.add(cityNode);
        }
        node.setNodes(cities);
        return node;
    }

    /**
     * Parses the county rows ({@code countytr}) of a city page, including their towns and
     * villages. Rows that do not contain exactly two links are skipped.
     *
     * @param url address of the city page
     * @return the parsed county nodes, possibly empty
     */
    public static List<Node> parseCounty(String url) {
        List<Node> counties = new LinkedList<Node>();
        for (Element tr : fetchRows(url, "countytr")) {
            Elements links = tr.getElementsByTag("a");
            if (!isUsableRow(links, COUNTY_CODE_LENGTH, 5, url)) {
                continue;
            }
            String href = links.get(0).attr("href");
            String countyCode = links.get(0).text().substring(0, COUNTY_CODE_LENGTH);
            String countyName = links.get(1).text();
            // The href is relative to the city page; characters 3-4 of the href are used
            // as the directory segment that has to be re-inserted after the base URL.
            String childUrl = COMMON_URL + href.substring(3, 5) + LEFT_SLANT + href;
            Node countyNode = Node.builder().code(countyCode).name(countyName).dataFromUrl(url).childNodeUrl(childUrl)
                    .nodes(parseTowntr(childUrl)).build();
            StaticLog.info(" 县级数据: {} - {} ", Thread.currentThread().getName(), countyNode);
            counties.add(countyNode);
        }
        return counties;
    }

    /**
     * Parses the town rows ({@code towntr}) of a county page, including their villages.
     * Rows that do not contain exactly two links are skipped.
     *
     * @param url address of the county page
     * @return the parsed town nodes, possibly empty
     */
    public static List<Node> parseTowntr(String url) {
        List<Node> towns = new LinkedList<Node>();
        for (Element tr : fetchRows(url, "towntr")) {
            Elements links = tr.getElementsByTag("a");
            if (!isUsableRow(links, TOWN_CODE_LENGTH, 7, url)) {
                continue;
            }
            String href = links.get(0).attr("href");
            String towntrCode = links.get(0).text().substring(0, TOWN_CODE_LENGTH);
            String towntrName = links.get(1).text();
            // Characters 3-4 and 5-6 of the href are used as the two directory segments
            // that precede the relative href.
            String childUrl = COMMON_URL + href.substring(3, 5) + LEFT_SLANT + href.substring(5, 7) + LEFT_SLANT
                    + href;
            Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url)
                    .childNodeUrl(childUrl).nodes(parseVillagetr(childUrl)).build();
            towns.add(towntrNode);
        }
        return towns;
    }

    /**
     * Parses the village rows ({@code villagetr}) of a town page. Villages are leaves:
     * they get neither a child URL nor child nodes. Rows that do not have exactly three
     * cells are skipped.
     *
     * @param url address of the town page
     * @return the parsed village nodes, possibly empty
     */
    public static List<Node> parseVillagetr(String url) {
        List<Node> villages = new LinkedList<Node>();
        for (Element tr : fetchRows(url, "villagetr")) {
            Elements tds = tr.getElementsByTag("td");
            if (tds == null || tds.size() != 3) {
                continue;
            }
            // First cell is used as the code and third cell as the name; the middle cell
            // is ignored.
            String villagetrCode = tds.get(0).text();
            String villagetrName = tds.get(2).text();
            villages.add(Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build());
        }
        return villages;
    }

    /** Downloads {@code url} using {@link #CHARSET} and returns the elements carrying {@code rowClass}. */
    private static Elements fetchRows(String url, String rowClass) {
        String htmlStr = HttpUtil.get(url, CHARSET);
        Document document = Jsoup.parse(htmlStr);
        return document.getElementsByClass(rowClass);
    }

    /**
     * Checks that a row has exactly two links and that the code text and href of the first
     * link are long enough for the substring operations of the caller, so that one
     * malformed row is skipped instead of aborting the whole crawl with an exception.
     */
    private static boolean isUsableRow(Elements links, int codeLength, int minHrefLength, String pageUrl) {
        if (links == null || links.size() != 2) {
            return false;
        }
        Element codeLink = links.get(0);
        String codeText = codeLink.text();
        String href = codeLink.attr("href");
        if (codeText.length() < codeLength || href.length() < minHrefLength) {
            StaticLog.warn("Skipping malformed row on {}: code='{}', href='{}'", pageUrl, codeText, href);
            return false;
        }
        return true;
    }
}
实体类:
package com.reptile.area.jsoup;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;
/**
 * One administrative area in the crawled tree (province, city, county, town or village).
 * Accessors, builder, constructors and {@code toString} are generated by Lombok; the field
 * declaration order below is the parameter order of the generated all-args constructor.
 */
@Data
@ToString
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Node {
// Display name of the area as shown on the source page.
private String name;
// Division code (or code prefix) of the area.
private String code;
// Address of the page that lists this area's children; not set for village nodes.
private String childNodeUrl;
// Address of the page this node was parsed from.
private String dataFromUrl;
// Child areas one level below this one; not set for village nodes.
private List<Node> nodes;
}
测试:
package com.reptile.area;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.reptile.area.jsoup.CityParserThread;
import com.reptile.area.jsoup.Node;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;
public class CityParserThreadTest {
public static List<Node> province() {
String htmlStr = HttpUtil.get(CityParserThread.INDEX_URL, CityParserThread.CHARSET);
Document document = Jsoup.parse(htmlStr);
// 获取 class='provincetr' 的元素
Elements elements = document.getElementsByClass("provincetr");
List<Node> provinces = new LinkedList<Node>();
for (Element element : elements) {
// 获取 elements 下属性是 href 的元素
Elements links = element.getElementsByAttribute("href");
for (Element link : links) {
String provinceName = link.text();
String href = link.attr("href");
String provinceCode = href.substring(0, 2);
Node provinceNode = Node.builder().code(provinceCode).name(provinceName)
.dataFromUrl(CityParserThread.INDEX_URL).childNodeUrl(CityParserThread.COMMON_URL + href)
.build();
provinces.add(provinceNode);
}
}
return provinces;
}
public static void main(String[] args) {
TimeInterval timer = DateUtil.timer();
// -------这是执行过程--------------
List<Node> provinces = province();
if (CollUtil.isNotEmpty(provinces)) {
List<Node> nodes = new LinkedList<Node>();
List<Future<Node>> futureList = new ArrayList<Future<Node>>();
ExecutorService pool = Executors.newFixedThreadPool(provinces.size());//根据实际情况定义大小
for (Node province : provinces) {
futureList.add(pool.submit(new TaskCallable(province.getChildNodeUrl(), province.getName(), province)));
}
pool.shutdown(); // 不允许再想线程池中增加线程
// 判断是否所有线程已经执行完毕
try {
boolean isFinish = pool.awaitTermination(1, TimeUnit.HOURS);
StaticLog.info("==========================");
// 如果没有执行完
if (!isFinish) {
// 线程池执行结束 不在等待线程执行完毕,直接执行下面的代码
pool.shutdownNow();
}
// 2.结果归集,用迭代器遍历futureList,高速轮询(模拟实现了并发),任务完成就移除
while (futureList.size() > 0) {
Iterator<Future<Node>> iterable = futureList.iterator();
// 遍历一遍
while (iterable.hasNext()) {
Future<Node> future = iterable.next();
// 如果任务完成取结果,否则判断下一个任务是否完成
if (future.isDone() && !future.isCancelled()) {
// 获取结果
nodes.add(future.get());
} else {
Thread.sleep(1);// 避免CPU高速运转,这里休息1毫秒,CPU纳秒级别
}
}
}
} catch (InterruptedException e) {
e.printStackTrace();
} catch (ExecutionException e) {
e.printStackTrace();
}
// 只给线程池中的线程1小时,然后就继续执行
StaticLog.info("it is ok !!!");
SqlJsonWriter.jsonWriter(nodes, "F://20190314area.json");
SqlJsonWriter.sqlWriter(nodes, "F://20190314area.sql");
// ---------------------------------
long interval = timer.interval();// 花费毫秒数
long intervalMinute = timer.intervalMinute();// 花费分钟数
StaticLog.info("本次程序执行 花费毫秒数: {} , 花费分钟数:{} . ", interval, intervalMinute);
}
}
}
/**
 * Unit of work submitted to the thread pool: parses all cities of one province page and
 * returns the province node the parser hands back.
 */
class TaskCallable implements Callable<Node> {

    /** Province node that is passed to the parser. */
    private final Node node;
    /** Province name, used for logging only. */
    private final String areaName;
    /** Address of the province page to parse. */
    private final String url;

    public TaskCallable(String url, String areaName, Node node) {
        this.node = node;
        this.areaName = areaName;
        this.url = url;
    }

    /**
     * Logs which worker thread handles which province, then delegates to
     * {@code CityParserThread.parseCity}.
     *
     * @return the node returned by the parser
     */
    @Override
    public Node call() throws Exception {
        String workerName = Thread.currentThread().getName();
        StaticLog.info("当前线程: {} , 地区名称: {} , 请求地址:{} 。 ", workerName, areaName, url);
        Node parsed = CityParserThread.parseCity(url, node);
        return parsed;
    }
}
SQL 与 JSON 文件生成:
package com.reptile.area;
import java.util.ArrayList;
import java.util.List;
import com.github.stuxuhai.jpinyin.PinyinException;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
import com.reptile.area.jsoup.Node;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileReader;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;
public class SqlJsonWriter {
public static void sqlWriter(List<Node> nodes, String path) {
if (CollUtil.isNotEmpty(nodes)) {
FileWriter sqlWriter = new FileWriter(FileUtil.touch(path));
sqlWriter.writeLines(nodes);
StaticLog.info("SQL文件保存成功...");
}
}
public static void jsonWriter(List<Node> nodes, String path) {
if (CollUtil.isNotEmpty(nodes)) {
// json数据写入到文件
FileWriter jsonWriter = new FileWriter(FileUtil.touch(path));
jsonWriter.write(JSONUtil.toJsonStr(nodes));
StaticLog.info("JSON文件保存成功...");
}
}
/**
* *实体转sql数据
*
* @param provinces 省市县数据
*/
private static List<String> buildSql(List<Node> provinces) {
List<String> sqls = null;
if (CollUtil.isNotEmpty(provinces)) {
sqls = new ArrayList<>();
for (Node province : provinces) {
sqls.add(initSql(province.getName(), province.getCode(), province.getDataFromUrl(), province.getChildNodeUrl(), "", 1));
buildCitySql(sqls, province.getNodes(), province.getCode());
}
}
return sqls;
}
private static void buildCitySql(List<String> sqls, List<Node> cities, String parentCode) {
if (CollUtil.isNotEmpty(cities)) {
for (Node city : cities) {
sqls.add(initSql(city.getName(), city.getCode(), city.getDataFromUrl(), city.getChildNodeUrl(), parentCode, 2));
buildCountySql(sqls, city.getNodes(), city.getCode());
}
}
}
private static void buildCountySql(List<String> sqls, List<Node> counties, String parentCode) {
if (CollUtil.isNotEmpty(counties)) {
for (Node county : counties) {
sqls.add(initSql(county.getName(), county.getCode(), county.getDataFromUrl(), county.getChildNodeUrl(), parentCode, 3));
buildTowntrSql(sqls, county.getNodes(), county.getCode());
}
}
}
private static void buildTowntrSql(List<String> sqls, List<Node> towies, String parentCode) {
if (CollUtil.isNotEmpty(towies)) {
for (Node towntr : towies) {
sqls.add(initSql(towntr.getName(), towntr.getCode(), towntr.getDataFromUrl(), towntr.getChildNodeUrl(), parentCode, 4));
buildVillagetrSql(sqls, towntr.getNodes(), towntr.getCode());
}
}
}
private static void buildVillagetrSql(List<String> sqls, List<Node> vilies, String parentCode) {
if (CollUtil.isNotEmpty(vilies)) {
for (Node villagetr : vilies) {
sqls.add(initSql(villagetr.getName(), villagetr.getCode(), villagetr.getDataFromUrl(), villagetr.getChildNodeUrl(), parentCode, 5));
}
}
}
/**
** 初始化sql语句
*/
private static String initSql(String name, String code, String dataFromUrl, String childNodeUrl, String parentCode, Integer depth) {
final String SQL = "insert into area(`name`, `code`, full_spell, easy_spell, initial, parent_code, depth, data_from_url, child_node_url) values ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');";
String insertSql = null;
try {
insertSql = StrFormatter.format(SQL, name, code,
PinyinHelper.convertToPinyinString(name, "", PinyinFormat.WITHOUT_TONE),
PinyinHelper.getShortPinyin(name), PinyinHelper.getShortPinyin(name).substring(0, 1).toString(),
parentCode, depth, dataFromUrl, childNodeUrl);
StaticLog.info(insertSql);
} catch (PinyinException e) {
StaticLog.error("拼音解析失败:{} .", e.getMessage());
}
return insertSql;
}
}