欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页

Java多线程爬虫-Java爬虫-2018国家统计局区划和城乡划分代码以及数据库、json文件

程序员文章站 2022-03-02 21:02:49
...

 

package com.reptile.area.jsoup;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;

/**
 * Parses province / city / county / town / village administrative-division
 * pages published under {@link #COMMON_URL} into a tree of {@link Node}s.
 *
 * <p>Every method performs blocking HTTP requests and recurses into the next
 * level, so a single call to {@link #parseCity(String, Node)} downloads the
 * whole subtree below one province.
 *
 * @author zhang.xiaoming
 */
public class CityParserThread {

	/** Base URL that every relative link found on the pages is resolved against. */
	public static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

	/** Entry page that lists the provinces. */
	public static final String INDEX_URL = COMMON_URL + "index.html";

	/** Path separator used when assembling child page URLs. */
	public static final String LEFT_SLANT = "/";

	/** Charset used to decode the downloaded pages. */
	public static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

	/**
	 * Downloads a province page and attaches all of its cities (with their
	 * complete subtrees) to the given province node.
	 *
	 * @param url  URL of the province page
	 * @param node province node that receives the parsed cities
	 * @return the same {@code node} instance, with its child list replaced
	 */
	public static Node parseCity(String url, Node node) {
		List<Node> cities = new LinkedList<Node>();
		for (Element tr : fetchRows(url, "citytr")) {
			Elements links = tr.getElementsByTag("a");
			// Same guard as the other levels: a row must carry a code link and a name link.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String cityCode = links.get(0).text().substring(0, 4);
			String cityName = links.get(1).text();
			String childUrl = COMMON_URL + href;

			Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).childNodeUrl(childUrl)
					.nodes(parseCounty(childUrl)).build();

			StaticLog.info("	市级数据:  {} - {}   ", Thread.currentThread().getName(), cityNode);

			cities.add(cityNode);
		}
		node.setNodes(cities);
		return node;
	}

	/**
	 * Downloads a city page and returns its counties, each with its towns and
	 * villages already resolved.
	 *
	 * @param url URL of the city page
	 * @return the counties found on the page; rows without exactly two links are skipped
	 */
	public static List<Node> parseCounty(String url) {
		List<Node> counties = new LinkedList<Node>();
		for (Element tr : fetchRows(url, "countytr")) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String countyCode = links.get(0).text().substring(0, 6);
			String countyName = links.get(1).text();
			// The href is relative to the city page; characters 3-4 of it are reused
			// as the directory that must be prefixed to reach the page from COMMON_URL.
			String childUrl = COMMON_URL + href.substring(3, 5) + LEFT_SLANT + href;

			Node countyNode = Node.builder().code(countyCode).name(countyName).dataFromUrl(url).childNodeUrl(childUrl)
					.nodes(parseTowntr(childUrl)).build();
			StaticLog.info("		县级数据:  {} - {}   ", Thread.currentThread().getName(), countyNode);

			counties.add(countyNode);
		}
		return counties;
	}

	/**
	 * Downloads a county page and returns its towns, each with its villages
	 * already resolved.
	 *
	 * @param url URL of the county page
	 * @return the towns found on the page; rows without exactly two links are skipped
	 */
	public static List<Node> parseTowntr(String url) {
		List<Node> towns = new LinkedList<Node>();
		for (Element tr : fetchRows(url, "towntr")) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			// Town level uses a 9-digit prefix. Taking only 6 digits would repeat the
			// parent county code for every town and break the parent/child relation.
			// NOTE(review): assumes the link text is the 12-digit code - confirm against the site.
			String towntrCode = links.get(0).text().substring(0, 9);
			String towntrName = links.get(1).text();
			// Two directory levels (characters 3-4 and 5-6 of the href) precede the relative href.
			String childUrl = COMMON_URL + href.substring(3, 5) + LEFT_SLANT + href.substring(5, 7) + LEFT_SLANT
					+ href;

			Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url).childNodeUrl(childUrl)
					.nodes(parseVillagetr(childUrl)).build();

			towns.add(towntrNode);
		}
		return towns;
	}

	/**
	 * Downloads a town page and returns its villages (leaf nodes).
	 *
	 * @param url URL of the town page
	 * @return the villages found on the page; rows without exactly three cells are skipped
	 */
	public static List<Node> parseVillagetr(String url) {
		List<Node> villages = new LinkedList<Node>();
		for (Element tr : fetchRows(url, "villagetr")) {
			Elements tds = tr.getElementsByTag("td");
			if (tds.size() != 3) {
				continue;
			}
			// Cell 0 is used as the code and cell 2 as the name; cell 1 is not used.
			String villagetrCode = tds.get(0).text();
			String villagetrName = tds.get(2).text();

			villages.add(Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build());
		}
		return villages;
	}

	/** Downloads {@code url} and returns every element carrying the CSS class {@code rowClass}. */
	private static Elements fetchRows(String url, String rowClass) {
		String htmlStr = HttpUtil.get(url, CHARSET);
		Document document = Jsoup.parse(htmlStr);
		return document.getElementsByClass(rowClass);
	}
}

实体类:

package com.reptile.area.jsoup;

import java.util.List;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * One administrative division in the scraped hierarchy, together with the
 * divisions directly below it. Accessors, builder, constructors and
 * {@code toString} are generated by Lombok.
 */
@Data
@ToString
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Node {

    /** Display name of the division. */
    private String name;

    /** Division code as extracted from the page. */
    private String code;
    
    /** URL of the page from which this node's children are parsed; not set for village nodes. */
    private String childNodeUrl;
    
    /** URL of the page this node itself was parsed from. */
    private String dataFromUrl;

    /** Direct children of this node; left unset by the parser for village nodes. */
    private List<Node> nodes;
}

测试:

package com.reptile.area;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.reptile.area.jsoup.CityParserThread;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;

public class CityParserThreadTest {

	public static List<Node> province() {
		String htmlStr = HttpUtil.get(CityParserThread.INDEX_URL, CityParserThread.CHARSET);
		Document document = Jsoup.parse(htmlStr);

		// 获取 class='provincetr' 的元素
		Elements elements = document.getElementsByClass("provincetr");
		List<Node> provinces = new LinkedList<Node>();
		for (Element element : elements) {
			// 获取 elements 下属性是 href 的元素
			Elements links = element.getElementsByAttribute("href");
			for (Element link : links) {
				String provinceName = link.text();
				String href = link.attr("href");
				String provinceCode = href.substring(0, 2);

				Node provinceNode = Node.builder().code(provinceCode).name(provinceName)
						.dataFromUrl(CityParserThread.INDEX_URL).childNodeUrl(CityParserThread.COMMON_URL + href)
						.build();
				provinces.add(provinceNode);
			}
		}
		return provinces;
	}

	public static void main(String[] args) {

		TimeInterval timer = DateUtil.timer();
		// -------这是执行过程--------------

		List<Node> provinces = province();
		if (CollUtil.isNotEmpty(provinces)) {
			List<Node> nodes = new LinkedList<Node>();
			List<Future<Node>> futureList = new ArrayList<Future<Node>>();
			ExecutorService pool = Executors.newFixedThreadPool(provinces.size());//根据实际情况定义大小
			for (Node province : provinces) {
				futureList.add(pool.submit(new TaskCallable(province.getChildNodeUrl(), province.getName(), province)));
			}
			pool.shutdown(); // 不允许再想线程池中增加线程
			// 判断是否所有线程已经执行完毕
			try {
				boolean isFinish = pool.awaitTermination(1, TimeUnit.HOURS);
				StaticLog.info("==========================");
				// 如果没有执行完
				if (!isFinish) {
					// 线程池执行结束 不在等待线程执行完毕,直接执行下面的代码
					pool.shutdownNow();
				}
				// 2.结果归集,用迭代器遍历futureList,高速轮询(模拟实现了并发),任务完成就移除
				while (futureList.size() > 0) {
					Iterator<Future<Node>> iterable = futureList.iterator();
					// 遍历一遍
					while (iterable.hasNext()) {
						Future<Node> future = iterable.next();
						// 如果任务完成取结果,否则判断下一个任务是否完成
						if (future.isDone() && !future.isCancelled()) {
							// 获取结果
							nodes.add(future.get());
						} else {
							Thread.sleep(1);// 避免CPU高速运转,这里休息1毫秒,CPU纳秒级别
						}
					}
				}
			} catch (InterruptedException e) {
				e.printStackTrace();
			} catch (ExecutionException e) {
				e.printStackTrace();
			}
			// 只给线程池中的线程1小时,然后就继续执行
			StaticLog.info("it is ok !!!");
			
			SqlJsonWriter.jsonWriter(nodes, "F://20190314area.json");

			SqlJsonWriter.sqlWriter(nodes, "F://20190314area.sql");
			
			// ---------------------------------
			long interval = timer.interval();// 花费毫秒数
			long intervalMinute = timer.intervalMinute();// 花费分钟数
			StaticLog.info("本次程序执行 花费毫秒数: {} ,   花费分钟数:{} . ", interval, intervalMinute);
		}
	}
}

/**
 * Unit of work for the thread pool: crawls everything below one province and
 * returns the province node with its children attached.
 */
class TaskCallable implements Callable<Node> {

	/** URL of the province page to crawl. */
	private final String pageUrl;

	/** Province name, used for logging only. */
	private final String regionName;

	/** Province node that receives the parsed cities. */
	private final Node target;

	public TaskCallable(String url, String areaName, Node node) {
		pageUrl = url;
		regionName = areaName;
		target = node;
	}

	/**
	 * Logs which thread handles which province, then delegates to
	 * {@link CityParserThread#parseCity(String, Node)}.
	 */
	@Override
	public Node call() throws Exception {
		String worker = Thread.currentThread().getName();
		StaticLog.info("当前线程: {} ,  地区名称: {} ,  请求地址:{} 。 ", worker, regionName, pageUrl);
		Node filled = CityParserThread.parseCity(pageUrl, target);
		return filled;
	}
}

SQL生成:

package com.reptile.area;

import java.util.ArrayList;
import java.util.List;

import com.github.stuxuhai.jpinyin.PinyinException;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileReader;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;

public class SqlJsonWriter {
	
	public static void sqlWriter(List<Node> nodes, String path) {
		if (CollUtil.isNotEmpty(nodes)) {
			FileWriter sqlWriter = new FileWriter(FileUtil.touch(path));
			sqlWriter.writeLines(nodes);
			StaticLog.info("SQL文件保存成功...");
		}
	}
	
	public static void jsonWriter(List<Node> nodes, String path) {
		if (CollUtil.isNotEmpty(nodes)) {
			// json数据写入到文件
			FileWriter jsonWriter = new FileWriter(FileUtil.touch(path));
			jsonWriter.write(JSONUtil.toJsonStr(nodes));
			StaticLog.info("JSON文件保存成功...");
		}
	}
	
	
	/**
	 * *实体转sql数据
	 * 
	 * @param provinces 省市县数据
	 */
	private static List<String> buildSql(List<Node> provinces) {
		List<String> sqls = null;
		if (CollUtil.isNotEmpty(provinces)) {
			sqls = new ArrayList<>();
			for (Node province : provinces) {
				sqls.add(initSql(province.getName(), province.getCode(), province.getDataFromUrl(), province.getChildNodeUrl(), "", 1));
				buildCitySql(sqls, province.getNodes(), province.getCode());
			}
		}
		return sqls;
	}

	private static void buildCitySql(List<String> sqls, List<Node> cities, String parentCode) {
		if (CollUtil.isNotEmpty(cities)) {
			for (Node city : cities) {
				sqls.add(initSql(city.getName(), city.getCode(), city.getDataFromUrl(), city.getChildNodeUrl(), parentCode, 2));
				buildCountySql(sqls, city.getNodes(), city.getCode());
			}
		}
	}

	private static void buildCountySql(List<String> sqls, List<Node> counties, String parentCode) {
		if (CollUtil.isNotEmpty(counties)) {
			for (Node county : counties) {
				sqls.add(initSql(county.getName(), county.getCode(), county.getDataFromUrl(), county.getChildNodeUrl(), parentCode, 3));
				buildTowntrSql(sqls, county.getNodes(), county.getCode());
			}
		}
	}

	private static void buildTowntrSql(List<String> sqls, List<Node> towies, String parentCode) {
		if (CollUtil.isNotEmpty(towies)) {
			for (Node towntr : towies) {
				sqls.add(initSql(towntr.getName(), towntr.getCode(), towntr.getDataFromUrl(), towntr.getChildNodeUrl(), parentCode, 4));
				buildVillagetrSql(sqls, towntr.getNodes(), towntr.getCode());
			}
		}
	}

	private static void buildVillagetrSql(List<String> sqls, List<Node> vilies, String parentCode) {
		if (CollUtil.isNotEmpty(vilies)) {
			for (Node villagetr : vilies) {
				sqls.add(initSql(villagetr.getName(), villagetr.getCode(), villagetr.getDataFromUrl(), villagetr.getChildNodeUrl(), parentCode, 5));
			}
		}
	}

	/**
	 ** 初始化sql语句
	 */
	private static String initSql(String name, String code, String dataFromUrl, String childNodeUrl, String parentCode, Integer depth) {
		final String SQL = "insert into area(`name`, `code`, full_spell, easy_spell, initial, parent_code, depth, data_from_url, child_node_url) values ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');";

		String insertSql = null;
		try {
			insertSql = StrFormatter.format(SQL, name, code,
					PinyinHelper.convertToPinyinString(name, "", PinyinFormat.WITHOUT_TONE),
					PinyinHelper.getShortPinyin(name), PinyinHelper.getShortPinyin(name).substring(0, 1).toString(),
					parentCode, depth, dataFromUrl, childNodeUrl);
			StaticLog.info(insertSql);
		} catch (PinyinException e) {
			StaticLog.error("拼音解析失败:{} .", e.getMessage());
		}
		return insertSql;
	}
}