欢迎您访问程序员文章站！本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

Java 抓取 2016 年统计用区划代码和城乡划分代码

程序员文章站 2022-03-02 21:03:43
...
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes the 2016 statistical division code pages reachable from
 * {@link #SITE_URL} and turns them into SQL insert statements.
 *
 * <p>Three levels are collected: the entries of the province index page, the
 * city rows of each province page and the county rows of each city page. The
 * result is kept as a tree of {@code RegionEntry} objects, rendered as
 * {@code insert into sys_region ...} statements and handed to
 * {@code Region.writeFile}.
 *
 * @author brianye QQ 
 * @date 2017-7-10
 */
public class GetRegion {

	/** Index page listing the provinces; the crawl starts here. */
	public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html";

	/** Character set every fetched page is decoded with. */
	private static final String PAGE_CHARSET = "GBK";

	/** Number of leading characters of a division code that are stored. */
	private static final int CODE_LENGTH = 6;

	/** Pause after each province has been crawled, in milliseconds. */
	private static final long PROVINCE_DELAY_MILLIS = 2000L;

	/** Top-level entries in the order they were scraped. */
	private static final List<RegionEntry> regions = new ArrayList<RegionEntry>();
	
	/**
	 * Crawls all three levels, renders one insert statement per entry and
	 * passes the resulting text to {@code Region.writeFile}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.out.println("抓取开始:"+ new Date());
		getProvince();
		// Single-threaded accumulation, so the unsynchronized builder is enough.
		StringBuilder content = new StringBuilder();
		for (RegionEntry one : regions) {
			String oneCode = sqlText(one.getCode());
			content.append("insert into sys_region values('").append(oneCode)
					.append("', '").append(oneCode)
					.append("',null, null,'").append(oneCode)
					.append("', '000000', '").append(sqlText(one.getName()))
					.append("', '1' );\r\n");
			for (RegionEntry two : one.getSub()) {
				String twoCode = sqlText(two.getCode());
				content.append("insert into sys_region values('").append(twoCode)
						.append("', '").append(twoCode)
						.append("',null, '").append(twoCode)
						.append("','").append(oneCode)
						.append("', '").append(oneCode)
						.append("', '").append(sqlText(two.getName()))
						.append("', '0' );\r\n");
				for (RegionEntry three : two.getSub()) {
					String threeCode = sqlText(three.getCode());
					content.append("insert into sys_region values('").append(threeCode)
							.append("', '").append(threeCode)
							.append("', '").append(threeCode)
							.append("', '").append(twoCode)
							.append("','").append(oneCode)
							.append("', '").append(twoCode)
							.append("', '").append(sqlText(three.getName()))
							.append("', '0' );\r\n");
				}
			}
		}
		
		Region.writeFile(content.toString());
		System.out.println("抓取完毕:"+ new Date());
	}
	
	/**
	 * Reads the province index page, registers one entry per province link in
	 * {@link #regions} and crawls the cities below each of them.
	 *
	 * <p>An I/O failure on the index page is reported and ends the province
	 * crawl; entries collected up to that point are kept. If the thread is
	 * interrupted while pausing between provinces, the interrupt flag is
	 * restored and the crawl stops early.
	 */
	private static void getProvince() {
		try {
			Document doc = fetch(SITE_URL);
			Elements links = doc.select("tr.provincetr").select("a");
			for (Element link : links) {
				RegionEntry region = new RegionEntry();
				region.setCode(padCode(fileStem(link.attr("href"))));
				region.setName(link.text());
				
				getCity(link.attr("abs:href"), region);
				regions.add(region);
				try {
					Thread.sleep(PROVINCE_DELAY_MILLIS);
				} catch (InterruptedException interrupted) {
					// Keep the interrupt visible to the caller and stop crawling
					// instead of continuing without any delay between requests.
					Thread.currentThread().interrupt();
					return;
				}
			}
			
		} catch (IOException e) {
			System.err.println("Failed to read province page " + SITE_URL);
			e.printStackTrace();
		}
	}
	
	/**
	 * Reads a province page and appends one entry per city row to the
	 * province's sub list, crawling the counties below each city.
	 *
	 * @param url absolute address of the province page
	 * @param region province entry that receives the cities
	 */
	private static void getCity(String url, RegionEntry region) {
		try {
			Document doc = fetch(url);
			Elements rows = doc.select("tr.citytr");
			for (Element row : rows) {
				Elements anchors = row.select("a");
				String rawCode;
				String name;
				String areaUrl = null;
				if (anchors.size() >= 2) {
					Element codeLink = anchors.get(0);
					rawCode = codeLink.text();
					name = anchors.get(1).text();
					areaUrl = codeLink.attr("abs:href");
				} else {
					// Row without links: read the plain cells, as getArea does.
					// There is no county page to follow in this case.
					Elements cells = row.select("td");
					if (cells.size() < 2) {
						continue;
					}
					rawCode = cells.get(0).text();
					name = cells.get(1).text();
				}
				
				// A row named "市辖区" is stored under the province's own name.
				if ("市辖区".equals(name)) {
					name = region.getName();
				}
				
				RegionEntry city = new RegionEntry();
				city.setCode(shortCode(rawCode));
				city.setName(name);
				
				if (areaUrl != null) {
					getArea(areaUrl, city);
				}
				
				region.getSub().add(city);
			}
		} catch (IOException e) {
			System.err.println("Failed to read city page " + url);
			e.printStackTrace();
		}
		
	}
	
	/**
	 * Reads a city page and appends one entry per county row to the city's
	 * sub list. Rows that carry links and rows that carry plain cells are
	 * both accepted; in either case the code is cut to
	 * {@link #CODE_LENGTH} characters.
	 *
	 * @param url absolute address of the city page
	 * @param region city entry that receives the counties
	 */
	private static void getArea(String url, RegionEntry region) {
		try {
			Document doc = fetch(url);
			Elements rows = doc.select("tr.countytr");
			for (Element row : rows) {
				// Prefer the two links (code, name); fall back to the table cells.
				Elements parts = row.select("a");
				if (parts.size() < 2) {
					parts = row.select("td");
				}
				if (parts.size() < 2) {
					continue;
				}
				
				RegionEntry area = new RegionEntry();
				area.setCode(shortCode(parts.get(0).text()));
				area.setName(parts.get(1).text());
				region.getSub().add(area);
			}
		} catch (IOException e) {
			System.err.println("Failed to read county page " + url);
			e.printStackTrace();
		}
		
	}
	
	/**
	 * Downloads a page and parses it with {@link #PAGE_CHARSET}.
	 *
	 * @param url absolute address of the page; also used as base URI so that
	 *            {@code abs:href} lookups resolve against it
	 * @return the parsed document
	 * @throws IOException if the address is malformed or the page cannot be read
	 */
	private static Document fetch(String url) throws IOException {
		InputStream in = new URL(url).openStream();
		try {
			return Jsoup.parse(in, PAGE_CHARSET, url);
		} finally {
			// Release the connection explicitly rather than relying on the parser.
			in.close();
		}
	}
	
	/** Returns the part of {@code href} before its first dot, or all of it if there is none. */
	private static String fileStem(String href) {
		int dot = href.indexOf('.');
		return dot < 0 ? href : href.substring(0, dot);
	}
	
	/** Right-pads {@code code} with zeros up to {@link #CODE_LENGTH} characters. */
	private static String padCode(String code) {
		StringBuilder padded = new StringBuilder(code);
		while (padded.length() < CODE_LENGTH) {
			padded.append('0');
		}
		return padded.toString();
	}
	
	/** Cuts {@code code} to its first {@link #CODE_LENGTH} characters; shorter codes are returned unchanged. */
	private static String shortCode(String code) {
		return code.length() > CODE_LENGTH ? code.substring(0, CODE_LENGTH) : code;
	}
	
	/** Doubles single quotes so scraped text cannot terminate the SQL string literal it is placed in. */
	private static String sqlText(String value) {
		return value == null ? null : value.replace("'", "''");
	}
	
}