httpclient自动获取页面编码设置进行字符编码,使httpclient适用所有网页抓取不乱码

程序员文章站 2022-07-13 11:37:00
...
//生成HttpMethod的方法就不举例了，网上很多，这里只是写明如何使得Httpclient适用所有编码的网页抓取

	/**
	 * 获取页面html内容
	 * @param method
	 * @param methodType
	 * @return String
	 * @throws UnsupportedEncodingException
	 * @throws IOException
	 */
	private static String readInputStream(HttpMethod method) throws Exception{
		String charset = "UTF-8";
		if(method instanceof PostMethod){
			charset = ((PostMethod)method).getResponseCharSet();
		}else{
			charset = ((GetMethod)method).getResponseCharSet();
		}
		byte[] bytes = method.getResponseBody();
		String body = new String(bytes,"UTF-8");
		charset = getCharSetByBody(body,charset);
		return new String(bytes,charset);
	}
	
	/**
	 * 根据页面body获取字符编码
	 * @param html
	 * @param charset
	 * @return
	 */
	private static String getCharSetByBody(String html,String charset){
		Document document = parseJSoupDocumentFromHtml(html, Constants.parseBaseUri);
		Elements elements = document.select("meta");
		for(Element metaElement : elements){
			if(metaElement!=null && StringUtils.isNotBlank(metaElement.attr("http-equiv")) && metaElement.attr("http-equiv").toLowerCase().equals("content-type")){
				String content = metaElement.attr("content");
				charset = getCharSet(content);
				break;
			}
		}
		return charset;
	}
	
	/**
	 * 正则获取字符编码
	 * @param content
	 * @return
	 */
	private static String getCharSet(String content){
		String regex = ".*charset=([^;]*).*";
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(content);
		if(matcher.find())
			return matcher.group(1);
		else
			return null;
	}