HttpClient 自动识别页面声明的字符编码并据此解码,使 HttpClient 抓取任意编码的网页都不会乱码
程序员文章站
2022-07-13 11:37:00
...
//生成HttpMethod的方法就不举例了,网上很多,这里只是写明如何使得Httpclient适用所有编码的网页抓取 /** * 获取页面html内容 * @param method * @param methodType * @return String * @throws UnsupportedEncodingException * @throws IOException */ private static String readInputStream(HttpMethod method) throws Exception{ String charset = "UTF-8"; if(method instanceof PostMethod){ charset = ((PostMethod)method).getResponseCharSet(); }else{ charset = ((GetMethod)method).getResponseCharSet(); } byte[] bytes = method.getResponseBody(); String body = new String(bytes,"UTF-8"); charset = getCharSetByBody(body,charset); return new String(bytes,charset); } /** * 根据页面body获取字符编码 * @param html * @param charset * @return */ private static String getCharSetByBody(String html,String charset){ Document document = parseJSoupDocumentFromHtml(html, Constants.parseBaseUri); Elements elements = document.select("meta"); for(Element metaElement : elements){ if(metaElement!=null && StringUtils.isNotBlank(metaElement.attr("http-equiv")) && metaElement.attr("http-equiv").toLowerCase().equals("content-type")){ String content = metaElement.attr("content"); charset = getCharSet(content); break; } } return charset; } /** * 正则获取字符编码 * @param content * @return */ private static String getCharSet(String content){ String regex = ".*charset=([^;]*).*"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(content); if(matcher.find()) return matcher.group(1); else return null; }