使用HtmlParser 提取百度贴吧中的邮箱地址
程序员文章站
2022-06-08 08:49:07
...
经测试，实际用时 1 秒左右。
/**
 * Regular expression for an e-mail address: a local part of word characters
 * optionally joined by '-', '+' or '.', an '@', then a domain of word
 * characters joined by '-' or '.' that contains at least one dot.
 * Also used as the source of the node filter's pattern so the two never diverge.
 */
private static final Pattern p = Pattern.compile("\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*");

/**
 * Collects e-mail addresses from pages {@code start} through {@code end}
 * (both inclusive) of the Baidu Tieba thread with the given id.
 *
 * <p>Each page is requested from
 * {@code http://tieba.baidu.com/p/<id>?pn=<page>}, the nodes accepted by a
 * {@code RegexFilter} built from the e-mail pattern are extracted, and every
 * address found in each node's HTML is appended to the result. Duplicates are
 * not removed.
 *
 * <p>If a {@code ParserException} occurs, processing stops at that page, the
 * failure is reported on {@code System.err}, and the addresses gathered so far
 * are returned.
 *
 * @param id    numeric thread id placed in the URL path
 * @param start first page number to read (value of the {@code pn} parameter)
 * @param end   last page number to read; nothing is fetched if {@code end < start}
 * @return the addresses found, in encounter order; never {@code null}
 */
public static ArrayList<String> extractEmail(int id, int start, int end) {
    ArrayList<String> list = new ArrayList<String>();
    // The filter does not depend on the page, so build it once from the shared pattern.
    NodeFilter filter = new RegexFilter(p.pattern());
    try {
        for (int i = start; i <= end; i++) {
            Parser parser = new Parser("http://tieba.baidu.com/p/" + id + "?pn=" + i);
            NodeList nodes = parser.extractAllNodesThatMatch(filter);
            for (NodeIterator ni = nodes.elements(); ni.hasMoreNodes();) {
                Matcher m = p.matcher(ni.nextNode().toHtml());
                // A single node may hold several addresses; collect all of them,
                // not just the first match.
                while (m.find()) {
                    list.add(m.group());
                }
            }
        }
    } catch (ParserException e) {
        // Best effort: keep what was collected, but do not hide the failure.
        System.err.println("extractEmail: stopped while reading thread " + id + ": " + e);
    }
    return list;
}