Java精确抽取网页发布时间
程序员文章站
2024-03-12 11:25:50
对网页中各种不同格式的发布时间进行抽取,将发布时间以规整的“yyyy-MM-dd HH:mm:ss”格式表示出来,只能尽量追求精确,但是因为网络发布时间的格式十分灵活,所以...
对网页中各种不同格式的发布时间进行抽取,将发布时间以规整的“yyyy-MM-dd HH:mm:ss”格式表示出来,只能尽量追求精确,但是因为网络发布时间的格式十分灵活,所以做不到百分百地正确抽取。
package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import whu.utils.TimeUtil;

/**
 * Extracts the publication time of a web page, first from its URL and then
 * from the page text, and returns it as a {@code "yyyy-M-d HH:mm:ss"}-style
 * string (month and day are emitted as found, i.e. not necessarily zero-padded).
 *
 * <p>Candidates that lie in the future or are not valid calendar dates are
 * rejected. Extraction is best-effort: {@code null} means "no usable date found".
 *
 * <p>NOTE(review): the identifier casing of this class, its methods and of
 * {@code TimeUtil.strToCalendar} was restored from an all-lowercase listing;
 * confirm against the real project sources.
 *
 * Created on 2014-03-13 14:49:05.
 */
public class FetchPubTime {

    /** Eight consecutive date digits in a URL, e.g. http://www.baidu.com/20140311/2356.html */
    private static final Pattern URL_WHOLE = Pattern.compile("([-|/|_]{1}20\\d{6})");

    /** Separated year/month/day in a URL, e.g. http://www.baidu.com/2014-3-11/2356.html */
    private static final Pattern URL_SEP_YMD =
            Pattern.compile("([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2}[-|/|_]{1}\\d{1,2})");

    /** Separated year/month only in a URL, e.g. http://www.baidu.com/2014-3/2356.html */
    private static final Pattern URL_SEP_YM =
            Pattern.compile("([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2})");

    /** Separator used to split the year/month/day parts of a URL date. */
    private static final Pattern URL_SEPARATOR = Pattern.compile("[-|/|_]{1}");

    /** Page text: full timestamp, or a Chinese "yyyy年M月d日" date. */
    private static final Pattern PAGE_DATE_TIME = Pattern.compile(
            "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

    /** Page text: date only (used when no match of {@link #PAGE_DATE_TIME} exists). */
    private static final Pattern PAGE_DATE_ONLY = Pattern.compile(
            "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

    /** Month/day alternative for the 31-day months. */
    private static final String DAYS_31 =
            "(((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))";

    /** Month/day alternative for the 30-day months. */
    private static final String DAYS_30 =
            "(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))";

    /**
     * Validates a complete date with an optional time of day, including the
     * leap-year rule for February. The hour alternative accepts 0-23; the
     * previous form ({@code (0?[0-9])|([1-2][0-3])}) rejected 14-19.
     */
    private static final Pattern RIGHT_TIME = Pattern.compile(
            // leap years: February may have 29 days
            "^((\\d{2}(([02468][048])|([13579][26]))[\\-\\/\\s]?("
            + DAYS_31 + "|" + DAYS_30
            + "|(0?2[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])))"
            + "))"
            // other years: February has at most 28 days
            + "|(\\d{2}(([02468][1235679])|([13579][01345789]))[\\-\\/\\s]?("
            + DAYS_31 + "|" + DAYS_30
            + "|(0?2[\\-\\/\\s]?((0?[1-9])|(1[0-9])|(2[0-8])))"
            + ")))"
            // optional time of day
            + "(\\s((([01]?[0-9])|(2[0-3]))\\:([0-5]?[0-9])((\\s)|(\\:([0-5]?[0-9])))))?$");

    /**
     * Returns the publication time of a page, preferring the URL and falling
     * back to the page text when the URL carries no usable date.
     *
     * @param url        page URL; may be {@code null}
     * @param urlContent page text or source; may be {@code null} or blank
     * @return the formatted publication time, or {@code null} if none was found
     */
    public static String getPubTimeVarious(String url, String urlContent) {
        String pubTime = getPubTimeFromUrl(url);
        // Nothing in the link: look in the page text instead.
        if (pubTime == null && urlContent != null && !urlContent.trim().isEmpty()) {
            return extractPageDate(urlContent);
        }
        return pubTime;
    }

    /**
     * Extracts the publication date from a URL. Three shapes are tried in
     * order: eight consecutive digits, separated year-month-day, and separated
     * year-month (day defaults to 01). The time part is always {@code 00:00:00}.
     *
     * @param url page URL; {@code null} yields {@code null}
     * @return the date followed by {@code " 00:00:00"}, or {@code null} if no
     *         valid, non-future date is present
     */
    public static String getPubTimeFromUrl(String url) {
        if (url == null) {
            return null;
        }
        // Taken per call so the "not in the future" check never goes stale.
        Calendar now = Calendar.getInstance();

        Matcher whole = URL_WHOLE.matcher(url);
        if (whole.find()) {
            String digits = whole.group().substring(1); // drop the leading separator
            // NOTE(review): assumes strToCalendar returns null (rather than throwing)
            // for unparsable input - confirm against TimeUtil.
            Calendar parsed = TimeUtil.strToCalendar(digits, "yyyyMMdd");
            if (parsed != null && now.compareTo(parsed) >= 0) {
                return digits.substring(0, 4) + "-" + digits.substring(4, 6) + "-"
                        + digits.substring(6, 8) + " 00:00:00";
            }
        }

        Matcher ymd = URL_SEP_YMD.matcher(url);
        if (ymd.find()) {
            String[] seg = URL_SEPARATOR.split(ymd.group().substring(1));
            if (isValidPastDate(seg[0], seg[1], seg[2], now)) {
                return seg[0] + "-" + seg[1] + "-" + seg[2] + " 00:00:00";
            }
        }

        Matcher ym = URL_SEP_YM.matcher(url);
        if (ym.find()) {
            String[] seg = URL_SEPARATOR.split(ym.group().substring(1));
            if (isValidPastDate(seg[0], seg[1], "1", now)) {
                return seg[0] + "-" + seg[1] + "-01 00:00:00";
            }
        }
        return null;
    }

    /**
     * Extracts the publication time from page text. Recognises
     * {@code 2013-12-19 15:48:33}, {@code 2013-12-19}, {@code 2012/3/05} and
     * {@code 2013年12月19日}. Full timestamps and Chinese dates take precedence;
     * plain dates are only considered when none of those occur. Within the
     * chosen form, the first candidate that is valid and not in the future wins.
     *
     * @param text text to search; {@code null} yields {@code null}
     * @return the normalised time (date-only matches get {@code " 00:00:00"}
     *         appended), or {@code null} if nothing usable was found
     */
    public static String extractPageDate(String text) {
        if (text == null) {
            return null;
        }
        // Flatten line breaks so a timestamp is not split across lines.
        String flattened = text.replaceAll("\\r?\\n", " ");
        Calendar now = Calendar.getInstance();

        Matcher matcher = PAGE_DATE_TIME.matcher(flattened);
        if (!matcher.find()) {
            matcher = PAGE_DATE_ONLY.matcher(flattened);
            if (!matcher.find()) {
                return null;
            }
        }
        do {
            String raw = matcher.group();
            boolean hasTime = raw.indexOf(':') >= 0;
            // "日" is dropped (not turned into "-") so no trailing dash is left behind.
            String candidate = raw.trim()
                    .replace("/", "-")
                    .replace("年", "-")
                    .replace("月", "-")
                    .replace("日", "");
            try {
                Calendar parsed = TimeUtil.strToCalendar(candidate, "yyyy-MM-dd");
                if (parsed == null || now.compareTo(parsed) < 0) {
                    continue;
                }
            } catch (Exception ignored) {
                // Best-effort extraction: an unparsable candidate is skipped, not fatal.
                continue;
            }
            if (!hasTime) {
                candidate += " 00:00:00";
            }
            if (RIGHT_TIME.matcher(candidate).matches()) {
                return candidate;
            }
        } while (matcher.find());
        return null;
    }

    /**
     * Tells whether the given year/month/day form a real calendar date that is
     * not later than {@code now}.
     */
    private static boolean isValidPastDate(String year, String month, String day, Calendar now) {
        Calendar date = Calendar.getInstance();
        date.clear();            // midnight, so "today" is not treated as the future
        date.setLenient(false);  // reject month 13, day 40, Feb 30, ...
        try {
            // Calendar months are zero-based.
            date.set(Integer.parseInt(year), Integer.parseInt(month) - 1, Integer.parseInt(day));
            date.getTimeInMillis(); // forces field validation in non-lenient mode
        } catch (IllegalArgumentException invalidDate) {
            // Covers NumberFormatException as well as out-of-range calendar fields.
            return false;
        }
        return now.compareTo(date) >= 0;
    }
}
以上就是本文的全部内容,希望对大家学习java程序设计有所帮助。
上一篇: PHP封装的PDO数据库操作类实例