如何用 Java 正则表达式读取网页内容
程序员文章站
2022-05-04 16:18:00
学习 Java 的正则表达式：抓取网页并解析 HTML 的部分内容
package com.xiaofeng.picup;
import j...
学习 Java 的正则表达式：抓取网页并解析 HTML 的部分内容
package com.xiaofeng.picup; import java.io.bufferedreader; import java.io.ioexception; import java.io.inputstreamreader; import java.net.malformedurlexception; import java.net.url; import java.util.arraylist; import java.util.hashmap; import java.util.list; import java.util.regex.matcher; import java.util.regex.pattern; /** *//** * * @抓取页面文章标题及内容(测试) 手动输入网址抓取,可进一步自动抓取整个页面的全部内容 * */ public class webcontent ...{ /** *//** * 读取一个网页全部内容 */ public string getonehtml(string htmlurl) throws ioexception...{ url url; string temp; stringbuffer sb = new stringbuffer(); try ...{ url = new url(htmlurl); bufferedreader in = new bufferedreader(new inputstreamreader(url .openstream(), "utf-8"));// 读取网页全部内容 while ((temp = in.readline()) != null) ...{ sb.append(temp); } in.close(); }catch(malformedurlexception me)...{ system.out.println("你输入的url格式有问题!请仔细输入"); me.getmessage(); throw me; }catch (ioexception e) ...{ e.printstacktrace(); throw e; } return sb.tostring(); } /** *//** * * @param s * @return 获得网页标题 */ public string gettitle(string s) ...{ string regex; string title = ""; list<string> list = new arraylist<string>(); regex = "<title>.*?</title>"; pattern pa = pattern.compile(regex, pattern.canon_eq); matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } for (int i = 0; i < list.size(); i++) ...{ title = title + list.get(i); } return outtag(title); } /** *//** * * @param s * @return 获得链接 */ public list<string> getlink(string s) ...{ string regex; list<string> list = new arraylist<string>(); regex = "<a[^>]*href=("([^"]*)"|'([^']*)'|([^s>]*))[^>]*>(.*?)</a>"; pattern pa = pattern.compile(regex, pattern.dotall); matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } return list; } /** *//** * * @param s * @return 获得脚本代码 */ public list<string> getscript(string s) ...{ string regex; list<string> list = new arraylist<string>(); regex = "<script.*?</script>"; pattern pa = pattern.compile(regex, pattern.dotall); matcher ma = pa.matcher(s); 
while (ma.find()) ...{ list.add(ma.group()); } return list; } /** *//** * * @param s * @return 获得css */ public list<string> getcss(string s) ...{ string regex; list<string> list = new arraylist<string>(); regex = "<style.*?</style>"; pattern pa = pattern.compile(regex, pattern.dotall); matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } return list; } /** *//** * * @param s * @return 去掉标记 */ public string outtag(string s) ...{ return s.replaceall("<.*?>", ""); }
上一篇: 比较常用证件正则表达式验证大全