C#基于正则表达式实现获取网页中所有信息的网页抓取类实例
程序员文章站
2024-02-07 14:42:46
本文实例讲述了C#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考，具体如下：
类的代码:
using system;
using sy...
本文实例讲述了C#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考，具体如下：
类的代码:
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads one web page and exposes its title, links and plain text,
/// all extracted from the HTML with regular expressions.
/// </summary>
/// <remarks>
/// Member names are kept exactly as published (lower case) so that existing
/// callers keep compiling; only framework identifiers were re-cased.
/// The page is fetched once, in the constructor; check <see cref="isgood"/>
/// before relying on the other properties.
/// </remarks>
public class webpage
{
    #region Private members

    /// <summary>Largest Content-Length (in bytes) that will be downloaded.</summary>
    private const int MaxPageBytes = 1 << 22;

    private Uri m_uri;                                   // address of the page (updated to the final URI after redirects)
    private List<Link> m_links = new List<Link>();       // links found on the page, filled lazily by getlinks()
    private string m_title = "";                         // cached <title> text
    private string m_source = "";                        // raw HTML, exposed through the m_html property
    private string m_outstr = "";                        // cached plain text of the page
    private bool m_good;                                 // true only after a successful download
    private int m_pagesize;                              // length of the decoded HTML, in characters

    // One cookie container per host, shared by every webpage instance.
    private static readonly Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    #endregion

    #region Properties

    /// <summary>
    /// Absolute URL of this page (read-only). Empty when the address could not be parsed.
    /// </summary>
    public string url
    {
        get { return m_uri == null ? "" : m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// Text of the page's &lt;title&gt; element (read-only), extracted on first access.
    /// </summary>
    public string title
    {
        get
        {
            if (m_title == "")
            {
                // (?:\w|\W) matches any character, including line breaks inside the title.
                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                Match mc = reg.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Raw HTML of the page (read-only). Never null.
    /// </summary>
    public string m_html
    {
        get
        {
            if (m_source == null)
            {
                m_source = "";
            }
            return m_source;
        }
    }

    /// <summary>
    /// All links found on this page (read-only), extracted on first access.
    /// </summary>
    public List<Link> links
    {
        get
        {
            if (m_links.Count == 0)
            {
                getlinks();
            }
            return m_links;
        }
    }

    /// <summary>
    /// Plain text of the whole page with markup removed (read-only).
    /// </summary>
    public string context
    {
        get
        {
            if (m_outstr == "")
            {
                // The call only populates the m_outstr cache; the full text is returned below.
                getcontext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of the page, measured as the character count of the decoded HTML.
    /// </summary>
    public int pagesize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// Links on this page that point to the same host as the page itself
    /// (http or https), at most <see cref="Int16.MaxValue"/> of them.
    /// </summary>
    public List<Link> insitelinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // Escape the host so '.' is literal, and require a delimiter after it so
            // "example.com" does not also match "example.com.other.net".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
            return getspeciallinksbyurl(pattern, Int16.MaxValue);
        }
    }

    /// <summary>
    /// True when the page was downloaded and decoded successfully.
    /// </summary>
    public bool isgood
    {
        get { return m_good; }
    }

    /// <summary>
    /// Host name of the site this page belongs to. Empty when the address could not be parsed.
    /// </summary>
    public string host
    {
        get { return m_uri == null ? "" : m_uri.Host; }
    }

    #endregion

    /// <summary>
    /// Extracts anchor and (i)frame links from the HTML into the link cache.
    /// </summary>
    /// <returns>The cached list of links.</returns>
    private List<Link> getlinks()
    {
        if (m_links.Count == 0)
        {
            Regex[] regex = new Regex[2];
            regex[0] = new Regex(@"<a\shref\s*=""(?<url>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);

            // Strips tags, whitespace, &nbsp; entities, stray ampersands and quotes from anchor text.
            Regex textcleaner = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            for (int i = 0; i < regex.Length; i++)
            {
                for (Match match = regex[i].Match(m_html); match.Success; match = match.NextMatch())
                {
                    try
                    {
                        // Resolve relative hrefs against the page address.
                        string target = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri);

                        string text = "";
                        if (i == 0)
                        {
                            // Only the anchor regex captures link text; its group is named "title".
                            text = textcleaner.Replace(match.Groups["title"].Value, "");
                        }

                        Link link = new Link();
                        link.Text = text;
                        link.NavigateUrl = target;
                        m_links.Add(link);
                    }
                    catch (Exception ex)
                    {
                        // A malformed href must not abort extraction of the remaining links.
                        Console.WriteLine(ex.Message);
                    }
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Strips markup from a piece of HTML and returns the first characters of the resulting text.
    /// </summary>
    /// <remarks>
    /// The stripped text is cached in m_outstr on the first call; later calls reuse the
    /// cache and ignore <paramref name="instr"/> and <paramref name="withlink"/>.
    /// </remarks>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstn">Maximum number of characters to return.</param>
    /// <param name="withlink">Whether text inside anchors is kept.</param>
    /// <returns>Plain text, at most <paramref name="firstn"/> characters long.</returns>
    private string getfirstnchar(string instr, int firstn, bool withlink)
    {
        if (m_outstr == "")
        {
            RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
            string text = instr ?? "";

            // Drop whole elements whose content is not readable text.
            text = Regex.Replace(text, @"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", "", options);
            text = Regex.Replace(text, @"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", "", options);
            text = Regex.Replace(text, @"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", "", options);
            if (!withlink)
            {
                text = Regex.Replace(text, @"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", "", options);
            }

            // Remove the remaining tags and &nbsp; entities, then collapse whitespace runs.
            text = Regex.Replace(text, "(<[^>]+?>)|&nbsp;", "", options);
            text = Regex.Replace(text, "(\\s)+", " ", options);

            m_outstr = text;
        }
        return m_outstr.Length > firstn ? m_outstr.Substring(0, firstn) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches a pattern.
    /// </summary>
    /// <param name="pattern">Regular expression (multiline, case-insensitive).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="matchurl">True to test the link URL, false to test the link text.</param>
    /// <returns>Matching links in page order.</returns>
    private List<Link> filterlinks(string pattern, int count, bool matchurl)
    {
        if (m_links.Count == 0)
        {
            getlinks();
        }

        List<Link> speciallinks = new List<Link>();
        if (m_links.Count == 0 || count <= 0)
        {
            return speciallinks;
        }

        // Build the regex once instead of once per link.
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        foreach (Link candidate in m_links)
        {
            if (speciallinks.Count >= count)
            {
                break;
            }
            string subject = matchurl ? candidate.NavigateUrl : candidate.Text;
            if (filter.Match(subject).Success)
            {
                speciallinks.Add(candidate);
            }
        }
        return speciallinks;
    }

    #region Public methods

    /// <summary>
    /// Returns the first characters of the page's plain text, including link text.
    /// </summary>
    /// <param name="firstn">Maximum number of characters.</param>
    /// <returns>Plain text, at most <paramref name="firstn"/> characters long.</returns>
    public string getcontext(int firstn)
    {
        return getfirstnchar(m_html, firstn, true);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>Matching links.</returns>
    public List<Link> getspeciallinksbyurl(string pattern, int count)
    {
        return filterlinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>Matching links.</returns>
    public List<Link> getspeciallinksbytext(string pattern, int count)
    {
        return filterlinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression (by 何问起).
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the result.</param>
    /// <returns>Value of capture group 1 of the first match, or an empty string.</returns>
    public string getspecialwords(string pattern)
    {
        if (m_outstr == "")
        {
            getcontext(Int16.MaxValue);
        }
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Reports whether the address ends in one of the archive/binary extensions that are never downloaded.
    /// </summary>
    private static bool hasskippedextension(string address)
    {
        return address.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
            || address.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
            || address.EndsWith(".msi", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Resolves a charset name to an encoding, falling back to <see cref="Encoding.Default"/>.
    /// </summary>
    private static Encoding getencodingordefault(string name)
    {
        if (name == null)
        {
            return Encoding.Default;
        }

        // A Content-Type value may carry further parameters after the charset.
        int end = name.IndexOf(';');
        if (end >= 0)
        {
            name = name.Substring(0, end);
        }
        name = name.Trim(' ', '\t', '"', '\'');
        if (name.Length == 0)
        {
            return Encoding.Default;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Reads a stream to its end and returns the bytes.
    /// </summary>
    private static byte[] readallbytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes bytes through a StreamReader so a byte-order mark is honoured, as a direct stream read would.
    /// </summary>
    private static string decode(byte[] body, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads the page and fills the HTML, size and final-URI fields.
    /// Any failure leaves the instance empty with <see cref="isgood"/> false.
    /// </summary>
    /// <param name="_url">Absolute address of the page.</param>
    private void init(string _url)
    {
        // Reset state first so every property is safe to read even if the download fails.
        m_links = new List<Link>();
        m_source = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);
            if (hasskippedextension(_url))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "mozilla/4.0 (compatible; msie 5.01; windows nt 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            // Reuse one cookie container per host across all pages.
            CookieContainer cookies;
            lock (webcookies)
            {
                if (!webcookies.TryGetValue(m_uri.Host, out cookies))
                {
                    cookies = new CookieContainer();
                    webcookies[m_uri.Host] = cookies;
                }
            }
            rqst.CookieContainer = cookies;

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contenttype = (rsps.ContentType ?? "").ToLowerInvariant();
                if (!contenttype.StartsWith("text/", StringComparison.Ordinal) || rsps.ContentLength > MaxPageBytes)
                {
                    return;
                }

                byte[] body;
                using (Stream sm = rsps.GetResponseStream())
                {
                    body = readallbytes(sm);
                }

                int ix = contenttype.IndexOf("charset=", StringComparison.Ordinal);
                if (ix != -1)
                {
                    // The HTTP header names the charset.
                    // Some sites additionally need HttpUtility.HtmlDecode on the result.
                    Encoding cding = getencodingordefault(contenttype.Substring(ix + "charset=".Length));
                    m_source = decode(body, cding);
                }
                else
                {
                    // No charset in the header: decode with the default encoding, look for a
                    // charset declaration in the markup, then decode the raw bytes again.
                    string sniffed = decode(body, Encoding.Default);
                    Regex regex = new Regex("charset\\s*=\\s*[\"']?(?<cding>[\\w-]+)", RegexOptions.IgnoreCase);
                    Encoding cding = getencodingordefault(regex.Match(sniffed).Groups["cding"].Value);
                    m_source = decode(body, cding);

                    // Many '?' characters suggest the declared charset was wrong; keep the default decoding.
                    if (m_source.Split('?').Length > 100)
                    {
                        m_source = sniffed;
                    }
                }

                m_pagesize = m_source.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception ex)
        {
            // Best effort: a page that cannot be fetched is reported through isgood, not thrown.
            m_good = false;
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately.
    /// </summary>
    /// <param name="_url">Absolute address of the page; percent-escapes are decoded first.</param>
    public webpage(string _url)
    {
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (ArgumentException)
        {
            // Null address: keep it as given; init records the failure.
        }
        catch (UriFormatException)
        {
            // Malformed escape sequence: fall back to the address as given.
        }
        init(_url);
    }

    #endregion
}
调用:
webpage webinfo = new webpage("http://hovertree.net/");
string text = webinfo.context; // all page content with the HTML tags stripped
string html = webinfo.m_html;  // page content including the HTML tags (by 何问起)
PS：这里再为大家提供2款非常方便的正则表达式工具供大家参考使用：
JavaScript正则表达式在线测试工具：
正则表达式在线生成工具:
更多关于C#相关内容感兴趣的读者可查看本站专题：《C#正则表达式用法总结》、《C#编码操作技巧总结》、《C#中XML文件操作技巧汇总》、《C#常见控件用法教程》、《WinForm控件用法总结》、《C#数据结构与算法教程》、《C#面向对象程序设计入门教程》及《C#程序设计之线程使用技巧总结》
希望本文所述对大家C#程序设计有所帮助。