Example of a C# Class for Fetching and Analyzing Web Pages
This article presents a C# class for fetching and analyzing web pages, shared here for your reference. The details are as follows:

The class below downloads a web page and analyzes its content.

Its main features are:

1. Extract the page's plain text, stripping all HTML tags and JavaScript code.
2. Extract the page's links, including href targets as well as frame and iframe sources.
3. Extract the page's title and similar elements (other tags can be handled the same way, since the regular expression keeps the same shape; see the short sketch after this list).
4. Perform a simple form submission (login) and keep the resulting cookies.
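As a quick illustration of point 3, the pattern used for the <title> tag inside the class can be pointed at any other tag. The following standalone sketch is only a demonstration under assumed inputs (the <h1> tag and the sample HTML string are made up for the demo):

using System;
using System.Text.RegularExpressions;

class TagDemo
{
    static void Main()
    {
        // sample input, invented for this demo
        string html = "<html><body><h1> Hello, crawler </h1></body></html>";
        // same shape as the Title property's regex in the class below, with "title" swapped for "h1"
        Regex reg = new Regex(@"(?m)<h1[^>]*>(?<tag>(?:\w|\W)*?)</h1[^>]*>",
                              RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = reg.Match(html);
        if (mc.Success)
            Console.WriteLine(mc.Groups["tag"].Value.Trim()); // prints: Hello, crawler
    }
}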
/*
 * Author: sunjoy at CCNU
 * If you improve this class, please send me a copy of the code (ccnusjy at gmail.com).
 */
using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;

/// <summary>
/// Web page class
/// </summary>
public class WebPage
{
    #region Private fields
    private Uri m_uri;            // page URL
    private List<Link> m_links;   // links found on this page
    private string m_title;       // page title
    private string m_html;        // raw HTML of the page
    private string m_outstr;      // plain text extracted from the page
    private bool m_good;          // whether the page is usable
    private int m_pagesize;       // size of the page
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>(); // cookies of all fetched hosts
    private string m_post;        // POST data required by this page's login page
    private string m_loginurl;    // this page's login page
    #endregion

    #region Private methods
    /// <summary>
    /// This private method extracts link information from the page's HTML.
    /// </summary>
    /// <returns>List<Link></returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            Regex[] regex = new Regex[2];
            // anchors: href target plus link text
            regex[0] = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            // frame and iframe sources
            regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            for (int i = 0; i < 2; i++)
            {
                Match match = regex[i].Match(m_html);
                while (match.Success)
                {
                    try
                    {
                        string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                        string text = "";
                        if (i == 0) text = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");
                        Link link = new Link(url, text);
                        m_links.Add(link);
                    }
                    catch (Exception ex) { Console.WriteLine(ex.Message); }
                    match = match.NextMatch();
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// This private method extracts up to a given number of characters of plain text from a block of HTML.
    /// </summary>
    /// <param name="instr">The HTML source</param>
    /// <param name="firstN">How many characters to take from the beginning</param>
    /// <param name="withLink">Whether to keep the text inside links</param>
    /// <returns>Plain text</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            m_outstr = instr.Clone() as string;
            m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            m_outstr = objReg.Replace(m_outstr, "");
            Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            m_outstr = objReg2.Replace(m_outstr, " ");
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// This private method converts an IP address to the corresponding unsigned integer.
    /// </summary>
    /// <param name="x">The IP address</param>
    /// <returns></returns>
    private uint getuintFromIP(IPAddress x)
    {
        byte[] bt = x.GetAddressBytes();
        uint i = (uint)(bt[0] * 256 * 256 * 256);
        i += (uint)(bt[1] * 256 * 256);
        i += (uint)(bt[2] * 256);
        i += (uint)(bt[3]);
        return i;
    }
    #endregion

    #region Public methods
    /// <summary>
    /// Extracts up to a given number of characters of the page's plain text, including link text.
    /// </summary>
    /// <param name="firstN">Number of characters</param>
    /// <returns></returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Extracts up to a given number of characters of the page's plain text, excluding link text.
    /// </summary>
    /// <param name="firstN"></param>
    /// <returns></returns>
    public string getContextWithOutLink(int firstN)
    {
        return getFirstNchar(m_html, firstN, false);
    }

    /// <summary>
    /// Extracts up to a given number of this page's links whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">The regular expression</param>
    /// <param name="count">Maximum number of links to return</param>
    /// <returns>List<Link></returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        if (m_links.Count == 0) getLinks();
        List<Link> SpecialLinks = new List<Link>();
        List<Link>.Enumerator i;
        i = m_links.GetEnumerator();
        int cnt = 0;
        while (i.MoveNext() && cnt < count)
        {
            if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.url).Success)
            {
                SpecialLinks.Add(i.Current);
                cnt++;
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// Extracts up to a given number of this page's links whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">The regular expression</param>
    /// <param name="count">Maximum number of links to return</param>
    /// <returns>List<Link></returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        if (m_links.Count == 0) getLinks();
        List<Link> SpecialLinks = new List<Link>();
        List<Link>.Enumerator i;
        i = m_links.GetEnumerator();
        int cnt = 0;
        while (i.MoveNext() && cnt < count)
        {
            if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.text).Success)
            {
                SpecialLinks.Add(i.Current);
                cnt++;
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// Returns all links whose host resolves to an IP address within the given range.
    /// </summary>
    /// <param name="_ip_start">Start of the IP range</param>
    /// <param name="_ip_end">End of the IP range</param>
    /// <returns></returns>
    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
    {
        IPAddress ip_start = IPAddress.Parse(_ip_start);
        IPAddress ip_end = IPAddress.Parse(_ip_end);
        if (m_links.Count == 0) getLinks();
        List<Link> SpecialLinks = new List<Link>();
        List<Link>.Enumerator i;
        i = m_links.GetEnumerator();
        while (i.MoveNext())
        {
            IPAddress ip;
            try
            {
                ip = Dns.GetHostEntry(new Uri(i.Current.url).Host).AddressList[0];
            }
            catch { continue; }
            if (getuintFromIP(ip) >= getuintFromIP(ip_start) && getuintFromIP(ip) <= getuintFromIP(ip_end))
            {
                SpecialLinks.Add(i.Current);
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// Extracts the part of this page's plain text that matches a regular expression.
    /// </summary>
    /// <param name="pattern">The regular expression</param>
    /// <returns>The matched text</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "") getContext(Int16.MaxValue);
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        if (mc.Success)
            return mc.Groups[1].Value;
        return string.Empty;
    }
    #endregion

    #region Constructors
    private void Init(string _url)
    {
        try
        {
            m_uri = new Uri(_url);
            m_links = new List<Link>();
            m_html = "";
            m_outstr = "";
            m_title = "";
            m_good = true;
            if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
            {
                m_good = false;
                return;
            }
            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;
            lock (WebPage.webcookies)
            {
                if (WebPage.webcookies.ContainsKey(m_uri.Host))
                    rqst.CookieContainer = WebPage.webcookies[m_uri.Host];
                else
                {
                    CookieContainer cc = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = cc;
                    rqst.CookieContainer = cc;
                }
            }
            HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
            Stream sm = rsps.GetResponseStream();
            // only text content, and no more than 4 MB
            if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
            {
                rsps.Close();
                m_good = false;
                return;
            }
            Encoding cding = System.Text.Encoding.Default;
            string contenttype = rsps.ContentType.ToLower();
            int ix = contenttype.IndexOf("charset=");
            if (ix != -1)
            {
                try
                {
                    cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                }
                catch { cding = Encoding.Default; }
                m_html = new StreamReader(sm, cding).ReadToEnd();
            }
            else
            {
                // no charset in the headers: read with the default encoding, then look for a charset declaration in the HTML
                m_html = new StreamReader(sm, cding).ReadToEnd();
                Regex regex = new Regex("charset=(?<cding>[^=]+)?\"", RegexOptions.IgnoreCase);
                string strcding = regex.Match(m_html).Groups["cding"].Value;
                try
                {
                    cding = Encoding.GetEncoding(strcding);
                }
                catch { cding = Encoding.Default; }
                byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());
                m_html = cding.GetString(bytes);
                if (m_html.Split('?').Length > 100)
                {
                    m_html = Encoding.Default.GetString(bytes);
                }
            }
            m_pagesize = m_html.Length;
            m_uri = rsps.ResponseUri;
            rsps.Close();
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message + m_uri.ToString());
            m_good = false;
        }
    }

    public WebPage(string _url)
    {
        string uurl = "";
        try
        {
            uurl = Uri.UnescapeDataString(_url);
            _url = uurl;
        }
        catch { }
        // URL-encode any non-ASCII (e.g. Chinese) characters in the URL as GB2312
        Regex re = new Regex("(?<h>[^\x00-\xff]+)");
        Match mc = re.Match(_url);
        if (mc.Success)
        {
            string han = mc.Groups["h"].Value;
            _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
        }
        Init(_url);
    }

    public WebPage(string _url, string _loginurl, string _post)
    {
        string uurl = "";
        try
        {
            uurl = Uri.UnescapeDataString(_url);
            _url = uurl;
        }
        catch { }
        // URL-encode any non-ASCII (e.g. Chinese) characters in the URL as GB2312
        Regex re = new Regex("(?<h>[^\x00-\xff]+)");
        Match mc = re.Match(_url);
        if (mc.Success)
        {
            string han = mc.Groups["h"].Value;
            _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
        }
        if (_loginurl.Trim() == "" || _post.Trim() == "" || WebPage.webcookies.ContainsKey(new Uri(_url).Host))
        {
            Init(_url);
        }
        else
        {
            #region Log in
            string indata = _post;
            m_post = _post;
            m_loginurl = _loginurl;
            byte[] bytes = Encoding.Default.GetBytes(_post);
            // create a CookieContainer to hold the cookie collection
            CookieContainer myCookieContainer = new CookieContainer();
            try
            {
                // create an HttpWebRequest for the login page
                HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);
                myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
                myHttpWebRequest.AllowAutoRedirect = false;
                myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                myHttpWebRequest.Timeout = 60000;
                myHttpWebRequest.KeepAlive = true;
                myHttpWebRequest.ContentLength = bytes.Length;
                myHttpWebRequest.Method = "POST";
                // attach the CookieContainer created above to the request
                myHttpWebRequest.CookieContainer = myCookieContainer;
                Stream myRequestStream = myHttpWebRequest.GetRequestStream();
                myRequestStream.Write(bytes, 0, bytes.Length);
                myRequestStream.Close();
                HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();
                foreach (Cookie ck in myHttpWebResponse.Cookies)
                {
                    myCookieContainer.Add(ck);
                }
                myHttpWebResponse.Close();
            }
            catch
            {
                Init(_url);
                return;
            }
            #endregion

            #region Fetch the page after logging in
            try
            {
                m_uri = new Uri(_url);
                m_links = new List<Link>();
                m_html = "";
                m_outstr = "";
                m_title = "";
                m_good = true;
                if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
                {
                    m_good = false;
                    return;
                }
                HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
                rqst.AllowAutoRedirect = true;
                rqst.MaximumAutomaticRedirections = 3;
                rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                rqst.KeepAlive = true;
                rqst.Timeout = 30000;
                rqst.CookieContainer = myCookieContainer;
                lock (WebPage.webcookies)
                {
                    WebPage.webcookies[m_uri.Host] = myCookieContainer;
                }
                HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
                Stream sm = rsps.GetResponseStream();
                if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
                {
                    rsps.Close();
                    m_good = false;
                    return;
                }
                Encoding cding = System.Text.Encoding.Default;
                int ix = rsps.ContentType.ToLower().IndexOf("charset=");
                if (ix != -1)
                {
                    try
                    {
                        cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                    }
                    catch { cding = Encoding.Default; }
                }
                m_html = new StreamReader(sm, cding).ReadToEnd();
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                rsps.Close();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message + m_uri.ToString());
                m_good = false;
            }
            #endregion
        }
    }
    #endregion

    #region Properties
    /// <summary>
    /// The URL of this page (read-only).
    /// </summary>
    public string URL
    {
        get { return m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// The title of this page (read-only).
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                Match mc = reg.Match(m_html);
                if (mc.Success) m_title = mc.Groups["title"].Value.Trim();
            }
            return m_title;
        }
    }

    /// <summary>
    /// All links found on this page (read-only).
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0) getLinks();
            return m_links;
        }
    }

    /// <summary>
    /// The full plain text of this page (read-only).
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "") getContext(Int16.MaxValue);
            return m_outstr;
        }
    }

    /// <summary>
    /// The size of this page.
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// All links on this page that point back to the same site.
    /// </summary>
    public List<Link> InsiteLinks
    {
        get { return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue); }
    }

    /// <summary>
    /// Whether this page is usable.
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>
    /// The host (site) this page belongs to.
    /// </summary>
    public string Host
    {
        get { return m_uri.Host; }
    }

    /// <summary>
    /// The POST data required by this page's login page.
    /// </summary>
    public string PostStr
    {
        get { return m_post; }
    }

    /// <summary>
    /// This page's login page.
    /// </summary>
    public string LoginURL
    {
        get { return m_loginurl; }
    }
    #endregion
}

/// <summary>
/// Link class
/// </summary>
public class Link
{
    public string url;   // link URL
    public string text;  // link text
    public Link(string _url, string _text)
    {
        url = _url;
        text = _text;
    }
}
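A minimal usage sketch follows, assuming the class above is compiled into the same project; the URLs and the POST string are placeholders, not real endpoints:

using System;

class WebPageDemo
{
    static void Main()
    {
        // Plain fetch, no login required (example.com is a placeholder).
        WebPage page = new WebPage("http://www.example.com/");
        if (page.IsGood)
        {
            Console.WriteLine(page.Title);            // page title
            Console.WriteLine(page.getContext(200));  // first 200 characters of plain text
            foreach (Link lnk in page.getSpecialLinksByUrl("^http://", 10))
                Console.WriteLine(lnk.text + " -> " + lnk.url);
        }

        // Fetch a page behind a login form; the login URL and POST body are hypothetical.
        WebPage member = new WebPage("http://www.example.com/member.aspx",
                                     "http://www.example.com/login.aspx",
                                     "user=demo&pass=demo");
        if (member.IsGood)
            Console.WriteLine(member.PageSize);       // size of the fetched HTML
    }
}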
Hopefully this article is of some help to your C# programming.