
An Example C# Class for Fetching and Parsing Web Pages


This article presents a C# class for fetching and parsing web pages, shared here for your reference. The details are as follows:

The class below fetches a web page and parses it.

Its main features are:

1. Extract the page's plain text, stripping all HTML tags and JavaScript code
2. Extract the page's links, including href targets as well as frame and iframe sources
3. Extract the page's title, etc. (other tags can be handled the same way; the regular expression is analogous)
4. Support simple form submission and cookie persistence

A brief usage sketch follows the full source below.

/*
* Author: sunjoy at CCNU
* If you improve this class, please send me a copy of the code (ccnusjy at gmail.com)
*/
using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
/// <summary>
/// Web page class
/// </summary>
public class WebPage
{
 #region Private members
 private Uri m_uri;   // the page URL
 private List<Link> m_links; // links found on this page
 private string m_title;  // the page title
 private string m_html;  // the raw HTML of the page
 private string m_outstr; // the plain text extracted from the page
 private bool m_good;  // whether the page is usable
 private int m_pagesize;  // the size of the page
 private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>(); // cookies of all pages, keyed by host
 private string m_post;  // POST data required by this page's login page
 private string m_loginurl; // this page's login page
 #endregion
 #region Private methods
 /// <summary>
 /// Parses the link information out of the page's HTML.
 /// </summary>
 /// <returns>List<Link></returns>
 private List<Link> GetLinks()
 {
  if (m_links.Count == 0)
  {
   Regex[] regex = new Regex[2];
   regex[0] = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);
   regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
   for (int i = 0; i < 2; i++)
   {
    Match match = regex[i].Match(m_html);
    while (match.Success)
    {
     try
     {
      string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
      string text = "";
      if (i == 0) text = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&amp;|&quot;", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");
      Link link = new Link(url, text);
      m_links.Add(link);
     }
     catch (Exception ex) { Console.WriteLine(ex.Message); }
     match = match.NextMatch();
    }
   }
  }
  return m_links;
 }
 /// <summary>
 /// Extracts up to a given number of characters of plain text from a piece of HTML.
 /// </summary>
 /// <param name="instr">HTML source</param>
 /// <param name="firstN">how many characters to take from the start</param>
 /// <param name="withLink">whether to include the text inside links</param>
 /// <returns>plain text</returns>
 private string GetFirstNchar(string instr, int firstN, bool withLink)
 {
  if (m_outstr == "")
  {
   m_outstr = instr.Clone() as string;
   m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
   m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
   m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
   if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
   Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
   m_outstr = objReg.Replace(m_outstr, "");
   Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
   m_outstr = objReg2.Replace(m_outstr, " ");
  }
  return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
 }
 /// <summary>
 /// Returns the unsigned integer corresponding to an IP address.
 /// </summary>
 /// <param name="x">IP address</param>
 /// <returns></returns>
 private uint GetUintFromIP(IPAddress x)
 {
  byte[] bt = x.GetAddressBytes();
  uint i = (uint)(bt[0] * 256 * 256 * 256);
  i += (uint)(bt[1] * 256 * 256);
  i += (uint)(bt[2] * 256);
  i += (uint)(bt[3]);
  return i;
 }
 #endregion
 #region Public methods
 /// <summary>
 /// Extracts up to a given number of characters of plain text from the page, including link text.
 /// </summary>
 /// <param name="firstN">number of characters</param>
 /// <returns></returns>
 public string GetContext(int firstN)
 {
  return GetFirstNchar(m_html, firstN, true);
 }
 /// <summary>
 /// Extracts up to a given number of characters of plain text from the page, excluding link text.
 /// </summary>
 /// <param name="firstN"></param>
 /// <returns></returns>
 public string GetContextWithOutLink(int firstN)
 {
  return GetFirstNchar(m_html, firstN, false);
 }
 /// <summary>
 /// Returns up to a given number of the page's links whose URL matches a regular expression.
 /// </summary>
 /// <param name="pattern">regular expression</param>
 /// <param name="count">maximum number of links to return</param>
 /// <returns>List<Link></returns>
 public List<Link> GetSpecialLinksByUrl(string pattern, int count)
 {
  if (m_links.Count == 0) GetLinks();
  List<Link> specialLinks = new List<Link>();
  List<Link>.Enumerator i;
  i = m_links.GetEnumerator();
  int cnt = 0;
  while (i.MoveNext() && cnt < count)
  {
   if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.Url).Success)
   {
    specialLinks.Add(i.Current);
    cnt++;
   }
  }
  return specialLinks;
 }
 /// <summary>
 /// Returns up to a given number of the page's links whose text matches a regular expression.
 /// </summary>
 /// <param name="pattern">regular expression</param>
 /// <param name="count">maximum number of links to return</param>
 /// <returns>List<Link></returns>
 public List<Link> GetSpecialLinksByText(string pattern, int count)
 {
  if (m_links.Count == 0) GetLinks();
  List<Link> specialLinks = new List<Link>();
  List<Link>.Enumerator i;
  i = m_links.GetEnumerator();
  int cnt = 0;
  while (i.MoveNext() && cnt < count)
  {
   if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.Text).Success)
   {
    specialLinks.Add(i.Current);
    cnt++;
   }
  }
  return specialLinks;
 }
 /// <summary>
 /// Returns all links whose host resolves to an IP address within a given range.
 /// </summary>
 /// <param name="_ip_start">start IP</param>
 /// <param name="_ip_end">end IP</param>
 /// <returns></returns>
 public List<Link> GetSpecialLinksByIP(string _ip_start, string _ip_end)
 {
  IPAddress ip_start = IPAddress.Parse(_ip_start);
  IPAddress ip_end = IPAddress.Parse(_ip_end);
  if (m_links.Count == 0) GetLinks();
  List<Link> specialLinks = new List<Link>();
  List<Link>.Enumerator i;
  i = m_links.GetEnumerator();
  while (i.MoveNext())
  {
   IPAddress ip;
   try
   {
    ip = Dns.GetHostEntry(new Uri(i.Current.Url).Host).AddressList[0];
   }
   catch { continue; }
   if (GetUintFromIP(ip) >= GetUintFromIP(ip_start) && GetUintFromIP(ip) <= GetUintFromIP(ip_end))
   {
    specialLinks.Add(i.Current);
   }
  }
  return specialLinks;
 }
 /// <summary>
 /// Returns the part of the page's plain text that matches a regular expression.
 /// </summary>
 /// <param name="pattern">regular expression</param>
 /// <returns>the matched text</returns>
 public string GetSpecialWords(string pattern)
 {
  if (m_outstr == "") GetContext(Int16.MaxValue);
  Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
  Match mc = regex.Match(m_outstr);
  if (mc.Success)
   return mc.Groups[1].Value;
  return string.Empty;
 }
 #endregion
 #region Constructors
 private void Init(string _url)
 {
  try
  {
   m_uri = new Uri(_url);
   m_links = new List<Link>();
   m_html = "";
   m_outstr = "";
   m_title = "";
   m_good = true;
   if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
   {
    m_good = false;
    return;
   }
   HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
   rqst.AllowAutoRedirect = true;
   rqst.MaximumAutomaticRedirections = 3;
   rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
   rqst.KeepAlive = true;
   rqst.Timeout = 30000;
   lock (WebPage.webcookies)
   {
    if (WebPage.webcookies.ContainsKey(m_uri.Host))
     rqst.CookieContainer = WebPage.webcookies[m_uri.Host];
    else
    {
     CookieContainer cc = new CookieContainer();
     WebPage.webcookies[m_uri.Host] = cc;
     rqst.CookieContainer = cc;
    }
   }
   HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
   Stream sm = rsps.GetResponseStream();
   if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
   {
    rsps.Close();
    m_good = false;
    return;
   }
   Encoding cding = System.Text.Encoding.Default;
   string contenttype = rsps.ContentType.ToLower();
   int ix = contenttype.IndexOf("charset=");
   if (ix != -1)
   {
    try
    {
     cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
    }
    catch
    {
     cding = Encoding.Default;
    }
    m_html = new StreamReader(sm, cding).ReadToEnd();
   }
   else
   {
    m_html = new StreamReader(sm, cding).ReadToEnd();
    Regex regex = new Regex("charset=(?<cding>[^=]+)?\"", RegexOptions.IgnoreCase);
    string strcding = regex.Match(m_html).Groups["cding"].Value;
    try
    {
     cding = Encoding.GetEncoding(strcding);
    }
    catch
    {
     cding = Encoding.Default;
    }
    byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());
    m_html = cding.GetString(bytes);
    if (m_html.Split('?').Length > 100)
    {
     m_html = Encoding.Default.GetString(bytes);
    }
   }
   m_pagesize = m_html.Length;
   m_uri = rsps.ResponseUri;
   rsps.Close();
  }
  catch (Exception ex)
  {
   Console.WriteLine(ex.Message + m_uri.ToString());
   m_good = false;
  }
 }
 public WebPage(string _url)
 {
  string uurl = "";
  try
  {
   uurl = Uri.UnescapeDataString(_url);
   _url = uurl;
  }
  catch { }
  Regex re = new Regex("(?<h>[^\x00-\xff]+)");
  Match mc = re.Match(_url);
  if (mc.Success)
  {
   string han = mc.Groups["h"].Value;
   _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("gb2312")));
  }
  Init(_url);
 }
 public WebPage(string _url, string _loginurl, string _post)
 {
  string uurl = "";
  try
  {
   uurl = Uri.UnescapeDataString(_url);
   _url = uurl;
  }
  catch { }
  Regex re = new Regex("(?<h>[^\x00-\xff]+)");
  Match mc = re.Match(_url);
  if (mc.Success)
  {
   string han = mc.Groups["h"].Value;
   _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("gb2312")));
  }
  if (_loginurl.Trim() == "" || _post.Trim() == "" || WebPage.webcookies.ContainsKey(new Uri(_url).Host))
  {
   Init(_url);
  }
  else
  {
   #region Log in
   string indata = _post;
   m_post = _post;
   m_loginurl = _loginurl;
   byte[] bytes = Encoding.Default.GetBytes(_post);
   CookieContainer myCookieContainer = new CookieContainer();
   try
   {
    // Create a CookieContainer to hold the cookie collection
    HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);
    // Create an HttpWebRequest for the login page
    myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
    myHttpWebRequest.AllowAutoRedirect = false;
    myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
    myHttpWebRequest.Timeout = 60000;
    myHttpWebRequest.KeepAlive = true;
    myHttpWebRequest.ContentLength = bytes.Length;
    myHttpWebRequest.Method = "POST";
    myHttpWebRequest.CookieContainer = myCookieContainer;
    // Point the request's CookieContainer at the myCookieContainer created above
    Stream myRequestStream = myHttpWebRequest.GetRequestStream();
    myRequestStream.Write(bytes, 0, bytes.Length);
    myRequestStream.Close();
    HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();
    foreach (Cookie ck in myHttpWebResponse.Cookies)
    {
     myCookieContainer.Add(ck);
    }
    myHttpWebResponse.Close();
   }
   catch
   {
    Init(_url);
    return;
   }
   #endregion
   #region Fetch the page after logging in
   try
   {
    m_uri = new Uri(_url);
    m_links = new List<Link>();
    m_html = "";
    m_outstr = "";
    m_title = "";
    m_good = true;
    if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
    {
     m_good = false;
     return;
    }
    HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
    rqst.AllowAutoRedirect = true;
    rqst.MaximumAutomaticRedirections = 3;
    rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
    rqst.KeepAlive = true;
    rqst.Timeout = 30000;
    rqst.CookieContainer = myCookieContainer;
    lock (WebPage.webcookies)
    {
     WebPage.webcookies[m_uri.Host] = myCookieContainer;
    }
    HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
    Stream sm = rsps.GetResponseStream();
    if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
    {
     rsps.Close();
     m_good = false;
     return;
    }
    Encoding cding = System.Text.Encoding.Default;
    int ix = rsps.ContentType.ToLower().IndexOf("charset=");
    if (ix != -1)
    {
     try
     {
      cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
     }
     catch
     {
      cding = Encoding.Default;
     }
    }
    m_html = new StreamReader(sm, cding).ReadToEnd();
    m_pagesize = m_html.Length;
    m_uri = rsps.ResponseUri;
    rsps.Close();
   }
   catch (Exception ex)
   {
    Console.WriteLine(ex.Message + m_uri.ToString());
    m_good = false;
   }
   #endregion
  }
 }
 #endregion
 #region Properties
 /// <summary>
 /// The URL of this page (read-only).
 /// </summary>
 public string URL
 {
  get
  {
   return m_uri.AbsoluteUri;
  }
 }
 /// <summary>
 /// The title of this page (read-only).
 /// </summary>
 public string Title
 {
  get
  {
   if (m_title == "")
   {
    Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    Match mc = reg.Match(m_html);
    if (mc.Success)
     m_title = mc.Groups["title"].Value.Trim();
   }
   return m_title;
  }
 }
 /// <summary>
 /// All links found on this page (read-only).
 /// </summary>
 public List<Link> Links
 {
  get
  {
   if (m_links.Count == 0) GetLinks();
   return m_links;
  }
 }
 /// <summary>
 /// The full plain text of this page (read-only).
 /// </summary>
 public string Context
 {
  get
  {
   if (m_outstr == "") GetContext(Int16.MaxValue);
   return m_outstr;
  }
 }
 /// <summary>
 /// The size of this page.
 /// </summary>
 public int PageSize
 {
  get
  {
   return m_pagesize;
  }
 }
 /// <summary>
 /// All links on this page that point to the same site.
 /// </summary>
 public List<Link> InsiteLinks
 {
  get
  {
   return GetSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue);
  }
 }
 /// <summary>
 /// Whether this page is usable.
 /// </summary>
 public bool IsGood
 {
  get
  {
   return m_good;
  }
 }
 /// <summary>
 /// The host of the site this page belongs to.
 /// </summary>
 public string Host
 {
  get
  {
   return m_uri.Host;
  }
 }
 /// <summary>
 /// The POST data required by this page's login page.
 /// </summary>
 public string PostStr
 {
  get
  {
   return m_post;
  }
 }
 /// <summary>
 /// This page's login page.
 /// </summary>
 public string LoginURL
 {
  get
  {
   return m_loginurl;
  }
 }
 #endregion
}
/// <summary>
/// Link class
/// </summary>
public class Link
{
 public string Url;  // link URL
 public string Text; // link text
 public Link(string _url, string _text)
 {
  Url = _url;
  Text = _text;
 }
}
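
To show how the class is intended to be used, here is a minimal usage sketch. The page URL, the login address and the POST string below (www.example.com, login.aspx, username=test&password=test) are placeholders rather than anything from the original article; substitute the real addresses and form fields of your target site.

using System;
// Assumes the WebPage and Link classes above are compiled into the same project.
class Demo
{
 static void Main()
 {
  // Fetch a page anonymously (placeholder URL)
  WebPage page = new WebPage("http://www.example.com/");
  if (page.IsGood)
  {
   Console.WriteLine(page.Title);           // the page title
   Console.WriteLine(page.GetContext(200)); // first 200 characters of plain text
   // up to 10 links whose URL ends with ".html"
   foreach (Link lk in page.GetSpecialLinksByUrl(@"\.html$", 10))
    Console.WriteLine(lk.Url + " " + lk.Text);
  }
  // Fetch a page that sits behind a login form (placeholder login URL and POST data)
  WebPage member = new WebPage("http://www.example.com/member.aspx",
         "http://www.example.com/login.aspx",
         "username=test&password=test");
  Console.WriteLine(member.GetContextWithOutLink(200));
 }
}

Because the class stores one CookieContainer per host in the static webcookies dictionary, any further WebPage instances created for the same host reuse the login session automatically.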

I hope this article is helpful for your C# programming.