欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

C#基于正则表达式实现获取网页中所有信息的网页抓取类实例

程序员文章站 2024-02-07 14:42:46
本文实例讲述了C#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考,具体如下: 类的代码: using System; using Sy...

本文实例讲述了C#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考,具体如下:

类的代码:

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;
/// <summary>
/// Downloads a single web page and exposes its title, links and plain text,
/// all extracted from the HTML with regular expressions.
/// </summary>
/// <remarks>
/// Public member names are kept exactly as published so that existing callers
/// (including the usage sample that follows the listing) keep working; framework
/// identifiers use their canonical .NET casing because C# is case-sensitive.
/// </remarks>
public class webpage
{
    #region Private members
    /// <summary>Upper bound, in bytes, for a page body that will be downloaded (4 MiB).</summary>
    private const int maxpagebytes = 1 << 22;

    private Uri m_uri;            // address of the page; replaced by the final address after redirects
    private List<Link> m_links;   // links found on the page, filled lazily by getlinks()
    private string m_title;       // cached <title> text ("" until extracted)
    private string m_htmlText;    // raw HTML; deliberately not named m_html, which is the public property
    private string m_outstr;      // cached plain-text rendering of the page ("" until extracted)
    private bool m_good;          // true only after the page has been downloaded and decoded
    private int m_pagesize;       // length of the decoded HTML, in characters

    // One cookie container per host, shared by every webpage instance; guarded by lock(webcookies).
    private static readonly Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
    #endregion

    #region Properties
    /// <summary>
    /// Absolute address of this page (read-only). Empty when the address given to the
    /// constructor could not be parsed.
    /// </summary>
    public string url
    {
        get { return m_uri == null ? string.Empty : m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// Text of the page's &lt;title&gt; element (read-only). Empty when the page has no title.
    /// </summary>
    public string title
    {
        get
        {
            if (m_title == "")
            {
                // [\s\S] matches any character including line breaks, so multi-line titles are found.
                Regex reg = new Regex(@"<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.IgnoreCase);
                Match mc = reg.Match(m_htmlText);
                if (mc.Success)
                    m_title = mc.Groups["title"].Value.Trim();
            }
            return m_title;
        }
    }

    /// <summary>
    /// Raw HTML of the page, tags included (read-only). Never null.
    /// </summary>
    public string m_html
    {
        get { return m_htmlText ?? ""; }
    }

    /// <summary>
    /// All links found on this page (read-only). Extracted on first access.
    /// </summary>
    public List<Link> links
    {
        get { return getlinks(); }
    }

    /// <summary>
    /// The complete plain text of this page with all markup removed (read-only).
    /// </summary>
    public string context
    {
        get { return getcontext(int.MaxValue); }
    }

    /// <summary>
    /// Size of this page: the number of characters in the decoded HTML.
    /// </summary>
    public int pagesize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// Links on this page that point to the page's own host, over http or https.
    /// </summary>
    public List<Link> insitelinks
    {
        get
        {
            if (m_uri == null)
                return new List<Link>();

            // The host is escaped so its dots match literally, and the look-ahead stops
            // "example.com" from also matching "example.com.other.org".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
            return getspeciallinksbyurl(pattern, int.MaxValue);
        }
    }

    /// <summary>
    /// Whether this page was downloaded and decoded successfully.
    /// </summary>
    public bool isgood
    {
        get { return m_good; }
    }

    /// <summary>
    /// Host name of the site this page belongs to. Empty when the address could not be parsed.
    /// </summary>
    public string host
    {
        get { return m_uri == null ? string.Empty : m_uri.Host; }
    }
    #endregion

    /// <summary>
    /// Extracts link information from the HTML: anchors (address and text) and
    /// frame/iframe sources (address only). The result is cached in m_links.
    /// </summary>
    /// <returns>The list of links found on the page.</returns>
    private List<Link> getlinks()
    {
        if (m_links.Count > 0)
            return m_links;

        Regex[] patterns = new Regex[]
        {
            new Regex(@"<a\shref\s*=""(?<url>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline),
            new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase)
        };
        // Removed from anchor text: nested tags, whitespace, the &nbsp; entity, stray ampersands and quotes.
        Regex noise = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.IgnoreCase);

        for (int i = 0; i < patterns.Length; i++)
        {
            for (Match m = patterns[i].Match(m_htmlText); m.Success; m = m.NextMatch())
            {
                try
                {
                    // Resolve relative addresses against the page address.
                    string address = HttpUtility.UrlDecode(new Uri(m_uri, m.Groups["url"].Value).AbsoluteUri);

                    Link link = new Link();
                    // Only the anchor pattern captures text, and it captures it in the group named "title".
                    link.Text = i == 0 ? noise.Replace(m.Groups["title"].Value, "") : "";
                    link.NavigateUrl = address;
                    m_links.Add(link);
                }
                catch (UriFormatException ex)
                {
                    // A malformed href must not abort the scan; report it and move on.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Reduces HTML to plain text and returns its first characters. The plain text is
    /// computed once and cached in m_outstr; later calls only take a prefix of the cache.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstn">Maximum number of characters to return.</param>
    /// <param name="withlink">Whether the text inside anchors is kept.</param>
    /// <returns>The plain text, truncated to <paramref name="firstn"/> characters.</returns>
    private string getfirstnchar(string instr, int firstn, bool withlink)
    {
        if (m_outstr == "")
        {
            const RegexOptions opts = RegexOptions.IgnoreCase;
            string text = instr;

            // Drop elements whose content is never visible text. \b keeps "<a" from
            // matching other tags that merely start with the same letters.
            text = Regex.Replace(text, @"<script\b[^>]*>[\s\S]*?</script[^>]*>", "", opts);
            text = Regex.Replace(text, @"<style\b[^>]*>[\s\S]*?</style[^>]*>", "", opts);
            text = Regex.Replace(text, @"<select\b[^>]*>[\s\S]*?</select[^>]*>", "", opts);
            if (!withlink)
                text = Regex.Replace(text, @"<a\b[^>]*>[\s\S]*?</a[^>]*>", "", opts);

            // Strip the remaining tags and the &nbsp; entity, then collapse whitespace runs.
            text = Regex.Replace(text, "(<[^>]+?>)|&nbsp;", "", opts);
            text = Regex.Replace(text, "(\\s)+", " ");
            m_outstr = text;
        }
        return m_outstr.Length > firstn ? m_outstr.Substring(0, firstn) : m_outstr;
    }

    #region Public methods
    /// <summary>
    /// Returns the first characters of the page's plain text, anchor text included.
    /// </summary>
    /// <param name="firstn">Maximum number of characters to return.</param>
    /// <returns>The plain text, truncated to <paramref name="firstn"/> characters.</returns>
    public string getcontext(int firstn)
    {
        return getfirstnchar(m_htmlText, firstn, true);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose address matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression (matched with Multiline and IgnoreCase).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getspeciallinksbyurl(string pattern, int count)
    {
        // Compile the filter once instead of once per link.
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        List<Link> speciallinks = new List<Link>();
        foreach (Link candidate in getlinks())
        {
            if (speciallinks.Count >= count)
                break;
            if (filter.IsMatch(candidate.NavigateUrl))
                speciallinks.Add(candidate);
        }
        return speciallinks;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression (matched with Multiline and IgnoreCase).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getspeciallinksbytext(string pattern, int count)
    {
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        List<Link> speciallinks = new List<Link>();
        foreach (Link candidate in getlinks())
        {
            if (speciallinks.Count >= count)
                break;
            if (filter.IsMatch(candidate.Text))
                speciallinks.Add(candidate);
        }
        return speciallinks;
    }

    /// <summary>
    /// Finds the first place in the page's plain text that matches a regular expression
    /// and returns the text captured by its first group. (by 何问起)
    /// </summary>
    /// <param name="pattern">Regular expression (matched with Multiline and IgnoreCase).</param>
    /// <returns>The value of capture group 1, or an empty string when nothing matches.</returns>
    public string getspecialwords(string pattern)
    {
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(getcontext(int.MaxValue));
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }
    #endregion

    #region Construction
    /// <summary>
    /// Downloads and decodes the page. Any failure leaves the instance in a usable,
    /// empty state with isgood == false.
    /// </summary>
    /// <param name="_url">Absolute address of the page.</param>
    private void init(string _url)
    {
        // Establish the empty state first so every property works even if the address
        // cannot be parsed or the download fails.
        m_links = new List<Link>();
        m_htmlText = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);

            // Well-known binary downloads are never fetched.
            if (_url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            // Reuse one cookie container per host across all pages.
            lock (webcookies)
            {
                CookieContainer jar;
                if (!webcookies.TryGetValue(m_uri.Host, out jar))
                {
                    jar = new CookieContainer();
                    webcookies[m_uri.Host] = jar;
                }
                rqst.CookieContainer = jar;
            }

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contenttype = rsps.ContentType ?? "";
                if (!contenttype.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > maxpagebytes)
                {
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = readbytes(sm, maxpagebytes);
                }
                if (raw == null)
                    return; // body larger than the limit although no Content-Length announced it

                // Decoding is applied as-is; some sites may additionally need HttpUtility.HtmlDecode.
                m_htmlText = decodepage(raw, contenttype);
                m_pagesize = m_htmlText.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception)
        {
            // Best effort: a page that cannot be fetched is reported through isgood, not thrown.
            m_good = false;
        }
    }

    /// <summary>
    /// Reads a stream to its end, giving up once more than <paramref name="limit"/> bytes arrive.
    /// </summary>
    /// <returns>The bytes read, or null when the limit was exceeded.</returns>
    private static byte[] readbytes(Stream input, int limit)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int got;
            while ((got = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                if (buffer.Length + got > limit)
                    return null;
                buffer.Write(chunk, 0, got);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes the page body. The charset comes from the Content-Type header when it names
    /// a known encoding, otherwise from the first "charset=" declaration inside the markup,
    /// otherwise the system default encoding is used.
    /// </summary>
    private static string decodepage(byte[] raw, string contenttype)
    {
        Encoding cding = findencoding(headercharset(contenttype));
        bool sniffed = false;

        if (cding == null)
        {
            // ASCII is enough to read the declaration itself; the real decoding happens below
            // from the untouched bytes, so nothing is lost by this preview.
            string preview = Encoding.ASCII.GetString(raw);
            Match m = Regex.Match(preview, @"charset\s*=\s*[""']?(?<cding>[\w.:-]+)", RegexOptions.IgnoreCase);
            if (m.Success)
                cding = findencoding(m.Groups["cding"].Value);
            sniffed = cding != null;
        }
        if (cding == null)
            cding = Encoding.Default;

        string text = cding.GetString(raw);

        if (sniffed)
        {
            // Many replacement characters mean the charset declared in the markup was wrong;
            // fall back to the system default encoding.
            int bad = 0;
            foreach (char c in text)
            {
                if (c == '\uFFFD')
                    bad++;
            }
            if (bad >= 100)
                text = Encoding.Default.GetString(raw);
        }
        return text;
    }

    /// <summary>
    /// Extracts the charset parameter from a Content-Type header value, without quotes or
    /// trailing parameters. Returns an empty string when there is none.
    /// </summary>
    private static string headercharset(string contenttype)
    {
        const string key = "charset=";
        int ix = contenttype.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (ix < 0)
            return "";

        string value = contenttype.Substring(ix + key.Length);
        int end = value.IndexOf(';');
        if (end >= 0)
            value = value.Substring(0, end);
        return value.Trim().Trim('"', '\'');
    }

    /// <summary>
    /// Looks up an encoding by name; returns null instead of throwing when the name is
    /// empty, unknown or unsupported.
    /// </summary>
    private static Encoding findencoding(string name)
    {
        if (string.IsNullOrEmpty(name))
            return null;
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Creates the page object and immediately downloads the page. Failures do not throw;
    /// check isgood afterwards.
    /// </summary>
    /// <param name="_url">Absolute address of the page; percent-escapes are decoded first.</param>
    public webpage(string _url)
    {
        string address = _url;
        try
        {
            // NOTE(review): this also decodes escaped reserved characters (e.g. %26), which can
            // change the meaning of a query string; kept because it is the published behaviour.
            address = Uri.UnescapeDataString(_url);
        }
        catch (ArgumentNullException)
        {
            // A null address is rejected inside init, which marks the page as not good.
        }
        catch (UriFormatException)
        {
            // Keep the address exactly as given when it cannot be unescaped.
        }
        init(address);
    }
    #endregion
}

调用:

webpage webinfo = new webpage("http://hovertree.net/");
string plaintext = webinfo.context; // all page content with the HTML tags stripped
string rawhtml = webinfo.m_html;    // page content including the HTML tags (by 何问起)

ps:这里再为大家提供2款非常方便的正则表达式工具供大家参考使用:

javascript正则表达式在线测试工具:

正则表达式在线生成工具:

对更多C#相关内容感兴趣的读者可查看本站专题:《C#正则表达式用法总结》、《C#编码操作技巧总结》、《C#中XML文件操作技巧汇总》、《C#常见控件用法教程》、《WinForm控件用法总结》、《C#数据结构与算法教程》、《C#面向对象程序设计入门教程》及《C#程序设计之线程使用技巧总结》。

希望本文所述对大家的C#程序设计有所帮助。