C#基于正则表达式实现获取网页中所有信息的网页抓取类实例

程序员文章站 2024-02-07 14:42:46

本文实例讲述了c#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考，具体如下：类的代码： using system; using sy...

本文实例讲述了C#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考，具体如下：

类的代码：

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;
/// <summary>
/// Represents one downloaded web page and exposes its title, links and plain
/// text, all of which are extracted from the HTML with regular expressions.
/// </summary>
/// <remarks>
/// The page is fetched synchronously by the constructor. Public member names are
/// kept exactly as published so that existing callers keep compiling.
/// </remarks>
public class webpage
{
    #region Private members
    // Upper bound on the declared Content-Length that will be downloaded (4 MiB).
    private const int MaxPageBytes = 1 << 22;

    private Uri m_uri;               // page address; replaced by the response address after the download
    private List<Link> m_links;      // links found on the page, filled lazily by getlinks()
    private string m_title;          // cached <title> text, "" until first requested
    private string m_htmlsource;     // HTML source; backs the public m_html property
    private string m_outstr;         // cached plain-text rendering, "" until first requested
    private bool m_good;             // false when the page was skipped or could not be fetched
    private int m_pagesize;          // length of the HTML source in characters
    // Cookie containers shared by every webpage instance, keyed by host name.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
    #endregion

    #region Properties
    /// <summary>
    /// Gets the absolute address of this page (read-only). Returns an empty
    /// string when the address passed to the constructor could not be parsed.
    /// </summary>
    public string url
    {
        get
        {
            return m_uri == null ? "" : m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Gets the text of the page's &lt;title&gt; element (read-only). The value is
    /// extracted on first access and cached; it is empty when no title is found.
    /// </summary>
    public string title
    {
        get
        {
            if (m_title == "")
            {
                // (\w|\W) matches any character including line breaks, so multi-line titles work.
                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                Match mc = reg.Match(m_htmlsource);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Gets the HTML source of the page (read-only). Never returns null.
    /// </summary>
    public string m_html
    {
        get
        {
            if (m_htmlsource == null)
            {
                m_htmlsource = "";
            }
            return m_htmlsource;
        }
    }

    /// <summary>
    /// Gets every link found on the page (read-only). Links are parsed on first access.
    /// </summary>
    public List<Link> links
    {
        get
        {
            if (m_links.Count == 0) getlinks();
            return m_links;
        }
    }

    /// <summary>
    /// Gets the plain text of the page with markup removed (read-only).
    /// The result is capped at <c>Int16.MaxValue</c> characters.
    /// </summary>
    public string context
    {
        get
        {
            if (m_outstr == "") getcontext(Int16.MaxValue);
            return m_outstr;
        }
    }

    /// <summary>
    /// Gets the size of the page, measured as the character count of its HTML source.
    /// </summary>
    public int pagesize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// Gets the links that point to the same host as this page, over http or https.
    /// At most <c>Int16.MaxValue</c> links are returned.
    /// </summary>
    public List<Link> insitelinks
    {
        get
        {
            if (m_uri == null) return new List<Link>();
            // The host is escaped so that its dots are matched literally.
            return getspeciallinksbyurl("^https?://" + Regex.Escape(m_uri.Host), Int16.MaxValue);
        }
    }

    /// <summary>
    /// Gets a value indicating whether the page was downloaded successfully.
    /// </summary>
    public bool isgood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Gets the host name of the site this page belongs to. Returns an empty
    /// string when the address passed to the constructor could not be parsed.
    /// </summary>
    public string host
    {
        get
        {
            return m_uri == null ? "" : m_uri.Host;
        }
    }
    #endregion

    /// <summary>
    /// Parses anchor and frame/iframe links out of the HTML source and caches them.
    /// </summary>
    /// <returns>The cached list of links; empty when none were found.</returns>
    private List<Link> getlinks()
    {
        if (m_links.Count != 0 || m_uri == null)
        {
            return m_links;
        }

        Regex[] regex = new Regex[2];
        // [0] anchors: captures the href and the anchor text; [1] frames/iframes: captures src only.
        regex[0] = new Regex(@"<a\shref\s*=""(?<url>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);
        // Strips tags, whitespace, &nbsp; entities, stray ampersands and quotes from anchor text.
        Regex textcleaner = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase);

        for (int i = 0; i < regex.Length; i++)
        {
            Match match = regex[i].Match(m_htmlsource);
            while (match.Success)
            {
                try
                {
                    // Resolve relative addresses against the page address.
                    string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri);
                    string text = "";
                    // Only the anchor pattern has a text group; it is named "title" in the pattern.
                    if (i == 0) text = textcleaner.Replace(match.Groups["title"].Value, "");
                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = url;
                    m_links.Add(link);
                }
                catch (Exception ex)
                {
                    // A malformed link must not abort the scan; report it and move on.
                    Console.WriteLine(ex.Message);
                }
                match = match.NextMatch();
            }
        }
        return m_links;
    }

    /// <summary>
    /// Extracts plain text from an HTML string and returns its first characters.
    /// The stripped text is cached in the instance, so the arguments of the first
    /// call decide what later calls return.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstn">Maximum number of characters to return.</param>
    /// <param name="withlink">Whether the text inside anchors is kept.</param>
    /// <returns>The plain text, truncated to <paramref name="firstn"/> characters.</returns>
    private string getfirstnchar(string instr, int firstn, bool withlink)
    {
        if (m_outstr == "")
        {
            RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
            string text = instr ?? "";
            // Drop elements whose content is never visible text.
            text = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", options).Replace(text, "");
            text = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", options).Replace(text, "");
            text = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", options).Replace(text, "");
            if (!withlink)
            {
                text = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", options).Replace(text, "");
            }
            // Remove the remaining tags and &nbsp; entities, then collapse whitespace runs.
            text = new Regex("(<[^>]+?>)|&nbsp;", options).Replace(text, "");
            text = new Regex("(\\s)+", options).Replace(text, " ");
            m_outstr = text;
        }
        return m_outstr.Length > firstn ? m_outstr.Substring(0, firstn) : m_outstr;
    }

    #region Public methods
    /// <summary>
    /// Returns the first characters of the page's plain text, anchor text included.
    /// </summary>
    /// <param name="firstn">Maximum number of characters to return.</param>
    /// <returns>The plain text, truncated to <paramref name="firstn"/> characters.</returns>
    public string getcontext(int firstn)
    {
        return getfirstnchar(m_htmlsource, firstn, true);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose address matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link address.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links in page order.</returns>
    public List<Link> getspeciallinksbyurl(string pattern, int count)
    {
        if (m_links.Count == 0) getlinks();
        List<Link> speciallinks = new List<Link>();
        // Build the expression once instead of once per link.
        Regex matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        foreach (Link link in m_links)
        {
            if (speciallinks.Count >= count) break;
            if (matcher.Match(link.NavigateUrl).Success)
            {
                speciallinks.Add(link);
            }
        }
        return speciallinks;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link text.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links in page order.</returns>
    public List<Link> getspeciallinksbytext(string pattern, int count)
    {
        if (m_links.Count == 0) getlinks();
        List<Link> speciallinks = new List<Link>();
        // Build the expression once instead of once per link.
        Regex matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        foreach (Link link in m_links)
        {
            if (speciallinks.Count >= count) break;
            if (matcher.Match(link.Text).Success)
            {
                speciallinks.Add(link);
            }
        }
        return speciallinks;
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression and returns the
    /// first capture group of the first match (by 何问起).
    /// </summary>
    /// <param name="pattern">Regular expression; the wanted text must be its first capture group.</param>
    /// <returns>The captured text, or an empty string when nothing matches.</returns>
    public string getspecialwords(string pattern)
    {
        if (m_outstr == "") getcontext(Int16.MaxValue);
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        if (mc.Success)
        {
            return mc.Groups[1].Value;
        }
        return string.Empty;
    }
    #endregion

    #region Construction
    /// <summary>
    /// Downloads the page at <paramref name="_url"/> and fills the instance fields.
    /// Any failure is reported through <see cref="isgood"/> instead of an exception.
    /// </summary>
    /// <param name="_url">Absolute address of the page.</param>
    private void init(string _url)
    {
        // Put every field into a usable state first, so the properties never hit
        // a null reference even when the download below fails.
        m_links = new List<Link>();
        m_htmlsource = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = true;
        try
        {
            m_uri = new Uri(_url);
            // Archives and installers are never HTML; skip them without a request.
            if (_url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "mozilla/4.0 (compatible; msie 5.01; windows nt 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            // Reuse one cookie container per host across all instances.
            lock (webpage.webcookies)
            {
                CookieContainer cc;
                if (!webpage.webcookies.TryGetValue(m_uri.Host, out cc))
                {
                    cc = new CookieContainer();
                    webpage.webcookies[m_uri.Host] = cc;
                }
                rqst.CookieContainer = cc;
            }

            // "using" releases the connection on every exit path, including exceptions.
            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contenttype = rsps.ContentType ?? "";
                if (!contenttype.StartsWith("text/", StringComparison.OrdinalIgnoreCase) || rsps.ContentLength > MaxPageBytes)
                {
                    m_good = false;
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = ReadAllBytes(sm);
                }

                // Depending on the site the text may additionally need HttpUtility.HtmlDecode.
                m_htmlsource = DecodeHtml(raw, contenttype);
                m_pagesize = m_htmlsource.Length;
                m_uri = rsps.ResponseUri;
            }
        }
        catch (Exception ex)
        {
            // The constructor stays non-throwing; failure is surfaced through isgood.
            m_good = false;
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Reads a stream to its end and returns the bytes.
    /// </summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes the response bytes, taking the charset from the Content-Type header
    /// when present and otherwise from a charset declaration inside the HTML.
    /// </summary>
    private static string DecodeHtml(byte[] raw, string contenttype)
    {
        Encoding fallback = Encoding.Default;

        int ix = contenttype.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
        if (ix != -1)
        {
            string name = contenttype.Substring(ix + "charset=".Length);
            // Cut off any following header parameter and surrounding quotes.
            int end = name.IndexOf(';');
            if (end != -1) name = name.Substring(0, end);
            name = name.Trim().Trim('"', '\'');
            return GetEncodingOrDefault(name, fallback).GetString(raw);
        }

        // No charset in the header: decode once with the default encoding only to
        // find the declaration, then decode the raw bytes again with the declared
        // encoding. Decoding from the bytes avoids a lossy text-to-bytes round trip.
        string html = fallback.GetString(raw);
        Match declared = Regex.Match(html, "charset\\s*=\\s*[\"']?(?<cding>[\\w\\-]+)", RegexOptions.IgnoreCase);
        if (declared.Success)
        {
            string decoded = GetEncodingOrDefault(declared.Groups["cding"].Value, fallback).GetString(raw);
            // NOTE(review): many '?' characters are treated as a sign of a wrong
            // encoding and the default decoding is kept; pages with many literal
            // '?' (query strings) also trip this heuristic - confirm the threshold.
            if (decoded.Split('?').Length <= 100)
            {
                html = decoded;
            }
        }
        return html;
    }

    /// <summary>
    /// Looks up an encoding by name, returning <paramref name="fallback"/> when the
    /// name is empty or unknown.
    /// </summary>
    private static Encoding GetEncodingOrDefault(string name, Encoding fallback)
    {
        if (string.IsNullOrEmpty(name)) return fallback;
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return fallback;
        }
        catch (NotSupportedException)
        {
            return fallback;
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately. Check
    /// <see cref="isgood"/> afterwards to find out whether the download worked.
    /// </summary>
    /// <param name="_url">Absolute address of the page; percent-escapes are decoded first.</param>
    public webpage(string _url)
    {
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Keep the address exactly as supplied when it cannot be unescaped.
        }
        init(_url);
    }
    #endregion
}

调用：

webpage webinfo = new webpage("http://hovertree.net/");
string text = webinfo.context; // all page content with the HTML tags removed
string html = webinfo.m_html;  // the page content including HTML tags (by 何问起)

ps：这里再为大家提供2款非常方便的正则表达式工具供大家参考使用：

JavaScript正则表达式在线测试工具：

正则表达式在线生成工具：

更多关于C#相关内容感兴趣的读者可查看本站专题：《C#正则表达式用法总结》、《C#编码操作技巧总结》、《C#中XML文件操作技巧汇总》、《C#常见控件用法教程》、《WinForm控件用法总结》、《C#数据结构与算法教程》、《C#面向对象程序设计入门教程》及《C#程序设计之线程使用技巧总结》

希望本文所述对大家C#程序设计有所帮助。

上一篇：基于Python的Post请求数据爬取的方法详解

下一篇： Java基础教程之HashMap迭代删除使用方法