C#自写的一个HTML解析类（类似XElement语法）

程序员文章站 2023-11-13 14:01:16

功能： 1、轻松获取指定的html元素。 2、可以根据属性标签进行筛选 3、返回的都是list强类型，无需转换。用过xelement的都知道用来解...

功能：

1、轻松获取指定的html元素。
2、可以根据属性标签进行筛选
3、返回的都是list强类型，无需转换

用过xelement的都知道，它用来解析xml非常方便，但是对于格式多样的html，实在是没办法兼容。

所以我就写了这么一个类似xelement的 xhtmlelement

用法：

string filepath = server.mappath("~/file/test.htm");
      // Load the html source from the file
      string mailbody = filehelper.filetostring(filepath);

      xhtmlelement xh = new xhtmlelement(mailbody);

      // Get the <a> elements directly under <body> whose class attribute equals "icon"
      var link = xh.descendants("body").childdescendants("a").where(c => c.attributes.any(a => a.key == "class" && a.value == "icon")).tolist();

      // Get the <a> elements that carry an href attribute
      var links = xh.descendants("a").where(c => c.attributes.any(a => a.key == "href")).tolist();
      foreach (var r in links)
      {
        response.write(r.attributes.single(c => c.key == "href").value); // write out the href value
      }

      // Get the <img> elements on the nearest level that contains one (a list is returned, not a single element)
      var img = xh.descendants("img");

      // Get the nearest <p> element together with the other <p> elements on that same level
      var ps = xh.descendants("p");

代码：

using system;
using system.collections.generic;
using system.linq;
using system.web;
using system.text;
using system.text.regularexpressions;

namespace syntacticsugar
{
  /// <summary>
  /// Regex-based html parser offering an xelement-like query syntax.
  /// Created: 2015-4-23
  /// Modified: -
  /// Author: sunkaixuan
  /// qq: 610262374 (feedback on naming, syntax etc. is welcome)
  /// </summary>
  public class xhtmlelement
  {
    /// <summary>
    /// Options shared by the tag-matching regexes: html tag names are
    /// case-insensitive and an opening tag may span several lines.
    /// </summary>
    private const regexoptions tagoptions = regexoptions.ignorecase | regexoptions.singleline;

    /// <summary>
    /// Matches the first opening tag of a fragment; group 1 is the tag name.
    /// Closing tags, comments and doctype declarations are skipped because the
    /// name has to start with a letter right after the '&lt;'.
    /// </summary>
    private const string firsttagnamereg = @"<\s{0,5}([a-z][a-z0-9:\-]*)(?=[\s/>])";

    private string _html;

    /// <summary>
    /// Creates a parser over the given html fragment.
    /// </summary>
    /// <param name="html">html source to query</param>
    public xhtmlelement(string html)
    {
      _html = html;
    }

    /// <summary>
    /// Returns the matching elements of the nearest level that contains a match:
    /// the root level is searched first, then the children of each root element
    /// in document order, recursively.
    /// </summary>
    /// <param name="elementname">tag name to look for (case-insensitive); null returns every root element</param>
    /// <returns>the matching elements; an empty list (never null) when nothing matches</returns>
    /// <exception cref="argumentnullexception">the parser was created with a null html string</exception>
    public list<htmlinfo> descendants(string elementname = null)
    {
      if (_html == null)
      {
        // the single-argument constructor expects the parameter NAME, so the
        // message has to go through the two-argument overload
        throw new argumentnullexception("html", "html不能为空！");
      }
      var alllist = rootdescendants(_html);
      var reval = alllist.where(c => matchesname(c, elementname)).tolist();
      if (reval.count == 0)
      {
        reval = getdescendantssource(alllist, elementname);
      }
      return reval;
    }


    /// <summary>
    /// Parses the first-level (root) elements of an html fragment.
    /// </summary>
    /// <param name="html">fragment to parse; null means the html passed to the constructor</param>
    /// <returns>one <see cref="htmlinfo"/> per root element, in the order they were extracted</returns>
    /// <exception cref="argumentnullexception">both <paramref name="html"/> and the constructor html are null</exception>
    public list<htmlinfo> rootdescendants(string html = null)
    {
      /*
       * Approach:
       * 1. take the first opening tag and scan forward for its closing tag; every
       *    nested tag of the same name requires one more closing tag
       * 2. remove that element from the fragment and repeat for the 2nd .. nth element
       */
      if (html == null) html = _html;
      if (html == null)
      {
        throw new argumentnullexception("html", "html不能为空！");
      }

      list<string> elelist = new list<string>();
      list<htmlinfo> reval = new list<htmlinfo>();
      getelementsstringlist(html, ref elelist);
      foreach (var r in elelist)
      {
        htmlinfo data = new htmlinfo();
        data.oldfullhtml = r;
        data.samelevehtml = html;
        // anchored to the element's own opening tag so that h1-h6 and <img/> get
        // their real name instead of a word picked up from the element's text
        data.tagname = regex.match(r, @"^\s*" + firsttagnamereg, regexoptions.ignorecase).groups[1].value;
        // everything between the first '>' and the last '<'
        data.innerhtml = regex.match(r, @"(?<=\>).+(?=<)", regexoptions.singleline).value;
        var elebegin = regex.match(r, "<.+?>", regexoptions.singleline).value;
        data.attributes = new dictionary<string, string>();
        // name and value are captured as groups: splitting on '=' would cut
        // values such as href="page.aspx?id=1" at the wrong place
        foreach (match a in regex.matches(elebegin, @"([\w:\-]+)\=""(.*?)""", regexoptions.singleline))
        {
          string key = a.groups[1].value;
          // a repeated attribute keeps its first value instead of throwing
          if (!data.attributes.containskey(key))
          {
            data.attributes.add(key, a.groups[2].value);
          }
        }
        reval.add(data);
      }
      return reval;

    }





    #region private
    /// <summary>
    /// Case-insensitive, culture-independent tag name test; a null name matches everything.
    /// </summary>
    private static bool matchesname(htmlinfo info, string elementname)
    {
      return elementname == null
        || string.equals(info.tagname, elementname, stringcomparison.ordinalignorecase);
    }

    /// <summary>
    /// Depth-first search below <paramref name="alllist"/>: returns the matches of the
    /// first child level that has any, or an empty list when no level has one.
    /// </summary>
    private list<htmlinfo> getdescendantssource(list<htmlinfo> alllist, string elementname)
    {
      foreach (var r in alllist)
      {
        if (r.innerhtml == null || !r.innerhtml.contains("<")) continue;
        // parse the children once and reuse them for both the filter and the recursion
        var children = rootdescendants(r.innerhtml);
        var matched = children.where(c => matchesname(c, elementname)).tolist();
        if (matched.count == 0)
        {
          matched = getdescendantssource(children, elementname);
        }
        if (matched.count > 0)
        {
          return matched;
        }
      }
      return new list<htmlinfo>();
    }

    /// <summary>
    /// Splits <paramref name="html"/> into the markup of its root elements and appends
    /// each one to <paramref name="elelist"/>. Text between elements is ignored.
    /// </summary>
    /// <exception cref="exception">an element could not be delimited (unbalanced tags)</exception>
    private void getelementsstringlist(string html, ref list<string> elelist)
    {
      // one iteration per root element; the extracted element is removed from html each time
      while (true)
      {
        string tagname = regex.match(html, firsttagnamereg, regexoptions.ignorecase).groups[1].value;
        if (string.isnullorempty(tagname)) return;

        string escapedname = regex.escape(tagname);
        // the lookahead stops <b> from also matching <br>, <p> from matching <pre>, ...
        string currenttagbeginreg = @"<\s{0,10}" + escapedname + @"(?=[\s/>]).*?>";
        string currenttagendreg = @"\<\/" + escapedname + @"\>";

        var firsttag = regex.match(html, currenttagbeginreg, tagoptions);
        string elehtml;
        // case 1: <a/>      self-closing
        // case 2: <a></a>   regular element
        // case 3: <a>       no closing tag (malformed or void element)
        // case 4: <!--[if ...]> ... <![endif]--> conditional comment
        if (firsttag.success && regex.ismatch(firsttag.value, @"/\s*>$"))
        {
          // only the first tag itself is inspected, so a "/>" further down the
          // document can no longer be mistaken for the end of this tag
          elehtml = firsttag.value;
        }
        else if (!regex.ismatch(html, currenttagendreg, tagoptions))
        {
          if (regex.ismatch(html, @"\s{0,10}\<\!\-\-\[if"))
          {
            elehtml = getelementstring(html, @"\s{0,10}\<\!\-\-\[if", @"\[endif\]\-\-\>", 1);
          }
          else
          {
            elehtml = firsttag.value;
          }
        }
        else
        {
          elehtml = getelementstring(html, currenttagbeginreg, currenttagendreg, 1);
        }

        if (string.isnullorempty(elehtml))
        {
          // nothing could be delimited: the tags of this element never balance
          throw new exception("sorry,您的html格式不能解析！！！");
        }

        elelist.add(elehtml);
        // remove only this occurrence; string.replace would also drop identical siblings
        int at = html.indexof(elehtml, stringcomparison.ordinal);
        html = html.remove(at, elehtml.length);
        html = regex.replace(html, @"<\!doctype.*?>", "", regexoptions.ignorecase);
        if (regex.ismatch(html, @"^\s*$")) return;
      }
    }

    /// <summary>
    /// Returns the shortest candidate, starting at the first begin tag and ending at the
    /// i-th or a later end tag, that contains as many begin tags as end tags.
    /// Returns an empty string when no candidate balances.
    /// </summary>
    private string getelementstring(string html, string currenttagbeginreg, string currenttagendreg, int i)
    {
      while (true)
      {
        string newhtml = getregnextbynum(html, currenttagbeginreg, currenttagendreg, i);
        // counted with the same options the candidate was matched with
        int begincount = regex.matches(newhtml, currenttagbeginreg, tagoptions).count;
        int endcount = regex.matches(newhtml, currenttagendreg, tagoptions).count;
        if (begincount == endcount)
        {
          // balanced; also reached with an empty candidate once i exceeds the number of end tags
          return newhtml;
        }
        i++;
      }
    }

    /// <summary>
    /// Matches from the first begin tag up to and including the i-th end tag that follows it.
    /// </summary>
    private string getregnextbynum(string val, string currenttagbeginreg, string currenttagendreg, int i)
    {
      return regex.match(val, currenttagbeginreg + @"((.*?)" + currenttagendreg + "){" + i + "}?", regexoptions.ignorecase | regexoptions.singleline).value;
    }
    #endregion



  }
  /// <summary>
  /// Query extensions that continue a search from an already parsed element list.
  /// Every method works on the FIRST element of the list only.
  /// </summary>
  public static class xhtmlelementextendsion
  {
    /// <summary>
    /// Returns the matching elements of the nearest level, below the first element
    /// of the list, that contains a match.
    /// </summary>
    /// <param name="htmlinfolist">elements to search below; only the first one is used</param>
    /// <param name="elementname">tag name to look for; null returns every element</param>
    /// <returns>the matching elements; an empty list when the input is null/empty or nothing matches</returns>
    public static list<htmlinfo> descendants(this ienumerable<htmlinfo> htmlinfolist, string elementname = null)
    {
      var first = firstornull(htmlinfolist);
      if (first == null || first.innerhtml == null)
      {
        return new list<htmlinfo>();
      }
      xhtmlelement xhe = new xhtmlelement(first.innerhtml);
      // guard against a null result so that chained calls never dereference null
      return xhe.descendants(elementname) ?? new list<htmlinfo>();
    }
    /// <summary>
    /// Returns the direct children of the first element of the list.
    /// </summary>
    /// <param name="htmlinfolist">elements whose children are wanted; only the first one is used</param>
    /// <param name="elementname">tag name to keep (case-insensitive); null keeps every child</param>
    /// <returns>the matching children; an empty list when the input is null/empty</returns>
    public static list<htmlinfo> childdescendants(this ienumerable<htmlinfo> htmlinfolist, string elementname = null)
    {
      var first = firstornull(htmlinfolist);
      if (first == null || first.innerhtml == null)
      {
        return new list<htmlinfo>();
      }
      var html = first.innerhtml;
      xhtmlelement xhe = new xhtmlelement(html);
      // case-insensitive, like xhtmlelement.descendants
      return xhe.rootdescendants(html)
        .where(c => elementname == null || string.equals(c.tagname, elementname, stringcomparison.ordinalignorecase))
        .tolist();
    }

    /// <summary>
    /// Returns the parent of the first element of the list, located inside the full document.
    /// </summary>
    /// <param name="htmlinfolist">elements whose parent is wanted; only the first one is used</param>
    /// <param name="fullhtml">the complete html the elements were parsed from</param>
    /// <returns>the parsed parent markup; an empty list when the input is null/empty or no parent is found</returns>
    /// <exception cref="argumentnullexception"><paramref name="fullhtml"/> is null</exception>
    public static list<htmlinfo> parentdescendant(this ienumerable<htmlinfo> htmlinfolist,string fullhtml)
    {
      if (fullhtml == null)
      {
        throw new argumentnullexception("fullhtml");
      }
      var first = firstornull(htmlinfolist);
      if (first == null || string.isnullorempty(first.samelevehtml))
      {
        // string.replace rejects a null or empty search string
        return new list<htmlinfo>();
      }
      var savelevehtml = first.samelevehtml;
      // swap the sibling-level markup for a unique placeholder, match the tag pair
      // wrapped around the placeholder, then put the markup back
      string replaceguid=guid.newguid().tostring();
      fullhtml = fullhtml.replace(savelevehtml,replaceguid);
      var parenthtml = regex.match(fullhtml, @"<[^<]+?>[^<]*?" + replaceguid + @".*?<\/.+?>").value;
      parenthtml = parenthtml.replace(replaceguid, savelevehtml);
      xhtmlelement xhe = new xhtmlelement(parenthtml);
      return xhe.rootdescendants();
    }

    /// <summary>
    /// First element of the sequence, or null when the sequence is null or empty.
    /// </summary>
    private static htmlinfo firstornull(ienumerable<htmlinfo> htmlinfolist)
    {
      return htmlinfolist == null ? null : htmlinfolist.firstordefault();
    }
  }
  /// <summary>
  /// Describes one parsed html element.
  /// </summary>
  public class htmlinfo
  {
    /// <summary>
    /// Tag name of the element.
    /// </summary>
    public string tagname { get; set; }
    /// <summary>
    /// Attributes of the element, keyed by attribute name.
    /// </summary>
    public dictionary<string, string> attributes { get; set; }
    /// <summary>
    /// Markup between the element's opening and closing tag.
    /// </summary>
    public string innerhtml { get; set; }

    /// <summary>
    /// The element's markup exactly as it was extracted from the source.
    /// </summary>
    public string oldfullhtml { get; set; }

    /// <summary>
    /// The fragment this element was parsed from, i.e. the markup holding the
    /// element and its siblings.
    /// </summary>
    public string samelevehtml { get; set; }

    /// <summary>
    /// Rebuilds the element's html from <see cref="tagname"/>, <see cref="attributes"/>
    /// and <see cref="innerhtml"/>. Always written as an opening/closing tag pair.
    /// </summary>
    public string fullhtml
    {
      get
      {
        stringbuilder reval = new stringbuilder();
        reval.append('<').append(tagname);
        if (attributes != null)
        {
          // each attribute brings its own leading space, so an element without
          // attributes renders as <p> rather than <p >
          foreach (var attr in attributes)
          {
            reval.appendformat(" {0}=\"{1}\"", attr.key, attr.value);
          }
        }
        reval.append('>').append(innerhtml);
        reval.appendformat("</{0}>", tagname);
        return reval.tostring();
      }
    }
  }
}

前台html:

<!doctype html public "-//w3c//dtd xhtml 1.0 transitional//en" "http://www.w3.org/tr/xhtml1/dtd/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title></title>
</head>
<body>
  <a id="1">我是1</a> 
  <a id="2" class="icon">icon</a>
  <img />
</body>
</html>

上一篇：为了完成这个功能，我竟然用5行代码制作了一个EXE可执行程序

下一篇： CentOS6.5下搭建文件共享服务（Samba）