C#自写的一个HTML解析类(类似XElement语法)
程序员文章站
2022-06-21 09:42:41
功能:
1、轻松获取指定的html元素。
2、可以根据属性标签进行筛选
3、返回的都是List强类型,无需转换
用过xelement的都知道 用来解...
功能:
1、轻松获取指定的html元素。
2、可以根据属性标签进行筛选
3、返回的都是List强类型,无需转换
用过xelement的都知道 用来解析xml非常的方便,但是对于html的格式多样化实在是没办法兼容。
所以我就写了这么一个类似xelement的 xhtmlelement
用法:
// Usage example. Each statement is on its own line so that a trailing "//" comment
// no longer comments out the statements that follow it.
string filepath = Server.MapPath("~/file/test.htm");
// Read the HTML source that will be parsed.
string mailbody = filehelper.filetostring(filepath);
xhtmlelement xh = new xhtmlelement(mailbody);

// <a> elements directly under <body> whose class attribute is "icon".
var link = xh.descendants("body")
    .childdescendants("a")
    .Where(c => c.attributes.Any(a => a.Key == "class" && a.Value == "icon"))
    .ToList();

// <a> elements that carry an href attribute.
var links = xh.descendants("a")
    .Where(c => c.attributes.Any(a => a.Key == "href"))
    .ToList();
foreach (var r in links)
{
    // Write out the href value.
    Response.Write(r.attributes.Single(c => c.Key == "href").Value);
}

// The <img> elements found at the nearest level that contains any (a list, not a single element).
var img = xh.descendants("img");

// The nearest <p> element together with the other <p> elements on the same level.
var ps = xh.descendants("p");
代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;

namespace syntacticsugar
{
    /// <summary>
    /// ** Description: regex-based HTML parsing helper with an XElement-like query API.
    /// ** Created: 2015-4-23
    /// ** Modified: -
    /// ** Author: sunkaixuan
    /// ** QQ: 610262374 (feedback on naming and syntax is welcome)
    /// </summary>
    /// <remarks>
    /// Parsing is heuristic: tags are located with regular expressions, so attribute values that
    /// contain '&gt;' and unquoted attribute values are not supported.
    /// </remarks>
    public class xhtmlelement
    {
        // Shared options: tag names are case-insensitive and a tag may span several lines.
        private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // Conditional comment delimiters ("<!--[if ...]" ... "[endif]-->").
        private const string ConditionalBeginPattern = @"\s{0,10}\<\!\-\-\[if";
        private const string ConditionalEndPattern = @"\[endif\]\-\-\>";

        // Same message the parser has always raised for input it cannot split.
        private const string UnparsableMessage = "sorry,您的html格式不能解析!!!";

        // A tag name is the identifier that directly follows '<' (closing tags and "<!" are skipped).
        // Digits are allowed so h1..h6 resolve, and '/' is accepted as a terminator so "<br/>" resolves.
        private static readonly Regex TagNameRegex =
            new Regex(@"(?<=<\s{0,5})[a-z][a-z0-9]*(?=[\s/>])", RegexOptions.IgnoreCase);

        // First tag of an element, up to its first '>'.
        private static readonly Regex OpenTagRegex = new Regex("<.+?>", RegexOptions.Singleline);

        // Everything between the first '>' and the last '<' of an element.
        private static readonly Regex InnerHtmlRegex = new Regex(@"(?<=\>).+(?=<)", RegexOptions.Singleline);

        // name="value" or name='value'; the value may be empty and may contain '='.
        private static readonly Regex AttributeRegex = new Regex(
            @"(?<name>[a-z_:][\w:.\-]*)\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)')",
            RegexOptions.IgnoreCase);

        private static readonly Regex DoctypeRegex = new Regex(@"<\!doctype.*?>", TagOptions);

        private readonly string _html;

        /// <summary>
        /// Wraps an HTML fragment for querying.
        /// </summary>
        /// <param name="html">HTML source; a null value is rejected when a query is executed.</param>
        public xhtmlelement(string html)
        {
            _html = html;
        }

        /// <summary>
        /// Returns the matching elements of the nearest level that contains at least one match:
        /// the top level is tried first, then each element's children, depth-first.
        /// </summary>
        /// <param name="elementname">Tag name to match (case-insensitive); null matches every element.</param>
        /// <returns>The matching elements, or an empty list when nothing matches (never null).</returns>
        /// <exception cref="ArgumentNullException">The wrapped HTML is null.</exception>
        /// <exception cref="FormatException">The HTML cannot be split into elements.</exception>
        public List<htmlinfo> descendants(string elementname = null)
        {
            if (_html == null)
            {
                throw new ArgumentNullException("html", "html不能为空!");
            }

            List<htmlinfo> roots = rootdescendants(_html);
            List<htmlinfo> hits = roots.Where(c => NameMatches(c, elementname)).ToList();
            if (hits.Count > 0)
            {
                return hits;
            }

            return FindNearestLevel(roots, elementname);
        }

        /// <summary>
        /// Splits HTML into its top-level elements. Each element is taken from its opening tag to the
        /// balancing closing tag of the same name; text between elements is ignored.
        /// </summary>
        /// <param name="html">HTML to split; null means the HTML passed to the constructor.</param>
        /// <returns>One <see cref="htmlinfo"/> per top-level element.</returns>
        /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the wrapped HTML are null.</exception>
        /// <exception cref="FormatException">The HTML cannot be split into elements.</exception>
        public List<htmlinfo> rootdescendants(string html = null)
        {
            if (html == null)
            {
                html = _html;
            }
            if (html == null)
            {
                throw new ArgumentNullException("html", "html不能为空!");
            }

            List<htmlinfo> result = new List<htmlinfo>();
            foreach (string element in SplitTopLevelElements(html))
            {
                htmlinfo info = new htmlinfo();
                info.oldfullhtml = element;
                info.samelevehtml = html;
                info.tagname = TagNameRegex.Match(element).Value;
                info.innerhtml = InnerHtmlRegex.Match(element).Value;
                info.attributes = ParseAttributes(OpenTagRegex.Match(element).Value);
                result.Add(info);
            }

            return result;
        }

        #region private

        // True when the element has the requested tag name (null requests every element).
        private static bool NameMatches(htmlinfo element, string elementname)
        {
            return elementname == null
                || string.Equals(element.tagname, elementname, StringComparison.OrdinalIgnoreCase);
        }

        // Depth-first search for the first nested level that contains a matching element.
        private List<htmlinfo> FindNearestLevel(List<htmlinfo> level, string elementname)
        {
            foreach (htmlinfo node in level)
            {
                if (node.innerhtml == null || !node.innerhtml.Contains("<"))
                {
                    continue; // no markup inside, nothing to descend into
                }

                // Parse the children once and reuse them for both the filter and the recursion.
                List<htmlinfo> children = rootdescendants(node.innerhtml);
                List<htmlinfo> hits = children.Where(c => NameMatches(c, elementname)).ToList();
                if (hits.Count == 0)
                {
                    hits = FindNearestLevel(children, elementname);
                }
                if (hits.Count > 0)
                {
                    return hits;
                }
            }

            return new List<htmlinfo>();
        }

        // Reads the quoted attributes of an opening tag; the first occurrence of a repeated name wins.
        private static Dictionary<string, string> ParseAttributes(string openTag)
        {
            Dictionary<string, string> attributes = new Dictionary<string, string>();
            foreach (Match attribute in AttributeRegex.Matches(openTag))
            {
                string name = attribute.Groups["name"].Value;
                if (!attributes.ContainsKey(name))
                {
                    attributes.Add(name, attribute.Groups["value"].Value);
                }
            }

            return attributes;
        }

        // Repeatedly cuts the first element off the front of the markup until no tag is left.
        private static List<string> SplitTopLevelElements(string html)
        {
            List<string> elements = new List<string>();
            string rest = html;

            while (true)
            {
                Match name = TagNameRegex.Match(rest);
                if (!name.Success)
                {
                    break; // only text, comments or a doctype remain
                }

                string element = ExtractFirstElement(rest, name.Value);
                if (element.Length == 0)
                {
                    // An empty cut would never shrink the input, so fail instead of looping.
                    throw new FormatException(UnparsableMessage);
                }

                elements.Add(element);

                // Remove only this occurrence so that identical siblings are each reported.
                rest = rest.Remove(rest.IndexOf(element, StringComparison.Ordinal), element.Length);
                rest = DoctypeRegex.Replace(rest, "");
                if (string.IsNullOrWhiteSpace(rest))
                {
                    break;
                }
            }

            return elements;
        }

        // Returns the outer HTML of the first element named tagName, or "" when it cannot be delimited.
        //   case 1: <a/>      self-closing tag
        //   case 2: <a></a>   balanced element
        //   case 3: <a>       no closing tag, only the opening tag is returned
        //   case 4: <!--[if ...]> ... [endif]-->   conditional comment, returned as one unit
        private static string ExtractFirstElement(string html, string tagName)
        {
            // The look-ahead stops "<a" from also matching "<abbr", "<article", ...
            string beginPattern = @"<\s{0,10}" + tagName + @"(?=[\s/>]).*?>";
            string endPattern = @"\<\/" + tagName + @"\s*\>";

            // Only this tag decides whether the element is self-closing, not a later "/>".
            string openTag = Regex.Match(html, beginPattern, TagOptions).Value;
            if (openTag.EndsWith("/>", StringComparison.Ordinal))
            {
                return openTag;
            }

            if (!Regex.IsMatch(html, endPattern, RegexOptions.IgnoreCase))
            {
                if (Regex.IsMatch(html, ConditionalBeginPattern, RegexOptions.IgnoreCase))
                {
                    return GetElementString(html, ConditionalBeginPattern, ConditionalEndPattern);
                }

                return openTag;
            }

            return GetElementString(html, beginPattern, endPattern);
        }

        // Extends the candidate one closing tag at a time until opening and closing tags balance.
        // Returns "" when the input runs out of closing tags first.
        private static string GetElementString(string html, string beginPattern, string endPattern)
        {
            for (int closeCount = 1; ; closeCount++)
            {
                string candidate = Regex.Match(
                    html,
                    beginPattern + @"((.*?)" + endPattern + "){" + closeCount + "}",
                    TagOptions).Value;
                if (candidate.Length == 0)
                {
                    return candidate;
                }

                // Self-closing tags open nothing, so they must not be counted.
                int opened = Regex.Matches(candidate, beginPattern, TagOptions)
                    .Cast<Match>()
                    .Count(m => !m.Value.EndsWith("/>", StringComparison.Ordinal));
                int closed = Regex.Matches(candidate, endPattern, RegexOptions.IgnoreCase).Count;
                if (opened == closed)
                {
                    return candidate;
                }
            }
        }

        #endregion
    }

    /// <summary>
    /// Query extensions that continue a search from the first element of a previous result.
    /// </summary>
    public static class xhtmlelementextendsion
    {
        /// <summary>
        /// Searches inside the first element of the list; see <see cref="xhtmlelement.descendants"/>.
        /// </summary>
        /// <param name="htmlinfolist">Previous result; only its first element is used.</param>
        /// <param name="elementname">Tag name to match (case-insensitive); null matches every element.</param>
        /// <returns>The matching elements, or an empty list when the input is null/empty or nothing matches.</returns>
        public static List<htmlinfo> descendants(this IEnumerable<htmlinfo> htmlinfolist, string elementname = null)
        {
            htmlinfo first = FirstOrNull(htmlinfolist);
            if (first == null || first.innerhtml == null)
            {
                return new List<htmlinfo>();
            }

            return new xhtmlelement(first.innerhtml).descendants(elementname);
        }

        /// <summary>
        /// Returns the direct children of the first element of the list.
        /// </summary>
        /// <param name="htmlinfolist">Previous result; only its first element is used.</param>
        /// <param name="elementname">Tag name to match (case-insensitive); null matches every child.</param>
        /// <returns>The matching children, or an empty list when the input is null/empty or nothing matches.</returns>
        public static List<htmlinfo> childdescendants(this IEnumerable<htmlinfo> htmlinfolist, string elementname = null)
        {
            htmlinfo first = FirstOrNull(htmlinfolist);
            if (first == null || first.innerhtml == null)
            {
                return new List<htmlinfo>();
            }

            return new xhtmlelement(first.innerhtml)
                .rootdescendants(first.innerhtml)
                .Where(c => elementname == null
                    || string.Equals(c.tagname, elementname, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        /// <summary>
        /// Finds the element of <paramref name="fullhtml"/> that directly encloses the level the
        /// first element of the list was parsed from.
        /// </summary>
        /// <param name="htmlinfolist">Previous result; only its first element is used.</param>
        /// <param name="fullhtml">The complete document the elements were parsed from.</param>
        /// <returns>The enclosing element, or an empty list when it cannot be located.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="fullhtml"/> is null.</exception>
        public static List<htmlinfo> parentdescendant(this IEnumerable<htmlinfo> htmlinfolist, string fullhtml)
        {
            if (fullhtml == null)
            {
                throw new ArgumentNullException("fullhtml");
            }

            htmlinfo first = FirstOrNull(htmlinfolist);
            if (first == null || string.IsNullOrEmpty(first.samelevehtml))
            {
                return new List<htmlinfo>();
            }

            string siblingHtml = first.samelevehtml;

            // Swap the sibling markup for a unique token so one regex can find the tags around it.
            string token = Guid.NewGuid().ToString();
            string marked = fullhtml.Replace(siblingHtml, token);
            string parentHtml = Regex.Match(
                marked,
                @"<[^<]+?>[^<]*?" + Regex.Escape(token) + @".*?<\/.+?>",
                RegexOptions.Singleline).Value;
            parentHtml = parentHtml.Replace(token, siblingHtml);

            return new xhtmlelement(parentHtml).rootdescendants();
        }

        // First element of the sequence, or null when the sequence is null or empty.
        private static htmlinfo FirstOrNull(IEnumerable<htmlinfo> htmlinfolist)
        {
            return htmlinfolist == null ? null : htmlinfolist.FirstOrDefault();
        }
    }

    /// <summary>
    /// Describes one parsed HTML element.
    /// </summary>
    public class htmlinfo
    {
        /// <summary>
        /// Tag name as written in the source.
        /// </summary>
        public string tagname { get; set; }

        /// <summary>
        /// Quoted attributes of the opening tag, keyed by attribute name as written.
        /// </summary>
        public Dictionary<string, string> attributes { get; set; }

        /// <summary>
        /// Markup between the opening and the closing tag.
        /// </summary>
        public string innerhtml { get; set; }

        /// <summary>
        /// The element exactly as it appeared in the source.
        /// </summary>
        public string oldfullhtml { get; set; }

        /// <summary>
        /// The markup of the level this element was parsed from (the element and its siblings).
        /// </summary>
        public string samelevehtml { get; set; }

        /// <summary>
        /// Rebuilds the element from <see cref="tagname"/>, <see cref="attributes"/> and
        /// <see cref="innerhtml"/>. The result always has an opening and a closing tag, and a
        /// space follows the tag name even when there are no attributes.
        /// </summary>
        public string fullhtml
        {
            get
            {
                string attributetext = string.Empty;
                if (attributes != null && attributes.Count > 0)
                {
                    attributetext = string.Join(
                        " ",
                        attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));
                }

                return string.Format("<{0} {2}>{1}</{0}>", tagname, innerhtml, attributetext);
            }
        }
    }
}
前台html:
<!doctype html public "-//w3c//dtd xhtml 1.0 transitional//en" "http://www.w3.org/tr/xhtml1/dtd/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title></title> </head> <body> <a id="1">我是1</a> <a id="2" class="icon">icon</a> <img /> </body> </html>
上一篇: C#拷贝文件简单实现方法