如何提取html的正文以及保留某些<>内容?
程序员文章站
2022-05-27 13:53:47
...
正文提取就是去掉 HTML 代码中尖括号(<>)内的标签内容,只保留标签之外的文本。这段代码在此基础上增加了一个功能:可以选择保留某些指定的标签(例如 <br>)。
using System;
using System.Text;

namespace HtmlStrip
{
    /// <summary>
    /// Demo entry point: strips the markup from a sample HTML string while
    /// keeping the &lt;br&gt; tags.
    /// </summary>
    class MainClass
    {
        public static void Main (string[] args)
        {
            string str = "<p>abc</p><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo";
            // To process a file instead, load it into str first, e.g.:
            //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html");
            //str=rd.ReadToEnd ();
            HtmlParser t = new HtmlParser (str);
            t.KeepTag (new string[] { "br" }); // do not filter out <br> tags
            Console.Write (t.Text ()); // prints: abcefg<br />oo
        }
    }

    /// <summary>
    /// Extracts the text of an HTML fragment by dropping everything between
    /// '&lt;' and '&gt;', optionally keeping selected tags verbatim.
    /// </summary>
    /// <remarks>
    /// This is a simple scanner, not a full HTML parser:
    /// every '&lt;' opens a tag, and a tag ends at the first '&gt;' (even one
    /// inside a quoted attribute value). Comments end at the first "--&gt;".
    /// The bodies of script/style elements are skipped up to their closing tag.
    /// Unterminated tags, comments and script/style bodies are dropped up to
    /// the end of the input instead of throwing.
    /// </remarks>
    class HtmlParser
    {
        private const string CommentOpen = "!--";
        private const string CommentClose = "-->";

        private readonly string html;                                  // the markup being scanned
        private readonly StringBuilder result = new StringBuilder ();  // accumulated output
        private int seek;                                              // index of the next character to read
        private string[] keepTag;                                      // tag names that are copied to the output
        private bool _inTag;                                           // true while the cursor is inside <...>
        private string tagName = "";                                   // name of the tag read most recently
        // Elements whose body is not document text and must not be emitted.
        private readonly string[] specialTag = new string[] { "script", "style" };

        /// <summary>
        /// Whether the cursor is currently inside a tag. Setting this to
        /// <c>true</c> reads the tag name that starts at the current position;
        /// if the name is immediately followed by '&gt;' the flag drops back
        /// to <c>false</c> and the '&gt;' is left unread.
        /// </summary>
        public bool inTag {
            get { return _inTag; }
            set {
                _inTag = value;
                if (!value) {
                    return;
                }
                tagName = ReadTagName ();
                if (seek < html.Length && html[seek] == '>') {
                    _inTag = false;
                }
            }
        }

        /// <summary>
        /// Creates a parser for the given markup.
        /// </summary>
        /// <param name="html">The HTML to analyse.</param>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
        public HtmlParser (string html)
        {
            if (html == null) {
                throw new ArgumentNullException ("html");
            }
            this.html = html;
            KeepTag (new string[] { });
        }

        /// <summary>
        /// Sets the tags that must not be filtered out.
        /// </summary>
        /// <param name="tags">
        /// Tag names without brackets (for example "br"), matched
        /// case-insensitively against both opening and closing tags.
        /// Pass "!--" to keep comments. <c>null</c> means "keep nothing".
        /// </param>
        public void KeepTag (string[] tags)
        {
            keepTag = tags ?? new string[] { };
        }

        /// <summary>
        /// Scans the remaining input and returns the extracted text.
        /// </summary>
        /// <returns>
        /// The text outside tags plus every kept tag, in document order.
        /// Calling the method again returns the same text.
        /// </returns>
        public string Text ()
        {
            while (seek < html.Length) {
                char current = html[seek++];
                if (current != '<') {
                    // Plain text; a '>' outside any tag is ordinary text too.
                    result.Append (current);
                    continue;
                }

                int tagStart = seek - 1;
                inTag = true; // reads the tag name into tagName
                if (tagName.StartsWith (CommentOpen, StringComparison.Ordinal)) {
                    ConsumeComment (tagStart);
                } else {
                    ConsumeTag (tagStart);
                }
                inTag = false;
            }
            return result.ToString ();
        }

        /// <summary>
        /// Reads a tag name at the cursor: skips leading whitespace, then
        /// collects characters up to (not including) the next whitespace,
        /// '&gt;' or the end of the input.
        /// </summary>
        private string ReadTagName ()
        {
            while (seek < html.Length && char.IsWhiteSpace (html[seek])) {
                seek++;
            }
            int nameStart = seek;
            while (seek < html.Length && html[seek] != '>' && !char.IsWhiteSpace (html[seek])) {
                seek++;
            }
            return html.Substring (nameStart, seek - nameStart);
        }

        /// <summary>
        /// Skips a comment whose '&lt;' is at <paramref name="tagStart"/>,
        /// copying it to the output when "!--" is a kept tag.
        /// </summary>
        private void ConsumeComment (int tagStart)
        {
            // The cursor sits right after the name, and the name begins with
            // "!--", so the comment body starts just past those three characters.
            int bodyStart = seek - tagName.Length + CommentOpen.Length;
            int close = html.IndexOf (CommentClose, bodyStart, StringComparison.Ordinal);
            seek = close < 0 ? html.Length : close + CommentClose.Length;
            if (iskeepTag (CommentOpen)) {
                result.Append (html, tagStart, seek - tagStart);
            }
        }

        /// <summary>
        /// Skips an ordinary tag whose '&lt;' is at <paramref name="tagStart"/>.
        /// Kept tags are copied verbatim; script/style tags additionally
        /// have their body skipped.
        /// </summary>
        private void ConsumeTag (int tagStart)
        {
            int close = html.IndexOf ('>', seek);
            if (close < 0) {
                // Unterminated tag: nothing after it can be text.
                seek = html.Length;
                return;
            }
            seek = close + 1;

            if (iskeepTag (tagName.Replace ("/", ""))) {
                result.Append (html, tagStart, seek - tagStart);
            } else if (IsSpecialTag (tagName)) {
                // Jump to the matching closing tag so that '<' and '>' inside
                // the script/style body are never mistaken for markup.
                int closing = html.IndexOf ("</" + tagName, seek, StringComparison.OrdinalIgnoreCase);
                seek = closing < 0 ? html.Length : closing;
            }
        }

        /// <summary>
        /// Tells whether the given tag name is in the keep list.
        /// </summary>
        /// <param name="tag">Tag name without '/' characters.</param>
        /// <returns><c>true</c> if the tag must be copied to the output.</returns>
        private bool iskeepTag (string tag)
        {
            foreach (string kept in keepTag) {
                if (string.Equals (tag, kept, StringComparison.OrdinalIgnoreCase)) {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Tells whether the given tag name opens an element whose body must
        /// be discarded (script or style).
        /// </summary>
        private bool IsSpecialTag (string tag)
        {
            foreach (string special in specialTag) {
                if (string.Equals (tag, special, StringComparison.OrdinalIgnoreCase)) {
                    return true;
                }
            }
            return false;
        }
    }
}
以上就是“如何提取html的正文以及保留某些<>内容”的详细内容,更多请关注其它相关文章!
下一篇: python代码之阶乘求和的方法