欢迎您访问程序员文章站！本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

c#中过滤html的正则表达式

程序员文章站 2023-11-17 18:59:52
摘要：本文给出几段 C# 实现代码，使用正则表达式去除 HTML 标记、过滤标签及空格、提取图片地址以及提取 HTML 中的文字。

实现代码

/// <summary>
/// Strips HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="htmlstring">Source text that may contain HTML markup; may be null.</param>
/// <returns>
/// The text with script blocks, tags and comment remnants removed and the recognised
/// character entities decoded, then stripped of any remaining angle brackets and CR/LF
/// pairs, HTML-encoded and trimmed. An empty string is returned for null or empty input.
/// </returns>
public static string nohtml(string htmlstring)
{
  if (string.IsNullOrEmpty(htmlstring))
    return string.Empty;

  const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

  // Remove script blocks. Singleline lets '.' cross line breaks so that
  // multi-line scripts are removed together with their content.
  htmlstring = Regex.Replace(htmlstring, @"<script[^>]*?>.*?</script>", "",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);
  // Remove the remaining tags.
  htmlstring = Regex.Replace(htmlstring, @"<(.[^>]*)>", "", ignoreCase);
  // Remove a line break together with the whitespace that follows it.
  htmlstring = Regex.Replace(htmlstring, @"([\r\n])[\s]+", "", ignoreCase);
  // Remove comment remnants that the tag pattern did not consume.
  htmlstring = Regex.Replace(htmlstring, @"-->", "", ignoreCase);
  htmlstring = Regex.Replace(htmlstring, @"<!--.*", "", ignoreCase);

  // Decode the recognised entities in ONE pass so that already-decoded text is
  // never rescanned (e.g. "&amp;lt;" becomes "&lt;", not "<"). Numeric
  // entities that are not listed below are removed.
  htmlstring = Regex.Replace(htmlstring,
    @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
    entity =>
    {
      switch (entity.Groups[1].Value.ToLowerInvariant())
      {
        case "quot":  case "#34":  return "\"";
        case "amp":   case "#38":  return "&";
        case "lt":    case "#60":  return "<";
        case "gt":    case "#62":  return ">";
        case "nbsp":  case "#160": return "  ";
        case "iexcl": case "#161": return "\u00A1";
        case "cent":  case "#162": return "\u00A2";
        case "pound": case "#163": return "\u00A3";
        case "copy":  case "#169": return "\u00A9";
        default:                   return "";
      }
    },
    ignoreCase);

  // Strings are immutable: the result of Replace must be assigned back.
  htmlstring = htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");

  // WebUtility.HtmlEncode does not need an active HTTP request, unlike
  // HttpContext.Current.Server, which is null outside of one.
  return System.Net.WebUtility.HtmlEncode(htmlstring).Trim();
}

c#过滤html标签及空格

/// <summary>
/// Removes every HTML tag and every space character from the given string.
/// </summary>
/// <param name="htmlstr">Text that may contain HTML markup; may be null.</param>
/// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
public static string filterhtml(string htmlstr)
{
    if (string.IsNullOrEmpty(htmlstr))
        return "";

    // The second alternative is a single literal space, so ordinary spaces are
    // removed along with the tags.
    // NOTE(review): the space may originally have been "&nbsp;" before the page
    // was rendered to text - confirm against the intended behaviour.
    return System.Text.RegularExpressions.Regex.Replace(htmlstr, "<[^>]*>| ", "");
}

写一个静态方法移除html标签

#region
/// <summary>
/// Removes HTML tags from a string, leaving the text between them untouched.
/// </summary>
/// <param name="htmlstr">Text that may contain HTML markup; may be null.</param>
/// <returns>The text without tags, or an empty string when the input is null or empty.</returns>
public static string parsetags(string htmlstr)
{
 // Regex.Replace throws on a null input, so null is mapped to an empty result,
 // matching what filterhtml does.
 if (string.IsNullOrEmpty(htmlstr))
  return string.Empty;

 return System.Text.RegularExpressions.Regex.Replace(htmlstr, "<[^>]*>", "");
}
#endregion

取出文本中的图片地址

#region
/// <summary>
/// Extracts the <c>src</c> value of the first <c>img</c> tag that has one.
/// </summary>
/// <param name="htmlstr">Text that may contain HTML markup; may be null.</param>
/// <returns>
/// The attribute value exactly as written in the input (its letter case is preserved),
/// or an empty string when no <c>img</c> tag with a <c>src</c> attribute is found.
/// </returns>
public static string getimgurl(string htmlstr)
{
 if (string.IsNullOrEmpty(htmlstr))
  return string.Empty;

 // src must be preceded by whitespace so that attributes such as "data-src"
 // are skipped. The value may be double-quoted, single-quoted or unquoted;
 // all three alternatives capture into the same named group.
 const string imgSrcPattern =
  @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>""']+))[^>]*>";

 // Match case-insensitively on the original text instead of lower-casing it,
 // because the path part of a URL can be case-sensitive.
 Match m = Regex.Match(htmlstr, imgSrcPattern, RegexOptions.IgnoreCase);
 return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion

提取html代码中文字的c#函数

///  <summary>
///  提取html代码中文字的c#函数
///  </summary>
///  <param  name="strhtml">包括html的源码  </param>
///  <returns>已经去除后的文字</returns>
using System;
using System.Text.RegularExpressions;
/// <summary>
/// Small demo class: extracts the plain text from an HTML fragment.
/// </summary>
public class striphtmltest
{
 /// <summary>
 /// Demo driver: strips a sample document and writes the result to the console.
 /// </summary>
 // NOTE(review): the name is kept as found; a C# program entry point must be
 // spelled "Main", so this method is not picked up as one.
 public static void main()
 {
  string s = striphtml(
   "<html><head><title>中国石龙信息平台</title></head><body>faddfs龙信息平台</body></html>");
  Console.WriteLine(s);
 }

 /// <summary>
 /// Extracts the text from HTML source.
 /// </summary>
 /// <param name="strhtml">Source text that contains HTML markup; may be null.</param>
 /// <returns>
 /// The text with script blocks, tags and comment remnants removed, the recognised
 /// character entities decoded, and any remaining angle brackets and CR/LF pairs
 /// stripped. An empty string is returned for null or empty input.
 /// </returns>
 public static string striphtml(string strhtml)
 {
  if (string.IsNullOrEmpty(strhtml))
   return string.Empty;

  const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

  // Quote-aware tag pattern. Each alternative inside the repetition starts
  // with a different character, and \b pins the end of the tag name, so a
  // failed match cannot trigger exponential backtracking. Hyphenated
  // attribute names are accepted.
  const string tagPattern =
   @"<\/?\s*!?(?:\w+:)?\w+\b(?:""[^""]*""|'[^']*'|[^'""<>])*>";

  string output = strhtml;

  // Singleline lets '.' cross line breaks so multi-line scripts are removed.
  output = Regex.Replace(output, @"<script[^>]*?>.*?</script>", "",
   RegexOptions.IgnoreCase | RegexOptions.Singleline);
  output = Regex.Replace(output, tagPattern, "", ignoreCase);
  // Remove a line break together with the whitespace that follows it.
  output = Regex.Replace(output, @"([\r\n])[\s]+", "", ignoreCase);

  // Decode the recognised entities in ONE pass so decoded text is never
  // rescanned ("&amp;lt;" becomes "&lt;", not "<"). Numeric entities that
  // are not listed below are removed.
  output = Regex.Replace(output,
   @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
   entity =>
   {
    switch (entity.Groups[1].Value.ToLowerInvariant())
    {
     case "quot":  case "#34":  return "\"";
     case "amp":   case "#38":  return "&";
     case "lt":    case "#60":  return "<";
     case "gt":    case "#62":  return ">";
     case "nbsp":  case "#160": return "  ";
     case "iexcl": case "#161": return "\u00A1"; // chr(161)
     case "cent":  case "#162": return "\u00A2"; // chr(162)
     case "pound": case "#163": return "\u00A3"; // chr(163)
     case "copy":  case "#169": return "\u00A9"; // chr(169)
     default:                   return "";
    }
   },
   ignoreCase);

  // Comment remnants: a closing marker becomes a line break, an opening
  // marker is removed up to and including the end of its line.
  output = Regex.Replace(output, @"-->", "\r\n", ignoreCase);
  output = Regex.Replace(output, @"<!--.*\n", "", ignoreCase);

  // Strings are immutable: the result of Replace must be assigned back.
  output = output.Replace("<", "").Replace(">", "").Replace("\r\n", "");
  return output;
 }
}

// tempcontent is the string that contains HTML.
// "<[^>]+>" requires at least one character between the angle brackets,
// so an empty "<>" is left in place.
tempcontent = System.Text.RegularExpressions.Regex.Replace(tempcontent, "<[^>]+>", "");
// "<[^>]*>" accepts any number of characters (including none) between the
// angle brackets, so an empty "<>" is removed as well.
tempcontent = System.Text.RegularExpressions.Regex.Replace(tempcontent, "<[^>]*>", "");