c# 正则表达式对网页进行有效内容抽取
程序员文章站
2023-01-29 19:51:08
搜索引擎中一个比较重要的环节就是从网页中抽取出有效内容。简单来说,就是把HTML文本中的HTML标记去掉,留下我们用IE等浏览器打开HTML文档看到的部分(我们这里不考虑图...
搜索引擎中一个比较重要的环节就是从网页中抽取出有效内容。简单来说,就是把HTML文本中的HTML标记去掉,留下我们用IE等浏览器打开HTML文档看到的部分(我们这里不考虑图片).
将html文本中的标记分为:注释,script ,style,以及其他标记分别去掉:
1.去注释,正则为:
output = Regex.Replace(input, @"<!--[^-]*-->", string.Empty, RegexOptions.IgnoreCase);
2.去script,正则为:
output = Regex.Replace(input, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
output2 = Regex.Replace(output, @"<noscript[^>]*?>.*?</noscript>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
3.去style,正则为:
output = Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
4.去其他html标记
result = result.Replace("&nbsp;", " ");
result = result.Replace("&quot;", "\"");
result = result.Replace("&lt;", "<");
result = result.Replace("&gt;", ">");
result = result.Replace("&amp;", "&");
result = result.Replace("<br>", "\r\n");
result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty, RegexOptions.IgnoreCase);
以上的代码中大家可以看到,我使用了RegexOptions.Singleline参数,这个参数很重要,它主要是为了让"."(小圆点)可以匹配换行符.如果没有这个参数,大多数情况下,用上面列出的正则表达式来消除网页HTML标记是无效的.
html发展至今,语法已经相当复杂,上面只列出了几种最主要的标记,更多的去html标记的正则我将在
rost webspider 的开发过程中补充进来。
下面用c#实现了一个从html字符串中提取有效内容的类:
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Reduces an HTML document to its text by removing comments, script/noscript
/// and style elements, and all remaining tags, then decoding a few common entities.
/// </summary>
class htmlextract
{
    #region private attributes
    // HTML text given to the constructor (never null; see the constructor).
    private readonly string _strhtml;
    #endregion

    #region public methods
    /// <summary>Creates an extractor for the given HTML text.</summary>
    /// <param name="instrhtml">HTML source; null is treated as an empty document.</param>
    public htmlextract(string instrhtml)
    {
        // Regex.Replace throws on a null input, so normalise null to empty here.
        _strhtml = instrhtml ?? string.Empty;
    }

    /// <summary>Returns the text left after all markup has been removed.</summary>
    /// <returns>The extracted text with leading and trailing whitespace trimmed.</returns>
    public string extracttext()
    {
        string result = _strhtml;
        result = removecomment(result);
        result = removescript(result);
        result = removestyle(result);
        result = removetags(result);
        return result.Trim();
    }
    #endregion

    #region private methods
    /// <summary>Removes HTML comments.</summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input without any comment blocks.</returns>
    private string removecomment(string input)
    {
        // Lazy ".*?" with Singleline lets a comment contain '-' and span several lines.
        return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
    }

    /// <summary>Removes style elements together with their content.</summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input without any style elements.</returns>
    private string removestyle(string input)
    {
        // Singleline makes "." match newlines, so multi-line style sheets are covered.
        return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty,
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
    }

    /// <summary>Removes script and noscript elements together with their content.</summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input without any script or noscript elements.</returns>
    private string removescript(string input)
    {
        RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
        string result = Regex.Replace(input, @"<script[^>]*?>.*?</script>", string.Empty, options);
        result = Regex.Replace(result, @"<noscript[^>]*?>.*?</noscript>", string.Empty, options);
        return result;
    }

    /// <summary>
    /// Converts line-break tags to newlines, removes every remaining tag and
    /// decodes the entities for non-breaking space, quote, less-than, greater-than and ampersand.
    /// </summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input with tags removed and the listed entities decoded.</returns>
    private string removetags(string input)
    {
        // Line breaks become newlines before the generic pass would drop them.
        string result = Regex.Replace(input, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);
        result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

        // Entities are decoded only after the tags are gone; otherwise an escaped
        // "&lt; ... &gt;" in the text would be mistaken for a tag and deleted.
        result = result.Replace("&nbsp;", " ");
        result = result.Replace("&quot;", "\"");
        result = result.Replace("&lt;", "<");
        result = result.Replace("&gt;", ">");
        // "&amp;" goes last so that "&amp;lt;" yields the literal text "&lt;".
        result = result.Replace("&amp;", "&");
        return result;
    }
    #endregion
}
将html文本中的标记分为:注释,script ,style,以及其他标记分别去掉:
1.去注释,正则为:
output = Regex.Replace(input, @"<!--[^-]*-->", string.Empty, RegexOptions.IgnoreCase);
2.去script,正则为:
output = Regex.Replace(input, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
output2 = Regex.Replace(output, @"<noscript[^>]*?>.*?</noscript>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
3.去style,正则为:
output = Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
4.去其他html标记
result = result.Replace("&nbsp;", " ");
result = result.Replace("&quot;", "\"");
result = result.Replace("&lt;", "<");
result = result.Replace("&gt;", ">");
result = result.Replace("&amp;", "&");
result = result.Replace("<br>", "\r\n");
result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty, RegexOptions.IgnoreCase);
以上的代码中大家可以看到,我使用了RegexOptions.Singleline参数,这个参数很重要,它主要是为了让"."(小圆点)可以匹配换行符.如果没有这个参数,大多数情况下,用上面列出的正则表达式来消除网页HTML标记是无效的.
html发展至今,语法已经相当复杂,上面只列出了几种最主要的标记,更多的去html标记的正则我将在
rost webspider 的开发过程中补充进来。
下面用c#实现了一个从html字符串中提取有效内容的类:
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Reduces an HTML document to its text by removing comments, script/noscript
/// and style elements, and all remaining tags, then decoding a few common entities.
/// </summary>
class htmlextract
{
    #region private attributes
    // HTML text given to the constructor (never null; see the constructor).
    private readonly string _strhtml;
    #endregion

    #region public methods
    /// <summary>Creates an extractor for the given HTML text.</summary>
    /// <param name="instrhtml">HTML source; null is treated as an empty document.</param>
    public htmlextract(string instrhtml)
    {
        // Regex.Replace throws on a null input, so normalise null to empty here.
        _strhtml = instrhtml ?? string.Empty;
    }

    /// <summary>Returns the text left after all markup has been removed.</summary>
    /// <returns>The extracted text with leading and trailing whitespace trimmed.</returns>
    public string extracttext()
    {
        string result = _strhtml;
        result = removecomment(result);
        result = removescript(result);
        result = removestyle(result);
        result = removetags(result);
        return result.Trim();
    }
    #endregion

    #region private methods
    /// <summary>Removes HTML comments.</summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input without any comment blocks.</returns>
    private string removecomment(string input)
    {
        // Lazy ".*?" with Singleline lets a comment contain '-' and span several lines.
        return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
    }

    /// <summary>Removes style elements together with their content.</summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input without any style elements.</returns>
    private string removestyle(string input)
    {
        // Singleline makes "." match newlines, so multi-line style sheets are covered.
        return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty,
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
    }

    /// <summary>Removes script and noscript elements together with their content.</summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input without any script or noscript elements.</returns>
    private string removescript(string input)
    {
        RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
        string result = Regex.Replace(input, @"<script[^>]*?>.*?</script>", string.Empty, options);
        result = Regex.Replace(result, @"<noscript[^>]*?>.*?</noscript>", string.Empty, options);
        return result;
    }

    /// <summary>
    /// Converts line-break tags to newlines, removes every remaining tag and
    /// decodes the entities for non-breaking space, quote, less-than, greater-than and ampersand.
    /// </summary>
    /// <param name="input">HTML text to process.</param>
    /// <returns>The input with tags removed and the listed entities decoded.</returns>
    private string removetags(string input)
    {
        // Line breaks become newlines before the generic pass would drop them.
        string result = Regex.Replace(input, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);
        result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

        // Entities are decoded only after the tags are gone; otherwise an escaped
        // "&lt; ... &gt;" in the text would be mistaken for a tag and deleted.
        result = result.Replace("&nbsp;", " ");
        result = result.Replace("&quot;", "\"");
        result = result.Replace("&lt;", "<");
        result = result.Replace("&gt;", ">");
        // "&amp;" goes last so that "&amp;lt;" yields the literal text "&lt;".
        result = result.Replace("&amp;", "&");
        return result;
    }
    #endregion
}
上一篇: 做完人流后可以喝红糖水吗
下一篇: 让URL只允许一些字符的正则表达式