用DOM实现文章采集--通过jquery语法式的方法采集指定对象的文本
[csharp]
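/// <summary>
/// DOM query helper that selects nodes with a jQuery-like selector syntax:
/// space-separated steps of the form #id, .class or tag name, optionally
/// followed by a filter function such as eq, first, last or contains.
/// </summary>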
public class domquery
{
/// <summary>
/// Resolves a jQuery-like selector against a document and returns the matching nodes.
/// </summary>
/// <param name="_htmldocument">Document to search.</param>
/// <param name="selector">
/// Selector made of space-separated steps. A first step starting with '#' is looked up by
/// element id; every other step is resolved by the private get overload against the nodes
/// produced by the previous step.
/// </param>
/// <returns>
/// The matching nodes. The list is empty when nothing matches or when the selector is
/// null, empty or consists only of spaces.
/// </returns>
public ilist<htmlnode> get(htmldocument _htmldocument, string selector)
{
    list<htmlnode> hnlist = new list<htmlnode>();

    // A selector without any step selects nothing; without this guard the
    // access to expressions[0] below would throw.
    if (string.isnullorempty(selector))
    {
        return hnlist;
    }
    string[] expressions = selector.split(new char[] { ' ' }, stringsplitoptions.removeemptyentries);
    if (expressions.length == 0)
    {
        return hnlist;
    }

    // Index of the first step that still has to be applied to hnlist.
    int firststep = 0;
    if (expressions[0].startswith("#"))
    {
        // "#id" seeds the search with that single element (if it exists) and consumes step 0.
        htmlnode byid = _htmldocument.getelementbyid(expressions[0].trimstart('#'));
        if (byid != null)
        {
            hnlist.add(byid);
        }
        firststep = 1;
    }
    else
    {
        // Otherwise the search is seeded with the element children of the document node.
        hnlist.addrange(_htmldocument.documentnode.childnodes.where(x => x.nodetype == htmlnodetype.element));
    }

    for (int i = firststep; i < expressions.length; i++)
    {
        hnlist = get(hnlist, expressions[i]);
    }
    return hnlist;
}
/// <summary>
/// Looks up a single node with singleget and returns its inner HTML.
/// </summary>
/// <param name="_htmldocument">Document handed on to singleget.</param>
/// <param name="selector">Selector handed on to singleget.</param>
/// <returns>The innerhtml of the node, or null when singleget yields no node.</returns>
public string singlegetinnerhtml(htmldocument _htmldocument, string selector)
{
    htmlnode match = singleget(_htmldocument, selector);
    return match == null ? null : match.innerhtml;
}
/// <summary>
/// Looks up a single node with singleget and returns its trimmed inner text.
/// </summary>
/// <param name="_htmldocument">Document handed on to singleget.</param>
/// <param name="selector">Selector handed on to singleget.</param>
/// <returns>The innertext of the node after trim(), or null when singleget yields no node.</returns>
public string singlegetinnertext(htmldocument _htmldocument, string selector)
{
    htmlnode match = singleget(_htmldocument, selector);
    return match == null ? null : match.innertext.trim();
}
/// <summary>
/// Returns the first node that get produces for the selector.
/// </summary>
/// <param name="_htmldocument">Document handed on to get.</param>
/// <param name="selector">Selector handed on to get.</param>
/// <returns>Element 0 of the list returned by get, or null when that list is empty.</returns>
public htmlnode singleget(htmldocument _htmldocument, string selector)
{
    ilist<htmlnode> matches = get(_htmldocument, selector);
    if (matches.count > 0)
    {
        return matches[0];
    }
    return null;
}
#region 获得属性
/// <summary>
/// Collects the value of one attribute from every node that carries it.
/// </summary>
/// <param name="_htmlnodes">Nodes to read from; may be null.</param>
/// <param name="attr">Name of the attribute to read.</param>
/// <returns>
/// The attribute values of the nodes that have the attribute, in list order.
/// An empty array when the list is null or empty.
/// </returns>
public string[] attr(ilist<htmlnode> _htmlnodes, string attr)
{
    bool nothingtoread = _htmlnodes == null || _htmlnodes.count == 0;
    if (nothingtoread)
    {
        return new string[0];
    }

    // Nodes without the attribute are skipped rather than reported as null entries.
    return _htmlnodes
        .where(node => node.attributes[attr] != null)
        .select(node => node.attributes[attr].value)
        .toarray();
}
#endregion
#region 根据选择器语法查找
/// <summary>
/// Applies one selector step to a set of context nodes.
/// </summary>
/// <param name="_htmlnodes">Context nodes produced by the previous step.</param>
/// <param name="expression">
/// A single step. It is tokenised into runs of '.', '|', '-' and word characters:
/// token 0 is the tag name or ".class", token 1 (optional) the filter function name and
/// token 2 (optional) the function argument, e.g. "li:eq(2)" gives "li", "eq", "2".
/// </param>
/// <returns>
/// Without a function: the nodes returned by class (step starts with '.') or nodetype
/// for the whole context set. With a function: for each context node in turn, the
/// class/nodetype result for that node filtered through funaction, concatenated.
/// </returns>
private list<htmlnode> get(list<htmlnode> _htmlnodes, string expression)
{
    string _expre = null;
    string fun = null;
    int index = -1;
    string keyword = null;
    regex reg = new regex(@"([.|\-|\w]+)", regexoptions.singleline);
    matchcollection mc = reg.matches(expression);
    if (mc.count > 0)
    {
        _expre = mc[0].value;
    }
    if (mc.count > 1)
    {
        fun = mc[1].value;
    }
    if (mc.count > 2)
    {
        // Always keep the raw argument: text filters such as contains must also
        // work when the searched text happens to be numeric, e.g. contains(2010).
        keyword = mc[2].value;

        // Parse into a temporary so that a failed parse (which writes 0 to its out
        // argument) does not replace the -1 "no index" sentinel.
        int parsed;
        if (int.tryparse(keyword, out parsed))
        {
            index = parsed;
        }
    }

    if (string.isnullorempty(fun))
    {
        // Plain step: match against the whole context set at once.
        if (expression.startswith("."))
        {
            return class(_htmlnodes, expression).tolist();
        }
        return nodetype(_htmlnodes, expression).tolist();
    }

    // Step with a function: the function is evaluated separately for the
    // matches found under each context node.
    list<htmlnode> list = new list<htmlnode>();
    foreach (var n in _htmlnodes)
    {
        ienumerable<htmlnode> v;
        if (_expre.startswith("."))
        {
            v = class(n, _expre);
        }
        else
        {
            v = nodetype(n, _expre);
        }
        list.addrange(funaction(v, fun, index, keyword));
    }
    return list;
}
#region 函数处理
/// <summary>
/// Applies a named filter function to a sequence of nodes.
/// </summary>
/// <param name="v">Nodes to filter.</param>
/// <param name="fun">
/// Function name, compared case-insensitively: eq, lt, gt, first, last, even, odd,
/// next, contains, empty or header.
/// </param>
/// <param name="index">Position argument used by eq, lt and gt (zero-based position in <c>v</c>).</param>
/// <param name="keyword">Text argument used by contains.</param>
/// <returns>The nodes selected by the function.</returns>
/// <exception cref="notsupportedexception">The function name is not one of the names listed above.</exception>
private ienumerable<htmlnode> funaction(ienumerable<htmlnode> v, string fun, int index, string keyword)
{
    // Invariant lower-casing keeps the name match independent of the current culture.
    switch (fun.tolowerinvariant())
    {
        case "eq":
            return v.where((nn, _index) => _index == index);
        case "lt":
            return v.where((nn, _index) => _index < index);
        case "gt":
            return v.where((nn, _index) => _index > index);
        case "first":
            // take(1) gives zero or one element with a single pass over v,
            // instead of counting v and then enumerating it again.
            return v.take(1);
        case "last":
            // reverse() buffers v once; the first reversed element is the last of v.
            return v.reverse().take(1);
        case "even":
            return v.where((nn, _index) => _index % 2 == 0);
        case "odd":
            return v.where((nn, _index) => (_index & 1) == 1);
        case "next":
            // A node without a following sibling has a null nextsibling; drop those so
            // that later selector steps never receive a null node.
            return v.select(nn => nn.nextsibling).where(sibling => sibling != null);
        case "contains":
            return v.where(x => { return x.innerhtml.contains(keyword); });
        case "empty":
            return v.where(x => { return x.haschildnodes == false; });
        case "header":
            string[] headers = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" };
            // Searches the nodes below v (via findchildnodes). Names are compared
            // case-insensitively, consistent with nodetype.
            return findchildnodes(v.toarray()).where(x => { return headers.contains(x.originalname, stringcomparer.ordinalignorecase); });
        default:
            throw new notsupportedexception("函数不支持。");
    }
}
#endregion
#endregion
#region 根据类名找节点
/// <summary>
/// Finds the nodes below a single node that carry the given CSS class.
/// </summary>
/// <param name="hn">Node whose descendants are searched.</param>
/// <param name="expression">Class name, with or without leading '.' characters.</param>
/// <returns>Same as the list overload, called with a one-element list.</returns>
private parallelquery<htmlnode> class(htmlnode hn, string expression)
{
    return class(new htmlnode[] { hn }, expression);
}
/// <summary>
/// Finds the nodes below the given nodes that carry the given CSS class.
/// </summary>
/// <param name="_htmlnodes">Nodes whose descendants (as returned by findchildnodes) are searched.</param>
/// <param name="expression">Class name, with or without leading '.' characters.</param>
/// <returns>
/// The nodes that have a class attribute whose space-separated entries contain the class
/// name (compared ignoring case), in the order findchildnodes produced them.
/// </returns>
private parallelquery<htmlnode> class(ilist<htmlnode> _htmlnodes, string expression)
{
    // Both values are the same for every node, so compute them once.
    string classname = expression.trimstart('.');
    char[] separators = new char[] { ' ' };

    // asordered() keeps the source order in the result. Callers use positions in the
    // result (eq, lt, gt, first, last, even, odd, element 0 of the final list), which
    // are not stable for an unordered parallel query.
    var v = findchildnodes(_htmlnodes).asparallel().asordered().where(x => x.attributes["class"] != null);
    var y = v.where(x => x.attributes["class"].value
        .split(separators, stringsplitoptions.removeemptyentries)
        .contains(classname, stringcomparer.currentcultureignorecase));
    return y;
}
#endregion
#region 根据类型找节点
/// <summary>
/// Finds the nodes below a single node that have the given tag name.
/// </summary>
/// <param name="hn">Node whose descendants are searched.</param>
/// <param name="expression">Tag name to match.</param>
/// <returns>Same as the list overload, called with a one-element list.</returns>
private parallelquery<htmlnode> nodetype(htmlnode hn, string expression)
{
    return nodetype(new htmlnode[] { hn }, expression);
}
/// <summary>
/// Finds the nodes below the given nodes that have the given tag name.
/// </summary>
/// <param name="_htmlnodes">Nodes whose descendants (as returned by findchildnodes) are searched.</param>
/// <param name="expression">Tag name to match.</param>
/// <returns>
/// The nodes whose originalname equals the tag name (compared ignoring case),
/// in the order findchildnodes produced them.
/// </returns>
private parallelquery<htmlnode> nodetype(ilist<htmlnode> _htmlnodes, string expression)
{
    // asordered() keeps the source order in the result. Callers use positions in the
    // result (eq, lt, gt, first, last, even, odd, element 0 of the final list), which
    // are not stable for an unordered parallel query.
    var v = findchildnodes(_htmlnodes).asparallel().asordered().where(
        x => x.originalname.equals(expression, stringcomparison.currentcultureignorecase));
    return v;
}
#endregion
#region 查找所有下级
/// <summary>
/// Collects the nodes found below each of the given nodes.
/// </summary>
/// <param name="_htmlnodes">Starting nodes; they are not themselves part of the result.</param>
/// <returns>
/// A new list filled by findchildnodesaction for each starting node in turn.
/// </returns>
/// <exception cref="argumentnullexception"><c>_htmlnodes</c> is null.</exception>
private list<htmlnode> findchildnodes(ilist<htmlnode> _htmlnodes)
{
    if (_htmlnodes == null)
    {
        // Name the offending argument instead of throwing an exception with an empty message.
        throw new argumentnullexception("_htmlnodes");
    }
    list<htmlnode> list = new list<htmlnode>();
    foreach (var v in _htmlnodes)
    {
        findchildnodesaction(v, list);
    }
    return list;
}
/// <summary>
/// Recursively appends the element nodes below <c>hn</c> to <c>list</c>, each parent
/// before its own children.
/// </summary>
/// <param name="hn">Node whose children are visited; it is not added itself.</param>
/// <param name="list">Accumulator that receives the nodes.</param>
/// <exception cref="argumentnullexception"><c>list</c> is null.</exception>
private void findchildnodesaction(htmlnode hn, list<htmlnode> list)
{
    if (list == null)
    {
        // Name the offending argument instead of throwing an exception with an empty message.
        throw new argumentnullexception("list");
    }
    foreach (var v in hn.childnodes)
    {
        // Test the child being visited, not the parent: only element nodes are
        // collected and descended into, so text and comment nodes are left out.
        if (v.nodetype == htmlnodetype.element)
        {
            list.add(v);
            findchildnodesaction(v, list);
        }
    }
}
#endregion
}
摘自 winner2050的专栏
上一篇: 网页切图的CSS和布局经验与要点
下一篇: Linux启动mysql命令