欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

用DOM实现文章采集--通过jquery语法式的方法采集指定对象的文本

程序员文章站 2022-03-30 11:44:15
[csharp] /// <summary> /// dom查询器,用法跟jquery差不多 /// </summary> public...

[csharp]
/// <summary> 
/// dom查询器,用法跟jquery差不多 
/// </summary> 
public class domquery 

    /// <summary> 
    /// 获得节点 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    /// <remarks>dom选择器,用法跟jquery差不多</remarks> 
    public ilist<htmlnode> get(htmldocument _htmldocument, string selector) 
    { 
        string[] expressions = selector.split(new char[] { ' ' }, stringsplitoptions.removeemptyentries); 
 
        list<htmlnode> hnlist = new list<htmlnode>(); 
 
        if (expressions[0].startswith("#")) 
        { 
            hnlist.add(_htmldocument.getelementbyid(expressions[0].trimstart('#'))); 
            hnlist.removeall(x => { return x == null; }); 
 
            if (expressions.length == 1) 
            { 
                return hnlist; 
            } 
 
            for (int i = 1; i < expressions.length; i++) 
            { 
                hnlist = get(hnlist, expressions[i]); 
            } 
        } 
        else 
        { 
            hnlist.addrange(_htmldocument.documentnode.childnodes.where(x => { return x.nodetype == htmlnodetype.element; })); 
 
            for (int i = 0; i < expressions.length; i++) 
            { 
                hnlist = get(hnlist, expressions[i]); 
            } 
        } 
 
 
 
 
 
        return hnlist; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回innerhtml 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string singlegetinnerhtml(htmldocument _htmldocument, string selector) 
    { 
        htmlnode hn = singleget(_htmldocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.innerhtml; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回innertext 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string singlegetinnertext(htmldocument _htmldocument, string selector) 
    { 
        htmlnode hn = singleget(_htmldocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.innertext.trim(); 
    } 
    /// <summary> 
    /// 查找节点 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public htmlnode singleget(htmldocument _htmldocument, string selector) 
    { 
        ilist<htmlnode> hnlist = get(_htmldocument, selector); 
 
        if (hnlist.count == 0) 
        { 
            return null; 
        } 
        else 
        { 
            return hnlist[0]; 
        } 
    } 
 
    #region 获得属性 
    /// <summary> 
    /// 获得属性 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="attr"></param> 
    /// <returns></returns> 
    public string[] attr(ilist<htmlnode> _htmlnodes, string attr) 
    { 
        if (_htmlnodes == null) 
        { 
            return new string[0]; 
        } 
        if (_htmlnodes.count() == 0) 
        { 
            return new string[0]; 
        } 
        var v = from x in _htmlnodes where x.attributes[attr] != null select x; 
 
        return (from x in v select x.attributes[attr].value).toarray(); 
    } 
    #endregion 
 
    #region 根据选择器语法查找 
    /// <summary> 
    /// 根据选择器语法查找 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private list<htmlnode> get(list<htmlnode> _htmlnodes, string expression) 
    { 
        string _expre = null; 
        string fun = null; 
        int index = -1; 
        string keyword = null; 
        regex reg = new regex(@"([.|\-|\w]+)", regexoptions.singleline); 
        matchcollection mc = reg.matches(expression); 
        for (int i = 0; i < mc.count; i++) 
        { 
            if (i == 0) 
            { 
                _expre = mc[i].value; 
            } 
            if (i == 1) 
            { 
                fun = mc[i].value; 
            } 
            if (i == 2) 
            { 
                if (int.tryparse(mc[i].value, out index) == false) 
                { 
                    keyword = mc[i].value; 
                } 
            } 
        } 
        list<htmlnode> list = new list<htmlnode>(); 
 
        if (string.isnullorempty(fun) == true) 
        { 
            if (expression.startswith(".")) 
            { 
                return class(_htmlnodes, expression).tolist(); 
            } 
            else 
            { 
                return nodetype(_htmlnodes, expression).tolist(); 
            } 
        } 
        else 
        { 
            foreach (var n in _htmlnodes) 
            { 
                ienumerable<htmlnode> v; 
                if (_expre.startswith(".")) 
                { 
                    v = class(n, _expre); 
                } 
                else 
                { 
                    v = nodetype(n, _expre); 
                } 
 
 
                list.addrange(funaction(v, fun, index, keyword)); 
            } 
            return list; 
        } 
    } 
    #region 函数处理 
    /// <summary> 
    /// 函数处理   
    /// </summary> 
    /// <param name="v"></param> 
    /// <param name="fun"></param> 
    /// <returns></returns> 
    private ienumerable<htmlnode> funaction(ienumerable<htmlnode> v, string fun, int index, string keyword) 
    { 
        switch (fun.tolower()) 
        { 
            case "eq": 
                return v.where((nn, _index) => _index == index); 
            case "lt": 
                return v.where((nn, _index) => _index < index); 
            case "gt": 
                return v.where((nn, _index) => _index > index); 
            case "first": 
                if (v.count() > 0) 
                    return new htmlnode[] { v.first() }; 
                else 
                    return v; 
            case "last": 
                if (v.count() > 0) 
                    return new htmlnode[] { v.last() }; 
                else 
                    return v; 
            case "even": 
                return v.where((nn, _index) => _index % 2 == 0); 
            case "odd": 
                return v.where((nn, _index) => (_index & 1) == 1); 
            case "next": 
                return v.select(nn => nn.nextsibling); 
            case "contains": 
                return v.where(x => { return x.innerhtml.contains(keyword); }); 
            case "empty": 
                return v.where(x => { return x.haschildnodes == false; }); 
            case "header": 
                string[] headers = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" }; 
                return findchildnodes(v.toarray()).where(x => { return headers.contains(x.originalname); }); 
            default: 
                throw new notsupportedexception("函数不支持。"); 
        } 
    } 
    #endregion 
    #endregion 
 
    #region 根据类名找节点 
    private parallelquery<htmlnode> class(htmlnode hn, string expression) 
    { 
        return class(new htmlnode[] { hn }, expression); 
    } 
    /// <summary> 
    /// 根据类名找节点 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private parallelquery<htmlnode> class(ilist<htmlnode> _htmlnodes, string expression) 
    { 
        var v = findchildnodes(_htmlnodes).asparallel().where(x => x.attributes["class"] != null); 
 
        var y = v.where(x => x.attributes["class"].value.split(new char[] { ' ' }, stringsplitoptions.removeemptyentries).contains(expression.trimstart('.'), stringcomparer.currentcultureignorecase)); 
 
        return y; 
    } 
    #endregion 
 
    #region 根据类型找节点 
    /// <summary> 
    /// 根据类型找节点 
    /// </summary> 
    /// <param name="hn"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private parallelquery<htmlnode> nodetype(htmlnode hn, string expression) 
    { 
        return nodetype(new htmlnode[] { hn }, expression); 
    } 
    /// <summary> 
    /// 根据类型找节点 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private parallelquery<htmlnode> nodetype(ilist<htmlnode> _htmlnodes, string expression) 
    { 
        var v = findchildnodes(_htmlnodes).asparallel().where( 
                 x => x.originalname.equals(expression, stringcomparison.currentcultureignorecase)); 
 
 
        return v; 
    } 
    #endregion 
 
    #region 查找所有下级 
    /// <summary> 
    /// 查找所有下级 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <returns></returns> 
    private list<htmlnode> findchildnodes(ilist<htmlnode> _htmlnodes) 
    { 
        if (_htmlnodes == null) 
        { 
            throw new exception(""); 
        } 
        list<htmlnode> list = new list<htmlnode>(); 
        foreach (var v in _htmlnodes) 
        { 
            findchildnodesaction(v, list); 
        } 
 
        return list; 
    } 
    private void findchildnodesaction(htmlnode hn, list<htmlnode> list) 
    { 
        if (list == null) 
        { 
            throw new exception(""); 
        } 
        foreach (var v in hn.childnodes) 
        { 
            if (hn.nodetype == htmlnodetype.element) 
            { 
                list.add(v); 
                findchildnodesaction(v, list); 
            } 
        } 
    } 
 
    #endregion 
 
 

 

摘自 winner2050的专栏