欢迎您访问程序员文章站，本站旨在为大家提供、分享程序员计算机编程知识！
您现在的位置是: 首页  >  IT编程

用DOM实现文章采集--采集到网页源码

程序员文章站 2022-07-02 18:35:59
先来个采集网页的代码。 [csharp] using system;  using system.collections.generic;  using...
先来个采集网页的代码。
[csharp]
using system; 
using system.collections.generic; 
using system.io; 
using system.io.compression; 
using system.net; 
using system.text; 
namespace topwincms.common 

    public class nethelper 
    { 
 
        // Earlier browser-style User-Agent string, kept for reference (disabled).
        //private string _http_user_agent = "mozilla/4.0+(compatible;+msie+6.0;+windows+nt+5.2;+sv1;+.net+clr+1.1.4322;+.net+clr+2.0.50727)"; 
        // User-Agent assigned to the request by get() and post(); exposed through the useragent property.
        private string _useragent = "googlebot/2.1 (+https://www.google.com/bot.html)"; 
        // Caller-selected text encoding; null (the default) means none has been chosen.
        private encoding _httpencoding = null; 
        // Proxy host; while it is empty (the default) get() and post() do not configure a proxy.
        private string _proxyhost = string.empty; 
        // Proxy port passed to webproxy together with _proxyhost.
        private int _proxyint = 8080; 
        // Value assigned to request.timeout by get() and post().
        private int _timeout = 200000; 
 
        #region 属性 
        /// <summary> 
        /// 设置useragent 
        /// </summary> 
        public string useragent 
        { 
            get 
            { 
                return this._useragent; 
            } 
            set 
            { 
                this._useragent = value; 
            } 
        } 
        /// <summary> 
        /// 设置编码 
        /// </summary> 
        public encoding httpencoding 
        { 
            get 
            { 
                return this._httpencoding; 
            } 
            set 
            { 
                this._httpencoding = value; 
            } 
        } 
        /// <summary> 
        /// 设置代理服务器 
        /// </summary> 
        public string proxyhost 
        { 
            get 
            { 
                return this._proxyhost; 
            } 
            set 
            { 
                this._proxyhost = value; 
            } 
        } 
        /// <summary> 
        /// 设置代理服务器端口 
        /// </summary> 
        public int proxyint 
        { 
            get 
            { 
                return this._proxyint; 
            } 
            set 
            { 
                this._proxyint = value; 
            } 
        } 
        /// <summary> 
        /// 设置默认超时时间 
        /// </summary> 
        public int timeout 
        { 
            get 
            { 
                return this._timeout; 
            } 
            set 
            { 
                this._timeout = value; 
            } 
        } 
        #endregion 
 
        public remoteres get(string uri) 
        { 
            return get(new uri(uri)); 
        } 
        public remoteres get(uri uri) 
        { 
            remoteres info = new remoteres(); 
 
            httpwebrequest request = (httpwebrequest)webrequest.create(uri); 
            request.timeout = this._timeout; 
            request.useragent = this._useragent; 
            request.method = "get";  
            request.referer = string.concat("https://", uri.host); 
 
            if (this._proxyhost.length > 0) 
            { 
                request.proxy = new webproxy(this._proxyhost, this._proxyint); 
            } 
            httpwebresponse response = null; 
            stream responsestream = null; 
            try 
            { 
                encoding encoding; 
                response = (httpwebresponse)request.getresponse(); 
                responsestream = response.getresponsestream(); 
               
                if (response.headers["accept-encoding"] != null) 
                { 
                    if (mycollections.contain(response.headers["accept-encoding"], "*", "gzip", "x-gzip")) 
                    { 
                        responsestream = new gzipstream(responsestream, compressionmode.decompress); 
                    } 
                } 
                else if (response.headers["content-encoding"] != null) 
                { 
                    if (mycollections.contain(response.headers["content-encoding"], "*", "gzip", "x-gzip")) 
                    { 
                        responsestream = new gzipstream(responsestream, compressionmode.decompress); 
                    } 
                } 
                
                if (this._httpencoding == null) 
                { 
                    string str = response.characterset.tolower(); 
                    if (str.length > 3) 
                    { 
                        if (str.substring(0, 3) == "iso") 
                        { 
                            encoding = encoding.default; 
                        } 
                        else 
                        { 
                            encoding = encoding.getencoding(response.characterset); 
                        } 
                    } 
                    else 
                    { 
                        encoding = encoding.getencoding(response.characterset); 
                    } 
                    if (str.length == 0) 
                    { 
                        encoding = encoding.utf8; 
                    } 
                } 
                else 
                { 
                    encoding = this._httpencoding; 
                } 
                info.html = new streamreader(responsestream, encoding).readtoend(); 
                info.contenttype = response.contenttype; 
                info.statuscode = response.statuscode; 
 
            } 
            catch (webexception we) 
            { 
                if (we.response != null) 
                { 
                    info.statuscode = (we.response as httpwebresponse).statuscode; 
                } 
                else 
                { 
                    info.statuscode = httpstatuscode.serviceunavailable; 
                } 
                info.code = "错误:" + we.message; 
 
            } 
            catch (exception ex) 
            { 
                info.code = "错误:" + ex.message; 
                info.statuscode = httpstatuscode.internalservererror; 
            } 
            finally 
            { 
                if (responsestream != null) 
                    responsestream.close(); 
                if (response != null) 
                    response.close(); 
            } 
 
            return info; 
        } 
 
        #region 取得远程资源 
        /// <summary> 
        /// 取得远程资源   
        /// </summary> 
        /// <param name="strurl">要取的url</param> 
        /// <returns>网页源代码</returns> 
        public remoteres getremoteresource(string strurl) 
        { 
            httpwebresponse response = null; 
            stream stream = null; 
            remoteres info = new remoteres(); 
            try 
            { 
                httpwebrequest request = (httpwebrequest)webrequest.create(strurl); 
                request.allowautoredirect = true; 
                request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506)"; 
                request.referer = "https://" + new uri(strurl).host; 
                response = request.getresponse() as httpwebresponse; 
                stream = response.getresponsestream(); 
                info.contenttype = response.contenttype; 
                memorystream ms = new memorystream(); 
 
                byte[] buffer = new byte[256]; 
 
                int c = stream.read(buffer, 0, buffer.length); 
 
                while (c > 0) 
                { 
                    ms.write(buffer, 0, c); 
                    c = stream.read(buffer, 0, buffer.length); 
                } 
                stream.close(); 
 
                info.statuscode = response.statuscode; 
 
                info.bytes = ms.toarray(); 
 
            } 
            catch (webexception we) 
            { 
                if (we.response != null) 
                { 
                    info.statuscode = (we.response as httpwebresponse).statuscode; 
                } 
                else 
                { 
                    info.statuscode = httpstatuscode.serviceunavailable; 
                } 
 
                return null; 
            } 
            catch 
            { 
                info.statuscode = httpstatuscode.internalservererror; 
 
                return null; 
            } 
            finally 
            { 
                if (stream != null) 
                    stream.close(); 
 
                if (response != null) 
                    response.close(); 
            } 
            return info; 
        } 
        #endregion 
 
 
        public remoteres post(string strurl, string postdata) 
        { 
            remoteres info = new remoteres(); 
            stream responsestream = null; 
            httpwebresponse response = null; 
            try 
            { 
                byte[] bytes = this._httpencoding.getbytes(postdata); 
                httpwebrequest request = (httpwebrequest)webrequest.create(strurl); 
                request.method = "post"; 
                request.contenttype = "application/x-www-form-urlencoded"; 
                request.contentlength = bytes.length; 
                request.timeout = this._timeout; 
                request.useragent = this._useragent;  
                //request.referer = string.concat("https://", uri.host); 
                if (this._proxyhost.length > 0) 
                { 
                    request.proxy = new webproxy(this._proxyhost, this._proxyint); 
                } 
                using (stream requeststream = request.getrequeststream()) 
                { 
                    requeststream.write(bytes, 0, bytes.length); 
                    requeststream.close(); 
                } 
                try 
                { 
                    encoding encoding; 
                    response = (httpwebresponse)request.getresponse(); 
                    responsestream = response.getresponsestream(); 
                    if (this._httpencoding == null) 
                    { 
                        string str = response.characterset.tolower(); 
                        if (str.length > 3) 
                        { 
                            if (str.substring(0, 3) == "iso") 
                            { 
                                encoding = encoding.default; 
                            } 
                            else 
                            { 
                                encoding = encoding.getencoding(response.characterset); 
                            } 
                        } 
                        else 
                        { 
                            encoding = encoding.getencoding(response.characterset); 
                        } 
                        if (str.length == 0) 
                        { 
                            encoding = encoding.default; 
                        } 
                    } 
                    else 
                    { 
                        encoding = this._httpencoding; 
                    } 
                    info.html = new streamreader(responsestream, encoding).readtoend(); 
                    info.statuscode = httpstatuscode.ok; 
 
                    responsestream.close(); 
                    response.close(); 
                    return info; 
                } 
                catch (exception ex) 
                { 
                    info.html = "错误:" + ex.message; 
                } 
 
            } 
            catch (exception ex) 
            { 
                info.html = "错误:" + ex.message; 
            } 
            finally 
            { 
                if (responsestream != null) 
                    responsestream.close(); 
                if (response != null) 
                    response.close(); 
            } 
            return info; 
        } 
 
        #region 检查链接 
        /// <summary> 
        /// 检查链接是否存在 
        /// </summary> 
        /// <param name="surl"></param> 
        /// <param name="allowbadnum"></param> 
        public bool urlexist(string strurl) 
        { 
            httpwebrequest request = (httpwebrequest)webrequest.create(strurl); 
            request.method = "head"; 
            request.allowautoredirect = false; 
            request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506; .net clr 3.5.21022; .net clr 1.0.3705; .net clr 1.1.4322)"; 
            httpwebresponse response = (httpwebresponse)request.getresponse(); 
            if (response.statuscode != httpstatuscode.ok) 
            { 
                response.close(); 
                return false; 
            } 
            else 
            { 
                return true; 
            } 
 
 
        } 
        /// <summary> 
        /// 检查死链接是否在能容忍的数量内 
        /// </summary> 
        /// <param name="urls"></param> 
        /// <param name="allowbadnum"></param> 
        /// <returns></returns> 
        public bool urlexist(list<string> urls, int allowbadnum) 
        { 
            //如果图片的数量小于能容忍的数量就不用检查了。 
            if (urls.count <= allowbadnum) 
            { 
                return true; 
            } 
            int inttemp = 0; 
            foreach (string strurl in urls) 
            { 
                if (urlexist(strurl) == false) 
                { 
                    inttemp++; 
                    if (inttemp > allowbadnum) 
                    { 
                        return false; 
                    } 
                } 
            } 
            return true; 
        } 
        #endregion 
    } 
 
    public class remoteres 
    { 
        private string _code; 
        private string _html; 
        private byte[] _bytes; 
        private string _contenttype; 
        private httpstatuscode _statuscode; 
        /// <summary> 
        /// 返回信息的代码 
        /// </summary> 
        public string code 
        { 
            get 
            { 
                return this._code; 
            } 
            set 
            { 
                this._code = value; 
            } 
        } 
        /// <summary> 
        /// 信息 
        /// </summary> 
        public string html 
        { 
            get 
            { 
                return this._html; 
            } 
            set 
            { 
                this._html = value; 
            } 
        } 
        /// <summary> 
        /// 远程资源 
        /// </summary> 
        public byte[] bytes 
        { 
            get 
            { 
                return this._bytes; 
            } 
            set 
            { 
                this._bytes = value; 
            } 
        } 
        /// <summary> 
        /// 内容类型 
        /// </summary> 
        public string contenttype 
        { 
            get 
            { 
                return this._contenttype; 
            } 
            set 
            { 
                this._contenttype = value; 
            } 
        } 
        /// <summary> 
        /// 状态代码 
        /// </summary> 
        public httpstatuscode statuscode 
        { 
            get 
            { 
                return this._statuscode; 
            } 
            set 
            { 
                this._statuscode = value; 
            } 
        } 
    } 




摘自 winner2050的专栏